In [None]:
import numpy as np
import gzip
import pickle
import os
import glob
import time
import cv2
import datetime
import pandas as pd
from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold, train_test_split
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
import random
import shutil
import operator
from PIL import Image
import platform
import json
import base64
import typing as t
import zlib
import pydicom
import re
from tqdm import tqdm

# Working directory of the notebook (not referenced by the code visible in this file).
ROOT_PATH = './'
# Directory of the competition input data (not referenced by the code visible in this file).
INPUT_PATH = '../input/vinbigdata-chest-xray-abnormalities-detection/'
# Prefix prepended to the name of every CSV file written below.
SUBM_PATH = './'


def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``[x1, y1, x2, y2]``.  The result is ``0.0``
    when the boxes do not overlap (or only touch), otherwise the overlap
    area divided by the area of the union.
    """
    # Corners of the overlap rectangle.
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

    # A negative extent on either axis means there is no overlap at all.
    overlap = max(0, right - left) * max(0, bottom - top)
    if overlap == 0:
        return 0.0

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    # union = A + B - overlap (the overlap is otherwise counted twice)
    union = float(area_a + area_b - overlap)
    return overlap / union


def find_matching_box(boxes_list, new_box, match_iou=0.7):
    """Find the box in ``boxes_list`` that best overlaps ``new_box``.

    Boxes are indexable as ``[label, score, x1, y1, x2, y2]``.  Only boxes
    whose first element (the label) equals that of ``new_box`` are
    considered, and a candidate only wins if its IoU is strictly greater
    than the best value seen so far (initially ``match_iou``).

    Returns ``(index, iou)``; ``index`` is ``-1`` and ``iou`` is
    ``match_iou`` when nothing matched.
    """
    winner = -1
    top_iou = match_iou
    for position, candidate in enumerate(boxes_list):
        if candidate[0] == new_box[0]:
            overlap = bb_intersection_over_union(candidate[2:], new_box[2:])
            if overlap > top_iou:
                winner, top_iou = position, overlap

    return winner, top_iou


def merge_boxes_v2(box1, box2, w1, w2, type):
    """Merge two boxes of the form ``[label, score, x1, y1, x2, y2]``.

    The label is taken from ``box1``.  The four coordinates are always the
    ``w1``/``w2`` weighted average of the two boxes.  The score depends on
    ``type``:

    * ``'avg'`` - weighted average of the two scores,
    * ``'max'`` - the larger of the two scores,
    * ``'mul'`` - geometric mean of the two scores.

    Returns a new 6-element list; neither input is modified.

    Raises:
        ValueError: if ``type`` is not one of the values above.  (The
            function used to call ``exit()`` here, which terminated the
            interpreter without saying what was wrong.)
    """
    total = w1 + w2
    if type == 'avg':
        score = ((w1 * box1[1]) + (w2 * box2[1])) / total
    elif type == 'max':
        score = max(box1[1], box2[1])
    elif type == 'mul':
        score = np.sqrt(box1[1] * box2[1])
    else:
        raise ValueError(
            "Unknown merge type {!r}; expected 'avg', 'max' or 'mul'".format(type)
        )

    coords = [(w1 * box1[k] + w2 * box2[k]) / total for k in range(2, 6)]
    return [box1[0], score] + coords


def merge_all_boxes_for_image(boxes, intersection_thr=0.5, type='avg'):
    """Fuse several sets of predicted boxes for one image into a single set.

    Args:
        boxes: sequence of prediction sets; each set is a list of boxes of
            the form ``[label, score, x1, y1, x2, y2]``.
        intersection_thr: a box is merged into an already collected box of
            the same label only if their IoU is strictly above this value.
        type: score merging mode forwarded to ``merge_boxes_v2``.

    Returns:
        ``numpy`` array with one row per fused box.  Every score is
        multiplied by the weight the box accumulated, where each
        contributing box adds ``1 / len(boxes)``.

    The input is never modified: the boxes are copied before the final
    score rescaling.  (Previously only the outer list was copied, so the
    rescaling leaked into the caller's boxes.)  An empty ``boxes``
    sequence yields an empty array instead of a ``ZeroDivisionError``.
    """
    if len(boxes) == 0:
        return np.array([])

    init_weight = 1 / len(boxes)
    # Copy every box so the in-place score scaling below cannot touch the
    # caller's data.
    new_boxes = [list(box) for box in boxes[0]]
    weights = [init_weight] * len(new_boxes)

    for other_set in boxes[1:]:
        for candidate in other_set:
            index, _ = find_matching_box(new_boxes, candidate, intersection_thr)
            if index != -1:
                new_boxes[index] = merge_boxes_v2(
                    new_boxes[index], candidate, weights[index], init_weight, type
                )
                weights[index] += init_weight
            else:
                new_boxes.append(list(candidate))
                weights.append(init_weight)

    # Scale each score by its accumulated weight, so boxes with fewer
    # contributing predictions end up with a lower score.
    for box, weight in zip(new_boxes, weights):
        box[1] *= weight
    return np.array(new_boxes)


def filter_boxes(boxes, scores, labels, thr):
    """Convert raw detector output into per-set lists of boxes.

    ``boxes`` is indexed as ``[set, detection, coordinate]`` and ``scores``
    / ``labels`` as ``[set, detection]``.  For every set, detections are
    read in order until the first score below ``thr``; that detection and
    everything after it in the set are dropped (presumably the scores are
    sorted in descending order - verify against the producer of the data).

    Odd-numbered sets are un-mirrored horizontally: ``x1, x2`` become
    ``1 - x2, 1 - x1``.

    Returns a list with one entry per set, each a list of
    ``[label, score, x1, y1, x2, y2]`` made of plain Python numbers.
    """
    per_set = []
    for set_idx in range(boxes.shape[0]):
        # Mirror fix: odd sets hold predictions for the flipped image.
        mirrored = set_idx % 2 != 0
        kept = []
        for det_idx in range(boxes.shape[1]):
            cls = int(labels[set_idx, det_idx].astype(np.int64))
            conf = scores[set_idx, det_idx]
            if conf < thr:
                break
            x1, y1, x2, y2 = (float(v) for v in boxes[set_idx, det_idx, :4])
            if mirrored:
                x1, x2 = 1 - x2, 1 - x1
            kept.append([cls, float(conf), x1, y1, x2, y2])
        per_set.append(kept)
    return per_set


def get_train_test_image_sizes():
    """Return a dict mapping ``image_id`` to ``(height, width)``.

    The sizes come from the ``image_width_height_*.csv`` files in
    ``../input/meta-xray/``, which must provide the columns ``image_id``,
    ``height`` and ``width``.

    The original code loaded the *test* CSV twice (once under the name
    ``sizes_train``), so train sizes were never available.  The train file
    is now loaded when it exists; the test file is still mandatory.
    """
    meta_dir = '../input/meta-xray/'
    frames = []

    # NOTE(review): the train file name is inferred from the test file's
    # name - confirm it against the dataset.  It is optional so that a
    # test-only dataset keeps working exactly as before.
    train_csv = meta_dir + 'image_width_height_train.csv'
    if os.path.isfile(train_csv):
        frames.append(pd.read_csv(train_csv))
    frames.append(pd.read_csv(meta_dir + 'image_width_height_test.csv'))

    sizes_df = pd.concat(frames, axis=0)
    return {
        image_id: (height, width)
        for image_id, height, width in zip(
            sizes_df['image_id'], sizes_df['height'], sizes_df['width']
        )
    }


def save_in_file_fast(arr, file_name):
    """Pickle ``arr`` to ``file_name`` using pickle protocol 4.

    The file is opened in a ``with`` block so it is flushed and closed
    even if pickling fails.
    """
    with open(file_name, 'wb') as f:
        pickle.dump(arr, f, protocol=4)


def load_from_file_fast(file_name):
    """Load and return the pickled object stored in ``file_name``.

    The file is opened in a ``with`` block so the handle is always closed.

    NOTE: unpickling can execute arbitrary code - only use this on files
    from a trusted source.
    """
    with open(file_name, 'rb') as f:
        return pickle.load(f)


def _clip_to_unit(value):
    """Clamp ``value`` to the closed interval [0, 1]."""
    if value < 0:
        return 0
    if value > 1:
        return 1
    return value


def create_csv_for_retinanet_predictions(
        input_data,
        out_file,
        skip_box_thr=0.05,
        intersection_thr=0.5,
        limit_boxes=300,
        type='avg'
):
    """Turn pickled detector output into a ``image_id,PredictionString`` CSV.

    Args:
        input_data: path of a pickle file holding a dict that maps an image
            id to a ``(boxes, scores, labels)`` triple (see ``filter_boxes``
            for the layout).
        out_file: path of the CSV file to write.
        skip_box_thr: score threshold passed to ``filter_boxes``.
        intersection_thr: IoU threshold passed to
            ``merge_all_boxes_for_image``.
        limit_boxes: keep at most this many boxes per image (highest
            scores first).
        type: score merging mode passed to ``merge_all_boxes_for_image``.

    For each image the boxes are filtered, fused, clipped to [0, 1], and
    scaled to pixels with the sizes from ``get_train_test_image_sizes``.
    Boxes that are inverted or thinner than ``1e-5`` after clipping are
    skipped with a message.  Images without any fused box get the
    placeholder prediction ``14 1 0 0 1 1``.

    Changes from the original: the output file is now closed
    deterministically, and the skip messages print the box label instead
    of the literal text ``label``.
    """
    verbose = False
    sizes = get_train_test_image_sizes()
    input_data = load_from_file_fast(input_data)

    with open(out_file, 'w') as out:
        out.write('image_id,PredictionString\n')
        for image_id, (boxes, scores, labels) in input_data.items():
            filtered_boxes = filter_boxes(boxes, scores, labels, skip_box_thr)
            merged_boxes = merge_all_boxes_for_image(filtered_boxes, intersection_thr, type)
            if verbose:
                print(image_id, len(filtered_boxes[0]), len(filtered_boxes[1]), len(merged_boxes))
            if len(merged_boxes) > limit_boxes:
                # Keep only the ``limit_boxes`` highest scoring boxes.
                order = merged_boxes[:, 1].argsort()[::-1]
                merged_boxes = merged_boxes[order][:limit_boxes]

            out.write("{},".format(image_id))
            if len(merged_boxes) == 0:
                out.write("14 1 0 0 1 1")
                out.write('\n')
                continue

            for row in merged_boxes:
                label = int(row[0])
                score = row[1]
                xmin = _clip_to_unit(row[2])
                ymin = _clip_to_unit(row[3])
                xmax = _clip_to_unit(row[4])
                ymax = _clip_to_unit(row[5])

                if xmax < xmin:
                    print('X min value larger than max value {}: {} {}'.format(label, xmin, xmax))
                    continue
                if ymax < ymin:
                    print('Y min value larger than max value {}: {} {}'.format(label, ymin, ymax))
                    continue
                if abs(xmax - xmin) < 1e-5:
                    print('Too small diff for {}: {} and {}'.format(label, xmin, xmax))
                    continue
                if abs(ymax - ymin) < 1e-5:
                    print('Too small diff for {}: {} and {}'.format(label, ymin, ymax))
                    continue

                # sizes holds (height, width); x scales with the width and
                # y with the height.
                height, width = sizes[image_id]
                xmin = int(round(xmin * width))
                xmax = int(round(xmax * width))
                ymin = int(round(ymin * height))
                ymax = int(round(ymax * height))
                out.write("{} {:.6f} {} {} {} {} ".format(label, score, xmin, ymin, xmax, ymax))
            out.write('\n')
        

if __name__ == '__main__':
    # Convert the pickled test predictions of the five "retina 1" folds to
    # CSV, each fold with its own (skip_box_thr, intersection_thr) pair.
    limit_boxes = 100
    # Named ``merge_type`` rather than ``type``: a module-level ``type``
    # would shadow the builtin for the rest of the notebook session.
    merge_type = 'avg'

    out_folders = [
        '../input/inference-zfturbo-retina-1-fold-0/modified_data_folder/resnet101_fold_0_0.3573_26_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-1-fold-1/modified_data_folder/resnet101_fold_1_0.3481_35_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-1-fold-2/modified_data_folder/resnet101_fold_2_0.3804_24_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-1-fold-3/modified_data_folder/resnet101_fold_3_0.3584_24_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-1-fold-4/modified_data_folder/resnet101_fold_4_0.3514_12_iou_0.3_test_result.pkl',
    ]
    # One (skip_box_thr, intersection_thr) pair per entry of out_folders.
    best_params_list = [
        (0.05, 0.35),
        (0.01, 0.45),
        (0.03, 0.45),
        (0.05, 0.45),
        (0.03, 0.45),
    ]
    for o, (skip_box_thr, intersection_thr) in zip(out_folders, best_params_list):
        pkl_name = os.path.basename(o)
        print('Go {} Params: {}'.format(pkl_name, (skip_box_thr, intersection_thr)))
        # pkl_name[:-4] drops the ".pkl" extension.
        out_file = SUBM_PATH + 'retina_' + pkl_name[:-4] + '_thr_{}_iou_{}_test.csv'.format(skip_box_thr, intersection_thr)
        create_csv_for_retinanet_predictions(
            o,
            out_file,
            skip_box_thr,
            intersection_thr,
            limit_boxes,
            type=merge_type
        )
        print('Write: {}'.format(out_file))
    print('Complete retina 1!')

In [None]:
if __name__ == '__main__':
    # Convert the pickled test predictions of the five "retina 2" folds to
    # CSV, each fold with its own (skip_box_thr, intersection_thr) pair.
    limit_boxes = 100
    # Named ``merge_type`` rather than ``type``: a module-level ``type``
    # would shadow the builtin for the rest of the notebook session.
    merge_type = 'avg'

    out_folders = [
        '../input/inference-zfturbo-retina-2-fold-0/modified_data_folder/resnet101_fold_0_0.1817_05_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-2-fold-1/modified_data_folder/resnet101_fold_1_0.2072_19_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-2-fold-2/modified_data_folder/resnet101_fold_2_0.1938_03_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-2-fold-3/modified_data_folder/resnet101_fold_3_0.1884_07_iou_0.3_test_result.pkl',
        '../input/inference-zfturbo-retina-2-fold-4/modified_data_folder/resnet101_fold_4_0.2227_05_iou_0.3_test_result.pkl',
    ]
    # One (skip_box_thr, intersection_thr) pair per entry of out_folders.
    best_params_list = [
        (0.03, 0.45),
        (0.01, 0.45),
        (0.01, 0.40),
        (0.01, 0.45),
        (0.01, 0.40),
    ]
    for o, (skip_box_thr, intersection_thr) in zip(out_folders, best_params_list):
        pkl_name = os.path.basename(o)
        print('Go {} Params: {}'.format(pkl_name, (skip_box_thr, intersection_thr)))
        # pkl_name[:-4] drops the ".pkl" extension.
        out_file = SUBM_PATH + 'retina_' + pkl_name[:-4] + '_thr_{}_iou_{}_test.csv'.format(skip_box_thr, intersection_thr)
        create_csv_for_retinanet_predictions(
            o,
            out_file,
            skip_box_thr,
            intersection_thr,
            limit_boxes,
            type=merge_type
        )
        print('Write: {}'.format(out_file))
    print('Complete retina 2!')

In [None]:
!pip install ensemble-boxes

In [None]:
from ensemble_boxes import weighted_boxes_fusion


def ensemble(
    subm_list,
    iou_same=0.5,
    out_path=None,
    skip_box_thr=0.00000001,
):
    """Fuse several submission CSV files with weighted boxes fusion.

    Args:
        subm_list: iterable of ``(csv_path, weight)`` pairs.  Every CSV
            needs the columns ``image_id`` and ``PredictionString`` and
            all files must list exactly the same image ids.
        iou_same: IoU threshold passed to ``weighted_boxes_fusion``.
        out_path: output CSV path; defaults to
            ``SUBM_PATH + 'ensemble_iou_<iou_same>.csv'``.
        skip_box_thr: score threshold passed to ``weighted_boxes_fusion``.

    Returns:
        The path of the CSV file that was written.

    Raises:
        ValueError: if ``subm_list`` is empty or the files do not contain
            the same image ids.  (The id mismatch used to call ``exit()``.)

    Pixel coordinates are normalised with the image size before fusion and
    scaled back afterwards.  The result is rounded rather than truncated,
    matching ``create_csv_for_retinanet_predictions``; truncation could
    lose a pixel through floating point error in the round trip.
    """
    sizes = get_train_test_image_sizes()
    preds = []
    weights = []
    checker = None
    for path, weight in subm_list:
        s = pd.read_csv(path)
        s = s.sort_values('image_id').reset_index(drop=True)
        ids = tuple(s['image_id'])
        if checker is None:
            checker = ids
        elif ids != checker:
            raise ValueError(
                'Different IDS! {} has {} ids; missing from it: {}'.format(
                    path, len(ids), set(checker) - set(ids)
                )
            )
        preds.append(s['PredictionString'].values)
        weights.append(weight)

    if checker is None:
        raise ValueError('subm_list must contain at least one submission')

    if out_path is None:
        out_path = SUBM_PATH + 'ensemble_iou_{}.csv'.format(iou_same)

    with open(out_path, 'w') as out:
        out.write('image_id,PredictionString\n')
        for j, image_id in enumerate(checker):
            boxes_list = []
            scores_list = []
            labels_list = []
            for model_preds in preds:
                boxes = []
                scores = []
                labels = []
                p1 = model_preds[j]
                # An empty PredictionString cell is read by pandas as NaN.
                if not pd.isna(p1):
                    height, width = sizes[image_id]
                    arr = p1.split()
                    # Six tokens per box: class, score, x1, y1, x2, y2.
                    for k in range(0, len(arr), 6):
                        labels.append(int(arr[k]))
                        scores.append(float(arr[k + 1]))
                        boxes.append([
                            float(arr[k + 2]) / width,
                            float(arr[k + 3]) / height,
                            float(arr[k + 4]) / width,
                            float(arr[k + 5]) / height,
                        ])

                boxes_list.append(boxes)
                scores_list.append(scores)
                labels_list.append(labels)

            boxes, scores, labels = weighted_boxes_fusion(
                boxes_list,
                scores_list,
                labels_list,
                iou_thr=iou_same,
                skip_box_thr=skip_box_thr,
                weights=weights,
                allows_overflow=True
            )
            if len(boxes) == 0:
                out.write('{},14 1 0 0 1 1\n'.format(image_id))
                continue

            height, width = sizes[image_id]
            parts = []
            for box, prob, label in zip(boxes, scores, labels):
                cls = int(label)
                if cls == 14:
                    # Class 14 is always written with the fixed box 0 0 1 1.
                    parts.append('{} {} {} {} {} {}'.format(cls, prob, 0, 0, 1, 1))
                else:
                    x1 = int(round(box[0] * width))
                    y1 = int(round(box[1] * height))
                    x2 = int(round(box[2] * width))
                    y2 = int(round(box[3] * height))
                    parts.append('{} {} {} {} {} {}'.format(cls, prob, x1, y1, x2, y2))
            out.write('{},{}\n'.format(image_id, ' '.join(parts)))

    return out_path


def get_test_from_subm_list(subm_list):
    """Return a copy of ``subm_list`` that points at the test submissions.

    ``subm_list`` holds ``(path, weight)`` pairs; every occurrence of
    ``'_train'`` in a path is replaced by ``'_test'`` and the weight is
    kept unchanged.
    """
    return [(path.replace('_train', '_test'), weight) for path, weight in subm_list]


def ensemble_experiment_v17_retinanet_resnet101_sqr():
    """Ensemble the five fold CSV files listed below with equal weights.

    The files are fused by ``ensemble`` with an IoU threshold of 0.4 and a
    score threshold of 0.01; the result is written to
    ``SUBM_PATH + 'ensemble_retinanet_resnet101_sqr.csv'``.
    """
    fold_csv_names = (
        'retina_resnet101_fold_0_0.3573_26_iou_0.3_test_result_thr_0.05_iou_0.35_test.csv',
        'retina_resnet101_fold_1_0.3481_35_iou_0.3_test_result_thr_0.01_iou_0.45_test.csv',
        'retina_resnet101_fold_2_0.3804_24_iou_0.3_test_result_thr_0.03_iou_0.45_test.csv',
        'retina_resnet101_fold_3_0.3584_24_iou_0.3_test_result_thr_0.05_iou_0.45_test.csv',
        'retina_resnet101_fold_4_0.3514_12_iou_0.3_test_result_thr_0.03_iou_0.45_test.csv',
    )
    # Every fold contributes with weight 1.
    weighted_files = [(SUBM_PATH + csv_name, 1) for csv_name in fold_csv_names]
    test_files = get_test_from_subm_list(weighted_files)

    out_path = SUBM_PATH + 'ensemble_retinanet_resnet101_sqr.csv'
    ensemble(test_files, 0.4, out_path, 0.01)
    print('Prediction saved: {}'.format(out_path))


def ensemble_experiment_v24_retinanet_resnet101_sqr_removed_radiologists():
    """Ensemble the five fold CSV files listed below with equal weights.

    The files are fused by ``ensemble`` with an IoU threshold of 0.4 and a
    score threshold of 0.01; the result is written to
    ``SUBM_PATH + 'ensemble_retinanet_resnet101_removed_rad.csv'``.
    """
    fold_csv_names = (
        'retina_resnet101_fold_0_0.1817_05_iou_0.3_test_result_thr_0.03_iou_0.45_test.csv',
        'retina_resnet101_fold_1_0.2072_19_iou_0.3_test_result_thr_0.01_iou_0.45_test.csv',
        'retina_resnet101_fold_2_0.1938_03_iou_0.3_test_result_thr_0.01_iou_0.4_test.csv',
        'retina_resnet101_fold_3_0.1884_07_iou_0.3_test_result_thr_0.01_iou_0.45_test.csv',
        'retina_resnet101_fold_4_0.2227_05_iou_0.3_test_result_thr_0.01_iou_0.4_test.csv',
    )
    # Every fold contributes with weight 1.
    weighted_files = [(SUBM_PATH + csv_name, 1) for csv_name in fold_csv_names]
    test_files = get_test_from_subm_list(weighted_files)

    out_path = SUBM_PATH + 'ensemble_retinanet_resnet101_removed_rad.csv'
    ensemble(test_files, 0.4, out_path, 0.01)
    print('Prediction saved: {}'.format(out_path))


if __name__ == '__main__':
    # Run both ensemble experiments one after the other, in this order.
    for run_experiment in (
        ensemble_experiment_v17_retinanet_resnet101_sqr,
        ensemble_experiment_v24_retinanet_resnet101_sqr_removed_radiologists,
    ):
        run_experiment()
