# <div id="chap1">1. Load libraries and dataframes with predictions</div>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# Load the prediction table exported by each selected model
fasterrcnn = pd.read_csv('../input/vinbigdatastack/fasterrcnn.csv')
detectron = pd.read_csv('../input/vinbigdatastack/detectron2.csv')
yolo = pd.read_csv('../input/vinbigdatastack/yolov5.csv')

# The image ids are taken from the YOLOv5 table
image_ids = yolo["image_id"].values

# <div id="chap2">2. Helper functions</div>

In [None]:
def getitem(dataframe, img_id):
    """Split the PredictionString of one image into groups of six tokens.

    Parameters:
        dataframe: table with an ``image_id`` column and a
            ``PredictionString`` column.
        img_id: value looked up in ``image_id``; the first matching row is used.

    Returns:
        A dict ``{"elm_0": [...], "elm_1": [...], ...}`` whose values are lists
        of six string tokens, in the order they appear in the string. Trailing
        tokens that do not fill a complete group of six are dropped.

    Raises:
        IndexError: if no row matches ``img_id``.
    """
    matches = dataframe.loc[dataframe.image_id == img_id, "PredictionString"]

    # split() with no separator ignores leading/trailing/repeated whitespace;
    # split(' ') would emit empty tokens there and shift every group of six.
    tokens = list(matches)[0].split()

    return {
        f'elm_{pos}': tokens[pos * 6:(pos + 1) * 6]
        for pos in range(len(tokens) // 6)
    }


def sortDictByProba(dict_):
    """Convert every value to a list of floats and order entries by confidence.

    The conversion is done in place, so the dict passed in ends up holding
    float lists as well. The returned dict is a new one whose entries are
    ordered by the second element of each value (the confidence of the
    identified class), highest first.
    """
    for name in dict_:
        dict_[name] = [float(token) for token in dict_[name]]

    # pair[1][1] is the second element of the value, i.e. the confidence
    ranked = sorted(dict_.items(), key=lambda pair: pair[1][1], reverse=True)
    return dict(ranked)


def getHighestProba(*list_of_dicts, n=3):
    """Gather the first ``n`` values of every dict into a single dict.

    The value at position ``i`` of the dict at position ``index`` is stored
    under the key ``"elm_{i}_dict_{index}"``. Dicts with fewer than ``n``
    entries contribute all of their values.
    """
    output = {}
    for dict_pos, preds in enumerate(list_of_dicts):
        for rank, box in enumerate(preds.values()):
            if rank >= n:
                break
            output[f"elm_{rank}_dict_{dict_pos}"] = box

    return output


def getUnique(dict_):
    """Separate the classes that occur once from those that occur several times.

    The class of an entry is the first element of its value. Returns a tuple
    ``(singles, duplicates)`` of NumPy arrays: ``duplicates`` holds the classes
    counted more than once, ``singles`` holds the remaining classes.
    """
    labels = [box[0] for box in dict_.values()]

    values, occurrences = np.unique(labels, return_counts=True)
    duplicates = values[occurrences > 1]

    # Every distinct class that is not a duplicate is a single
    singles = np.setdiff1d(list(set(labels)), duplicates)

    return singles, duplicates


def getKeysByValue(dictOfElements, valueToFind):
    """Return the keys whose value starts with ``valueToFind``.

    Only the first element of each value is compared. Keys are returned in
    the dict's iteration order; the list is empty when nothing matches.
    """
    return [
        key
        for key, content in dictOfElements.items()
        if content[0] == valueToFind
    ]


def getListKeysByValue(dictOfElements, valuesToFind):
    """Look up the matching keys for each requested value.

    Returns one list of keys per entry of ``valuesToFind``, in the same order,
    each produced by ``getKeysByValue``.
    """
    return [getKeysByValue(dictOfElements, wanted) for wanted in valuesToFind]


def averaging(from_dict, single_keys, dupl_keys):
    """Keep the single detections as they are and average the duplicated ones.

    Parameters:
        from_dict: dict mapping a key to a six-element list
            ``[class, confidence, box1, box2, box3, box4]``.
        single_keys: keys of ``from_dict`` to copy unchanged, given either as
            a flat list of keys or as a list of lists of keys.
        dupl_keys: list of groups; each group is a list of keys of
            ``from_dict`` that share the same class. Groups may have
            different lengths.

    Returns:
        A dict holding the copied single entries under their original key,
        plus one entry ``"elm_{index}"`` per non-empty group (``index`` is the
        group's position in ``dupl_keys``). A group entry carries the class of
        the group's first member followed by the mean of elements 1 to 5 over
        all members.
    """
    output = {}

    # The key lists are iterated directly. Flattening them with np.ravel just
    # to test for emptiness fails on NumPy >= 1.24 as soon as two duplicate
    # groups have different lengths (ragged nested list).
    for entry in single_keys:
        keys = [entry] if isinstance(entry, str) else entry
        for key in keys:
            output[key] = from_dict[key]

    # For each duplicated class, average confidence and box values of all
    # its occurrences.
    for index, group in enumerate(dupl_keys):
        if len(group) == 0:
            continue

        members = [from_dict[key] for key in group]
        averaged = [
            np.mean([member[pos] for member in members])
            for pos in range(1, 6)
        ]
        output[f"elm_{index}"] = [members[0][0]] + averaged

    return output


def toString(pred_list):
    """Join a flat sequence of prediction values into a space-separated string.

    Every sixth element, starting with the first, is written as an integer;
    all other elements are written with ``str`` unchanged.
    """
    return " ".join(
        str(int(value)) if position % 6 == 0 else str(value)
        for position, value in enumerate(pred_list)
    )

# <div id="chap3">3. Run ensembling with appropriate strategy</div>

In [None]:
def main():
    """Ensemble the predictions of the three models for every image id.

    For each id in the module-level ``image_ids``, the predictions of
    ``fasterrcnn``, ``detectron`` and ``yolo`` are parsed, sorted by
    confidence, cut to the top 3 per model, and merged: classes seen once are
    kept as they are, classes seen several times are averaged.

    Returns:
        A DataFrame with the columns ``image_id`` and ``PredictionString``,
        one row per image id.
    """
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append no longer exists in pandas 2.x and copied the
    # whole frame on every iteration.
    rows = []

    for image_id in tqdm(image_ids):

        # For each model, get PredictionString of image_id as a dict sorted
        # by probability
        sorted_preds = [
            sortDictByProba(getitem(model_frame, image_id))
            for model_frame in (fasterrcnn, detectron, yolo)
        ]

        # Filter dicts into one dict with at most top n probs per model
        highest_probs = getHighestProba(*sorted_preds, n=3)

        # Get keys of unique and duplicates values in the filtered dict
        singles, duplicates = getUnique(highest_probs)
        single_keys = getListKeysByValue(highest_probs, singles)
        dupl_keys = getListKeysByValue(highest_probs, duplicates)

        # Apply averaging strategy
        stacked_dict = averaging(highest_probs, single_keys, dupl_keys)

        # Put string in right format
        prediction_int = np.ravel(list(stacked_dict.values()))
        prediction_string = toString(prediction_int)

        rows.append({"image_id": image_id,
                     "PredictionString": prediction_string})

    return pd.DataFrame(rows, columns=["image_id", "PredictionString"])

# <div id="chap4">4. Save results</div>

In [None]:
# Run the ensembling and write its result as the submission file
final_sub = main()
path = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/'

# The sample submission is still loaded so `samp_subm` stays available, but it
# is no longer written to 'submission.csv': doing so after saving `final_sub`
# replaced the ensembled predictions with the sample file.
samp_subm = pd.read_csv(path+'sample_submission.csv')
final_sub.to_csv("submission.csv", index=False)

# References

* <a href = "https://medium.com/inspiredbrilliance/object-detection-through-ensemble-of-models-fed015bc1ee0">Article on object detection through ensemble of models</a>
* detectron2 : https://www.kaggle.com/c/vinbigdata-chest-xray-abnormalities-detection/code?competitionId=24800&sortBy=scoreDescending
* fasterrcnn : https://www.kaggle.com/awsaf49/vinbigdata-cxr-ad-yolov5-14-class-infer
* yolov5 : https://www.kaggle.com/basu369victor/chest-x-ray-abnormalities-detection-submission