In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import pydicom
import cv2
from tqdm import tqdm
from tensorflow.keras.models import load_model
import glob

from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Wall-clock reference for the whole run: the per-exam inference loop compares
# against it to stop before its time budget is exceeded, and the last cell
# prints the total elapsed seconds.
import time 

start_time = time.time()

In [None]:
# Copy the GDCM archive from the attached input dataset into the working directory.
!cp ../input/gdcm-conda-install/gdcm.tar .

# Unpack the archive and install the bundled conda package without network access.
# NOTE(review): presumably GDCM is required so pydicom can decode compressed
# pixel data in this dataset — confirm.
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2
print("done")

In [None]:
# Side length, in pixels, that every slice is resized to before inference.
IMAGE_SIZE = 512

# Registry of loaded Keras models, keyed by task name; filled by load_models().
MODELS = {}

# Root directory of the competition dataset (note the trailing slash).
PATH_DATA = '../input/rsna-str-pulmonary-embolism-detection/'

In [None]:
def get_pixels_hu(dcm):
    """Return the pixel data of one DICOM dataset rescaled to Hounsfield units.

    Args:
        dcm: object exposing ``pixel_array``, ``RescaleIntercept`` and
            ``RescaleSlope`` (e.g. a dataset read with pydicom).

    Returns:
        ``np.int16`` array with the same shape as ``dcm.pixel_array``.
    """
    # Work on an int16 copy so the stored pixel data is never modified.
    hu = dcm.pixel_array.astype(np.int16)

    # -2000 marks pixels outside the scan; zero them before rescaling
    # (with the usual -1024 intercept they end up close to air).
    hu[hu == -2000] = 0

    rescale_intercept = dcm.RescaleIntercept
    rescale_slope = dcm.RescaleSlope

    # Only pay for the float round-trip when the slope actually scales values.
    if rescale_slope != 1:
        hu = (rescale_slope * hu.astype(np.float64)).astype(np.int16)

    hu += np.int16(rescale_intercept)

    return np.array(hu, dtype=np.int16)

def set_lungwin(img, hu=[-100, 550]):
    """Apply an intensity window to an HU image and rescale it to 8 bits.

    Args:
        img: array of Hounsfield-unit values.
        hu: ``[low, high]`` window bounds; values at or below ``low`` map to 0
            and values at or above ``high`` map to 255.

    Returns:
        ``uint8`` array of the same shape as ``img``.
    """
    # Going through a NumPy array keeps the arithmetic in a wide integer type,
    # so subtracting the lower bound from int16 pixels cannot overflow.
    window = np.array(hu)
    low = window[0]
    high = window[1]

    scaled = (img - low) / (high - low)
    np.clip(scaled, 0, 1, out=scaled)

    return (scaled * 255).astype('uint8')

def get_img(d):
    """Turn one already-read DICOM dataset into a model-ready RGB image.

    The slice is converted to Hounsfield units, windowed to 8 bits with the
    default window of ``set_lungwin``, resized to ``IMAGE_SIZE`` x
    ``IMAGE_SIZE`` and expanded from one grey channel to three.

    Args:
        d: DICOM dataset (the caller is responsible for reading the file).

    Returns:
        ``uint8`` image with three channels.
    """
    # The original emitted an empty print() per slice, which only flooded the
    # notebook output with blank lines during inference; it is dropped here.
    windowed = set_lungwin(get_pixels_hu(d))
    resized = cv2.resize(windowed, (IMAGE_SIZE, IMAGE_SIZE))
    return cv2.cvtColor(resized, cv2.COLOR_GRAY2RGB)

In [None]:
def load_models():
    """Load the four trained Keras models into the global ``MODELS`` registry.

    Keys written: ``'pe_present'``, ``'ratio'``, ``'side'`` and ``'chronic'``.
    """
    print('load models')

    # Insertion order of the dict is the loading order.
    model_files = {
        'pe_present': "../input/models-v1/pe_present_img_inception.h5",
        'ratio': "../input/models-v1/TEP_ratio_model_v2.h5",
        'side': "../input/models-v1/TEP_side_model_v2.h5",
        'chronic': "../input/models-v1/TEP_chronic_model_v1.h5",
    }
    for task, model_path in model_files.items():
        MODELS[task] = load_model(model_path)

In [None]:
def get_slices(exam_id, series_id):
    """Read one test series and split it into a kept middle part and two trimmed ends.

    All files of the series are read, ordered by the third component of
    ``ImagePositionPatient``, and the first 22% / last 15% of the slices are
    set aside.

    Args:
        exam_id: StudyInstanceUID of the exam.
        series_id: SeriesInstanceUID of the series inside that exam.

    Returns:
        Tuple ``(sop_uids, sop_uids_ini, sop_uids_end, slices)``: three
        DataFrames (kept, leading-trimmed, trailing-trimmed) with columns
        ``SOPInstanceUID``, ``StudyInstanceUID`` and ``SeriesInstanceUID``,
        plus the list of kept datasets in the same order as ``sop_uids``.
    """
    series_dir = PATH_DATA + 'test/' + exam_id + "/" + series_id + "/"

    ordered = sorted(
        (pydicom.dcmread(dicom_file) for dicom_file in glob.glob(series_dir + "*")),
        key=lambda ds: float(ds.ImagePositionPatient[2]),
    )

    total = len(ordered)
    first_kept = round(total * 0.22)
    last_kept = round(total * 0.85)

    def uid_frame(datasets):
        # One row per slice, tagged with the exam and series it belongs to.
        frame = pd.DataFrame([ds.SOPInstanceUID for ds in datasets],
                             columns=['SOPInstanceUID'])
        frame['StudyInstanceUID'] = exam_id
        frame['SeriesInstanceUID'] = series_id
        return frame

    kept = ordered[first_kept:last_kept]

    return (
        uid_frame(kept),
        uid_frame(ordered[:first_kept]),
        uid_frame(ordered[last_kept:]),
        kept,
    )

In [None]:
def create_predict_columns(df):
    """Add the per-slice prediction columns to ``df`` (in place) and return it.

    Every column starts at ``0.0``. They are created as floats on purpose:
    the columns are later filled with model probabilities through
    ``df.loc[...] = value``, and writing a float into an integer column is an
    incompatible-dtype assignment that recent pandas versions warn about.

    Args:
        df: DataFrame with one row per slice.

    Returns:
        The same DataFrame object, with nine ``*_pred`` columns appended.
    """
    prediction_columns = (
        "pe_present_on_image_pred",
        "rv_lv_ratio_gte_1_pred",
        "rv_lv_ratio_lt_1_pred",
        "leftsided_pe_pred",
        "rightsided_pe_pred",
        "central_pe_pred",
        "acute_pe_pred",
        "chronic_pe_pred",
        "acute_and_chronic_pe_pred",
    )
    for column in prediction_columns:
        df[column] = 0.0

    return df

def _store_exam_level_preds(df, idx, pred_ratio, pred_side, pred_chronic):
    """Copy the auxiliary models' class probabilities into row ``idx`` of ``df``."""
    df.loc[idx, "rv_lv_ratio_gte_1_pred"] = pred_ratio[0][0]
    df.loc[idx, "rv_lv_ratio_lt_1_pred"] = pred_ratio[0][1]

    df.loc[idx, "leftsided_pe_pred"] = pred_side[0][0]
    df.loc[idx, "rightsided_pe_pred"] = pred_side[0][1]
    df.loc[idx, "central_pe_pred"] = pred_side[0][2]

    df.loc[idx, "acute_pe_pred"] = pred_chronic[0][0]
    df.loc[idx, "chronic_pe_pred"] = pred_chronic[0][1]
    df.loc[idx, "acute_and_chronic_pe_pred"] = pred_chronic[0][2]


def predict_dataset(df, model_pe_present, model_ratio, model_side, model_chronic, slices):
    """Predict every slice of one series, running the models only on sampled key slices.

    A slice is a key slice when it is the first one or when
    ``(position - B_STEP) % N_STEP == 0``. Key slices are run through
    ``model_pe_present``; if the result exceeds ``TH_POS_IMG`` the three
    auxiliary models are run as well. All other slices reuse the predictions
    of the most recent key slice.

    Besides filling ``df``, the image-level label of every slice is written
    into the global ``sub`` DataFrame. The threshold ``TH_POS_IMG`` is also
    read from the global namespace.

    Args:
        df: one row per kept slice with ``SOPInstanceUID`` (prediction columns
            are added in place).
        model_pe_present: model giving the per-image PE score at ``[0][0]``.
        model_ratio, model_side, model_chronic: auxiliary models whose outputs
            are indexed as ``[0][k]``.
        slices: DICOM datasets in the same order as the rows of ``df``.

    Returns:
        ``df`` with the prediction columns filled.
    """
    df = create_predict_columns(df)

    # Series with fewer than 25 kept slices used to give N_STEP == 0 and crash
    # with ZeroDivisionError on the second slice; with a floor of 1 every slice
    # of such a short series is a key slice.
    N_STEP = max(len(df) // 25, 1)
    B_STEP = len(df) % 25

    # Set on the first iteration, which is always a key slice.
    pred_pe_present = None
    pred_ratio = pred_side = pred_chronic = None

    for ct, idx in enumerate(df.index):
        instance = df.loc[idx, 'SOPInstanceUID']

        if ct == 0 or (ct - B_STEP) % N_STEP == 0:
            # Key slice: run the models.
            images = np.array([get_img(slices[ct])])

            pred_pe_present = model_pe_present.predict(images)

            df.loc[idx, "pe_present_on_image_pred"] = pred_pe_present[0][0]
            sub.loc[sub["id"] == instance, "label"] = pred_pe_present[0][0]

            if pred_pe_present[0][0] > TH_POS_IMG:
                pred_ratio = model_ratio.predict(images)
                pred_side = model_side.predict(images)
                pred_chronic = model_chronic.predict(images)

                # NOTE(review): the last output of each auxiliary model is
                # presumably its "negative" class; confirm, and confirm that a
                # LOW mean here is really meant to veto the positive call.
                mean_negative_exam = (pred_ratio[0][2] + pred_side[0][3] + pred_chronic[0][3]) / 3

                if mean_negative_exam < TH_POS_IMG:
                    # Veto: cap the score below 0.5 for this slice and for the
                    # following non-key slices that reuse pred_pe_present.
                    # NOTE(review): df keeps the un-capped score for this key
                    # slice while sub gets the capped one — confirm intended.
                    pred_pe_present[0][0] = min(mean_negative_exam, 0.49)
                    sub.loc[sub["id"] == instance, "label"] = pred_pe_present[0][0]
                    continue

                _store_exam_level_preds(df, idx, pred_ratio, pred_side, pred_chronic)

        else:
            # Non-key slice: reuse the latest key-slice predictions.
            df.loc[idx, "pe_present_on_image_pred"] = pred_pe_present[0][0]
            sub.loc[sub["id"] == instance, "label"] = pred_pe_present[0][0]

            if pred_pe_present[0][0] > TH_POS_IMG:
                _store_exam_level_preds(df, idx, pred_ratio, pred_side, pred_chronic)

    return df

In [None]:
load_models()
print('load data')

from os import path

# Read the list of test slices. When the training folder is present only the
# first 2000 rows are kept.
# NOTE(review): presumably the train folder distinguishes an interactive
# session from the hidden-test rerun — confirm.
test = pd.read_csv(PATH_DATA+'/test.csv')
if path.exists('../input/rsna-str-pulmonary-embolism-detection/train'):
    test = test.head(2000)

In [None]:
# Start from the sample submission; the inference code overwrites its "label"
# values in place, row by row, matching on the "id" column.
sub = pd.read_csv("../input/rsna-str-pulmonary-embolism-detection/sample_submission.csv")

In [None]:
#sub.label = 0

In [None]:
def treat_pred(exam_pred, prob_neg):
    """Adjust exam-level predictions so they respect the label-consistency rules.

    The function is self-contained: the column lists it iterates over are
    defined locally instead of being read from globals created in a later
    cell. Missing predictions (NaN, e.g. the mean of an empty selection) are
    treated as 0.0 so that no NaN can reach the submission.

    Args:
        exam_pred: dict with the seven exam-level ``*_pred`` keys (both ratio
            labels, the three side labels, chronic and acute-and-chronic).
            It is modified in place.
        prob_neg: probability that the exam is negative for PE.

    Returns:
        The same ``exam_pred`` dict after adjustment.
    """
    ratio_gte = 'rv_lv_ratio_gte_1_pred'
    ratio_lt = 'rv_lv_ratio_lt_1_pred'
    side = ['leftsided_pe_pred', 'rightsided_pe_pred', 'central_pe_pred']
    chronic = 'chronic_pe_pred'
    acute_and_chronic = 'acute_and_chronic_pe_pred'
    classes = [ratio_gte, ratio_lt] + side + [chronic, acute_and_chronic]

    # NaN compares False against everything and min(nan, x) is nan, so an
    # unset prediction would otherwise skip every rule and be returned as NaN.
    for c in classes:
        if np.isnan(exam_pred[c]):
            exam_pred[c] = 0.0

    if prob_neg < 0.5:  # exam positive

        # Rule 1a: exactly one of the two RV/LV ratio labels must exceed 0.5.
        if exam_pred[ratio_lt] > 0.5 and exam_pred[ratio_gte] > 0.5:
            # Both high: keep the larger, set the other to its complement.
            if exam_pred[ratio_lt] > exam_pred[ratio_gte]:
                exam_pred[ratio_gte] = 1 - exam_pred[ratio_lt]
            else:
                exam_pred[ratio_lt] = 1 - exam_pred[ratio_gte]

        elif exam_pred[ratio_lt] <= 0.5 and exam_pred[ratio_gte] <= 0.5:
            # Both low (the ratio model leans towards a negative exam): push
            # the larger one to the probability of a positive exam.
            if exam_pred[ratio_lt] > exam_pred[ratio_gte]:
                exam_pred[ratio_lt] = 1 - prob_neg
                exam_pred[ratio_gte] = prob_neg
            else:
                exam_pred[ratio_gte] = 1 - prob_neg
                exam_pred[ratio_lt] = prob_neg

        # Rule 1b: at least one side label must exceed 0.5.
        if all(exam_pred[c] <= 0.5 for c in side):
            max_pred = max(exam_pred['central_pe_pred'],
                           exam_pred['rightsided_pe_pred'],
                           exam_pred['leftsided_pe_pred'])
            # Every label tied for the maximum is raised.
            for c in side:
                if exam_pred[c] == max_pred:
                    exam_pred[c] = 1 - prob_neg

        # Rule 1c: chronic and acute-and-chronic cannot both exceed 0.5.
        if exam_pred[acute_and_chronic] > 0.5 and exam_pred[chronic] > 0.5:
            # Keep the larger, set the other to its complement.
            if exam_pred[acute_and_chronic] > exam_pred[chronic]:
                exam_pred[chronic] = 1 - exam_pred[acute_and_chronic]
            else:
                exam_pred[acute_and_chronic] = 1 - exam_pred[chronic]

        # Rule 1d is not handled here: the caller writes a small constant for
        # the indeterminate label.

    else:  # exam negative
        # Rule 2b: no exam-level label may exceed the probability of the exam
        # being positive, which is at most 0.5 in this branch.
        for c in classes:
            exam_pred[c] = min(exam_pred[c], 1 - prob_neg)

    return exam_pred

In [None]:
import gc 

TH_POS_IMG = 0.5
TH_POS = 0.90

# Stop starting new exams once this much wall-clock time has passed.
# NOTE(review): presumably chosen to stay under the notebook runtime limit — confirm.
MAX_RUNTIME_SECONDS = 8.5*60*60
# Label written for the slices trimmed from both ends of every series.
TRIMMED_SLICE_LABEL = 0.000001
# Constant label written for every exam's "indeterminate" row.
INDETERMINATE_LABEL = 0.020484822355039723

# Exam-level prediction columns (loop-invariant, so defined once).
classes = ['rv_lv_ratio_gte_1_pred', 'rv_lv_ratio_lt_1_pred', 'leftsided_pe_pred', 'rightsided_pe_pred', 'central_pe_pred', 
           'chronic_pe_pred', 'acute_and_chronic_pe_pred']

ratio = ['rv_lv_ratio_gte_1_pred', 'rv_lv_ratio_lt_1_pred']

side = ['leftsided_pe_pred', 'rightsided_pe_pred', 'central_pe_pred']

chronic = ['chronic_pe_pred', 'acute_and_chronic_pe_pred']

# For each patient (exam):
for p in tqdm(test['StudyInstanceUID'].unique()):

    # Exams not reached before the time budget keep their sample-submission labels.
    if (time.time() - start_time) > MAX_RUNTIME_SECONDS:
        break

    gc.collect()

    # Get the ordered slices, with the leading and trailing fractions cut off.
    series_id = test[test['StudyInstanceUID']==p]['SeriesInstanceUID'].values[0]
    sop_uids, sop_uids_ini, sop_uids_end, slices = get_slices(p, series_id)

    # Trimmed slices are never run through the models; label them all in one pass.
    trimmed_uids = pd.concat([sop_uids_ini.SOPInstanceUID, sop_uids_end.SOPInstanceUID])
    sub.loc[sub["id"].isin(trimmed_uids), "label"] = TRIMMED_SLICE_LABEL

    # Predict the kept slices of this exam.
    exam_predictions = predict_dataset(sop_uids, MODELS['pe_present'], MODELS['ratio'], MODELS['side'], MODELS['chronic'], slices)

    # Release the DICOM datasets before the next exam is read.
    del slices

    # The exam is as negative as its most suspicious slice allows.
    prob_neg = 1 - exam_predictions.pe_present_on_image_pred.max()
    sub.loc[sub["id"]==p + '_negative_exam_for_pe', "label"] = prob_neg

    # Exam-level score per class: mean over the slices where it was predicted.
    exam_pred = {}
    for c in classes:
        fired = exam_predictions.loc[exam_predictions[c] > 0, c]
        # The mean of an empty selection is NaN, which would end up in the
        # submission; no slice with a prediction means a score of 0.
        exam_pred[c] = fired.mean() if len(fired) else 0.0

    exam_pred_treated = treat_pred(exam_pred, prob_neg)

    for c in classes:
        sub.loc[sub["id"]==p + '_' + c.replace('_pred',''), "label"] = exam_pred_treated[c]

    sub.loc[sub["id"]==p + '_indeterminate', "label"] = INDETERMINATE_LABEL
    

In [None]:
# Persist the updated submission table to the working directory (without the index column).
sub.to_csv('submission.csv', index=False)

In [None]:
def check_consistency(sub, test):
    
    '''
    Checks label consistency and returns the errors
    
    Args:
    sub   = submission dataframe (pandas) with columns "id" and "label"
    test  = test.csv dataframe (pandas) with StudyInstanceUID,
            SeriesInstanceUID and SOPInstanceUID columns
    
    Returns:
    dataframe with one row per (image, broken rule); the "broken_rule"
    column names the rule ('1a'..'1d', '2a', '2b'). An empty dataframe is
    returned when test contains no exams.
    '''
    
    # EXAM LEVEL
    # Collect one frame per exam and concatenate once (concatenating inside
    # the loop copies the accumulated frame on every iteration).
    exam_ids = test['StudyInstanceUID'].unique()
    exam_frames = []
    for i in exam_ids:
        df_tmp = sub.loc[sub.id.str.contains(i, regex = False)].reset_index(drop = True)
        # Exam-level ids look like "<StudyInstanceUID>_<label name>".
        id_parts = df_tmp['id'].str.split('_')
        df_tmp['StudyInstanceUID'] = id_parts.str[0]
        df_tmp['label_type']       = id_parts.str[1:].apply(lambda x: '_'.join(x))
        del df_tmp['id']
        exam_frames.append(df_tmp)
    if not exam_frames:
        # Nothing to check; previously this path failed on an unbound variable.
        print('Found', 0, 'inconsistent predictions')
        return pd.DataFrame()
    df = pd.concat(exam_frames, axis = 0)
    df_exam = df.pivot(index = 'StudyInstanceUID', columns = 'label_type', values = 'label')
    
    # IMAGE LEVEL
    df_image = sub.loc[sub.id.isin(test.SOPInstanceUID)].reset_index(drop = True)
    df_image = df_image.merge(test, how = 'left', left_on = 'id', right_on = 'SOPInstanceUID')
    df_image.rename(columns = {"label": "pe_present_on_image"}, inplace = True)
    del df_image['id']
    
    # MERGER
    df = df_exam.merge(df_image, how = 'left', on = 'StudyInstanceUID')
    ids    = ['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']
    labels = [c for c in df.columns if c not in ids]
    # Explicit copy so the column added below is not written to a slice view.
    df = df[ids + labels].copy()
    
    # SPLIT NEGATIVE AND POSITIVE EXAMS
    # An exam counts as positive when its highest image-level score exceeds 0.5.
    df['positive_images_in_exam'] = df['StudyInstanceUID'].map(df.groupby(['StudyInstanceUID']).pe_present_on_image.max())
    df_pos = df.loc[df.positive_images_in_exam >  0.5]
    df_neg = df.loc[df.positive_images_in_exam <= 0.5]
    
    # CHECKING CONSISTENCY OF POSITIVE EXAM LABELS
    # 1a: exactly one of the two RV/LV ratio labels must exceed 0.5
    rule1a = df_pos.loc[((df_pos.rv_lv_ratio_lt_1  >  0.5)  & 
                         (df_pos.rv_lv_ratio_gte_1 >  0.5)) | 
                        ((df_pos.rv_lv_ratio_lt_1  <= 0.5)  & 
                         (df_pos.rv_lv_ratio_gte_1 <= 0.5))].reset_index(drop = True)
    rule1a['broken_rule'] = '1a'
    # 1b: at least one location label must exceed 0.5
    rule1b = df_pos.loc[(df_pos.central_pe    <= 0.5) & 
                        (df_pos.rightsided_pe <= 0.5) & 
                        (df_pos.leftsided_pe  <= 0.5)].reset_index(drop = True)
    rule1b['broken_rule'] = '1b'
    # 1c: chronic and acute-and-chronic cannot both exceed 0.5
    rule1c = df_pos.loc[(df_pos.acute_and_chronic_pe > 0.5) & 
                        (df_pos.chronic_pe           > 0.5)].reset_index(drop = True)
    rule1c['broken_rule'] = '1c'
    # 1d: neither indeterminate nor negative may exceed 0.5
    rule1d = df_pos.loc[(df_pos.indeterminate        > 0.5) | 
                        (df_pos.negative_exam_for_pe > 0.5)].reset_index(drop = True)
    rule1d['broken_rule'] = '1d'

    # CHECKING CONSISTENCY OF NEGATIVE EXAM LABELS
    # 2a: exactly one of indeterminate / negative must exceed 0.5
    rule2a = df_neg.loc[((df_neg.indeterminate        >  0.5)  & 
                         (df_neg.negative_exam_for_pe >  0.5)) | 
                        ((df_neg.indeterminate        <= 0.5)  & 
                         (df_neg.negative_exam_for_pe <= 0.5))].reset_index(drop = True)
    rule2a['broken_rule'] = '2a'
    # 2b: no PE-related label may exceed 0.5
    rule2b = df_neg.loc[(df_neg.rv_lv_ratio_lt_1     > 0.5) | 
                        (df_neg.rv_lv_ratio_gte_1    > 0.5) |
                        (df_neg.central_pe           > 0.5) | 
                        (df_neg.rightsided_pe        > 0.5) | 
                        (df_neg.leftsided_pe         > 0.5) |
                        (df_neg.acute_and_chronic_pe > 0.5) | 
                        (df_neg.chronic_pe           > 0.5)].reset_index(drop = True)
    rule2b['broken_rule'] = '2b'
    
    # MERGING INCONSISTENT PREDICTIONS
    errors = pd.concat([rule1a, rule1b, rule1c, rule1d, rule2a, rule2b], axis = 0)
    
    # OUTPUT
    print('Found', len(errors), 'inconsistent predictions')
    return errors

In [None]:
# max_score

In [None]:
# len(check_consistency(sub, test))

In [None]:
# error = check_consistency(sub, test)

In [None]:
# error.head()

In [None]:
# Total wall-clock seconds since start_time was recorded near the top of the notebook.
print(time.time() - start_time)