In [None]:
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt
import glob
import datetime
import torch
import torchvision.transforms as transforms
from torch import nn
from torch.nn import functional as F

import pydicom
import vtk
from vtk.util import numpy_support

from pydicom import dcmread
from tqdm import tqdm
import cv2
import gc
import pickle

startTime = datetime.datetime.now()

In [None]:
# The code in this cell is adapted from the public notebook by Quadcore/Richard Epstein.
# It installs GDCM without Internet access.
# GDCM is needed to read some compressed DICOM images.
# If a notebook has already hit the GDCM error, that kernel must be restarted before the files can be read, even after the GDCM software has been installed.
# Note that you do not "import GDCM". You just "import pydicom".
# The dataset (gdcm-conda-install) was provided by Ronaldo S.A. Batista. Definitely deserves an upvote!

!cp ../input/gdcm-conda-install/gdcm.tar .
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

print("GDCM installed.")

import pydicom

In [None]:
# Input locations: competition data and the dataset holding the trained models.
dataDirPath = '../input/rsna-str-pulmonary-embolism-detection/'
modelDirPath = '../input/firstbaselinemodel/'

# One row per test image; the three UID columns are read as pandas 'string' dtype
# and the image UID (SOPInstanceUID) becomes the index.
testDataDF = pd.read_csv(dataDirPath+'test.csv', dtype={'StudyInstanceUID':'string', 'SeriesInstanceUID':'string', 'SOPInstanceUID':'string'})
testDataDF = testDataDF.set_index('SOPInstanceUID')
print(testDataDF.head())

# DEBUG is True only when test.csv has exactly 146853 rows.
# NOTE(review): presumably that is the row count of the public test set seen during
# interactive runs, so the notebook only processes a 1000-row sample there - confirm.
DEBUG = (testDataDF.shape[0]==146853)
# The bare expression below has no effect in the middle of a cell (only a cell's last expression is displayed).
DEBUG
if DEBUG:
    testDataDF = testDataDF.head(1000)

In [None]:
# Distinct studies (exams) present in the test frame; inference is run once per study.
listOfStudyID = testDataDF['StudyInstanceUID'].unique()
print(len(listOfStudyID))

In [None]:
# Helper functions

def window(img, WL=50, WW=350):
    """Apply an intensity window to an image and rescale the result to 8 bits.

    Args:
        img: array-like of pixel intensities (any shape). It is not modified.
        WL: window level, i.e. the centre of the intensity window.
        WW: window width.

    Returns:
        A ``uint8`` NumPy array with the same shape as ``img``. Values are clipped
        to ``[WL - WW//2, WL + WW//2]``, shifted so the smallest clipped value
        becomes 0 and scaled so the largest becomes 255. An image that is constant
        after clipping (for example entirely outside the window) maps to all zeros.
    """
    upper, lower = WL+WW//2, WL-WW//2
    # np.clip already returns a new array, so no explicit copy of the input is needed.
    X = np.clip(img, lower, upper)
    X = X - np.min(X)
    peak = np.max(X)
    if peak == 0:
        # Every pixel is identical after clipping: dividing by zero would produce NaN
        # and an undefined uint8 cast, so return a well-defined black image instead.
        return np.zeros(np.shape(X), dtype='uint8')
    X = X / peak
    X = (X*255.0).astype('uint8')
    return X

# Per-image preprocessing applied before the CNN embedders: convert the H x W x 3 array
# to a tensor, then normalise each of the three channels with a fixed mean and std.
# NOTE(review): the mean/std values look like the usual ImageNet statistics, which assume
# inputs in [0, 1]; inferenceBlock casts images to float32 (0-255 range) before calling
# this - confirm that this matches how the embedders were trained.
data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

# Helper functions for inference
def dcmDataToImage(dcmData):
    """Convert one DICOM dataset into a 512 x 512 x 3 ``uint8`` image.

    The stored pixel values are rescaled with the dataset's RescaleSlope and
    RescaleIntercept, then three different intensity windows are applied (via
    ``window``) and stacked as the three channels of the result.

    Args:
        dcmData: a dataset exposing ``pixel_array``, ``RescaleSlope`` and
            ``RescaleIntercept`` (as returned by ``dcmread``).

    Returns:
        NumPy ``uint8`` array of shape (512, 512, 3); slices of any other size are
        resized with area interpolation.
    """
    # Slope and intercept are decimal values, so they are converted with float():
    # int() would truncate fractional values (a slope of 0.5 would become 0 and
    # wipe out the image). For integral values the result is unchanged.
    slope = float(dcmData.RescaleSlope)
    intercept = float(dcmData.RescaleIntercept)
    image = dcmData.pixel_array * slope + intercept
    image = np.stack([window(image, WL=-600, WW=1500),
                    window(image, WL=40, WW=400),
                    window(image, WL=100, WW=700)], 2)
    if image.shape[0] != 512 or image.shape[1] != 512:
        image = cv2.resize(image, (512,512), interpolation = cv2.INTER_AREA)
    return image

def getVolByVTK(PathDicom):
    """Read a directory of DICOM files into a single 3-D NumPy array using VTK.

    Args:
        PathDicom: path of the directory handed to ``vtkDICOMImageReader``.

    Returns:
        NumPy array reshaped (Fortran order) to the three extents reported by the
        reader. NOTE(review): the axes are presumably (x, y, slice) - confirm
        against VTKsliceToImage, which indexes the last axis as the slice.

    Raises:
        AssertionError: if the reader output carries more or fewer than one point-data array.
    """
    reader = vtk.vtkDICOMImageReader()
    reader.SetDirectoryName(PathDicom)
    # The reader must be updated before its extent/output can be queried.
    reader.Update()

    # Load dimensions using `GetDataExtent`: three (min, max) index pairs, inclusive
    _extent = reader.GetDataExtent()
    ConstPixelDims = [_extent[1]-_extent[0]+1, _extent[3]-_extent[2]+1, _extent[5]-_extent[4]+1]

    # Load spacing values (not used further in this function)
    ConstPixelSpacing = reader.GetPixelSpacing()

    # Get the 'vtkImageData' object from the reader
    imageData = reader.GetOutput()
    # Get the 'vtkPointData' object from the 'vtkImageData' object
    pointData = imageData.GetPointData()
    # Ensure that only one array exists within the 'vtkPointData' object
    assert (pointData.GetNumberOfArrays()==1)
    # Get the `vtkArray` (or whatever derived type) which is needed for the `numpy_support.vtk_to_numpy` function
    arrayData = pointData.GetArray(0)

    # Convert the `vtkArray` to a NumPy array
    ArrayDicom = numpy_support.vtk_to_numpy(arrayData)
    # Reshape the NumPy array to 3D using 'ConstPixelDims' as a 'shape'
    ArrayDicom = ArrayDicom.reshape(ConstPixelDims, order='F')
    return ArrayDicom

# Helper functions for inference
def VTKsliceToImage(VTKvol, index):
    """Turn one slice of a VTK-loaded volume into a 512 x 512 x 3 ``uint8`` image.

    Args:
        VTKvol: 3-D array whose last axis is indexed by ``index``.
        index: position of the slice along the last axis.

    Returns:
        The slice, transposed and flipped vertically, rendered through three
        intensity windows (one per channel) and resized to 512 x 512 if needed.
    """
    sliceOfInterest = VTKvol[:, :, index]
    oriented = np.flipud(sliceOfInterest.T.copy())

    # (window level, window width) for each of the three output channels
    windowSettings = ((-600, 1500), (40, 400), (100, 700))
    channels = [window(oriented, WL=eachLevel, WW=eachWidth)
                for eachLevel, eachWidth in windowSettings]
    stacked = np.stack(channels, 2)

    alreadyTargetSize = stacked.shape[0] == 512 and stacked.shape[1] == 512
    if not alreadyTargetSize:
        stacked = cv2.resize(stacked, (512,512), interpolation = cv2.INTER_AREA)
    return stacked

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive pieces of length ``n``.

    The final piece is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    total = len(lst)
    yield from (lst[start:start + n] for start in range(0, total, n))

def sortByImgPosHelper(dcmDataDict):
    """Sort key for per-image dictionaries: the value stored under 'img_pos'."""
    imagePosition = dcmDataDict['img_pos']
    return imagePosition

In [None]:
# Helper functions to ensure label consistency

#0 Negative for PE 
#1 Indeterminate 
#2 Chronic 
#3 Acute & Chronic 
#4 Central PE 
#5 Left PE 
#6 Right PE 
#7 RV/LV Ratio >= 1 
#8 RV/LV Ratio < 1

# Methods to ensure there is no logical inconsistency in the predicted labels.
# Suppose you have two NumPy arrays: array A for the image-level labels and array B for the study-level labels.
# Use modifyAll(A, B) to adjust the labels so that they are logically consistent, as required by the competition rules.

def negativeOrIndeterminate(imgLvlLabels, stdLvlLabels):
    """Make 'negative' / 'indeterminate' consistent with the image-level labels.

    A label counts as positive only when it is strictly greater than 0.5.
    Both arguments are modified in place.

    Args:
        imgLvlLabels: 1-D array of per-image PE probabilities for one study.
        stdLvlLabels: 1-D array of the 9 study-level probabilities; index 0 is
            'negative for PE' and index 1 is 'indeterminate'.
    """
    workingArray = stdLvlLabels[:2]
    # Part 1 - only one of negative or indeterminate can be positive:
    # if both are >0.5, suppress the lower one to at most 0.49.
    if workingArray[0]>0.5 and workingArray[1]>0.5:
        minInd = np.argmin(workingArray)
        workingArray[minInd] = min(workingArray[minInd],0.49)
    if max(workingArray)>0.5:
        # Part 2 - the study is negative or indeterminate, so no image may be
        # positive: cap every image-level label at 0.49.
        imgLvlLabels[:] = np.minimum(imgLvlLabels,0.49)[:]
    elif max(imgLvlLabels)<=0.5:
        # Part 3 - neither negative nor indeterminate is positive, so the study is
        # positive and at least one image has to be positive too. The comparisons
        # are inclusive so that values of exactly 0.5 (which are not positive) are
        # handled as well instead of being left inconsistent.
        maxInd = np.argmax(imgLvlLabels)
        imgLvlLabels[maxInd] = max(imgLvlLabels[maxInd],0.51)
    # Write back explicitly so list inputs (where the slice is a copy) are updated too.
    stdLvlLabels[:2] = workingArray
    return

def rightLeftCentral(imgLvlLabels, stdLvlLabels):
    """Make the central / left / right PE labels agree with the image-level labels.

    Operates in place on ``stdLvlLabels[4:7]``:
    * no image above 0.5  -> all three location labels are capped at 0.49;
    * some image above 0.5 -> the largest location label is lifted to at least 0.51.
    """
    locationLabels = stdLvlLabels[4:7]
    anyPositiveImage = max(imgLvlLabels)>0.5
    if not anyPositiveImage:
        # Negative exam: none of the location labels may be positive.
        locationLabels[:] = np.minimum(locationLabels,0.49)[:]
    else:
        # Positive exam: at least one location label has to be positive.
        strongest = np.argmax(locationLabels)
        if locationLabels[strongest] < 0.51:
            locationLabels[strongest] = 0.51
    stdLvlLabels[4:7] = locationLabels
    return

def rv_lv_ratio(imgLvlLabels, stdLvlLabels):
    """Make the two RV/LV ratio labels agree with the image-level labels.

    Operates in place on ``stdLvlLabels[7:9]``:
    * no image above 0.5  -> both ratio labels are capped at 0.49;
    * some image above 0.5 -> the larger ratio label is lifted to at least 0.51;
    * in either case the two labels may not both stay above 0.5 - the smaller
      one is then capped at 0.49.
    """
    ratioLabels = stdLvlLabels[7:9]

    anyPositiveImage = max(imgLvlLabels)>0.5
    if not anyPositiveImage:
        # Negative exam: neither ratio label may be positive.
        ratioLabels[:] = np.minimum(ratioLabels,0.49)[:]
    else:
        # Positive exam: exactly one ratio label must end up positive; start by
        # making sure the larger one is.
        larger = np.argmax(ratioLabels)
        ratioLabels[larger] = max(ratioLabels[larger],0.51)

    bothPositive = min(ratioLabels)>0.5
    if bothPositive:
        # Only one of the two may be positive: demote the smaller one.
        smaller = np.argmin(ratioLabels)
        ratioLabels[smaller] = min(ratioLabels[smaller],0.49)

    stdLvlLabels[7:9] = ratioLabels
    return

def chronicOrAcuteChronic(imgLvlLabels, stdLvlLabels):
    """Make 'chronic' / 'acute & chronic' consistent with the image-level labels.

    Operates in place on ``stdLvlLabels[2:4]``:
    * no image above 0.5 (negative exam) -> both labels are capped at 0.49,
      because a negative exam may not carry any positive PE-related label
      (this is what rule 2b of check_consistency verifies);
    * otherwise the two labels may not both be positive - the smaller one is
      capped at 0.49.

    Args:
        imgLvlLabels: 1-D array of per-image PE probabilities for one study.
        stdLvlLabels: 1-D array of the 9 study-level probabilities; index 2 is
            'chronic' and index 3 is 'acute & chronic'.
    """
    workingArray = stdLvlLabels[2:4]
    # An exam is positive only if at least one image is strictly above 0.5.
    examIsPositive = len(imgLvlLabels) > 0 and max(imgLvlLabels) > 0.5
    if not examIsPositive:
        workingArray[:] = np.minimum(workingArray,0.49)[:]
    elif min(workingArray)>0.5:
        # Both can't be simultaneously positive: only one of the two may remain.
        minInd = np.argmin(workingArray)
        workingArray[minInd] = min(workingArray[minInd],0.49)
    stdLvlLabels[2:4] = workingArray
    return

def modifyAll(imgLvlLabels, stdLvlLabels):
    """Apply every label-consistency rule, in order, to one study's predictions.

    Both arrays are handed to each rule function in turn; nothing is returned.
    The order matters: the first rule may change the image-level labels that the
    later rules read.
    """
    consistencyRules = (
        negativeOrIndeterminate,
        rightLeftCentral,
        rv_lv_ratio,
        chronicOrAcuteChronic,
    )
    for applyRule in consistencyRules:
        applyRule(imgLvlLabels, stdLvlLabels)
    return

In [None]:
# Hyper-parameters matching the BiGRU constructor arguments defined below.
# NOTE(review): they are not referenced elsewhere in the visible notebook (the RNNs are
# loaded from saved files) - presumably kept to document the training configuration.
INPUT_SIZE = 64
HIDDEN_SIZE = 32
NUM_LAYERS = 1
NUM_CLASSES = 1

class BiGRU(nn.Module):
    """Bidirectional GRU producing per-image and per-study logits for one sequence.

    The attribute names (``GRU``, ``linear1`` ... ``linear4``, ``hidden_size``,
    ``num_layers``) are kept unchanged so that models saved with this class can
    still be loaded.

    Args:
        input_size: size of each per-image embedding vector.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        num_classes: number of image-level outputs per time step.
    """
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.GRU = nn.GRU(
            input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
        )
        # Image-level head (applied to every time step).
        self.linear1 = nn.Linear(hidden_size*2, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)
        # Study-level head (applied to the final hidden states); 9 study-level labels.
        self.linear3 = nn.Linear(hidden_size*2, hidden_size)
        self.linear4 = nn.Linear(hidden_size, 9)

    def forward(self, x):
        """Compute logits for a batch of embedding sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_size).

        Returns:
            Tuple ``(imageLevelOutputs, studyLevelOutputs)`` of raw logits with
            shapes (batch, seq_len) when ``num_classes == 1`` (otherwise
            (batch, seq_len, num_classes)) and (batch, 9).
        """
        # Create the initial state on the same device/dtype as the input instead of
        # hard-coding .cuda(), so the model also runs on CPU.
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)

        out, h_n = self.GRU(x, h0)

        # nn.Linear acts on the last dimension, so the image-level head can be
        # applied to all time steps at once instead of looping over them.
        imageLevelOutputs = self.linear2(F.relu(self.linear1(out))).squeeze(2)

        # Final forward and backward hidden states of the last GRU layer, joined per
        # sample. For batch size 1 and a single layer this equals h_n.view(1, -1),
        # but it also works for larger batches and stacked layers.
        finalState = torch.cat([h_n[-2], h_n[-1]], dim=1)
        studyLevelOutputs = self.linear4(F.relu(self.linear3(finalState)))

        return (imageLevelOutputs, studyLevelOutputs)

In [None]:
# List all models here
#
# NOTE(review): torch.load and pickle.load both unpickle arbitrary Python objects, which
# can execute code. Only load model files from a trusted source.

def _loadTorchModel(fileName):
    """Load a saved torch model from modelDirPath and switch it to eval mode.

    Returns:
        Tuple (modelPath, model) so the module-level ``*_Path`` names stay available.
    """
    modelPath = modelDirPath+fileName
    model = torch.load(modelPath)
    model.eval()
    return modelPath, model

def _loadPickledModel(fileName):
    """Unpickle one model file from modelDirPath, closing the file handle afterwards."""
    with open(modelDirPath+fileName, 'rb') as modelFile:
        return pickle.load(modelFile)

# CV4: one CNN embedder and the four sequence models that consume its embeddings
CNNmodel1_Path, CNNembedderModel1 = _loadTorchModel('CNNmodel_01_epoch1_CV4_20201008_2252_embedder.pth')
RNNmodel11_Path, RNNmodel11 = _loadTorchModel('CNNmodel_01_epoch1_CV4_20201008_2252_sequence4_20201010_1528.pth')
RNNmodel12_Path, RNNmodel12 = _loadTorchModel('CNNmodel_01_epoch1_CV4_20201008_2252_sequence7_20201022_0539.pth')
RNNmodel13_Path, RNNmodel13 = _loadTorchModel('CNNmodel_01_epoch1_CV4_20201008_2252_sequence5_20201022_0415.pth')
RNNmodel14_Path, RNNmodel14 = _loadTorchModel('CNNmodel_01_epoch1_CV4_20201008_2252_sequence7_20201022_1749.pth')

# CV0: one CNN embedder and its two sequence models
CNNmodel2_Path, CNNembedderModel2 = _loadTorchModel('CNNmodel_01_cv0_epoch1_20201014_0012_embedder.pth')
RNNmodel21_Path, RNNmodel21 = _loadTorchModel('CNNmodel_01_cv0_epoch1_20201014_0012_sequence5_20201020_1653.pth')
RNNmodel22_Path, RNNmodel22 = _loadTorchModel('CNNmodel_01_cv0_epoch1_20201014_0012_sequence8_20201020_2122.pth')

# CV3: one CNN embedder and its two sequence models
CNNmodel3_Path, CNNembedderModel3 = _loadTorchModel('CNNmodel_01_cv3_epoch2_20201018_1153_embedder.pth')
RNNmodel31_Path, RNNmodel31 = _loadTorchModel('CNNmodel_01_cv3_epoch2_20201018_1153_sequence2_20201019_0622.pth')
RNNmodel32_Path, RNNmodel32 = _loadTorchModel('CNNmodel_01_cv3_epoch2_20201018_1153_sequence8_20201023_0529.pth')

# log regression models (one per study-level label that inferenceBlock re-estimates)
logRegModel_indeterminate = _loadPickledModel('logRegModel_indeterminate.sav')
logRegModel_chronic_pe = _loadPickledModel('logRegModel_chronic_pe.sav')
logRegModel_acute_and_chronic_pe = _loadPickledModel('logRegModel_acute_and_chronic_pe.sav')
logRegModel_rv_lv_ratio_gte_1 = _loadPickledModel('logRegModel_rv_lv_ratio_gte_1.sav')
logRegModel_rv_lv_ratio_lt_1 = _loadPickledModel('logRegModel_rv_lv_ratio_lt_1.sav')

In [None]:
# Study level labels, in the order used for the 9-element study-level prediction
# arrays (index 0 = negative_exam_for_pe ... index 8 = rv_lv_ratio_lt_1).
listOfMetricLabels = ['negative_exam_for_pe', 'indeterminate',
                        'chronic_pe', 'acute_and_chronic_pe',
                        'central_pe', 'leftsided_pe', 'rightsided_pe',
                        'rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1']

# Fallback probability per label, used by the contingency plans when model inference fails.
# NOTE(review): presumably label frequencies measured on the training data - confirm.
defaultScore = {'pe_present_on_image': 0.053915069524414806,
                 'negative_exam_for_pe': 0.6763928618101033,
                 'rv_lv_ratio_gte_1': 0.12875001256566257,
                 'rv_lv_ratio_lt_1': 0.17437230326919448,
                 'leftsided_pe': 0.21089872969528548,
                 'chronic_pe': 0.040139752506710064,
                 'rightsided_pe': 0.2575653665766779,
                 'acute_and_chronic_pe': 0.019458347341720122,
                 'central_pe': 0.054468517151291695,
                 'indeterminate': 0.020484822355039723}

# Fallback image-level probability for each of eight equal-width bins of the relative
# slice position; sliceBins holds the lower edge of each bin.
q_weighted_means = [0.00326324, 0.05970682, 0.32645303, 0.67452216, 0.71344817, 0.4734337, 0.0740926, 0.00369781]
sliceBins = [eachBin/8 for eachBin in range(8)]

def getBinnedProb(sliceLoc):
    """Return the binned fallback probability for a relative slice position.

    The value of the last bin whose lower edge is <= ``sliceLoc`` is returned;
    0 is returned when ``sliceLoc`` lies below every edge.
    """
    # Walk the bins from the highest edge down: the first hit is the last matching bin.
    for binEdge, binProb in zip(reversed(sliceBins), reversed(q_weighted_means)):
        if sliceLoc>=binEdge:
            return(binProb)
    return(0)

In [None]:
# Helper functions
def RNNinferenceHelper(RNNmodel, embeddingVol):
    """Run one sequence model and return its outputs as probability arrays.

    Args:
        RNNmodel: callable returning ``(imageLevelLogits, studyLevelLogits)``.
        embeddingVol: input passed straight to ``RNNmodel``.

    Returns:
        Tuple of two NumPy arrays: sigmoid of each output with its leading
        dimension squeezed away.
    """
    def _toProbabilityArray(logits):
        return torch.sigmoid(logits).squeeze(0).cpu().detach().numpy()

    imageLogits, studyLogits = RNNmodel(embeddingVol)
    return _toProbabilityArray(imageLogits), _toProbabilityArray(studyLogits)

In [None]:
# Number of studies that fell through to contingencyPlanC (incremented there).
numExceptions = 0
# Accumulates [id, label] rows for the final submission frame.
submissionList = []
# Not referenced anywhere else in the visible notebook.
pred = 0.5

In [None]:
def inferenceBlock(eachStudyID):
    """Run the full CNN + RNN ensemble on one study and queue its submission rows.

    Steps: read every DICOM of the study, order the slices by the third component
    of tag (0020,0032) in descending order, embed them in chunks of 48 with the
    three CNN embedders, run the eight sequence models, average their
    probabilities with equal weights, re-estimate five study-level labels with
    the logistic-regression models, enforce label consistency and finally append
    the rows to the global ``submissionList``.

    Rows are only appended after every step has succeeded, so a failure never
    leaves partial rows behind for the fallback plans to duplicate.

    Args:
        eachStudyID: StudyInstanceUID of the study to process.
    """
    thisStudyDF = testDataDF[testDataDF['StudyInstanceUID']==eachStudyID]
    listOfImageIDs = thisStudyDF.index

    listOfDcm_dataDict = []
    for eachImageID in listOfImageIDs:
        eachImagePath = dataDirPath+'/test/'+thisStudyDF.loc[eachImageID, 'StudyInstanceUID']+'/'+thisStudyDF.loc[eachImageID, 'SeriesInstanceUID']+'/'+eachImageID+'.dcm'
        dcm_data = dcmread(eachImagePath)
        img_pos = dcm_data[0x20, 0x32].value[2]
        listOfDcm_dataDict.append({'imageID':eachImageID, 'dcm_data':dcm_data, 'img_pos':img_pos})

    listOfDcm_dataDict.sort(key=sortByImgPosHelper, reverse=True)

    # One embedding list per CNN embedder; every chunk is pushed through all three.
    embedders = (CNNembedderModel1, CNNembedderModel2, CNNembedderModel3)
    embeddingLists = [[] for _ in embedders]
    for eachChunk in chunks(listOfDcm_dataDict, 48):
        listOfTensors = [data_transform(dcmDataToImage(eachEntry['dcm_data']).astype(np.float32))
                         for eachEntry in eachChunk]
        stackedImagesTensor = torch.stack(listOfTensors, dim=0).cuda()
        for eachEmbedder, eachEmbeddingList in zip(embedders, embeddingLists):
            eachEmbeddingList.append(eachEmbedder(stackedImagesTensor).detach())

    # Shape each study's embeddings as a single-sequence batch: (1, numSlices, embeddingSize).
    embeddingVols = [torch.cat(eachEmbeddingList, dim=0).unsqueeze(0)
                     for eachEmbeddingList in embeddingLists]

    # Each sequence model is paired with the index of the embedder it was trained on.
    ensemble = [(RNNmodel11, 0), (RNNmodel12, 0), (RNNmodel13, 0), (RNNmodel14, 0),
                (RNNmodel21, 1), (RNNmodel22, 1),
                (RNNmodel31, 2), (RNNmodel32, 2)]
    ensembleWeight = 1.0/len(ensemble)  # equal weights: 0.125 each

    imageLevelLabels = None
    studyLevelLabels = None
    for eachRNNmodel, embedderIndex in ensemble:
        eachImageLabels, eachStudyLabels = RNNinferenceHelper(eachRNNmodel, embeddingVols[embedderIndex])
        if imageLevelLabels is None:
            # The product is a fresh array, so later in-place edits never touch model outputs.
            imageLevelLabels = ensembleWeight*eachImageLabels
            studyLevelLabels = ensembleWeight*eachStudyLabels
        else:
            imageLevelLabels = imageLevelLabels + ensembleWeight*eachImageLabels
            studyLevelLabels = studyLevelLabels + ensembleWeight*eachStudyLabels

    # The logistic-regression models all see the same, un-modified ensemble output.
    studyLevelLabels_copy = np.copy(studyLevelLabels)
    studyLevelLabels_copy = studyLevelLabels_copy[np.newaxis,:]
    studyLevelLabels[1] = logRegModel_indeterminate.predict_proba(studyLevelLabels_copy)[0,1]
    studyLevelLabels[2] = logRegModel_chronic_pe.predict_proba(studyLevelLabels_copy)[0,1]
    studyLevelLabels[3] = logRegModel_acute_and_chronic_pe.predict_proba(studyLevelLabels_copy)[0,1]
    studyLevelLabels[7] = logRegModel_rv_lv_ratio_gte_1.predict_proba(studyLevelLabels_copy)[0,1]
    studyLevelLabels[8] = logRegModel_rv_lv_ratio_lt_1.predict_proba(studyLevelLabels_copy)[0,1]

    modifyAll(imageLevelLabels, studyLevelLabels)

    # Build all rows first, then commit them in one step.
    studyRows = []
    # imageLevelLabels (same order as the sorted slices)
    for eachIndex in range(len(listOfDcm_dataDict)):
        studyRows.append([listOfDcm_dataDict[eachIndex]['imageID'], imageLevelLabels[eachIndex]])

    # studyLevelLabels
    for eachIndex, eachMetric in enumerate(listOfMetricLabels):
        studyRows.append([eachStudyID+'_'+eachMetric, studyLevelLabels[eachIndex]])

    submissionList.extend(studyRows)
    return

def contingencyPlanB(eachStudyID):
    """Fallback when model inference fails: position-based default predictions.

    The slices are ordered exactly as in inferenceBlock (descending third component
    of tag (0020,0032)); each image gets the binned probability for its relative
    position in the stack and each study-level label gets its default score.
    Consistency rules are applied before the rows are appended to the global
    ``submissionList``.

    Args:
        eachStudyID: StudyInstanceUID of the study to process.
    """
    # Contingency plan B
    thisStudyDF = testDataDF[testDataDF['StudyInstanceUID']==eachStudyID]
    listOfImageIDs = thisStudyDF.index

    listOfDcm_dataDict = []
    for eachImageID in listOfImageIDs:
        eachImagePath = dataDirPath+'/test/'+thisStudyDF.loc[eachImageID, 'StudyInstanceUID']+'/'+thisStudyDF.loc[eachImageID, 'SeriesInstanceUID']+'/'+eachImageID+'.dcm'
        # Only the position tag is needed here, so the pixel data is not read at all.
        dcm_data = dcmread(eachImagePath, stop_before_pixels=True)
        img_pos = dcm_data[0x20, 0x32].value[2]
        listOfDcm_dataDict.append({'imageID':eachImageID, 'img_pos':img_pos})

    listOfDcm_dataDict.sort(key=sortByImgPosHelper, reverse=True)

    numSlices = len(listOfDcm_dataDict)
    # Relative position runs from 0 (first slice) to 1 (last slice). A single-slice
    # study would divide by zero, so the denominator is clamped to 1 (position 0).
    lastSliceIndex = max(numSlices-1, 1)
    imageLevelLabels = np.array([getBinnedProb(eachIndex/lastSliceIndex) for eachIndex in range(numSlices)],
                                dtype=float)
    studyLevelLabels = np.array([defaultScore[eachMetric] for eachMetric in listOfMetricLabels],
                                dtype=float)

    modifyAll(imageLevelLabels, studyLevelLabels)

    # Build all rows first, then commit them in one step.
    studyRows = []
    # imageLevelLabels
    for eachEntry, eachLabel in zip(listOfDcm_dataDict, imageLevelLabels):
        studyRows.append([eachEntry['imageID'], eachLabel])

    # studyLevelLabels
    for eachMetric, eachLabel in zip(listOfMetricLabels, studyLevelLabels):
        studyRows.append([eachStudyID+'_'+eachMetric, eachLabel])

    submissionList.extend(studyRows)
    return

def contingencyPlanC(eachStudyID):
    """Last-resort fallback: constant default predictions without reading any DICOM.

    Every image of the study gets the default 'pe_present_on_image' score and every
    study-level label gets its default score; consistency rules are applied and the
    rows are appended to the global ``submissionList``. Each call increments the
    global ``numExceptions`` counter.

    Args:
        eachStudyID: StudyInstanceUID of the study to process.
    """
    global numExceptions
    numExceptions += 1
    # Contingency plan C
    thisStudyDF = testDataDF[testDataDF['StudyInstanceUID']==eachStudyID]
    listOfImageIDs = thisStudyDF.index
    imageLevelLabels = np.full(len(listOfImageIDs), defaultScore['pe_present_on_image'])
    studyLevelLabels = np.array([defaultScore[eachMetric] for eachMetric in listOfMetricLabels],
                                dtype=float)

    modifyAll(imageLevelLabels, studyLevelLabels)

    studyRows = []
    # imageLevelLabels: pair every image with its own label. (Indexing with the
    # leftover loop variable of the study-level loop always read element 8 and
    # raised IndexError for studies with fewer than nine images.)
    for eachImageID, eachLabel in zip(listOfImageIDs, imageLevelLabels):
        studyRows.append([eachImageID, eachLabel])

    # studyLevelLabels
    for eachMetric, eachLabel in zip(listOfMetricLabels, studyLevelLabels):
        studyRows.append([eachStudyID+'_'+eachMetric, eachLabel])

    submissionList.extend(studyRows)
    return

# Run inference study by study, degrading gracefully:
# full model inference -> position-based defaults (plan B) -> constant defaults (plan C).
with torch.no_grad():
    for eachStudyID in listOfStudyID:
        gc.collect()
        # Remember where this study's rows start so a failed attempt can be rolled
        # back and never leaves duplicate ids in the submission.
        rowsBeforeStudy = len(submissionList)
        try:
            inferenceBlock(eachStudyID)
        except Exception:
            # 'except Exception' still catches every runtime failure but, unlike a
            # bare 'except:', lets KeyboardInterrupt / SystemExit stop the run.
            del submissionList[rowsBeforeStudy:]
            try:
                contingencyPlanB(eachStudyID)
            except Exception:
                del submissionList[rowsBeforeStudy:]
                contingencyPlanC(eachStudyID)

submissionDF = pd.DataFrame(submissionList, columns = ['id','label'])
# fillna returns a new object, so the result has to be assigned to take effect.
submissionDF['label'] = submissionDF['label'].fillna(0.5)
print('finish')

In [None]:
# Total number of submission rows (image-level rows plus 9 study-level rows per study).
print(len(submissionDF))

In [None]:
def check_consistency(sub, test):
    '''
    Checks label consistency and returns the errors

    A label counts as positive when it is > 0.5. An exam counts as positive when
    at least one of its images is positive.

    Args:
    sub   = submission dataframe (pandas) with columns 'id' and 'label'; image rows
            use the SOPInstanceUID as id, exam rows use '<StudyInstanceUID>_<label name>'
    test  = test.csv dataframe (pandas) with StudyInstanceUID, SeriesInstanceUID and
            SOPInstanceUID as columns

    Returns:
    dataframe with one row per (image, broken rule); the rule is named in the
    'broken_rule' column ('1a'-'1d' for positive exams, '2a'-'2b' for negative exams)
    '''

    ids = ['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']

    def _flag(frame, mask, ruleName):
        # Rows of `frame` selected by `mask`, tagged with the rule they break.
        broken = frame.loc[mask].reset_index(drop = True)
        broken['broken_rule'] = ruleName
        return broken

    # EXAM LEVEL
    # Split every id once at its first underscore and keep the rows whose prefix is
    # exactly a study UID from `test`. Exact matching (instead of substring search
    # per study) cannot attribute a row to the wrong study, and the long table is
    # built in one pass instead of growing by repeated concatenation.
    idParts = sub['id'].str.partition('_')
    isExamRow = (idParts[1] == '_') & idParts[0].isin(test['StudyInstanceUID'].unique())
    df_exam = pd.DataFrame({'StudyInstanceUID': idParts.loc[isExamRow, 0],
                            'label_type':       idParts.loc[isExamRow, 2],
                            'label':            sub.loc[isExamRow, 'label']})
    df_exam = df_exam.pivot(index = 'StudyInstanceUID', columns = 'label_type', values = 'label')

    # IMAGE LEVEL
    df_image = sub.loc[sub['id'].isin(test['SOPInstanceUID'])].reset_index(drop = True)
    df_image = df_image.merge(test, how = 'left', left_on = 'id', right_on = 'SOPInstanceUID')
    df_image = df_image.rename(columns = {"label": "pe_present_on_image"}).drop(columns = 'id')

    # MERGER (one row per image, carrying the labels of its exam)
    df = df_exam.reset_index().merge(df_image, how = 'left', on = 'StudyInstanceUID')
    labels = [c for c in df.columns if c not in ids]
    df = df[ids + labels].copy()

    # SPLIT NEGATIVE AND POSITIVE EXAMS
    df['positive_images_in_exam'] = df.groupby('StudyInstanceUID')['pe_present_on_image'].transform('max')
    df_pos = df.loc[df['positive_images_in_exam'] >  0.5]
    df_neg = df.loc[df['positive_images_in_exam'] <= 0.5]

    # CHECKING CONSISTENCY OF POSITIVE EXAM LABELS
    # 1a: exactly one of the two RV/LV ratio labels must be positive
    rule1a = _flag(df_pos, ((df_pos['rv_lv_ratio_lt_1']  >  0.5)  &
                            (df_pos['rv_lv_ratio_gte_1'] >  0.5)) |
                           ((df_pos['rv_lv_ratio_lt_1']  <= 0.5)  &
                            (df_pos['rv_lv_ratio_gte_1'] <= 0.5)), '1a')
    # 1b: at least one location label must be positive
    rule1b = _flag(df_pos, (df_pos['central_pe']    <= 0.5) &
                           (df_pos['rightsided_pe'] <= 0.5) &
                           (df_pos['leftsided_pe']  <= 0.5), '1b')
    # 1c: chronic and acute-and-chronic may not both be positive
    rule1c = _flag(df_pos, (df_pos['acute_and_chronic_pe'] > 0.5) &
                           (df_pos['chronic_pe']           > 0.5), '1c')
    # 1d: a positive exam may be neither indeterminate nor negative
    rule1d = _flag(df_pos, (df_pos['indeterminate']        > 0.5) |
                           (df_pos['negative_exam_for_pe'] > 0.5), '1d')

    # CHECKING CONSISTENCY OF NEGATIVE EXAM LABELS
    # 2a: exactly one of indeterminate / negative must be positive
    rule2a = _flag(df_neg, ((df_neg['indeterminate']        >  0.5)  &
                            (df_neg['negative_exam_for_pe'] >  0.5)) |
                           ((df_neg['indeterminate']        <= 0.5)  &
                            (df_neg['negative_exam_for_pe'] <= 0.5)), '2a')
    # 2b: no PE-related label may be positive
    rule2b = _flag(df_neg, (df_neg['rv_lv_ratio_lt_1']     > 0.5) |
                           (df_neg['rv_lv_ratio_gte_1']    > 0.5) |
                           (df_neg['central_pe']           > 0.5) |
                           (df_neg['rightsided_pe']        > 0.5) |
                           (df_neg['leftsided_pe']         > 0.5) |
                           (df_neg['acute_and_chronic_pe'] > 0.5) |
                           (df_neg['chronic_pe']           > 0.5), '2b')

    # MERGING INCONSISTENT PREDICTIONS
    errors = pd.concat([rule1a, rule1b, rule1c, rule1d, rule2a, rule2b], axis = 0)

    # OUTPUT
    print('Found', len(errors), 'inconsistent predictions')
    return errors

In [None]:
# Report how many studies needed the last-resort fallback (contingencyPlanC).
print('numExceptions:', numExceptions)

# Re-read test.csv with default dtypes (SOPInstanceUID stays a regular column, which
# check_consistency relies on) and validate the submission against the label rules.
test  = pd.read_csv(dataDirPath+'test.csv')
consistencyErrors = check_consistency(submissionDF, test)

# The guard below is disabled: the submission is written regardless of the number of
# exceptions or consistency errors.
#if numExceptions < 50 and len(consistencyErrors) == 0: 
submissionDF.to_csv('submission.csv', index=False)

In [None]:
# Total wall-clock time since startTime was recorded in the first cell.
endTime = datetime.datetime.now()
print(endTime-startTime)