<h1>Pre-process I used in the competition</h1>

I used the information contained in the images by performing lung segmentation first.

I intended to use this library:
https://github.com/JoHof/lungmask

The library can detect separate masks for the left and right lungs.
However, I could not use it, because it seems that skimage could not be imported in this competition.
Therefore I used another segmentation method, which I found in [this notebook](https://www.kaggle.com/super13579/ed-simple-method-of-lung-segmentation) posted on Kaggle by @funkyboy.

After obtaining a mask, I had no other way to tell the right lung from the left one, so I used a rough method: I split the mask image vertically into two halves and assumed that the left lung is in the left half of the mask and the right lung is in the right half. I then calculated the area of the left and right lungs, derived the volume from these areas, and used this information as features.

I did not use a CNN because I believed the processing time would be longer than the time allowed for this competition.




In [None]:
!conda install -c conda-forge gdcm -y

In [None]:
import os
import sys
import math
import random
from tqdm import tqdm
import itertools
#import fill_voids
import logging
import datetime
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold
from sklearn.decomposition import PCA

from PIL import Image
import pydicom as dcm
from pydicom.pixel_data_handlers.util import apply_modality_lut
from pydicom.pixel_data_handlers.gdcm_handler import *

import glob
from skimage import morphology, io, color, exposure, img_as_float, transform
import skimage.measure

from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator

#from lung-segmentation.lungmask import mask, utils
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from skimage.morphology import disk, dilation, binary_erosion, binary_closing
from skimage.filters import roberts, sobel
import cv2
from scipy import ndimage as ndi

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset


In [None]:
# Locations of the competition data on Kaggle.
ROOT = '/kaggle/input/rsna-str-pulmonary-embolism-detection'
PATH_TRAIN_IMAGES = ROOT + '/train/'
PATH_TEST_IMAGES = ROOT + '/test/'

# New feature columns added to every table; all of them start at 0 and are
# filled in later by the segmentation cells.
_new_feature_defaults = dict.fromkeys(
    [
        'area',
        'area_real',
        'area_mean_per_study',
        'area_min_per_study',
        'area_max_per_study',
        'pixel_spacing_height',
        'pixel_spacing_width',
        'slice_thickness',
    ],
    0,
)

df_train = pd.read_csv(ROOT + '/train.csv').assign(**_new_feature_defaults)
df_test = pd.read_csv(ROOT + '/test.csv').assign(**_new_feature_defaults)
sub = pd.read_csv(ROOT + '/sample_submission.csv').assign(**_new_feature_defaults)
      

Classes I used:

In [None]:
class DataImage():
    """Container for the data extracted from a single image file.

    Attributes:
        substudy, study, file_name: identifiers, empty strings by default.
        pixel_spacing_height, pixel_spacing_width, slice_thickness:
            acquisition geometry, 0 by default.
        np_lung, np_mask: image arrays; 512x512 zero-filled placeholders
            until real data is assigned.
        lung_area: segmented area, 0 by default (presumably a pixel count --
            confirm against the code that fills it).
        lung_area_real: segmented area scaled by the pixel spacing,
            0 by default.
        lung_area_1, lung_area_2, real_lung_area_1, real_lung_area_2:
            0 by default; not filled in by this class (presumably the
            per-lung areas -- confirm against the code that sets them).
    """

    def __init__(self):
        self.substudy = ''
        self.study = ''
        self.file_name = ''
        self.pixel_spacing_height = 0
        self.pixel_spacing_width = 0
        self.slice_thickness = 0
        # np.array((512, 512)) would build the 1-D array [512, 512];
        # np.zeros gives the intended 512x512 placeholder image.
        self.np_lung = np.zeros((512, 512))
        self.np_mask = np.zeros((512, 512))
        self.lung_area = 0
        self.lung_area_real = 0
        self.lung_area_1 = 0
        self.lung_area_2 = 0
        self.real_lung_area_1 = 0
        self.real_lung_area_2 = 0

    def to_string(self):
        """Print the file name and acquisition geometry; returns None."""
        print("Class DataImage:")
        print("File name: ", self.file_name)
        print("Pixel spacing height: ", self.pixel_spacing_height)
        print("Pixel spacing width: ", self.pixel_spacing_width)
        print("Slicethickness: ", self.slice_thickness)
        
class DataPatient():
    """Groups the images that belong to one study / sub-study pair.

    Args:
        s: study identifier, stored as ``study``.
        sbs: sub-study identifier, stored as ``substudy``.
        w: stored as ``where`` (the visible caller passes 'TRAIN').
    """

    def __init__(self, s, sbs, w):
        self.study, self.substudy, self.where = s, sbs, w
        self.data_images = list()

    def to_string(self):
        """Print a summary of this object, one field per line; returns None."""
        print("Class DataPatient:")
        summary = (
            ("Study: ", self.study),
            ("Subtudy: ", self.substudy),
            ("Where: ", self.where),
            ("Images: ", self.data_images),
        )
        for caption, value in summary:
            print(caption, value)
        
def preprocess(df):
    """Add an ``nb_images_per_study`` column to *df* and return it.

    For every row, the new column holds the number of rows of the same
    ``StudyInstanceUID`` that have a non-null ``SeriesInstanceUID``.

    Args:
        df: DataFrame with ``StudyInstanceUID`` and ``SeriesInstanceUID``
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with the extra column.
    """
    # transform('count') broadcasts each group's count back onto its rows in
    # one vectorised pass, replacing a Python-level lookup per row.
    df['nb_images_per_study'] = (
        df.groupby('StudyInstanceUID')['SeriesInstanceUID'].transform('count')
    )
    return df

def score_series(y_test_col, y_val_predict, col):
    """Return the negated, weighted log loss for one target column.

    Args:
        y_test_col: true labels, passed to ``log_loss`` as ``y_true``.
        y_val_predict: predictions, passed to ``log_loss`` as ``y_pred``.
        col: key used to look up the weight in the module-level ``weights``
            mapping. NOTE(review): ``weights`` is not defined in the visible
            part of the notebook -- confirm it exists before calling this.

    Returns:
        ``-weights[col] * log_loss(y_test_col, y_val_predict)``.
    """
    # log_loss is not among the names imported at the top of the notebook,
    # so import it here to avoid a NameError at call time.
    from sklearn.metrics import log_loss

    return -weights[col] * log_loss(y_test_col, y_val_predict)


Image segmentation functions

In [None]:
def get_segmented_lungs(im2, plot=False):
    """Segment the lungs in a single 2-D slice.

    The visible caller passes the slice in Hounsfield units; pixels below
    -400 are taken as the candidate region.

    Args:
        im2: 2-D array holding the slice. It is not modified; the function
            works on a copy.
        plot: when True, display every intermediate result with matplotlib.

    Returns:
        A tuple ``(im, binary)``: ``im`` is the copy of the input with every
        pixel outside the final mask set to -2000, and ``binary`` is the
        final (dilated) mask.
    """
    def show(image):
        # Display an intermediate result only when plotting was requested.
        if plot:
            plt.imshow(image)
            plt.show()

    im = im2.copy()

    # Step 1: Convert into a binary image.
    binary = im < -400
    show(binary)

    # Step 2: Remove the blobs connected to the border of the image.
    cleared = clear_border(binary)
    show(cleared)

    # Step 3: Label the image.
    label_image = label(cleared)
    show(label_image)

    # Step 4: Keep the labels with the 2 largest areas.
    # The threshold must be the second-largest area; comparing against the
    # smallest area (as was done before) never removes any region.
    regions = regionprops(label_image)
    if len(regions) > 2:
        second_largest_area = sorted(r.area for r in regions)[-2]
        for region in regions:
            if region.area < second_largest_area:
                coords = region.coords
                label_image[coords[:, 0], coords[:, 1]] = 0
    binary = label_image > 0
    show(binary)

    # Step 5: Erosion with a disk of radius 2, to separate the lung nodules
    # attached to the blood vessels.
    binary = binary_erosion(binary, disk(2))
    show(binary)

    # Step 6: Closing with a disk of radius 10, to keep the nodules attached
    # to the lung wall.
    binary = binary_closing(binary, disk(10))
    show(binary)

    # Step 7: Fill in the small holes inside the binary mask of the lungs:
    # detect the mask edges, then fill the regions they enclose.
    edges = roberts(binary)
    show(edges)

    binary = ndi.binary_fill_holes(edges)
    show(binary)

    # Step 8: Dilate the mask with a disk of radius 4 and superimpose it on
    # the image: everything outside the mask is set to -2000.
    binary = dilation(binary, disk(4))
    outside_mask = binary == 0
    im[outside_mask] = -2000
    show(im)

    return im, binary

def get_mask_for_one_image(image_path):
    """Read one DICOM file, segment the lungs and collect the results.

    Args:
        image_path: path of the DICOM file to read.

    Returns:
        A DataImage whose pixel spacing, slice thickness, ``lung_area``,
        ``lung_area_real``, ``np_lung`` and ``np_mask`` fields are filled.
        ``lung_area`` is the number of pixels of the segmented image that
        are above its minimum value; ``lung_area_real`` is that count
        multiplied by the pixel spacing width and height.
        NOTE: ``np_mask`` holds the segmented image returned by
        get_segmented_lungs, not the boolean mask.
    """
    dataset = dcm.dcmread(image_path)

    spacing_height = float(dataset.PixelSpacing[0])
    spacing_width = float(dataset.PixelSpacing[1])
    thickness = float(dataset.SliceThickness)

    # Raw pixel values -> modality LUT (HU for the visible use case).
    raw_pixels = dataset.pixel_array
    hu = apply_modality_lut(raw_pixels, dataset)
    segmented, _ = get_segmented_lungs(hu)

    # Every pixel above the minimum of the segmented image counts as lung.
    lung_pixels = segmented > np.amin(segmented)
    pixel_count = lung_pixels.sum()

    d = DataImage()
    d.pixel_spacing_height = spacing_height
    d.pixel_spacing_width = spacing_width
    d.slice_thickness = thickness
    d.lung_area = pixel_count
    d.lung_area_real = pixel_count * spacing_width * spacing_height
    d.np_lung = hu
    d.np_mask = segmented
    return d


def get_mask_for_one_series(dir_study, p):
    """Segment every image file of one series and attach the results to *p*.

    Args:
        dir_study: directory that contains the image files of the series.
        p: object with ``study``, ``substudy`` and ``data_images``
            attributes (a DataPatient); one DataImage per file is appended
            to ``p.data_images``.

    Returns:
        The same *p*, for convenience.
    """
    for filename in os.listdir(dir_study):
        # get_mask_for_one_image returns a fully populated DataImage (not a
        # tuple), so only the identifiers remain to be filled in here.
        d = get_mask_for_one_image(os.path.join(dir_study, filename))
        d.study = p.study
        d.substudy = p.substudy
        d.file_name = filename
        p.data_images.append(d)
    return p
        

This is the segmentation for the training set. The one for the test set is similar.

In [None]:
# Segment every training image, study by study, and write one CSV per study
# ('df_train_<study>.csv') holding the per-image and per-study features.

def _subdirectories(path):
    """Return the sorted names of the immediate sub-directories of *path*."""
    _, dirnames, _ = next(os.walk(path), (path, [], []))
    return sorted(dirnames)


c = 0
for study in _subdirectories(PATH_TRAIN_IMAGES):
    c += 1
    print(study+" "+str(c))
    study = str(study)

    # Work on an independent copy so that the assignments below never write
    # through a view of df_train (avoids pandas chained-assignment issues).
    df_train_study = df_train.loc[df_train['StudyInstanceUID'] == study].copy()

    for substudy in _subdirectories(PATH_TRAIN_IMAGES + study + '/'):
        dir_study = PATH_TRAIN_IMAGES+study+'/'+substudy+'/'
        series_rows = df_train_study['SeriesInstanceUID'] == substudy

        for filename in os.listdir(dir_study):
            filename = str(filename)
            d = get_mask_for_one_image(dir_study + filename)

            # The SOP instance UID is the file name without its 4-character
            # extension ('.dcm').
            image_rows = series_rows & (df_train_study['SOPInstanceUID'] == filename[:-4])

            image_features = {
                'area': d.lung_area,
                'area_real': d.lung_area_real,
                'pixel_spacing_height': d.pixel_spacing_height,
                'pixel_spacing_width': d.pixel_spacing_width,
                'slice_thickness': d.slice_thickness,
            }
            for column, value in image_features.items():
                df_train_study.loc[image_rows, column] = value

    # Per-study statistics only consider the images where a lung was found.
    df_train_study_nonzero = df_train_study.loc[df_train_study['area'] > 0]

    val_min = df_train_study_nonzero['area_real'].min()
    val_max = df_train_study_nonzero['area_real'].max()
    val_mean = df_train_study_nonzero['area_real'].mean()

    df_train_study.loc[:, 'area_mean_per_study'] = val_mean
    df_train_study.loc[:, 'area_min_per_study'] = val_min
    df_train_study.loc[:, 'area_max_per_study'] = val_max

    df_train_study.loc[:, 'nb_images_per_study'] = df_train_study.shape[0]

    df_train_study.to_csv('df_train_'+str(study)+'.csv')

print("End Train")
