# Introduction

* The goal of this notebook is to investigate a metric for describing the state of the lung's fibrosis that allows comparison among patients with varying characteristics.
* FVC alone is unlikely to be a good indicator of the state of the lung's fibrosis, as it most probably depends linearly on the lung's volume regarded as an empty vessel.
* The tabular data offers the Percent column, which is described as the FVC value divided by an FVC value typical for a person with similar characteristics.
* However, I thought the most straightforward way of describing the state of the lung's fibrosis would be the FVC divided by the lung's volume regarded as an empty vessel.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#These installations are necessary in order to open all scans
!conda install -c conda-forge gdcm -y
!conda install -c conda-forge pillow -y
!conda install -c conda-forge pydicom -y
!conda install -c conda-forge tslearn -y

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import time
import math

import cv2
import random
import pydicom
import warnings

from glob import glob
from tqdm import tqdm

In [None]:
# Read the tabular training data, discard every (Patient, Weeks) pair that
# occurs more than once, and collect one entry per patient.
ROOT = '../input/osic-pulmonary-fibrosis-progression'
train_df = pd.read_csv(f'{ROOT}/train.csv')
train_df = train_df.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)
groupedbypatient = train_df.groupby('Patient')
patientimages = groupedbypatient['Patient'].first().to_numpy()

# Segmentation

Taken from @Xie29's great notebook (https://www.kaggle.com/xiejialun/lung-ct-scan-segmentation-model)

In [None]:
# Folder with one sub-folder of DICOM slices per patient; patient ids are
# appended directly to this string, so the trailing slash is required.
INPUT_FOLDER = '/kaggle/input/osic-pulmonary-fibrosis-progression/train/'

In [None]:
def load_scan(path):
    """
    Loads every file of a folder as a DICOM dataset, ordered by InstanceNumber.

    Parameters: path (Folder path holding the slice files of one scan)

    Returns: slices (List of datasets sorted by ascending InstanceNumber)
    """

    # dcmread is used instead of the deprecated read_file alias, which is not
    # available in current pydicom releases.
    slices = [pydicom.dcmread(os.path.join(path, s)) for s in os.listdir(path)]
    slices.sort(key = lambda x: int(x.InstanceNumber))

    return slices

In [None]:
def get_segmentation_model():
    """
    Registers the custom Keras objects referenced by the saved segmentation
    network and loads that network from the input dataset.

    Returns: model (Loaded Keras model)
    """

    class FixedDropout(tf.keras.layers.Dropout):
        # Dropout whose noise shape may contain None entries; those are filled
        # in from the shape of the incoming tensor.
        def _get_noise_shape(self, inputs):
            declared = self.noise_shape
            if declared is None:
                return declared

            runtime_shape = tf.keras.backend.shape(inputs)
            return tuple(
                runtime_shape[axis] if dim is None else dim
                for axis, dim in enumerate(declared)
            )

    def DiceCoef(y_trues, y_preds, smooth=1e-5, axis=None):
        overlap = tf.reduce_sum(y_trues * y_preds, axis=axis)
        total = tf.reduce_sum(y_trues, axis=axis) + tf.reduce_sum(y_preds, axis=axis)
        return tf.reduce_mean((2*overlap+smooth) / (total + smooth))

    def DiceLoss(y_trues, y_preds):
        return 1.0 - DiceCoef(y_trues, y_preds)

    # Register everything in one call; the names must match those stored in
    # the .h5 file.
    get_custom_objects().update({
        'swish': tf.keras.layers.Activation(tf.nn.swish),
        'FixedDropout': FixedDropout,
        'DiceCoef': DiceCoef,
        'DiceLoss': DiceLoss,
    })

    print('Load segmentation model...')
    return tf.keras.models.load_model('../input/lung-ct-segmentation-pretrain/osic_segmentation_model.h5')

In [None]:
def scale_and_resize(dcm):
    """
    Turns one DICOM slice into the input format used for the segmentation model.

    The pixel data is min-max scaled to 0..255, centre-cropped to at most
    512 x 512, resized to 256 x 256, expanded to three channels and finally
    scaled to the range 0..1.

    Parameters: dcm (Dataset exposing pixel_array)

    Returns: image (Float array of shape (256, 256, 3))
    """
    DIM = 256
    CROP = 512
    image = dcm.pixel_array.astype(np.float64)

    low, high = np.min(image), np.max(image)
    if high > low:
        image = ((image - low) / (high - low) * 255).astype(np.uint8)
    else:
        # A constant slice would divide 0 by 0; treat it as an empty image.
        image = np.zeros(image.shape, dtype=np.uint8)

    rows, cols = image.shape[0], image.shape[1]
    if rows != CROP or cols != CROP:
        # Clamp the offsets at 0 so that a dimension smaller than CROP is kept
        # whole instead of being sliced with a negative start index.
        top = max((rows - CROP) // 2, 0)
        left = max((cols - CROP) // 2, 0)
        image = image[top : top + CROP, left : left + CROP]

    # NOTE(review): the third positional parameter of cv2.resize is `dst`, so
    # INTER_AREA may not be applied as the interpolation mode here - confirm.
    # The call is kept unchanged so the preprocessing stays what it was.
    image = cv2.resize(image, (DIM,DIM), cv2.INTER_AREA)
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    return image/ 255.0

# Calculating volumes

The volumes can be retrieved from the data set

In [None]:
def get_volume(patient, trace = False):
    """
    Estimates the lung volume of one patient from the predicted lung masks.

    Every slice contributes (mask area) x (slice thickness). Where two
    neighbouring slices lie further apart than their thickness, the uncovered
    gap is filled with the mean cross-sectional area of both slices.

    Parameters: patient (Folder name of the patient below INPUT_FOLDER)
                trace (Unused; kept so that existing calls keep working)

    Returns: volume (Float; millilitres provided PixelSpacing, SliceThickness
             and SliceLocation are given in millimetres)
    """
    # scale_and_resize centre-crops slices to at most 512 x 512 before
    # shrinking them, so a mask only covers that part of the original image.
    CROP = 512

    #loading
    scans = load_scan(INPUT_FOLDER + patient)
    stack = np.array([scale_and_resize(s) for s in scans])

    #masks
    pred_masks = model.predict(stack, verbose=0)
    pred_masks = (pred_masks>0.5).astype(np.float32)

    n = len(scans)
    maskpixels = pred_masks.shape[1] * pred_masks.shape[2]
    areas = []   # lung cross-section per slice (pixel spacing units squared)
    volume = []  # area * thickness per slice

    #area and volume of each slice
    for i in range(n):
        rows, cols = scans[i].pixel_array.shape[0], scans[i].pixel_array.shape[1]
        # Number of original pixels represented by one mask pixel.
        pixelfactor = min(rows, CROP) * min(cols, CROP) / maskpixels
        thickness = float(scans[i].SliceThickness)
        if (i < n - 1 and hasattr(scans[i], 'SliceLocation') and hasattr(scans[i+1], 'SliceLocation')):
            # Overlapping slices must not be counted twice.
            spacing = abs(float(scans[i].SliceLocation) - float(scans[i+1].SliceLocation))
            thickness = min(spacing, thickness)
        pixelarea = float(scans[i].PixelSpacing[0]) * float(scans[i].PixelSpacing[1])
        area = np.sum(pred_masks[i].astype('int')) * pixelfactor * pixelarea
        areas.append(area)
        volume.append(area * thickness)

    #linear interpolation between slices with spacing larger than slice thickness
    totalvolume = 0
    for i in range(n-1):
        meanarea = (areas[i] + areas[i + 1]) / 2
        meanth = (float(scans[i].SliceThickness) + float(scans[i+1].SliceThickness)) / 2
        gap = 0.0
        if (hasattr(scans[i], 'SliceLocation') and hasattr(scans[i+1], 'SliceLocation')):
            distance = abs(float(scans[i].SliceLocation) - float(scans[i+1].SliceLocation))
            meanth = min(distance, meanth)
            gap = distance - meanth
        # The gap is a length, so it has to be multiplied by an area (not by a
        # slice volume) to yield a volume.
        totalvolume += volume[i] + gap * meanarea

    totalvolume += volume[-1]

    del scans
    del stack
    del pred_masks

    # 1000 cubic millimetres make one millilitre.
    return totalvolume / 1000.0

In [None]:
def get_volumes(patients):
    """
    Computes the lung volume of every patient, printing each volume followed
    by its position as a progress indicator.

    Parameters: patients (Sequence of patient folder names)

    Returns: volumes (NumPy array with one volume per patient, same order)
    """
    volumes = np.zeros(len(patients))
    for position, patient in enumerate(patients):
        volumes[position] = get_volume(patient, False)
        print(volumes[position])
        print(position)

    return volumes

In [None]:
# Use the precomputed volumes when the dataset is attached; otherwise segment
# every scan and compute them from scratch.
volumes = None
volumesPath = '../input/osicpulmonaryfibrosislungvolumes/LungVolumesInterpolated.npy'

try:
    volumes = np.load(volumesPath)
except (OSError, ValueError):
    # Only a missing or unreadable cache file triggers the recomputation; any
    # other error (e.g. an interrupt) is no longer swallowed.
    import tensorflow as tf
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.utils import get_custom_objects

    warnings.filterwarnings('ignore')
    print('Tensorflow version : {}'.format(tf.__version__))

    model = get_segmentation_model()
    volumes = get_volumes(patientimages)
    

In [None]:
# Load the segmentation model unconditionally: get_largest_slices further down
# uses `model` even when the volumes above were read from the cached file.
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import get_custom_objects

warnings.filterwarnings('ignore')
print('Tensorflow version : {}'.format(tf.__version__))
    
model = get_segmentation_model()

# Analysing volumes

* If the assumption made in the introduction is correct the calculated volumes should correlate linearly with the maximum FVC values

In [None]:
# Re-read the training table (same file as loaded at the top of the notebook).
ROOT = '../input/osic-pulmonary-fibrosis-progression'
train_df = pd.read_csv(f'{ROOT}/train.csv')

In [None]:
# keep=False removes every row of a duplicated (Patient, Weeks) pair.
# groupedbypatient was built from the earlier frame and is not affected.
train_df.drop_duplicates(subset=['Patient','Weeks'], keep = False, inplace = True)

In [None]:
# One list of measurements per patient, in group order (positional index).
groupedfvc = groupedbypatient['FVC'].apply(list).reset_index()['FVC']
groupedprecents = groupedbypatient['Percent'].apply(list).reset_index()['Percent']
groupedweeks = groupedbypatient['Weeks'].apply(list).reset_index()['Weeks']

In [None]:
# Per-patient summary values: largest FVC, largest Percent, earliest week.
maxfvc = np.array([max(values) for values in groupedfvc])
maxpercent = np.array([max(values) for values in groupedprecents])
minweeks = np.array([min(values) for values in groupedweeks])

In [None]:
# x: maximum FVC per patient, y: computed lung volume per patient.
plt.scatter(maxfvc,volumes)

* The multiple clusters in the scatter plot suggest that some aspect was not properly included to get consistent volumes for all patients
* The large cluster at the bottom however suggests that there is some value to the assumption, let's look at it closer

In [None]:
# Keep patients whose FVC/volume ratio lies strictly between 0.4 and 0.8 and
# whose computed volume is below 6000.
fvcpervolume = maxfvc / volumes
maskfraction = (fvcpervolume < 0.8) & (fvcpervolume > 0.4)
maskvolume = volumes < 6000
maskfv = maskfraction & maskvolume

In [None]:
# Same scatter plot, restricted to the patients selected by maskfv.
plt.scatter(maxfvc[maskfv],volumes[maskfv])

* Clearly there is a linear correlation
* This supports the thesis that FVC/Volume should be a better indicator for the state of the disease, than FVC alone
* Let's investigate how the FVC/Volume correlates with Percent

In [None]:
# x: FVC per lung volume, y: maximum Percent, both for the masked patients.
plt.scatter(maxfvc[maskfv]/volumes[maskfv],maxpercent[maskfv])

* Apparently there is no clear correlation
* This suggests that the two quantities do not describe the lung's state of fibrosis equivalently

In [None]:
# Patient ids that passed the maskfv selection.
patientimagesmasked = patientimages[maskfv]

In [None]:
#Defining health using calculated volumes
# NOTE: the following cell assigns `health` again, so when the notebook is run
# top to bottom this volume-based definition is replaced.
health = maxfvc[maskfv] / volumes[maskfv]

In [None]:
#Defining health using Percent
# Overwrites the volume-based `health` from the previous cell when both run.
health = maxpercent[maskfv]

In [None]:
#Testing the health metric
# Report Percent and first week for the patients with the highest and the
# lowest `health` value (first match in case of ties).
percentmasked = maxpercent[maskfv]
weeksmasked = minweeks[maskfv]
healthiest = health == np.max(health)
mostcritical = health == np.min(health)
mhpercent = percentmasked[healthiest][0]
mcpercent = percentmasked[mostcritical][0]
mhminweek = weeksmasked[healthiest][0]
mcminweek = weeksmasked[mostcritical][0]
print('Most healthy patient by estimation has a percent of ' + str(mhpercent))
print('Most healthy patient by estimation has a min week of ' + str(mhminweek))
print('Most critical patient by estimation has a percent of ' + str(mcpercent))
print('Most critical patient by estimation has a min week of ' + str(mcminweek))

In [None]:
# Ids of the patients with the highest / lowest `health` (first match on ties).
mosthealthypatient = patientimagesmasked[np.max(health) == health][0]
mostcriticalpatient = patientimagesmasked[np.min(health) == health][0]

In [None]:
# Show the middle slice of the "most healthy" patient's scan.
mhscan = load_scan(INPUT_FOLDER + mosthealthypatient)
plt.imshow(mhscan[len(mhscan) // 2].pixel_array)

In [None]:
# Show the middle slice of the "most critical" patient's scan.
mcscan = load_scan(INPUT_FOLDER + mostcriticalpatient)
plt.imshow(mcscan[len(mcscan) // 2].pixel_array)

# How do the metrics influence image classification?

* The idea is to divide the weeks into intervals with enough training data for a neural network

* Determining the intervals

In [None]:
# Largest week found in weeksmasked (never below 0).
lastw = 0
for patientweeks in weeksmasked:
    latest = np.max(patientweeks)
    if latest > lastw:
        lastw = latest

In [None]:
# Greedily grow week intervals starting at minw: maxw is raised until at least
# `minamount` entries of weeksmasked fall into [minw, maxw) or the last week is
# passed; the bounds are stored column-wise in `interval`, then the next
# interval starts.
minw = -100 #week marking an interval
maxw = 0 #week marking an interval
interval = np.zeros((2,weeksmasked.shape[0])) #intervals
index = 0
minamount = 10#minimum amount of patients per interval

j = maxw
while j <= lastw + 1:
    count = 0
    while(count < minamount and maxw <= lastw + 1):
        count = 0
        for i in range(weeksmasked.shape[0]):
            mask1 = minw <= weeksmasked[i]
            mask2 = weeksmasked[i] < maxw
            r = weeksmasked[i][mask1 & mask2]   
            if r.shape[0] > 0:
                count += 1
        
        # maxw is advanced after counting, so the value stored below is one
        # week past the upper bound that was actually counted.
        maxw += 1
        
        
    interval[0,index] = minw
    interval[1,index] = maxw
    index += 1
    # NOTE(review): the interval-mean code further down selects weeks below
    # interval[1] - 1 while the next interval starts at interval[1], so week
    # interval[1] - 1 appears to belong to no interval - confirm this is intended.
    minw = maxw
    maxw = minw + 1
    j = maxw

In [None]:
# Boolean mask with both rows set for every column whose upper bound is
# positive, i.e. the columns that were filled in above.
mask = np.tile(interval[1,:] > 0, (2, 1))

In [None]:
# Drop the unused columns and store the bounds as integers (row 0: lower
# bounds, row 1: upper bounds).
selected = interval[mask]
interval = selected.reshape(2, selected.shape[0] // 2).astype('int')

In [None]:
# Display the interval bounds (row 0: lower, row 1: upper).
print('Intervals:')
interval

In [None]:
# Column names of the interval tables: 'Patient' followed by one
# "<lower>_<upper>" name per interval.
columnlist = ['Patient'] + [
    str(lower) + "_" + str(upper) for lower, upper in zip(interval[0], interval[1])
]

In [None]:
# Per-patient measurement lists and volumes, restricted to the maskfv patients.
fvcmasked = np.array(groupedfvc[maskfv])
volumesmasked = volumes[maskfv]
percentsmasked = np.array(groupedprecents[maskfv])

In [None]:
# Mean FVC per lung volume for every patient and week interval (NaN where a
# patient has no measurement in the interval).
# NOTE: get_intervalmeans(True) further down rebuilds the same table and
# overwrites this result.
fvcpervolintervalmeans_df = None
patientweeks = list(groupedweeks[maskfv])  # all measurement weeks per patient
frames = []
for i in range(len(patientimagesmasked)):
    # The weeks of every single measurement are needed to select measurements;
    # weeksmasked only holds one (the earliest) week per patient.
    weeks = np.asarray(patientweeks[i])
    fvcpervol = np.asarray(fvcmasked[i], dtype=float) / volumesmasked[i]
    row = [patientimagesmasked[i]]
    for j in range(interval.shape[1]):
        inside = (weeks >= interval[0,j]) & (weeks < (interval[1,j] - 1))
        row.append(fvcpervol[inside].mean() if inside.any() else np.nan)
    frames.append(pd.DataFrame([row], columns=columnlist))

if frames:
    # A single concat keeps the per-row index of 0 produced by the row frames.
    fvcpervolintervalmeans_df = pd.concat(frames)

In [None]:
def get_intervalmeans(fvcperv):
    """
    Builds a table with one row per masked patient and one column per week
    interval holding the mean of the measurements taken in that interval.

    Parameters: fvcperv (True: mean FVC divided by the patient's lung volume,
                         False: mean Percent)

    Returns: intervalmeans_df (DataFrame with the columns of columnlist; NaN
             where a patient has no measurement in an interval; None when
             there are no patients)
    """
    # All measurement weeks per patient, aligned with patientimagesmasked.
    # weeksmasked cannot be used here: it only holds the earliest week.
    patientweeks = list(groupedweeks[maskfv])
    frames = []
    for i in range(len(patientimagesmasked)):
        weeks = np.asarray(patientweeks[i])
        if fvcperv:
            values = np.asarray(fvcmasked[i], dtype=float) / volumesmasked[i]
        else:
            values = np.asarray(percentsmasked[i], dtype=float)

        row = [patientimagesmasked[i]]
        for j in range(interval.shape[1]):
            # The stored upper bound lies one week past the counted bound.
            inside = (weeks >= interval[0,j]) & (weeks < (interval[1,j] - 1))
            row.append(values[inside].mean() if inside.any() else np.nan)

        frames.append(pd.DataFrame([row], columns=columnlist))

    if not frames:
        return None

    # A single concat instead of one per row; every row keeps index 0 as before.
    return pd.concat(frames)

In [None]:
# Interval means of FVC per lung volume and of Percent for the masked patients.
fvcpervolintervalmeans_df = get_intervalmeans(True)
percentintervalmeans_df = get_intervalmeans(False)

In [None]:
# Preview the first rows of the FVC-per-volume table.
fvcpervolintervalmeans_df.head()

* Choosing the six slices with the largest area for training the classifier

In [None]:
def get_largest_slices(nslices, patient):
    """
    Finds the slices of a scan with the largest predicted lung area.

    Parameters: nslices (Number of slice indices to return)
                patient (Folder name of the patient below INPUT_FOLDER)

    Returns: indices (List of nslices ints, positions in the InstanceNumber
             ordered scan, largest mask area first; ties keep scan order)

    Raises: ValueError (The scan has fewer than nslices slices)
    """
    scans = load_scan(INPUT_FOLDER + patient)
    stack = np.array([scale_and_resize(s) for s in scans])
    pred_masks = model.predict(stack, verbose=0)
    pred_masks = (pred_masks>0.5).astype(np.float32)
    # Number of mask pixels per slice.
    areas = pred_masks.reshape(pred_masks.shape[0], -1).sum(axis=1)

    del scans
    del stack
    del pred_masks

    if nslices > areas.shape[0]:
        raise ValueError('scan of ' + str(patient) + ' has only '
                         + str(areas.shape[0]) + ' slices')

    # A stable sort handles slices with equal area (e.g. several empty masks),
    # which previously could be dropped or end in an IndexError.
    order = np.argsort(-areas, kind='stable')

    # Plain ints keep the textual form written to CSV parseable.
    return [int(k) for k in order[:nslices]]
   

In [None]:
# Use the precomputed largest-slice table when the dataset is attached;
# otherwise determine the slices for every masked patient and store the table.
intervalmeanspath = '../input/osicintervalmeansandlargestslices/intervalmeans_df.csv'

try:
    largestslices_df = pd.read_csv(intervalmeanspath)
except (OSError, ValueError):
    # Only a missing or unreadable file triggers the recomputation.
    nslices = 6
    largestslices = []
    count = 0
    for p in fvcpervolintervalmeans_df['Patient']:
        print(count)
        count += 1
        largestslices.append(get_largest_slices(nslices, p))

    # The fallback has to provide largestslices_df, which get_files_names reads.
    largestslices_df = fvcpervolintervalmeans_df.copy()
    largestslices_df['LargestSlices'] = largestslices
    # Written under the same file name that is read above.
    largestslices_df.to_csv(os.path.basename(intervalmeanspath), index=False)
    

In [None]:
def get_files_names(interval, intervalmeans_df, getlabels = True):
    """
    Lists the DICOM files of the largest slices of every patient, optionally
    together with the patient's label for one interval.

    Parameters: interval (Column name of the interval in intervalmeans_df)
                intervalmeans_df (Table with a 'Patient' column and one column
                                  per interval)
                getlabels (If True, patients whose label is NaN are skipped and
                           one label per file is returned)

    Returns: fnames (List of '<patient>/<number>.dcm' paths),
             labels (List of labels aligned with fnames; empty if not getlabels)
    """
    fnames = []
    labels = []
    for p in intervalmeans_df['Patient']:
        label = None
        if getlabels:
            # Looked up once per patient instead of once per slice.
            label = intervalmeans_df[intervalmeans_df['Patient'] == p][interval].to_list()[0]
            if math.isnan(label):
                continue

        # Slice indices are mapped to files through the numerically sorted file
        # names. NOTE(review): assumes that order equals the InstanceNumber
        # order used by load_scan - confirm.
        sa = sorted(int(s.split(".")[0]) for s in os.listdir(INPUT_FOLDER + p))

        # Positional access works for any index of largestslices_df.
        largestslices = largestslices_df[largestslices_df['Patient'] == p]['LargestSlices'].to_list()[0]
        if isinstance(largestslices, str):
            # Read back from CSV the list arrives as text such as "[3, 5, 8]".
            largestslices = largestslices.strip('][').split(', ')

        for s in largestslices:
            fnames.append(p + '/' + str(sa[int(s)]) + '.dcm')
            if getlabels:
                labels.append(label)

    return fnames, labels

In [None]:
# Files and FVC-per-volume labels for the interval stored in table column 2.
fnames, labelsfvcperv = get_files_names(fvcpervolintervalmeans_df.columns[2], fvcpervolintervalmeans_df)

In [None]:
# Same interval with Percent labels; `fnames` from the previous cell is
# reassigned here.
fnames, labelspercent = get_files_names(fvcpervolintervalmeans_df.columns[2], percentintervalmeans_df)

In [None]:
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install fastai==2.0.9

In [None]:
from fastai.vision.all import *
from fastai.data.all import *

In [None]:
def get_image_file_paths(inputfolder):
    """Returns every entry of the global fnames prefixed with inputfolder."""
    paths = []
    for relative in fnames:
        paths.append(inputfolder + relative)
    return paths

In [None]:
def get_labels_fvcperv(fname):
    """
    Looks up the FVC-per-volume label(s) of a file.

    Parameters: fname (Path whose last two components are '<patient>/<file>')

    Returns: labels (NumPy array with the labels whose fnames entry matches)
    """
    parts = fname.split('/')
    key = parts[-2] + '/' + parts[-1]
    matches = np.array(fnames) == key
    return np.array(labelsfvcperv)[matches]

In [None]:
def get_labels_percent(fname):
    """
    Looks up the Percent label(s) of a file, divided by 100.

    Parameters: fname (Path whose last two components are '<patient>/<file>')

    Returns: labels (NumPy array with the scaled labels whose fnames entry matches)
    """
    parts = fname.split('/')
    key = parts[-2] + '/' + parts[-1]
    matches = np.array(fnames) == key
    return np.array(labelspercent)[matches] / 100

In [None]:
def get_pixels_hu(scans):
    """
    Stacks the raw slices and rescales them to Hounsfield Units (HU).

    Parameters: scans (Slices exposing pixel_array; RescaleSlope and
                       RescaleIntercept are taken from the first slice)

    Returns: image (int16 NumPy array, one entry per slice)
    """

    raw = np.stack([s.pixel_array for s in scans]).astype(np.int16)

    # Pixels outside the circular scan area carry the marker value -2000;
    # they are reset to 0 before rescaling.
    raw[raw == -2000] = 0

    # HU = slope * pixel + intercept
    first = scans[0]
    slope, intercept = first.RescaleSlope, first.RescaleIntercept

    if slope != 1:
        raw = (slope * raw.astype(np.float64)).astype(np.int16)

    raw += np.int16(intercept)

    return np.array(raw, dtype=np.int16)

In [None]:
def create(fn):
    """
    Reads one DICOM file and turns it into a 224-pixel fastai image.

    Parameters: fn (Path of the DICOM file)

    Returns: img1 (PILImage built from the HU-converted slice, after Resize(224))
    """
    # dcmread is used instead of the deprecated read_file alias, which is not
    # available in current pydicom releases.
    img1 = pydicom.dcmread(fn)
    img1 = get_pixels_hu([img1])
    img1 = PILImage(Image.fromarray(img1[0], mode=None))
    img1 = Resize(224)(img1)

    return img1

In [None]:
def CustomImageBlock():
    """Returns a TransformBlock that builds items with create() and applies
    IntToFloatTensor as batch transform."""
    return TransformBlock(type_tfms=create, batch_tfms=IntToFloatTensor)

In [None]:
# Regression data block: DICOM slice -> FVC-per-volume label, random split.
dblockfvcperv = DataBlock(blocks    = (CustomImageBlock, RegressionBlock),
                   get_items = get_image_file_paths,
                   get_y     = get_labels_fvcperv,
                   splitter  = RandomSplitter())
# `dsets` is only inspected here; the next cell reassigns it.
dsets = dblockfvcperv.datasets(INPUT_FOLDER)
dsets.train[0]

In [None]:
# Regression data block: DICOM slice -> Percent label (scaled by 1/100 in
# get_labels_percent), random split.
dblockpercent = DataBlock(blocks    = (CustomImageBlock, RegressionBlock),
                   get_items = get_image_file_paths,
                   get_y     = get_labels_percent,
                   splitter  = RandomSplitter())
dsets = dblockpercent.datasets(INPUT_FOLDER)
dsets.train[0]

In [None]:
# Build the loaders for both targets and fetch one Percent batch on the GPU.
dlsfvcperv = dblockfvcperv.dataloaders(INPUT_FOLDER)
dlspercent = dblockpercent.dataloaders(INPUT_FOLDER)
dlspercent.cuda().one_batch()

In [None]:
# The learners below move their loaders to the GPU, so CUDA must be available.
torch.cuda.is_available()

In [None]:
# The target is continuous (RegressionBlock), so regression metrics are
# reported; error_rate compares class indices and is meaningless here.
learnpercent = cnn_learner(dlspercent.cuda(), resnet34, metrics=[rmse, mae])

In [None]:
# Learning-rate range test for the Percent learner.
learnpercent.lr_find()

In [None]:
# 10 epochs of fine-tuning with a base learning rate of 1e-2.
learnpercent.fine_tune(10,1e-2)

In [None]:
# The target is continuous (RegressionBlock), so regression metrics are
# reported; error_rate compares class indices and is meaningless here.
learnfvcperv = cnn_learner(dlsfvcperv.cuda(), resnet34, metrics=[rmse, mae])
learnfvcperv.lr_find()

In [None]:
# 10 epochs of fine-tuning with a base learning rate of 1e-2.
learnfvcperv.fine_tune(10,1e-2)

* The low valid_loss shows that both quantities can be extracted from the images
* Let's see what they predict on a test set

In [None]:
# Number of patients used for the prediction check below.
ntest = 10

In [None]:
# Test patients come from those excluded by maskfv (positions 10 .. 10+ntest-1
# of the excluded patients), so none of them was used for training.
testpatients = patientimages[np.bitwise_not(maskfv)][10:10+ntest]

In [None]:
# Largest-slice indices for every test patient; `count` is printed as a
# progress indicator. The averaging cells below assume 6 slices per patient.
nslices = 6
largestslices = []
count = 0
for p in testpatients:
    print(count)
    count += 1
    #print(INPUT_FOLDER + patient)
    largestslices.append(get_largest_slices(nslices, p))


In [None]:
# Relative '<patient>/<number>.dcm' paths of the selected slices, grouped by
# test patient in the order of testpatients.
fnamestest = []
for p, lp in zip(testpatients, largestslices):
    # Slice indices are mapped to files through the numerically sorted names.
    sa = sorted(int(name.split(".")[0]) for name in os.listdir(INPUT_FOLDER + p))
    fnamestest.extend(p + '/' + str(sa[int(s)]) + '.dcm' for s in lp)


In [None]:
# Run both learners on every test slice; the raw predict() results are
# reduced to single values in the following cells.
predsfvcpervol = []
predspercent = []
for fn in fnamestest:
    predsfvcpervol.append(learnfvcperv.predict(create(INPUT_FOLDER+fn)))
    predspercent.append(learnpercent.predict(create(INPUT_FOLDER+fn)))

In [None]:
# Keep element [0][0] of every predict() result (presumably the predicted
# value itself - verify against the fastai predict() return format).
predsfvcpervol = [p[0][0] for p in predsfvcpervol]

In [None]:
# Average the predictions in consecutive groups of 6 (one group per patient).
predsfvcpervolmeaned = [
    np.mean(predsfvcpervol[group*6:(group+1)*6])
    for group in range(len(predsfvcpervol) // 6)
]

In [None]:
# Distribution of the per-patient mean FVC-per-volume predictions.
plt.hist(predsfvcpervolmeaned)

In [None]:
# Keep element [0][0] of every predict() result (presumably the predicted
# value itself - verify against the fastai predict() return format).
predspercent = [p[0][0] for p in predspercent]

In [None]:
# Average the predictions in consecutive groups of 6 (one group per patient).
predspercentmeaned = [
    np.mean(predspercent[group*6:(group+1)*6])
    for group in range(len(predspercent) // 6)
]

In [None]:
# Distribution of the per-patient mean Percent predictions.
plt.hist(predspercentmeaned)

In [None]:
# Positions 0..ntest-1, used below to turn a boolean match into a patient index.
predsindexhelper = np.arange(0,ntest)

In [None]:
# First selected slice of the test patient with the lowest mean Percent
# prediction (each patient owns 6 consecutive entries of fnamestest).
# dcmread replaces the deprecated read_file alias.
minpercentim = pydicom.dcmread(INPUT_FOLDER + fnamestest[predsindexhelper[np.array(predspercentmeaned) == min(predspercentmeaned)][0]*6]).pixel_array
plt.imshow(minpercentim)

In [None]:
# First selected slice of the test patient with the highest mean Percent
# prediction (each patient owns 6 consecutive entries of fnamestest).
# dcmread replaces the deprecated read_file alias.
maxpercentim = pydicom.dcmread(INPUT_FOLDER + fnamestest[predsindexhelper[np.array(predspercentmeaned) == max(predspercentmeaned)][0]*6]).pixel_array
plt.imshow(maxpercentim)

In [None]:
# First selected slice of the test patient with the lowest mean
# FVC-per-volume prediction (each patient owns 6 consecutive entries of
# fnamestest). dcmread replaces the deprecated read_file alias.
minfvcpervolim = pydicom.dcmread(INPUT_FOLDER + fnamestest[predsindexhelper[np.array(predsfvcpervolmeaned) == min(predsfvcpervolmeaned)][0]*6]).pixel_array
plt.imshow(minfvcpervolim)

In [None]:
# First selected slice of the test patient with the highest mean
# FVC-per-volume prediction (each patient owns 6 consecutive entries of
# fnamestest). dcmread replaces the deprecated read_file alias.
maxfvcpervolim = pydicom.dcmread(INPUT_FOLDER + fnamestest[predsindexhelper[np.array(predsfvcpervolmeaned) == max(predsfvcpervolmeaned)][0]*6]).pixel_array
plt.imshow(maxfvcpervolim)