In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn



import seaborn as sns
import cv2
from skimage import io

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.keras import layers, optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
import tensorflow.keras.backend as K

from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, MaxPool2D, Conv2DTranspose, Concatenate, Input
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import (Dense, Dropout, Activation, Flatten, Input, Add,
                                    BatchNormalization, LeakyReLU, Concatenate, GlobalAveragePooling2D,Conv2D, AveragePooling2D)

from warnings import filterwarnings
filterwarnings('ignore')

import random

import glob
from IPython.display import display

from pathlib import Path

# Set Color Palettes for the notebook
custom_colors = ['#74a09e','#86c1b2','#98e2c6','#f3c969','#f2a553', '#d96548', '#c14953']
sns.palplot(sns.color_palette(custom_colors))

# Set Style
sns.set_style("whitegrid")
sns.despine(left=True, bottom=True)

from scipy.stats import pearsonr
import pydicom
import re
from sklearn.cluster import KMeans
from skimage import morphology
from skimage import measure
from skimage.transform import resize
from tensorflow.keras.utils import Sequence

In [None]:
# Configure the TF1-compat session so GPU memory is allocated on demand
# (allow_growth) instead of being reserved up front.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# Create a session that uses this configuration.
session = tf.compat.v1.Session(config=config)

In [None]:
# Directory holding the competition CSV files.
ROOT = Path('../input/osic-pulmonary-fibrosis-progression/')

# Read the three competition tables in one pass: training records, test records
# and the sample submission.
train, test, sub = (
    pd.read_csv(ROOT / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Column dtypes and non-null counts of the training table.
train.info()

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# train.loc[:,"Patient"]

In [None]:
# train.head()

In [None]:
#data.iloc[<row s <column selection>]
# train.iloc[2,1]

In [None]:
# True if any cell of the training table is null.
has_missing = train.isnull().values.any()
print("Q: Are there any missing values?", "\n" +
      "A: {}".format(has_missing))

# **EDA**

In [None]:
print("There are {} unique patients in Train Data.".format(len(train["Patient"].unique())), "\n")

# Number of recorded rows per patient, sorted ascending so the bars grow left to right.
data = (
    train.groupby(by="Patient")["Weeks"]
    .count()
    .reset_index(drop=False)
    .sort_values(['Weeks'])
    .reset_index(drop=True)
)
print("Minimum number of entries are: {}".format(data["Weeks"].min()), "\n" +
      "Maximum number of entries are: {}".format(data["Weeks"].max()))

# Plot. x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (16, 6))
p = sns.barplot(x=data["Patient"], y=data["Weeks"], color=custom_colors[2])

plt.title("Number of Entries per Patient", fontsize = 17)
plt.xlabel('Patient', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# One bar per patient: hide the x axis instead of drawing one tick label per patient ID.
p.axes.get_xaxis().set_visible(False);

In [None]:
# One row of bio info per patient (first value of each field within the patient's rows).
data = train.groupby(by="Patient")[["Patient", "Age", "Sex", "SmokingStatus"]].first().reset_index(drop=True)

# Figure
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (16, 6))

# kdeplot replaces the deprecated distplot(hist=False, kde_kws=...); legend=False keeps
# the panel free of a series-name legend. countplot data is passed as x= because
# seaborn >= 0.12 rejects positional data vectors.
a = sns.kdeplot(data["Age"], ax=ax1, color=custom_colors[1], lw=6, ls="--", legend=False)
b = sns.countplot(x=data["Sex"], ax=ax2, palette=custom_colors[2:4])
c = sns.countplot(x=data["SmokingStatus"], ax=ax3, palette = custom_colors[4:7])

a.set_title("Patient Age Distribution", fontsize=16)
b.set_title("Sex Frequency", fontsize=16)
c.set_title("Smoking Status", fontsize=16);

In [None]:
print("Min FVC value: {:,}".format(train["FVC"].min()), "\n" +
      "Max FVC value: {:,}".format(train["FVC"].max()), "\n" +
      "\n" +
      "Min Percent value: {:.4}%".format(train["Percent"].min()), "\n" +
      "Max Percent value: {:.4}%".format(train["Percent"].max()))

# Figure
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 6))

# Density curves only. kdeplot is the supported replacement for the deprecated
# distplot(hist=False, kde_kws=...) call; line styling is passed straight through.
a = sns.kdeplot(train["FVC"], ax=ax1, color=custom_colors[6], lw=6, ls="--", legend=False)
b = sns.kdeplot(train["Percent"], ax=ax2, color=custom_colors[4], lw=6, ls="-.", legend=False)

a.set_title("FVC Distribution", fontsize=16)
b.set_title("Percent Distribution", fontsize=16);

In [None]:
print("Minimum no. weeks before CT: {}".format(train['Weeks'].min()), "\n" +
      "Maximum no. weeks after CT: {}".format(train['Weeks'].max()))

plt.figure(figsize = (16, 6))

# kdeplot replaces the deprecated distplot(hist=False, kde_kws=...) call.
a = sns.kdeplot(train['Weeks'], color=custom_colors[3], lw=8, ls="--", legend=False)
plt.title("Number of weeks before/after the CT scan", fontsize = 16)
plt.xlabel("Weeks", fontsize=14);

In [None]:
# Feature pairs that are both reported numerically and plotted below.
corr_pairs = [("FVC", "Percent"), ("FVC", "Age"), ("Percent", "Age")]

# Pearson correlation per pair, all printed with the same precision (the previous
# "{:.0}" format collapsed the FVC x Age value to a single significant digit).
for x_col, y_col in corr_pairs:
    corr = pearsonr(train[x_col], train[y_col])[0]
    print("Pearson Corr {} x {}: {:.4}".format(x_col, y_col, corr))

# Figure: one scatter panel per pair, coloured and marked by Sex.
f, axes = plt.subplots(1, 3, figsize = (16, 6))

for ax, (x_col, y_col) in zip(axes, corr_pairs):
    sns.scatterplot(x = train[x_col], y = train[y_col],
                    palette=[custom_colors[2], custom_colors[6]],
                    hue = train["Sex"], style = train["Sex"], s=100, ax=ax)
    ax.set_title("Correlation between {} and {}".format(x_col, y_col), fontsize = 16)
    ax.set_xlabel(x_col, fontsize = 14)
    ax.set_ylabel(y_col, fontsize = 14)

In [None]:
# Two panels: mean FVC and mean Percent for each smoking status.
fig, (ax_fvc, ax_pct) = plt.subplots(1, 2, figsize=(16, 6))

smoking_status = train["SmokingStatus"]
sns.barplot(x=smoking_status, y=train["FVC"], ax=ax_fvc, palette=custom_colors[0:4])
sns.barplot(x=smoking_status, y=train["Percent"], ax=ax_pct, palette=custom_colors[4:7])

ax_fvc.set_title("Mean FVC per Smoking Status", fontsize=16)
ax_pct.set_title("Mean Perc per Smoking Status", fontsize=16);

In [None]:
# FVC distribution by sex.
sns.boxplot(x='Sex', y='FVC', data=train)

In [None]:
# Age distribution by sex.
sns.boxplot(x='Sex', y='Age', data=train)

In [None]:
# FVC distribution by smoking status.
sns.boxplot(x='SmokingStatus', y='FVC', data=train)

In [None]:
# Percent distribution by sex.
sns.boxplot(x='Sex', y='Percent', data=train)

In [None]:
# Weeks distribution by sex.
sns.boxplot(x='Sex', y='Weeks', data=train)

In [None]:
# Age distribution by smoking status.
sns.boxplot(x='SmokingStatus', y='Age', data=train)

In [None]:
# Create the Time variable: a per-patient counter (1, 2, 3, ...) numbering each FVC
# check in the order the patient's rows appear in `train`.
# cumcount() does this in one vectorized pass instead of one boolean-mask
# assignment per patient.
train["Time"] = train.groupby("Patient").cumcount() + 1
train.head()

In [None]:
# print(train)

In [None]:
# For graph purposes, keep only Patients that had a big difference in FVC between Time 1 and last Time

# FVC at each patient's first recorded check (Time == 1).
first_fvc = train[train["Time"] == 1][["Patient", "FVC"]].reset_index(drop=True)

# FVC at the row(s) holding each patient's largest Weeks value. The aggregation is
# named by string: passing the builtin `max` raises a FutureWarning in recent pandas.
is_last_week = train.groupby(["Patient"])["Weeks"].transform("max") == train["Weeks"]
last_fvc = train[is_last_week][["Patient", "FVC"]].reset_index(drop=True)

# After the merge, FVC_x is the first measurement and FVC_y the last one,
# so Dif is positive when FVC dropped.
data = pd.merge(first_fvc, last_fvc, how="inner", on="Patient")
data["Dif"] = data["FVC_x"] - data["FVC_y"]

# Select only the 10 patients with the largest drop; `x` feeds the line plot below.
top_patients = list(data.sort_values("Dif", ascending=False).head(10)["Patient"])
x = train[train["Patient"].isin(top_patients)]

In [None]:
plt.figure(figsize = (16, 6))

# The x axis is the per-patient check counter `Time`, not the `Weeks` column, so the
# title and axis label say "visit" rather than "Weeks".
a = sns.lineplot(x = x["Time"], y = x["FVC"], hue = x["Patient"], legend=False,
                 palette=sns.color_palette("GnBu_d", 10), size=1)

plt.title("Patient FVC decrease over successive visits", fontsize = 16)
plt.xlabel("Visit number", fontsize=14)
plt.ylabel("FVC", fontsize=14);

In [None]:
# Create base director for Train .dcm files
director = "../input/osic-pulmonary-fibrosis-progression/train"

# Create path column with the path to each patient's CT
train["Path"] = director + "/" + train["Patient"]

# Create variable that shows how many CT scans each patient has.
# Each distinct folder is listed once and the count is mapped back onto every row.
# The previous `train["CT_number"][k] = ...` was a chained assignment
# (SettingWithCopyWarning; a silent no-op under pandas copy-on-write) and listed
# the same folder once per row.
ct_counts = {path: len(os.listdir(path)) for path in train["Path"].unique()}
train["CT_number"] = train["Path"].map(ct_counts)

In [None]:
print("Minimum number of CT scans: {}".format(train["CT_number"].min()), "\n" +
      "Maximum number of CT scans: {:,}".format(train["CT_number"].max()))

# Scans per Patient, sorted by number of scans
data = train.groupby(by="Patient")["CT_number"].first().reset_index(drop=False)
data = data.sort_values(['CT_number']).reset_index(drop=True)

# Median scan count and its position among the sorted bars, computed from the data
# instead of being hard-coded.
median_ct = data["CT_number"].median()
median_pos = (len(data) - 1) / 2

# Plot. x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (16, 6))
p = sns.barplot(x=data["Patient"], y=data["CT_number"], color=custom_colors[5])
plt.axvline(x=median_pos, color=custom_colors[2], linestyle='--', lw=3)

plt.title("Number of CT Scans per Patient", fontsize = 17)
plt.xlabel('Patient', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

plt.text(median_pos + 1, 850, "Median={:.0f}".format(median_ct), fontsize=13)

p.axes.get_xaxis().set_visible(False);

In [None]:
# print(data.Patient.iloc[-1])

In [None]:
# All training rows for patient ID00011637202177653955184.
train.loc[train['Patient'] == 'ID00011637202177653955184']
# Pattern used: df.loc[df['col1'] == value]

In [None]:
# All training rows for patient ID00052637202186188008618.
train.loc[train['Patient'] == 'ID00052637202186188008618']

In [None]:
# All training rows for patient ID00078637202199415319443.
train.loc[train['Patient'] == 'ID00078637202199415319443']

In [None]:
# Read a single DICOM slice and show a few header fields plus the image itself.
path = "../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430/19.dcm"
dataset = pydicom.dcmread(path)

header_fields = [
    ("Patient id.......:", dataset.PatientID),
    ("Modality.........:", dataset.Modality),
    ("Rows.............:", dataset.Rows),
    ("Columns..........:", dataset.Columns),
]
print(" \n".join("{} {}".format(label, value) for label, value in header_fields))

fig, ax = plt.subplots(figsize=(7, 7))
ax.imshow(dataset.pixel_array, cmap="plasma")
ax.axis('off');

In [None]:
patient_dir = "../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430"

# Order the files by the number embedded in the file name (e.g. "19.dcm" -> 19),
# so that 2.dcm sorts before 10.dcm. The pattern is a raw string: '\D' in a normal
# string literal is an invalid escape sequence.
files = sorted(os.listdir(patient_dir), key=lambda f: int(re.sub(r'\D', '', f)))
print(files)

# Read in the Dataset
datasets = [pydicom.dcmread(patient_dir + "/" + dcm) for dcm in files]

# Plot the images. The grid is sized from the number of slices actually read; the
# previous fixed 15 x 40 grid indexed past the end of `datasets` whenever the
# patient had fewer than 600 slices.
columns = 15
rows = max(1, int(np.ceil(len(datasets) / columns)))
fig = plt.figure(figsize=(50, 50 * rows / columns))

for i, ds in enumerate(datasets, start=1):
    fig.add_subplot(rows, columns, i)
    plt.imshow(ds.pixel_array, cmap="plasma")
    plt.title(i, fontsize = 9)
    plt.axis('off');

In [None]:
# All training rows for patient ID00078637202199415319443.
train.loc[train['Patient'] == 'ID00078637202199415319443']

In [None]:
# data["Patient"]=="ID00078637202199415319443"

In [None]:
from PIL import Image
from IPython.display import Image as show_gif
import scipy.misc
import matplotlib

In [None]:
def create_gif(number_of_CT = 87):
    """Create an animated GIF from the CT slices of one patient.

    The patient is sampled from the rows of `train` whose `CT_number` equals
    `number_of_CT`. The sample uses a fixed `random_state`, so the same patient is
    chosen on every call for a given `number_of_CT`.

    Each slice is written as a PNG to ``png_<patient>/`` and the frames are
    combined into ``gif_<patient>.gif``; both paths are relative to the current
    working directory.

    Args:
        number_of_CT: exact number of CT scans the selected patient must have.

    Raises:
        ValueError: if no patient in `train` has exactly `number_of_CT` scans.
    """
    # Select one of the patients
    candidates = train[train["CT_number"] == number_of_CT]
    if candidates.empty:
        raise ValueError(f"No patient has exactly {number_of_CT} CT scans.")
    patient = candidates.sample(random_state=1)["Patient"].values[0]

    # === READ IN .dcm FILES ===
    patient_dir = "../input/osic-pulmonary-fibrosis-progression/train/" + patient

    # Order slices by the number embedded in the file name (raw-string pattern:
    # '\D' in a normal string literal is an invalid escape sequence).
    files = sorted(os.listdir(patient_dir), key=lambda f: int(re.sub(r'\D', '', f)))
    datasets = [pydicom.dcmread(patient_dir + "/" + dcm) for dcm in files]

    # === SAVE AS .png ===
    # A single directory variable is used for writing AND reading. Previously the
    # PNGs were written to "png_<patient>" but read back from
    # "../working/png_<patient>", which only matched when the working directory
    # happened to be named "working".
    png_dir = f"png_{patient}"
    os.makedirs(png_dir, exist_ok=True)

    png_paths = []
    for i, ds in enumerate(datasets):
        png_path = f"{png_dir}/img_{i}.png"
        matplotlib.image.imsave(png_path, ds.pixel_array)
        png_paths.append(png_path)

    # === CREATE GIF ===
    # Frames come from the files just written (already in slice order), so PNGs
    # left in the directory by an earlier run cannot leak into the GIF.
    frames = [Image.open(png_path) for png_path in png_paths]

    # Save into a GIF file that loops forever
    frames[0].save(f'gif_{patient}.gif', format='GIF',
                   append_images=frames[1:],
                   save_all=True,
                   duration=200, loop=0)

In [None]:
# Build the GIF for a patient that has exactly 12 CT scans.
create_gif(number_of_CT=12)

In [None]:
# Display a GIF file from the working directory.
# NOTE(review): the patient ID in the file name is hard-coded — presumably the patient
# that create_gif(number_of_CT=12) selects; confirm before relying on it.
show_gif(filename="./gif_ID00165637202237320314458.gif", format='png', width=400, height=400)

In [None]:
# https://www.raddq.com/dicom-processing-segmentation-visualization-in-python/

def make_lungmask(img, display=False):
    """Standardize a CT slice and multiply it by an estimated lung mask.

    Pipeline:
      1. Standardize the slice to zero mean / unit std.
      2. Replace the global max and min pixels by the mean of the central crop.
      3. Threshold at the midpoint of a 2-cluster KMeans fitted on the central crop.
      4. Erode then dilate the thresholded image and label connected regions.
      5. Keep regions whose bounding box is smaller than 90% of the image in both
         directions and lies between 1/5 and 4/5 of the image height.
      6. Dilate the union of the kept regions and multiply it with the image.

    Args:
        img: 2-D array holding one slice (rows x columns).
        display: when True, plot the intermediate stages.

    Returns:
        Array with the shape of `img`: the standardized image multiplied by the
        0/1 mask. A constant image (zero standard deviation) yields all zeros.
    """
    row_size, col_size = img.shape[0], img.shape[1]

    std = np.std(img)
    if std == 0:
        # Nothing to segment, and dividing by zero would fill the image with NaN/inf.
        return np.zeros(img.shape)
    img = (img - np.mean(img)) / std

    # Find the average pixel value near the lungs to renormalize washed out images.
    # Axis 0 is indexed with row_size and axis 1 with col_size (the two were swapped
    # before, which selected the wrong crop on non-square slices).
    middle = img[int(row_size/5):int(row_size/5*4), int(col_size/5):int(col_size/5*4)]
    middle_mean = np.mean(middle)
    img_max = np.max(img)
    img_min = np.min(img)

    # To improve threshold finding, move the underflow and overflow
    # on the pixel spectrum to the central mean.
    img[img == img_max] = middle_mean
    img[img == img_min] = middle_mean

    # Using Kmeans to separate foreground (soft tissue / bone) and background (lung/air)
    kmeans = KMeans(n_clusters=2).fit(np.reshape(middle, [np.prod(middle.shape), 1]))
    centers = sorted(kmeans.cluster_centers_.flatten())
    threshold = np.mean(centers)
    thresh_img = np.where(img < threshold, 1.0, 0.0)  # threshold the image

    # First erode away the finer elements, then dilate to include some of the pixels
    # surrounding the lung. We don't want to accidentally clip the lung.
    eroded = morphology.erosion(thresh_img, np.ones([3, 3]))
    dilation = morphology.dilation(eroded, np.ones([8, 8]))

    labels = measure.label(dilation)  # Different labels are displayed in different colors
    good_labels = []
    for prop in measure.regionprops(labels):
        # bbox is (min_row, min_col, max_row, max_col)
        min_row, min_col, max_row, max_col = prop.bbox
        fits_height = max_row - min_row < row_size/10*9
        fits_width = max_col - min_col < col_size/10*9
        # Both vertical limits are compared against row_size (the lower one used
        # col_size before, which is wrong for non-square slices).
        inside_band = min_row > row_size/5 and max_row < row_size/5*4
        if fits_height and fits_width and inside_band:
            good_labels.append(prop.label)

    # After just the lungs are left, we do another large dilation
    # in order to fill in and out the lung mask
    mask = np.zeros([row_size, col_size], dtype=np.int8)
    for N in good_labels:
        mask = mask + np.where(labels == N, 1, 0)
    mask = morphology.dilation(mask, np.ones([10, 10]))  # one last dilation

    if display:
        panels = [
            ("Original", img, 'gray'),
            ("Threshold", thresh_img, 'gray'),
            ("After Erosion and Dilation", dilation, 'gray'),
            ("Color Labels", labels, None),
            ("Final Mask", mask, 'gray'),
            ("Apply Mask on Original", mask*img, 'gray'),
        ]
        fig, ax = plt.subplots(3, 2, figsize=[12, 12])
        for panel_ax, (title, image, cmap) in zip(ax.flatten(), panels):
            panel_ax.set_title(title)
            panel_ax.imshow(image, cmap=cmap)
            panel_ax.axis('off')

        plt.show()
    return mask*img

In [None]:
# Select a sample: read one DICOM slice and take its pixel array.
path = "../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430/19.dcm"
dataset = pydicom.dcmread(path)
img = dataset.pixel_array

# Masked image; display=True also plots the intermediate segmentation stages.
mask_img = make_lungmask(img, display=True)

# Masking the images of one patient

In [None]:
patient_dir = "../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430"

# Order the files by the number embedded in the file name (raw-string pattern:
# '\D' in a normal string literal is an invalid escape sequence).
files = sorted(os.listdir(patient_dir), key=lambda f: int(re.sub(r'\D', '', f)))

# Read in the Dataset and keep the pixel arrays.
datasets = [pydicom.dcmread(patient_dir + "/" + dcm) for dcm in files]
imgs = [ds.pixel_array for ds in datasets]

# Show masks
fig = plt.figure(figsize=(16, 6))
columns = 10
rows = 3

# Plot at most one grid's worth of slices, so a patient with fewer than
# columns*rows slices no longer raises an IndexError.
for i, slice_pixels in enumerate(imgs[:columns * rows], start=1):
    img = make_lungmask(slice_pixels)
    fig.add_subplot(rows, columns, i)
    plt.imshow(img, cmap="gray")
    plt.title(i, fontsize = 9)
    plt.axis('off');

# data preparation

In [None]:
def get_tab(df):
    '''
    Encode one patient's tabular features as a length-4 numeric array.

    Only the first row of `df` is read. Layout of the result:
      [0]   (Age - 30) / 30
      [1]   0 if Sex is 'male' (case-insensitive), otherwise 1
      [2:4] two-flag smoking code: 'Never smoked' -> (0, 0), 'Ex-smoker' -> (1, 1),
            'Currently smokes' -> (0, 1), any other value -> (1, 0)
    '''
    smoking_codes = {
        'Never smoked': [0, 0],
        'Ex-smoker': [1, 1],
        'Currently smokes': [0, 1],
    }

    age_scaled = (df.Age.values[0] - 30) / 30
    sex_flag = 0 if df.Sex.values[0].lower() == 'male' else 1
    smoking_flags = smoking_codes.get(df.SmokingStatus.values[0], [1, 0])

    return np.array([age_scaled, sex_flag, *smoking_flags])

In [None]:
A = {} #Stores slope value for each of the patient
TAB = {} #Stores training data wrt each patient
P = [] #Stores all unique patient id's

for p in train.Patient.unique():
    # Named `patient_rows` rather than `sub`, so the submission DataFrame `sub`
    # is not overwritten by this loop.
    patient_rows = train.loc[train.Patient == p, :]
    fvc = patient_rows.FVC.values
    week = patient_rows.Weeks.values

    # Design matrix [week, 1]: the least-squares solution is (slope, intercept)
    # of FVC as a linear function of week. rcond=None selects NumPy's current
    # default cutoff and avoids the FutureWarning raised when it is omitted.
    c = np.vstack([week, np.ones(len(week))]).T
    a, b = np.linalg.lstsq(c, fvc, rcond=None)[0]

    A[p] = a # Contains slope
    TAB[p] = get_tab(patient_rows) #Contains age, gender and smoking features
    P.append(p) #contains unique id

# Creating CNN architecture for coefficient prediction:

In [None]:
def get_img(path):
    """Read the DICOM file at `path`, divide its pixel array by 2**11 and resize it to 512x512."""
    dicom = pydicom.dcmread(path)
    scaled_pixels = dicom.pixel_array / 2**11
    return cv2.resize(scaled_pixels, (512, 512))

In [None]:
class IGenerator(Sequence):

    '''
    Keras Sequence that produces random training batches.

    Every batch draws `batch_size` patients (default 16) at random, with
    replacement, from `keys`. For each drawn patient two DICOM files are picked
    at random from the patient's folder and loaded with `get_img`.

    Each item is ``([x, y, tab], a)`` where
      x, y : image batches with a trailing channel axis,
      tab  : the patients' tabular feature vectors taken from `tab`,
      a    : the patients' target coefficients taken from `a`.

    Patients listed in `BAD_ID` are never sampled.
    '''
    BAD_ID = ['ID00011637202177653955184', 'ID00052637202186188008618']

    def __init__(self, keys, a, tab, batch_size=16):
        super().__init__()
        self.keys = [k for k in keys if k not in self.BAD_ID]
        self.a = a
        self.tab = tab
        self.batch_size = batch_size

        # File names of every training patient's scans. Patient.unique() lists each
        # folder once instead of once per row of `train`.
        self.train_data = {}
        for p in train.Patient.unique():
            self.train_data[p] = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{p}/')

    def __len__(self):
        # Nominal number of batches. Batches are sampled at random and `idx` is
        # ignored, so this is only an upper bound for Keras.
        return 1000

    def __getitem__(self, idx):
        x, y = [], []
        a, tab = [], []
        keys = np.random.choice(self.keys, size = self.batch_size)

        for k in keys:
            try:
                i = np.random.choice(self.train_data[k], size=1)[0]
                j = np.random.choice(self.train_data[k], size=1)[0]
                img1 = get_img(f'../input/osic-pulmonary-fibrosis-progression/train/{k}/{i}')
                img2 = get_img(f'../input/osic-pulmonary-fibrosis-progression/train/{k}/{j}')
                slope = self.a[k]
                features = self.tab[k]
            except Exception as err:
                # Best effort: skip a patient whose scan cannot be loaded. The message
                # uses only names that are always bound; the previous handler printed
                # `i`, which could be undefined and raise a NameError of its own.
                print(f"Skipping patient {k}: {err!r}")
                continue

            # Append only after everything loaded, so the four lists stay aligned.
            x.append(img1)
            y.append(img2)
            a.append(slope)
            tab.append(features)

        x, y, a, tab = np.array(x), np.array(y), np.array(a), np.array(tab)
        x = np.expand_dims(x, axis=-1)
        y = np.expand_dims(y, axis=-1)
        return [x, y, tab], a


In [None]:
def model_architechture(shape=(512,512,1)):
    '''Build the three-input Keras model: two image inputs plus a 4-value tabular input.

    Both images pass through their own two-convolution stem, are concatenated and
    run through three residual stages; the pooled image features are joined with
    the noise-perturbed tabular vector and mapped to a single output.

    Architecture used here is inspired by this kaggle notebook
    https://www.kaggle.com/miklgr500/linear-decay-based-on-resnet-cnn/notebook'''

    conv_kwargs = dict(kernel_size=(3, 3), strides=(1, 1), padding='same')

    def conv_bn_act(tensor, filters):
        # 3x3 convolution -> batch norm -> LeakyReLU.
        tensor = Conv2D(filters, **conv_kwargs)(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU(0.05)(tensor)

    def res_block(x, filter_number):
        # Two 3x3 convolutions with an identity shortcut; only the first
        # convolution is followed by batch normalization.
        shortcut = x
        x = conv_bn_act(x, filter_number)
        x = Conv2D(filter_number, **conv_kwargs)(x)
        x = LeakyReLU(0.05)(x)
        return Add()([shortcut, x])

    def image_stem(image_input):
        # Per-image branch: two conv/BN/activation units, then 2x2 average pooling.
        features = conv_bn_act(image_input, 32)
        features = conv_bn_act(features, 32)
        return AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(features)

    # Two input branches for images
    input1 = Input(shape=shape, name= 'dicom_image_1')
    input2 = Input(shape=shape, name= 'dicom_image_2')

    x = image_stem(input1)
    y = image_stem(input2)

    # Concatenating image branches
    x_and_y = Concatenate()([x, y])
    x_and_y = conv_bn_act(x_and_y, 32)

    # (filters, number of residual blocks, average-pool afterwards?)
    stages = [(16, 2, True), (64, 3, True), (128, 1, False)]
    for filters, n_blocks, pool_after in stages:
        x_and_y = Conv2D(filters, **conv_kwargs)(x_and_y)
        for _ in range(n_blocks):
            x_and_y = res_block(x_and_y, filters)
        if pool_after:
            x_and_y = AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(x_and_y)

    x_and_y = GlobalAveragePooling2D()(x_and_y)

    # Patient tabular data input
    input3 = Input(shape=(4,))
    z = tf.keras.layers.GaussianNoise(0.2)(input3)
    xyz = Concatenate()([x_and_y, z])
    xyz = Dropout(0.6)(xyz)
    xyz = Dense(1)(xyz)
    return Model([input1, input2, input3] , xyz)

In [None]:
model = model_architechture()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mae')

# Patient-level split: 80% train, then the remaining 20% is split 80/20 into
# validation and hold-out patients. random_state makes the split repeatable
# across kernel restarts.
tr_p, vl_p = train_test_split(P, shuffle=True, train_size= 0.8, random_state=42)
val_p, ts_p = train_test_split(vl_p, shuffle=True, train_size= 0.8, random_state=42)


In [None]:
# `keras` is imported here for the plot_model call below.
from tensorflow import keras
# Write a diagram of the model, with layer shapes shown, to img.png.
keras.utils.plot_model(model,'img.png', show_shapes=True)

In [None]:
# Stop when val_loss has not improved by at least 1e-3 for 5 epochs and restore
# the weights of the best epoch.
er = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-3,
    patience=5,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

# Model.fit accepts Sequence objects directly; fit_generator is deprecated in
# TensorFlow 2.x and has been removed from recent releases.
history = model.fit(
    IGenerator(keys=tr_p, a=A, tab=TAB),
    steps_per_epoch=20,
    validation_data=IGenerator(keys=val_p, a=A, tab=TAB),
    validation_steps=20,
    callbacks=[er],
    epochs=30,
)

In [None]:
# Save the trained model to best_model.h5 in the working directory.
model.save('best_model.h5')
            

In [None]:
def score(fvc_true, fvc_pred, sigma):
    """Mean of sqrt(2) * |error| / sigma + log(sqrt(2) * sigma) over all points.

    `sigma` is floored at 70 and the absolute error is capped at 1000 before the
    formula is applied. Lower values are better.
    """
    root_two = np.sqrt(2)
    sigma_floor = np.maximum(sigma, 70)
    abs_err = np.minimum(np.abs(fvc_true - fvc_pred), 1000)
    per_point = root_two * (abs_err / sigma_floor) + np.log(root_two * sigma_floor)
    return np.mean(per_point)

In [None]:
from tqdm.notebook import tqdm

# For each quantile level q/10 (q = 1..9): predict a slope from 20 randomly chosen
# slices per validation patient, take that quantile of the 20 predictions and score
# the resulting linear FVC trajectory. `metric[q-1]` is the mean score over patients.
metric = []
for q in tqdm(range(1, 10)):
    m = []
    for p in val_p:
        # Same exclusion list the training generator uses.
        if p in IGenerator.BAD_ID:
            continue

        patient_dir = f'../input/osic-pulmonary-fibrosis-progression/train/{p}/'
        patient_rows = train.loc[train.Patient == p, :]

        img_set = np.random.choice(os.listdir(patient_dir), size=20)
        # The same slice feeds both image inputs, so each file is decoded once
        # and the array is reused (it was read and resized twice before).
        x = np.expand_dims([get_img(patient_dir + i) for i in img_set], axis=-1)
        # The tabular vector is constant per patient: build it once and repeat it.
        tab = np.array([get_tab(patient_rows)] * len(img_set))

        _a = model.predict([x, x, tab])
        a = np.quantile(_a, q / 10)

        percent_true = patient_rows.Percent.values
        fvc_true = patient_rows.FVC.values
        weeks_true = patient_rows.Weeks.values

        # Linear FVC trajectory anchored at the patient's first measurement.
        fvc = a * (weeks_true - weeks_true[0]) + fvc_true[0]
        # Value passed to score() as sigma.
        percent = percent_true[0] - a * abs(weeks_true - weeks_true[0])
        m.append(score(fvc_true, fvc, percent))
    metric.append(np.mean(m))

In [None]:
# metric[i] is the mean validation score for quantile level (i + 1) / 10, so this
# selects the level with the lowest score.
q = (np.argmin(metric) + 1)/ 10
q

In [None]:
# (Re)load the sample submission that the predictions are written into.
sub = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv') 
sub.head() 

In [None]:
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

# Patients to run inference on. `test + ts_p` tried to add a Python list of patient
# IDs to a DataFrame and wrapped the result in a list, so `frames.Patient` could
# never work. The inference cell reads scans from the `test/` folder, so only the
# patients listed in test.csv are scored; the hold-out IDs in `ts_p` come from the
# training set.
frames = test
test.head()

In [None]:
# Per-patient results: slope (A_test), intercept (B_test), baseline Percent (P_test)
# and baseline week (WEEK). W, FVC and STD are kept as empty placeholders.
A_test, B_test, P_test,W, FVC= {}, {}, {},{},{}
STD, WEEK = {}, {}

# Iterate over the test table directly: `frames` was built as a list, which has no
# `.Patient` attribute.
for p in test.Patient.unique():
    patient_dir = f'../input/osic-pulmonary-fibrosis-progression/test/{p}/'
    patient_rows = test.loc[test.Patient == p, :]

    img_set = np.random.choice(os.listdir(patient_dir), size=20)
    # The same slice feeds both image inputs, so each file is decoded once.
    x = np.expand_dims([get_img(patient_dir + i) for i in img_set], axis=-1)
    # The tabular vector is constant per patient: build it once and repeat it.
    tab = np.array([get_tab(patient_rows)] * len(img_set))

    _a = model.predict([x, x, tab])
    # Slope = the q-quantile of the per-slice predictions.
    a = np.quantile(_a, q)
    A_test[p] = a
    # Intercept chosen so the line passes through the patient's known (Weeks, FVC) point.
    B_test[p] = patient_rows.FVC.values - a * patient_rows.Weeks.values
    P_test[p] = patient_rows.Percent.values
    WEEK[p] = patient_rows.Weeks.values

In [None]:
# Fill every submission row: the key is "<patient>_<week>", FVC comes from the
# patient's fitted line and Confidence from the Percent-based expression.
for patient_week in sub.Patient_Week.values:
    patient_id, week_text = patient_week.split('_')
    week = int(week_text)

    row_mask = sub.Patient_Week == patient_week
    slope = A_test[patient_id]

    sub.loc[row_mask, 'FVC'] = slope * week + B_test[patient_id]
    sub.loc[row_mask, 'Confidence'] = (
        P_test[patient_id] - slope * abs(WEEK[patient_id] - week)
    )
sub.head()