# Imports

In [None]:
!pip install ../input/kerasapplications/keras-team-keras-applications-3b180cb -f ./ --no-index
!pip install ../input/efficientnet/efficientnet-1.1.0/ -f ./ --no-index

In [None]:
import os
import cv2
import pydicom
import pandas as pd
import numpy as np 
import tensorflow as tf 
import matplotlib.pyplot as plt 
import random
from tqdm.notebook import tqdm 
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from tensorflow_addons.optimizers import RectifiedAdam
from tensorflow.keras import Model
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from tensorflow.keras.optimizers import Nadam
import seaborn as sns
from PIL import Image

def seed_everything(seed=2020):
    """Seed every random-number source this notebook touches.

    Exports ``seed`` as the PYTHONHASHSEED environment variable and seeds
    Python's ``random`` module, NumPy's global generator and TensorFlow's
    global generator with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# Seed once, before any sampling or model construction below.
seed_everything(42)

In [None]:
# Create a TF1-compat session whose GPU options have allow_growth switched on.
# NOTE(review): `session` is not referenced again in the visible notebook.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

In [None]:
train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv') 

# 1. Background: general information about your chosen ADS

* **What is the purpose of this ADS? What are its stated goals?**

The purpose of this ADS is to automatically predict a patient’s severity of decline in lung function based on a CT scan of their lungs. 

# 2. Input and output
* **Describe the data used by this ADS. How was this data collected or selected?**

For each patient, there is a baseline chest CT scan and clinical information. The baseline chest CT scan is taken at Week 0, and over the following 1-2 years there are follow-up visits with spirometry tests. There are train and test CSV files, which hold the unique patient ID, the number of weeks before or after the CT scan at which the FVC value (mL) was measured, “a computed field which approximates the patient’s FVC as a percent of the typical FVC for a person of similar characteristics,” age, sex and smoking status. The details of how the data was collected are not explicitly stated. The only detail mentioned about the data is that it is real medical data.

* **For each input feature**
    * Describe its datatype
    * Give information on missing values
    * Value distribution
    * Show pairwise correlations between features if appropriate. 
    * Run any other reasonable profiling of the input that you find interesting and appropriate.
    * What is the output of the system (e.g., is it a class label, a score, a probability, or some other type of output), and how do we interpret it?

In [None]:
# Data types

train.info()

In [None]:
# Missing Values

train.isna().sum()

In [None]:
# Value distribution

# (column of `train`, label used for the panel title and x-axis), in grid order.
hist_panels = [
    ('Weeks', 'Week'),
    ('FVC', 'FVC'),
    ('Percent', 'Percent'),
    ('Age', 'Age'),
    ('Sex', 'Sex'),
    ('SmokingStatus', 'Smoking Status'),
]

plt.figure(figsize=(20,15))

# One histogram per column, laid out on a 2 x 3 grid.
for position, (column, label) in enumerate(hist_panels, start=1):
    plt.subplot(2, 3, position)
    plt.hist(train[column])
    plt.title(label)
    plt.xlabel(f'{label} Value')
    plt.ylabel('Count')

plt.show()


In [None]:
# Pairwise Correlations

# Correlation is only defined for numeric columns. `train` also carries string
# columns (Patient, Sex, SmokingStatus), and recent pandas raises instead of
# silently dropping them, so restrict the frame to numeric dtypes explicitly.
plt.figure(figsize=(12,10))
cor = train.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.title('Correlation Matrix - Train DF')
plt.show()

In [None]:
imdir = "/kaggle/input/osic-pulmonary-fibrosis-progression/train/ID00123637202217151272140"
# List the directory once and reuse the listing for both the count and the grid size.
imglist = os.listdir(imdir)
print("total images for patient ID00123637202217151272140: ", len(imglist))

# view first (columns*rows) images in order
fig = plt.figure(figsize=(12, 12))
columns = 4
rows = 5
# Files are addressed as "<n>.dcm" starting at 1 (assumes the slices are
# numbered consecutively); never request more panels than there are files.
n_panels = min(columns * rows, len(imglist))
for i in range(1, n_panels + 1):
    filename = imdir + "/" + str(i) + ".dcm"
    ds = pydicom.dcmread(filename)
    ax = fig.add_subplot(rows, columns, i)
    ax.imshow(ds.pixel_array, cmap='gray')
# Title the whole figure; plt.title would only label the last subplot drawn.
fig.suptitle('Example CT Scan Images for a Patient')
plt.show()

# 3. Implementation and validation:
* **Describe data cleaning and any other pre-processing**

The only data cleaning done for this project is omitting two patient ids and their corresponding images. Some of the image files associated with Patient IDs 'ID00011637202177653955184' and 'ID00052637202186188008618' are corrupted and cannot be loaded. 

The pre-processing is typical for TensorFlow. First, we are given a test df with values corresponding to patients. This is vectorized so it can be used as input to the neural net. The other input to the neural net are the image files, which are 512 x 512 pixels in size. 

* **Give high-level information about the implementation of the system**

The solution in this notebook uses EfficientNet B5 and quantile regression neural networks (QRNN). EfficientNets are a family of CNN models that share the same convolution operations as the baseline network, but whose depth, width and resolution are all uniformly scaled with a compound coefficient depending on how many resources are available. They use significantly fewer parameters and less computing power than comparable models. Quantile regression drops the assumption of constant variance for the error term that is typical of linear regression; it essentially tries to fit a line that splits the data such that a certain fraction of the data, defined by the quantile (e.g., 0.75), lies below that line. This concept is applied to the loss function of a neural network. The EfficientNet model uses CT scans and tabular data, whereas the QRNN relies on tabular data only. The predictions from these models are blended together as an ensemble to make the final predictions.

* **How was the ADS validated? How do we know that it meets its stated goal(s)?**

The evaluation metric is based on a modified version of the Laplace Log Likelihood, and its purpose is to evaluate how far the predicted FVC value is from the true value together with the model’s confidence in that prediction. If the predicted value is far from the true value (the absolute error is clipped at 1000 mL) but the model reports low confidence (a large Confidence value, which can also be interpreted as a standard deviation or uncertainty), then the model is not penalized as much. The confidence value is clipped from below at 70 mL (smaller values are raised to 70), as that reflects the “approximate measurement uncertainty in FVC.” If the absolute error is high and the model is very confident, then the model is penalized harshly.


# Outcomes

* **Analyze the effectiveness (accuracy) of the ADS by comparing its performance across different subpopulations.**

* **Select one or several fairness or diversity measures, justify your choice of these measures for the ADS in question, and quantify the fairness or diversity of this ADS.**

* **Develop additional methods for analyzing ADS performance: think about stability, robustness, performance on difficult or otherwise important examples (in the style of LIME), or any other property that you believe is important to check for this ADS.**

# Summary
* **Do you believe that the data was appropriate for this ADS?**

Yes, we do believe the type of data is appropriate because, in the medical field, doctors would be given the same data: CT scans and some historical data. The goal of this ADS is to have a model that is trained on much more data than a doctor can reference by themselves. However, it seems inappropriate that the data is 80% male and 20% female. Also, the sample test output only contains males, which is also inappropriate.

* **Do you believe the implementation is robust, accurate, and fair? Discuss your choice of accuracy and fairness measures, and explain which stakeholders may find these measures appropriate.**


* **Would you be comfortable deploying this ADS in the public sector, or in the industry? Why so or why not?**


* **What improvements do you recommend to the data collection, processing, or analysis methodology?**

# Start of Notebook

In [None]:
def get_img(path):
    """Read the DICOM file at ``path`` and return its pixels as a 512 x 512 array.

    The raw pixel array is divided by 2**11 and then resized with
    ``cv2.resize``.
    """
    dicom = pydicom.dcmread(path)
    scaled_pixels = dicom.pixel_array / 2**11
    return cv2.resize(scaled_pixels, (512, 512))

In [None]:
# Example image
# get_img resizes every slice to 512 x 512, whatever its stored size.
# Directory layout: each unique patient id in the train df has its own folder,
# and that folder holds a varying number of image files (some patients have
# more CT slices than others).

ex_img = get_img(f'../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430/4.dcm')

# Show the resized shape and the value range after the 2**11 scaling.
ex_img.shape, np.min(ex_img), np.max(ex_img)

# Linear Decay (based on EfficientNets)

In [None]:
def get_tab(df):
    """Encode the first row of ``df`` as a 4-element tabular feature vector.

    Layout of the returned array:
        [ (Age - 30) / 30, sex flag, smoking bit 1, smoking bit 2 ]

    Only row 0 of ``df`` is read, so callers pass the rows of one patient.
    """
    first_age = df.Age.values[0]
    first_sex = df.Sex.values[0]
    first_smoking = df.SmokingStatus.values[0]

    # NOTE(review): the test is against lowercase 'male', while other cells in
    # this notebook filter Sex on 'Male'/'Female'. With capitalised labels the
    # flag is 1 for every patient. Left unchanged because the loaded weights
    # were presumably trained with this encoding - confirm before fixing.
    sex_flag = 0 if first_sex == 'male' else 1

    # Two-bit smoking code; any unrecognised status falls back to [1, 0].
    smoking_bits = {
        'Never smoked': [0, 0],
        'Ex-smoker': [1, 1],
        'Currently smokes': [0, 1],
    }.get(first_smoking, [1, 0])

    return np.array([(first_age - 30) / 30, sex_flag] + smoking_bits)

In [None]:
# Per-patient linear fit FVC ~ a * Weeks + b.
A = {}    # patient id -> fitted slope a
TAB = {}  # patient id -> tabular feature vector from get_tab
P = []    # patient ids in order of first appearance in `train`
# Iterate the ids directly (no enumerate) so tqdm can show a total.
for p in tqdm(train.Patient.unique()):
    sub = train.loc[train.Patient == p, :]
    fvc = sub.FVC.values
    weeks = sub.Weeks.values
    # Design matrix with columns [weeks, 1] for the least-squares line.
    c = np.vstack([weeks, np.ones(len(weeks))]).T
    # rcond=None selects NumPy's current default cutoff and avoids the
    # FutureWarning raised when the argument is omitted.
    a, b = np.linalg.lstsq(c, fvc, rcond=None)[0]

    A[p] = a
    TAB[p] = get_tab(sub)
    P.append(p)

## CNN for coeff prediction

In [None]:
# NOTE(review): identical redefinition of get_img from an earlier cell; it
# shadows the first definition without changing behaviour.
def get_img(path):
    """Read a DICOM file, divide its pixel array by 2**11 and resize to 512 x 512."""
    d = pydicom.dcmread(path)
    return cv2.resize(d.pixel_array / 2**11, (512, 512))

In [None]:
from tensorflow.keras.utils import Sequence

class IGenerator(Sequence):
    """Keras ``Sequence`` yielding random ``([images, tabular], slopes)`` batches.

    Each batch draws ``batch_size`` patient ids at random (with replacement),
    and for every drawn patient one random slice from that patient's folder.

    Parameters
    ----------
    keys : iterable of patient ids to sample from (ids in ``BAD_ID`` are dropped).
    a : mapping patient id -> regression target.
    tab : mapping patient id -> tabular feature vector.
    batch_size : number of patients drawn per batch.
    """
    # Patients excluded from sampling.
    BAD_ID = ['ID00011637202177653955184', 'ID00052637202186188008618']

    def __init__(self, keys, a, tab, batch_size=32):
        self.keys = [k for k in keys if k not in self.BAD_ID]
        self.a = a
        self.tab = tab
        self.batch_size = batch_size

        # `train` holds several rows per patient, so list each patient's
        # directory once instead of once per row.
        self.train_data = {
            p: os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{p}/')
            for p in train.Patient.unique()
        }

    def __len__(self):
        # Fixed number of batches per epoch; batches are random draws, so this
        # is not tied to the dataset size.
        return 1000

    def __getitem__(self, idx):
        # `idx` is ignored: every call returns a fresh random batch.
        x = []
        a, tab = [], []
        keys = np.random.choice(self.keys, size=self.batch_size)
        for k in keys:
            i = None  # keeps the report below valid if the file draw itself fails
            try:
                i = np.random.choice(self.train_data[k], size=1)[0]
                img = get_img(f'../input/osic-pulmonary-fibrosis-progression/train/{k}/{i}')
                x.append(img)
                a.append(self.a[k])
                tab.append(self.tab[k])
            except Exception as exc:
                # Best effort: report the unreadable sample and return a smaller
                # batch. KeyboardInterrupt/SystemExit are no longer swallowed.
                print(k, i, exc)

        x, a, tab = np.array(x), np.array(a), np.array(tab)
        x = np.expand_dims(x, axis=-1)
        return [x, tab], a

In [None]:
from tensorflow.keras.layers import (
    Dense, Dropout, Activation, Flatten, Input, BatchNormalization, GlobalAveragePooling2D, Add, Conv2D, AveragePooling2D, 
    LeakyReLU, Concatenate 
)
import efficientnet.tfkeras as efn

def get_efficientnet(model, shape):
    """Build the requested EfficientNet backbone without its classification top.

    Parameters
    ----------
    model : one of 'b0' ... 'b7'.
    shape : input shape forwarded as ``input_shape``.

    Returns
    -------
    The backbone built with ``weights=None`` and ``include_top=False``.

    Raises
    ------
    KeyError
        If ``model`` is not one of the supported keys.
    """
    # Map to constructors, not instances: only the requested network is built,
    # instead of instantiating all eight just to return one.
    constructors = {
        'b0': efn.EfficientNetB0,
        'b1': efn.EfficientNetB1,
        'b2': efn.EfficientNetB2,
        'b3': efn.EfficientNetB3,
        'b4': efn.EfficientNetB4,
        'b5': efn.EfficientNetB5,
        'b6': efn.EfficientNetB6,
        'b7': efn.EfficientNetB7,
    }
    return constructors[model](input_shape=shape, weights=None, include_top=False)

def build_model(shape=(512, 512, 1), model_class=None,
                weights_path='../input/effnet-b5-30epochs-1/effnet_30.h5'):
    """Build the image + tabular regression network and load trained weights.

    The image goes through an EfficientNet backbone and global average
    pooling; the 4-element tabular vector gets Gaussian noise; both are
    concatenated, passed through dropout and a single linear unit.

    Parameters
    ----------
    shape : input image shape.
    model_class : EfficientNet variant key passed to ``get_efficientnet``.
    weights_path : weights file to load. NOTE(review): the default file name
        suggests B5 weights, so other ``model_class`` values presumably need a
        matching file - confirm.

    Returns
    -------
    The Keras model with weights loaded.
    """
    inp = Input(shape=shape)
    base = get_efficientnet(model_class, shape)
    x = base(inp)
    x = GlobalAveragePooling2D()(x)
    inp2 = Input(shape=(4,))
    x2 = tf.keras.layers.GaussianNoise(0.2)(inp2)
    x = Concatenate()([x, x2])
    x = Dropout(0.5)(x)
    x = Dense(1)(x)
    model = Model([inp, inp2], x)

    # The earlier lookup in '../input/osic-model-weights' was never used and
    # could fail when that dataset is missing, so it has been dropped.
    model.load_weights(weights_path)
    return model

# Variants to build; other accepted keys are 'b0' ... 'b7'.
model_classes = ['b5'] # ['b0','b1','b2','b3','b4','b5','b6','b7']
# One loaded model per requested variant.
models = [build_model(shape=(512, 512, 1), model_class=m) for m in model_classes]
print('Number of models: ' + str(len(models)))

In [None]:
models[0].output

In [None]:
from sklearn.model_selection import train_test_split 

# NOTE(review): train_size=1 is an integer, which scikit-learn reads as an
# absolute count (one training patient, the rest in vl_p) - confirm this is
# intended. tr_p and vl_p are not referenced again in the visible notebook.
tr_p, vl_p = train_test_split(P, 
                              shuffle=True, 
                              train_size= 1) 

In [None]:
# Distribution of the per-patient fitted FVC slopes stored in A.
# NOTE(review): seaborn has deprecated distplot; histplot/displot is the replacement.
sns.distplot(list(A.values()));

In [None]:
def score(fvc_true, fvc_pred, sigma):
    """NumPy version of the competition metric, returned with positive sign.

    ``sigma`` is floored at 70 and the absolute error is capped at 1000; the
    result is the mean of ``sqrt(2) * error / sigma + log(sqrt(2) * sigma)``,
    so lower values are better. A later cell defines another function with
    the same name for Keras.
    """
    root_two = np.sqrt(2)
    floored_sigma = np.maximum(sigma, 70)
    capped_error = np.minimum(np.abs(fvc_true - fvc_pred), 1000)
    per_row = (capped_error / floored_sigma)*root_two + np.log(floored_sigma* root_two)
    return np.mean(per_row)

# Need to figure out how to do prediction with subset of data

## Split data based on sex to run algorithm on sub populations

In [None]:
# Final DFs for subpopulations
# Final female and male dfs that are the third param to effnet_iter

# Patients whose image folders are excluded from the analysis.
BAD_ID = ['ID00011637202177653955184', 'ID00052637202186188008618']

# For each sex: drop the excluded patients, keep the first row of every
# remaining patient, then draw 37 of them. No random_state is passed, so the
# draw depends on the state of the global NumPy generator at this point.
female_rows = train.loc[(train.Sex == 'Female') & ~train['Patient'].isin(BAD_ID)]
female_df = female_rows.drop_duplicates(subset=['Patient']).sample(n=37)

male_rows = train.loc[(train.Sex == 'Male') & ~train['Patient'].isin(BAD_ID)]
male_df = male_rows.drop_duplicates(subset=['Patient']).sample(n=37)

In [None]:
def make_sub_df(df):
    """List every image file of every patient in ``df`` as (Patient, Weeks) rows.

    For each unique patient the training image folder is listed and one row is
    emitted per file, with ``Weeks`` set to the file name up to its first dot
    (kept as a string).
    NOTE(review): ``Weeks`` therefore holds the image file number, which is
    later joined against clinical week numbers - confirm this is intended.

    Returns
    -------
    DataFrame with columns ['Patient', 'Weeks'], sorted by both (string order).
    """
    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for p in tqdm(df.Patient.unique()):
        ldir = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{p}/')
        records.extend({'Patient': p, 'Weeks': str(i.split('.')[0])} for i in ldir)

    tempy = pd.DataFrame(records, columns=['Patient', 'Weeks'])
    return tempy.sort_values(by=['Patient', 'Weeks'], ascending=[True, True])


In [None]:
# One (Patient, file number) row per image file, for each sampled cohort.
tempy_female = make_sub_df(female_df.copy())
tempy_male = make_sub_df(male_df.copy())

In [None]:
def make_sub_df_2(train, merge_df):
    """Inner-join ``train`` with ``merge_df`` on (Patient, Weeks) and reshape.

    ``Weeks`` is compared as text on both sides. Neither input frame is
    modified: the string conversion is done on temporary copies, so an error
    part-way through can no longer leave ``train`` with string weeks.

    Returns
    -------
    DataFrame with columns:
        Patient_Week      - "<Patient>_<Weeks>"
        FVC_actual        - ``train.FVC``
        Confidence_actual - ``train.Percent`` (renamed only; it is the Percent column)
    """
    # Weeks must be strings on both sides for the join keys to match.
    left = train.assign(Weeks=train['Weeks'].apply(str))
    right = merge_df.assign(Weeks=merge_df['Weeks'].apply(str))

    new_sub = pd.merge(left=left, right=right, on=['Patient','Weeks'], how='inner')

    new_sub['Patient_Week'] = new_sub['Patient'].astype(str)+'_'+new_sub['Weeks'].astype(str)

    new_sub = new_sub[['Patient_Week', 'FVC', 'Percent']]
    new_sub = new_sub.rename(columns={'Percent': 'Confidence_actual', 'FVC': 'FVC_actual'})

    return new_sub

In [None]:
# Final female and male sub dfs that are the second param to effnet_iter

# Ground-truth rows (Patient_Week, FVC_actual, Confidence_actual) per cohort.
female_sub = make_sub_df_2(train, tempy_female)
male_sub = make_sub_df_2(train, tempy_male)

In [None]:
def effnet_iter(models, sub, train):
    """Predict FVC and Confidence for every row of ``sub`` with each model.

    Parameters
    ----------
    models : iterable of Keras models taking ``[images, tabular]``.
    sub : DataFrame with a ``Patient_Week`` column ("<patient>_<week>"), e.g.
        female_sub or male_sub. It is filled in place with ``FVC`` and
        ``Confidence`` columns.
    train : DataFrame with one row per patient (e.g. female_df or male_df)
        providing Patient, Weeks, FVC, Percent and the tabular columns read
        by ``get_tab``.

    Returns
    -------
    list of DataFrames, one copy of the filled ``sub`` per model.

    Notes
    -----
    Patients with at most one usable image are skipped. Their rows in ``sub``
    are left unfilled rather than raising a KeyError in the fill loop.
    """
    subs = []
    for model in models:
        q = 0.5  # quantile of the per-slice predictions kept as the patient slope

        A_test, B_test, P_test, WEEK = {}, {}, {}, {}
        for p in tqdm(train.Patient.unique()):
            patient_mask = train.Patient == p
            ldir = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{p}/')
            # The tabular vector is the same for every slice of a patient, so
            # build it once instead of once per image.
            patient_tab = get_tab(train.loc[patient_mask, :])

            x = []
            tab = []
            for i in ldir:
                # Keep slices whose numeric file name, divided by the number of
                # files in the folder, lies strictly between -0.1 and 1.1.
                if int(i[:-4]) / len(ldir) < 1.1 and int(i[:-4]) / len(ldir) > -0.1:
                    x.append(get_img(f'../input/osic-pulmonary-fibrosis-progression/train/{p}/{i}'))
                    tab.append(patient_tab)
            if len(x) <= 1:
                continue
            # tab: tabular features repeated once per kept image
            tab = np.array(tab)
            # x: image stack with a trailing channel axis
            x = np.expand_dims(x, axis=-1)
            _a = model.predict([x, tab])
            # a is the q-quantile (median) of the per-slice predictions
            a = np.quantile(_a, q)

            # Slope kept for the patient.
            A_test[p] = a
            # Intercept: the patient's FVC minus slope times the week it was measured.
            B_test[p] = train.FVC.values[patient_mask] - a*train.Weeks.values[patient_mask]
            # Percent value of the patient's row.
            P_test[p] = train.Percent.values[patient_mask]
            # Week of the patient's row.
            WEEK[p] = train.Weeks.values[patient_mask]

        # Fill every Patient_Week row from the per-patient line.
        for k in sub.Patient_Week.values:
            p, w = k.split('_')
            if p not in A_test:
                # Patient was skipped above (too few images): nothing to fill.
                continue
            w = int(w)

            fvc = A_test[p] * w + B_test[p]
            sub.loc[sub.Patient_Week == k, 'FVC'] = fvc
            # Confidence: Percent minus slope times the distance in weeks.
            sub.loc[sub.Patient_Week == k, 'Confidence'] = (
                P_test[p] - A_test[p] * abs(WEEK[p] - w)
            )

        _sub = sub.copy()
        subs.append(_sub)
    return subs

In [None]:
# Predictions for the female cohort; one frame per model in `models`.
fem_output = effnet_iter(models, female_sub.copy(), female_df.copy())

In [None]:
fem_output[0]

In [None]:
# Predictions for the male cohort; one frame per model in `models`.
male_output = effnet_iter(models, male_sub.copy(), male_df.copy())

In [None]:
male_output[0]

## Eval Metric

The error is thresholded at 1000 ml to avoid large errors adversely penalizing results, while the confidence values are clipped at 70 ml to reflect the approximate measurement uncertainty in FVC. The final score is calculated by averaging the metric across all test set Patient_Weeks (three per patient). Note that metric values will be negative and higher is better.

In [None]:
def eval_metric(df):
    """Mean modified Laplace log likelihood over the rows of ``df``.

    Reads ``FVC_actual`` (truth), ``FVC`` (prediction) and ``Confidence_actual``
    (used as sigma). Sigma is floored at 70 and the absolute error is capped
    at 1000. Values are negative; higher is better.

    The floor was previously applied with ``min``, which capped sigma at 70
    instead of raising small values to 70.

    NOTE(review): sigma is taken from ``Confidence_actual`` rather than the
    predicted ``Confidence`` column - confirm this is intended.
    """
    sigma_clipped = np.maximum(np.asarray(df['Confidence_actual'], dtype=float), 70)
    error = np.asarray(df['FVC_actual'], dtype=float) - np.asarray(df['FVC'], dtype=float)
    delta = np.minimum(np.abs(error), 1000)

    metric = -np.sqrt(2) * delta / sigma_clipped - np.log(np.sqrt(2) * sigma_clipped)
    return np.mean(metric)



In [None]:
# Metric per cohort, computed on the first model's predictions.
female_score = eval_metric(fem_output[0])
male_score = eval_metric(male_output[0])

In [None]:
female_score, male_score

In [None]:
## evaluation metric function
def laplace_log_likelihood(actual_fvc, predicted_fvc, confidence, return_values = False):
    """
    Modified Laplace Log Likelihood used as the competition score.

    ``confidence`` is floored at 70 and the absolute error is capped at 1000.
    With ``return_values=True`` the per-row values are returned; otherwise
    their mean. Values are negative and higher is better.
    """
    root_two = np.sqrt(2)
    sigma = np.maximum(confidence, 70)
    abs_error = np.minimum(np.abs(actual_fvc - predicted_fvc), 1000)
    per_row = - root_two * abs_error / sigma - np.log(root_two * sigma)

    return per_row if return_values else np.mean(per_row)


## default benchmark
# Baseline: predict the cohort's mean actual FVC for every row and use the
# Confidence_actual column (the renamed Percent) as sigma.
# male_ll_default = laplace_log_likelihood(male_output[0].FVC_actual, np.mean(male_output[0].FVC_actual), np.std(male_output[0].FVC_actual))
# female_ll_default = laplace_log_likelihood(fem_output[0].FVC_actual, np.mean(fem_output[0].FVC_actual), np.std(fem_output[0].FVC_actual))

male_ll_default = laplace_log_likelihood(male_output[0].FVC_actual, np.mean(male_output[0].FVC_actual), male_output[0].Confidence_actual)
female_ll_default = laplace_log_likelihood(fem_output[0].FVC_actual, np.mean(fem_output[0].FVC_actual), fem_output[0].Confidence_actual)

# Model score: predicted FVC with the predicted Confidence as sigma.
male_ll_pred = laplace_log_likelihood(male_output[0].FVC_actual, male_output[0].FVC, male_output[0].Confidence)
female_ll_pred = laplace_log_likelihood(fem_output[0].FVC_actual, fem_output[0].FVC, fem_output[0].Confidence)

In [None]:
female_ll_default, male_ll_default

In [None]:
female_ll_pred, male_ll_pred

## Original Implementation

In [None]:
# Inference on the competition test set: one filled submission frame per model.
subs = []
for model in models:

    # Quantile of the per-slice predictions kept as the patient slope (median).
    q = 0.5

    sub = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv') 
    test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv') 
    # NOTE(review): W, FVC and STD are created but never used in this cell.
    A_test, B_test, P_test,W, FVC= {}, {}, {},{},{} 
    STD, WEEK = {}, {} 
    for p in tqdm(test.Patient.unique()):
        x = [] 
        tab = [] 
        ldir = os.listdir(f'../input/osic-pulmonary-fibrosis-progression/test/{p}/')
        for i in ldir:
            # Keep slices whose numeric file name, divided by the number of
            # files in the folder, lies strictly between -0.1 and 1.1.
            if int(i[:-4]) / len(ldir) < 1.1 and int(i[:-4]) / len(ldir) > -0.1:
                x.append(get_img(f'../input/osic-pulmonary-fibrosis-progression/test/{p}/{i}')) 
                tab.append(get_tab(test.loc[test.Patient == p, :])) 
        # Patients with at most one usable slice get no prediction.
        # NOTE(review): the fill loop below indexes A_test[p] for every
        # submission row, so a skipped patient would raise KeyError there.
        if len(x) <= 1:
            continue
        # tab: tabular features repeated once per kept image of the patient
        tab = np.array(tab) 
        # x: image stack of the patient with a trailing channel axis
        x = np.expand_dims(x, axis=-1) 
        _a = model.predict([x, tab])
        # a is the q-quantile (median) of the per-slice predictions _a
        a = np.quantile(_a, q)

        # Slope kept for the patient.
        A_test[p] = a
        # Intercept: the patient's FVC minus the slope times the week it was
        # measured.
        B_test[p] = test.FVC.values[test.Patient == p] - a*test.Weeks.values[test.Patient == p]
        # Percent value found in the test df for the patient.
        P_test[p] = test.Percent.values[test.Patient == p] 
        # Week of the patient's test row.
        WEEK[p] = test.Weeks.values[test.Patient == p]
        

    # Fill every Patient_Week row of the sample submission from the
    # per-patient line fitted above.
    for k in sub.Patient_Week.values:
        p, w = k.split('_')
        w = int(w) 

        fvc = A_test[p] * w + B_test[p]
        sub.loc[sub.Patient_Week == k, 'FVC'] = fvc
        # Confidence: Percent minus slope times the distance in weeks.
        sub.loc[sub.Patient_Week == k, 'Confidence'] = (
            P_test[p] - A_test[p] * abs(WEEK[p] - w) 
    ) 

    _sub = sub[["Patient_Week","FVC","Confidence"]].copy()
    subs.append(_sub)

## Averaging Predictions

In [None]:
# Equal-weight average of the per-model submissions.
N = len(subs)
sub = subs[0].copy() # ref: supplies Patient_Week and the row order
sub["FVC"] = 0
sub["Confidence"] = 0
# Add each model's own frame; the loop previously added subs[0] N times,
# which ignored every model but the first.
for model_sub in subs:
    sub["FVC"] += model_sub["FVC"] * (1/N)
    sub["Confidence"] += model_sub["Confidence"] * (1/N)

In [None]:
sub.head()

In [None]:
# Persist the image-model predictions.
sub[["Patient_Week","FVC","Confidence"]].to_csv("submission_img.csv", index=False)

In [None]:
# Keep a copy for the final blend; `sub` is reassigned in the next section.
img_sub = sub[["Patient_Week","FVC","Confidence"]].copy()

# Osic-Multiple-Quantile-Regression

In [None]:
ROOT = "../input/osic-pulmonary-fibrosis-progression"
BATCH_SIZE=128

tr = pd.read_csv(f"{ROOT}/train.csv")
# keep=False removes every row of a duplicated (Patient, Weeks) pair, not just the repeats.
tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
# Split "<patient>_<week>" into its two parts.
sub['Patient'] = sub['Patient_Week'].apply(lambda x:x.split('_')[0])
sub['Weeks'] = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub =  sub[['Patient','Weeks','Confidence','Patient_Week']]
# Attach the test.csv columns to every submission row, except test.csv's own
# Weeks so the requested week is kept.
sub = sub.merge(chunk.drop('Weeks', axis=1), on="Patient")

In [None]:
# Tag each frame with its origin so they can be separated again after the
# shared feature engineering.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# pd.concat stacks the frames in the same order and keeps their indices,
# like the DataFrame.append call it replaces (append was removed in pandas 2.0).
data = pd.concat([tr, chunk, sub])

In [None]:
print(tr.shape, chunk.shape, sub.shape, data.shape)
print(tr.Patient.nunique(), chunk.Patient.nunique(), sub.Patient.nunique(), 
      data.Patient.nunique())
#

In [None]:
# Earliest week per patient, ignoring the 'test' rows (blanked to NaN first so
# they do not take part in the minimum).
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
# FVC at each patient's earliest week, one row per patient.
base = data.loc[data.Weeks == data.min_week]
base = base[['Patient','FVC']].copy()
base.columns = ['Patient','min_FVC']
# Running count within each patient; keeping count == 1 keeps the first row only.
base['nb'] = 1
base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
base = base[base.nb==1]
base.drop('nb', axis=1, inplace=True)

In [None]:
# Attach the baseline FVC to every row and express weeks relative to the
# patient's earliest week.
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base

In [None]:
# One-hot encode the categorical columns: each distinct value becomes a 0/1
# column named after the value, and that name is recorded in FE.
COLS = ['Sex','SmokingStatus'] #,'Age'
FE = []
for col in COLS:
    for mod in data[col].unique():
        FE.append(mod)
        data[mod] = (data[col] == mod).astype(int)

In [None]:
# Min-max scale the numeric features to [0, 1]. The min and max are taken over
# `data`, i.e. the train, val and test rows together.
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
data['BASE'] = (data['min_FVC'] - data['min_FVC'].min() ) / ( data['min_FVC'].max() - data['min_FVC'].min() )
data['week'] = (data['base_week'] - data['base_week'].min() ) / ( data['base_week'].max() - data['base_week'].min() )
#data['percent'] = (data['Percent'] - data['Percent'].min() ) / ( data['Percent'].max() - data['Percent'].min() )
FE += ['age','week','BASE']

In [None]:
# Split the engineered frame back into its three parts. Explicit copies make
# each part an independent frame, so the prediction columns added to `sub`
# later are not written to a slice of `data` (SettingWithCopyWarning).
tr = data.loc[data.WHERE=='train'].copy()
chunk = data.loc[data.WHERE=='val'].copy()
sub = data.loc[data.WHERE=='test'].copy()
del data

In [None]:
tr.shape, chunk.shape, sub.shape

In [None]:
# Sigma floor (70) and absolute-error cap (1000) of the competition metric.
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")

def score(y_true, y_pred):
    """Keras metric: competition score with positive sign (lower is better).

    ``y_pred`` holds three columns; column 1 is used as the FVC prediction and
    column 2 minus column 0 as sigma. ``y_true[:, 0]`` is the target.
    """
    # Keep the cast results: the calls used to be discarded, so no cast was
    # actually applied inside this function.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    #sigma_clip = sigma + C1
    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt( tf.dtypes.cast(2, dtype=tf.float32) )
    metric = (delta / sigma_clip)*sq2 + tf.math.log(sigma_clip* sq2)
    return K.mean(metric)

def qloss(y_true, y_pred):
    """Pinball (quantile) loss averaged over the quantiles 0.2, 0.5 and 0.8.

    The three columns of ``y_pred`` are scored against the 0.2, 0.5 and 0.8
    quantiles respectively.
    """
    quantiles = tf.constant(np.array([[0.2, 0.50, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    pinball = tf.maximum(quantiles*residual, (quantiles-1)*residual)
    return K.mean(pinball)

def mloss(_lambda):
    """Return a loss blending ``qloss`` (weight ``_lambda``) with ``score`` (weight ``1 - _lambda``)."""
    score_weight = 1 - _lambda

    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = score_weight*score(y_true, y_pred)
        return pinball_part + metric_part

    return loss

def make_model(nh):
    """Build and compile the tabular quantile-regression network.

    ``nh`` is the number of input features. Two dense ReLU layers of 100 units
    feed two 3-unit heads: ``p1`` (linear) and ``p2`` (ReLU). The output is
    ``p1 + cumsum(p2)`` along the last axis; since ``p2`` is non-negative the
    three added offsets are non-decreasing.
    """
    z = L.Input((nh,), name="Patient")
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="relu", name="d2")(x)
    #x = L.Dense(100, activation="relu", name="d3")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis=1), 
                     name="preds")([p1, p2])
    
    # The model is named "CNN" although it contains only dense layers.
    model = M.Model(z, preds, name="CNN")
    #model.compile(loss=qloss, optimizer="adam", metrics=[score])
    # NOTE(review): `lr` and `decay` are legacy Adam arguments; newer Keras
    # releases may reject them - confirm against the installed version.
    model.compile(loss=mloss(0.8), optimizer=tf.keras.optimizers.Adam(lr=0.1, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False), metrics=[score])
    return model

In [None]:
y = tr['FVC'].values    # training target
z = tr[FE].values       # training features
ze = sub[FE].values     # features of the rows to predict
nh = z.shape[1]         # number of input features
pe = np.zeros((ze.shape[0], 3))   # fold-averaged predictions for `sub`
pred = np.zeros((z.shape[0], 3))  # out-of-fold predictions for `tr`

In [None]:
net = make_model(nh)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
net.summary()
print(net.count_params())

In [None]:
# NOTE(review): KFold splits rows, not patients, so rows of the same patient
# can land in both the training and validation part of a fold - consider a
# grouped split if patient-level validation is wanted.
NFOLD = 2 # originally 5
kf = KFold(n_splits=NFOLD)

In [None]:
%%time
# Cross-validated training: a fresh network per fold, out-of-fold predictions
# written to `pred`, fold-averaged predictions accumulated in `pe`.
# NOTE(review): `pe` is accumulated with +=, so re-running this cell without
# re-running the cell that zeroes `pe` adds to the previous totals.
cnt = 0
EPOCHS = 600
for tr_idx, val_idx in kf.split(z):
    cnt += 1
    print(f"FOLD {cnt}")
    net = make_model(nh)
    net.fit(z[tr_idx], y[tr_idx], batch_size=BATCH_SIZE, epochs=EPOCHS, 
            validation_data=(z[val_idx], y[val_idx]), verbose=0) #
    print("train", net.evaluate(z[tr_idx], y[tr_idx], verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z[val_idx], y[val_idx], verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z[val_idx], batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    # Each fold contributes 1/NFOLD of the final prediction.
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD

In [None]:
# Mean absolute error of the out-of-fold middle-quantile prediction.
sigma_opt = mean_absolute_error(y, pred[:, 1])
# Predicted spread: upper-quantile minus lower-quantile output.
unc = pred[:,2] - pred[:, 0]
sigma_mean = np.mean(unc)
print(sigma_opt, sigma_mean)

In [None]:
# Compare 100 randomly chosen training rows against their out-of-fold
# predictions. The three outputs are trained against the quantiles 0.2, 0.5
# and 0.8 in qloss, so the legend uses those levels.
idxs = np.random.randint(0, y.shape[0], 100)
plt.plot(y[idxs], label="ground truth")
plt.plot(pred[idxs, 0], label="q20")
plt.plot(pred[idxs, 1], label="q50")
plt.plot(pred[idxs, 2], label="q80")
plt.legend(loc="best")
plt.show()

In [None]:
print(unc.min(), unc.mean(), unc.max(), (unc>=0).mean())

In [None]:
plt.hist(unc)
plt.title("uncertainty in prediction")
plt.show()

In [None]:
sub.head()

In [None]:
# PREDICTION
# Middle output as the FVC estimate; upper minus lower output as the confidence.
sub['FVC1'] = 1.*pe[:, 1]
sub['Confidence1'] = pe[:, 2] - pe[:, 0]
subm = sub[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()
subm.loc[~subm.FVC1.isnull()].head(10)

In [None]:
# Copy the model outputs into the submission columns wherever a prediction exists.
subm.loc[~subm.FVC1.isnull(),'FVC'] = subm.loc[~subm.FVC1.isnull(),'FVC1']
# NOTE(review): sigma_mean is overwritten with 60 and then compared with
# itself, so the condition is always False and only the else branch runs
# (sigma_opt is never used). The intended threshold is unclear - confirm.
sigma_mean = 60
if sigma_mean<sigma_mean:
    subm['Confidence'] = sigma_opt
else:
    subm.loc[~subm.FVC1.isnull(),'Confidence'] = subm.loc[~subm.FVC1.isnull(),'Confidence1']

In [None]:
subm.head()

In [None]:
subm.describe().T

In [None]:
# Rows whose Patient_Week matches a measurement in test.csv get that measured
# FVC and a Confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for patient, week, measured_fvc in zip(otest.Patient, otest.Weeks, otest.FVC):
    is_known = subm['Patient_Week']==patient+'_'+str(week)
    subm.loc[is_known, 'FVC'] = measured_fvc
    subm.loc[is_known, 'Confidence'] = 0.1

In [None]:
# Persist the quantile-regression predictions.
subm[["Patient_Week","FVC","Confidence"]].to_csv("submission_regression.csv", index=False)

In [None]:
# Copy kept for the final blend.
reg_sub = subm[["Patient_Week","FVC","Confidence"]].copy()

# Ensemble (Simple Blend)

In [None]:
img_sub

In [None]:
reg_sub

In [None]:
# Sort both submissions by Patient_Week and reset the index so they can be
# combined row by row.
# NOTE(review): this assumes both frames contain exactly the same Patient_Week
# values - confirm, since the blend below aligns on position only.
df1 = img_sub.sort_values(by=['Patient_Week'], ascending=True).reset_index(drop=True)
df2 = reg_sub.sort_values(by=['Patient_Week'], ascending=True).reset_index(drop=True)

In [None]:
# Weighted blend: 0.45 image model + 0.55 quantile regression, for both the
# FVC and the Confidence columns.
df = df1[['Patient_Week']].copy()
df['FVC'] = (0.45*df1['FVC'] + 0.55*df2['FVC'])
df['Confidence'] = (0.45*df1['Confidence'] + 0.55*df2['Confidence'])
df.head()

In [None]:
# Final blended submission file.
df.to_csv('submission.csv', index=False)

# Test Code JIC

In [None]:
# Every (patient, week) pair for the sampled female patients and weeks -12..133.
# The records are built first and the frame is created once: appending row by
# row is quadratic, and DataFrame.append was removed in pandas 2.0.
tempy_hehe = pd.DataFrame(
    [
        {'Patient': i, 'Weeks': j}
        for i in female_df.Patient
        for j in range(-12, 133 + 1)
    ],
    columns=['Patient', 'Weeks'],
)

In [None]:
# Order by week, then build the "<patient>_<week>" key column.
tempy_hehe = tempy_hehe.sort_values(by=['Weeks'], ascending=[True])

tempy_hehe['Patient_Week'] = tempy_hehe['Patient'].astype(str)+'_'+tempy_hehe['Weeks'].astype(str)

In [None]:
# NOTE(review): the same assignment appears twice; the second line was
# presumably meant to initialise another column (e.g. 'Confidence') - confirm.
tempy_hehe['FVC'] = 0 
tempy_hehe['FVC'] = 0 

In [None]:
tempy_hehe[['Patient_Week']]

In [None]:
# This was the original way of building the sub df. It is not sufficient
# because it does not cover all the image files of a patient: that requires
# listing each patient's directory and collecting every image.

# Empty frame with the target columns; the assignments below fill three of
# them from female_df (FVC and Confidence are left unfilled).
temp_fem = pd.DataFrame(columns=['Patient_Week', 'FVC', 'Confidence', 'FVC_actual', 'Percent_actual'])

temp_fem['Patient_Week'] = female_df['Patient'].astype(str)+'_'+female_df['Weeks'].astype(str)
temp_fem['FVC_actual'] = female_df['FVC']
temp_fem['Percent_actual'] = female_df['Percent']

# Same construction for the male cohort.
temp_male = pd.DataFrame(columns=['Patient_Week', 'FVC', 'Confidence', 'FVC_actual', 'Percent_actual'])

temp_male['Patient_Week'] = male_df['Patient'].astype(str)+'_'+male_df['Weeks'].astype(str)
temp_male['FVC_actual'] = male_df['FVC']
temp_male['Percent_actual'] = male_df['Percent']
