## OSIC Pulmonary Fibrosis Progression

**Data Provided**
- train.csv : Baseline CT Scan and entire history of FVC
- test.csv  : Baseline CT and Initial FVC Measurement
- train/    : Baseline CT scan in DICOM format
- test/     : Baseline CT Scan in DICOM format


**Commit 5**
- Folds 10

In [None]:
import os
import numpy as np
import pandas as pd
import random
import math

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import mean_squared_error
import category_encoders as ce

from sklearn.linear_model import Ridge, ElasticNet
from functools import partial
import scipy as sp

import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px

In [None]:
def seed_everything(seed=42):
    """Seed the random sources used in this notebook.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard ``random`` module and ``numpy.random``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
# Notebook-wide configuration.
# NOTE(review): OUTPUT_DICT is presumably the output directory; it is not
# referenced anywhere in the visible code — confirm before relying on it.
OUTPUT_DICT = './'

# Name of the "<Patient>_<week>" identifier column of the submission file.
ID = 'Patient_Week'
# Column predicted by the first model stage (reassigned later for the second stage).
TARGET = 'FVC'
# Seed passed to seed_everything below.
SEED = 2020
seed_everything(seed=SEED)

# Number of GroupKFold splits used for cross-validation.
N_FOLD = 10

In [None]:
# Load the competition tables; `test_a` keeps the raw test rows, which are
# reused later for the training pairs and for the final submission.
train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test_a = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

In [None]:
train.head(8)

In [None]:
test_a.head(8)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called as a statement; wrapping it in print() would emit a stray "None"
# after the summary and put the label below the data it describes.
print('Training Data:')
train.info()
print('\n')

print('Testing Data:')
test_a.info()

### Training Data

In [None]:
# Histogram of the Sex column of the training table.
# NOTE: every row is counted, so a patient with several rows contributes
# several times.
fig = px.histogram(train, x="Sex", title="Patient Count in Training Dataset")
fig.show()

In [None]:
# Histogram of the SmokingStatus column of the training table.
fig = px.histogram(
    train,
    x="SmokingStatus",
    title="Ex-Smoker , Never Smoked, Present Smoker",
)
fig.show()

In [None]:
# Age distribution over the rows of the training table.
fig = px.histogram(train, x="Age")
# The title previously duplicated the one of the Sex histogram.
fig.update_layout(title_text="Age Distribution in Training Dataset")
fig.show()

In [None]:
# Horizontal histogram of Sex, with bars split by Age.
fig = px.histogram(train, y="Sex", color="Age", title="Affected Patient wr Age")
fig.show()

In [None]:
# Histogram of Age, with bars split by SmokingStatus.
fig = px.histogram(
    train,
    x="Age",
    color="SmokingStatus",
    title="Age wr Smoking Status",
)
fig.show()

In [None]:
# Area chart of Percent over Weeks, coloured by SmokingStatus.
# (The unrelated gapminder sample dataset that used to be loaded here was
# never used and has been removed.)
fig = px.area(train, x="Weeks", y="Percent", color="SmokingStatus")
fig.update_layout(title_text="Percent Affected wr Weeks and Smoking Status")
fig.show()

In [None]:
# Scatter of Percent against Weeks. Passing the frame plus column names
# (as the other plots do) labels the axes "Weeks"/"Percent" instead of the
# generic "x"/"y" produced by bare Series arguments.
fig = px.scatter(train, x="Weeks", y="Percent")

fig.update_layout(title_text="Weeks vs Percent")

fig.show()

In [None]:
# Histogram of FVC, with bars split by Sex.
fig = px.histogram(train, x="FVC", color="Sex", title="FVC wr Gender")
fig.show()

In [None]:
# Histogram of FVC, with bars split by SmokingStatus.
fig = px.histogram(
    train,
    x="FVC",
    color="SmokingStatus",
    title="FVC wr Smoking Status",
)
fig.show()

In [None]:
train.columns

In [None]:
# Parallel-categories diagram over the seven listed training columns.
# NOTE(review): no `color` column is passed, so the continuous colour scale
# presumably has no visible effect — confirm.
parallel_diagram = train.loc[
    :, ['Weeks', 'Patient', 'FVC', 'Percent', 'Age', 'Sex', 'SmokingStatus']
]

fig = px.parallel_categories(
    parallel_diagram,
    color_continuous_scale=px.colors.sequential.Inferno,
    title='Parallel category diagram on trainset',
)
fig.show()

### Constructing Training Input
The training and test datasets share the same columns, so the test rows (only 5 entries) are concatenated onto the training data to add a little more information.

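- Group the rows by `Patient`.
- For each visit of a patient taken as the baseline, rename `Weeks`, `FVC` and `Age` to `base_Week`, `base_FVC` and `base_Age`.
- Drop `Age`, `Sex`, `SmokingStatus` and `Percent` from the rows to be predicted and rename their `Weeks` to `predict_Week`.
- Compute `Week_passed = predict_Week - base_Week`.
- Replace the training dataset with these pairs, excluding those where `Week_passed` is 0.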

In [None]:
# Append the test rows so that their measurements also take part in the
# (base visit, predicted visit) pairs built below.
train = pd.concat([train, test_a])

# Loop invariants, built once instead of once per iteration.
rename_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Age': 'base_Age'}
drop_cols = ['Age', 'Sex', 'SmokingStatus', 'Percent']

# One group per patient, holding all rows of that patient
# (groupby alone does not aggregate anything).
gb = train.groupby('Patient')

# Per-(patient, base week) frames are collected here and concatenated once
# at the end. Growing a DataFrame with pd.concat inside the loops re-copies
# all accumulated rows every iteration and concatenates with empty frames.
pair_frames = []

# tqdm only adds a progress bar over the patients.
tk0 = tqdm(gb, total=len(gb))

for _, usr_df in tk0:
    # Rows to predict: every visit of the patient, reduced to
    # Patient / predict_Week / FVC.
    predict_rows = usr_df.drop(columns=drop_cols).rename(columns={'Weeks': 'predict_Week'})

    for week, tmp in usr_df.groupby("Weeks"):
        # Treat the visit(s) of this week as the baseline ...
        tmp = tmp.rename(columns=rename_cols)

        # ... and pair it with every visit of the same patient.
        _usr_output = predict_rows.merge(tmp, on='Patient')

        _usr_output['Week_passed'] = _usr_output['predict_Week'] - _usr_output['base_Week']

        pair_frames.append(_usr_output)

output = pd.concat(pair_frames)

# A visit paired with its own week carries no progression information.
train = output[output['Week_passed'] != 0].reset_index(drop=True)

In [None]:
output

### Constructing Testing Input
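- Rename the baseline columns of the test dataset `test_a` (`Weeks`, `FVC`, `Age`) and keep the result as `test`
- From the sample submission, split each `Patient_Week` ID into the patient ID and the week
- Use that week as `predict_Week`
- Drop the `FVC` and `Confidence` columns of the submission and merge it with `test` on `Patient`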

For the test set, `Week_passed = predict_Week - base_Week`.

In [None]:
# Sample submission: one row per "<Patient>_<week>" id to be predicted.
submission = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")

# Split every id on '_' once; the first piece is the patient id, the second
# piece is the week to predict.
id_parts = submission['Patient_Week'].apply(lambda patient_week: patient_week.split('_'))
submission['Patient'] = id_parts.apply(lambda parts: parts[0])
submission['predict_Week'] = id_parts.apply(lambda parts: parts[1]).astype(int)

# Attach each patient's baseline measurement (renamed to the base_* columns)
# to every requested prediction row.
test = (
    submission.drop(columns=["FVC", "Confidence"])
    .merge(
        test_a.rename(columns={'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Age': 'base_Age'}),
        on='Patient',
    )
)

test['Week_passed'] = test['predict_Week'] - test['base_Week']

test = test.set_index('Patient_Week')

### Folds Preparation

In [None]:
# Assign every training row to one of N_FOLD validation folds, grouped by
# patient so that all rows of a patient land in the same fold.
folds = train[['Patient', TARGET]].copy()
Fold = GroupKFold(n_splits=N_FOLD)
groups = folds['Patient'].values

# split() yields positional indices, so they are written into a NumPy array
# by position rather than used as .loc labels. This also keeps the column
# integer-typed from the start (no NaN/float round trip).
fold_ids = np.zeros(len(folds), dtype=int)
for n, (_, val_index) in enumerate(Fold.split(folds, folds[TARGET], groups)):
    fold_ids[val_index] = n
folds['fold'] = fold_ids

### Building Model

In [None]:
def run_single_model(clf, train_df, test_df, folds, features, target, fold_num=0):
    """Fit ``clf`` on all folds except ``fold_num`` and predict the rest.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit
        in place.
    train_df, test_df : DataFrames containing the ``features`` columns.
    folds : DataFrame with a ``fold`` column. Its index values are used as
        positions into ``train_df`` / ``target`` (assumes a default
        0..n-1 index — TODO confirm for other callers).
    features : list of column names fed to the model.
    target : Series of training targets.
    fold_num : fold held out for validation.

    Returns
    -------
    (oof, predictions) : ``oof`` has ``len(train_df)`` entries that are zero
        except for the held-out rows; ``predictions`` has one entry per row
        of ``test_df``.
    """
    trn_idx = folds[folds.fold != fold_num].index
    val_idx = folds[folds.fold == fold_num].index

    X_tr = train_df.iloc[trn_idx][features].values
    y_tr = target.iloc[trn_idx].values
    X_val = train_df.iloc[val_idx][features].values

    clf.fit(X_tr, y_tr)

    oof = np.zeros(len(train_df))
    oof[val_idx] = clf.predict(X_val)

    # Predict on a plain array, exactly like the data the model was fitted
    # on; passing the DataFrame here mixed named and unnamed feature inputs.
    predictions = np.asarray(clf.predict(test_df[features].values), dtype=float)
    return oof, predictions

In [None]:
def run_kfold_model(clf, train, test, folds, features, target, n_fold=9):
    """Run ``run_single_model`` for folds ``0 .. n_fold-1`` and combine them.

    Parameters
    ----------
    clf : estimator passed through to ``run_single_model`` (refit per fold).
    train, test : DataFrames containing the ``features`` columns.
    folds : DataFrame with a ``fold`` column assigning training rows to folds.
    features : list of column names fed to the model.
    target : Series of training targets.
    n_fold : number of folds to iterate over. Rows whose fold id is not in
        ``range(n_fold)`` keep an out-of-fold value of 0.

    Returns
    -------
    (oof, predictions) : the summed out-of-fold predictions for ``train``
        and the test predictions averaged over the ``n_fold`` models.
    """
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))

    for fold_ in range(n_fold):
        _oof, _predictions = run_single_model(clf, train, test, folds, features, target, fold_num=fold_)

        # Each fold fills only its own validation rows, so summing merges them.
        oof += _oof
        predictions += _predictions / n_fold

    return oof, predictions

### FVC Prediction

In [None]:
target = train[TARGET]
# Add an all-NaN target column to the test frame
# (presumably so it has the same columns as train for the encoder — confirm).
test[TARGET] = np.nan

# Feature lists: the two categorical columns plus every non-object column of
# the test frame, minus the columns that must not be fed to the model.
cat_features = ['Sex', 'SmokingStatus']
num_features = [
    c for c in test.columns
    if test.dtypes[c] != 'object' and c not in cat_features
]

features = num_features + cat_features
drop_features = [TARGET, 'predict_Week', 'Percent', 'base_Week']
features = [c for c in features if c not in drop_features]

# Ordinal-encode the categorical columns, fitting on train only.
if cat_features:
    ce_oe = ce.OrdinalEncoder(cols=cat_features, handle_unknown='impute')
    ce_oe.fit(train)
    train = ce_oe.transform(train)
    test = ce_oe.transform(test)

In [None]:
for alpha1 in [0.3]:
    for l1s in [0.8]:

        print(" For alpha:",alpha1,"& l1_ratio:",l1s)
        clf = ElasticNet(alpha=alpha1, l1_ratio=l1s)
        oof, predictions = run_kfold_model(clf, train, test, folds, features, target, n_fold=N_FOLD)

        train['FVC_pred'] = oof
        test['FVC_pred'] = predictions

        # Baseline score: the same confidence (100) for every row.
        # Confidence is floored at 70 and the absolute error capped at 1000
        # before the per-row score is computed.
        train['Confidence'] = 100
        train['sigma_clipped'] = train['Confidence'].clip(lower=70)
        train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
        train['delta'] = train['diff'].clip(upper=1000)
        train['score'] = -math.sqrt(2)*train['delta']/train['sigma_clipped'] - np.log(math.sqrt(2)*train['sigma_clipped'])
        score = train['score'].mean()
        print(score)

        def loss_func(weight, row):
            """Return the negated per-row score when ``weight`` is the confidence."""
            sigma = max(weight, 70)
            abs_err = abs(row['FVC'] - row['FVC_pred'])
            capped_err = min(abs_err, 1000)
            return -(-math.sqrt(2)*capped_err/sigma - np.log(math.sqrt(2)*sigma))

        # For every training row, search the confidence that minimises the
        # loss, starting from 100.
        results = [
            sp.optimize.minimize(partial(loss_func, row=row), [100], method='SLSQP')['x'][0]
            for _, row in tqdm(train.iterrows(), total=len(train))
        ]

        # Score again with the per-row optimised confidences.
        train['Confidence'] = results
        train['sigma_clipped'] = train['Confidence'].clip(lower=70)
        train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
        train['delta'] = train['diff'].clip(upper=1000)
        train['score'] = -math.sqrt(2)*train['delta']/train['sigma_clipped'] - np.log(math.sqrt(2)*train['sigma_clipped'])
        score = train['score'].mean()
        print(score)

### Predicting Confidence

In [None]:
# Second stage: model the Confidence column written by the previous cell.
TARGET = 'Confidence'

target = train[TARGET]
test[TARGET] = np.nan

# Same feature selection as for the FVC stage, but with a different drop
# list ('Percent' is kept; the FVC columns and the id are removed).
cat_features = ['Sex', 'SmokingStatus']
num_features = [
    c for c in test.columns
    if test.dtypes[c] != 'object' and c not in cat_features
]
drop_features = [ID, TARGET, 'predict_Week', 'base_Week', 'FVC', 'FVC_pred']
features = [c for c in num_features + cat_features if c not in drop_features]

# `clf` is the estimator object left over from the FVC stage; it is refit here.
oof, predictions = run_kfold_model(clf, train, test, folds, features, target, n_fold=N_FOLD)

In [None]:
# Score the out-of-fold confidence predictions: confidence floored at 70,
# absolute FVC error capped at 1000, mean of the per-row scores printed.
train['Confidence'] = oof
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
train['score'] = -math.sqrt(2)*train['delta']/train['sigma_clipped'] - np.log(math.sqrt(2)*train['sigma_clipped'])
score = train['score'].mean()
print(score)

In [None]:
# Store the predicted confidences and turn the Patient_Week index back into
# a regular column.
test = test.assign(Confidence=predictions).reset_index()

### Submission

In [None]:
# Bring the predictions into the row set of the sample submission.
sub = submission[['Patient_Week']].merge(test[['Patient_Week', 'FVC_pred', 'Confidence']], on='Patient_Week')
sub = sub.rename(columns={'FVC_pred': 'FVC'})

# For the weeks whose FVC is given in test.csv, submit the measured value
# with a confidence of 0.1. Iterating over the column values avoids
# label-based `Series[i]` lookups (which rely on a 0..n-1 index), and the
# row mask is computed once per known measurement.
for patient, week, fvc in zip(test_a['Patient'], test_a['Weeks'], test_a['FVC']):
    known = sub['Patient_Week'] == patient + '_' + str(week)
    sub.loc[known, 'FVC'] = fvc
    sub.loc[known, 'Confidence'] = 0.1

sub.to_csv('submission.csv', index=False, float_format='%.1f')