In [None]:
import os
import pandas as pd
import numpy as np
import random
np.random.seed(12)

import matplotlib.pyplot as plt
import glob
import cv2

import time
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, regularizers
from sklearn.model_selection import KFold, StratifiedKFold
import scipy.ndimage as ndimage
from skimage import measure, morphology, segmentation, color
import pydicom
import imageio
from joblib import parallel_backend, Parallel, delayed
import PIL
from tensorflow.keras.utils import Sequence
from tensorflow.keras.applications import ResNet50
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from tensorflow import keras
from tensorflow.keras import layers
from functools import partial
import xgboost as xgb
import scipy as sp


In [None]:
# Root of the competition dataset. The trailing slash is required because every
# path below is built by plain string concatenation.
path = '../input/osic-pulmonary-fibrosis-progression/'

# Per-patient scan folders. No leading slash on the suffix: `path` already ends
# with one, and a second slash would make these differ from path_scans_*.
path_imgs_train = path + 'train/'
path_imgs_test = path + 'test/'

# Per-patient mask folders (presumably pre-computed lung masks; they are only
# referenced by name in this notebook).
path_train_masks = path + 'train_masks_fast_masks/'
path_test_masks = path + 'test_masks_fast_masks/'

# Same locations as path_imgs_*; kept as separate names for the cells that use them.
path_scans_train = path + 'train/'
path_scans_test = path + 'test/'

In [None]:
# Tabular data shipped with the competition.
df_train = pd.read_csv(path + 'train.csv')
df_test = pd.read_csv(path + 'test.csv')

unique_train_patients = df_train.Patient.unique()
unique_test_patients = df_test.Patient.unique()

print(f'1.1 -> There are {unique_train_patients.shape[0]} train unique patients')
print(f'1.2 -> There are {unique_test_patients.shape[0]} test unique patients')

# Top-level entries of the image folders (one entry per patient directory).
train_paths = glob.glob(path_imgs_train + '*')
test_paths = glob.glob(path_imgs_test + '*')

print(f'No. of Train Images : {len(train_paths)}')
print(f'No. of Test Images : {len(test_paths)}')

# patient id -> folder path; the image dictionaries are expanded to file lists below.
dict_train_patients_paths = {patient: path_imgs_train + patient + '/' for patient in unique_train_patients}
dict_test_patients_paths = {patient: path_imgs_test + patient + '/' for patient in unique_test_patients}

dict_train_patients_masks_paths = {patient: path_train_masks + patient + '/' for patient in unique_train_patients}
dict_test_patients_masks_paths = {patient: path_test_masks + patient + '/' for patient in unique_test_patients}

# Replace each patient's folder path with the list of files found inside it.
for folders_by_patient in (dict_train_patients_paths, dict_test_patients_paths):
    for patient in tqdm(folders_by_patient):
        folder = folders_by_patient[patient]
        folders_by_patient[patient] = [folder + file for file in os.listdir(folder)]

In [None]:
def customLossFunction(y_true, y_pred, std=70):
    """Mean Laplace-style penalty between `y_true` and `y_pred` for a given spread.

    The spread `std` is floored at 70 and the absolute error is capped at 1000.
    Each sample contributes sqrt(2) * error / std + log(sqrt(2) * std); the mean
    over all samples is returned as a float32 tensor (lower is better).
    """
    root_two = tf.sqrt(2.)
    sigma = tf.cast(tf.maximum(std, 70), dtype=tf.float32)
    abs_error = tf.abs(y_true - y_pred)
    capped_error = tf.cast(tf.minimum(abs_error, 1_000), dtype=tf.float32)
    per_sample = (capped_error / sigma) * root_two + tf.math.log(root_two * sigma)
    return tf.reduce_mean(per_sample)


def quantileLoss(quantiles, y_true, y_pred):
    """Mean pinball loss of `y_pred` against `y_true` for the given quantile levels."""
    residual = y_true - y_pred
    # For each level q the penalty is max(q * e, (q - 1) * e).
    when_under = quantiles * residual
    when_over = (quantiles - 1) * residual
    return tf.reduce_mean(tf.maximum(when_under, when_over))


def negloglik(y, p_y):
    """Negative log-probability of `y` under `p_y` (any object exposing `log_prob`)."""
    log_likelihood = p_y.log_prob(y)
    return -log_likelihood


def scale(x, mean_, std_):
    """Standardise `x`: subtract `mean_`, then divide by `std_`."""
    centered = x - mean_
    return centered / std_


def unscale(x, mean_, std_):
    """Inverse of `scale`: multiply `x` by `std_`, then add `mean_`."""
    stretched = x * std_
    return stretched + mean_


def normalize(x, min_, max_):
    """Min-max rescale `x` so that `min_` maps to 0 and `max_` maps to 1."""
    offset = x - min_
    span = max_ - min_
    return offset / span


def unormalize(x, min_, max_):
    """Inverse of `normalize`: map a value in [0, 1] back onto [`min_`, `max_`]."""
    span = max_ - min_
    return x * span + min_




In [None]:
# Reload the raw tabular data so the preprocessing below starts from unmodified frames.
df_train = pd.read_csv(path + 'train.csv')
df_test = pd.read_csv(path + 'test.csv')

unique_train_patients = df_train.Patient.unique()
unique_test_patients = df_test.Patient.unique()

print(f'1.1 -> There are {unique_train_patients.shape[0]} train unique patients')
print(f'1.2 -> There are {unique_test_patients.shape[0]} test unique patients')

# patient id -> scan folder path; expanded to the list of files in that folder below.
dict_train_patients_scans_paths = {patient: path_scans_train + patient + '/' for patient in unique_train_patients}
dict_test_patients_scans_paths = {patient: path_scans_test + patient + '/' for patient in unique_test_patients}

for scans_by_patient in (dict_train_patients_scans_paths, dict_test_patients_scans_paths):
    for patient in tqdm(scans_by_patient):
        folder = scans_by_patient[patient]
        scans_by_patient[patient] = [folder + file for file in os.listdir(folder)]
    
# Preprocessing:
# Statement order matters in this section: statistics are computed on the raw
# columns and the columns are then overwritten in place with their scaled values.

# Collapse repeated (Patient, Weeks) rows in train: FVC and Percent are averaged,
# the remaining columns take their maximum. reset_index() turns the group keys back
# into columns.
df_train = df_train.groupby(['Patient', 'Weeks']).agg({
    'FVC': np.mean,
    'Percent': np.mean,
    'Age': np.max,
    'Sex': np.max,
    'SmokingStatus': np.max 
}).reset_index()

# FVC / Percent * 100 (presumably the patient's expected FVC at 100 % — confirm
# against the dataset description).
df_train['FVC_Percent'] = (df_train['FVC'] / df_train['Percent']) * 100
df_test['FVC_Percent'] = (df_test['FVC'] / df_test['Percent']) * 100


# Standardize data
# All statistics come from the train frame only and are applied to both frames.

mean_fvc, std_fvc = df_train.FVC.mean(), df_train.FVC.std()
mean_perc, std_perc = df_train.Percent.mean(), df_train.Percent.std()
mean_age, std_age = df_train.Age.mean(), df_train.Age.std()

df_train['Age'] = df_train['Age'].apply(lambda x: (x-mean_age)/std_age)
df_test['Age'] = df_test['Age'].apply(lambda x: (x-mean_age)/std_age)

df_train['FVC'] = df_train['FVC'].apply(lambda x: (x-mean_fvc)/std_fvc)
df_test['FVC'] = df_test['FVC'].apply(lambda x: (x-mean_fvc)/std_fvc)
# NOTE: FVC_Percent is scaled with the FVC mean/std, not with statistics of its own.
df_train['FVC_Percent'] = df_train['FVC_Percent'].apply(lambda x: (x-mean_fvc)/std_fvc)
df_test['FVC_Percent'] = df_test['FVC_Percent'].apply(lambda x: (x-mean_fvc)/std_fvc)

df_train['Percent'] = df_train['Percent'].apply(lambda x: (x-mean_perc)/std_perc)
df_test['Percent'] = df_test['Percent'].apply(lambda x: (x-mean_perc)/std_perc)

# Mapping categories dictionaries 
# Each *_inv dictionary is the reverse lookup of the one above it.

dict_sex = {'Male': 0, 'Female': 1}
dict_sex_inv = {0: 'Male', 1: 'Female'}

dict_smoke = {'Ex-smoker': 0, 'Never smoked': 1, 'Currently smokes': 2}
dict_smoke_inv = {0: 'Ex-smoker', 1:'Never smoked', 2:'Currently smokes'}

dict_kind_patient = {'decreased': 0, 'regular': 1, 'increased': 2}
dict_kind_patient_inv = {0: 'decreased', 1: 'regular', 2: 'increased'}

# A category missing from the dictionaries raises KeyError here.
df_train.Sex = df_train.Sex.apply(lambda x: dict_sex[x])
df_train.SmokingStatus = df_train.SmokingStatus.apply(lambda x: dict_smoke[x])

df_test.Sex = df_test.Sex.apply(lambda x: dict_sex[x])
df_test.SmokingStatus = df_test.SmokingStatus.apply(lambda x: dict_smoke[x])

# Build WeeksSinceLastVisit feature

# Temporary copy of Weeks, used only to take row-to-row differences.
df_train['ElapsedWeeks'] = df_train['Weeks']
df_test['ElapsedWeeks'] = df_test['Weeks']

# diff() runs over the whole column in row order, so the first row of each patient
# holds the gap to the previous patient's last row; those rows are overwritten
# further down via patient_row == 1.
# NOTE(review): this assumes rows are ordered by Patient then Weeks. That holds for
# df_train after the groupby above; confirm it for df_test.
train_weeks_elapsed = df_train.set_index(['Patient', 'Weeks'])['ElapsedWeeks'].diff().reset_index()
test_weeks_elapsed = df_test.set_index(['Patient', 'Weeks'])['ElapsedWeeks'].diff().reset_index()

df_train = df_train.drop('ElapsedWeeks', axis=1)
df_test = df_test.drop('ElapsedWeeks', axis=1)

# The very first row has no predecessor, so diff() yields NaN there; use 0 instead.
train_weeks_elapsed['ElapsedWeeks'] = train_weeks_elapsed['ElapsedWeeks'].fillna(0).astype(int)
test_weeks_elapsed['ElapsedWeeks'] = test_weeks_elapsed['ElapsedWeeks'].fillna(0).astype(int)

df_train = df_train.merge(train_weeks_elapsed, how='inner', on=['Patient', 'Weeks'])
df_test = df_test.merge(test_weeks_elapsed, how='inner', on=['Patient', 'Weeks'])

# 1-based visit number within each patient, ordered by Weeks. The result is
# assigned back by index label, so the frame itself keeps its row order.
df_train['patient_row'] = df_train.sort_values(['Patient', 'Weeks'], ascending=[True, True]) \
             .groupby(['Patient']) \
             .cumcount() + 1

df_test['patient_row'] = df_test.sort_values(['Patient', 'Weeks'], ascending=[True, True]) \
             .groupby(['Patient']) \
             .cumcount() + 1

# First visit of a patient: use the absolute week; later visits: weeks since the previous row.
df_train['WeeksSinceLastVisit'] = df_train.apply(lambda x: x['Weeks'] if x['patient_row']==1 else x['ElapsedWeeks'], axis=1)
df_test['WeeksSinceLastVisit'] = df_test.apply(lambda x: x['Weeks'] if x['patient_row']==1 else x['ElapsedWeeks'], axis=1)

# Norm Weeks
# mean/std come from the raw train Weeks column and are applied to both Weeks and
# WeeksSinceLastVisit, in both frames.

mean_weeks, std_weeks = df_train.Weeks.mean(), df_train.Weeks.std()

df_train['WeeksSinceLastVisit'] = df_train['WeeksSinceLastVisit'].apply(lambda x: (x-mean_weeks)/std_weeks)
df_test['WeeksSinceLastVisit'] = df_test['WeeksSinceLastVisit'].apply(lambda x: (x-mean_weeks)/std_weeks)


df_train['Weeks'] = df_train['Weeks'].apply(lambda x: (x-mean_weeks)/std_weeks)
df_test['Weeks'] = df_test['Weeks'].apply(lambda x: (x-mean_weeks)/std_weeks)

# Ini dictionaries

columns = ['FVC', 'Age', 'Sex', 'SmokingStatus', 'WeeksSinceLastVisit', 'Percent']
dict_patients_train_ini_features, dict_patients_test_ini_features = {}, {}
dict_patients_train_kind_patient, dict_patients_test_kind_patient = {}, {}
df_train_patients, df_test_patients = df_train.set_index('Patient'), df_test.set_index('Patient')


def _classify_patient_kind(fvc_scaled, threshold=100):
    """Label an FVC trajectory as 'regular', 'increased' or 'decreased'.

    `fvc_scaled` holds one patient's standardised FVC values in row order. They are
    mapped back to raw units with `unscale`. A standard deviation of at most
    `threshold` gives 'regular'; otherwise the label depends on whether the last
    value is greater than the first.
    """
    fvc = unscale(np.asarray(fvc_scaled, dtype=float), mean_fvc, std_fvc)
    if np.std(fvc) <= threshold:
        return 'regular'
    return 'increased' if fvc[-1] > fvc[0] else 'decreased'


def _initial_features_and_kind(df_patients, patient):
    """Return (first-row feature dict incl. encoded 'kind', kind label) for `patient`.

    `df_patients` must be indexed by patient id; the rows are taken from that frame
    only, so test patients are described by test rows.
    """
    rows = df_patients[df_patients.index == patient]
    ini_features = rows[columns].to_dict('records')[0]
    kind = _classify_patient_kind(rows['FVC'].values)
    ini_features['kind'] = dict_kind_patient[kind]
    return ini_features, kind


for patient in unique_train_patients:
    ini_features, kind = _initial_features_and_kind(df_train_patients, patient)
    dict_patients_train_ini_features[patient] = ini_features
    dict_patients_train_kind_patient[patient] = kind


# Fix: the kind of a test patient was previously computed from df_train_patients,
# which used training rows (or no rows at all) instead of the patient's test rows.
for patient in unique_test_patients:
    ini_features, kind = _initial_features_and_kind(df_test_patients, patient)
    dict_patients_test_ini_features[patient] = ini_features
    dict_patients_test_kind_patient[patient] = kind

# Decoder inputs
# For every train patient, keep each series from the second row onwards
# (the first row is skipped by the [1:] slice).

dict_train_sequence_fvc = {
    patient: list(df_train_patients['FVC'].loc[patient].values[1:])
    for patient in unique_train_patients
}
dict_train_sequence_weekssincelastvisit = {
    patient: list(df_train_patients['WeeksSinceLastVisit'].loc[patient].values[1:])
    for patient in unique_train_patients
}
dict_train_sequence_cumweeks = {
    patient: list(df_train_patients['Weeks'].loc[patient].values[1:])
    for patient in unique_train_patients
}

In [None]:
def buildDataSet(list_patients, dict_ini_features, dict_seq_weeks, dict_seq_cumweeks, 
                 training=True, predictions=None, dict_seq_fvc=None):
    """Build a flat table with one row per follow-up visit of each patient.

    Parameters
    ----------
    list_patients : iterable of patient ids to include, in output order.
    dict_ini_features : patient id -> dict with keys 'Percent', 'Age', 'Sex',
        'WeeksSinceLastVisit', 'FVC', 'SmokingStatus' (and 'kind' when training).
    dict_seq_weeks : patient id -> sequence; its length is the number of rows
        emitted for that patient.
    dict_seq_cumweeks : patient id -> sequence copied into
        'Weeks_Elapsed_since_firstVisit'; must have the same length as
        dict_seq_weeks[patient].
    training : when True, add the 'fvc_real' and 'kind' columns.
    predictions : unused; kept for call-site compatibility.
    dict_seq_fvc : patient id -> target sequence for 'fvc_real'. Defaults to the
        module-level `dict_train_sequence_fvc`, which this function previously read
        implicitly.

    Returns
    -------
    pandas.DataFrame whose column order follows the dictionary built below.
    """
    if training and dict_seq_fvc is None:
        dict_seq_fvc = dict_train_sequence_fvc

    dict_to_tree = {
        'Patient' : [],
        'Weeks_Elapsed_since_firstVisit': [],
        'Base_Percent' : [],
        'Age' : [],
        'Sex' : [],
        'Base_Week' : [],
        'Base_FVC' : [],
        'Curr_Smokes' : [],
        'Ex_Smoker' : [],
        'Never_Smoked' : []
    }

    if training:
        dict_to_tree['fvc_real'] = []
        dict_to_tree['kind'] = []

    for patient in tqdm(list_patients, position=0):
        ini = dict_ini_features[patient]
        n_rows = len(dict_seq_weeks[patient])
        smoking = ini['SmokingStatus']

        dict_to_tree['Weeks_Elapsed_since_firstVisit'].extend(dict_seq_cumweeks[patient])

        # Baseline features are constant per patient, so repeat them for every row.
        dict_to_tree['Patient'].extend([patient] * n_rows)
        dict_to_tree['Base_Percent'].extend([ini['Percent']] * n_rows)
        dict_to_tree['Age'].extend([ini['Age']] * n_rows)
        dict_to_tree['Sex'].extend([ini['Sex']] * n_rows)
        dict_to_tree['Base_Week'].extend([ini['WeeksSinceLastVisit']] * n_rows)
        dict_to_tree['Base_FVC'].extend([ini['FVC']] * n_rows)

        # One-hot encoding of the integer smoking code.
        dict_to_tree['Curr_Smokes'].extend([1 if smoking == 2 else 0] * n_rows)
        dict_to_tree['Ex_Smoker'].extend([1 if smoking == 0 else 0] * n_rows)
        dict_to_tree['Never_Smoked'].extend([1 if smoking == 1 else 0] * n_rows)

        if training:
            dict_to_tree['kind'].extend([ini['kind']] * n_rows)
            dict_to_tree['fvc_real'].extend(dict_seq_fvc[patient])

    df_tree = pd.DataFrame.from_dict(dict_to_tree, orient='columns')

    return df_tree


def buildTrainModel(dict_params, features, df_train, df_val, epochs, verbose_eval=10):
    """Train an XGBoost booster that predicts the 'Confidence' column.

    Parameters
    ----------
    dict_params : parameter dictionary passed to `xgb.train`.
    features : list of feature column names present in both frames.
    df_train, df_val : frames holding `features` and a 'Confidence' target.
    epochs : maximum number of boosting rounds.
    verbose_eval : evaluation logging period passed to `xgb.train`.

    Returns
    -------
    The trained booster. Training stops early after 100 rounds without
    improvement on the last evaluation set ('valid').
    """
    # Build each DMatrix once and reuse it for both training and evaluation.
    dtrain = xgb.DMatrix(df_train[features], df_train['Confidence'])
    dvalid = xgb.DMatrix(df_val[features], df_val['Confidence'])

    xgb_model = xgb.train(
        params=dict_params,
        dtrain=dtrain,
        num_boost_round=epochs,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        verbose_eval=verbose_eval,
        early_stopping_rounds=100,
    )

    return xgb_model


def lossFuncWeights(weight, row):
    """Penalty for using `weight` as the spread of one prediction.

    `row` must provide 'fvc_real' and 'fvc_pred'. The spread is floored at 70 and
    the absolute error capped at 1000; the result is
    sqrt(2) * error / spread + log(sqrt(2) * spread), so lower is better.
    """
    root_two = np.sqrt(2)
    sigma = max(weight, 70)
    capped_error = min(np.abs(row['fvc_real'] - row['fvc_pred']), 1000)
    error_term = root_two * capped_error / sigma
    return error_term + np.log(root_two * sigma)


def getConfidenceWeights(df):
    """Fit, row by row, the spread that minimises `lossFuncWeights`.

    Each row of `df` (needs 'fvc_real' and 'fvc_pred') is optimised independently
    with SLSQP starting from 100. Returns a list with one fitted value per row,
    in row order.
    """
    fitted = []
    for _, record in tqdm(df.iterrows(), total=len(df), position=0):
        objective = partial(lossFuncWeights, row=record)
        solution = sp.optimize.minimize(objective, [100], method='SLSQP')
        fitted.append(solution['x'][0])

    return fitted

In [None]:
def mloss(_lambda):
    """Return a Keras loss mixing the quantile loss and the Laplace-style loss.

    Targets and predictions are first mapped back to raw FVC units. `_lambda`
    weights the pinball loss at quantiles 0.2 / 0.5 / 0.8 and `1 - _lambda`
    weights `customLossFunction` (called with its default spread).
    """
    def loss(y_true, y_pred):
        fvc_true = unscale(y_true, mean_fvc, std_fvc)
        fvc_pred = unscale(y_pred, mean_fvc, std_fvc)
        pinball = quantileLoss(tf.constant([0.2, 0.5, 0.8]), fvc_true, fvc_pred)
        laplace = customLossFunction(fvc_true, fvc_pred)
        return _lambda * pinball + (1 - _lambda) * laplace
    return loss

def buildModel(num_inputs, lambda_factor):
    """Build and compile the dense network that emits three FVC quantiles.

    Parameters
    ----------
    num_inputs : number of input features.
    lambda_factor : weight of the quantile term in the loss (see `mloss`).

    Returns
    -------
    A compiled Keras model mapping (batch, num_inputs) to (batch, 3), trained with
    Adam (learning rate 1e-3, value clipping at 10) and tracking MAE.
    """
    z = layers.Input((num_inputs,), name="Patient")
    x = layers.Dense(64, activation="relu", name="d1")(z)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(32, activation="relu",  name="d2")(x)
    x = layers.Dropout(0.2)(x)
    p1 = layers.Dense(3, activation="linear", name="p1")(x)
    p2 = layers.Dense(3, activation="relu", name="p2")(x)
    # p1 plus a cumulative sum of non-negative increments would give outputs that
    # never decrease from column 0 to column 2.
    preds = layers.Lambda(lambda t: t[0] + tf.cumsum(t[1], axis=1), 
                     name="preds")([p1, p2])

    # NOTE(review): the model outputs `p1`, so `p2` / `preds` are not part of the
    # trained graph and the three outputs are not forced to be ordered. Confirm
    # whether `preds` was the intended output before changing it, since that
    # alters every trained model.
    model = models.Model(z, p1, name="CNN")
    model_loss = mloss(lambda_factor)
    # `learning_rate` replaces the deprecated `lr` keyword. `epsilon=None` used to
    # fall back to the Keras backend epsilon, which is now passed explicitly
    # because newer optimizers do not accept None.
    model.compile(loss=model_loss, 
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
                                                     epsilon=tf.keras.backend.epsilon(),
                                                     amsgrad=False, clipvalue=10), 
                  metrics=['mae'])
    return model

In [None]:
# XGBoost parameters for the model that regresses the per-row 'Confidence'
# target (passed as `dict_params` to buildTrainModel).
xgb_inputs = {
    'objective': 'reg:squarederror',  # squared-error regression
    'eta': 0.01,  # learning rate (shrinkage per boosting round)
    'max_depth': 8,
    'subsample': 0.8,  # fraction of rows sampled per tree
    'colsample_bytree': 0.9,  # fraction of columns sampled per tree
    'gamma': 0.4,  # minimum loss reduction required to split a node
    'booster' : 'gbtree',
    'eval_metric': 'rmse', 
    'seed': 20 
}


In [None]:
# Patient-level 7-fold cross-validation. The stratification labels are all zero,
# so the folds are not stratified on any target. Each fold:
#   1. trains the quantile network on the train patients,
#   2. fits a per-row optimal spread ("confidence") on train+val rows,
#   3. trains an XGBoost model to predict that spread and scores it on the val rows.
skf = StratifiedKFold(n_splits = 7, random_state = 12, shuffle = True)
list_models, list_history, list_final_metric = [], [], []

for num_fold, (train_index, val_index) in enumerate(skf.split(unique_train_patients, 
                                                              np.zeros(unique_train_patients.shape[0]))):

    x_train_patients = list(unique_train_patients[train_index])
    x_val_patients = list(unique_train_patients[val_index])
    
    print(f'Num Fold: {num_fold + 1}')
    print(f'Train patients: {len(x_train_patients)}, Test patients: {len(x_val_patients)}')  

    df_train_weights = buildDataSet(x_train_patients, 
                 dict_ini_features=dict_patients_train_ini_features, 
                 dict_seq_weeks=dict_train_sequence_weekssincelastvisit, 
                 dict_seq_cumweeks=dict_train_sequence_cumweeks, 
                 training=True, 
                 predictions=None)

    df_val_weights = buildDataSet(x_val_patients,
                 dict_ini_features=dict_patients_train_ini_features, 
                 dict_seq_weeks=dict_train_sequence_weekssincelastvisit, 
                 dict_seq_cumweeks=dict_train_sequence_cumweeks, 
                 training=True, 
                 predictions=None)
                                               
    # Model inputs: every column except the id, the target and the trajectory label.
    features = list(col for col in df_train_weights.columns if col not in ['Patient', 'fvc_real', 'kind'])   
    y_train = df_train_weights['fvc_real'].astype(float)
    y_val = df_val_weights['fvc_real'].astype(float)

    X_train =  df_train_weights[features]
    X_val =  df_val_weights[features]

    model_weights = buildModel(len(features), lambda_factor=0.8)

    model_weights.fit(X_train, y_train, shuffle=True, batch_size=16, epochs=40, 
                validation_data=(X_val, y_val), verbose=0)
    
    list_models.append(model_weights)

    # Output columns follow the quantile order of the loss: 0.2, 0.5 (median), 0.8.
    y_val_pred = model_weights.predict(X_val)
    y_val_pred_median = unscale(y_val_pred[:, 1], mean_fvc, std_fvc)
    y_val_pred_std = unscale(y_val_pred[:, 2], mean_fvc, std_fvc) - unscale(y_val_pred[:, 0], mean_fvc, std_fvc)

    
    metric = customLossFunction(unscale(y_val, mean_fvc, std_fvc),
                                      y_val_pred_median,
                                     y_val_pred_std).numpy()
    
    list_history.append({'metric' : metric})
    print(f'Metric base model: {metric}')
    
    ### Confidence ###
    
    # ignore_index gives the combined frame unique row labels; .copy() makes the
    # column subset an independent frame so the assignments below do not trigger
    # SettingWithCopyWarning.
    df_all_weights = pd.concat([df_train_weights, df_val_weights], axis=0, ignore_index=True)
    df_all_weights = df_all_weights[features + ['fvc_real', 'Patient']].copy()
    
    predictions = model_weights.predict(df_all_weights[features])
    df_all_weights['fvc_real'] = unscale(df_all_weights['fvc_real'], mean_fvc, std_fvc)
    df_all_weights['fvc_pred'] = unscale(predictions[:, 1], mean_fvc, std_fvc)
    df_all_weights['Confidence'] = unscale(predictions[:, 2], mean_fvc, std_fvc) - unscale(predictions[:, 0], mean_fvc, std_fvc)
    df_all_weights['sigma_clipped'] = df_all_weights['Confidence'].apply(lambda x: max(x, 70))
    df_all_weights['diff'] = np.abs(df_all_weights['fvc_real'] - df_all_weights['fvc_pred'])
    df_all_weights['delta'] = df_all_weights['diff'].apply(lambda x: min(x, 1_000))
    df_all_weights['score'] = -np.sqrt(2)*df_all_weights['delta']/df_all_weights['sigma_clipped'] - np.log(np.sqrt(2)*df_all_weights['sigma_clipped'])
    
    score = customLossFunction(df_all_weights['fvc_real'],
                                 df_all_weights['fvc_pred'],
                                 df_all_weights['Confidence']).numpy()
    print(f'Metric train+val, before confidence weights: {score}') 
    
    # Replace the network's spread with the per-row optimum; this becomes the
    # regression target of the XGBoost model below.
    confidence_weights = getConfidenceWeights(df_all_weights)
    
    df_all_weights['Confidence'] = confidence_weights
    df_all_weights['sigma_clipped'] = df_all_weights['Confidence'].apply(lambda x: max(x, 70))
    df_all_weights['diff'] = np.abs(df_all_weights['fvc_real'] - df_all_weights['fvc_pred'])
    df_all_weights['delta'] = df_all_weights['diff'].apply(lambda x: min(x, 1_000))
    df_all_weights['score'] = -np.sqrt(2)*df_all_weights['delta']/df_all_weights['sigma_clipped'] - np.log(np.sqrt(2)*df_all_weights['sigma_clipped'])
    score = customLossFunction(df_all_weights['fvc_real'],
                                 df_all_weights['fvc_pred'],
                                 df_all_weights['Confidence']).numpy()
    print(f'Metric train+val, confidence weights: {score}') 
    
    # xgboost
    
    df_tmp_train = df_all_weights[df_all_weights['Patient'].isin(x_train_patients)]
    df_tmp_val = df_all_weights[df_all_weights['Patient'].isin(x_val_patients)]
    
    xgb_model = buildTrainModel(xgb_inputs, features, \
                                df_train=df_tmp_train, df_val=df_tmp_val, epochs=900, verbose_eval=50)
    
    pred_confidence = xgb_model.predict(xgb.DMatrix(df_tmp_val[features]))
    # .numpy() keeps this a plain float like the other fold metrics instead of a tensor.
    final_metric = customLossFunction(y_true=df_tmp_val['fvc_real'],
                             y_pred=df_tmp_val['fvc_pred'],
                             std=pred_confidence).numpy()
    
    print('***'*20)
    print(f'Validation Weights predicted: {final_metric}')
    print('***'*20)
    list_final_metric.append(final_metric)
    

In [None]:
# Average of the per-fold base-model metric collected during cross-validation.
fold_metrics = [fold_record['metric'] for fold_record in list_history]
val_metric = np.mean(fold_metrics)

print(val_metric)

In [None]:
# Score the averaged fold models on every training patient.
df_train_confidence = buildDataSet(unique_train_patients,
                                 dict_ini_features=dict_patients_train_ini_features, 
                                 dict_seq_weeks=dict_train_sequence_weekssincelastvisit, 
                                 dict_seq_cumweeks=dict_train_sequence_cumweeks, 
                                 training=True, 
                                 predictions=None)

# Element-wise mean of the three quantile outputs over all fold models.
predictions = np.mean([model.predict(df_train_confidence[features]) for model in list_models], axis=0)
df_train_confidence['fvc_real'] = unscale(df_train_confidence['fvc_real'], mean_fvc, std_fvc)
# Fix: the point prediction is the median (column 1), as in the cross-validation
# loop; column 0 is the 0.2 quantile and was used here by mistake.
df_train_confidence['fvc_pred'] = unscale(predictions[:, 1], mean_fvc, std_fvc)
# Spread = 0.8 quantile minus 0.2 quantile, in raw FVC units.
df_train_confidence['Confidence'] = unscale(predictions[:, 2], mean_fvc, std_fvc) - unscale(predictions[:, 0], mean_fvc, std_fvc)
df_train_confidence['sigma_clipped'] = df_train_confidence['Confidence'].apply(lambda x: max(x, 70))
df_train_confidence['diff'] = np.abs(df_train_confidence['fvc_real'] - df_train_confidence['fvc_pred'])
df_train_confidence['delta'] = df_train_confidence['diff'].apply(lambda x: min(x, 1_000))
# Per-row score in "higher is better" form (the negative of the loss).
df_train_confidence['score'] = -np.sqrt(2)*df_train_confidence['delta']/df_train_confidence['sigma_clipped'] - np.log(np.sqrt(2)*df_train_confidence['sigma_clipped'])
score = df_train_confidence['score'].mean()
print(score)

In [None]:
import scipy as sp

def loss_func(weight, row):
    """Penalty for using `weight` as the spread of one prediction.

    `row` must provide 'fvc_real' and 'fvc_pred'. The spread is floored at 70 and
    the absolute error capped at 1000; lower return values are better.
    """
    root_two = np.sqrt(2)
    sigma = max(weight, 70)
    capped_error = min(abs(row['fvc_real'] - row['fvc_pred']), 1000)
    error_term = root_two * capped_error / sigma
    return error_term + np.log(root_two * sigma)

# Per-row optimal spread for the ensemble predictions. This is the same SLSQP
# search as in getConfidenceWeights, so reuse that helper instead of repeating
# its body here.
results = getConfidenceWeights(df_train_confidence)

In [None]:


def seed_everything(seed=2020):
    """Seed the Python, NumPy and TensorFlow RNGs and set PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

seed_everything(509)



In [None]:
# Dataset root for this part of the notebook (no trailing slash).
ROOT = "../input/osic-pulmonary-fibrosis-progression"
tr = pd.read_csv(ROOT + "/train.csv")
tr.head()


In [None]:
# Shape of the array of distinct patient ids in the training table.
tr["Patient"].unique().shape

In [None]:


# keep=False removes every row of a (Patient, Weeks) pair that occurs more than once.
tr.drop_duplicates(subset=['Patient', 'Weeks'], keep=False, inplace=True)
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
# Patient_Week is split on '_': the first piece is the patient id, the last piece
# is the week number.
sub['Patient'] = sub['Patient_Week'].map(lambda key: key.partition('_')[0])
sub['Weeks'] = sub['Patient_Week'].map(lambda key: int(key.rpartition('_')[2]))
submission_columns = ['Patient', 'Weeks', 'Confidence', 'Patient_Week']
# Attach each patient's test-row columns (without Weeks) to every submission row.
sub = sub[submission_columns].merge(chunk.drop(columns='Weeks'), on="Patient")



In [None]:
# Preview the submission rows after the test columns were merged in.
sub.head()

In [None]:
# Tag each frame with its origin, then stack them into one table.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same
# way (original row labels kept, columns not sorted).
data = pd.concat([tr, chunk, sub])

In [None]:
# Row/column counts and number of distinct patients of each frame.
frames = (tr, chunk, sub, data)
print(*(frame.shape for frame in frames))
print(*(frame.Patient.nunique() for frame in frames))


In [None]:
# Earliest week per patient, taken over non-'test' rows only.
is_test_row = data['WHERE'] == 'test'
data['min_week'] = data['Weeks']
data.loc[is_test_row, 'min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
# Rows recorded at each patient's earliest week.
base = data.loc[data['Weeks'].eq(data['min_week'])]

In [None]:


# Keep one row per patient (the first encountered) holding the baseline FVC.
base = base[['Patient', 'FVC']].rename(columns={'FVC': 'min_FVC'})
occurrence = base.assign(nb=1).groupby('Patient')['nb'].cumsum()
base = base[occurrence == 1]



In [None]:
# Re-score the training rows using the per-row fitted spreads in `results`.
root_two = np.sqrt(2)
df_train_confidence['Confidence'] = results
df_train_confidence['sigma_clipped'] = df_train_confidence['Confidence'].map(lambda sigma: max(sigma, 70))
df_train_confidence['diff'] = (df_train_confidence['fvc_real'] - df_train_confidence['fvc_pred']).abs()
df_train_confidence['delta'] = df_train_confidence['diff'].map(lambda err: min(err, 1_000))
df_train_confidence['score'] = (
    -root_two * df_train_confidence['delta'] / df_train_confidence['sigma_clipped']
    - np.log(root_two * df_train_confidence['sigma_clipped'])
)
score = df_train_confidence['score'].mean()
print(score)