## <u>Introduction</u>
* Data preprocessing is based on this kernel: https://www.kaggle.com/yasufuminakama/osic-ridge-baseline/output

In [None]:
import os
import numpy as np
import pandas as pd
import math 
import random
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import torch
import scipy as sp
import torch.nn as nn

from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
from functools import partial
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from torch.autograd import Variable

import warnings
warnings.filterwarnings('ignore')

## <u>Utils</u>

In [None]:
def get_logger(filename='log'):
    """Return this module's logger, writing to the console and ``<filename>.log``.

    A handler is attached only when an equivalent one is not already present
    on the logger, so calling this function again (for example by re-running
    the notebook cell) does not make every message appear several times.

    Args:
        filename: Path of the log file, without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` named after this module, with level ``INFO``.
    """
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    log_path = os.path.abspath(f"{filename}.log")
    has_console = False
    has_file = False
    for handler in logger.handlers:
        # FileHandler subclasses StreamHandler, so it has to be tested first.
        if isinstance(handler, FileHandler):
            # FileHandler keeps the absolute path of the file it writes to.
            has_file = has_file or handler.baseFilename == log_path
        elif isinstance(handler, StreamHandler):
            has_console = True

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

# Notebook-wide logger (default file name "log" -> "log.log"); the training
# helpers further down report fold and CV scores through it.
logger = get_logger()

def seed_everything(seed=777):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

## <u>Config</u>

In [None]:
# NOTE(review): OUTPUT_DICT is not referenced anywhere else in the visible
# code; the name suggests an output directory -- confirm before relying on it.
OUTPUT_DICT = './'

ID = 'Patient_Week'  # name of the row-identifier column
TARGET = 'FVC'  # column predicted by the first model (rebound later for the confidence model)
SEED = 42
seed_everything(seed=SEED)  # seed all RNGs before any data or model work

N_FOLD = 4  # number of cross-validation folds

## <u>Data Loading</u>

In [None]:
# Raw training measurements; the last expression shows the first rows in the notebook.
train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
train.head(10)

In [None]:
# add a new `Patient_Week` column of the form "<Patient>_<Weeks>"
patient_str = train['Patient'].astype(str)
weeks_str = train['Weeks'].astype(str)
train[ID] = patient_str.str.cat(weeks_str, sep='_')
print(train.shape)
train.head()

In [None]:
# construct train input
# For every patient, each visit in turn acts as the "base" measurement and is
# paired with all of that patient's visits (the weeks to predict). Pairs of a
# visit with itself (Week_passed == 0) are removed at the end.
rename_cols = {
    'Weeks': 'base_Week', 'FVC': 'base_FVC',
    'Percent': 'base_Percent', 'Age': 'base_Age'
}
drop_cols = [
    'Age', 'Sex', 'SmokingStatus', 'Percent'
]
# Collect the per-visit frames and concatenate once: growing a DataFrame with
# pd.concat inside the loops re-copies all accumulated rows on every iteration.
pair_frames = []
gb = train.groupby('Patient')
tk0 = tqdm(gb, total=len(gb))
for _, usr_df in tk0:
    # Prediction side of the pair; it is the same for every base visit of this patient.
    predict_side = usr_df.drop(columns=drop_cols).rename(columns={
        'Weeks': 'predict_Week'
    })
    for week, tmp in usr_df.groupby('Weeks'):
        tmp = tmp.drop(columns='Patient_Week').rename(columns=rename_cols)
        _usr_output = predict_side.merge(tmp, on='Patient')
        _usr_output['Week_passed'] = _usr_output['predict_Week'] - _usr_output['base_Week']
        pair_frames.append(_usr_output)
output = pd.concat(pair_frames)

train = output[output['Week_passed']!=0].reset_index(drop=True)
print(train.shape)
train.head()

In [None]:
# construct test input: rename the measurement columns to the `base_*` names
# used for the training pairs
base_renames = {
    'Weeks': 'base_Week',
    'FVC': 'base_FVC',
    'Percent': 'base_Percent',
    'Age': 'base_Age',
}
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
test = test.rename(columns=base_renames)
test.head(10)

In [None]:
# One test row per requested `Patient_Week`: split the id into patient and
# week to predict, then attach that patient's base measurement.
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
id_parts = submission['Patient_Week'].apply(lambda pw: pw.split('_'))
submission['Patient'] = id_parts.apply(lambda parts: parts[0])
submission['predict_Week'] = id_parts.apply(lambda parts: parts[1]).astype(int)
requested = submission.drop(columns=['FVC', 'Confidence'])
test = requested.merge(test, on='Patient')
test['Week_passed'] = test['predict_Week'] - test['base_Week']
print(test.shape)
test.head()

In [None]:
# Re-read the sample submission so `submission` no longer carries the helper
# columns (`Patient`, `predict_Week`) added while building `test`.
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
print(submission.shape)
submission.head()

## <u>Prepare Folds</u>

In [None]:
# Assign each training row a validation fold number, grouping by patient so
# GroupKFold keeps a patient's rows together.
folds = train[[ID, 'Patient', TARGET]].copy()

Fold = GroupKFold(n_splits=N_FOLD)
groups = folds['Patient'].values
# `split` yields positional row numbers, so fill a plain array by position
# rather than using them as `.loc` labels; this also keeps the column integer
# throughout instead of passing through a float/NaN state.
fold_ids = np.full(len(folds), -1, dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[TARGET], groups)):
    fold_ids[val_index] = n
folds['fold'] = fold_ids
print(folds.shape)
folds.head()

## <u>Model</u>


In [None]:
def return_data(train_df, test_df, features, target, folds, fold_num):
    """Split the training data for one fold and return it as float tensors.

    Rows whose ``folds.fold`` equals ``fold_num`` form the validation part and
    all other rows form the training part. The index values of ``folds`` are
    used as row positions (``iloc``), so ``folds``, ``train_df`` and ``target``
    are expected to share a default 0..n-1 index.

    Args:
        train_df: DataFrame holding the feature columns.
        test_df: Unused; kept so existing callers keep working.
        features: List of feature column names.
        target: Series with the target values, aligned with ``train_df``.
        folds: DataFrame with a ``fold`` column, aligned with ``train_df``.
        fold_num: Fold to hold out for validation.

    Returns:
        ``(x_train, y_train, x_val, y_val, val_idx)`` where the ``x_*`` tensors
        are transposed to ``(n_features, n_rows)``, the ``y_*`` tensors are
        built from ``target`` and ``val_idx`` is the index of the validation
        rows taken from ``folds``.
    """
    trn_idx = folds[folds.fold != fold_num].index
    val_idx = folds[folds.fold == fold_num].index

    def to_tensor(values):
        # Transposed so features run along the first axis, as the caller's
        # `A.mm(x)` expects; `.t()` leaves 1-D targets unchanged.
        return torch.tensor(values, dtype=torch.float).t()

    x_train = to_tensor(train_df.iloc[trn_idx][features].values)
    y_train = to_tensor(target.iloc[trn_idx].values)
    x_val = to_tensor(train_df.iloc[val_idx][features].values)
    y_val = to_tensor(target.iloc[val_idx].values)

    return x_train, y_train, x_val, y_val, val_idx

def run_single_linear_nn(train_df, test_df, folds, features,
                         target, fold_num, n_epochs=20000, lr=0.1):
    """Fit a single linear layer on one fold and predict validation and test rows.

    The model is ``y = A @ x + b`` with one weight per feature, trained
    full-batch with Adam on the sum of squared errors.

    Args:
        train_df: Training DataFrame containing the ``features`` columns.
        test_df: Test DataFrame containing the ``features`` columns.
        folds: DataFrame with a ``fold`` column aligned with ``train_df``.
        features: List of feature column names.
        target: Series of target values aligned with ``train_df``.
        fold_num: Fold held out for validation.
        n_epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.

    Returns:
        ``(oof, predictions)``: ``oof`` has one entry per training row and is
        zero except at this fold's validation rows; ``predictions`` has one
        entry per test row.
    """
    # Use the frames passed in, not the notebook-level `train` / `test` globals.
    x_train, y_train, x_val, y_val, val_idx = return_data(train_df, test_df, features,
                                                          target, folds, fold_num)

    print(x_train.shape, y_train.shape)

    # One weight per feature, derived from the feature list instead of a fixed 6.
    n_features = len(features)

    A = torch.randn((1, n_features), requires_grad=True)
    b = torch.randn(1, requires_grad=True)

    def model(x_input):
        # x_input is (n_features, n_rows); the result is (1, n_rows).
        return A.mm(x_input) + b

    def loss(y_predicted, y_target):
        # Sum (not mean) of squared errors.
        return ((y_predicted - y_target)**2).sum()

    optimizer = torch.optim.Adam([A, b], lr=lr)

    for _ in range(n_epochs):
        optimizer.zero_grad()
        current_loss = loss(model(x_train), y_train)
        current_loss.backward()
        optimizer.step()

    x_test = torch.tensor(test_df[features].values, dtype=torch.float).t()

    oof = np.zeros(len(train_df))
    with torch.no_grad():
        # Flatten the (1, n_rows) outputs explicitly before storing them.
        oof[val_idx] = model(x_val).flatten().numpy()
        predictions = model(x_test).flatten().numpy().astype(np.float64)

    # Positional lookup, consistent with how return_data selects the rows.
    fold_rmse = np.sqrt(mean_squared_error(target.iloc[val_idx], oof[val_idx]))
    logger.info(f"fold {fold_num} score: {fold_rmse:<8.5f}")

    return oof, predictions

def run_kfold_linear_nn(train, test, folds, features, target, n_fold=5):
    """Run the linear model on folds ``0 .. n_fold-1`` and combine the results.

    Args:
        train: Training DataFrame containing the ``features`` columns.
        test: Test DataFrame containing the ``features`` columns.
        folds: DataFrame with a ``fold`` column aligned with ``train``.
        features: List of feature column names.
        target: Series of target values aligned with ``train``.
        n_fold: Number of folds to run.

    Returns:
        ``(oof, predictions)``: the sum of the per-fold out-of-fold arrays and
        the mean of the per-fold test predictions.
    """
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))

    for fold_ in range(n_fold):
        logger.info(f"fold {fold_}")
        _oof, _predictions = run_single_linear_nn(
            train, test, folds, features, target, fold_num=fold_
        )
        # Each fold's array is non-zero only at its own validation rows.
        oof += _oof
        predictions += _predictions / n_fold

    logger.info(f"CV score: {np.sqrt(mean_squared_error(target, oof)):<8.5f}")

    return oof, predictions

## <u>Predict FVC</u>

In [None]:
# Train the FVC model; `target` is captured before `train` is re-encoded below.
target = train[TARGET]
test[TARGET] = np.nan

# features: every non-object column of `test` plus the categorical columns,
# minus identifiers, the target and the raw week columns
cat_features = ['Sex', 'SmokingStatus']
drop_features = [ID, TARGET, 'predict_Week', 'base_Week']
num_features = [
    c for c in test.columns
    if test.dtypes[c] != 'object' and c not in cat_features
]
features = [c for c in num_features + cat_features if c not in drop_features]

if cat_features:
    # Ordinal-encode the categorical columns, fitted on the training frame only.
    ce_oe = ce.OrdinalEncoder(cols=cat_features, handle_unknown='impute')
    ce_oe.fit(train)
    train, test = ce_oe.transform(train), ce_oe.transform(test)

oof, predictions = run_kfold_linear_nn(
    train, test, folds, features, target, n_fold=N_FOLD
)

In [None]:
# Keep the out-of-fold predictions on train and the fold-averaged predictions on test.
train['FVC_pred'] = oof
test['FVC_pred'] = predictions

## <u>Make Confidence Labels</u>

In [None]:
# baseline score: the metric with a constant Confidence of 100 for every row
train['Confidence'] = 100
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
error_term = -math.sqrt(2) * train['delta'] / train['sigma_clipped']
width_term = np.log(math.sqrt(2) * train['sigma_clipped'])
train['score'] = error_term - width_term
score = train['score'].mean()
print(score)

In [None]:
# Inspect the helper columns added for the baseline score.
train.head(10)

In [None]:
def loss_func(weight, row):
    """Return the negated metric of one row for a candidate confidence.

    The metric is ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)`` with
    ``sigma = max(confidence, 70)`` and ``delta = min(|FVC - FVC_pred|, 1000)``.
    It is negated so that a minimiser maximises the metric.

    Args:
        weight: Candidate confidence, either a scalar or a one-element
            sequence/array (the form ``scipy.optimize.minimize`` passes in).
        row: Mapping with ``'FVC'`` and ``'FVC_pred'`` entries.

    Returns:
        The negated score as a plain ``float``.

    Raises:
        ValueError: If ``weight`` holds more than one element.
    """
    # Unwrap to a real scalar so the objective returns a scalar, not a
    # one-element array; `.item()` rejects anything that is not size 1.
    confidence = np.asarray(weight, dtype=float).item()
    sigma_clipped = max(confidence, 70)
    diff = abs(row['FVC'] - row['FVC_pred'])
    delta = min(diff, 1000)
    score = -math.sqrt(2)*delta/sigma_clipped - np.log(math.sqrt(2)*sigma_clipped)
    return float(-score)

# For every training row, search for the confidence that minimises
# `loss_func` given that row's out-of-fold FVC error.
# Import the submodule explicitly: `import scipy as sp` alone does not
# guarantee that `sp.optimize` has been loaded.
from scipy import optimize

results = []
weight = [100]  # starting point of every search
tk0 = tqdm(train.iterrows(), total=len(train))
for _, row in tk0:
    loss_partial = partial(loss_func, row=row)
    result = optimize.minimize(loss_partial, weight, method='SLSQP')
    results.append(result['x'][0])

In [None]:
# optimized score: the metric when each row uses its individually optimised Confidence
train['Confidence'] = results
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
error_term = -math.sqrt(2) * train['delta'] / train['sigma_clipped']
width_term = np.log(math.sqrt(2) * train['sigma_clipped'])
train['score'] = error_term - width_term
score = train['score'].mean()
print(score)

In [None]:
# Inspect the per-row optimised Confidence labels and scores.
train.head(10)

## <u>Predict Confidence</u>

In [None]:
# Second model: predict the optimised Confidence labels with the same linear setup.
TARGET = 'Confidence'

target = train[TARGET]
test[TARGET] = np.nan

# features: as for the FVC model, additionally excluding the true and predicted FVC
cat_features = ['Sex', 'SmokingStatus']
drop_features = [ID, TARGET, 'predict_Week', 'base_Week', 'FVC', 'FVC_pred']
num_features = [
    c for c in test.columns
    if test.dtypes[c] != 'object' and c not in cat_features
]
features = [c for c in num_features + cat_features if c not in drop_features]

oof, predictions = run_kfold_linear_nn(
    train, test, folds, features, target, n_fold=N_FOLD
)

In [None]:
# Score the out-of-fold Confidence predictions with the same metric.
train['Confidence'] = oof
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
error_term = -math.sqrt(2) * train['delta'] / train['sigma_clipped']
width_term = np.log(math.sqrt(2) * train['sigma_clipped'])
train['score'] = error_term - width_term
score = train['score'].mean()
print(score)

In [None]:
def lb_metric(train):
    """Return the mean per-row score of ``train``.

    Reads the ``Confidence``, ``FVC`` and ``FVC_pred`` columns. As a side
    effect the intermediate columns ``sigma_clipped``, ``diff``, ``delta`` and
    ``score`` are written into the DataFrame that was passed in.

    Args:
        train: DataFrame with ``Confidence``, ``FVC`` and ``FVC_pred`` columns.

    Returns:
        The mean of ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)`` where
        ``sigma`` is ``Confidence`` floored at 70 and ``delta`` is the absolute
        FVC error capped at 1000.
    """
    train['sigma_clipped'] = train['Confidence'].clip(lower=70)
    train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
    train['delta'] = train['diff'].clip(upper=1000)
    error_term = -math.sqrt(2) * train['delta'] / train['sigma_clipped']
    width_term = np.log(math.sqrt(2) * train['sigma_clipped'])
    train['score'] = error_term - width_term
    return train['score'].mean()

In [None]:
# Recompute the score through the helper and record it in the log.
score = lb_metric(train)
logger.info(f"Local Score: {score}")

In [None]:
# Fold-averaged Confidence predictions for the test rows.
test['Confidence'] = predictions

## <u>Submission</u>

In [None]:
# Show the sample submission layout that the output file has to follow.
submission.head()

In [None]:
# Replace the placeholder FVC / Confidence of the sample submission with the
# predictions for the same `Patient_Week`, then write the file.
id_frame = submission.drop(columns=['FVC', 'Confidence'])
pred_frame = test[['Patient_Week', 'FVC_pred', 'Confidence']]
sub = id_frame.merge(pred_frame, on='Patient_Week')
# Take over the sample's column names (positionally), so `FVC_pred` becomes `FVC`.
sub.columns = submission.columns
sub.to_csv('submission.csv', index=False)
sub.head()