In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# Select seaborn's 'deep' colour palette as the default for the plots below.
sns.set_palette('deep')

In [None]:
# Directory holding the competition CSV files. It is defined once so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = '../input/osic-pulmonary-fibrosis-progression'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Raw FVC over the raw week number; colour encodes sex and line style encodes
# smoking status.
sns.lineplot(
    data=train,
    x="Weeks",
    y="FVC",
    hue="Sex",
    style="SmokingStatus",
)

No clear trend is visible in the raw FVC values.

In [None]:
def metric(trueFVC, predFVC, predSTD):
    """Return the mean Laplace-style log-likelihood score (higher is better).

    For every sample the score is::

        -sqrt(2) * delta / sigma - ln(sqrt(2) * sigma)

    where ``sigma`` is ``predSTD`` clipped to the range [70, 9e9] and
    ``delta`` is ``|trueFVC - predFVC|`` clipped to the range [0, 1000].

    Parameters
    ----------
    trueFVC, predFVC, predSTD : array-like of numbers
        Observed values, predicted values and predicted uncertainties.
        Each input is converted to a float ndarray, so plain lists and
        tuples are accepted as well as NumPy arrays, and values are
        compared element-wise by position.

    Returns
    -------
    numpy.float64
        The mean of the per-sample scores.
    """
    # Coerce up front: subtracting two Python lists would raise TypeError.
    true_fvc = np.asarray(trueFVC, dtype=float)
    pred_fvc = np.asarray(predFVC, dtype=float)
    pred_std = np.asarray(predSTD, dtype=float)

    # A floor of 70 on sigma keeps over-confident predictions from being
    # rewarded without bound; the cap of 1000 on the error bounds the penalty.
    clipSTD = np.clip(pred_std, 70, 9e9)
    deltaFVC = np.clip(np.abs(true_fvc - pred_fvc), 0, 1000)
    return np.mean(-1*(np.sqrt(2)*deltaFVC/clipSTD) - np.log(np.sqrt(2) * clipSTD))

In [None]:
# make train like test: attach each patient's first recorded visit as
# baseline columns (Weeks_init, FVC_init, Percent_init).
#
# groupby('Patient').nth(0) is avoided on purpose: from pandas 2.0 on it acts
# as a filter that keeps the original row index, so the resulting Series has
# no 'Patient' key to merge on. drop_duplicates(keep='first') selects the same
# first row per patient on every pandas version.
# NOTE(review): "first" means first in file order - this assumes train.csv is
# ordered by week within each patient; confirm before relying on it.
first_visit = train.drop_duplicates(subset='Patient', keep='first').set_index('Patient')

for col in ('Weeks', 'FVC', 'Percent'):
    # Look each row's Patient up in the baseline table. Plain column
    # assignment (instead of merge) keeps the cell safe to re-run: it
    # overwrites the column rather than adding _x/_y suffixed duplicates.
    train[f"{col}_init"] = train['Patient'].map(first_visit[col])

# Constant placeholder uncertainty used by the baseline evaluation.
train["Confidence"] = 100
train.head()

In [None]:
# Baseline: score the last three rows of every patient when the prediction is
# simply that patient's first FVC value.
last_three = train.groupby('Patient').tail(3)
print('Metric:', metric(trueFVC=last_three['FVC'].values,
                        predFVC=last_three['FVC_init'].values,
                        predSTD=last_three['Confidence'].values))

In [None]:
# Express every measurement as a fraction of the patient's initial FVC.
train["FVC_norm"] = train["FVC"].div(train["FVC_init"])

In [None]:
# Apply the seaborn theme with a larger default figure size (in inches), then
# draw the normalized FVC over the raw week number.
figure_rc = {'figure.figsize': (11.7, 8.27)}
sns.set(rc=figure_rc)

sns.lineplot(
    data=train,
    x="Weeks",
    y="FVC_norm",
    hue="Sex",
    style="SmokingStatus",
)

In [None]:
# sns.set still applies the seaborn theme, but lmplot is a figure-level
# function: it creates its own figure and sizes it from `height` and `aspect`,
# so the 'figure.figsize' rc entry alone does not enlarge it. The intended
# 11.7 x 8.27 inch size is therefore passed explicitly.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.lmplot(x="Weeks", y="FVC_norm", hue="Sex", data=train,
           height=8.27, aspect=11.7 / 8.27)

In [None]:
# sns.set still applies the seaborn theme, but lmplot is a figure-level
# function: it creates its own figure and sizes it from `height` and `aspect`,
# so the 'figure.figsize' rc entry alone does not enlarge it. The intended
# 11.7 x 8.27 inch size is therefore passed explicitly.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.lmplot(x="Weeks", y="FVC_norm", hue="SmokingStatus", data=train,
           height=8.27, aspect=11.7 / 8.27)

Normalizing FVC by each patient's initial value helps.

In [None]:
# Weeks should be normalized to start at zero: count weeks from each
# patient's initial visit instead of using the raw week number.
train["Weeks_norm"] = train["Weeks"].sub(train["Weeks_init"])

In [None]:
# Apply the seaborn theme with a larger default figure size (in inches), then
# draw the normalized FVC over the weeks elapsed since the initial visit.
figure_rc = {'figure.figsize': (11.7, 8.27)}
sns.set(rc=figure_rc)

sns.lineplot(
    data=train,
    x="Weeks_norm",
    y="FVC_norm",
    hue="Sex",
    style="SmokingStatus",
)

In [None]:
# sns.set still applies the seaborn theme, but lmplot is a figure-level
# function: it creates its own figure and sizes it from `height` and `aspect`,
# so the 'figure.figsize' rc entry alone does not enlarge it. The intended
# 11.7 x 8.27 inch size is therefore passed explicitly.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.lmplot(x="Weeks_norm", y="FVC_norm", hue="Sex", data=train,
           height=8.27, aspect=11.7 / 8.27)

In [None]:
# sns.set still applies the seaborn theme, but lmplot is a figure-level
# function: it creates its own figure and sizes it from `height` and `aspect`,
# so the 'figure.figsize' rc entry alone does not enlarge it. The intended
# 11.7 x 8.27 inch size is therefore passed explicitly.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.lmplot(x="Weeks_norm", y="FVC_norm", hue="SmokingStatus", data=train,
           height=8.27, aspect=11.7 / 8.27)

In [None]:
# sns.set still applies the seaborn theme, but lmplot is a figure-level
# function: each facet is sized from `height` and `aspect`, so the
# 'figure.figsize' rc entry alone does not enlarge the grid. The intended
# 11.7 inch total width is divided across one column per distinct Sex value.
sns.set(rc={'figure.figsize':(11.7,8.27)})
n_panels = max(train["Sex"].nunique(), 1)  # guard against division by zero
sns.lmplot(x="Weeks_norm", y="FVC_norm", hue="SmokingStatus", col="Sex", data=train,
           height=8.27, aspect=11.7 / (8.27 * n_panels))

Also normalizing the weeks by each patient's starting week looks even better.

In [None]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error

# Ordinary least-squares fit of normalized FVC against weeks since the
# initial visit.
X = train["Weeks_norm"].values
Y = train["FVC_norm"].values

# scikit-learn expects a 2-D feature matrix, so the single feature is turned
# into one column; fit() returns the estimator itself.
regr = linear_model.LinearRegression().fit(X[:, None], Y)

# Report the fitted slope and intercept.
print('Coefficients: ', regr.coef_)
print('Intercept: ', regr.intercept_)

Let's use that slope for our submission and scale it by each patient's initial FVC value!

In [None]:
# Split Patient_Week on '_' once: the first piece is stored as the patient id
# and the last piece is parsed as the integer week.
id_week_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = id_week_parts.str[0]
submission['Weeks'] = id_week_parts.str[-1].astype('int64')
submission.head()

In [None]:
# Attach the per-patient columns from `test` to every requested submission
# row. Column names present in both frames (FVC, Weeks) come back with the
# default _x (submission) and _y (test) suffixes.
baseline_cols = ['Patient','Percent','Age','Sex','SmokingStatus', 'FVC', 'Weeks']
test_new = submission.merge(test[baseline_cols], on='Patient', how='left')

In [None]:
# Give the suffixed merge columns meaningful names (_x came from the
# submission frame, _y from `test`), then count weeks from the initial visit.
test_new = (
    test_new
    .rename(columns={
        "Weeks_x": "Weeks",
        "Weeks_y": "Weeks_init",
        "FVC_x": "FVC_2000",
        "FVC_y": "FVC_init",
    })
    .assign(Weeks_norm=lambda frame: frame["Weeks"] - frame["Weeks_init"])
)
test_new.head(10)

In [None]:
# Take the weekly slope from the fitted model instead of a value copied by
# hand from its printed output (previously hard-coded as -0.00139284), so the
# prediction stays in sync if the fit is ever re-run on different data.
weekly_slope = float(regr.coef_[0])

# The intercept is fixed at 1, so the prediction equals FVC_init at the
# initial visit (Weeks_norm == 0) and is scaled linearly from there.
test_new["FVC_pred_linear"] = (1 + weekly_slope * test_new["Weeks_norm"]) * test_new["FVC_init"]

In [None]:
# hyperparameter chosen by best value from train
SUBMISSION_CONFIDENCE = 285
test_new["Confidence"] = SUBMISSION_CONFIDENCE

In [None]:
# Expose the linear prediction under the column name used in the submission
# file, write the three submission columns, then preview what was written.
test_new = test_new.rename(columns={"FVC_pred_linear": "FVC"})
submission_cols = ['Patient_Week','FVC','Confidence']
test_new[submission_cols].to_csv('submission.csv', index=False)
test_new[submission_cols].head(10)