# How Well Can We Do Without A Model?

## Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import gc; gc.enable()

## Data Ingestion

In [None]:
# Load the competition's training table, test table and submission template.
df_train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
df_test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
sub = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

# Quick size report. The count below is over the `Patient` column, so the
# message says "patients" (it previously said "customers").
print('Train shape:', df_train.shape)
print('Number of unique patients in train: {}'.format(df_train['Patient'].nunique()))
print('Test shape:', df_test.shape)

## Quick Peek

In [None]:
# Show the first three rows of the training frame.
df_train.head(3)

In [None]:
# Show the first three rows of the test frame.
df_test.head(3)

In [None]:
# Show the first three rows of the sample submission.
sub.head(3)

## Relative-Group FVC Distributions

In [None]:
# Figure size reused by the plt.figure(...) calls in later cells.
SIZE = (20, 8)

# FVC / Percent * 100 is the FVC value that would correspond to Percent == 100
# for each row. Draw its histogram for the training rows, then the test rows,
# each in its own figure.
for frame in (df_train, df_test):
    plt.figure(figsize=SIZE)
    frame['FVC'].div(frame['Percent']).mul(100).hist()
    plt.show()

## Data Prep

In [None]:
# Patient_Week has the form "<Patient>_<Weeks>". Split it once, on the LAST
# underscore only, so the week is always the final piece even if a patient ID
# were to contain an underscore itself.
patient_week = sub['Patient_Week'].str.rsplit('_', n=1)
sub['Patient'] = patient_week.str[0]
# Weeks is stored as an integer so it can be compared with the numeric
# Weeks columns of the train/test frames.
sub['Weeks'] = patient_week.str[1].astype('int64')

### Sanity Check

In [None]:
# Distinct patient IDs parsed out of the submission's Patient_Week column.
sub.Patient.unique().tolist()

In [None]:
# Distinct patient IDs present in the test frame.
df_test.Patient.unique().tolist()

In [None]:
# Histogram of the week values parsed out of the submission rows.
sub['Weeks'].hist()

In [None]:
# Histogram of the Weeks column in the training frame.
df_train.Weeks.hist()

In [None]:
# Histogram of the Weeks column in the test frame.
df_test.Weeks.hist()

### Outliers? Poorly-Behaved Patient Percents

I focus on Percent rather than FVC because raw FVC depends on hidden parameters (weight, height, ethnicity, etc.).

In [None]:
# Distinct training patient IDs; also reused by the trajectory plot further down.
patients = df_train['Patient'].unique().tolist()

# One line per patient: row-to-row fractional change in Percent, plotted
# against Weeks. Large spikes flag patients whose Percent jumps abruptly.
plt.figure(figsize=SIZE)
for pid in patients:
    is_patient = df_train['Patient'] == pid
    history = df_train.loc[is_patient, ['Percent', 'Weeks']]
    plt.plot(history['Weeks'], history['Percent'].pct_change())
plt.show()

In [None]:
# Distribution of Percent over all training rows.
plt.figure(figsize=SIZE)
df_train['Percent'].hist()
plt.show()

In [None]:
# Raw Percent trajectory over Weeks, one line for each ID in `patients`.
plt.figure(figsize=SIZE)
for pid in patients:
    rows = df_train.loc[df_train['Patient'] == pid, ['Percent', 'Weeks']]
    plt.plot(rows['Weeks'], rows['Percent'])
plt.show()

In [None]:
# From here on `patients` holds the distinct patient IDs of the TEST frame.
patients = df_test['Patient'].unique().tolist()

# Scatter (not line) plot of Percent against Weeks, one colour per test patient.
plt.figure(figsize=SIZE)
for pid in patients:
    rows = df_test.loc[df_test['Patient'] == pid, ['Percent', 'Weeks']]
    plt.scatter(rows['Weeks'], rows['Percent'])
plt.show()

## Combine DataFrames

In [None]:
# Blank the target columns of the submission rows so they enter the combined
# frame as missing values.
for column in ('FVC', 'Confidence'):
    sub[column] = np.nan

# Stack train, test and submission rows into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test, sub], ignore_index=True)
df

## Simple / Naive Inference

In [None]:
# gpFVC: the FVC value that corresponds to Percent == 100 for the row.
# It is NaN wherever FVC or Percent is missing.
df['gpFVC'] = df['FVC'].div(df['Percent']).mul(100)

# Fill the gaps in Percent, then in gpFVC, with that patient's own median
# of the respective column.
for column in ('Percent', 'gpFVC'):
    patient_median = df.groupby('Patient')[column].transform('median')
    df[column] = df[column].fillna(patient_median)

# Rebuild FVC for every row from the (now filled) Percent and gpFVC.
df['FVC'] = df['Percent'].mul(df['gpFVC']).div(100)

df

## Submission

In [None]:
# Remove the FVC / Confidence columns from the submission frame, then run a
# garbage-collection pass.
sub.drop(columns=['FVC', 'Confidence'], inplace=True)
gc.collect()

In [None]:
# Attach the FVC computed in the combined frame to each submission row
# (matched on Patient_Week), and drop the helper columns Patient and Weeks.
predicted_fvc = df[['Patient_Week', 'FVC']]
sub = (
    sub.merge(predicted_fvc, on='Patient_Week', how='left')
       .drop(columns=['Patient', 'Weeks'])
)

# Confidence is set to a flat 12% of the FVC value on the same row.
sub['Confidence'] = sub['FVC'] * 0.12

sub.head()

In [None]:
# Write the submission frame to submission.csv without the index column.
sub.to_csv('submission.csv', index=False)