# OSIC: Pulmonary Fibrosis | Baseline Regression

* Feature engineering
* Experimentation with simple and multiple regression models
* Preparation of sample submissions

## Import Libraries

In [None]:
import os
# List every file available under the Kaggle input directory, one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn import linear_model
import statsmodels.api as sm

# Load Data

## Feature Engineering | Train

In [None]:
# Root directory of the competition dataset (image folders and CSV tables).
BASE_PATH = '../input/osic-pulmonary-fibrosis-progression'

# Image directories for the train and test splits.
data_train_dir, data_test_dir = f'{BASE_PATH}/train', f'{BASE_PATH}/test'

# Tabular data for the train and test splits.
train, test = (
    pd.read_csv(f'{BASE_PATH}/{split}.csv') for split in ('train', 'test')
)

In [None]:
# Load the submission template shipped with the dataset.
sample_submission_path = f'{BASE_PATH}/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

### Create 'Initial FVC' Column

In [None]:
# Cache: patient ID -> FVC taken from the first row seen for that patient.
patient_dict = {}
def init_fvc(row):
    """Return the FVC of the first row encountered for ``row['Patient']``.

    The first call for a patient stores that row's FVC in ``patient_dict``;
    every later call for the same patient returns the stored value.
    NOTE(review): "first" means first in iteration order, so this presumably
    relies on each patient's rows being visited chronologically - confirm.
    """
    patient = row['Patient']
    if patient in patient_dict:
        return patient_dict[patient]
    first_fvc = row['FVC']
    patient_dict[patient] = first_fvc
    return first_fvc

# Evaluate init_fvc row by row (axis=1) and store the result as a new column.
train['InitFVC'] = train.apply(init_fvc, axis=1)
train.head(20)

### Create 'Initial Week' Column

In [None]:
# Cache: patient ID -> Weeks value taken from the first row seen for that patient.
patient_dict = {}
def init_week(row):
    """Return the Weeks value of the first row encountered for ``row['Patient']``.

    A cache miss records the current row's Weeks in ``patient_dict`` and
    returns it; a cache hit returns the previously recorded value.
    NOTE(review): depends on row iteration order - presumably rows are
    chronological per patient; confirm.
    """
    patient = row['Patient']
    try:
        return patient_dict[patient]
    except KeyError:
        patient_dict[patient] = row['Weeks']
        return row['Weeks']

# Evaluate init_week row by row (axis=1) and store the result as a new column.
train['InitWeeks'] = train.apply(init_week, axis=1)
train.head(20)

### Add Column 'InitPercent'

In [None]:
# Cache: patient ID -> Percent taken from the first row seen for that patient.
patient_dict = {}
def init_percent(row):
    """Return the Percent of the first row encountered for ``row['Patient']``.

    The value is remembered in ``patient_dict`` the first time a patient is
    seen and reused for all of that patient's subsequent rows.
    NOTE(review): depends on row iteration order - presumably rows are
    chronological per patient; confirm.
    """
    key = row['Patient']
    if key not in patient_dict:
        patient_dict[key] = row['Percent']
    return patient_dict[key]

# Evaluate init_percent row by row (axis=1) and store the result as a new column.
train['InitPercent'] = train.apply(init_percent, axis=1)
train.head(20)

### Convert Sex and SmokingStatus to Indicator Variables

In [None]:
# Replace the two categorical columns with indicator columns named
# '<column>_<category>'.
categorical_cols = ['Sex', 'SmokingStatus']
train_df = pd.get_dummies(
    train,
    columns=categorical_cols,
    prefix=categorical_cols,
)

In [None]:
# Preview the first rows of the indicator-encoded training frame.
train_df.head()

## Feature Engineering | Test
* For each patient, add one row for every week from -12 to 133 (inclusive).
* All else is *constant* for now
* Age is *initial age*

In [None]:
# Preview the first rows of the test table as loaded from test.csv.
test.head()

In [None]:
# Expand the test table to one row per (week, patient) pair for every week in
# -12..133 inclusive.  Each generated row copies the patient's test.csv values,
# overwrites 'Weeks' with the target week, and keeps the original measurement
# week in a new 'InitWeeks' column.
#
# Values are assigned by column NAME rather than zipped positionally against
# test.columns, so the result no longer depends on the CSV's column order.
base_cols = list(test.columns)  # loop-invariant; computed once
data = []
for i in range(-12, 133 + 1):
    for _, row in test.iterrows():
        record = {col: row[col] for col in base_cols}
        record['Weeks'] = i                 # week to predict for
        record['InitWeeks'] = row['Weeks']  # week of the patient's measurement
        data.append(record)
test_df = pd.DataFrame(data)
test_df.head(10)

# Regression Models

## 1. Naive Simple Linear Regression (FVC vs. Weeks)

In [None]:
# Naive simple linear regression: FVC as a function of Weeks only.
X = train[['Weeks']]
Y = train['FVC']

# scikit-learn fits an intercept by default.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels' OLS does not add an intercept on its own.  Add the constant
# column explicitly so the summary (coefficients, R-squared, p-values)
# describes the same model as `regr` instead of a regression through the origin.
model = sm.OLS(Y, sm.add_constant(X)).fit()
print(model.summary())

### Create Weeks vs. FVC Plot 

The trend is very weak because each patient has a different initial FVC. (Everyone starts from a different point and declines at a different rate.) 

* A negative correlation can be visualized by **grouping by InitFVC** (See section 2 below). 

In [None]:
# Scatter plot with a fitted regression line on a random 80% sample of the
# training rows.
plot_sample = train.sample(frac=0.8)
sns.lmplot(x='Weeks', y='FVC', data=plot_sample)

In [None]:
# Predict FVC for the raw test rows with the Weeks-only model and print the
# predictions next to the FVC values stored in test.csv.
X_test, Y_test = test[['Weeks']], test['FVC']
predicted_fvc = regr.predict(X_test)
print('Predicted FVCs: \n', predicted_fvc)
print('Actual FVCs: \n', Y_test)

## 2. Basic Multiple Linear Regression (FVC vs. InitFVC & Weeks)
* Incorporate InitFVC as a feature in the linear regression model in order to differentiate between each patient's starting points. 
* InitFVC controls y-intercept of line. 

In [None]:
# Multiple linear regression: FVC as a function of Weeks and InitFVC.
X = train[['Weeks', 'InitFVC']]
Y = train['FVC']

# scikit-learn fits an intercept by default.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels' OLS does not add an intercept on its own.  Add the constant
# column explicitly so the summary describes the same model as `regr`.
model = sm.OLS(Y, sm.add_constant(X)).fit()
print(model.summary())

### Visualize Weeks vs. FVC Grouped by InitFVC
* When grouped by InitFVC, the negative correlation is much more pronounced. 

In [None]:
# One regression line per distinct InitFVC value, restricted to the first 98
# training rows.
first_rows = train.head(98)
sns.lmplot(x='Weeks', y='FVC', hue='InitFVC', data=first_rows)

# Show the raw test table for comparison.
display(test.head())

In [None]:
# `regr` was fitted on the columns ['Weeks', 'InitFVC'].  In test_df the 'FVC'
# column holds each patient's test.csv measurement repeated for every week, so
# it is used as the 'InitFVC' feature.  It is renamed so the feature names
# match those seen during fit, which scikit-learn validates at predict time.
X_test = test_df[['Weeks', 'FVC']].rename(columns={'FVC': 'InitFVC'})
Y_test = test_df['FVC']

Y_pred = regr.predict(X_test)
# Uncomment to inspect the raw values:
# print('Predicted FVCs: \n', Y_pred)
# print('Actual FVCs: \n', Y_test)

## 3. Basic Multiple Linear Regression (FVC vs. InitFVC, Weeks, InitWeeks)

In [None]:
# Multiple linear regression: FVC as a function of Weeks, InitFVC and InitWeeks.
X = train[['Weeks', 'InitFVC', 'InitWeeks']]
Y = train['FVC']

# scikit-learn fits an intercept by default.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels' OLS does not add an intercept on its own.  Add the constant
# column explicitly so the summary describes the same model as `regr`.
model = sm.OLS(Y, sm.add_constant(X)).fit()
print(model.summary())

In [None]:
# `regr` was fitted on ['Weeks', 'InitFVC', 'InitWeeks'].  test_df's 'FVC'
# column (the patient's test.csv measurement repeated for every week) is used
# as the 'InitFVC' feature.  It is renamed so the feature names match those
# seen during fit, which scikit-learn validates at predict time.
X_test = test_df[['Weeks', 'FVC', 'InitWeeks']].rename(columns={'FVC': 'InitFVC'})
Y_test = test_df['FVC']

Y_pred = regr.predict(X_test)
# Uncomment to inspect the raw values:
# print('Predicted FVCs: \n', Y_pred)
# print('Actual FVCs: \n', Y_test)

### Visualize Test Regression 
* Slopes are equal for this model, since **InitFVC** and **InitWeeks** are *constant* for each patient

In [None]:
# Assemble the plotting frame column-wise: patient and week come straight from
# test_df, FVC is the model prediction, Confidence is a fixed placeholder.
viz = pd.DataFrame({
    'Patient': test_df['Patient'],
    'Weeks': test_df['Weeks'],
    'FVC': Y_pred,
    'Confidence': 100,
})
sns.lmplot(x='Weeks', y='FVC', hue='Patient', data=viz)

## 4. MLR (FVC vs. Weeks, InitPercent, InitWeeks)

In [None]:
# Preview the training frame, including the engineered Init* columns.
train.head()

In [None]:
# Multiple linear regression: FVC as a function of Weeks, InitPercent and
# InitWeeks.  The previous feature list ('Weeks', 'InitFVC', 'InitWeeks') was
# identical to model 3 and did not match this section's heading.
X = train[['Weeks', 'InitPercent', 'InitWeeks']]
Y = train['FVC']

# scikit-learn fits an intercept by default.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels' OLS does not add an intercept on its own.  Add the constant
# column explicitly so the summary describes the same model as `regr`.
model = sm.OLS(Y, sm.add_constant(X)).fit()
print(model.summary())

## 5. MLR (FVC vs. Weeks, Age, InitWeeks, InitFVC)
* Incorporation of age --> p-value of predictor < 0.05. 
* Age could have an effect on **rate of decline**. 

In [None]:
# Multiple linear regression: FVC as a function of Weeks, InitFVC, InitWeeks
# and Age.
X = train[['Weeks', 'InitFVC', 'InitWeeks', 'Age']]
Y = train['FVC']

# scikit-learn fits an intercept by default.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels' OLS does not add an intercept on its own.  Add the constant
# column explicitly so the summary (including the p-values) describes the same
# model as `regr`.
model = sm.OLS(Y, sm.add_constant(X)).fit()
print(model.summary())

In [None]:
# `regr` was fitted on ['Weeks', 'InitFVC', 'InitWeeks', 'Age'].  test_df's
# 'FVC' column (the patient's test.csv measurement repeated for every week) is
# used as the 'InitFVC' feature.  It is renamed so the feature names match
# those seen during fit, which scikit-learn validates at predict time.
X_test = test_df[['Weeks', 'FVC', 'InitWeeks', 'Age']].rename(
    columns={'FVC': 'InitFVC'}
)
Y_test = test_df['FVC']

Y_pred = regr.predict(X_test)
# Uncomment to inspect the raw values:
# print('Predicted FVCs: \n', Y_pred)
# print('Actual FVCs: \n', Y_test)

## 6. MLR (FVC vs. Weeks, Age, SmokingStatus, InitWeeks, InitFVC)

In [None]:
# Preview the indicator-encoded training frame to check the available column names.
train_df.head()

In [None]:
# Multiple linear regression: FVC as a function of Weeks, InitFVC, InitWeeks,
# Age and the 'currently smokes' indicator.
#
# Cast the design matrix to float: depending on the pandas version,
# get_dummies may emit bool indicator columns, and mixing bool with integer
# columns yields an object array that statsmodels refuses to fit.
X = train_df[
    ['Weeks', 'InitFVC', 'InitWeeks', 'Age', 'SmokingStatus_Currently smokes']
].astype(float)
Y = train_df['FVC']

# scikit-learn fits an intercept by default.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels' OLS does not add an intercept on its own.  Add the constant
# column explicitly so the summary describes the same model as `regr`.
model = sm.OLS(Y, sm.add_constant(X)).fit()
print(model.summary())

# Sample Submission

* **Confidence** - Redefined as **UNCERTAINTY/STANDARD DEVIATION**
* TODO: Implement confidence optimization function based on predictions. 
* Current optimal confidence estimate ~ **244** based on graph from https://www.kaggle.com/rohanrao/osic-understanding-laplace-log-likelihood/data?select=train

In [None]:
# Preview the raw test table before building the submission.
test.head()

In [None]:
# Build the submission frame column-wise: the '<Patient>_<Weeks>' key, the
# predicted FVC, and a constant Confidence of 244 for every row.
patient_weeks = [
    patient + "_" + str(week)
    for patient, week in zip(test_df['Patient'], test_df['Weeks'])
]
submission = pd.DataFrame({
    'Patient_Week': patient_weeks,
    'FVC': Y_pred,
    'Confidence': 244,
})
submission.head(95)

# TODO: create a function to compare the difference between actual and predicted FVC for a given week 

In [None]:
# Write the submission to the working directory, leaving out the index column.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)