In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read in the inputs

In [None]:
# Load the competition's training table (one row per patient visit).
train_csv_path = "/kaggle/input/osic-pulmonary-fibrosis-progression/train.csv"
data = pd.read_csv(train_csv_path)

In [None]:
# How many rows and columns? Displayed as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first rows (head() defaults to five) to see which columns are available.
data.head()

In [None]:
# Count the recorded rows per patient; the length of the result is the
# number of distinct patients in the training table.
rows_per_patient = data.groupby('Patient').size()
rows_per_patient

Looks like 176 unique patients! Let's try to explain one row in words. Looking at the rows displayed earlier, take patient ID00007637202177411956430 as an example: this person is a male ex-smoker who is 79 years old. What are Percent, Weeks and FVC?

Why is weeks negative? From data description - "the relative number of weeks pre/post the baseline CT (may be negative)"

Others from Data Description:
FVC - is the recorded lung capacity in ml
(What's a good FVC value?)

Percent- a computed field which approximates the patient's FVC as a percent of the typical FVC for a person of similar characteristics. 

What's the goal?
It seems like we have to predict the lung capacity (FVC) based on sex, smoking status and age, and presumably also use the CT scans?

How is the model evaluated?
As per competition page, "This competition is evaluated on a modified version of the Laplace Log Likelihood. In medical applications, it is useful to evaluate a model's confidence in its decisions. Accordingly, the metric is designed to reflect both the accuracy and certainty of each prediction."



In [None]:
# How many distinct patients fall into each smoking-status category?
# (nunique counts patients, not rows, so repeat visits are not double-counted.)
patients_by_status = data.groupby('SmokingStatus')['Patient']
patients_by_status.nunique()

Looks like a lot of ex-smokers: 118 of 176, i.e., 67%. I wonder if people who never smoked have a higher lung capacity?

In [None]:
# Average recorded FVC per smoking status. The mean is taken over rows,
# so patients with more visits carry more weight.
# Seems odd that the data description page says FVC is "the recorded lung capacity in ml".
fvc_by_status = data.groupby('SmokingStatus')['FVC']
fvc_by_status.mean()

In [None]:
# Number of distinct patients with a measurement at each relative week.
patients_by_week = data.groupby('Weeks')['Patient']
patients_by_week.nunique()

In [None]:
# Mean FVC for every (week, smoking status, sex, age) combination present in the data.
profile_keys = ['Weeks','SmokingStatus','Sex','Age']
data.groupby(profile_keys)['FVC'].mean()

# Prep categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that must be turned into integer codes before modelling.
cat_features = ['Sex','SmokingStatus']
encoder = LabelEncoder()

# The single encoder is re-fitted on each column in turn, so every column
# gets its own integer codes; afterwards `encoder` only remembers the
# classes of the last column it saw.
encoded = data[cat_features].apply(lambda column: encoder.fit_transform(column))

In [None]:
# Put the numeric columns next to the integer-coded categorical columns
# (joined on the shared row index).
numeric_columns = data[['FVC','Percent','Weeks','Age']]
data2 = numeric_columns.join(encoded)
data2.head()

In [None]:
# Feature matrix and regression target.
# NOTE(review): per the data description, Percent is computed from FVC, so it
# is closely tied to the target - confirm it is intended as a feature.
feature_columns = ['SmokingStatus','Age','Sex','Weeks','Percent']
X = data2[feature_columns]
y = data2['FVC']

# Evaluation Metric

In [None]:
# Let's define a function to calculate the metric
# I didn't actually use this evaluation but sharing my thoughts
def eval_metric(FVC, FVC_Pred, sigma):
    """Per-row modified Laplace log likelihood.

    Parameters
    ----------
    FVC : scalar or array-like
        True FVC value(s).
    FVC_Pred : scalar or array-like
        Predicted FVC value(s), same length as ``FVC``.
    sigma : scalar or array-like
        Confidence value(s) submitted with each prediction.

    Returns
    -------
    numpy scalar or ndarray
        One score per row (higher is better). Take the mean to get a
        single number for a whole set of predictions.
    """
    # Element-wise clipping: np.maximum/np.minimum compare value by value,
    # whereas np.max/np.min would read the second argument as an axis.
    sigma_clipped = np.maximum(np.asarray(sigma, dtype=float), 70)
    # np.asarray drops any pandas index so the values are compared by position.
    abs_error = np.abs(np.asarray(FVC, dtype=float) - np.asarray(FVC_Pred, dtype=float))
    delta = np.minimum(abs_error, 1000)
    # NumPy's natural logarithm is np.log (there is no np.ln).
    return -np.sqrt(2) * delta / sigma_clipped - np.log(np.sqrt(2) * sigma_clipped)

# We need the prediction for FVC_Pred and confidence(sigma I think?)

# Simple Linear Regression Model

In [None]:
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

# Create training and test splits

In [None]:
# Split by patient rather than by row: each patient contributes several rows
# (one per visit), so a plain row-level shuffle would place the same patient
# in both the training and the hold-out set and leak information between them.
from sklearn.model_selection import GroupShuffleSplit

# Patient id for every row of X, looked up through the shared row index.
patient_groups = data.loc[X.index, 'Patient']
# Hold out roughly 40% of the patients; the fixed seed keeps the split reproducible.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.4, random_state=0)
train_rows, test_rows = next(splitter.split(X, y, groups=patient_groups))
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [None]:
# Ordinary least-squares model, fitted on the training partition only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Fitted intercept: the predicted FVC when every feature equals zero.
print(regressor.intercept_)
# Fitted coefficients: one value per feature, in the column order of X_train.
print(regressor.coef_)


In [None]:
# Predict FVC for the held-out rows so the predictions can be compared with y_test.
y_pred = regressor.predict(X_test)

# Predictions

In [None]:
# Side-by-side table of true and predicted FVC for the held-out rows
# (y_test supplies the row index).
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
df = pd.DataFrame(comparison_columns)
df

In [None]:
# Bar chart of actual vs. predicted FVC for the first 25 held-out rows.
df1 = df.head(25)
ax = df1.plot.bar(figsize=(16,10))
# Style the major and minor grid lines separately.
ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# Load the competition's test table.
test_csv_path = "/kaggle/input/osic-pulmonary-fibrosis-progression/test.csv"
test = pd.read_csv(test_csv_path)

In [None]:
# Preview the test table; its categorical columns are encoded in a later cell.
test.head()

In [None]:
# Build the "<Patient>_<Weeks>" key in the same format as the
# sample submission's Patient_Week column.
patient_ids = test['Patient'].astype(str)
week_labels = test['Weeks'].astype(str)
test['Patient_Week'] = patient_ids + "_" + week_labels
test.head()

In [None]:
# Mean recorded FVC per smoking status in the test table, for comparison with the training data.
test.groupby('SmokingStatus')['FVC'].mean() 

In [None]:
# Encode the test categoricals with the SAME integer codes the model was
# trained on. Each encoder is fitted on the training column and only applied
# to the test column; re-fitting on the test frame could assign different
# codes whenever the test set lacks one of the training categories.
# A category that never appears in training raises ValueError here instead of
# being silently mis-coded.
test_encoded = test[cat_features].apply(
    lambda column: LabelEncoder().fit(data[column.name]).transform(column)
)
# A separate name is used so the training frame `encoded` is not overwritten.
test2 = test[['Patient','Percent','Weeks','Age']].join(test_encoded)

In [None]:
# Inspect the encoded test features (up to the first 100 rows).
test2.head(100)

In [None]:
# Load the sample submission; it lists every Patient_Week row to be predicted.
sample_submission_path = "/kaggle/input/osic-pulmonary-fibrosis-progression/sample_submission.csv"
submission = pd.read_csv(sample_submission_path)

In [None]:
# Inspect the expected submission layout (up to the first 100 rows).
submission.head(100)

In [None]:
# Split "<Patient>_<Weeks>" back into its two parts. Splitting once from the
# right keeps the patient id intact even if it ever contains an underscore.
patient_and_week = submission['Patient_Week'].str.rsplit("_", n=1, expand=True)
submission['Patient'] = patient_and_week[0]
# str.rsplit yields strings; Weeks is a numeric model feature, so cast it
# back to integers (negative weeks such as "-12" parse correctly).
submission['Weeks'] = patient_and_week[1].astype(int)

In [None]:
# Check that the Patient and Weeks columns were added to the submission frame.
submission.head()

In [None]:
# Remove the placeholder FVC/Confidence columns from the submission frame and
# the Weeks column from the test features, so the merge on Patient keeps only
# the submission's own Weeks column.
# `columns=` replaces the positional axis argument, which newer pandas
# versions no longer accept; errors='ignore' lets the cell be re-run after
# the columns are already gone.
submission = submission.drop(columns=['FVC', 'Confidence'], errors='ignore')
test2 = test2.drop(columns='Weeks', errors='ignore')

In [None]:
# Attach each patient's test-table features to every Patient_Week row.
# A left join keeps all submission rows.
submission2 = submission.merge(test2, how='left', on='Patient')
submission2.head(100)

In [None]:
# Same feature columns, in the same order, as the matrix the regressor was fitted on.
submission_features = ['SmokingStatus','Age','Sex','Weeks','Percent']
X2 = submission2[submission_features]
# Store the model's FVC prediction for each requested Patient_Week row.
submission2['FVC'] = regressor.predict(X2)

In [None]:
# Check that the predicted FVC column was added.
submission2.head()

In [None]:
# (rows, columns) of the prediction frame.
submission2.shape

In [None]:
# Mean predicted FVC for each (smoking status, sex, age) group.
group_keys = ['SmokingStatus','Sex','Age']
submission2.groupby(group_keys)['FVC'].mean()

In [None]:
# Broadcast each (smoking status, sex, age) group's mean predicted FVC back
# onto every row of that group.
predicted_by_group = submission2.groupby(['SmokingStatus','Sex','Age'])['FVC']
submission2['FVC_Group'] = predicted_by_group.transform('mean')

In [None]:
# Inspect the frame with the new FVC_Group column (up to the first 100 rows).
submission2.head(100)

In [None]:
# Confidence heuristic: each prediction as a percentage of its group's mean prediction.
# NOTE(review): the competition text describes Confidence as the model's
# certainty in its prediction - confirm this ratio is the intended stand-in.
scaled_fvc = submission2['FVC'].mul(100)
submission2['Confidence'] = scaled_fvc.div(submission2['FVC_Group'])

In [None]:
# Inspect the frame with the new Confidence column (up to the first 100 rows).
submission2.head(100)

In [None]:
# Keep only the three columns of the submission format. The explicit copy
# makes submission3 an independent frame, so later column assignments on it
# do not trigger pandas' SettingWithCopyWarning.
submission3 = submission2[['Patient_Week','FVC','Confidence']].copy()

In [None]:
# Preview the submission columns before they are cast to integers.
submission3.head()

In [None]:
# Cast both numeric columns to integers in one step. astype returns a new
# frame, which avoids assigning into a slice of submission2
# (SettingWithCopyWarning) and makes the cell safe to re-run.
# Note: the float -> int cast truncates toward zero; it does not round.
submission3 = submission3.astype({'FVC': int, 'Confidence': int})

In [None]:
# Final check of the submission frame after the integer cast.
submission3.head()

In [None]:
# Write the submission file into Kaggle's working directory, without the row index.
submission_path = "/kaggle/working/submission.csv"
submission3.to_csv(submission_path, index=False)