<a href="https://colab.research.google.com/github/sahana-manju/pulmonary_fibrosis/blob/main/pulmonary_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!gdown --id 10GiDUGg6qva-9h6A_BbTK4AlJU89uv7h

Downloading...
From: https://drive.google.com/uc?id=10GiDUGg6qva-9h6A_BbTK4AlJU89uv7h
To: /content/pulmonary.zip
100% 29.2k/29.2k [00:00<00:00, 26.9MB/s]


IMPORTING LIBRARIES

In [18]:
import numpy as np 
import pandas as pd 
import tensorflow as tf

LOADING TRAIN SET

In [20]:
!unzip pulmonary.zip -d pulmonary

Archive:  pulmonary.zip
  inflating: pulmonary/pulmonary/sample_submission.csv  
  inflating: pulmonary/pulmonary/test.csv  
  inflating: pulmonary/pulmonary/train.csv  
  inflating: pulmonary/pulmonary/validation.txt  


In [36]:
train_data=pd.read_csv('/content/pulmonary/pulmonary/train.csv')
train_data.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [37]:
train_data.shape

(1549, 7)

DATA PREPROCESSING

In [38]:
#ALL THE MISSING VALUES ARE HANDLED
train_data.isnull().sum()

Patient          0
Weeks            0
FVC              0
Percent          0
Age              0
Sex              0
SmokingStatus    0
dtype: int64

In [39]:
#Drop records having same Patient ID and week
train_data.drop_duplicates(subset=['Patient', 'Weeks'], keep=False, inplace=True)

In [40]:
train_data.shape

(1535, 7)

In [41]:
#GENERATE NEW COLUMNS LIKE BaslineWeek,BaselineFVC,TargetWeek,TargetFVC
train_new = []
for p in train_data['Patient'].unique():
    patientData = train_data[train_data['Patient'] == p]
    firstMeasure = list(patientData.iloc[0, :].values)
  
    for i, week in enumerate(patientData['Weeks'].iloc[1:]):
        fvc = patientData.iloc[i, 2]
        trainDataPoint = firstMeasure + [week, fvc]
        train_new.append(trainDataPoint)
    
        
train_new = pd.DataFrame(train_new)

train_new.columns = ['PatientID', 'BaselineWeek', 'BaselineFVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['TargetWeek', 'TargetFVC']

#Dropping percent column since it was increasing the loss
train_new.drop(columns = ['first_Percent'], inplace = True)

train_new.head()


Unnamed: 0,PatientID,BaselineWeek,BaselineFVC,Age,Sex,SmokingStatus,TargetWeek,TargetFVC
0,ID00007637202177411956430,-4,2315,79,Male,Ex-smoker,5,2315
1,ID00007637202177411956430,-4,2315,79,Male,Ex-smoker,7,2214
2,ID00007637202177411956430,-4,2315,79,Male,Ex-smoker,9,2061
3,ID00007637202177411956430,-4,2315,79,Male,Ex-smoker,11,2144
4,ID00007637202177411956430,-4,2315,79,Male,Ex-smoker,17,2069


In [42]:
train_new.shape

(1359, 8)

MIN MAX NORMALISATION

In [43]:
train_new["TargetWeek"]=(train_new['TargetWeek'] - train_new['TargetWeek'].min() ) / ( train_new['TargetWeek'].max() - train_new['TargetWeek'].min() )
train_new["Age"]=(train_new['Age'] - train_new['Age'].min() ) / ( train_new['Age'].max() - train_new['Age'].min() )
train_new["BaselineWeek"]=(train_new['BaselineWeek'] - train_new['BaselineWeek'].min() ) / ( train_new['BaselineWeek'].max() - train_new['BaselineWeek'].min() )
train_new["BaselineFVC"]=(train_new['BaselineFVC'] - train_new['BaselineFVC'].min() ) / ( train_new['BaselineFVC'].max() - train_new['BaselineFVC'].min() )

In [44]:
train_new.head()

Unnamed: 0,PatientID,BaselineWeek,BaselineFVC,Age,Sex,SmokingStatus,TargetWeek,TargetFVC
0,ID00007637202177411956430,0.011905,0.241456,0.769231,Male,Ex-smoker,0.030303,2315
1,ID00007637202177411956430,0.011905,0.241456,0.769231,Male,Ex-smoker,0.045455,2214
2,ID00007637202177411956430,0.011905,0.241456,0.769231,Male,Ex-smoker,0.060606,2061
3,ID00007637202177411956430,0.011905,0.241456,0.769231,Male,Ex-smoker,0.075758,2144
4,ID00007637202177411956430,0.011905,0.241456,0.769231,Male,Ex-smoker,0.121212,2069


In [46]:
#ENCODING Sex and SmokingStatus
train_new["Sex"]=train_new["Sex"].astype("category").cat.codes
train_new["SmokingStatus"]=train_new["SmokingStatus"].astype("category").cat.codes

In [47]:
train_new.head()

Unnamed: 0,PatientID,BaselineWeek,BaselineFVC,Age,Sex,SmokingStatus,TargetWeek,TargetFVC
0,ID00007637202177411956430,0.011905,0.241456,0.769231,1,1,0.030303,2315
1,ID00007637202177411956430,0.011905,0.241456,0.769231,1,1,0.045455,2214
2,ID00007637202177411956430,0.011905,0.241456,0.769231,1,1,0.060606,2061
3,ID00007637202177411956430,0.011905,0.241456,0.769231,1,1,0.075758,2144
4,ID00007637202177411956430,0.011905,0.241456,0.769231,1,1,0.121212,2069


In [48]:
train_new.columns

Index(['PatientID', 'BaselineWeek', 'BaselineFVC', 'Age', 'Sex',
       'SmokingStatus', 'TargetWeek', 'TargetFVC'],
      dtype='object')

In [49]:
target=train_new["TargetFVC"].values
features=train_new[[ 'BaselineWeek', 'BaselineFVC', 'Age', 'Sex',
       'SmokingStatus','TargetWeek']]

In [50]:
features.head()

Unnamed: 0,BaselineWeek,BaselineFVC,Age,Sex,SmokingStatus,TargetWeek
0,0.011905,0.241456,0.769231,1,1,0.030303
1,0.011905,0.241456,0.769231,1,1,0.045455
2,0.011905,0.241456,0.769231,1,1,0.060606
3,0.011905,0.241456,0.769231,1,1,0.075758
4,0.011905,0.241456,0.769231,1,1,0.121212


TRAIN XGBOOST MODEL

In [101]:
from xgboost import XGBRegressor

In [102]:
mod = XGBRegressor(learning_rate=0.1, n_estimators=100)
mod.fit(features,target)



XGBRegressor()

LOAD TEST DATA

In [103]:
test_data=pd.read_csv("/content/pulmonary/pulmonary/validation.txt")

PREPROCESS TEST DATA

In [104]:
#GENERATE NEW COLUMNS LIKE BaslineWeek,BaselineFVC,TargetWeek,TargetFVC
test_new = []
for p in test_data['Patient'].unique():
    patientData = test_data[test_data['Patient'] == p]
    firstMeasure = list(patientData.iloc[0, :].values)
  
    for i, week in enumerate(patientData['Weeks'].iloc[1:]):
        fvc = patientData.iloc[i, 2]
        trainDataPoint = firstMeasure + [week, fvc]
        test_new.append(trainDataPoint)
    
        
test_new = pd.DataFrame(test_new)

test_new.columns = ['PatientID', 'BaselineWeek', 'BaselineFVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['TargetWeek', 'TargetFVC']

#Dropping percent column since it was increasing the loss
test_new.drop(columns = ['first_Percent'], inplace = True)

test_new.head()

Unnamed: 0,PatientID,BaselineWeek,BaselineFVC,Age,Sex,SmokingStatus,TargetWeek,TargetFVC
0,ID00076637202199015035026,-4,2298,51,Male,Never smoked,3,2298
1,ID00076637202199015035026,-4,2298,51,Male,Never smoked,5,2576
2,ID00076637202199015035026,-4,2298,51,Male,Never smoked,6,2182
3,ID00076637202199015035026,-4,2298,51,Male,Never smoked,9,2374
4,ID00076637202199015035026,-4,2298,51,Male,Never smoked,15,2370


In [105]:
#ENCODING Sex and SmokingStatus
test_new["Sex"]=test_new["Sex"].astype("category").cat.codes
test_new["SmokingStatus"]=test_new["SmokingStatus"].astype("category").cat.codes

MINMAX NORMALISATION

In [106]:
test_new["TargetWeek"]=(test_new['TargetWeek'] - test_new['TargetWeek'].min() ) / ( test_new['TargetWeek'].max() - test_new['TargetWeek'].min() )
test_new["Age"]=(test_new['Age'] - test_new['Age'].min() ) / ( test_new['Age'].max() - test_new['Age'].min() )
test_new["BaselineWeek"]=(test_new['BaselineWeek'] - test_new['BaselineWeek'].min() ) / ( test_new['BaselineWeek'].max() - test_new['BaselineWeek'].min() )
test_new["BaselineFVC"]=(test_new['BaselineFVC'] - test_new['BaselineFVC'].min() ) / ( test_new['BaselineFVC'].max() - test_new['BaselineFVC'].min() )

In [107]:
target_test=test_new["TargetFVC"].values
features_test=test_new[[ 'BaselineWeek', 'BaselineFVC', 'Age', 'Sex','SmokingStatus','TargetWeek']]

PREDICT TEST DATA

In [111]:
output=mod.predict(features_test)

EVALUATION

In [112]:
## evaluation metric function
def laplace_log_likelihood(actual_fvc, predicted_fvc, confidence, return_values = False):
    """
    Calculates the modified Laplace Log Likelihood score for this competition.
    """
    sd_clipped = np.maximum(confidence, 70)
    delta = np.minimum(np.abs(actual_fvc - predicted_fvc), 1000)
    metric = - np.sqrt(2) * delta / sd_clipped - np.log(np.sqrt(2) * sd_clipped)

    if return_values:
        return metric
    else:
        return np.mean(metric)


## default benchmark
laplace_log_likelihood(target_test, output,300)



-6.561985023832604

In [71]:
import pickle
file=open('xgb_pul.pkl','wb')
pickle.dump(mod,file)