In [1]:
import numpy as np
import pandas as pd

# Reading Input

In [2]:
path = '/home/prakhar/Desktop/ml/OSIC-Pulmonary-Fibrosis-Progression/input/'

In [3]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
sub = pd.read_csv(path + 'sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [5]:
test.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


# Preprocessing Input

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
encoder = LabelEncoder()

In [8]:
cat_features = ['Sex', 'SmokingStatus']
encoded = train[cat_features].apply(encoder.fit_transform)
encoded.head()

Unnamed: 0,Sex,SmokingStatus
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [9]:
X = train[['Percent', 'Age', 'Weeks']].join(encoded)
X.head()

Unnamed: 0,Percent,Age,Weeks,Sex,SmokingStatus
0,58.253649,79,-4,1,1
1,55.712129,79,5,1,1
2,51.862104,79,7,1,1
3,53.950679,79,9,1,1
4,52.063412,79,11,1,1


In [10]:
Y = train['FVC']
Y.head()

0    2315
1    2214
2    2061
3    2144
4    2069
Name: FVC, dtype: int64

# Model Building

In [11]:
def score_func(y, y_pred) :
    confidence = np.std(y_pred)
    
    sd_clipped = np.maximum(confidence, 70)
    delta = np.minimum(np.abs(y - y_pred), 1000)
    metric = - np.sqrt(2) * delta / sd_clipped - np.log(np.sqrt(2) * sd_clipped)
    
    return np.mean(metric)

In [12]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

from pprint import pprint

In [13]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [14]:
rf = RandomForestRegressor()
scorer = make_scorer(score_func, greater_is_better=True)

rf_random = RandomizedSearchCV(
    estimator=rf,
    scoring=scorer,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [15]:
rf_random.fit(X, Y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.5min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, scoring=make_scorer(score_func), verbose=2)

In [16]:
print(rf_random.best_estimator_)

RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                      n_estimators=1600)


In [17]:
print(rf_random.best_score_)

-7.427078453764776


In [18]:
model = RandomForestRegressor(n_estimators=1600, max_depth=10, max_features='sqrt', min_samples_leaf=4)
model.fit(X, Y)

RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                      n_estimators=1600)

# Preprocessing test data

In [20]:
sub[['Patient', 'Weeks']] = sub.Patient_Week.str.split("_", expand=True)

In [21]:
encoded = test[cat_features].apply(encoder.fit_transform)
test.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [22]:
test2 = test[['Patient', 'Percent', 'Age']].join(encoded)
test2.head()

Unnamed: 0,Patient,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,70.186855,73,0,0
1,ID00421637202311550012437,82.045291,68,0,0
2,ID00422637202311677017371,76.672493,73,0,0
3,ID00423637202312137826377,79.258903,72,0,0
4,ID00426637202313170790466,71.824968,73,0,1


In [23]:
sub = sub.drop('FVC', 1)
sub = sub.drop('Confidence', 1)
# test.drop('Weeks', 1)
sub = pd.merge(sub, test2, on='Patient', how='left')

In [24]:
sub.head()

Unnamed: 0,Patient_Week,Patient,Weeks,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,ID00419637202311204720264,-12,70.186855,73,0,0
1,ID00421637202311550012437_-12,ID00421637202311550012437,-12,82.045291,68,0,0
2,ID00422637202311677017371_-12,ID00422637202311677017371,-12,76.672493,73,0,0
3,ID00423637202312137826377_-12,ID00423637202312137826377,-12,79.258903,72,0,0
4,ID00426637202313170790466_-12,ID00426637202313170790466,-12,71.824968,73,0,1


In [25]:
X2 = sub[['Percent', 'Age', 'Weeks', 'Sex', 'SmokingStatus']]
sub['FVC'] = model.predict(X2)

In [26]:
sub.head()

Unnamed: 0,Patient_Week,Patient,Weeks,Percent,Age,Sex,SmokingStatus,FVC
0,ID00419637202311204720264_-12,ID00419637202311204720264,-12,70.186855,73,0,0,1884.515071
1,ID00421637202311550012437_-12,ID00421637202311550012437,-12,82.045291,68,0,0,2135.115408
2,ID00422637202311677017371_-12,ID00422637202311677017371,-12,76.672493,73,0,0,1835.271672
3,ID00423637202312137826377_-12,ID00423637202312137826377,-12,79.258903,72,0,0,1991.774112
4,ID00426637202313170790466_-12,ID00426637202313170790466,-12,71.824968,73,0,1,1700.490736


In [27]:
sub['FVC_Group'] = sub.groupby(['Weeks','SmokingStatus','Sex','Age'])['FVC'].transform('mean')

In [28]:
sub['Confidence'] = 100 * sub['FVC'] / sub['FVC_Group']
sub.head(100)

Unnamed: 0,Patient_Week,Patient,Weeks,Percent,Age,Sex,SmokingStatus,FVC,FVC_Group,Confidence
0,ID00419637202311204720264_-12,ID00419637202311204720264,-12,70.186855,73,0,0,1884.515071,1859.893371,101.323823
1,ID00421637202311550012437_-12,ID00421637202311550012437,-12,82.045291,68,0,0,2135.115408,2135.115408,100.000000
2,ID00422637202311677017371_-12,ID00422637202311677017371,-12,76.672493,73,0,0,1835.271672,1859.893371,98.676177
3,ID00423637202312137826377_-12,ID00423637202312137826377,-12,79.258903,72,0,0,1991.774112,1991.774112,100.000000
4,ID00426637202313170790466_-12,ID00426637202313170790466,-12,71.824968,73,0,1,1700.490736,1700.490736,100.000000
...,...,...,...,...,...,...,...,...,...,...
95,ID00419637202311204720264_7,ID00419637202311204720264,7,70.186855,73,0,0,1880.031050,1853.672089,101.421986
96,ID00421637202311550012437_7,ID00421637202311550012437,7,82.045291,68,0,0,2116.904769,2116.904769,100.000000
97,ID00422637202311677017371_7,ID00422637202311677017371,7,76.672493,73,0,0,1827.313128,1853.672089,98.578014
98,ID00423637202312137826377_7,ID00423637202312137826377,7,79.258903,72,0,0,1976.753132,1976.753132,100.000000


In [29]:
submission = sub[['Patient_Week', 'FVC', 'Confidence']]
submission['FVC'] = submission['FVC'].astype(int)
submission['Confidence'] = submission['Confidence'].astype(int)
submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,1884,101
1,ID00421637202311550012437_-12,2135,100
2,ID00422637202311677017371_-12,1835,98
3,ID00423637202312137826377_-12,1991,100
4,ID00426637202313170790466_-12,1700,100
