In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Turn off pandas' chained-assignment (SettingWithCopy) check: later cells add a
# column to filtered subsets of a DataFrame, which would otherwise trigger it.
pd.options.mode.chained_assignment = None

# Patient height is estimated from FVC and age by inverting the vital-capacity formula from https://en.wikipedia.org/wiki/Vital_capacity

In [None]:
# Column names used throughout: the per-row key and the prediction target.
ID, TARGET = 'Patient_Week', 'FVC'

In [None]:
# Load the training table and give every row a "<Patient>_<Weeks>" key.
train = pd.read_csv("/kaggle/input/osic-pulmonary-fibrosis-progression/train.csv")
train[ID] = train['Patient'].astype(str).str.cat(train['Weeks'].astype(str), sep='_')
print(train.shape)


In [None]:
# Preview the first rows of the freshly loaded training table.
train.head()

In [None]:
# Male patients. Take an explicit copy so that adding a column below never
# writes into (or warns about) a slice of `train`.
train_Male = train.loc[train['Sex']=='Male'].copy()
# Estimated height: the male vital-capacity relation
# FVC = (27.63 - 0.112 * Age) * height, solved for height.
train_Male['height'] = train_Male['FVC']/(27.63-0.112*train_Male['Age'])


In [None]:
# Female patients. Take an explicit copy so that adding a column below never
# writes into (or warns about) a slice of `train`.
train_Female = train[train['Sex']=='Female'].copy()
# Estimated height: the female vital-capacity relation
# FVC = (21.78 - 0.101 * Age) * height, solved for height.
train_Female['height'] = train_Female['FVC']/(21.78-0.101*train_Female['Age'])

In [None]:
# Preview the female subset, including the new 'height' column.
train_Female.head()

In [None]:
# Stack the male and female subsets back into one training table (males first).
frames = [train_Male, train_Female]
train = pd.concat(frames, axis=0)

In [None]:
# Preview the recombined training table (now carrying 'height').
train.head()

In [None]:
# construct train input
from tqdm.notebook import tqdm

# For every patient, pair each measurement (the "predict" row) with each of the
# patient's visits taken as a baseline (the "base" row). Columns present on
# both sides of the merge other than 'Patient' (here 'height') get pandas'
# default suffixes: height_x is the predict row, height_y the baseline row.
rename_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'}
drop_cols = ['Age', 'Sex', 'SmokingStatus']

# Collect the per-baseline frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies everything each iteration.
pair_frames = []
gb = train.groupby('Patient')
tk0 = tqdm(gb, total=len(gb))
for _, usr_df in tk0:
    # Prediction-side view of this patient's rows (identical for every baseline).
    predict_side = usr_df.drop(columns=drop_cols).rename(columns={'Weeks': 'predict_Week'})
    for week, tmp in usr_df.groupby('Weeks'):
        # Baseline-side view: the row(s) recorded at this week.
        base_side = tmp.drop(columns='Patient_Week').rename(columns=rename_cols)
        _usr_output = predict_side.merge(base_side, on='Patient')
        _usr_output['Week_passed'] = _usr_output['predict_Week'] - _usr_output['base_Week']
        pair_frames.append(_usr_output)

output = pd.concat(pair_frames)

# A row paired with its own week carries no elapsed time; drop those pairs.
train = output[output['Week_passed']!=0].reset_index(drop=True)
print(train.shape)
train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

cat_features = ['Sex','SmokingStatus']
encoder = LabelEncoder()

# Integer-encode each categorical column. The single encoder is re-fitted on
# every column in turn, so afterwards it only holds the last column's classes.
encoded = train[cat_features].apply(lambda column: encoder.fit_transform(column))

In [None]:
# Modelling table: numeric columns from the paired training rows plus the
# integer-encoded categoricals (joined on the shared row index).
data2 = train.loc[:, ['FVC','Percent','Week_passed','base_Age','height_y']].join(encoded)
data2.head()

In [None]:
# Feature matrix and target for the FVC model.
X = data2.loc[:, ['SmokingStatus','base_Age','Sex','Week_passed','Percent','height_y']]
y = data2.loc[:, 'FVC']

In [None]:
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
%matplotlib inline

In [None]:
# Hold out 30% of the rows for evaluation (fixed seed for repeatability).
# NOTE(review): rows are split at random, so rows built from the same patient
# can land in both parts — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Create a random-forest regressor for FVC (seeded for repeatability)
regr=RandomForestRegressor(random_state=0)
# Fit it on the training split
regr.fit(X_train,y_train)

In [None]:
# Predict FVC for the held-out rows.
y_pred=regr.predict(X_test)

In [None]:
# Side-by-side table of true and predicted FVC on the held-out rows.
df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
df

In [None]:
# Bar chart of actual vs. predicted FVC for the first 25 held-out rows.
df1 = df.head(25)
ax = df1.plot.bar(figsize=(16,10))
ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# Load the competition test table.
test = pd.read_csv("/kaggle/input/osic-pulmonary-fibrosis-progression/test.csv")

In [None]:
# Give every test row the same "<Patient>_<Weeks>" key used for the training rows.
test['Patient_Week'] = test['Patient'].astype(str).str.cat(test['Weeks'].astype(str), sep="_")
test.head()

In [None]:
# Male test patients. Take an explicit copy so that adding a column below never
# writes into (or warns about) a slice of `test`.
test_Male = test[test['Sex']=='Male'].copy()
# Estimated height: the male vital-capacity relation
# FVC = (27.63 - 0.112 * Age) * height, solved for height.
test_Male['height'] = test_Male['FVC']/(27.63-0.112*test_Male['Age'])

In [None]:
# Female test patients. The filter must select 'Female': selecting 'Male' here
# would drop every female patient and duplicate the male rows with the wrong
# (female) height formula. An explicit copy keeps the column assignment off a
# slice of `test`.
test_Female = test[test['Sex']=='Female'].copy()
# Estimated height: the female vital-capacity relation
# FVC = (21.78 - 0.101 * Age) * height, solved for height.
test_Female['height'] = test_Female['FVC']/(21.78-0.101*test_Female['Age'])

In [None]:
# Stack the two sex-specific test subsets back into one table.
frames = [test_Male, test_Female]
test = pd.concat(frames, axis=0)

In [None]:
# Preview the recombined test table (now carrying 'height').
test.head()

In [None]:
# Rename test columns to the feature names used by the training matrix.
# NOTE(review): 'Weeks' becomes 'Week_passed' here, but that column is dropped
# again before prediction (see test4) — presumably a placeholder; confirm.
rename_cols = {'Weeks': 'Week_passed', 'Age': 'base_Age','height': 'height_y'}
test2 = test.rename(columns=rename_cols)

In [None]:
# Preview the renamed test table.
test2.head()

In [None]:
# Encode the test categoricals with the SAME label -> integer mapping the model
# was trained on: fit each column's encoder on the training column and only
# transform the test column. Re-fitting on the test data would renumber the
# categories whenever the test set contains a different set of values.
# A test value never seen in training raises a ValueError in transform().
encoded = test2[cat_features].apply(
    lambda column: LabelEncoder().fit(train[column.name]).transform(column)
)
test3 = test2[['Patient','Percent','Week_passed','base_Age','height_y']].join(encoded)

In [None]:
# Load the sample submission, which lists the Patient_Week rows to predict.
submission = pd.read_csv("/kaggle/input/osic-pulmonary-fibrosis-progression/sample_submission.csv")

In [None]:
# Split "<Patient>_<Weeks>" on the LAST underscore only, so the result always
# has exactly two parts, then store the week as an integer: it is later used
# as a numeric model feature and must not stay a string.
submission[['Patient','Weeks']] = submission['Patient_Week'].str.rsplit("_", n=1, expand=True)
submission['Weeks'] = submission['Weeks'].astype(int)

In [None]:
# Preview the submission table with the split-out Patient and Weeks columns.
submission.head()

In [None]:
# Drop the placeholder prediction columns (they are re-created from the models
# below) and the test table's week column. `columns=` is used instead of a
# positional axis argument, which pandas 2.0 no longer accepts.
submission = submission.drop(columns=['FVC', 'Confidence'])
test4 = test3.drop(columns='Week_passed')

In [None]:
# Check the (rows, columns) size of the test feature table.
test4.shape

In [None]:
# Attach the test features to every submission row of the same patient
# (left join, so every submission row is kept).
submission2 = submission.merge(test4, how='left', on='Patient')
submission2.head(100)

In [None]:
# Build the prediction matrix with exactly the columns `regr` was fitted on.
# The model was trained on 'Week_passed' = predicted week minus baseline week,
# so (a) the column must carry that name (newer scikit-learn versions reject
# mismatched feature names) and (b) its value must be relative to the week
# recorded for the patient in `test`, not the absolute week.
base_week = test.drop_duplicates('Patient').set_index('Patient')['Weeks']
X2 = (
    submission2[['SmokingStatus','base_Age','Sex','Weeks','Percent','height_y']]
    .rename(columns={'Weeks': 'Week_passed'})
)
# astype(int) also covers the case where 'Weeks' is still a string column.
X2['Week_passed'] = X2['Week_passed'].astype(int) - submission2['Patient'].map(base_week)
submission2['FVC'] = regr.predict(X2)

In [None]:
# Preview the submission rows with the predicted FVC attached.
submission2.head()

In [None]:
# Feature matrix and target for the second model, which predicts 'Percent'.
X_Per = data2.loc[:, ['SmokingStatus','base_Age','Sex','Week_passed','height_y']]
y_Per = data2.loc[:, 'Percent']

In [None]:
# Hold out 30% of the rows for evaluating the Percent model (fixed seed).
# NOTE(review): rows are split at random, so rows built from the same patient
# can land in both parts — confirm this is acceptable for the evaluation.
X_trainper, X_testper, y_trainper, y_testper = train_test_split(X_Per, y_Per, test_size=0.3, random_state=0)

In [None]:
# Create a second random-forest regressor, this one for 'Percent' (seeded)
regr2=RandomForestRegressor(random_state=0)
# Fit it on the training split
regr2.fit(X_trainper,y_trainper)

In [None]:
# Predict 'Percent' for the held-out rows.
y_predper=regr2.predict(X_testper)

In [None]:
# Side-by-side table of true and predicted Percent on the held-out rows.
df2 = pd.DataFrame(dict(Actual=y_testper, Predicted=y_predper))
df2

In [None]:
# Build the prediction matrix with exactly the columns `regr2` was fitted on.
# The model was trained on 'Week_passed' = predicted week minus baseline week,
# so the column must carry that name (newer scikit-learn versions reject
# mismatched feature names) and its value must be relative to the week
# recorded for the patient in `test`, not the absolute week.
base_week = test.drop_duplicates('Patient').set_index('Patient')['Weeks']
X3 = (
    submission2[['SmokingStatus','base_Age','Sex','Weeks','height_y']]
    .rename(columns={'Weeks': 'Week_passed'})
)
# astype(int) also covers the case where 'Weeks' is still a string column.
X3['Week_passed'] = X3['Week_passed'].astype(int) - submission2['Patient'].map(base_week)
# NOTE(review): regr2 was trained to predict 'Percent'; its output is stored
# as 'Confidence' here — confirm this is the intended confidence estimate.
submission2['Confidence'] = regr2.predict(X3)

In [None]:
# Keep only the three columns that go into the submission file.
submission3 = submission2[['Patient_Week','FVC','Confidence']]

In [None]:
# Preview the trimmed submission table.
submission3.head()

In [None]:
# Row/column count before de-duplication.
submission3.shape

In [None]:
# Remove rows that are exact duplicates across all three columns.
submission4 = submission3.drop_duplicates()

In [None]:
# Preview the de-duplicated submission table.
submission4.head()

In [None]:
# Row/column count after removing exact duplicates.
submission4.shape

In [None]:
# Convert both prediction columns to integers (fractional parts are discarded).
submission4 = submission4.astype({'FVC': int, 'Confidence': int})

In [None]:
# Keep a single row per Patient_Week; when several remain, the last one wins.
submission5 = submission4.drop_duplicates(subset='Patient_Week',keep='last')

In [None]:
# Final row/column count of the submission.
submission5.shape

In [None]:
# Write the final submission file without the DataFrame index column.
submission5.to_csv("/kaggle/working/submission.csv",index=False)