## Liver Disease Prediction
Create predictive models to predict the stage of liver Cirrhosis using 18 clinical features. Cirrhosis damages the liver from a variety of causes leading to scarring and liver failure.

Hepatitis and chronic alcohol abuse are frequent causes of the disease. Liver damage caused by cirrhosis can't be undone, but further damage can be limited. Treatments focus on the underlying cause. In advanced cases, a liver transplant may be required. Predicting the stage of cirrhosis and beginning the treatment before it's too late can prevent the fatal consequences of the disease.

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np

In [2]:
df=pd.read_csv('train_dataset.csv')

In [3]:
df_test=pd.read_csv('test_dataset.csv')

In [4]:
df.head(4)

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7135,1654,CL,D-penicillamine,19581,F,N,N,Y,N,0.3,279.0,2.96,84.0,1500.8,99.43,109.0,293.0,10.2,4.0
1,7326,41,C,D-penicillamine,22880,F,,N,,N,0.3,,2.96,,1835.4,26.35,131.0,308.0,10.8,1.0
2,7254,297,D,,27957,F,N,N,,N,0.3,328.0,2.64,4.0,,,116.0,194.0,10.3,3.0
3,3135,1872,C,D-penicillamine,21111,F,,Y,Y,N,0.3,302.0,2.02,49.0,,26.35,,,10.5,4.0


In [5]:
df.drop(columns=['ID','N_Days','Alk_Phos','Platelets','Prothrombin'],inplace=True)

In [6]:
df_test.drop(columns=['ID','N_Days','Alk_Phos','Platelets','Prothrombin'],inplace=True)

In [7]:
import math
df['Cholesterol']=df['Cholesterol'].map(lambda x:x if math.isnan(x) else(0 if x<=250 else 1))
df['SGOT']=df['SGOT'].map(lambda x:x if math.isnan(x) else(0 if x<=45 else 1))
df['Tryglicerides']=df['Tryglicerides'].map(lambda x:x if math.isnan(x) else(0 if x<150 else 1))
df['Copper']=df['Copper'].map(lambda x:x if math.isnan(x) else(0 if x<=30 else 1)) 
df['Albumin']=df['Albumin'].map(lambda x:x if math.isnan(x) else(0 if x>=3.4 and x<5.4 else 1))
df['Bilirubin']=df['Bilirubin'].map(lambda x:x if math.isnan(x) else(0 if x>=0.3 and x<=1.0 else (0 if x<0.3 else 1)))

In [8]:
df_test['Cholesterol']=df['Cholesterol'].map(lambda x:x if math.isnan(x) else(0 if x<=250 else 1))
df_test['SGOT']=df['SGOT'].map(lambda x:x if math.isnan(x) else(0 if x<=45 else 1))
df_test['Tryglicerides']=df['Tryglicerides'].map(lambda x:x if math.isnan(x) else(0 if x<150 else 1))
df_test['Copper']=df['Copper'].map(lambda x:x if math.isnan(x) else(0 if x<=30 else 1)) 
df_test['Albumin']=df['Albumin'].map(lambda x:x if math.isnan(x) else(0 if x>=3.4 and x<5.4 else 1))
df_test['Bilirubin']=df['Bilirubin'].map(lambda x:x if math.isnan(x) else(0 if x>=0.3 and x<=1.0 else (0 if x<0.3 else 1)))

In [9]:
df=pd.get_dummies(df,columns=['Status'])
df=df.drop(columns=['Status_D'])
df_test=pd.get_dummies(df_test,columns=['Status'])
df_test=df_test.drop(columns=['Status_D'])

In [10]:
null_col=list(df.columns[df.isnull().any()])

In [11]:
#encode categorical variables
df_mapped=df.copy()
df_mapped['Age']=df_mapped['Age'].map(lambda x:int(round(x/365)))
df_mapped['Drug']=df_mapped['Drug'].map({'D-penicillamine':1,'Placebo':0})
df_mapped['Sex']=df_mapped['Sex'].map({'M':0,'F':1})
df_mapped['Ascites']=df_mapped['Ascites'].map({'N':0,'Y':1})
df_mapped['Hepatomegaly']=df_mapped['Hepatomegaly'].map({'N':0,'Y':1})
df_mapped['Spiders']=df_mapped['Spiders'].map({'N':0,'Y':1})
df_mapped['Edema']=df_mapped['Edema'].map({'N':0,'S':1,'Y':2})

In [12]:
df_test_mapped=df_test.copy()
df_test_mapped['Age']=df_test_mapped['Age'].map(lambda x:int(round(x/365)))
df_test_mapped['Drug']=df_test_mapped['Drug'].map({'D-penicillamine':1,'Placebo':0})
df_test_mapped['Sex']=df_test_mapped['Sex'].map({'M':1,'F':0})
df_test_mapped['Ascites']=df_test_mapped['Ascites'].map({'N':0,'Y':1})
df_test_mapped['Hepatomegaly']=df_test_mapped['Hepatomegaly'].map({'N':0,'Y':1})
df_test_mapped['Spiders']=df_test_mapped['Spiders'].map({'N':0,'Y':1})
df_test_mapped['Edema']=df_test_mapped['Edema'].map({'N':0,'S':1,'Y':2})

In [13]:
df_test_mapped

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,SGOT,Tryglicerides,Status_C,Status_CL
0,0.0,62,0,0.0,,0.0,0,0,0.0,1,0.0,0.0,0.0,1,0
1,1.0,44,0,0.0,1.0,0.0,0,0,,1,,0.0,0.0,1,0
2,1.0,74,0,0.0,0.0,0.0,0,0,0.0,1,0.0,,0.0,1,0
3,0.0,47,0,0.0,1.0,0.0,0,0,0.0,1,0.0,0.0,,0,0
4,1.0,49,0,,,,0,0,0.0,1,0.0,0.0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,0.0,50,1,,0.0,0.0,1,0,0.0,1,0.0,,0.0,0,0
3196,,33,0,0.0,0.0,1.0,0,0,,1,0.0,,,0,0
3197,1.0,47,0,,1.0,,0,0,0.0,1,,0.0,0.0,0,0
3198,0.0,78,1,,,0.0,2,0,,1,,0.0,0.0,0,0


In [14]:
null_df=df_mapped[df_mapped.isnull().any(axis=1)]

In [15]:
non_null=df_mapped.dropna(how="any")
non_null.isnull().any()
df_mapped.dropna(how='all',inplace=True)

In [16]:
from xgboost import XGBClassifier,XGBRegressor

In [17]:
X=non_null.drop(columns=['Drug','Stage'])

In [18]:
testdata=df_test_mapped[df_test_mapped['Drug'].isnull()]

In [19]:
def xgbimputer():
    for col in null_col:
        X=non_null.drop(columns=[col,'Stage'])
        y=non_null[col]
        X_test=df_mapped[df_mapped[col].isnull()]
        X_test.drop(columns=[col],inplace=True)
        X_test.drop(columns=['Stage'],inplace=True)
        testdata=df_test_mapped[df_test_mapped[col].isnull()]
        testdata.drop(columns=[col],inplace=True)
        xgb=XGBClassifier(use_label_encoder=False,eval_metric='mlogloss')
        xgb.fit(X,y)
        predict=xgb.predict(X_test)
        df_mapped.loc[X_test.index,col]=predict
        predict=xgb.predict(testdata)
        df_test_mapped.loc[testdata.index,col]=predict
        print("Imputed ",col," successfully👍🏻")

In [20]:
xgbimputer()

Imputed  Drug  successfully👍🏻
Imputed  Ascites  successfully👍🏻
Imputed  Hepatomegaly  successfully👍🏻
Imputed  Spiders  successfully👍🏻
Imputed  Cholesterol  successfully👍🏻
Imputed  Copper  successfully👍🏻
Imputed  SGOT  successfully👍🏻
Imputed  Tryglicerides  successfully👍🏻


In [21]:
df_mapped=df_mapped.astype(int)
df_test_mapped=df_test_mapped.astype(int)

In [22]:
df_mapped

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,SGOT,Tryglicerides,Stage,Status_C,Status_CL
0,1,54,1,0,0,1,0,0,1,1,1,1,0,4,0,1
1,1,63,1,0,0,0,0,0,0,1,0,0,0,1,1,0
2,0,77,1,0,0,0,0,0,1,1,0,1,0,3,0,0
3,1,58,1,0,1,1,0,0,1,1,1,0,0,4,1,0
4,1,49,1,0,1,1,0,0,1,1,1,1,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6795,0,57,0,0,0,0,0,0,1,1,1,1,0,1,1,0
6796,0,78,1,1,0,0,0,0,0,1,1,1,1,4,1,0
6797,1,66,1,0,1,0,0,0,0,1,1,0,0,1,1,0
6798,1,54,1,0,1,1,0,0,1,1,0,1,0,2,0,1


In [23]:
X=df_mapped.drop(columns=['Stage'])
y=df_mapped['Stage']

In [24]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [25]:
def hyperparameter_tuning(estimator,param,X,y):
    clf=GridSearchCV(estimator,param_grid=param,return_train_score=True,scoring="f1_weighted",cv=5)
    clf.fit(X,y)
    return clf    

In [26]:
def train_fit_check(clf,X_train_dataset,y_train_dataset):
    y_train_pred=clf.predict(X_train_dataset)
    print('Classification Report:\n',classification_report(y_train_pred,y_train_dataset))
    print('How well the model fit the training dataset:',clf.score(X_train_dataset,y_train_dataset))

In [55]:
#randomforestclassifier
#round1
param={
    "n_estimators":[600],
    "max_depth":[35],
    "min_samples_leaf":[1],
    "max_features":["sqrt"]
}
rfc=hyperparameter_tuning(RandomForestClassifier(),param,X,y)
train_fit_check(rfc,X,y)

Classification Report:
               precision    recall  f1-score   support

           1       0.79      0.91      0.85       403
           2       0.84      0.92      0.88      1371
           3       0.82      0.93      0.87      1173
           4       0.97      0.88      0.93      3853

    accuracy                           0.90      6800
   macro avg       0.86      0.91      0.88      6800
weighted avg       0.91      0.90      0.90      6800

How well the model fit the training dataset: 0.8998854160640809


In [56]:
rfc.best_estimator_

RandomForestClassifier(max_depth=35, max_features='sqrt', n_estimators=600)

In [38]:
predict=pd.DataFrame(rfc.predict(df_test_mapped),columns=['Stage'])

In [32]:
y.value_counts()

4    3506
2    1507
3    1322
1     465
Name: Stage, dtype: int64

In [40]:
predict.value_counts()

Stage
4        2699
2         319
3         139
1          43
dtype: int64

In [39]:
predict.to_csv('predict.csv',index=False)

Stage 1 = Healthy Liver <br>
Stage 2 = Fatty Liver <br> 
Stage 3 = Fibrosis Liver <br>
Stage 4 = Cirrhosis Liver <br>
