# Importing Data And Libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_excel('Liver Patient.xlsx',header = None)

In [3]:
data.columns = ['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase',
                    'Alamine_Aminotransferase', 'Aspartate_Aminotransferase', 'Total_Protiens',
                    'Albumin', 'Albumin_and_Globulin_Ratio', 'Class']

# Dealing With Null Values

In [5]:
data.isna().sum()

Age                            0
Gender                        20
Total_Bilirubin                0
Direct_Bilirubin               0
Alkaline_Phosphotase           0
Alamine_Aminotransferase       0
Aspartate_Aminotransferase     0
Total_Protiens                15
Albumin                        0
Albumin_and_Globulin_Ratio     4
Class                          0
dtype: int64

In [6]:
data['Gender'].fillna(data['Gender'].mode()[0],inplace=True)
data['Total_Protiens'].fillna(data['Total_Protiens'].mean(),inplace=True)
data['Albumin_and_Globulin_Ratio'].fillna(data['Albumin_and_Globulin_Ratio'].mean(),inplace=True)

# Splitting Dataset

In [7]:
X=data.drop('Class',axis=1) 
Y=data.Class
X=pd.get_dummies(data=X, drop_first=True)

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

# Applying Models
## KNN Model

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_knn=KNeighborsClassifier(n_neighbors=2)
clf_knn.fit(x_train,y_train)
knn_pred=clf_knn.predict(x_test)

cfm_knn=confusion_matrix(y_test,knn_pred)
print("knn test actual vs predicted cfm:-")
print(cfm_knn)
Accuracy_knn_train = clf_knn.score(x_train,y_train)
print("knn train score:-",Accuracy_knn_train)
Accuracy_knn_test = accuracy_score(y_test,knn_pred)
print("knn test score:-",Accuracy_knn_test)
cv_knn = cross_val_score(clf_knn, x_train, y_train, cv=5)
print("knn cross val score:-",round(cv_knn.mean(), 2) * 100)
print(classification_report(y_test,knn_pred))
knn_pred[0:20]

knn test actual vs predicted cfm:-
[[92  8]
 [40  6]]
knn train score:- 0.8283752860411899
knn test score:- 0.6712328767123288
knn cross val score:- 72.0
              precision    recall  f1-score   support

          No       0.70      0.92      0.79       100
         Yes       0.43      0.13      0.20        46

    accuracy                           0.67       146
   macro avg       0.56      0.53      0.50       146
weighted avg       0.61      0.67      0.61       146



array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No'],
      dtype=object)

In [10]:
from sklearn.model_selection import GridSearchCV

parameters=[{'n_neighbors':[1,2,3,4,5,6]}]
grid_search= GridSearchCV(estimator= clf_knn,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_knn=grid_search.best_score_
best_parameters_knn=grid_search.best_params_
print(best_accuracy_knn)
print(best_parameters_knn)

0.7185997910135842
{'n_neighbors': 2}


## XGB Model

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_xgb=XGBClassifier(learning_rate=0.5,random_state=0)
clf_xgb.fit(x_train,y_train)
xgb_pred=clf_xgb.predict(x_test)

cfm_xgb=confusion_matrix(y_test,xgb_pred)
print("xgb test actual vs predicted cfm:-")
print(cfm_xgb)
Accuracy_xgb_train = clf_xgb.score(x_train,y_train)
print("xgb train score:-",Accuracy_xgb_train)
Accuracy_xgb_test = accuracy_score(y_test,xgb_pred)
print("xgb test score:-",Accuracy_xgb_test)
cv_xgb = cross_val_score(clf_xgb, x_train, y_train, cv=5)
print("xgb cross val score:-",round(cv_xgb.mean(), 2) * 100)
print(classification_report(y_test,xgb_pred))
xgb_pred[0:20]

xgb test actual vs predicted cfm:-
[[89 11]
 [30 16]]
xgb train score:- 1.0
xgb test score:- 0.7191780821917808
xgb cross val score:- 72.0
              precision    recall  f1-score   support

          No       0.75      0.89      0.81       100
         Yes       0.59      0.35      0.44        46

    accuracy                           0.72       146
   macro avg       0.67      0.62      0.63       146
weighted avg       0.70      0.72      0.69       146



array(['No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No'],
      dtype=object)

In [12]:
from sklearn.model_selection import GridSearchCV

parameters=[{'learning_rate' : [0.2,0.5,0.8,0.9],'random_state':[0,1,2,3]}]
grid_search= GridSearchCV(estimator= clf_xgb,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_xgb=grid_search.best_score_
best_parameters_xgb=grid_search.best_params_
print(best_accuracy_xgb)
print(best_parameters_xgb)

0.7162748171368861
{'learning_rate': 0.5, 'random_state': 0}


## Random Forest Model

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_rf=RandomForestClassifier(criterion='entropy',random_state = 0,n_estimators = 30)
clf_rf.fit(x_train,y_train)
rf_pred=clf_rf.predict(x_test)

cfm_rf=confusion_matrix(y_test,rf_pred)
print("rf test actual vs predicted cfm:-")
print(cfm_rf)
Accuracy_rf_train = clf_rf.score(x_train,y_train)
print("rf train score:-",Accuracy_rf_train)
Accuracy_rf_test = accuracy_score(y_test,rf_pred)
print("rf test score:-",Accuracy_rf_test)
cv_rf = cross_val_score(clf_rf, x_train, y_train, cv=5)
print("rf cross val score:-",round(cv_rf.mean(), 2) * 100)
print(classification_report(y_test,rf_pred))
rf_pred[0:20]

rf test actual vs predicted cfm:-
[[86 14]
 [35 11]]
rf train score:- 1.0
rf test score:- 0.6643835616438356
rf cross val score:- 74.0
              precision    recall  f1-score   support

          No       0.71      0.86      0.78       100
         Yes       0.44      0.24      0.31        46

    accuracy                           0.66       146
   macro avg       0.58      0.55      0.54       146
weighted avg       0.63      0.66      0.63       146



array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [14]:
from sklearn.model_selection import GridSearchCV

parameters=[{'n_estimators' : [10,20,30], 'criterion' : ['entropy'],'random_state':[0,1,2]},
           {'n_estimators' : [10,20,30], 'criterion' : ['gini'],'random_state':[0,1,2]}]
grid_search= GridSearchCV(estimator= clf_rf,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_rf=grid_search.best_score_
best_parameters_rf=grid_search.best_params_
print(best_accuracy_rf)
print(best_parameters_rf)

0.7438349007314524
{'criterion': 'entropy', 'n_estimators': 30, 'random_state': 0}


## Logistic Model

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_log=LogisticRegression(random_state=0)
clf_log.fit(x_train,y_train)
log_pred=clf_log.predict(x_test)

cfm_log=confusion_matrix(y_test,log_pred)
print("log test actual vs predicted cfm:-")
print(cfm_log)
Accuracy_log_train = clf_log.score(x_train,y_train)
print("log train score:-",Accuracy_log_train)
Accuracy_log_test = accuracy_score(y_test,log_pred)
print("log test score:-",Accuracy_log_test)
cv_log = cross_val_score(clf_log, x_train, y_train, cv=5)
print("log cross val score:-",round(cv_log.mean(), 2) * 100)
print(classification_report(y_test,log_pred))
log_pred[0:20]

log test actual vs predicted cfm:-
[[93  7]
 [37  9]]
log train score:- 0.7345537757437071
log test score:- 0.6986301369863014
log cross val score:- 73.0
              precision    recall  f1-score   support

          No       0.72      0.93      0.81       100
         Yes       0.56      0.20      0.29        46

    accuracy                           0.70       146
   macro avg       0.64      0.56      0.55       146
weighted avg       0.67      0.70      0.65       146



array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [16]:
from sklearn.model_selection import GridSearchCV

parameters=[{'random_state':[0,1,2,3,4,5,6,7,8,9,10]}]
grid_search= GridSearchCV(estimator= clf_log,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_log=grid_search.best_score_
best_parameters_log=grid_search.best_params_
print(best_accuracy_log)
print(best_parameters_log)

0.7255224660397074
{'random_state': 0}


## SVC Model

In [23]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_svc=SVC(gamma=0.1)
clf_svc.fit(x_train,y_train)
svc_pred=clf_svc.predict(x_test)

cfm_svc=confusion_matrix(y_test,svc_pred)
print("svc test actual vs predicted cfm:-")
print(cfm_svc)
Accuracy_svc_train = clf_svc.score(x_train,y_train)
print("svc train score:-",Accuracy_svc_train)
Accuracy_svc_test = accuracy_score(y_test,svc_pred)
print("svc test score:-",Accuracy_svc_test)
cv_svc = cross_val_score(clf_svc, x_train, y_train, cv=5)
print("svc cross val score:-",round(cv_svc.mean(), 2) * 100)
print(classification_report(y_test,svc_pred))
svc_pred[0:20]

svc test actual vs predicted cfm:-
[[100   0]
 [ 45   1]]
svc train score:- 0.9954233409610984
svc test score:- 0.6917808219178082
svc cross val score:- 72.0
              precision    recall  f1-score   support

          No       0.69      1.00      0.82       100
         Yes       1.00      0.02      0.04        46

    accuracy                           0.69       146
   macro avg       0.84      0.51      0.43       146
weighted avg       0.79      0.69      0.57       146



array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'], dtype=object)

In [18]:
from sklearn.model_selection import GridSearchCV

parameters=[{'gamma':[0.1,0.2,0.3]}]
grid_search= GridSearchCV(estimator= clf_svc,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_svc=grid_search.best_score_
best_parameters_svc=grid_search.best_params_
print(best_accuracy_svc)
print(best_parameters_svc)

0.7231191222570532
{'gamma': 0.1}


## We are getting good accuracy in random forest model ,hence we use random forest for the prediction.

In [26]:
df = pd.DataFrame({'Id':x_test.index,'Actual class':y_test,'rf':rf_pred})
df.to_csv('prediction.csv', index = False)