# Liver Disease prediction

## Introduction
In the following project, we'll work with the Indian Liver Patient Dataset from the UCI Machine learning repository.

We'll instantiate three classifiers to predict whether a patient suffers from a liver disease using all the features present in the dataset.

![liver disease](https://c.ndtvimg.com/2022-11/i55rcs2_liver_625x300_24_November_22.jpg)

In [36]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier as KNN

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier


## Dataset

In [9]:
df_patients = pd.read_csv('indian_liver_patient_preprocessed.csv')
print(df_patients.head())

   Unnamed: 0   Age_std  Total_Bilirubin_std  Direct_Bilirubin_std  \
0           0  1.247403            -0.420320             -0.495414   
1           1  1.062306             1.218936              1.423518   
2           2  1.062306             0.640375              0.926017   
3           3  0.815511            -0.372106             -0.388807   
4           4  1.679294             0.093956              0.179766   

   Alkaline_Phosphotase_std  Alamine_Aminotransferase_std  \
0                 -0.428870                     -0.355832   
1                  1.675083                     -0.093573   
2                  0.816243                     -0.115428   
3                 -0.449416                     -0.366760   
4                 -0.395996                     -0.295731   

   Aspartate_Aminotransferase_std  Total_Protiens_std  Albumin_std  \
0                       -0.319111            0.293722     0.203446   
1                       -0.035962            0.939655     0.077462   
2 

In [10]:
df_patients = df_patients.drop(['Unnamed: 0'], axis=1)

## Train/Test split

In [13]:
X = df_patients.drop(['Liver_disease'], axis=1)
y = df_patients[['Liver_disease']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

## Define classifier models

In [3]:
SEED = 1
# Instantiate lr
lr = LogisticRegression(random_state = SEED)

# Instantiate KNN
knn = KNN(n_neighbors=27)

# Instantiate dr
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)

# List of classifiers
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbors', knn), ('Classification Tree', dt)]

## Evaluate Classifiers

In [17]:
# Iterate over classifier
for clf_name, clf in classifiers:

  # Fit to the training data
  clf.fit(X_train, y_train)

  # Predict test data
  y_pred = clf.predict(X_test)

  # Calculate accuracy
  accuracy = accuracy_score(y_test, y_pred)

  # Evaluate clf's accuracy on the test set
  print('{:s} : {:.3f}'.format(clf_name, accuracy))

Logistic Regression : 0.690
K Nearest Neighbors : 0.698
Classification Tree : 0.672


  y = column_or_1d(y, warn=True)
  return self._fit(X, y)


## Voting classifiers

In [20]:
# Instantiate a VottingClassifier
vc = VotingClassifier(estimators = classifiers)

# Fit to training data
vc.fit(X_train, y_train)

# predict
y_pred = vc.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

print('Voting Classifer: {:.3f}'.format(accuracy))

Voting Classifer: 0.681


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## Bagging classifier

In [26]:
# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, oob_score=True, random_state=1)

In [27]:
# Fit bc on training data
bc.fit(X_train, y_train)

# Predict bc on test data
bc.predict(X_test)

# Evaluate
acc_test = accuracy_score(y_test, y_pred)

# Evaluate OOB accuracy
acc_oob = bc.oob_score_

# Print acc_test and acc_oob
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))

Test set accuracy: 0.681, OOB accuracy: 0.737


  y = column_or_1d(y, warn=True)


## Adaboost classifier

In [33]:
# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)

# Fit to the training data
ada.fit(X_train, y_train)

# Predict on test data
y_pred_proba = ada.predict_proba(X_test)[:,1]

  y = column_or_1d(y, warn=True)


## Evaluate ada


In [35]:
# Evaluate ada classifier
ada_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

print('ROC AUC score: {:.3f}'.format(ada_roc_auc_score))

ROC AUC score: 0.657


## GridSearchCV: Decision Tree hyperparamets

In [43]:
# Define params
params_dt = {'max_depth':[2, 3, 4], 'min_samples_leaf':[0.12, 0.14, 0.16, 0.18]}

# Instantiate grid
grid_dt = GridSearchCV(estimator=dt, param_grid=params_dt, scoring='roc_auc', cv=5, n_jobs=-1)

grid_dt.fit(X_train, y_train)

## GridSearch result

In [44]:
# Extract the best estimator
best_model = grid_dt.best_estimator_

# Predict the test set probabilities of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:,1]

# Compute test_roc_auc
test_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print test_roc_auc
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))

Test set ROC AUC score: 0.696
