# Liver Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the liver-patient records from the CSV in the working directory
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="liver dataset.csv")
df.head(n=5)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(583, 11)

In [4]:
# Count missing values per column (isna is the canonical name behind isnull).
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [5]:
# Forward-fill the missing entries: each NaN takes the value from the
# nearest preceding row. `DataFrame.ffill()` replaces the deprecated
# `fillna(method="ffill")` spelling and produces the same result.
# NOTE(review): rows look like independent patients, so the fill borrows a
# value from an unrelated record — consider median imputation; confirm.
df = df.ffill()

In [6]:
# Encode Gender as a number so it can be scaled and fed to the models:
# Male -> 1, Female -> 0.
gender_map = {'Male': 1, 'Female': 0}
# Only map while the column still holds the raw labels. Re-running this
# cell on the already-encoded column would otherwise turn every value into
# NaN, because 0 and 1 are not keys of gender_map.
if not pd.api.types.is_numeric_dtype(df["Gender"]):
    df["Gender"] = df["Gender"].map(gender_map)
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [17]:
# Separate the target column ("Dataset") from the predictor columns.
Y = df["Dataset"]
X = df.drop(columns="Dataset")

In [18]:
# Standardise every feature column: learn the per-column statistics first,
# then apply them. The fitted scaler stays available as `scaler`.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# cells below run, so held-out folds influence the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

### Logistic Regression

In [21]:
# 10-fold cross-validation of a default logistic regression; show the
# average of the ten fold scores.
lr_score = cross_val_score(estimator=LogisticRegression(), X=X, y=Y, cv=10)
np.mean(lr_score)

0.7204850964348334

### Random Forest

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Compare forest sizes with 10-fold cross-validation.
# random_state pins the bootstrap sampling and feature selection inside the
# forest, so the scores are the same on every Restart & Run All; without it
# the ranking of near-tied settings changes between runs.
rf_clf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    {'n_estimators': [10, 20, 30, 35, 40, 45]},
    cv=10,
    return_train_score=False,
)
rf_clf.fit(X, Y)
rf_clf.cv_results_

{'mean_fit_time': array([0.02500002, 0.04640045, 0.06520152, 0.08080039, 0.08640079,
        0.09740098]),
 'std_fit_time': array([0.005     , 0.00454312, 0.00612901, 0.00477093, 0.00535216,
        0.00387816]),
 'mean_score_time': array([0.00200002, 0.00160003, 0.00510008, 0.00239999, 0.00549994,
        0.00350006]),
 'std_score_time': array([0.00400004, 0.00303975, 0.0042532 , 0.00397994, 0.00471692,
        0.00450004]),
 'param_n_estimators': masked_array(data=[10, 20, 30, 35, 40, 45],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 10},
  {'n_estimators': 20},
  {'n_estimators': 30},
  {'n_estimators': 35},
  {'n_estimators': 40},
  {'n_estimators': 45}],
 'split0_test_score': array([0.71186441, 0.69491525, 0.74576271, 0.61016949, 0.6779661 ,
        0.72881356]),
 'split1_test_score': array([0.71186441, 0.69491525, 0.6440678 , 0.6779661 , 0.71186441,
        0.6779661 ]),
 'split2_te

In [23]:
# Tabulate the random-forest search results, one row per candidate setting.
rf_result_df = pd.DataFrame(data=rf_clf.cv_results_)
rf_result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.025,0.005,0.002,0.004,10,{'n_estimators': 10},0.711864,0.711864,0.694915,0.672414,0.62069,0.724138,0.724138,0.517241,0.689655,0.758621,0.682554,0.065134,6
1,0.0464,0.004543,0.0016,0.00304,20,{'n_estimators': 20},0.694915,0.694915,0.661017,0.775862,0.775862,0.655172,0.637931,0.62069,0.655172,0.775862,0.69474,0.057203,3
2,0.065202,0.006129,0.0051,0.004253,30,{'n_estimators': 30},0.745763,0.644068,0.728814,0.810345,0.741379,0.689655,0.706897,0.534483,0.706897,0.706897,0.70152,0.068966,2
3,0.0808,0.004771,0.0024,0.00398,35,{'n_estimators': 35},0.610169,0.677966,0.728814,0.706897,0.724138,0.672414,0.655172,0.568966,0.706897,0.793103,0.684454,0.060212,5
4,0.086401,0.005352,0.0055,0.004717,40,{'n_estimators': 40},0.677966,0.711864,0.745763,0.758621,0.655172,0.655172,0.689655,0.603448,0.655172,0.724138,0.687697,0.045479,4
5,0.097401,0.003878,0.0035,0.0045,45,{'n_estimators': 45},0.728814,0.677966,0.661017,0.758621,0.741379,0.706897,0.672414,0.637931,0.689655,0.741379,0.701607,0.037998,1


In [24]:
# Keep just the forest size and its mean cross-validated score.
rf_result_df.loc[:, ['param_n_estimators', 'mean_test_score']]

Unnamed: 0,param_n_estimators,mean_test_score
0,10,0.682554
1,20,0.69474
2,30,0.70152
3,35,0.684454
4,40,0.687697
5,45,0.701607


### SVM

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Search the SVM's C value and kernel type, scoring each of the six
# combinations with 10-fold cross-validation.
svm_param_grid = {'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}
svm_clf = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_param_grid,
    cv=10,
    return_train_score=False,
)
svm_clf.fit(X, Y)
svm_clf.cv_results_

{'mean_fit_time': array([0.01170063, 0.00610032, 0.01000001, 0.01500003, 0.01100006,
        0.021     ]),
 'std_fit_time': array([2.60973378e-03, 1.51333507e-03, 7.15255737e-08, 4.99999523e-03,
        2.99997330e-03, 3.00006867e-03]),
 'mean_score_time': array([0.0016001 , 0.00070009, 0.002     , 0.        , 0.        ,
        0.00200002]),
 'std_score_time': array([0.0006633 , 0.00045832, 0.004     , 0.        , 0.        ,
        0.00400004]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

In [26]:
# Tabulate the SVM search results, one row per (C, kernel) combination.
svm_result_df = pd.DataFrame(data=svm_clf.cv_results_)
svm_result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011701,0.002609734,0.0016,0.000663,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.711864,0.694915,0.711864,0.724138,0.724138,0.724138,0.706897,0.655172,0.706897,0.706897,0.706692,0.01939,4
1,0.0061,0.001513335,0.0007,0.000458,1,linear,"{'C': 1, 'kernel': 'linear'}",0.711864,0.711864,0.711864,0.724138,0.724138,0.724138,0.706897,0.706897,0.706897,0.706897,0.713559,0.007224,1
2,0.01,7.152557e-08,0.002,0.004,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.677966,0.677966,0.644068,0.741379,0.706897,0.689655,0.706897,0.637931,0.706897,0.758621,0.694828,0.03617,5
3,0.015,0.004999995,0.0,0.0,10,linear,"{'C': 10, 'kernel': 'linear'}",0.711864,0.711864,0.711864,0.724138,0.724138,0.724138,0.706897,0.706897,0.706897,0.706897,0.713559,0.007224,1
4,0.011,0.002999973,0.0,0.0,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.677966,0.694915,0.610169,0.741379,0.706897,0.655172,0.62069,0.603448,0.706897,0.741379,0.675891,0.048898,6
5,0.021,0.003000069,0.002,0.004,20,linear,"{'C': 20, 'kernel': 'linear'}",0.711864,0.711864,0.711864,0.724138,0.724138,0.724138,0.706897,0.706897,0.706897,0.706897,0.713559,0.007224,1


In [27]:
# Keep just the searched parameters and the mean cross-validated score.
svm_result_df.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.706692
1,1,linear,0.713559
2,10,rbf,0.694828
3,10,linear,0.713559
4,20,rbf,0.675891
5,20,linear,0.713559


### Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Nothing is tuned for Gaussian naive Bayes: the empty grid makes the
# search evaluate the default model once under 10-fold cross-validation.
nb_clf = GridSearchCV(
    estimator=GaussianNB(),
    param_grid={},
    cv=10,
    return_train_score=False,
)
nb_clf.fit(X, Y)
nb_clf.cv_results_

{'mean_fit_time': array([0.00900002]),
 'std_fit_time': array([0.02071237]),
 'mean_score_time': array([0.]),
 'std_score_time': array([0.]),
 'params': [{}],
 'split0_test_score': array([0.59322034]),
 'split1_test_score': array([0.57627119]),
 'split2_test_score': array([0.57627119]),
 'split3_test_score': array([0.53448276]),
 'split4_test_score': array([0.48275862]),
 'split5_test_score': array([0.39655172]),
 'split6_test_score': array([0.48275862]),
 'split7_test_score': array([0.51724138]),
 'split8_test_score': array([0.65517241]),
 'split9_test_score': array([0.75862069]),
 'mean_test_score': array([0.55733489]),
 'std_test_score': array([0.09549726]),
 'rank_test_score': array([1])}

In [29]:
# Tabulate the naive-Bayes result and show its mean cross-validated score.
nb_result_df = pd.DataFrame(data=nb_clf.cv_results_)
nb_result_df.loc[:, ['mean_test_score']]

Unnamed: 0,mean_test_score
0,0.557335


### Save the model

In [30]:
# Train the final logistic regression on the full standardised feature
# matrix; fit() returns the estimator itself, so it can be chained.
lr_clf = LogisticRegression().fit(X, Y)
lr_clf

LogisticRegression()

In [31]:
import pickle

# Serialise the fitted logistic-regression model to liver.pkl. The `with`
# block closes the file handle (flushing the bytes to disk) as soon as the
# dump finishes, instead of leaving it open until garbage collection.
# NOTE(review): lr_clf was fitted on the standardised X, so whoever loads
# this file must apply the same `scaler` to new inputs — it is not saved here.
with open('liver.pkl', 'wb') as model_file:
    pickle.dump(lr_clf, model_file)