#CLASSIFICATION OF LIVER DISEASE : COMPARISON OF MULTIPLE CLASSIFICATION ALGORITHMS

In [None]:
#importing libs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, classification_report, confusion_matrix, cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
#for oversampling
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
 data=pd.read_csv("/content/gdrive/MyDrive/LiverClassification/data.csv", encoding='unicode_escape')
 data.tail()

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
30686,50.0,Male,2.2,1.0,610.0,17.0,28.0,7.3,2.6,0.55,1
30687,55.0,Male,2.9,1.3,482.0,22.0,34.0,7.0,2.4,0.5,1
30688,54.0,Male,6.8,3.0,542.0,116.0,66.0,6.4,3.1,0.9,1
30689,48.0,Female,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,1
30690,30.0,Male,3.1,1.6,253.0,80.0,406.0,6.8,3.9,1.3,1


#DATA EXPLORATION & PREPROCESSING

In [None]:
data.columns=['age','gender','TB','DB','AAP','SGPT','SGOT','TP','ALB','A/G','Label']

data.Label.count

<bound method Series.count of 0        1
1        1
2        1
3        1
4        1
        ..
30686    1
30687    1
30688    1
30689    1
30690    1
Name: Label, Length: 30691, dtype: int64>

In [None]:
data.isnull().sum()

age         2
gender    902
TB        648
DB        561
AAP       796
SGPT      538
SGOT      462
TP        463
ALB       494
A/G       559
Label       0
dtype: int64

In [None]:
data.gender.replace('Female',1,inplace=True)
data.gender.replace('Male',0,inplace=True)
data.Label.replace(2,0,inplace=True)
#null impute
data.age=data.age.fillna(data.age.mean())
data.gender=data.gender.fillna(1.0)
data.TB=data.TB.fillna(data.TB.mean())
data.ALB=data.ALB.fillna(data.ALB.mean())
data.DB=data.DB.fillna(data.DB.median())
data.AAP =data.AAP.fillna(data.AAP.median())
data.SGPT =data.SGPT.fillna(data.SGPT.median())
data.SGOT =data.SGOT.fillna(data.SGOT.median())
data.TP =data.TP.fillna(data.TP.median())
data['A/G'] =data['A/G'].fillna(data['A/G'].median())

In [None]:
#oversampling on basis of label
#nice sampling illustration : https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/resampling.png
X = data.iloc[:,:-1]
Y = data.iloc[:,-1]
Data_rs=RandomOverSampler(sampling_strategy='minority')
X_res, Y_res = Data_rs.fit_resample(X, Y)


In [None]:
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(Y_res)))

Original dataset shape Counter({1: 21917, 0: 8774})
Resampled dataset shape Counter({1: 21917, 0: 21917})


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_res, Y_res, test_size=0.2,random_state=109)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(35067, 10)
(8767, 10)
(35067,)
(8767,)


#APPLYING MACHINE LEARNING MODELS:

#SVM

In [None]:
#generating svc model
clf = svm.SVC(kernel='poly', random_state=100) 
clf.fit(X_train, Y_train)

#Predict
y_pred_svm = clf.predict(X_test)

In [None]:
# Model Accuracy  [Pretty aweful- perhaps because of the outliers]
print("SVC Accuracy:",accuracy_score(Y_test, y_pred_svm))
print('SVC Precision: %.3f' % precision_score(Y_test, y_pred_svm))
print('SVC Recall: %.3f' % recall_score(Y_test, y_pred_svm))
print('SVC F1 Score: %.3f' % f1_score(Y_test, y_pred_svm))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_svm))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_svm))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_svm))

SVC Accuracy: 0.6607733546252994
SVC Precision: 0.894
SVC Recall: 0.355
SVC F1 Score: 0.508
confusion matrix 
 [[4258  182]
 [2792 1535]]
cohens kappa 
 0.3161852068369263
ROC_AUC score 
 0.6568791289556253


##Randomised Search CV - SVC

In [None]:
mdl = svm.SVC(probability = True, random_state = 100)
#auc = make_scorer(roc_auc_score)

In [None]:
rand_list = {"C": [0.1,1,10,100,1000],
             'degree':np.arange(3,6)}
              
rand_search = RandomizedSearchCV(mdl, param_distributions = rand_list, n_iter = 20, n_jobs = 4, cv = 3, random_state = 2017) 
rand_search.fit(X, Y) 
rand_search.cv_results_



{'mean_fit_time': array([380.16, 379.18, 379.4 , 421.27, 428.28, 427.72, 448.56, 443.84,
        456.56, 499.93, 505.98, 519.41, 706.68, 709.78, 551.48]),
 'mean_score_time': array([29.09, 29.13, 28.89, 28.32, 28.36, 28.45, 27.87, 27.56, 28.03,
        27.68, 27.67, 27.94, 27.79, 25.75, 16.61]),
 'mean_test_score': array([0.71, 0.71, 0.71, 0.71, 0.71, 0.71, 0.72, 0.72, 0.72, 0.72, 0.72,
        0.72, 0.72, 0.72, 0.72]),
 'param_C': masked_array(data=[0.1, 0.1, 0.1, 1, 1, 1, 10, 10, 10, 100, 100, 100,
                    1000, 1000, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_degree': masked_array(data=[3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False],
        fill_value='?',
     

In [None]:
y_pred_svm_rs = rand_search.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_svm_rs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_svm_rs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_svm_rs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_svm_rs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_svm_rs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_svm_rs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_svm_rs))

Accuracy Score 0.5057602372533364
RF Precision: 0.500
RF Recall: 1.000
RF F1 Score: 0.666
confusion matrix 
 [[ 107 4333]
 [   0 4327]]
cohens kappa 
 0.023795871390178802
ROC_AUC score 
 0.5120495495495496


##gridSearchCv for SVC:

In [None]:


# defining parameter range
param_grid = {'kernel': ['poly','linear','rbf','sigmoid'],
      'C':[0.1,1,10,100,1000],
      'degree':np.arange(3,6)   
      }

grid = GridSearchCV(svm.SVC(random_state=100), param_grid, refit = False, verbose = 3, n_jobs=-1)

# fitting the model for grid search
grid.fit(X_train, Y_train)


Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [None]:
y_pred_svm_gs = grid.predict(X_test)

# print classification report
print(classification_report(y_test, y_pred_svm_gs))


In [None]:
#model accuracy
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_svm_gs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_svm_gs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_svm_gs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_svm_gs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_svm_gs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_svm_gs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_svm_gs))

#Random forest :


In [None]:
#Generate Gaussian Classifier
RF_clf=RandomForestClassifier(n_estimators=50, random_state=100)

#Train
RF_clf.fit(X_train,Y_train)
#Predict
y_pred_rf=RF_clf.predict(X_test)
#y_pred_Train=RF_clf.predict(X_train) (TRAINING PERFORMANCE)

In [None]:
# Model Accuracy

print("RF Accuracy:", accuracy_score(Y_test, y_pred_rf))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_rf))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_rf))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_rf))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_rf))
'''print('confusion matrix \n', confusion_matrix(Y_train,y_pred_Train))
print("RF Accuracy:", accuracy_score(Y_train, y_pred_Train))'''
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_rf))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_rf))

RF Accuracy: 0.9981749743355766
RF Precision: 0.999
RF Recall: 0.997
RF F1 Score: 0.998
confusion matrix 
 [[4437    3]
 [  13 4314]]
cohens kappa 
 0.9963492348129215
ROC_AUC score 
 0.9981599666456379


##RandomizedSearchCV-RF

In [None]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:
#@title
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=50,cv=2,verbose=2,
                               random_state=100)
### fit the randomized model
rf_randomcv.fit(X_train,Y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] END criterion=entropy, max_depth=1000, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   4.2s
[CV] END criterion=entropy, max_depth=1000, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   4.1s
[CV] END criterion=gini, max_depth=120, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1400; total time=  25.4s
[CV] END criterion=gini, max_depth=120, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1400; total time=  24.9s
[CV] END criterion=gini, max_depth=340, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=2000; total time=  36.2s
[CV] END criterion=gini, max_depth=340, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=2000; total time=  35.9s
[CV] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=14,

RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(), n_iter=50,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [None]:
#@title
best_random_grid=rf_randomcv.best_estimator_

In [None]:
#@title
#predict RF-RCV
y_pred_rf_rs=best_random_grid.predict(X_test)
#model accuracy
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_rf_rs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_rf_rs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_rf_rs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_rf_rs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_rf_rs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_rf_rs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_rf_rs))

Accuracy Score 0.998517166647656
RF Precision: 1.000
RF Recall: 0.997
RF F1 Score: 0.998
confusion matrix 
 [[4438    2]
 [  11 4316]]
cohens kappa 
 0.9970337620091309
ROC_AUC score 
 0.9985036862607929


##GridSearchCV-RF :

In [None]:
#@title
from sklearn.model_selection import GridSearchCV

'''param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100]
}'''

param_grid = {  'bootstrap': [True], 'max_depth': [5, 10, None], 'max_features': ['auto', 'log2'], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]}
print(param_grid)

{'bootstrap': [True], 'max_depth': [5, 10, None], 'max_features': ['auto', 'log2'], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]}


In [None]:

#### Fit the grid_search to the data
rf=RandomForestClassifier(random_state=100)
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1,cv=2,verbose=2)
grid_search.fit(X_train,Y_train)

Fitting 2 folds for each of 60 candidates, totalling 120 fits


GridSearchCV(cv=2, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [5, 10, None],
                         'max_features': ['auto', 'log2'],
                         'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]},
             verbose=2)

In [None]:
#@title
print(grid_search.best_params_)

{'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'n_estimators': 15}


In [None]:
#@title
#predict RF-GSCV
y_pred_rf_gs=grid_search.predict(X_test)
#model accuracy
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_rf_gs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_rf_gs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_rf_gs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_rf_gs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_rf_gs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_rf_gs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_rf_gs))

Accuracy Score 0.9978327820234972
RF Precision: 0.999
RF Recall: 0.997
RF F1 Score: 0.998
confusion matrix 
 [[4435    5]
 [  14 4313]]
cohens kappa 
 0.9956647290902683
ROC_AUC score 
 0.9978191879191415


#XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Init classifier
xgb_cl = xgb.XGBClassifier(random_state=100)

# Fit
xgb_cl.fit(X_train, Y_train)

# Predict
y_pred_xgb = xgb_cl.predict(X_test)


In [None]:
#model accuracy
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_xgb)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_xgb))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_xgb))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_xgb))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_xgb))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_xgb))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_xgb))

Accuracy Score 0.8590167674232919
RF Precision: 0.960
RF Recall: 0.745
RF F1 Score: 0.839
confusion matrix 
 [[4306  134]
 [1102 3225]]
cohens kappa 
 0.717181561302212
ROC_AUC score 
 0.8575699515091704


In [None]:
#what are the hyperparameters?
xgb_cl

XGBClassifier()

##gridsearchCV-XGB

In [None]:

param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [None]:
# Init classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic", random_state=100)

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Fit
grid_cv.fit(X_train, Y_train)

GridSearchCV(cv=3, estimator=XGBClassifier(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.5], 'gamma': [0, 0.25, 1],
                         'learning_rate': [0.1, 0.01, 0.05],
                         'max_depth': [3, 4, 5, 7], 'reg_lambda': [0, 1, 10],
                         'scale_pos_weight': [1, 3, 5], 'subsample': [0.8]},
             scoring='roc_auc')

In [None]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'reg_lambda': 0,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [None]:
y_pred_xg_gs =grid_cv.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_xg_gs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_xg_gs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_xg_gs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_xg_gs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_xg_gs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_xg_gs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_xg_gs))

Accuracy Score 0.9962358845671268
RF Precision: 0.999
RF Recall: 0.993
RF F1 Score: 0.996
confusion matrix 
 [[4436    4]
 [  29 4298]]
cohens kappa 
 0.9924699646169465
ROC_AUC score 
 0.996198498012688


##randomisedSearchCV-XGB


In [None]:
xgb_cl = xgb.XGBClassifier(objective="binary:logistic", random_state=100)
param_grid = {
        'silent': [False],
        'max_depth': [6, 10, 15, 20],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0,3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [100]}

rs_clf = RandomizedSearchCV(xgb_cl, param_grid, n_iter=20,
                            n_jobs=1, verbose=2, cv=2,
                            scoring='neg_log_loss', refit=True, random_state=42)

In [None]:
rs_clf.fit(X_train, Y_train)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV] END colsample_bylevel=0.4, colsample_bytree=0.9, gamma=1.0, learning_rate=0.2, max_depth=6, min_child_weight=5.0, n_estimators=100, reg_lambda=50.0, silent=False, subsample=0.7; total time=   1.3s
[CV] END colsample_bylevel=0.4, colsample_bytree=0.9, gamma=1.0, learning_rate=0.2, max_depth=6, min_child_weight=5.0, n_estimators=100, reg_lambda=50.0, silent=False, subsample=0.7; total time=   1.3s
[CV] END colsample_bylevel=0.8, colsample_bytree=0.8, gamma=0.25, learning_rate=0.1, max_depth=20, min_child_weight=1.0, n_estimators=100, reg_lambda=1.0, silent=False, subsample=0.6; total time=   3.8s
[CV] END colsample_bylevel=0.8, colsample_bytree=0.8, gamma=0.25, learning_rate=0.1, max_depth=20, min_child_weight=1.0, n_estimators=100, reg_lambda=1.0, silent=False, subsample=0.6; total time=   3.7s
[CV] END colsample_bylevel=0.4, colsample_bytree=1.0, gamma=0.25, learning_rate=0.1, max_depth=15, min_child_weight=7.0, n_estima

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=0.5, colsample_bytree=0.9, gamma=0.25, learning_rate=3, max_depth=20, min_child_weight=10.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.7; total time=   0.5s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=0.5, colsample_bytree=0.9, gamma=0.25, learning_rate=3, max_depth=20, min_child_weight=10.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.7; total time=   0.6s
[CV] END colsample_bylevel=0.8, colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=15, min_child_weight=1.0, n_estimators=100, reg_lambda=10.0, silent=False, subsample=0.6; total time=   3.3s
[CV] END colsample_bylevel=0.8, colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=15, min_child_weight=1.0, n_estimators=100, reg_lambda=10.0, silent=False, subsample=0.6; total time=   3.3s
[CV] END colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.25, learning_rate=0.01, max_depth=15, min_child_weight=5.0, n_estimators=100, reg_lambda=0.1, silent=False, subsample=0.5; total time=   2.7s
[CV] END colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.25, learning_rate=0.01, max_depth=15, min_child_weight=5.0, n_estimators=100, reg_lambda=0.1, silent=False, subsample=0.5; total

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=1.0, colsample_bytree=1.0, gamma=0, learning_rate=3, max_depth=10, min_child_weight=0.5, n_estimators=100, reg_lambda=50.0, silent=False, subsample=0.7; total time=   2.7s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=1.0, colsample_bytree=1.0, gamma=0, learning_rate=3, max_depth=10, min_child_weight=0.5, n_estimators=100, reg_lambda=50.0, silent=False, subsample=0.7; total time=   2.6s
[CV] END colsample_bylevel=0.7, colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_depth=15, min_child_weight=0.5, n_estimators=100, reg_lambda=1.0, silent=False, subsample=0.5; total time=   2.8s
[CV] END colsample_bylevel=0.7, colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_depth=15, min_child_weight=0.5, n_estimators=100, reg_lambda=1.0, silent=False, subsample=0.5; total time=   2.8s
[CV] END colsample_bylevel=1.0, colsample_bytree=0.8, gamma=0, learning_rate=0.001, max_depth=20, min_child_weight=10.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.7; total time=   4.1s
[CV] END colsample_bylevel=1.0, colsample_bytree=0.8, gamma=0, learning_rate=0.001, max_depth=20, min_child_weight=10.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.7; total time=   

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=0.4, colsample_bytree=0.8, gamma=0, learning_rate=3, max_depth=10, min_child_weight=0.5, n_estimators=100, reg_lambda=10.0, silent=False, subsample=0.5; total time=   0.7s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=0.4, colsample_bytree=0.8, gamma=0, learning_rate=3, max_depth=10, min_child_weight=0.5, n_estimators=100, reg_lambda=10.0, silent=False, subsample=0.5; total time=   1.4s
[CV] END colsample_bylevel=1.0, colsample_bytree=0.5, gamma=0.25, learning_rate=0.1, max_depth=15, min_child_weight=7.0, n_estimators=100, reg_lambda=10.0, silent=False, subsample=1.0; total time=   2.6s
[CV] END colsample_bylevel=1.0, colsample_bytree=0.5, gamma=0.25, learning_rate=0.1, max_depth=15, min_child_weight=7.0, n_estimators=100, reg_lambda=10.0, silent=False, subsample=1.0; total time=   2.6s
[CV] END colsample_bylevel=1.0, colsample_bytree=0.6, gamma=0, learning_rate=0.001, max_depth=10, min_child_weight=7.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.5; total time=   2.6s
[CV] END colsample_bylevel=1.0, colsample_bytree=0.6, gamma=0, learning_rate=0.001, max_depth=10, min_child_weight=7.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.5; total ti

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=0.5, colsample_bytree=0.6, gamma=0, learning_rate=3, max_depth=10, min_child_weight=7.0, n_estimators=100, reg_lambda=100.0, silent=False, subsample=0.6; total time=   1.2s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


[CV] END colsample_bylevel=0.5, colsample_bytree=0.6, gamma=0, learning_rate=3, max_depth=10, min_child_weight=7.0, n_estimators=100, reg_lambda=100.0, silent=False, subsample=0.6; total time=   0.6s
[CV] END colsample_bylevel=0.5, colsample_bytree=1.0, gamma=0.25, learning_rate=0.2, max_depth=20, min_child_weight=7.0, n_estimators=100, reg_lambda=100.0, silent=False, subsample=0.6; total time=   2.6s
[CV] END colsample_bylevel=0.5, colsample_bytree=1.0, gamma=0.25, learning_rate=0.2, max_depth=20, min_child_weight=7.0, n_estimators=100, reg_lambda=100.0, silent=False, subsample=0.6; total time=   2.5s
[CV] END colsample_bylevel=0.4, colsample_bytree=0.5, gamma=1.0, learning_rate=3, max_depth=15, min_child_weight=1.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.7; total time=   0.6s


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
 -0.63   nan -0.03 -0.64 -0.69   nan -0.09   nan]


[CV] END colsample_bylevel=0.4, colsample_bytree=0.5, gamma=1.0, learning_rate=3, max_depth=15, min_child_weight=1.0, n_estimators=100, reg_lambda=5.0, silent=False, subsample=0.7; total time=   0.6s


RandomizedSearchCV(cv=2, estimator=XGBClassifier(), n_iter=20, n_jobs=1,
                   param_distributions={'colsample_bylevel': [0.4, 0.5, 0.6,
                                                              0.7, 0.8, 0.9,
                                                              1.0],
                                        'colsample_bytree': [0.4, 0.5, 0.6, 0.7,
                                                             0.8, 0.9, 1.0],
                                        'gamma': [0, 0.25, 0.5, 1.0],
                                        'learning_rate': [0.001, 0.01, 0.1, 0.2,
                                                          0, 3],
                                        'max_depth': [6, 10, 15, 20],
                                        'min_child_weight': [0.5, 1.0, 3.0, 5.0,
                                                             7.0, 10.0],
                                        'n_estimators': [100],
                                        'reg

In [None]:
y_pred_xg_rs =rs_clf.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred))

Accuracy Score 0.998517166647656
RF Precision: 0.999
RF Recall: 0.998
RF F1 Score: 0.998
confusion matrix 
 [[4436    4]
 [   9 4318]]
cohens kappa 
 0.9970337969031466
ROC_AUC score 
 0.9985095680381098


#ADABOOST

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ad = AdaBoostClassifier()

In [None]:
y_pred_adb=ad.fit(X_train,Y_train).predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_adb)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_adb))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_adb))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_adb))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_adb))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_adb))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_adb))

Accuracy Score 0.8028972282422722
RF Precision: 0.884
RF Recall: 0.691
RF F1 Score: 0.776
confusion matrix 
 [[4048  392]
 [1336 2991]]
cohens kappa 
 0.6046313331535006
ROC_AUC score 
 0.8014763781576817


##grid searchcv on adaboost

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=ad, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(X_train, Y_train)

In [None]:
y_pred_ada_gs=grid_result.fit(X_train,Y_train).predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_ada_gs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_ada_gs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_ada_gs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_ada_gs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_ada_gs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_ada_gs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_ada_gs))

Accuracy Score 0.9117143834835177
RF Precision: 0.962
RF Recall: 0.855
RF F1 Score: 0.905
confusion matrix 
 [[4295  145]
 [ 629 3698]]
cohens kappa 
 0.8231476980363454
ROC_AUC score 
 0.9109880188716566


##RandomisedSearchCV-ADB


In [None]:
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
ab=AdaBoostClassifier()
ad_cv = RandomizedSearchCV(ab, grid, n_jobs=-1, cv=cv, scoring='accuracy')
cv_result = ad_cv.fit(X_train, Y_train)

In [None]:
y_pred_ada_rs=cv_result.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_ada_rs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_ada_rs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_ada_rs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_ada_rs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_ada_rs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_ada_rs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_ada_rs))

Accuracy Score 0.8028972282422722
RF Precision: 0.884
RF Recall: 0.691
RF F1 Score: 0.776
confusion matrix 
 [[4048  392]
 [1336 2991]]
cohens kappa 
 0.6046313331535006
ROC_AUC score 
 0.8014763781576817


#Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf.fit(X_train, Y_train)

In [None]:
y_pred_gb=gb_clf.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_gb)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_gb))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_gb))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_gb))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_gb))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_gb))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_gb))

##RandomizedSearchCV-GB

In [None]:
from sklearn.model_selection import RandomizedSearchCV
p_test3 = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[100,250,500,750,1000,1250,1500,1750]}

clf= RandomizedSearchCV(estimator =GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10, 
            param_grid = p_test3, scoring='accuracy',n_jobs=4, cv=5)
clf.fit(X_train,Y_train)

In [None]:
y_pred_gb_rs=clf.predict(X_test)

##GridSearchCV-GB

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
p_test3 = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[100,250,500,750,1000,1250,1500,1750]}

tuning = GridSearchCV(estimator =GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
            param_grid = p_test3, scoring='accuracy',n_jobs=4, cv=5)
tuning.fit(X_train,Y_train)


In [None]:
y_pred_gb_gs=tuning.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred))

#Catboost

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [None]:
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

In [None]:
model.fit(
    X_train, Y_train,
    #cat_features=categorical_features_indices,
    #eval_set=(X_validation, y_validation),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

In [None]:
y_pred_cat = model.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_cat)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_cat))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_cat))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_cat))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_cat))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_cat))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_cat))

### catboost-RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid = {'max_depth': [3,4,5],'n_estimators':[100, 200, 300]}
gscv = RandomizedSearchCV(estimator = model, param_distributions=grid, scoring ='accuracy', cv = 5)
gscv.fit(X_test,Y_test)


In [None]:
y_pred_cat_rs=gscv.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_cat_rs)))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_cat_rs))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_cat_rs))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_cat_rs))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_cat_rs))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_cat_rs))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_cat_rs))

#LightGBM

In [None]:
import lightgbm as lgb
model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
model.fit(X_train,Y_train,eval_set=[(X_test,Y_test),(X_train,Y_train)],
          verbose=20,eval_metric='logloss')

In [None]:
y_pred_lgbm = model.predict(X_test)

In [None]:
print("Accuracy Score {}".format(accuracy_score(Y_test,y_pred_lgbm )))
print('RF Precision: %.3f' % precision_score(Y_test, y_pred_lgbm ))
print('RF Recall: %.3f' % recall_score(Y_test, y_pred_lgbm ))
print('RF F1 Score: %.3f' % f1_score(Y_test, y_pred_lgbm ))
print('confusion matrix \n', confusion_matrix(Y_test,y_pred_lgbm ))
print('cohens kappa \n', cohen_kappa_score(Y_test,y_pred_lgbm ))
print('ROC_AUC score \n', roc_auc_score(Y_test,y_pred_lgbm ))