In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading the ILPD dataset

In [2]:
df = pd.read_csv("ILPD.csv")
df

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


# Encoding Gender column

In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder_x = LabelEncoder()
df['Gender'] = label_encoder_x.fit_transform(df[['Gender']])
df

  y = column_or_1d(y, warn=True)


Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


# Checking for NULL values

In [4]:
df.isnull().values.any()

True

In [5]:
df.isnull().sum().sum()

4

In [6]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
209,45,0,0.9,0.3,189,23,33,6.6,3.9,,1
241,51,1,0.8,0.2,230,24,46,6.5,3.1,,1
253,35,0,0.6,0.2,180,12,15,5.2,2.7,,2
312,27,1,1.3,0.6,106,25,54,8.5,4.8,,2


# Handling Missing data by MultiVariate Imputation

In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

df2 = df.copy()


In [8]:
cont_feats = [col for col in df2.columns if df2[col].dtype != object]

In [9]:
iter_imputer = IterativeImputer(random_state=42)
iter_imputed = iter_imputer.fit_transform(df2[cont_feats])
df2 = pd.DataFrame(iter_imputed, columns=cont_feats)

In [10]:
df2['A/G'] = np.round(df2[['A/G']],2)

In [11]:
df2.iloc[[209,241,253,312],:]

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
209,45.0,0.0,0.9,0.3,189.0,23.0,33.0,6.6,3.9,1.31,1.0
241,51.0,1.0,0.8,0.2,230.0,24.0,46.0,6.5,3.1,0.92,1.0
253,35.0,0.0,0.6,0.2,180.0,12.0,15.0,5.2,2.7,0.99,2.0
312,27.0,1.0,1.3,0.6,106.0,25.0,54.0,8.5,4.8,1.37,2.0


In [12]:
df = df2
df

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65.0,0.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.90,1.0
1,62.0,1.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,62.0,1.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,58.0,1.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.00,1.0
4,72.0,1.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.40,1.0
...,...,...,...,...,...,...,...,...,...,...,...
578,60.0,1.0,0.5,0.1,500.0,20.0,34.0,5.9,1.6,0.37,2.0
579,40.0,1.0,0.6,0.1,98.0,35.0,31.0,6.0,3.2,1.10,1.0
580,52.0,1.0,0.8,0.2,245.0,48.0,49.0,6.4,3.2,1.00,1.0
581,31.0,1.0,1.3,0.5,184.0,29.0,32.0,6.8,3.4,1.00,1.0


# Log1 p Transformation

In [13]:
# save skewed features
skewed_cols = ['A/G', 'TB', 'AP', 'SGPT']

In [14]:
# Apply log1p transformation on dataframe - just selected values
for c in skewed_cols:
    df[c] = df[c].apply('log1p')

In [15]:
df

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65.0,0.0,0.530628,0.1,5.236442,2.833213,18.0,6.8,3.3,0.641854,1.0
1,62.0,1.0,2.476538,5.5,6.551080,4.174387,100.0,7.5,3.2,0.553885,1.0
2,62.0,1.0,2.116256,4.1,6.196444,4.110874,68.0,7.0,3.3,0.636577,1.0
3,58.0,1.0,0.693147,0.4,5.209486,2.708050,20.0,6.8,3.4,0.693147,1.0
4,72.0,1.0,1.589235,2.0,5.278115,3.332205,59.0,7.3,2.4,0.336472,1.0
...,...,...,...,...,...,...,...,...,...,...,...
578,60.0,1.0,0.405465,0.1,6.216606,3.044522,34.0,5.9,1.6,0.314811,2.0
579,40.0,1.0,0.470004,0.1,4.595120,3.583519,31.0,6.0,3.2,0.741937,1.0
580,52.0,1.0,0.587787,0.2,5.505332,3.891820,49.0,6.4,3.2,0.693147,1.0
581,31.0,1.0,0.832909,0.5,5.220356,3.401197,32.0,6.8,3.4,0.693147,1.0


# Balance Dataset

In [16]:
from sklearn.utils import resample
df.Target.value_counts()

1.0    416
2.0    167
Name: Target, dtype: int64

In [17]:
# Split data on majority and minority.. minority is dataset == 2
minority = df[df.Target == 2]
majority = df[df.Target == 1]

print('Minority size:', minority.shape)
print('Majority size:', majority.shape)

Minority size: (167, 11)
Majority size: (416, 11)


In [18]:
# choosing upsample as even now we do not have too much data
minority_upsample = resample(
    minority, replace=True, n_samples=majority.shape[0])
print('Minority upsampled size:',  minority_upsample.shape)


Minority upsampled size: (416, 11)


In [19]:
# merge majority with upsampled minority
pd.concat([minority_upsample, majority], axis=0)

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
185,38.0,1.0,0.916291,0.4,5.700444,4.110874,103.0,6.0,3.0,0.693147,2.0
131,70.0,0.0,0.530628,0.2,5.472271,2.944439,28.0,5.8,2.5,0.559616,2.0
287,43.0,1.0,0.832909,0.6,5.049856,2.772589,20.0,8.0,4.0,0.693147,2.0
300,58.0,1.0,0.587787,0.2,5.198497,3.496508,25.0,8.2,4.4,0.741937,2.0
328,43.0,1.0,0.587787,0.2,5.262690,3.401197,20.0,6.0,2.9,0.641854,2.0
...,...,...,...,...,...,...,...,...,...,...,...
576,32.0,1.0,2.772589,8.2,5.669881,4.077537,80.0,5.3,2.2,0.530628,1.0
577,32.0,1.0,2.617396,8.4,5.252273,3.367296,47.0,5.4,2.6,0.641854,1.0
579,40.0,1.0,0.470004,0.1,4.595120,3.583519,31.0,6.0,3.2,0.741937,1.0
580,52.0,1.0,0.587787,0.2,5.505332,3.891820,49.0,6.4,3.2,0.693147,1.0


In [20]:
# merge majority with upsampled minority
df = pd.concat([minority_upsample, majority], axis=0)
df = df.reset_index(drop=True)
df


Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,38.0,1.0,0.916291,0.4,5.700444,4.110874,103.0,6.0,3.0,0.693147,2.0
1,70.0,0.0,0.530628,0.2,5.472271,2.944439,28.0,5.8,2.5,0.559616,2.0
2,43.0,1.0,0.832909,0.6,5.049856,2.772589,20.0,8.0,4.0,0.693147,2.0
3,58.0,1.0,0.587787,0.2,5.198497,3.496508,25.0,8.2,4.4,0.741937,2.0
4,43.0,1.0,0.587787,0.2,5.262690,3.401197,20.0,6.0,2.9,0.641854,2.0
...,...,...,...,...,...,...,...,...,...,...,...
827,32.0,1.0,2.772589,8.2,5.669881,4.077537,80.0,5.3,2.2,0.530628,1.0
828,32.0,1.0,2.617396,8.4,5.252273,3.367296,47.0,5.4,2.6,0.641854,1.0
829,40.0,1.0,0.470004,0.1,4.595120,3.583519,31.0,6.0,3.2,0.741937,1.0
830,52.0,1.0,0.587787,0.2,5.505332,3.891820,49.0,6.4,3.2,0.693147,1.0


In [21]:
# df[['DB', 'SGOT', 'TP', 'ALB']]

# Splitting into train and test set

In [22]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [23]:
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler()
standard_data = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
standardized_X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=X.columns)
standardized_X_test = pd.DataFrame(standard_scaler.fit_transform(X_test), columns=X.columns)

min_max_scaler = preprocessing.MinMaxScaler()
minmax_data = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)
minmax_X_train = pd.DataFrame(min_max_scaler.fit_transform(X_train), columns=X.columns)
minmax_X_test = pd.DataFrame(min_max_scaler.fit_transform(X_test), columns=X.columns)

max_abs_scaler = preprocessing.MaxAbsScaler()
maxabs_data = pd.DataFrame(max_abs_scaler.fit_transform(X), columns=X.columns)
maxabs_X_train = pd.DataFrame(max_abs_scaler.fit_transform(X_train), columns=X.columns)
maxabs_X_test = pd.DataFrame(max_abs_scaler.fit_transform(X_test), columns=X.columns)

robust_scaler = preprocessing.RobustScaler()
robust_data = pd.DataFrame(robust_scaler.fit_transform(X), columns=X.columns)
robust_X_train = pd.DataFrame(robust_scaler.fit_transform(X_train), columns=X.columns)
robust_X_test = pd.DataFrame(robust_scaler.fit_transform(X_test), columns=X.columns)

scaled_data = {
    'Standardized' : 
    {
        'data': standard_data,
        'train':standardized_X_train,
        'test':standardized_X_test
    },
    'Min_Max': 
    {
        'data':minmax_data,
        'train':minmax_X_train,
        'test':minmax_X_test
    },
    'Absolute_max': 
    {
        'data':maxabs_data,
        'train':maxabs_X_train,
        'test':maxabs_X_test
    },
    'Robust':
    {
        'data':robust_data,
        'train':robust_X_train,
        'test':robust_X_test
    },
}


In [24]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import f_classif

In [25]:
feature_selection = []
for name in scaled_data :
    if name!='Standardized' and name!='Robust':
        d1 = {}
        d1['Processing'] = name
        d1['Method'] = "Univariate Selection"
        chi_selector = SelectKBest(chi2, k='all')
        fit = chi_selector.fit(scaled_data[name]['data'], y)
        d1['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, fit.scores_), key = lambda x: x[1],reverse=True))
        feature_selection.append(d1)
    
    ###############################

    d2 = {}
    d2['Processing'] = name
    d2['Method'] = "Extra Tree Classifier"
    etc = ExtraTreesClassifier(random_state=42)
    etc.fit(scaled_data[name]['data'],y)
    d2['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, etc.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d2)

    ###############################

    d3 = {}
    d3['Processing'] = name
    d3['Method'] = "Random Forest Classifier"
    rf = RandomForestClassifier(n_estimators=500, random_state=42)
    rf.fit(scaled_data[name]['data'], y)
    d3['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, rf.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d3)

    ###############################

    d4 = {}
    d4['Processing'] = name
    d4['Method'] = "LGBM Classifier"
    lgbc=LGBMClassifier(n_estimators=500, random_state=42)
    lgbc.fit(scaled_data[name]['data'], y)
    d4['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, lgbc.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d4)

    ###############################

    d5 = {}
    d5['Processing'] = name
    d5['Method'] = "Correlation Matrix"
    t = pd.concat([scaled_data[name]['data'], y], axis=1)
    corrmat = t.corr()
    d5['Features'] = dict(sorted(zip(corrmat.iloc[:-1,:]['Target'].index,abs(corrmat.iloc[:-1,:]['Target'].values)),key = lambda x: x[1],reverse=True)) 
    feature_selection.append(d5) 

    ###############################

    d6 = {}
    d6['Processing'] = name
    d6['Method'] = "ANOVA F-SCORES"
    anov_selector = SelectKBest(f_classif, k='all')
    fit = anov_selector.fit(scaled_data[name]['data'], y)
    d6['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, fit.scores_), key = lambda x: x[1], reverse=True))
    feature_selection.append(d6) 
    

In [26]:
feature_selection

[{'Processing': 'Standardized',
  'Method': 'Extra Tree Classifier',
  'Features': {'TB': 0.1382932632824558,
   'SGPT': 0.1349144611528617,
   'AP': 0.11431938107760105,
   'SGOT': 0.1135200206600745,
   'DB': 0.10469972646706446,
   'Age': 0.10373434777319962,
   'A/G': 0.08986628766219805,
   'ALB': 0.08748109044849393,
   'TP': 0.0851621587728987,
   'Gender': 0.028009262703152196}},
 {'Processing': 'Standardized',
  'Method': 'Random Forest Classifier',
  'Features': {'AP': 0.14054123166775356,
   'SGOT': 0.13559788434759537,
   'SGPT': 0.1333464668287657,
   'TB': 0.11768886098870078,
   'Age': 0.11505259499833391,
   'DB': 0.09799406297532835,
   'TP': 0.08611061440000416,
   'ALB': 0.08350681322354178,
   'A/G': 0.07252344619210975,
   'Gender': 0.01763802437786663}},
 {'Processing': 'Standardized',
  'Method': 'LGBM Classifier',
  'Features': {'AP': 2057,
   'Age': 1852,
   'SGPT': 1715,
   'SGOT': 1704,
   'TP': 1298,
   'ALB': 1130,
   'A/G': 897,
   'TB': 825,
   'DB': 475,

# Random Forest

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [89]:
results = []
best_result = {}
cross_best_result = {}
max_accuracy = 0
max_cross_acc = 0
for method in feature_selection:
    data = scaled_data[method['Processing']]
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        data = scaled_data[method['Processing']]['data'][info['Features']]
        X_train = scaled_data[method['Processing']]['train'][info['Features']]
        X_test = scaled_data[method['Processing']]['test'][info['Features']]
        
        model = RandomForestClassifier(n_jobs=-1,random_state=123)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy:
            max_accuracy = accuracy
            best_result = info
        if cross_acc>max_cross_acc:
            max_cross_acc = cross_acc
            cross_best_result = info

In [90]:
best_result

{'Processing': 'Robust',
 'Method': 'Extra Tree Classifier',
 'Features': ['TB',
  'SGPT',
  'AP',
  'SGOT',
  'DB',
  'Age',
  'A/G',
  'ALB',
  'TP',
  'Gender'],
 'Cross Validation Accuracy': 86.65949512335054,
 'Accuracy': 83.17307692307693}

In [91]:
cross_best_result

{'Processing': 'Min_Max',
 'Method': 'Univariate Selection',
 'Features': ['DB', 'TB', 'SGPT', 'SGOT'],
 'Cross Validation Accuracy': 87.3809523809524,
 'Accuracy': 58.65384615384615}

In [92]:
from sklearn.model_selection import GridSearchCV, KFold

In [93]:
X_train = scaled_data[best_result['Processing']]['train'][best_result['Features']]
X_test = scaled_data[best_result['Processing']]['test'][best_result['Features']]
data = scaled_data[best_result['Processing']]['data'][best_result['Features']]

params = {
    'n_estimators': [100, 150, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1.0,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

gs1 = GridSearchCV(RandomForestClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=10), scoring='accuracy')
gs1.fit(X_train, y_train)

print('Best score:', gs1.best_score_)
print('Best score:', gs1.best_params_)

Best score: 0.8590373783922169
Best score: {'criterion': 'gini', 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 150}


### Training model on best parameters obtained from gridsearchcv

In [98]:
model = RandomForestClassifier(n_jobs=-1,random_state=123, criterion='gini', max_leaf_nodes=None, 
                                min_samples_leaf=1, min_samples_split = 4, n_estimators=150) 
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

Test Accuracy:  80.76923076923077 
Cross validation accuracy:  86.77280550774526


In [95]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

RandomForestClassifier(min_samples_split=4, n_estimators=150, n_jobs=-1,
                       random_state=123)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.83      0.81      0.82       111
         2.0       0.79      0.80      0.80        97

    accuracy                           0.81       208
   macro avg       0.81      0.81      0.81       208
weighted avg       0.81      0.81      0.81       208

Roc_auc score
-------------------------------------------------------
0.8074672610755

### Training model on default parameters

In [99]:
model = RandomForestClassifier(n_jobs=-1,random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

Test Accuracy:  83.17307692307693 
Cross validation accuracy:  86.65949512335054


In [97]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

RandomForestClassifier(n_jobs=-1, random_state=123)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.86      0.82      0.84       111
         2.0       0.80      0.85      0.82        97

    accuracy                           0.83       208
   macro avg       0.83      0.83      0.83       208
weighted avg       0.83      0.83      0.83       208

Roc_auc score
-------------------------------------------------------
0.832590322281044

Confusion matrix
-----------------------------------------

# Extra Tree classifier

In [100]:
from sklearn.ensemble import ExtraTreesClassifier

best_result_etc = {}
cross_best_result_etc = {}
max_accuracy_etc = 0
max_cross_acc_etc = 0

for method in feature_selection:
    data = scaled_data[method['Processing']]
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        data = scaled_data[method['Processing']]['data'][info['Features']]
        X_train = scaled_data[method['Processing']]['train'][info['Features']]
        X_test = scaled_data[method['Processing']]['test'][info['Features']]
        
        model = ExtraTreesClassifier(random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy_etc:
            max_accuracy_etc = accuracy
            best_result_etc = info
        if cross_acc>max_cross_acc_etc:
            max_cross_acc_etc = cross_acc
            cross_best_result_etc = info

print(best_result_etc)
print(cross_best_result_etc)

{'Processing': 'Robust', 'Method': 'LGBM Classifier', 'Features': ['AP', 'Age', 'SGPT', 'SGOT', 'TP', 'ALB', 'TB', 'A/G'], 'Cross Validation Accuracy': 88.94291451520367, 'Accuracy': 84.61538461538461}
{'Processing': 'Standardized', 'Method': 'Correlation Matrix', 'Features': ['TB', 'SGPT', 'DB', 'AP', 'A/G', 'SGOT', 'ALB', 'Age', 'Gender'], 'Cross Validation Accuracy': 90.63253012048192, 'Accuracy': 77.40384615384616}


In [101]:
X_train = scaled_data[best_result['Processing']]['train'][best_result['Features']]
X_test = scaled_data[best_result['Processing']]['test'][best_result['Features']]
data = scaled_data[best_result['Processing']]['data'][best_result['Features']]

params = {
    'n_estimators': [100, 150, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1.0,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

gs2 = GridSearchCV(ExtraTreesClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=10), scoring='accuracy')
gs2.fit(X_train, y_train)

print('Best score:', gs2.best_score_)
print('Best score:', gs2.best_params_)

Best score: 0.8878648233486943
Best score: {'criterion': 'entropy', 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


### Training model on best parameters obtained from gridsearchcv

In [102]:
model = ExtraTreesClassifier(n_jobs=-1,random_state=42, criterion='entropy', max_leaf_nodes=None, 
                                min_samples_leaf=1,min_samples_split = 2, n_estimators=200) 
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  82.21153846153845 
Cross validation accuracy:  89.78772231784279
ExtraTreesClassifier(criterion='entropy', n_estimators=200, n_jobs=-1,
                     random_state=42)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.81      0.87      0.84       111
         2.0       0.84      0.76      0.80        97

    accuracy                           0.82       208
   macro avg       0.82      0.82      0.82       208
weighted avg       0.82      0.82      0.82       208

Roc_auc s

### Training model on default parameters

In [103]:
model = ExtraTreesClassifier(n_jobs=-1,random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  84.13461538461539 
Cross validation accuracy:  89.18674698795182
ExtraTreesClassifier(n_jobs=-1, random_state=42)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.81      0.92      0.86       111
         2.0       0.89      0.75      0.82        97

    accuracy                           0.84       208
   macro avg       0.85      0.84      0.84       208
weighted avg       0.85      0.84      0.84       208

Roc_auc score
-------------------------------------------------------

In [58]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# XGBoost Classifier

In [61]:
from xgboost import XGBClassifier

best_result_xgboost = {}
cross_best_result_xgboost = {}
max_accuracy_xgboost = 0
max_cross_acc_xgboost = 0

for method in feature_selection:
    data = scaled_data[method['Processing']]
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        data = scaled_data[method['Processing']]['data'][info['Features']]
        X_train = scaled_data[method['Processing']]['train'][info['Features']]
        X_test = scaled_data[method['Processing']]['test'][info['Features']]
        
        model = XGBClassifier(n_jobs=-1,random_state=42, eval_metric='mlogloss')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy_xgboost:
            max_accuracy_xgboost = accuracy
            best_result_xgboost = info
        if cross_acc>max_cross_acc_xgboost:
            max_cross_acc_xgboost = cross_acc
            cross_best_result_xgboost = info

print(best_result_xgboost)
print(cross_best_result_xgboost)

{'Processing': 'Robust', 'Method': 'Extra Tree Classifier', 'Features': ['TB', 'SGPT', 'AP', 'SGOT', 'DB', 'Age', 'A/G', 'ALB', 'TP'], 'Cross Validation Accuracy': 85.33706253585771, 'Accuracy': 75.96153846153845}
{'Processing': 'Absolute_max', 'Method': 'Univariate Selection', 'Features': ['DB', 'TB', 'SGOT', 'SGPT', 'Gender', 'Age', 'ALB'], 'Cross Validation Accuracy': 86.65519219736086, 'Accuracy': 71.63461538461539}


### Training model on best parameters obtained from gridsearchcv

In [63]:
X_train = scaled_data[best_result_xgboost['Processing']]['train'][best_result_xgboost['Features']]
X_test = scaled_data[best_result_xgboost['Processing']]['test'][best_result_xgboost['Features']]
data = scaled_data[best_result_xgboost['Processing']]['data'][best_result_xgboost['Features']]

params = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01,0.05,0.1],
    'booster': ['gbtree', 'gblinear'],
    'gamma': [0, 0.5, 1],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0.5, 1, 5],
    'base_score': [0.2, 0.5, 1]
}

gs3 = GridSearchCV(XGBClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=10), scoring='accuracy')
gs3.fit(X_train, y_train)

print('Best score:', gs3.best_score_)
print('Best score:', gs3.best_params_)

Best score: 0.8621863799283153
Best score: {'base_score': 0.5, 'booster': 'gbtree', 'gamma': 0, 'learning_rate': 0.1, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 5}


### Training model on best parameters obtained from gridsearchcv

In [65]:
xgb_model = XGBClassifier(n_jobs=-1,random_state=42, booster='gbtree', gamma=0, 
                    reg_alpha=0.5, reg_lambda = 5, n_estimators=500, base_score=0.5, learning_rate = 0.1, eval_metric='mlogloss') 
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(xgb_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(xgb_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, xgb_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  74.03846153846155 
Cross validation accuracy:  85.21084337349396
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=-1,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0.5, reg_lambda=5, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy           

### Training model on default parameters

In [68]:
xgb_model = XGBClassifier(n_jobs=-1,random_state=42, eval_metric='mlogloss') 
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(xgb_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(xgb_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, xgb_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  75.96153846153845 
Cross validation accuracy:  85.33706253585771
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy     

# Gradient Boosting

In [70]:
from sklearn.ensemble import GradientBoostingClassifier

best_result_gb = {}
cross_best_result_gb = {}
max_accuracy_gb = 0
max_cross_acc_gb = 0

for method in feature_selection:
    data = scaled_data[method['Processing']]
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        data = scaled_data[method['Processing']]['data'][info['Features']]
        X_train = scaled_data[method['Processing']]['train'][info['Features']]
        X_test = scaled_data[method['Processing']]['test'][info['Features']]
        
        model = GradientBoostingClassifier(random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy_gb:
            max_accuracy_gb = accuracy
            best_result_gb = info
        if cross_acc>max_cross_acc_gb:
            max_cross_acc_gb = cross_acc
            cross_best_result_gb = info

print(best_result_gb)
print(cross_best_result_gb)


{'Processing': 'Robust', 'Method': 'Extra Tree Classifier', 'Features': ['TB', 'SGPT', 'AP'], 'Cross Validation Accuracy': 77.877223178428, 'Accuracy': 75.48076923076923}
{'Processing': 'Standardized', 'Method': 'Random Forest Classifier', 'Features': ['AP', 'SGOT', 'SGPT', 'TB', 'Age'], 'Cross Validation Accuracy': 82.08691910499141, 'Accuracy': 67.3076923076923}


In [72]:
X_train = scaled_data[best_result_gb['Processing']]['train'][best_result_gb['Features']]
X_test = scaled_data[best_result_gb['Processing']]['test'][best_result_gb['Features']]
data = scaled_data[best_result_gb['Processing']]['data'][best_result_gb['Features']]

params = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.1, 0.2, 0.5],
    'criterion': ['friedman_mse','mse', 'mae'],
    'min_samples_split' : [2,4,5],
    'min_samples_leaf' : [1,2,4,5]
}

gs4 = GridSearchCV(GradientBoostingClassifier(), params, n_jobs=-1, cv=KFold(n_splits=10), scoring='accuracy')
gs4.fit(X_train, y_train)

print('Best score:', gs4.best_score_)
print('Best score:', gs4.best_params_)

Best score: 0.8477214541730671
Best score: {'criterion': 'mse', 'learning_rate': 0.1, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 500}


### Training model on best parameters obtained from gridsearchcv

In [73]:
gb_model = GradientBoostingClassifier(random_state=42, criterion= 'mse', n_estimators=500, min_samples_split=4,               min_samples_leaf=1, max_leaf_nodes=None, learning_rate = 0.1)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(gb_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(gb_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, gb_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  66.34615384615384 
Cross validation accuracy:  83.41078600114746
GradientBoostingClassifier(criterion='mse', min_samples_split=4,
                           n_estimators=500, random_state=42)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.99      0.99      0.99       305
         2.0       0.99      0.99      0.99       319

    accuracy                           0.99       624
   macro avg       0.99      0.99      0.99       624
weighted avg       0.99      0.99      0.99       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.66      0.77      0.71       111
         2.0       0.67      0.55      0.60        97

    accuracy                           0.66       208
   macro avg       0.66      0.66      0.66       208
weighted avg       0.66      0.66      0.66   

### Training model on default parameters

In [74]:
gb_model = GradientBoostingClassifier(random_state=42) 
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(gb_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(gb_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, gb_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  75.48076923076923 
Cross validation accuracy:  77.877223178428
GradientBoostingClassifier(random_state=42)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.97      0.77      0.86       305
         2.0       0.82      0.98      0.89       319

    accuracy                           0.88       624
   macro avg       0.90      0.87      0.87       624
weighted avg       0.89      0.88      0.87       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.81      0.70      0.75       111
         2.0       0.71      0.81      0.76        97

    accuracy                           0.75       208
   macro avg       0.76      0.76      0.75       208
weighted avg       0.76      0.75      0.75       208

Roc_auc score
-------------------------------------------------------
0.7585

# Stacking

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [82]:
X_train = scaled_data['Robust']['train']
X_test = scaled_data['Robust']['test']
data = scaled_data['Robust']['data']

layer_one_estimators = [
            ('rf_1', RandomForestClassifier(random_state=42)),
            ('et_1', ExtraTreesClassifier(random_state=42)),
            ('xgb_1', XGBClassifier(random_state=42, eval_metric='mlogloss'))
        ]
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=LogisticRegression())
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(stacking_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(stacking_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, stacking_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  76.92307692307693 
Cross validation accuracy:  92.91451520367183
StackingClassifier(estimators=[('rf_1',
                                RandomForestClassifier(random_state=42)),
                               ('et_1', ExtraTreesClassifier(random_state=42)),
                               ('xgb_1',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              enable_categorical=False,
                                              eval_metric='mlogloss',
                                              gamma=None, gpu_id=None,
                                              importance_type=None,
                                              interaction_...
                                              learning_r

# Bagging

In [84]:
from sklearn.ensemble import BaggingClassifier

best_result_bagging = {}
cross_best_result_bagging = {}
max_accuracy_bagging = 0
max_cross_acc_bagging = 0
for method in feature_selection:
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        data = scaled_data[method['Processing']]['data'][info['Features']]
        X_train = scaled_data[method['Processing']]['train'][info['Features']]
        X_test = scaled_data[method['Processing']]['test'][info['Features']]
        
        model = BaggingClassifier(n_jobs=-1,random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy_bagging:
            max_accuracy_bagging = accuracy
            best_result_bagging = info
        if cross_acc>max_cross_acc_bagging:
            max_cross_acc_bagging = cross_acc
            cross_best_result_bagging = info

In [85]:
print(best_result_bagging)
print(cross_best_result_bagging)

{'Processing': 'Robust', 'Method': 'Random Forest Classifier', 'Features': ['AP', 'SGOT', 'SGPT', 'TB', 'Age', 'DB', 'TP'], 'Cross Validation Accuracy': 86.2966150315548, 'Accuracy': 78.84615384615384}
{'Processing': 'Robust', 'Method': 'Random Forest Classifier', 'Features': ['AP', 'SGOT', 'SGPT', 'TB', 'Age', 'DB'], 'Cross Validation Accuracy': 87.49856569133678, 'Accuracy': 65.38461538461539}


In [86]:
X_train = scaled_data[best_result_bagging['Processing']]['train'][best_result_bagging['Features']]
X_test = scaled_data[best_result_bagging['Processing']]['test'][best_result_bagging['Features']]
data = scaled_data[best_result_bagging['Processing']]['data'][best_result_bagging['Features']]
params = {
    'n_estimators': [100, 200, 300]
}
gs1 = GridSearchCV(BaggingClassifier(n_jobs=-1),
                   params, n_jobs=-1, cv=KFold(n_splits=10), scoring='accuracy')
gs1.fit(X_train, y_train)

print('Best score:', gs1.best_score_)
print('Best score:', gs1.best_params_)

Best score: 0.8333077316948285
Best score: {'n_estimators': 300}


In [88]:
gb_model = BaggingClassifier(random_state=42, n_estimators=300) 
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(gb_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(gb_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, gb_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  76.4423076923077 
Cross validation accuracy:  85.57085484796329
BaggingClassifier(n_estimators=300, random_state=42)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.79      0.76      0.77       111
         2.0       0.74      0.77      0.75        97

    accuracy                           0.76       208
   macro avg       0.76      0.76      0.76       208
weighted avg       0.77      0.76      0.76       208

Roc_auc score
----------------------------------------------------

In [87]:
gb_model = BaggingClassifier(random_state=42) 
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100

kfold = StratifiedKFold(n_splits=10)
crossval_results = cross_val_score(gb_model, data, y, cv=kfold)
cross_acc = crossval_results.mean()*100
print("Test Accuracy: ",accuracy,"\nCross validation accuracy: ", cross_acc)

print(gb_model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, gb_model.predict(X_train)))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_pred))

Test Accuracy:  78.84615384615384 
Cross validation accuracy:  86.2966150315548
BaggingClassifier(random_state=42)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       305
         2.0       1.00      1.00      1.00       319

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

         1.0       0.79      0.83      0.81       111
         2.0       0.79      0.74      0.77        97

    accuracy                           0.79       208
   macro avg       0.79      0.79      0.79       208
weighted avg       0.79      0.79      0.79       208

Roc_auc score
-------------------------------------------------------
0.785548435032