In [1]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

# libraries
import numpy as np
import pandas as pd

# For Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on a data file that comes from a trusted source.
# The four objects are read back-to-back from the same file handle, so the
# order of the loads below must match the order in which they were dumped.
# The `with` block guarantees the handle is closed even if a load fails.
with open("data_risk2_dummy", "rb") as file:
    X_train = pickle.load(file)
    y_train = pickle.load(file)
    X_test = pickle.load(file)
    y_test = pickle.load(file)

In [3]:
# Preview the first two rows of the training features
X_train.head(2)

Unnamed: 0,menopaus_0,menopaus_1,agegrp_2,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
# Preview the first two rows of the test features
X_test.head(2)

Unnamed: 0,menopaus_0,menopaus_1,agegrp_2,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


# Voting Ensemble Model on Cancer Data

### Default Hyperparameters

We have already tuned the hyperparameters of the individual models used in the ensemble; those tuned values are used below.

In [5]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score

def model_stats(X, y, y_pred, model, plot_roc=False):
    """Print classification metrics for a set of binary predictions.

    Prints the confusion matrix, accuracy, sensitivity, specificity, their
    average, precision, the Matthews correlation coefficient and the F1, F2
    and F3 scores. Nothing is returned.

    Parameters
    ----------
    X : features the predictions were made on. Only used when ``plot_roc``
        is true.
    y : true labels. The confusion matrix is indexed as a 2x2 matrix, with
        row/column 1 treated as the positive class.
    y_pred : predicted labels for ``X``.
    model : fitted estimator that produced ``y_pred``. Only used when
        ``plot_roc`` is true.
    plot_roc : if true and ``model`` exposes ``predict_proba``, also print
        the ROC-AUC and draw the ROC curve. Defaults to False, which keeps
        the output limited to the label-based metrics above.
    """
    cm = confusion_matrix(y, y_pred)
    print("Confusion Matrix ->")
    print(cm)

    TP = cm[1,1] # true positive
    TN = cm[0,0] # true negatives
    FP = cm[0,1] # false positives
    FN = cm[1,0] # false negatives

    print("Accuracy:",accuracy_score(y, y_pred))

    sensi = TP / float(TP+FN)
    speci = TN / float(TN+FP)

    # Sensitivity: share of actual positives that were predicted positive
    print("Sensitivity:", sensi)

    # Specificity: share of actual negatives that were predicted negative
    print("Specificity:", speci)

    print("Average:", (sensi+speci)/2)

    # Precision: share of predicted positives that are actual positives
    print("Precision",TP / float(TP+FP))
    print("MCC ", metrics.matthews_corrcoef(y, y_pred))
    print("F1 ", metrics.fbeta_score(y, y_pred, beta = 1))
    print("F2 ", metrics.fbeta_score(y, y_pred, beta = 2))
    print("F3 ", metrics.fbeta_score(y, y_pred, beta = 3))

    if not plot_roc:
        return

    # ROC needs class probabilities; some estimators (e.g. a hard-voting
    # ensemble) do not provide them, so skip instead of raising.
    if not hasattr(model, "predict_proba"):
        print("ROC-AUC : not available (model has no predict_proba)")
        return

    # Probability of the positive class (column 1)
    pred_probs = model.predict_proba(X)[:,1]
    auc_score = roc_auc_score(y, pred_probs)
    print("ROC-AUC : {:2.2}".format(auc_score))

    # Plotting ROC curve
    fpr, tpr, thresholds = roc_curve(y, pred_probs, drop_intermediate = False )
    plt.figure(figsize=(6, 6))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    

In [6]:
# Importing the base classifiers, the voting ensemble and the cross-validation utilities
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from numpy import mean


In [7]:
# code has been written with the help of https://machinelearningmastery.com/voting-ensembles-with-python/
def _base_estimators():
    """Return fresh ``(name, estimator)`` pairs for the base classifiers.

    Single source of truth for the hyperparameters, so the stand-alone
    models and the members of the voting ensemble cannot drift apart.
    Every call builds new, unfitted estimator objects.
    """
    return [
        ('XGB', XGBClassifier(learning_rate =0.1, n_estimators=300, max_depth=5, min_child_weight=6, gamma=0,
                              subsample=0.9, colsample_bytree=0.9, objective= 'binary:logistic', nthread=4,
                              reg_alpha=0, scale_pos_weight=25, seed=27)),
        ('LR', LogisticRegression(C=0.1, class_weight='balanced')),
        ('SGD', SGDClassifier(class_weight = 'balanced', max_iter=200, penalty='l1', alpha=0.001,
                              random_state = 100)),
        # Random forest is currently left out of the comparison and the ensemble:
        #('RF', RandomForestClassifier(random_state = 100, n_estimators=50 ,max_depth=8 ,min_samples_leaf=200,
        #                        min_samples_split=200, max_samples=0.8, class_weight='balanced')),
    ]

def get_voting():
    """Build an unfitted hard-voting ensemble over the base classifiers."""
    ensemble = VotingClassifier(estimators = _base_estimators(), voting='hard')
    return ensemble

def get_models():
    """Return a dict of the models to compare, keyed by short name.

    Contains each base classifier on its own plus the voting ensemble under
    the key ``'ensemble'``. The ensemble is built from its own fresh copies
    of the base classifiers, so no estimator object is shared between entries.
    """
    models = dict(_base_estimators())
    models['ensemble'] = get_voting()
    return models

def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    Uses a repeated stratified 5-fold scheme (3 repeats, fixed random state)
    and scores each fold with balanced accuracy, running folds in parallel.
    """
    cv_scheme = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=100)
    return cross_val_score(
        model,
        X,
        y,
        scoring='balanced_accuracy',
        cv=cv_scheme,
        n_jobs=-1,
        verbose=1,
    )

In [8]:
# Cross-validate every candidate model (base classifiers and the ensemble)
# on the training data and report its mean balanced accuracy.
models = get_models()
# Per-model score arrays and their names, collected in the same order
results, names = list(), list()
for name, model in models.items():
    # One score per cross-validation fold (see evaluate_model)
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    print(name,"->", mean(scores))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


XGB -> 0.7292990946513821


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    9.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


LR -> 0.6924496271629607


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    7.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SGD -> 0.6866303385851895
ensemble -> 0.6997588517952426


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.1min finished


In [9]:
# Fit the voting ensemble on the full training set.
# cross_val_score works on clones of the estimator, so the object stored in
# `models` is still unfitted at this point and has to be fitted explicitly.
models['ensemble'].fit(X_train,y_train)

VotingClassifier(estimators=[('XGB',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.9, gamma=0,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.1,
                                            max_delta_step=None, max_depth=5,
                                            min_child_weight=6, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=300, n_jobs=None,
                                            nthread=4, num_parallel_tree=None,
                                            random_state=None, reg_alpha=0,
    

In [10]:
# Let's check the evaluation metrics of our default model

# Making predictions with the fitted voting ensemble; these predicted labels
# are passed to model_stats in the evaluation cells that follow
y_pred_train_def = models['ensemble'].predict(X_train) # For train data
y_pred_test_def = models['ensemble'].predict(X_test) # For test data

#### Evaluating ensemble model on train data

In [11]:
# Print the evaluation metrics of the ensemble's predictions on the train data
model_stats(X_train, y_train, y_pred_train_def, models['ensemble'])

Confusion Matrix ->
[[75105 34431]
 [ 1184  3109]]
Accuracy: 0.687118396893586
Sensitivity: 0.7242021896109947
Specificity: 0.6856649868536372
Average: 0.704933588232316
Precision 0.08281832711774108
MCC  0.16608252070478338
F1  0.1486386345707934
F2  0.2841241409562802
F3  0.4081284377174213


#### Evaluating ensemble model on test data

In [12]:
# Print the evaluation metrics of the ensemble's predictions on the test data
model_stats(X_test, y_test, y_pred_test_def, models['ensemble'])

Confusion Matrix ->
[[34882 23048]
 [  410  1161]]
Accuracy: 0.6057545251340314
Sensitivity: 0.7390197326543603
Specificity: 0.6021405144139479
Average: 0.6705801235341541
Precision 0.047957371225577264
MCC  0.11134525559265625
F1  0.09006982156710627
F2  0.19037156068605907
F3  0.30275372900803166
