# Ensemble Classifier

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pydotplus
import collections
import pickle
import os

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from xgboost import XGBClassifier

%matplotlib inline

In [23]:
data = {}
for dataset in os.listdir('../data/preprocessed'):
    with open(f'../data/preprocessed/{dataset}', 'rb') as fp:
        data[dataset[:-4]] = pickle.load(fp)
        
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

In [15]:
models = {}
for model in os.listdir('../models'):
    with open(f'../models/{model}', 'rb') as fp:
        models[model[:-4]] = pickle.load(fp)

In [32]:
for model in models:
    print(f'{model}:\t{recall_score(y_test, models[model].predict(X_test))}')

svcp:	0.8571428571428571
svcs:	0.8571428571428571
ab:	0.8571428571428571
lr:	0.8857142857142857
svcrbf:	0.8857142857142857
rf:	0.8571428571428571
knn:	0.8285714285714286
dt:	0.8571428571428571
xgb:	0.8285714285714286
svcl:	0.8857142857142857


Here I selected the three top performing models: the sigmoid kernel from the SVCs, AdaBoost from the boosters, and the decision tree over the random forest, for a total of **3 individual models**. I combined them into a single voting classifier below.

In [33]:
clf_votehard = VotingClassifier(
    estimators=[('svcl', models['svcl']),
                ('lr', models['lr']), 
                ('ab', models['ab'])],
    voting='hard')
clf_votehard.fit(X_train, y_train)
recall_score(y_test, clf_votehard.predict(X_test))

0.8857142857142857

In [25]:
confusion_matrix(y_test, clf_votehard.predict(X_test))

array([[33,  7],
       [ 4, 31]])

This ensemble method does as well as two of the individual models, the decision tree and AdaBoost, but not as well the sigmoid SVC. I tried the same model, but with a soft voting system, giving a little more weight to the sigmoid SVC as it performed best on its own. I also included the KNN model hoping the extra diversity in models might produce a more powerful ensemble.

In [46]:
param_grid = {
#     'ab__base_estimator': [DecisionTreeClassifier(max_depth=1), LogisticRegression(solver='lbfgs', multi_class='auto')], 
#     'ab__n_estimators': [10, 30, 50, 1000], 
#     'ab__learning_rate': [.0001, .001, .01, .1],
#     'svcl__kernel': ['linear'], 
#     'svcl__C': np.linspace(.1, 1, 10), 
#     'svcl__gamma': ['scale', 'auto'],
    'weights': [[1,1,1], [3,1,1], [1,3,1], [1,1,3]]

}
gs_hard = GridSearchCV(clf_votehard, param_grid, scoring='recall', cv=5)
gs_hard.fit(X_train, y_train)

print(f'Train Recall: {gs_hard.best_score_}')
print(f'Test Recall: {recall_score(y_test, gs_hard.predict(X_test))}')
print(f'Train Accuracy: {accuracy_score(y_train, gs_hard.predict(X_train))}')
print(f'Test Accuracy: {accuracy_score(y_test, gs_hard.predict(X_test))}')
print(gs_hard.best_params_)

Train Recall: 0.7847619047619048
Test Recall: 0.8571428571428571
Train Accuracy: 0.8558558558558559
Test Accuracy: 0.84
{'weights': [1, 1, 3]}


In [39]:
clf_votesoft = VotingClassifier(
    estimators=[('svcl', models['svcl']),
                ('lr', models['lr']), 
                ('ab', models['ab'])],
    voting='soft')
clf_votesoft.fit(X_train, y_train)
recall_score(y_test, clf_votesoft.predict(X_test))

0.8857142857142857

In [40]:
confusion_matrix(y_test, clf_votesoft.predict(X_test))

array([[33,  7],
       [ 4, 31]])

Again, the model performs as well as the decision tree and AdaBoost each do alone, making this a none too impressive model. 

In [47]:
gs_soft = GridSearchCV(clf_votesoft, param_grid, scoring='recall', cv=5)
gs_soft.fit(X_train, y_train)

print(f'Train Recall: {gs_soft.best_score_}')
print(f'Test Recall: {recall_score(y_test, gs_soft.predict(X_test))}')
print(f'Train Accuracy: {accuracy_score(y_train, gs_soft.predict(X_train))}')
print(f'Test Accuracy: {accuracy_score(y_test, gs_soft.predict(X_test))}')
print(gs_soft.best_params_)

Train Recall: 0.7461904761904761
Test Recall: 0.8857142857142857
Train Accuracy: 0.8288288288288288
Test Accuracy: 0.8666666666666667
{'weights': [1, 3, 1]}
