# Ensemble Vote Classifier

Agora que desenvolvemos alguns modelos relativamente eficazes vamos tentar combinar as suas previsões de modo a obtermos um modelo ainda mais eficaz!

In [37]:
from sklearn.datasets import load_iris
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from copy import deepcopy
from random import randint
from itertools import combinations 

train = pd.read_csv('prep_train.csv')


avg = 0
iss = []
while len(iss) < 5:
    r = randint(0,1000)
    if r not in iss:        
        X_train, X_test, y_train, y_test = train_test_split(train.drop(['Absent'],axis=1), 
                                                            train['Absent'], test_size=0.30, 
                                                            random_state=r)

        pipe1 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Transportation expense', 'Age', 'Hit target', 'Height']), GaussianNB(priors=None, var_smoothing=1e-09))
        pipe2 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Son', 'Weight', 'Body mass index']), RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=20, max_features=None, min_impurity_decrease=0.001, min_samples_split=16, n_estimators=20, oob_score=False))
        pipe3 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Disciplinary failure']), LogisticRegression(C=0.1, dual=False, fit_intercept=True, intercept_scaling=0.1, l1_ratio=0.1, max_iter=10, multi_class= 'ovr', penalty='l2', solver='liblinear', tol=0.1))
        pipe4 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Age', 'Work load Average/day ', 'Hit target', 'Height', 'Body mass index']), AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), learning_rate=0.5, n_estimators=100))
        pipe5 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Distance from Residence to Work', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Weight']), GradientBoostingClassifier(learning_rate=0.01, loss='deviance', max_depth=3, max_features='auto', min_impurity_decrease=0.1, min_samples_split=2, n_estimators=100))
        pipe6 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Work load Average/day ', 'Disciplinary failure', 'Education', 'Son', 'Weight', 'Height', 'Body mass index']), LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None))
        pipe7 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Education', 'Son', 'Weight', 'Height', 'Body mass index']), MLPClassifier(hidden_layer_sizes=(32,16)))

        eclf = EnsembleVoteClassifier(clfs=[pipe2, pipe6, pipe1, pipe3, pipe4, pipe5])
        
        
        eclf.fit(X_train,y_train)
        predictions = eclf.predict(X_test)
        avg = (avg*len(iss) + accuracy_score(y_test,predictions))/float(len(iss) + 1)
        iss.append(r)
print(avg)


  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))


0.8693333333333335


In [41]:
X_test = pd.read_csv('prep_test.csv')
y_test = pd.read_csv('problem_info/sample_submission.csv').drop('ID', axis=1)

train = pd.read_csv('prep_train.csv')
X_train = train.drop('Absent', axis=1)
y_train = train['Absent']

'''
['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Education', 'Son', 'Weight', 'Height', 'Body mass index']
'''

pipe1 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Transportation expense', 'Age', 'Hit target', 'Height']), GaussianNB(priors=None, var_smoothing=1e-09))
pipe2 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Son', 'Weight', 'Body mass index']), RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=20, max_features=None, min_impurity_decrease=0.001, min_samples_split=8, n_estimators=5, oob_score=False))
pipe3 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Disciplinary failure']), LogisticRegression(C=1, dual=True, fit_intercept=True, intercept_scaling=10, max_iter=10, multi_class= 'ovr', penalty='l2', solver='liblinear', tol=0.0001))
pipe4 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Age', 'Work load Average/day ', 'Hit target', 'Height', 'Body mass index']), AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), learning_rate=1, n_estimators=100))
pipe5 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Distance from Residence to Work', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Weight']), GradientBoostingClassifier(learning_rate=1, loss='deviance', max_depth=None, max_features=None, min_impurity_decrease=1e-5, min_samples_split=4))
pipe6 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Work load Average/day ', 'Disciplinary failure', 'Education', 'Son', 'Weight', 'Height', 'Body mass index']), LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'))
pipe7 = make_pipeline(ColumnSelector(cols=['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Education', 'Son', 'Weight', 'Height', 'Body mass index']), MLPClassifier(activation='tanh', alpha=0.0001, learning_rate_init=0.01, max_iter=100, solver='lbfgs', hidden_layer_sizes=(8,4)))

clfs= [ pipe1, pipe2, pipe3, pipe4, pipe5, pipe6, pipe7]

'''
maxx = 0
for i in range(1, len(clfs) + 1):
    print(i)
    for c in combinations(clfs, i):
        eclf = EnsembleVoteClassifier(clfs=deepcopy(c))

        eclf.fit(X_train,y_train)
        predictions = eclf.predict(X_test)
        
        if accuracy_score(y_test,predictions) > maxx:
            maxx = accuracy_score(y_test,predictions)
            best = c
'''
eclf = EnsembleVoteClassifier(clfs=clfs)

eclf.fit(X_train,y_train)
predictions = eclf.predict(X_test)
for x in best:
    print(x[1].__class__.__name__, end=', ')
print('')
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))



LogisticRegression, GradientBoostingClassifier, LinearDiscriminantAnalysis, 
              precision    recall  f1-score   support

           0       0.36      0.09      0.15        44
           1       0.83      0.96      0.89       196

    accuracy                           0.80       240
   macro avg       0.59      0.53      0.52       240
weighted avg       0.74      0.80      0.75       240

[[  4  40]
 [  7 189]]
0.8041666666666667


1
