# Ensemble Vote Classifier

Agora que desenvolvemos alguns modelos relativamente eficazes, vamos tentar combinar as suas previsões de modo a obtermos um modelo ainda mais eficaz!

In [50]:
from sklearn.datasets import load_iris
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from copy import deepcopy

# Read the training table from the working directory (presumably the
# already-preprocessed data, judging by the file name).
train = pd.read_csv(filepath_or_buffer='prep_train.csv')
# Preview the first ten rows. NOTE(review): the result is only rendered when
# this is the last expression of a notebook cell; otherwise it is discarded.
train.head(n=10)

# Seed shared by the randomised estimators below so that repeated runs give
# the same fitted models (same value as the train/test split seed).
RANDOM_STATE = 101

# Full set of predictor columns used by the tree-based ensembles.
# The trailing space in 'Work load Average/day ' is part of the column name.
ALL_FEATURES = [
    'Reason for absence',
    'Month of absence',
    'Day of the week',
    'Seasons',
    'Transportation expense',
    'Distance from Residence to Work',
    'Service time',
    'Age',
    'Work load Average/day ',
    'Hit target',
    'Disciplinary failure',
    'Education',
    'Son',
    'Weight',
    'Height',
    'Body mass index',
]

# Each pipeline selects its own column subset and then applies one classifier.

# pipe1: Gaussian naive Bayes on a reduced set of eight columns.
pipe1 = make_pipeline(
    ColumnSelector(cols=[
        'Reason for absence',
        'Month of absence',
        'Day of the week',
        'Seasons',
        'Distance from Residence to Work',
        'Work load Average/day ',
        'Education',
        'Weight',
    ]),
    GaussianNB(priors=None, var_smoothing=1e-09),
)

# pipe2: random forest on every predictor column.
pipe2 = make_pipeline(
    ColumnSelector(cols=list(ALL_FEATURES)),
    RandomForestClassifier(
        bootstrap=True,
        criterion='gini',
        max_depth=10,
        max_features=None,
        min_impurity_decrease=0,
        min_samples_split=8,
        n_estimators=100,
        oob_score=False,
        random_state=RANDOM_STATE,
    ),
)

# pipe3: logistic regression on five columns.
# NOTE(review): penalty='none' is the spelling accepted by older scikit-learn
# releases; newer releases expect penalty=None instead. With no penalty the
# C and l1_ratio values have no effect. Confirm against the installed version
# before fitting this pipeline.
pipe3 = make_pipeline(
    ColumnSelector(cols=[
        'Reason for absence',
        'Age',
        'Work load Average/day ',
        'Disciplinary failure',
        'Son',
    ]),
    LogisticRegression(
        C=0.1,
        fit_intercept=True,
        intercept_scaling=0.1,
        l1_ratio=0.1,
        max_iter=10,
        multi_class='ovr',
        penalty='none',
        solver='newton-cg',
        tol=10,
    ),
)

# pipe4: AdaBoost on every predictor column.
pipe4 = make_pipeline(
    ColumnSelector(cols=list(ALL_FEATURES)),
    AdaBoostClassifier(
        learning_rate=1,
        n_estimators=50,
        random_state=RANDOM_STATE,
    ),
)

# pipe5: gradient boosting on every predictor column.
pipe5 = make_pipeline(
    ColumnSelector(cols=list(ALL_FEATURES)),
    GradientBoostingClassifier(
        learning_rate=0.5,
        loss='exponential',
        max_depth=3,
        max_features='log2',
        min_impurity_decrease=0.001,
        min_samples_split=2,
        n_estimators=20,
        random_state=RANDOM_STATE,
    ),
)

# pipe6: linear discriminant analysis on thirteen columns.
pipe6 = make_pipeline(
    ColumnSelector(cols=[
        'Reason for absence',
        'Month of absence',
        'Transportation expense',
        'Distance from Residence to Work',
        'Service time',
        'Age',
        'Work load Average/day ',
        'Hit target',
        'Disciplinary failure',
        'Son',
        'Weight',
        'Height',
        'Body mass index',
    ]),
    LinearDiscriminantAnalysis(solver='svd', store_covariance=True, tol=0.0001),
)


# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs. 'Absent' is the target, every other column is input.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['Absent']),
    train['Absent'],
    test_size=0.30,
    random_state=101,
)

# Voting ensemble over three of the pipelines (naive Bayes, random forest and
# LDA). No voting or weights arguments are passed, so the library defaults
# apply.
eclf = EnsembleVoteClassifier(clfs=[pipe1, pipe2, pipe6])
eclf.fit(X_train, y_train)

# Score the held-out rows and print, in order: the per-class report, the
# confusion matrix and the overall accuracy.
predictions = eclf.predict(X_test)
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, predictions))



              precision    recall  f1-score   support

         0.0       0.84      0.62      0.71        26
         1.0       0.92      0.98      0.95       124

    accuracy                           0.91       150
   macro avg       0.88      0.80      0.83       150
weighted avg       0.91      0.91      0.91       150

[[ 16  10]
 [  3 121]]
0.9133333333333333
