In [168]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, train_test_split
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingCVClassifier
from mlxtend.plotting import plot_decision_regions

import warnings
warnings.filterwarnings("ignore")

### Dados

In [2]:
# Build the dataset path portably, then load the glass CSV into a DataFrame.
glass_csv_path = os.path.join('Dados', 'glass', 'glass.csv')
glass = pd.read_csv(glass_csv_path)

In [3]:
# Preview the first five rows to sanity-check the columns that were read.
glass.head(n=5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
glass.shape

(214, 10)

In [15]:
# Feature matrix: the first nine columns by position.
X = glass.iloc[:, :9]
# Target vector: the 'Type' column.
y = glass['Type']

### Decision Tree
Como o sklearn implementa o CART, e não o C4.5, foi utilizado o critério de entropia para aproximar o comportamento do C4.5, cujo critério de divisão é baseado em entropia.

In [64]:
# Base decision tree: entropy as the split criterion, with a fixed seed so
# repeated runs build the same tree.
dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)

In [65]:
# Search grid for the decision tree: split strategy crossed with maximum depth.
parameters = {
    'splitter': ['random', 'best'],
    'max_depth': [2, 3, 4, 5, 10, 20, 30, 60],
}

In [66]:
# Exhaustive search over the tree grid, scored by macro-averaged F1 under
# 10-fold cross-validation.
model = GridSearchCV(estimator=dt, param_grid=parameters, scoring='f1_macro', cv=10)
model.fit(X, y)

# Keep the refitted best tree: later cells reuse it as the base learner.
dt_base_estimator = model.best_estimator_

print('F1-macro: %.2f' % model.best_score_)
print('Melhores parâmetros: ' + str(dt_base_estimator))

F1-macro: 0.64
Melhores parâmetros: DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


### Ensembles
- Bagging -> BaggingClassifier
- Boosting -> AdaBoostClassifier
- Random Forest -> RandomForestClassifier
- XGBOOST -> XGBClassifier
- Stacking -> StackingCVClassifier: Combinação de todos os ensembles testados e a decision tree.

- Tanto o bagging quanto o boosting utilizaram como classificador base a decision tree criada anteriormente.


#### Bagging

In [74]:
# Bagging over the tuned decision tree; only the ensemble size is searched.
parameters = {'n_estimators': [10, 20, 40, 80, 160, 320]}

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# removed in 1.4), while the first positional parameter is the base learner in
# both the old and the new signatures.
bagging = BaggingClassifier(dt_base_estimator, random_state=42)

# 10-fold CV grid search scored by macro-F1, using all available cores.
model = GridSearchCV(bagging, parameters, cv=10, scoring='f1_macro', n_jobs=-1)
model.fit(X, y)

# Keep the refitted best ensemble for the stacking comparison further down.
bagging_model = model.best_estimator_
print('F1-macro: %.2f' % model.best_score_)
print('Melhores parâmetros: ' + str(model.best_estimator_))

F1-macro: 0.68
Melhores parâmetros: BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=20, n_jobs=None, oob_score=False,
         random_state=42, verbose=0, warm_start=False)


#### Boosting

In [73]:
# AdaBoost over the tuned decision tree; ensemble size and learning rate are searched.
# NOTE(review): every learning rate in this grid is >= 1.0, which is unusually
# large for AdaBoost - confirm that values below 1.0 were left out on purpose.
parameters = {'n_estimators': [10, 20, 40, 80, 160, 320, 640],
              'learning_rate': [1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]}

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# removed in 1.4), while the first positional parameter is the base learner in
# both the old and the new signatures.
boosting = AdaBoostClassifier(dt_base_estimator, random_state=42)

# 10-fold CV grid search scored by macro-F1, using all available cores.
model = GridSearchCV(boosting, parameters, cv=10, scoring='f1_macro', n_jobs=-1)
model.fit(X, y)

# Keep the refitted best ensemble for the stacking comparison further down.
boosting_model = model.best_estimator_
print('F1-macro: %.2f' % model.best_score_)
print('Melhores parâmetros: ' + str(model.best_estimator_))

F1-macro: 0.62
Melhores parâmetros: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=32.0, n_estimators=10, random_state=42)


#### Random Forest

In [75]:
# Random forest grid: 7 x 7 x 7 x 4 x 2 = 2744 candidate settings, each
# evaluated with 10-fold cross-validation, so this cell is expensive to run.
parameters = {
    'n_estimators': [10, 20, 40, 80, 160, 320, 640],
    'max_depth': [10, 20, 40, 80, 160, 320, 640],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    'bootstrap': [True, False],
}

# Entropy criterion, matching the single decision tree; fixed seed for repeatability.
rf = RandomForestClassifier(random_state=42, criterion='entropy')

# Macro-F1 grid search spread across all available cores.
model = GridSearchCV(estimator=rf, param_grid=parameters, scoring='f1_macro', cv=10, n_jobs=-1)
model.fit(X, y)

# Keep the refitted best forest for the stacking comparison further down.
rf_model = model.best_estimator_
print('F1-macro: %.2f' % model.best_score_)
print('Melhores parâmetros: ' + str(rf_model))

F1-macro: 0.72
Melhores parâmetros: RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=20, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=None, oob_score=False, random_state=42,
            verbose=0, warm_start=False)


#### XGBOOST

In [81]:
# XGBoost grid: booster type x learning rate x maximum tree depth.
parameters = {
    'booster': ['gbtree', 'dart'],
    'learning_rate': [0.1, 0.2, 0.3, 0.4],
    'max_depth': [3, 6, 12, 24],
}

xgboost = XGBClassifier(eval_metric='auc', seed=42)

# Macro-F1 grid search under 10-fold CV, spread across all available cores.
model = GridSearchCV(estimator=xgboost, param_grid=parameters, scoring='f1_macro', cv=10, n_jobs=-1)
model.fit(X, y)

# Keep the refitted best booster for the stacking comparison further down.
xgboost_model = model.best_estimator_
print('F1-macro: %.2f' % model.best_score_)
print('Melhores parâmetros: ' + str(xgboost_model))

F1-macro: 0.67
Melhores parâmetros: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eval_metric='auc', gamma=0,
       learning_rate=0.3, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=None,
       subsample=1, verbosity=1)


#### STACKING

In [183]:
# Stacking: a logistic-regression meta-classifier is trained on the class
# probabilities (use_probas=True) produced by the tuned base models; the
# original features are not passed on to it (use_features_in_secondary=False).
lr = LogisticRegression(C=4)
base_models = [dt_base_estimator, bagging_model, boosting_model, rf_model, xgboost_model]
sclf = StackingCVClassifier(classifiers=base_models,
                            meta_classifier=lr, use_probas=True, random_state=42, use_features_in_secondary=False)

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# NumPy array on both old and new pandas. It is computed once rather than on
# every loop iteration.
X_array = X.values

labels = ['Decision Tree', 'Bagging', 'AdaBoost', 'Random Forest', 'XGBOOST', 'StackingClassifier']

# Score every base model and the stacked ensemble with the same 10-fold
# macro-F1 protocol so the printed numbers are directly comparable.
for clf, label in zip(base_models + [sclf], labels):
    scores = cross_val_score(clf, X_array, y, cv=10, scoring='f1_macro')
    print("F1-macro: %.2f [%s]" % (scores.mean(), label))

F1-macro: 0.64 [Decision Tree]
F1-macro: 0.68 [Bagging]
F1-macro: 0.62 [AdaBoost]
F1-macro: 0.72 [Random Forest]
F1-macro: 0.67 [XGBOOST]
F1-macro: 0.71 [StackingClassifier]
