- Use uma base de dados para classificação com pelo menos 1000 amostras;
- Escolha pelo menos três algoritmos de classificação;
- Combine os classificadores de duas formas diferentes:
    - Voting
    - Stacking
- Use gridsearch (ou randomsearch) para ajustar tanto os classificadores fracos quanto o ensemble;

#Importando Dataset

In [1]:
from sklearn.datasets import load_digits
#sklearn.datasets.load_digits(*, n_class=10, return_X_y=False, as_frame=False)
#Classes: 10
#Samples per class: ~180
#Samples total: 1797
#Dimensionality: 64
#Features: integers 0-16

In [2]:
from sklearn.model_selection import train_test_split

# Load the digits dataset as plain (features, labels) arrays.
X, y = load_digits(return_X_y=True)

# Hold-out split with train_test_split's default proportions; the seed is
# pinned so every model below is evaluated on the same partition.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

# Show the shape of each partition as a sanity check.
tuple(part.shape for part in (X_tr, X_te, y_tr, y_te))

((1347, 64), (450, 64), (1347,), (450,))

#Verificações gerais

##XGBoost

In [3]:
from xgboost import XGBClassifier
# Baseline: XGBoost with library defaults, scored on the held-out split.
modelo = XGBClassifier().fit(X_tr, y_tr)
xgb_pr = modelo.predict(X_te)
xgbhits = xgb_pr == y_te  # boolean mask: True where the prediction is correct
xgbhits.mean()  # accuracy = share of True entries in the mask

0.9688888888888889

##KNN

In [4]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: k-nearest neighbours with library defaults, scored on the
# held-out split.
modelo = KNeighborsClassifier().fit(X_tr, y_tr)
knn_pr = modelo.predict(X_te)
knnhits = knn_pr == y_te  # boolean mask: True where the prediction is correct
knnhits.mean()  # accuracy = share of True entries in the mask

0.9933333333333333

##Random Forest

In [5]:
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Baseline: random forest with a fixed seed and otherwise default settings,
# scored on the held-out split.
modelo = RandomForestClassifier(random_state=42).fit(X_tr, y_tr)
dt_pr = modelo.predict(X_te)
dthits = dt_pr == y_te  # boolean mask: True where the prediction is correct
dthits.mean()  # accuracy = share of True entries in the mask

0.9711111111111111

##Voting Classifier

In [6]:
from sklearn.ensemble import VotingClassifier
# Baseline ensemble: combine the three classifiers with VotingClassifier,
# using its default voting mode, and score it on the held-out split.
voting = VotingClassifier(
    estimators=[
        ('xgboost', XGBClassifier(colsample_bynode=0.01, learning_rate=0.15, random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier(random_state=42)),
    ],
).fit(X_tr, y_tr)

vo_pr = voting.predict(X_te)
vohits = vo_pr == y_te  # boolean mask: True where the prediction is correct
vohits.mean()  # accuracy = share of True entries in the mask

0.9822222222222222

##Stacking Classifier

In [9]:
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category
# is shown (presumably to keep the output of the ensemble fits readable).
# NOTE(review): this also hides deprecation and convergence warnings;
# consider narrowing the filter by category or module.
warnings.filterwarnings('ignore')

In [10]:
from sklearn.ensemble import StackingClassifier

# Baseline stacked ensemble scored on the held-out split.
# cv=3 sets the internal cross-validation used to build the inputs of the
# final estimator; passthrough=True additionally feeds the original features
# to that final estimator.
stacking = StackingClassifier(
    estimators=[
        ('xgboost', XGBClassifier(colsample_bynode=0.01, learning_rate=0.15, random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('dtrf', RandomForestClassifier(random_state=42)),
    ],
    cv=3,
    passthrough=True,
)
stacking.fit(X_tr, y_tr)

sc_pr = stacking.predict(X_te)
schits = sc_pr == y_te  # boolean mask: True where the prediction is correct
schits.mean()  # accuracy = share of True entries in the mask

0.9755555555555555

- Combine os classificadores de duas formas diferentes:
    - Voting
    - Stacking
- Use gridsearch (ou randomsearch) para ajustar tanto os classificadores fracos quanto o ensemble;

#Gridsearch - Classificadores
  - O KNN já obteve um resultado muito bom com os parâmetros default, por isso não passou por gridsearch

##XGBoost

In [11]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyperparameter search for XGBoost on the training split.
parametros = dict(
    colsample_bynode=[0.01, 0.2],
    learning_rate=[0.15, 0.3],
    random_state=[42],  # single value: pins the seed rather than tuning it
    max_depth=[6, 10, 27],
)

modelo_xgboost = GridSearchCV(XGBClassifier(), param_grid=parametros)
modelo_xgboost.fit(X_tr, y_tr)

# Score the search object (it predicts with the refitted best candidate)
# on the held-out split.
dt_xg = modelo_xgboost.predict(X_te)
xghits = dt_xg == y_te  # boolean mask: True where the prediction is correct
xghits.mean()  # accuracy = share of True entries in the mask

0.98

##RF

In [12]:
# Exhaustive hyperparameter search for the random forest on the training split.
parametros = dict(
    max_depth=[15, 25, 50],
    criterion=['gini', 'entropy'],
    n_estimators=[100],  # single value: held fixed, not tuned
    random_state=[42],   # single value: pins the seed rather than tuning it
    min_samples_split=[2, 8, 18],
    bootstrap=[True, False],
)

modelo_RandomForest = GridSearchCV(RandomForestClassifier(), param_grid=parametros)
modelo_RandomForest.fit(X_tr, y_tr)

# Score the search object (it predicts with the refitted best candidate)
# on the held-out split.
dt_pr = modelo_RandomForest.predict(X_te)
dthits = dt_pr == y_te  # boolean mask: True where the prediction is correct
dthits.mean()  # accuracy = share of True entries in the mask

0.9777777777777777

#GridSearch - Ensemble 

##Voting

In [15]:
from sklearn.ensemble import VotingClassifier
# Tune the ensemble itself: compare 'hard' and 'soft' voting.
#
# The members are the already-tuned winners of the XGBoost and random-forest
# searches (``best_estimator_``), not the GridSearchCV objects themselves.
# Passing the search objects would re-run each inner grid search inside every
# fold and every candidate of this outer search, multiplying the runtime
# without adding anything to what is tuned here.
voting = VotingClassifier(estimators=[
    ('xgboost', modelo_xgboost.best_estimator_),
    ('knn', KNeighborsClassifier()),
    ('rf', modelo_RandomForest.best_estimator_),
])

parametros = {
    'voting': ['hard', 'soft'],
}

voting_search = GridSearchCV(voting, param_grid=parametros)
voting_search.fit(X_tr, y_tr)

# Score the selected voting mode on the held-out split.
vo_pr = voting_search.predict(X_te)
vohits = vo_pr == y_te  # boolean mask: True where the prediction is correct
vohits.mean()  # accuracy = share of True entries in the mask

#########################################
##  Results recorded with the original setup (GridSearchCV objects nested
##  as ensemble members); re-run this cell to refresh them:
##  Before grid search - 0.98
##  After grid search  - 0.9866666666666667
#########################################

0.9866666666666667

##Stacking

In [13]:
from sklearn.ensemble import StackingClassifier

# Tune the stacked ensemble: number of internal CV folds and whether the
# final estimator also receives the original features (passthrough).
#
# The members are the already-tuned winners of the XGBoost and random-forest
# searches (``best_estimator_``), not the GridSearchCV objects themselves.
# Passing the search objects would re-run each inner grid search for every
# outer fold, every candidate and every internal stacking fold, which is what
# made the original version of this cell take hours.
stacking = StackingClassifier(estimators=[
    ('xgboost', modelo_xgboost.best_estimator_),
    ('knn', KNeighborsClassifier()),
    ('rf', modelo_RandomForest.best_estimator_),
])

parametros = {
    'cv': [3, 5, 8],
    'passthrough': [True, False],
}

stacking_search = GridSearchCV(stacking, param_grid=parametros)
stacking_search.fit(X_tr, y_tr)

# Score the selected configuration on the held-out split.
sc_pr = stacking_search.predict(X_te)
schits = sc_pr == y_te  # boolean mask: True where the prediction is correct
schits.mean()  # accuracy = share of True entries in the mask

#################################################################
## Results recorded with the original setup (GridSearchCV objects nested
## as ensemble members); re-run this cell to refresh them:
## Before grid search - 0.9777777777777777
## After grid search  - 0.9888888888888889 (04:10h of execution)
#################################################################

0.9888888888888889