# Machine Learning
## - Stacking Classifier
###### Por: Ricardo Reis

###### Case - MNIST

#### Carrega Pacotes

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

# Para padronização dos dados
from sklearn.preprocessing import StandardScaler

# Para separação em amostra de treino e teste
from sklearn.model_selection import train_test_split

# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

# Modelos
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Métrica de erro
from sklearn.metrics import accuracy_score

# Dados
from sklearn.datasets import fetch_openml

  import pandas.util.testing as tm


In [0]:
 mnist = fetch_openml('mnist_784')

In [3]:
mnist.DESCR

"**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  \n**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  \n**Please cite**:  \n\nThe MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  \n\nIt is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 

In [4]:
mnist.data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [0]:
mnist.target

array(['5', '0', '4', ..., '4', '5', '6'], dtype=object)

In [0]:
# Separando primeiro os dados de teste do resto
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)

# Em seguida separam-se os conjuntos de treino e validação
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

#### Treinamento

In [0]:
rf_clf = RandomForestClassifier(n_estimators=10, random_state=42)
log_clf = LogisticRegression(random_state=42)
sgd_clf = SGDClassifier(loss='modified_huber',random_state=42)

##### Voting

In [0]:
# Lista de tuplas (nome, estimador) para passar como parametro no VotingClassifier
named_estimators = [
    ("rf_clf", rf_clf),
    ("log_clf", log_clf),
    ("sgd_clf", sgd_clf),
]

In [0]:
# Modelo ensemble de random forest, logistic regression e stochastic gradient descent utilizando o hard voting
hard_voting_clf = VotingClassifier(named_estimators, voting='hard')

In [64]:
# Treinando os modelos
for clf in (rf_clf, log_clf, sgd_clf, hard_voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_train)
  print(clf.__class__.__name__, accuracy_score(y_train, y_pred))

RandomForestClassifier 0.99904


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression 0.93654
SGDClassifier 0.89408


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier 0.9561


In [0]:
# Modelo ensemble de random forest, logistic regression e stochastic gradient descent utilizando o soft voting
soft_voting_clf = VotingClassifier(named_estimators, voting='soft')

In [46]:
# Treinando os modelos
for clf in (rf_clf, log_clf, sgd_clf, soft_voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_train)
  print(clf.__class__.__name__, accuracy_score(y_train, y_pred))

RandomForestClassifier 0.99904


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression 0.93654
SGDClassifier 0.89408


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier 0.96436


##### Stacking

In [0]:
# Criando uma matriz para armazenar as previsões dos 3 algoritmos da primeira camada 
X_val_predictions = np.empty((len(X_val), len(named_estimators)), dtype=np.float32)

In [0]:
# Salvando as previsões da primeira camada
for index, estimator in enumerate([rf_clf, log_clf, sgd_clf]):
    X_val_predictions[:, index] = estimator.predict(X_val)

In [74]:
# Treinando um random forest com grid search utilizando as privisões da primeira camada como features
params = {
    "n_estimators" : [200, 500, 800, 1000],
    "max_leaf_nodes" : [60, 62, 64, 66, 68, 70],
}

rnd_forest_blender = RandomForestClassifier(n_jobs=-1, oob_score=True)
rnd_forest_blender_gs = GridSearchCV(rnd_forest_blender, param_grid=params, verbose=False)
rnd_forest_blender_gs.fit(X_val_predictions, y_val)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [75]:
rnd_forest_blender_gs.best_params_

{'max_leaf_nodes': 64, 'n_estimators': 1000}

In [76]:
rnd_forest_blender_gs.best_estimator_.oob_score_

0.9416

#### Teste

In [0]:
X_test_predictions = np.empty((len(X_val), len(named_estimators)), dtype=np.float32)

In [0]:
for index, estimator in enumerate([rf_clf, log_clf, sgd_clf]):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [0]:
y_pred = rnd_forest_blender_gs.best_estimator_.predict(X_test_predictions)

In [82]:
accuracy_score(y_test, y_pred)

0.9432