# Decision tree + Adaboost in MNIST

In [1]:
import pandas as pd
import numpy as np
from mnist import MNIST
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from utils import show_image

In [2]:
# Read both MNIST splits through the loader rooted at `datasets/mnist`.
# Each load_* call yields an (images, labels) pair.
mnist = MNIST('datasets/mnist')
(train_images, train_labels), (test_images, test_labels) = (
    mnist.load_training(),
    mnist.load_testing(),
)

# Presumably the (rows, cols) size of one image - confirm where `shape` is used.
shape = 28, 28

# 1 - Create a Decision Tree Classifier with Adaboost

## 1.1 - Split the training dataset into `training` and `validation`

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled training data as a validation split.
# The fixed random_state makes the partition repeatable across runs.
X_train, X_validate, y_train, y_validate = train_test_split(
    train_images, train_labels, test_size=0.33, random_state=42
)

## 1.2 - Create DecisionTreeClassifier with Adaboost

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learner for the ensemble.
# NOTE(review): the tree has no depth limit, and the 1.00 training scores
# reported further down suggest it memorises the training split, which would
# leave AdaBoost little to boost. Consider a shallow tree (max_depth) - confirm
# the intended setup.
dt_model = DecisionTreeClassifier()

# random_state pins the seed handed to each boosted tree, so repeated runs
# give the same model (the train/validation split above is seeded the same way).
ada_ensemble = AdaBoostClassifier(
    base_estimator=dt_model,
    n_estimators=10,
    random_state=42)

- Fit on the `training` dataset

In [7]:
# Train on the training split only; the validation split is not used here.
ada_ensemble = ada_ensemble.fit(X=X_train, y=y_train)

### 1.2.1 - validate with `training` dataset

In [8]:
from sklearn.metrics import classification_report

# Score the ensemble on the very data it was fitted on. This only shows how
# closely the model fits the training split; it says nothing about unseen data.
predictions = ada_ensemble.predict(X_train)

# Pair each true label with its prediction. Materialised as a list so the
# pairs can be read more than once (a bare zip is a one-shot iterator on
# Python 3).
predict_result = list(zip(y_train, predictions))

result_training = pd.DataFrame(predict_result, columns=["y","ŷ"])

print(classification_report(result_training["y"],result_training["ŷ"]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3977
           1       1.00      1.00      1.00      4524
           2       1.00      1.00      1.00      4017
           3       1.00      1.00      1.00      4096
           4       1.00      1.00      1.00      3924
           5       1.00      1.00      1.00      3622
           6       1.00      1.00      1.00      3938
           7       1.00      1.00      1.00      4144
           8       1.00      1.00      1.00      3957
           9       1.00      1.00      1.00      4001

   micro avg       1.00      1.00      1.00     40200
   macro avg       1.00      1.00      1.00     40200
weighted avg       1.00      1.00      1.00     40200



### 1.2.2 - validate with `validate` dataset

In [10]:
# Score the ensemble on the held-out validation split.
predictions = ada_ensemble.predict(X_validate)

# Pair each true label with its prediction. Materialised as a list so the
# pairs can be read more than once (a bare zip is a one-shot iterator on
# Python 3).
predict_result = list(zip(y_validate, predictions))

result_validate = pd.DataFrame(predict_result, columns=["y","ŷ"])

print(classification_report(result_validate["y"],result_validate["ŷ"]))

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      1946
           1       0.94      0.95      0.95      2218
           2       0.84      0.84      0.84      1941
           3       0.82      0.82      0.82      2035
           4       0.85      0.86      0.85      1918
           5       0.81      0.82      0.82      1799
           6       0.90      0.89      0.89      1980
           7       0.90      0.89      0.90      2121
           8       0.79      0.78      0.78      1894
           9       0.81      0.81      0.81      1948

   micro avg       0.86      0.86      0.86     19800
   macro avg       0.86      0.86      0.86     19800
weighted avg       0.86      0.86      0.86     19800



## 1.3 - Cross-validation


In [11]:
from sklearn.model_selection import cross_val_score

# Fresh, unfitted ensemble for cross-validation. random_state pins the seed
# handed to each boosted tree so the fold scores are repeatable between runs.
ada_ensemble = AdaBoostClassifier(
    base_estimator=dt_model,
    n_estimators=10,
    random_state=42)

# 20-fold cross-validation over the complete labelled training set
# (i.e. before it was divided into training and validation parts).
cv_result = cross_val_score(ada_ensemble, list(train_images), list(train_labels), cv=20)

In [12]:
# Show the per-fold scores followed by their average.
cv_mean = cv_result.mean()
print("""
Accuracy:
{}
Acurracy Mean: {}
""".format(cv_result, cv_mean))


Accuracy:
[0.8762475  0.87720466 0.86846487 0.86775483 0.85143238 0.85838054
 0.88370543 0.87337554 0.87070976 0.87233333 0.859      0.87791861
 0.86724483 0.8712475  0.87791861 0.87024683 0.86591061 0.88525684
 0.88718291 0.88384513]
Acurracy Mean: 0.872269035853



# 2 - Test the classifier with the `test` dataset

In [13]:
ada_ensemble = AdaBoostClassifier(base_estimator=dt_model, n_estimators=10)
dt_model = dt_model.fit(train_images,train_labels)

In [14]:
predict_result = []

predictions = dt_model.predict(test_images)

predict_result = zip(test_labels, predictions)

result_validate = pd.DataFrame(predict_result, columns=["y","ŷ"])
print(" === RESULT === ")
print(classification_report(result_validate["y"],result_validate["ŷ"]))

 === RESULT === 
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       980
           1       0.96      0.96      0.96      1135
           2       0.88      0.85      0.86      1032
           3       0.83      0.85      0.84      1010
           4       0.89      0.88      0.88       982
           5       0.84      0.83      0.84       892
           6       0.89      0.89      0.89       958
           7       0.90      0.90      0.90      1028
           8       0.82      0.81      0.82       974
           9       0.85      0.86      0.85      1009

   micro avg       0.88      0.88      0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

