# Decision tree on MNIST

In [1]:
import pandas as pd
import numpy as np
from mnist import MNIST
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from utils import show_image

In [2]:
# Open the MNIST files stored under the local `datasets/mnist` directory.
mnist = MNIST('datasets/mnist')

# Each loader call is unpacked into an (images, labels) pair; the training
# split is read first, then the test split.
(train_images, train_labels), (test_images, test_labels) = (
    mnist.load_training(),
    mnist.load_testing(),
)

# Presumably the pixel dimensions of a single image; not used in the cells
# visible here -- TODO confirm against `show_image`.
shape = 28, 28

# 1 - Create a Decision Tree Classifier

## 1.1 - Split the training dataset into `training` and `validation` sets

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the training data as a validation set. The fixed seed keeps
# the partition identical from one run to the next.
X_train, X_validate, y_train, y_validate = train_test_split(
    train_images, train_labels, random_state=42, test_size=0.33
)

## 1.2 - Create DecisionTree Classifier

In [30]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that re-running the notebook grows the same tree and
# reproduces the same scores; 42 matches the seed used for the data split.
# All other hyper-parameters stay at their library defaults.
dt_model = DecisionTreeClassifier(random_state=42)

# Fit on the training portion only; the validation portion stays unseen.
dt_model = dt_model.fit(X_train, y_train)

## 1.3 - Validation of the model

### 1.3.1 - Validate with `train` dataset

In [37]:
from sklearn.metrics import classification_report

# Predict on the very rows the tree was fitted on. The perfect scores in the
# report are therefore training-set scores, not an estimate of how the model
# performs on unseen data.
predictions = dt_model.predict(X_train)

# Pair every true label with its prediction, one row per sample.
predict_result = zip(y_train, predictions)
result_training = pd.DataFrame(predict_result, columns=["y","ŷ"])

# Per-class precision / recall / F1 on the training split.
print(classification_report(result_training["y"],result_training["ŷ"]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3977
           1       1.00      1.00      1.00      4524
           2       1.00      1.00      1.00      4017
           3       1.00      1.00      1.00      4096
           4       1.00      1.00      1.00      3924
           5       1.00      1.00      1.00      3622
           6       1.00      1.00      1.00      3938
           7       1.00      1.00      1.00      4144
           8       1.00      1.00      1.00      3957
           9       1.00      1.00      1.00      4001

   micro avg       1.00      1.00      1.00     40200
   macro avg       1.00      1.00      1.00     40200
weighted avg       1.00      1.00      1.00     40200



### 1.3.2 - Validate with `validation` dataset

In [36]:
# Predict on the held-out validation rows, which were not used for fitting.
predictions = dt_model.predict(X_validate)

# Pair every true label with its prediction, one row per sample.
predict_result = zip(y_validate, predictions)
result_validate = pd.DataFrame(predict_result, columns=["y","ŷ"])

# Per-class precision / recall / F1 on the validation split.
print(classification_report(result_validate["y"],result_validate["ŷ"]))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1946
           1       0.94      0.95      0.95      2218
           2       0.84      0.84      0.84      1941
           3       0.83      0.83      0.83      2035
           4       0.84      0.87      0.86      1918
           5       0.82      0.82      0.82      1799
           6       0.90      0.89      0.90      1980
           7       0.90      0.90      0.90      2121
           8       0.78      0.78      0.78      1894
           9       0.82      0.81      0.81      1948

   micro avg       0.86      0.86      0.86     19800
   macro avg       0.86      0.86      0.86     19800
weighted avg       0.86      0.86      0.86     19800



### 1.3.3 - Cross-validation
 - Apply cross-validation with 20 folds

In [42]:
from sklearn.model_selection import cross_val_score

# Rebind `dt_model` to a fresh, unfitted tree for cross-validation; the
# previously fitted model is discarded. The seed makes the per-fold scores
# reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=42)

# 20-fold cross-validation over the complete training set.
# NOTE(review): the list(...) conversions are kept as written -- presumably
# the loader's containers are not accepted directly here; confirm before
# removing them.
cv_result = cross_val_score(dt_model, list(train_images), list(train_labels), cv=20)

In [45]:
# Show the score of every fold followed by the average over all folds.
print("""
Accuracy:
{}
Accuracy Mean: {}
""".format(cv_result, np.mean(cv_result)))


Accuracy:
[0.87857618 0.87920133 0.86813187 0.88141239 0.84776815 0.85938021
 0.88237254 0.86837721 0.86937687 0.87033333 0.858      0.87758506
 0.86490994 0.86891261 0.87991995 0.86924616 0.86757839 0.89026017
 0.88518024 0.88551402]
Acurracy Mean: 0.872601831239



 - Fit the final model on the full training set

In [50]:
# Final model: a fresh tree fitted on the complete training set (training and
# validation portions together). The seed makes the fitted tree, and hence the
# test scores that follow, reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model = dt_model.fit(train_images,train_labels)

# 2 - Test the classifier with the `test` dataset

In [52]:
# Predict on the test split loaded at the top of the notebook.
predictions = dt_model.predict(test_images)

# Pair every true label with its prediction, one row per sample.
predict_result = zip(test_labels, predictions)

# NOTE(review): this reuses the name `result_validate`, overwriting the frame
# built from the validation split; a distinct name such as `result_test`
# would be clearer. Kept unchanged for compatibility with any later cell.
result_validate = pd.DataFrame(predict_result, columns=["y","ŷ"])
print(" === RESULT === ")
print(classification_report(result_validate["y"],result_validate["ŷ"]))

 === RESULT === 
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       980
           1       0.96      0.96      0.96      1135
           2       0.88      0.84      0.86      1032
           3       0.83      0.86      0.84      1010
           4       0.87      0.88      0.88       982
           5       0.82      0.84      0.83       892
           6       0.90      0.88      0.89       958
           7       0.91      0.90      0.90      1028
           8       0.81      0.80      0.81       974
           9       0.86      0.84      0.85      1009

   micro avg       0.88      0.88      0.88     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.88      0.88      0.88     10000

