# MNIST Classifier

In this notebook you will create both, an mnist tabular dataset and a classifier.

## 1.- import the Operating System (os) module in python and any other library you need

In [1]:
import os
from PIL import Image
import numpy as np
import pandas as pd

## 2.- As you can see each class has its own folder (Do it only for train). 

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas dataframe apending the column name as the class
    - Save the data into a CSV

    Note: if it takes to long try doing only 100 images per folder for the CSV.

In [40]:
path = r'.\trainingSample\trainingSample'
df = pd.DataFrame()
for folder in os.listdir(path):
    carpeta = os.path.join(path,folder)
    for filename in os.listdir(carpeta):
        img = Image.open(os.path.join(carpeta,filename))
        img_array = np.array(img, dtype=float)
        img_array = img_array.reshape(784,)
        df_append = pd.DataFrame({'img':img_array}).T
        df_append['class'] = folder
        df = df.append(df_append)

df.to_csv('numbers.csv', index=False, header=False)

## 3.- Load the CSV

In [62]:
numbers = pd.read_csv('numbers.csv', header=None)
x = numbers.iloc[:,:-1]
y = numbers.iloc[:,-1]
x.shape

(600, 784)

## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both, tree models and mult models.

In [55]:
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":   ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost":      AdaBoostClassifier(n_estimators=100),
  "Skl GBM":       GradientBoostingClassifier(n_estimators=100),
  "XGBoost":       XGBClassifier(n_estimators=100),
  "LightGBM":      LGBMClassifier(n_estimators=100),
  "CatBoost":      CatBoostClassifier(n_estimators=100)
}

## 5.- Using either cross validation or stratification find out which is the best model
    - Base your code on the previous two days examples

In [58]:
from sklearn import model_selection, metrics
import time

results = pd.DataFrame(columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])
skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)
    total_time = time.time() - start_time
    

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)

results
    



Learning rate set to 0.5
0:	learn: 1.9165100	total: 322ms	remaining: 31.9s
1:	learn: 1.7066461	total: 488ms	remaining: 23.9s
2:	learn: 1.4658523	total: 656ms	remaining: 21.2s
3:	learn: 1.2829927	total: 825ms	remaining: 19.8s
4:	learn: 1.1168352	total: 1.02s	remaining: 19.5s
5:	learn: 0.9936678	total: 1.22s	remaining: 19s
6:	learn: 0.8975479	total: 1.39s	remaining: 18.5s
7:	learn: 0.7975773	total: 1.57s	remaining: 18.1s
8:	learn: 0.7127647	total: 1.74s	remaining: 17.6s
9:	learn: 0.6743432	total: 1.91s	remaining: 17.2s
10:	learn: 0.6179963	total: 2.07s	remaining: 16.8s
11:	learn: 0.5600549	total: 2.24s	remaining: 16.4s
12:	learn: 0.5282219	total: 2.41s	remaining: 16.1s
13:	learn: 0.4843324	total: 2.59s	remaining: 15.9s
14:	learn: 0.4479397	total: 2.76s	remaining: 15.6s
15:	learn: 0.4198142	total: 2.93s	remaining: 15.4s
16:	learn: 0.3843669	total: 3.09s	remaining: 15.1s
17:	learn: 0.3584411	total: 3.27s	remaining: 14.9s
18:	learn: 0.3334313	total: 3.43s	remaining: 14.6s
19:	learn: 0.31345

13:	learn: 0.4974526	total: 2.41s	remaining: 14.8s
14:	learn: 0.4691723	total: 2.58s	remaining: 14.6s
15:	learn: 0.4361437	total: 2.75s	remaining: 14.4s
16:	learn: 0.4177116	total: 2.92s	remaining: 14.2s
17:	learn: 0.4076908	total: 3.08s	remaining: 14s
18:	learn: 0.3768213	total: 3.25s	remaining: 13.9s
19:	learn: 0.3661992	total: 3.42s	remaining: 13.7s
20:	learn: 0.3524667	total: 3.59s	remaining: 13.5s
21:	learn: 0.3279696	total: 3.76s	remaining: 13.3s
22:	learn: 0.3146041	total: 3.93s	remaining: 13.1s
23:	learn: 0.3111403	total: 4.09s	remaining: 13s
24:	learn: 0.2925178	total: 4.27s	remaining: 12.8s
25:	learn: 0.2804898	total: 4.44s	remaining: 12.6s
26:	learn: 0.2731723	total: 4.63s	remaining: 12.5s
27:	learn: 0.2648077	total: 4.8s	remaining: 12.4s
28:	learn: 0.2570104	total: 4.98s	remaining: 12.2s
29:	learn: 0.2558875	total: 5.16s	remaining: 12s
30:	learn: 0.2455097	total: 5.34s	remaining: 11.9s
31:	learn: 0.2342557	total: 5.52s	remaining: 11.7s
32:	learn: 0.2204153	total: 5.69s	rema

75:	learn: 0.0791891	total: 12.9s	remaining: 4.06s
76:	learn: 0.0782783	total: 13s	remaining: 3.89s
77:	learn: 0.0769769	total: 13.2s	remaining: 3.72s
78:	learn: 0.0758895	total: 13.4s	remaining: 3.55s
79:	learn: 0.0748095	total: 13.5s	remaining: 3.38s
80:	learn: 0.0745908	total: 13.7s	remaining: 3.21s
81:	learn: 0.0725039	total: 13.9s	remaining: 3.04s
82:	learn: 0.0706636	total: 14s	remaining: 2.87s
83:	learn: 0.0705479	total: 14.2s	remaining: 2.7s
84:	learn: 0.0693584	total: 14.4s	remaining: 2.54s
85:	learn: 0.0688062	total: 14.5s	remaining: 2.37s
86:	learn: 0.0678196	total: 14.7s	remaining: 2.2s
87:	learn: 0.0671173	total: 14.9s	remaining: 2.03s
88:	learn: 0.0658646	total: 15s	remaining: 1.86s
89:	learn: 0.0656126	total: 15.2s	remaining: 1.69s
90:	learn: 0.0639447	total: 15.4s	remaining: 1.52s
91:	learn: 0.0633859	total: 15.6s	remaining: 1.35s
92:	learn: 0.0623190	total: 15.7s	remaining: 1.18s
93:	learn: 0.0609074	total: 15.9s	remaining: 1.01s
94:	learn: 0.0605714	total: 16.1s	remai

37:	learn: 0.1960113	total: 6.46s	remaining: 10.5s
38:	learn: 0.1915319	total: 6.62s	remaining: 10.4s
39:	learn: 0.1858044	total: 6.79s	remaining: 10.2s
40:	learn: 0.1819269	total: 6.96s	remaining: 10s
41:	learn: 0.1746715	total: 7.13s	remaining: 9.85s
42:	learn: 0.1690297	total: 7.3s	remaining: 9.68s
43:	learn: 0.1682178	total: 7.47s	remaining: 9.51s
44:	learn: 0.1622084	total: 7.63s	remaining: 9.33s
45:	learn: 0.1616152	total: 7.8s	remaining: 9.16s
46:	learn: 0.1572186	total: 7.97s	remaining: 8.98s
47:	learn: 0.1550172	total: 8.13s	remaining: 8.81s
48:	learn: 0.1541088	total: 8.3s	remaining: 8.64s
49:	learn: 0.1507174	total: 8.46s	remaining: 8.46s
50:	learn: 0.1483301	total: 8.63s	remaining: 8.3s
51:	learn: 0.1443299	total: 8.8s	remaining: 8.13s
52:	learn: 0.1424184	total: 8.97s	remaining: 7.95s
53:	learn: 0.1420289	total: 9.14s	remaining: 7.78s
54:	learn: 0.1415413	total: 9.31s	remaining: 7.62s
55:	learn: 0.1389877	total: 9.48s	remaining: 7.45s
56:	learn: 0.1386170	total: 9.65s	rema

Learning rate set to 0.5
0:	learn: 1.8724875	total: 166ms	remaining: 16.4s
1:	learn: 1.6689583	total: 336ms	remaining: 16.5s
2:	learn: 1.5121642	total: 510ms	remaining: 16.5s
3:	learn: 1.3562071	total: 676ms	remaining: 16.2s
4:	learn: 1.2207703	total: 847ms	remaining: 16.1s
5:	learn: 1.0868086	total: 1.01s	remaining: 15.9s
6:	learn: 0.9896902	total: 1.17s	remaining: 15.6s
7:	learn: 0.8969377	total: 1.34s	remaining: 15.4s
8:	learn: 0.8161615	total: 1.5s	remaining: 15.2s
9:	learn: 0.7396913	total: 1.67s	remaining: 15.1s
10:	learn: 0.6938462	total: 1.84s	remaining: 14.9s
11:	learn: 0.6516941	total: 2s	remaining: 14.7s
12:	learn: 0.5839608	total: 2.17s	remaining: 14.5s
13:	learn: 0.5300207	total: 2.33s	remaining: 14.3s
14:	learn: 0.4870834	total: 2.49s	remaining: 14.1s
15:	learn: 0.4594933	total: 2.66s	remaining: 14s
16:	learn: 0.4432535	total: 2.83s	remaining: 13.8s
17:	learn: 0.4328620	total: 2.99s	remaining: 13.6s
18:	learn: 0.4110973	total: 3.16s	remaining: 13.5s
19:	learn: 0.3945517	t

62:	learn: 0.1180110	total: 10.6s	remaining: 6.2s
63:	learn: 0.1175661	total: 10.7s	remaining: 6.03s
64:	learn: 0.1136783	total: 10.9s	remaining: 5.87s
65:	learn: 0.1134211	total: 11.1s	remaining: 5.7s
66:	learn: 0.1103345	total: 11.2s	remaining: 5.53s
67:	learn: 0.1095404	total: 11.4s	remaining: 5.36s
68:	learn: 0.1069624	total: 11.6s	remaining: 5.19s
69:	learn: 0.1033324	total: 11.7s	remaining: 5.02s
70:	learn: 0.1006516	total: 11.9s	remaining: 4.86s
71:	learn: 0.0990426	total: 12.1s	remaining: 4.69s
72:	learn: 0.0987086	total: 12.2s	remaining: 4.52s
73:	learn: 0.0983685	total: 12.4s	remaining: 4.35s
74:	learn: 0.0957271	total: 12.6s	remaining: 4.18s
75:	learn: 0.0942819	total: 12.7s	remaining: 4.02s
76:	learn: 0.0939598	total: 12.9s	remaining: 3.85s
77:	learn: 0.0922166	total: 13.1s	remaining: 3.69s
78:	learn: 0.0894769	total: 13.3s	remaining: 3.52s
79:	learn: 0.0882785	total: 13.4s	remaining: 3.36s
80:	learn: 0.0860554	total: 13.6s	remaining: 3.19s
81:	learn: 0.0847033	total: 13.8s

24:	learn: 0.2606764	total: 4.17s	remaining: 12.5s
25:	learn: 0.2507511	total: 4.34s	remaining: 12.3s
26:	learn: 0.2441663	total: 4.5s	remaining: 12.2s
27:	learn: 0.2307300	total: 4.67s	remaining: 12s
28:	learn: 0.2241943	total: 4.84s	remaining: 11.8s
29:	learn: 0.2133710	total: 5s	remaining: 11.7s
30:	learn: 0.2080676	total: 5.17s	remaining: 11.5s
31:	learn: 0.2012981	total: 5.34s	remaining: 11.3s
32:	learn: 0.1974612	total: 5.51s	remaining: 11.2s
33:	learn: 0.1905177	total: 5.67s	remaining: 11s
34:	learn: 0.1863295	total: 5.84s	remaining: 10.8s
35:	learn: 0.1840384	total: 6s	remaining: 10.7s
36:	learn: 0.1797324	total: 6.17s	remaining: 10.5s
37:	learn: 0.1749109	total: 6.34s	remaining: 10.3s
38:	learn: 0.1741914	total: 6.51s	remaining: 10.2s
39:	learn: 0.1668034	total: 6.67s	remaining: 10s
40:	learn: 0.1613954	total: 6.84s	remaining: 9.84s
41:	learn: 0.1582770	total: 7.01s	remaining: 9.68s
42:	learn: 0.1551195	total: 7.17s	remaining: 9.51s
43:	learn: 0.1544698	total: 7.36s	remaining:

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
0,Decision Tree,58.666667,58.666667,0.833472
1,Extra Trees,85.666667,85.666667,2.461437
2,Random Forest,83.833333,83.833333,3.171176
3,AdaBoost,28.5,28.5,12.11128
4,Skl GBM,77.166667,77.166667,266.926793
5,XGBoost,81.166667,81.166667,13.853341
6,LightGBM,83.5,83.5,17.290659
7,CatBoost,80.666667,80.666667,171.733826


## Optional: Can you rotate an image?

In [59]:
path = r'.\trainingSample\trainingSample'
df = pd.DataFrame()
for folder in os.listdir(path):
    carpeta = os.path.join(path,folder)
    i=0
    for filename in os.listdir(carpeta):
        img = Image.open(os.path.join(carpeta,filename))
        rotated = img.rotate(45)
        #normal image
        img_array = np.array(img, dtype=float)
        img_array = img_array.reshape(784,)
        df_append = pd.DataFrame({'img':img_array}).T
        df_append['class'] = folder
        df = df.append(df_append)
        #rotated image
        rotated_array = np.array(rotated, dtype=float)
        rotated_array = rotated_array.reshape(784,)
        df_append = pd.DataFrame({'img':rotated_array}).T
        df_append['class'] = folder
        df = df.append(df_append)
df.to_csv('numbers_extra.csv', index=False, header=False)

In [73]:
numbers = pd.read_csv('numbers_extra.csv', header=None)
x = numbers.iloc[:,:-1]
y = numbers.iloc[:,-1]

In [74]:
tree_classifiers = {
  "Extra Trees":   ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "LightGBM":      LGBMClassifier(n_estimators=100)
}

In [65]:
results = pd.DataFrame(columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])
skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)
    total_time = time.time() - start_time
    

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)

results

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
0,Extra Trees,83.166667,83.166667,4.503611
1,Random Forest,80.916667,80.916667,5.950575
2,LightGBM,80.666667,80.666667,35.572518


In [75]:
best_model = tree_classifiers["Extra Trees"].fit(x,y)

In [70]:
carpeta = r'.\testSet\testSet'
df = pd.DataFrame()
for filename in os.listdir(carpeta):
        img = Image.open(os.path.join(carpeta,filename))
        #normal image
        img_array = np.array(img, dtype=float)
        img_array = img_array.reshape(784,)
        df_append = pd.DataFrame({'img':img_array}).T
        df_append['class'] = folder
        df = df.append(df_append)
df.to_csv('numbers_test.csv', index=False, header=False)

In [76]:
numbers = pd.read_csv('numbers_test.csv', header=None)
x = numbers.iloc[:,:-1]
test_pred = best_model.predict(x)

In [77]:
test_pred

array([2, 3, 9, ..., 4, 2, 7], dtype=int64)