# ENC - master Humanités numériques

# Évaluation pour le cours d'apprentissage machine : question 2

## Question : Random forest sur MNIST
Proposez un code permettant d'effectuer des prédictions des catégories de MNIST en utilisant un algorithme de random forest.\
Évaluez la performance avec une cross-validation.

## On importe les différents modules.

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import os

import numpy as np
import pandas as pd
import tensorflow as tf

## On importe le jeu de données MNIST.

In [0]:
# Fetch MNIST through Keras. load_data() hands back two (images, labels)
# pairs: the training split first, then the test split.
mnist = tf.keras.datasets.mnist
mnist_splits = mnist.load_data()
(x_train, y_train), (x_test, y_test) = mnist_splits


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [0]:
# Sanity check: report the shape of every array, one per line.
print(
    f"x_train : {x_train.shape}",
    f"x_test : {x_test.shape}",
    f"y_train : {y_train.shape}",
    f"y_test : {y_test.shape}",
    sep="\n",
)

x_train : (60000, 28, 28)
x_test : (10000, 28, 28)
y_train : (60000,)
y_test : (10000,)


In [0]:
# Flatten every image into a single feature row (28 x 28 -> 784 columns
# for the shapes printed above). The sample count is taken from the array
# itself and -1 lets NumPy infer the row length, so this keeps working if
# the data is subsampled or the image size changes.
flat_x_train = x_train.reshape(len(x_train), -1)
flat_x_test = x_test.reshape(len(x_test), -1)

In [0]:
# Persist the flattened pixel matrices as CSV files without header or index.
# to_csv() does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs('mnist_csv', exist_ok=True)

df_flat_x_train = pd.DataFrame(flat_x_train)
df_flat_x_train.to_csv('mnist_csv/x_train.csv', header=False, index=False)
df_flat_x_test = pd.DataFrame(flat_x_test)
df_flat_x_test.to_csv('mnist_csv/x_test.csv', header=False, index=False)

In [0]:
# Persist the label vectors as CSV files without header or index.
# The output folder is created here as well so that this cell does not
# depend on another cell having created it.
os.makedirs('mnist_csv', exist_ok=True)

df_y_train = pd.DataFrame(y_train)
df_y_train.to_csv('mnist_csv/y_train.csv', header=False, index=False)
df_y_test = pd.DataFrame(y_test)
df_y_test.to_csv('mnist_csv/y_test.csv', header=False, index=False)


In [0]:
cp -r mnist_csv/ drive/My\ Drive/.

In [0]:
# Divide both flattened splits by 255 (true division, so the results are
# floating point). NOTE: the cells visible below fit and score on the
# un-normalized flat_x_* arrays; these normalized copies are not used there.
nor_flat_x_train, nor_flat_x_test = (
    pixels / 255 for pixels in (flat_x_train, flat_x_test)
)

## Modèle

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Baseline forest: 200 trees, fixed seed, and n_jobs=-1 to use every
# available core.
rfc = RandomForestClassifier(
    n_estimators=200,
    random_state=5,
    n_jobs=-1,
)

In [0]:
# Train the baseline forest on the flattened (un-normalized) training split.
rfc.fit(X=flat_x_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [0]:
# Hold-out evaluation: score the fitted forest on the test split.
rfc.score(X=flat_x_test, y=y_test)

0.9677

Référence — documentation scikit-learn sur la validation croisée : https://scikit-learn.org/stable/modules/cross_validation.html

In [0]:
from sklearn.model_selection import cross_val_score

In [0]:
# Pool the train and test splits (stacked along the first axis) into one
# feature matrix, then report its shape.
flat_x = np.concatenate((flat_x_train, flat_x_test))
print(flat_x.shape)
# Same pooling for the labels, in the same order as the features.
y = np.concatenate((y_train, y_test))
print(y.shape)

(70000, 784)
(70000,)


In [0]:
# 10-fold cross-validation of the baseline forest on the pooled data;
# the result holds one score per fold.
rfc_cv_score = cross_val_score(estimator=rfc, X=flat_x, y=y, cv=10)

In [0]:
# Summarize the folds: mean score and a spread of two standard deviations.
cv_mean = rfc_cv_score.mean()
cv_spread = rfc_cv_score.std() * 2
print(f"Accuracy: {cv_mean:.2f} (+/- {cv_spread:.2f})")

Accuracy: 0.95 (+/- 0.01)


In [0]:
# Show the full hyper-parameter set of the baseline forest, as a reference
# before tuning.
rfc.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 5,
 'verbose': 0,
 'warm_start': False}

In [0]:
from sklearn.model_selection import RandomizedSearchCV

In [0]:
# Candidate values for the randomized hyper-parameter search.

# Number of trees in the forest: 20 evenly spaced values from 10 to 300,
# truncated to integers.
n_estimators = np.linspace(start=10, stop=300, num=20).astype(int).tolist()

# Number of features considered at every split.
# 'auto' is intentionally not listed: for RandomForestClassifier it is the
# same setting as 'sqrt' (so the two candidates were duplicates), and recent
# scikit-learn releases no longer accept it.
max_features = ['sqrt', 'log2']

# Maximum depth of each tree: 10, 20, ..., 110, plus None for no depth limit.
max_depth = np.linspace(10, 110, num=11).astype(int).tolist()
max_depth.append(None)

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Parameter name -> list of candidate values, in the form expected by
# RandomizedSearchCV(param_distributions=...).
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [0]:
# Base estimator for the search. It gets its own seed: the random_state
# passed to RandomizedSearchCV only controls which parameter combinations
# are drawn, not the randomness inside each forest, so without this the
# search results differ from run to run.
# NOTE: this rebinds `rfc`, replacing the 200-tree forest fitted earlier.
rfc = RandomForestClassifier(random_state=5)

# Draw 100 random combinations from random_grid and score each with
# 3-fold cross-validation (300 fits in total), using all available cores.
# This is expensive: the recorded run took about 213 minutes on 2 workers.
rfc_random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)
rfc_random.fit(flat_x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 94.0min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 195.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 213.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [0]:
# Bare expression: display the repr of the search object fitted above as
# this cell's output.
rfc_random

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [0]:
from joblib import dump

In [0]:
# Serialize the fitted search object to disk with joblib so the long search
# does not have to be repeated. joblib files are pickle-based: only load
# them back from a trusted source.
dump(value=rfc_random, filename='exo2_rfc_random.joblib')

['exo2_rfc_random.joblib']

In [0]:
cp exo2_rfc_random.joblib drive/My\ Drive/.