# Reconnaissance de chiffres manuscrits : MNIST

## Librairies et fonctions utiles

In [None]:
# Pandas : librairie de manipulation de données
# NumPy : librairie de calcul scientifique
# MatPlotLib : librairie de visualisation et graphiques
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split

from IPython.core.display import HTML # permet d'afficher du code html dans jupyter

## Le dataset de chiffres manuscrits MNIST

On charge le dataset MNIST (ici le fichier `mnist_test.csv`) :

In [None]:
# Read the MNIST CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/mnist-in-csv/mnist_test.csv")

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

On a 785 colonnes :
* une colonne 'label' identifiant le chiffre  
* et 784 colonnes de pixels (image de 28x28 pixels "aplatie")

In [None]:
# Preview the first ten rows.
df.iloc[:10]

On crée la cible y (colonne 'label') :

In [None]:
# Target vector: the 'label' column.
y = df.loc[:, 'label']

et les caractéristiques X :

In [None]:
# Feature matrix: every column except 'label'.
X = df.drop(columns=['label'])

On sépare les ensembles d'apprentissage et de test :

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

On peut maintenant appliquer les méthodes de machine learning, mais on va d'abord visualiser les images.

## Visualisation des images MNIST

Pour visualiser les images, on va convertir une ligne de 784 pixels en une matrice 28x28  
Il faut en premier transformer le dataframe X en un tableau :

In [None]:
# Convert the feature DataFrame to a NumPy array so its rows can be reshaped into images.
X1 = np.array(X)

On affiche la première ligne :

In [None]:
# Print the first flattened row of pixel values.
print(X1[0, :])

On applique la méthode **reshape** pour convertir cette ligne de 784 éléments en une matrice 28x28 :

In [None]:
# Fold the first flattened row back into a 28x28 pixel grid and print it.
image = np.reshape(X1[0], (28, 28))
print(image)

On peut maintenant afficher cette matrice :

In [None]:
# Render the 28x28 matrix as an image using matplotlib's default colormap.
plt.imshow(image)

en niveaux de gris, sans graduation des axes, et avec le label comme titre :

In [None]:
# Same image in reversed grayscale, with the axes hidden and the label as title.
first_label = y.loc[0]
plt.imshow(image, cmap="gray_r")
plt.axis('off')
plt.title(first_label)

On redimensionne toutes les lignes :

In [None]:
# Reshape every flattened row at once: one 28x28 grid per sample.
n_samples = df.shape[0]
images = np.reshape(X1, (n_samples, 28, 28))

On affiche les 50 premiers :

In [None]:
# Show the first 50 digits on a 10x5 grid, each one titled with its label.
plt.figure(figsize=(10,20))
# range(0, 49) stopped at index 48 and left the last grid slot empty;
# range(50) fills all 10 * 5 subplots.
for i in range(50):
    plt.subplot(10, 5, i + 1)
    plt.axis('off')
    plt.imshow(images[i], cmap="gray_r")
    # Positional lookup, consistent with the positional index used for `images`.
    plt.title(y.iloc[i])

## Machine learning

In [None]:
# Manual 80/20 split: sample the training rows with a fixed seed,
# then use every remaining row as the test set.
data_train = df.sample(frac=0.8, random_state=1)
data_test = df.drop(index=data_train.index)

# Separate the pixel columns from the 'label' target in each subset.
X_train, y_train = data_train.drop(columns=['label']), data_train['label']
X_test, y_test = data_test.drop(columns=['label']), data_test['label']

Appliquer des méthodes de machine learning et évaluer les résultats (accuracy, matrice de confusion, ...)

In [None]:
# Decision tree baseline.
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix

# Fixed seed so the fitted tree (and the scores below) are reproducible across runs.
dtc = tree.DecisionTreeClassifier(random_state=1)
dtc.fit(X_train, y_train)
y_dtc = dtc.predict(X_test)
print(accuracy_score(y_test, y_dtc))

# Confusion matrix of the tree's own predictions. The previous version passed
# `y_rf`, which belongs to the random forest and is only defined in a later cell.
cm = confusion_matrix(y_test, y_dtc)
print(cm)


In [None]:
# Random forest classifier.
from sklearn import ensemble

# Seeded so that the fitted forest, and every metric computed from `y_rf`,
# is identical from one run to the next.
rf = ensemble.RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy of the random forest on the held-out rows.
rf_score = accuracy_score(y_true=y_test, y_pred=y_rf)
print(rf_score)

# Confusion matrix for the same predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_rf)
print(cm)

Vous pouvez également si vous le souhaitez tester l'algorithme XGBoost, souvent très efficace :  
https://datascientest.com/xgboost-grand-gagnant-des-competitions-machine-learning-algorithme  
https://medium.com/sfu-cspmp/xgboost-a-deep-dive-into-boosting-f06c9c41349

In [None]:
# Instantiate an XGBoost classifier with default hyper-parameters.
# NOTE(review): this estimator is never fitted in the cells shown, and the name
# `xgb` is rebound to the xgboost module by `import xgboost as xgb` in the next cell.
import xgboost as XGB
xgb  = XGB.XGBClassifier()

In [None]:
# `load_boston` was imported here but never used; it was removed from scikit-learn
# in version 1.2, so the import alone made this cell fail on recent installs.
import numpy as np
import xgboost as xgb
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Reload the MNIST CSV and separate the 'label' target from the pixel columns.
df = pd.read_csv("../input/mnist-in-csv/mnist_test.csv")
y = df['label']
X = df.drop(['label'], axis=1)

# Train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Convert the training and testing sets into DMatrixes
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

# Linear base learner (gblinear) with a squared-error objective: the digit label is
# fitted as a numeric regression target, so the predictions are continuous values.
params = {"booster": "gblinear", "objective": "reg:squarederror"}

# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=train, num_boost_round=5)

# Making predictions
predictions = xg_reg.predict(test)

# scikit-learn metrics take (y_true, y_pred); explained variance is not symmetric,
# so the ground truth must come first.
print("explained variance:", explained_variance_score(y_test, predictions))

# Computing RMSE
print("RMSE: %f" % (np.sqrt(mean_squared_error(y_test, predictions))))


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
# mean_squared_error is imported here so the cell no longer depends on an
# earlier cell having been run first.
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Reload the MNIST CSV and separate the 'label' target from the pixel columns.
df = pd.read_csv("../input/mnist-in-csv/mnist_test.csv")
y = df['label']
X = df.drop(['label'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fit an XGBoost regressor (default tree base learner) on the digit labels;
# the predictions are therefore continuous values, not class labels.
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=75, subsample=0.75, max_depth=7)
xgb_reg.fit(X_train, y_train)

# Making Predictions
predictions = xgb_reg.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); explained variance is not symmetric,
# so the ground truth must come first.
print((explained_variance_score(y_test, predictions)))
print("RMSE: %f" % (np.sqrt(mean_squared_error(y_test, predictions))))

In [None]:
# Confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

# `predictions` comes from a regressor, so it holds continuous values, while
# confusion_matrix only accepts discrete class labels. Round each prediction to
# the nearest integer and clamp it to the valid digit range 0-9.
predicted_digits = np.clip(np.rint(predictions), 0, 9).astype(int)

# Ground truth first, predictions second, as scikit-learn expects.
cm = confusion_matrix(y_test, predicted_digits)
print(cm)