# <center>MNIST: SIMPLE Machine Learning Algorithms</center>
<hr>

## 1. Introduction

In [None]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os

# List the entries of the input directory so the file paths used below can be checked
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Load the train and test data from the input directory
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Size of the training data as (rows, columns)
train_df.shape

In [None]:
# Count how many training samples there are for each digit class
train_df['label'].value_counts()

**The class distribution looks balanced, which is a good starting point for the modeling part.**

In [None]:
# Set features and label for showing.
# `columns=` is used instead of a positional axis (`drop(['label'], 1)`):
# passing `axis` positionally was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
digits = train_df.drop(columns=['label']).values
# Scale the pixel values by 255
digits = digits / 255.
label = train_df['label'].values

# Sanity check: largest scaled pixel value and largest class label
digits.max(), label.max()

**Normalizing the data (/ 255.) is not necessary for tree-based models, but we keep it in case we apply other algorithms in the future.**

In [None]:
# Show the first 20 digits of the data (5 x 4 grid of subplots)
fig, axis = plt.subplots(5, 4, figsize=(22, 20))

for ax, pixels, digit in zip(axis.flat, digits, label):
    # each sample is stored flat; reshape it to 28 x 28 for display
    ax.imshow(pixels.reshape(28, 28), cmap='binary')
    ax.set(title = "Real digit is {}".format(digit))

# 2. Machine Learning

In [None]:
# Machine Learning
from sklearn.model_selection import train_test_split

# models
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

In [None]:
# Feature matrix and target vector used for fitting
X, y = digits, label

# Hold out 10% of the rows for evaluation; the remaining 90% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

## 2.1 Random Forest Classifier Model

In [None]:
# Configure the random forest, then fit it on the training split
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
model.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the held-out split produced by train_test_split
y_pred = model.predict(X_test) # predictions for the held-out split (not for test.csv)
rf_acc = accuracy_score(y_test, y_pred)

# accuracy_score returns a fraction, so multiply by 100 to print a percentage
print("Model accuracy is: {0:.3f}%".format(rf_acc * 100))

In [None]:
# Compare predictions with the true labels for the first 20 held-out samples
fig, axis = plt.subplots(5, 4, figsize=(18, 20))

for ax, pixels, predicted, actual in zip(axis.flat, X_test, y_pred, y_test):
    # each sample is stored flat; reshape it to 28 x 28 for display
    ax.imshow(pixels.reshape(28, 28), cmap='binary')
    ax.set(title = "Predicted digit {0}\nTrue digit {1}".format(predicted, actual))

### Confusion Matrix

In [None]:
# Distinct labels in the held-out split together with how often each occurs
np.unique(y_test, return_counts=True)

In [None]:
# Confusion matrix of true labels vs. the current predictions in y_pred
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit axes object
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cm, annot=True, fmt='.0f', ax=ax)
ax.set_xlabel("Predicted Digits")
ax.set_ylabel("True Digits")
plt.show()

* Digit 5 was misclassified more often than the other digits.
* Digit 9 is frequently confused with digits 7, 4 and 3.

We could try to tune the RF hyperparameters to improve the model predictions.

## 2.2 Different algorithms

Try different algorithms for the multiclass classification problem, such as KNN and naive Bayes.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
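# NOTE(review): KNN baseline is currently disabled (presumably because it is slow on this data; confirm).
# Uncomment the lines below to run it.
# knn = KNeighborsClassifier()
# knn.fit(X_train, y_train)

# y_pred = knn.predict(X_test) # predictions for the held-out split
# knn_acc = accuracy_score(y_test, y_pred)

# print("KNN accuracy is: {0:.3f}%".format(knn_acc * 100))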

In [None]:
# NOTE: despite the `gnb` prefix, this is a Multinomial (not Gaussian) Naive Bayes model.
# The names are kept because later cells refer to `gnb`.
gnb = MultinomialNB(alpha=1e-3)
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test) # held-out split predictions; overwrites the random forest's y_pred
gnb_acc = accuracy_score(y_test, y_pred)

# accuracy_score returns a fraction, so multiply by 100 to print a percentage
print("NB accuracy is: {0:.3f}%".format(gnb_acc * 100))

## Submission

In [None]:
# Scale the competition test data by 255, the same factor applied to the training features
test_X = test_df.values / 255.
# Predict the test labels with both fitted models
rfc_pred = model.predict(test_X)
gnb_pred = gnb.predict(test_X)

In [None]:
# Load the sample submission file and preview its first rows
sub = pd.read_csv('../input/sample_submission.csv')
sub.head()

In [None]:
# Make the random forest submission file: put its predictions in `Label` and write the CSV
sub['Label'] = rfc_pred
sub.to_csv('submission.csv', index=False)

In [None]:
# Make the NB submission file; this replaces the `Label` column of `sub` in place
sub['Label'] = gnb_pred
sub.to_csv('GNB_submission.csv', index=False)

In [None]:
# Show the first 10 rows of the submission frame; `Label` holds whatever was assigned last (the NB predictions)
sub.head(10)