# Solving the digit recognizer

## The dataset

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
# Load the digit-recognizer training CSV from the read-only Kaggle input directory.
data = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")

In [12]:
# How many images / dimensions?
# `shape` is (number of rows, number of columns) of the loaded frame.
data.shape

Since each row represents one image and each column one pixel, we have 42k images with 784 pixels each. The one remaining column, `label`, holds the digit shown in the image.

In [13]:
# What does the dataframe look like?
# Preview the first rows only, instead of dumping the whole frame.
data.head()

In [14]:
# The "label" column is what we want to predict: keep it aside as the target.
target = data["label"]

# Everything except the label stays in `data` as the pixel features.
data = data.drop(columns="label")

## Visualisation

In [15]:
import matplotlib.pyplot as plt

In [16]:
def show_image(pixels, label, shape=(28, 28), ax=None):
    """Display a flattened grayscale image with its label as the title.

    Parameters
    ----------
    pixels : array-like
        Flat sequence of pixel intensities. Its length must equal
        ``shape[0] * shape[1]``.
    label : object
        Value shown as the title of the plot.
    shape : tuple of int, default (28, 28)
        (rows, columns) used to rebuild the 2-D image from ``pixels``.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used,
        which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        The pixel values reshaped to ``shape``.

    Raises
    ------
    ValueError
        Raised by numpy's ``reshape`` when the number of pixels does not
        match ``shape``.
    """
    img = np.array(pixels).reshape(shape)
    if ax is None:
        # No explicit axes given: draw through the pyplot state machine.
        plt.title(label)
        plt.imshow(img, cmap='gray')
    else:
        ax.set_title(label)
        ax.imshow(img, cmap='gray')
    return img

In [17]:
# What does one image look like?
# Render the sample at row position 186, titled with its label, and keep the
# reshaped pixel array returned by show_image.
sample_row = 186
img = show_image(data.iloc[sample_row], target.iloc[sample_row])

In [18]:
# What about each pixel?
# Wrap the 2-D pixel array in a DataFrame and print it through to_string()
# so the individual intensity values can be read as a text grid.
pd_img = pd.DataFrame(img)
print(pd_img.to_string())

## Outliers

In [19]:
from sklearn.ensemble import IsolationForest

In [20]:
# Fit an isolation forest on the pixel features.
# contamination=0.01 is the proportion of samples to be flagged as outliers;
# random_state=42 fixes the seed so the result is reproducible.
clf = IsolationForest(random_state=42, contamination=0.01).fit(data)

In [21]:
# Boolean mask over the rows of `data`: True where the forest predicts -1 (outlier).
outliers = clf.predict(data) == -1

In [22]:
# How many outliers?
# Each True in the mask is one flagged row, so summing the mask counts them.
n_outliers = int(outliers.sum())
print(f"{n_outliers} outliers detected")

In [23]:
# Show the first detected outliers side by side, each titled with its label.
# The boolean filtering is done once, outside the loop, instead of twice per
# iteration, and the number of panels is capped by the number of outliers so
# the loop cannot index past the end when fewer than 20 were detected.
outlier_images = data[outliers]
outlier_labels = target[outliers]
n_shown = min(20, len(outlier_images))

plt.figure(figsize=(20, 3))
for i in range(n_shown):
    plt.subplot(1, 20, i + 1)
    show_image(outlier_images.iloc[i], outlier_labels.iloc[i])

In [24]:
# Remove identified outliers.
# The forest's predictions are either -1 (outlier) or 1 (inlier), so the inlier
# mask is the complement of the `outliers` mask computed earlier; reusing it
# avoids a second full prediction pass over the data.
normal = ~outliers
data = data[normal]
target = target[normal]
data.shape

## Classification

Solution from the scikit-learn example: https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html

In [25]:
# Update sklearn just in case.
# I mainly do this because I am on Kaggle.
!pip install -U scikit-learn

In [26]:
# Import datasets, classifiers and performance metrics
from sklearn.svm import SVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

In [27]:
# Split data into 70% train and 30% test subsets (test_size=0.3).
# shuffle=False keeps the original row order: the first rows become the
# training set and the last rows the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, shuffle=False
)

In [28]:
# Create a classifier: a support vector classifier with default parameters.
# NOTE: this rebinds `clf`, which until now held the fitted IsolationForest;
# re-running the outlier cells after this point would call the SVC instead.
clf = SVC()

In [29]:
# Learn the digits on the train subset (pixel features -> digit labels).
clf.fit(X_train, y_train)

In [30]:
# Predict the value of the digit on the test subset; `predicted` is compared
# against `y_test` in the evaluation cells below.
predicted = clf.predict(X_test)

In [31]:
# Text summary of how the predictions compare with the true test labels.
report = classification_report(y_test, predicted)
print(f"Classification report for classifier {clf}:\n{report}\n")

In [32]:
# disp = ConfusionMatrixDisplay.from_predictions(y_test, predicted)
# disp.figure_.suptitle("Confusion Matrix")

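#### This does not work on Kaggle as-is. ####
# It fails with: AttributeError: type object 'ConfusionMatrixDisplay' has no attribute 'from_predictions'
# Likely cause: `from_predictions` only exists in scikit-learn >= 1.0, and the kernel is probably
# still running the older preinstalled version, because sklearn was imported before the
# `pip install -U scikit-learn` cell ran. Restarting the kernel after the upgrade should fix it.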

Documentation of the function: [click here](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html#sklearn.metrics.ConfusionMatrixDisplay.from_predictions)

In [33]:
import seaborn as sns

In [34]:
# Confusion matrix of the predictions on the test subset.
# Rows are the true labels, columns the predicted labels; the class values are
# passed explicitly so both heatmap axes are labelled with the actual classes.
labels = np.union1d(y_test, predicted)
cm = confusion_matrix(y_test, predicted, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# fmt="d" prints the integer counts in full; seaborn's default ".2g" format
# would show counts of three or more digits in scientific notation (e.g. 1.2e+03).
ax = sns.heatmap(cm_df, annot=True, fmt="d")
ax.set_title("SVC classifier")
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label');