In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# Digit-Recognition by Convolutional Neural Networks (CNN)

### 1. Background

#### 1.1 Artificial neural networks

[Artificial neural networks](https://en.wikipedia.org/wiki/Artificial_neural_network) are computing systems that are inspired by, but not identical to, biological neural networks that constitute animal brains. Such systems "learn" to perform tasks by considering examples, generally without being programmed with task-specific rules.

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/46/Colored_neural_network.svg/800px-Colored_neural_network.svg.png" width="250">

#### 1.2 Convolutional neural network (CNN)

In deep learning, a convolutional neural network (CNN, or ConvNet) is a class of deep neural networks, most commonly applied to analyzing visual imagery. For example, in image recognition, they might learn to identify images that contain cats by analyzing example images that have been manually labeled as "cat" or "no cat" and using the results to identify cats in other images. They do this without any prior knowledge of cats, for example, that they have fur, tails, whiskers and cat-like faces. Instead, they automatically generate identifying characteristics from the examples that they process.

Here is a simple example of a standard 2D CNN:

<img src="https://miro.medium.com/max/2510/1*vkQ0hXDaQv57sALXAJquxA.jpeg" height="100">

The input dimension is (w, n, 3): a w×n image with 3 channels (R, G, B).

#### 1.2.1 CNN Layers

1.2.1.1 Convolutional Layers

The major building block of a CNN is the convolutional layer, which applies a filter to the input to produce an activation. Repeatedly applying the filter (kernel) across the input creates a feature map that summarizes the presence of detected features in the input.

<img src="https://www.researchgate.net/profile/Baptiste_Wicht/publication/322505397/figure/fig5/AS:583063998308353@1516024698839/A-valid-convolution-of-a-5x5-image-with-a-3x3-kernel-The-kernel-will-be-applied-to.png" width="200">

1.2.1.2 Pooling Layer

Pooling is used to down-sample the feature maps by summarizing the presence of features in patches of the feature map. There are many pooling methods; one of the most common is 'max pooling', which keeps the maximum activation of a feature in each patch.

<img src="https://datascience-enthusiast.com/figures/max_pool1.png" width="500">

1.2.1.3 Fully Connected Layer

After the two previous steps of the CNN process have broken the image down into features and analyzed them independently, the result is flattened and fed into a fully connected neural network structure that drives the final classification decision.

<img src="https://cdn-images-1.medium.com/max/600/1*yjy3dwRL-vmSpmUG7UNJYg@2x.png" width=200>

#### 1.3 MNIST

[MNIST](https://www.kaggle.com/c/digit-recognizer) ("Modified National Institute of Standards and Technology") is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.

<img src="https://corochann.com/wp-content/uploads/2017/02/mnist_plot.png" width="300">

#### 1.4 Images
An RGB image is stored as a 3D numpy array (rows, columns, channels), where rows is the height of the image and columns is the width of the image.
The channels are the Red, Green and Blue components of each individual pixel. For example, a (0,0,0) pixel is displayed as black, and a pixel whose color components are (255,255,255) is displayed as white.

<img src="https://summations.github.io/assets/img/posts/channelplot/image-matrix.png" width="600">

In our case, we will feed the CNN model with grayscale images. A grayscale image is one in which the value of each pixel is a single sample representing only an amount of light.
Such an image is stored as (rows, columns, 1); in our case it will be (28, 28, 1).

### 2. Problem

The goal is to correctly identify digits from a dataset of tens of thousands of handwritten images.

### 3. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation, MaxPool2D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam,RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import confusion_matrix

### 4. Gathering data

In [None]:
# Load the competition CSV files into DataFrames.
# `train` carries a `label` column that is split off as the target further down;
# `test` is the set that is scored at the end of the notebook to build the submission file.
train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

### 5. EDA - Exploratory data analysis

In [None]:
train.shape

42,000 images to train and learn from.

In [None]:
train.head()

As you can see, we have a label which defines the actual digit, and 784 pixel columns (pixel0 to pixel783).
I will separate the train data into (X, y): y will be the label only, and X will be the whole data without the label.

In [None]:
# Separate the target column from the pixel features.
y = train['label']                  # digit label of every image
X = train.drop(columns=['label'])   # every remaining column

In [None]:
test.shape

28,000 images to test

Let us count the labels.

In [None]:
train['label'].value_counts()

In [None]:
# Bar chart of how many training images exist for each digit label.
# The column is named by keyword: recent seaborn releases interpret the first
# positional argument as `data` rather than `x`, so the positional form no
# longer plots the label counts reliably.
sns.countplot(x='label', data=train);

### 6. Preprocessing the data

In [None]:
X.shape

Let us take a look to the first row

In [None]:
first_row = X.iloc[0].copy()

reshape it to 28x28

In [None]:
first_mat = first_row.values.reshape(28,28)

now we can plot that image

In [None]:
plt.imshow(first_mat)

it looks like the digit ```1```

let us plot the first 10 images

In [None]:
# Show the first ten training images in a 2x5 grid, each captioned with its label.
fig, axes = plt.subplots(2, 5, figsize=(15, 10))
for position, ax in enumerate(axes.flat):
    ax.imshow(X.iloc[position].values.reshape(28, 28))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.grid(False)
    # Positional lookup, matching X.iloc above, so image and caption stay paired
    # even when the index is not the default 0..n-1 range.
    ax.set_xlabel(y.iloc[position])

### 7. Neural network model

#### 7.1 Define Variables

In [None]:
input_shape = (28, 28, 1)

Let's see how many unique labels we have.

In [None]:
unique_labels = y.unique()

In [None]:
unique_labels

In [None]:
num_labels = len(unique_labels)

In [None]:
num_labels

#### 7.2 Train/Test Split

In [None]:
# Hold out 30% of the labelled data; random_state fixes the shuffle so the split is reproducible.
# The held-out part (X_test / y_test) is used later both as validation data during
# training and for the final evaluation of the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
y_train.shape

#### 7.4 Reshaping and Scaling data

In [None]:
X_train.shape

In [None]:
# Turn every flat row of pixels into a 28x28 single-channel image:
# resulting shape is (n_samples, 28, 28, 1).
# np.asarray accepts both the DataFrame produced by the split and an array that
# was already converted, so re-running this cell no longer fails on `.values`.
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)
X_test = np.asarray(X_test).reshape(-1, 28, 28, 1)

In [None]:
X_train.shape

In [None]:
# Rescale the pixel intensities by 1/255 (presumably from 0-255 to 0-1 — verify against the data).
# NOTE: this cell is not idempotent; running it twice divides the values twice.
X_train, X_test = X_train / 255., X_test / 255.

#### 7.5 Build Neural Network Model

In [None]:
###  Model Definition
# Two convolutional stages followed by a dense classifier. Layers are listed in
# forward order; the shape notes assume the 28x28x1 input declared in `input_shape`.
# The input shape and the size of the output layer come from the variables defined
# earlier in the notebook instead of being repeated as literals.
model = Sequential([
    # --- stage 1 ---------------------------------------------------------
    # 32 filters of size 5x5, no padding, relu activation -> 24x24x32
    Conv2D(filters=32, kernel_size=(5, 5), padding='valid', activation='relu',
           input_shape=input_shape),
    # 32 filters of size 3x3, padded so the spatial size is kept -> 24x24x32
    Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu'),
    # 2x2 max pooling summarizes the presence of features in patches of the
    # feature map -> 12x12x32
    MaxPool2D(pool_size=(2, 2)),
    # randomly switch units off during training to reduce interdependent
    # learning amongst the neurons
    Dropout(0.2),

    # --- stage 2 ---------------------------------------------------------
    # 64 filters of size 5x5, no padding, relu activation -> 8x8x64
    Conv2D(filters=64, kernel_size=(5, 5), padding='valid', activation='relu'),
    # 64 filters of size 3x3, padded so the spatial size is kept -> 8x8x64
    Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'),
    # 2x2 max pooling with stride 2 -> 4x4x64
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.2),

    # --- classifier ------------------------------------------------------
    # flatten the feature maps into one vector per image
    Flatten(),
    # densely-connected layer that drives the final classification decision
    Dense(519, activation="relu"),
    Dropout(0.5),
    # one softmax output per class so the result can be read as probabilities
    Dense(num_labels, activation="softmax"),
])


Let's display the architecture of our model.

In [None]:
model.summary()

#### 7.6 Model Compiling and Training

Before training the model, we need to compile :
* Loss function — This measures how accurate the model is during training. You want to minimize this function to "steer" the model in the right direction.
* Optimizer —This is how the model is updated based on the data it sees and its loss function.
* Metrics —Used to monitor the training and testing steps. The following example uses accuracy, the fraction of the images that are correctly classified

In [None]:
# model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3), metrics=["accuracy"])

In [None]:
# Configure training: RMSprop with its default settings, categorical
# cross-entropy as the loss (targets are one-hot encoded in the next step),
# and accuracy as the reported metric.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

because we are using the ```categorical_crossentropy``` loss method, we need to convert ```y_train```, ```y_test``` using one hot encoder.

In [None]:
# One-hot encode both label vectors with `num_labels` columns each.
# NOTE: not idempotent — run once per fresh split.
y_train, y_test = (
    to_categorical(labels, num_classes=num_labels)
    for labels in (y_train, y_test)
)

#### 7.7 Reduce Learning Rate (LR)

Reduce learning rate when a metric has stopped improving.
Models often benefit from reducing the learning rate by a factor of 2-10 once learning stagnates. This callback monitors a quantity and if no improvement is seen for a 'patience' number of epochs, the learning rate is reduced.

In [None]:

# Halve the learning rate (factor=0.5) when the validation accuracy has not
# improved for 2 consecutive epochs, never going below min_lr.
# The monitored key must match the name Keras logs: the model is compiled with
# metrics=['accuracy'] and the training history is read back as 'val_accuracy'
# later in this notebook, so 'val_acc' would never be found and the callback
# would silently do nothing.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', patience=2, verbose=2, factor=0.5, min_lr=0.0000001)

#### 7.8 Image processing

Because we want to reduce over-fitting, I will use the data augmentation technique.
Data augmentation applies transformations to an image according to given parameters (for example rotations, shears and zooms), which makes the model learn to generalize rather than memorize specific data. If the model overfits, it will perform very well on the images it already knows but will fail when new images are given to it.

In [None]:
# Augmentation settings, grouped by purpose. See the Keras ImageDataGenerator
# documentation for the exact units of each range.
augmentation_settings = {
    # centering / normalization / whitening: all switched off
    'featurewise_center': False,
    'samplewise_center': False,
    'featurewise_std_normalization': False,
    'samplewise_std_normalization': False,
    'zca_whitening': False,
    # small random geometric perturbations
    'rotation_range': 10,
    'zoom_range': 0.1,
    'width_shift_range': 0.1,
    'height_shift_range': 0.1,
    # flips are disabled
    'horizontal_flip': False,
    'vertical_flip': False,
}
img_data_gen = ImageDataGenerator(**augmentation_settings)



In [None]:
# epochs     - One Epoch is when an ENTIRE dataset is passed forward and backward through the neural network only ONCE
num_epochs = 1 # replace it to 30
# batch size - Total number of training examples present in a single batch.
batch_size = 64

# Training batches are randomly augmented by img_data_gen.
train_generator = img_data_gen.flow(X_train, y_train, batch_size=batch_size)

# Validation batches come from a generator without any transformation: feeding
# randomly distorted images to validation would make the reported validation
# metrics noisy and not comparable with the evaluation on the plain hold-out set.
# shuffle=False keeps the batches in the order of X_test / y_test.
test_generator = ImageDataGenerator().flow(X_test, y_test, batch_size=batch_size, shuffle=False)

In [None]:
# Save the model to disk
# NOTE(review): in top-to-bottom execution order this cell runs before the
# training cell that follows, so the file written here holds the compiled but
# still untrained model.
model.save('MNIST-1.h5')


start train

In [None]:


# Train on the augmented batches and validate after every epoch; `history`
# keeps the per-epoch metrics that are plotted later.
# NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases
# in favour of fit — confirm against the installed version before switching.
history = model.fit_generator(
    train_generator,
    epochs=num_epochs,
    validation_data=test_generator,
    callbacks=[reduce_lr],
)

# Persist the trained model. The earlier save cell runs before training, so
# saving again here makes the file on disk contain the fitted weights.
model.save('MNIST-1.h5')

### 8. Model Evaluation

In [None]:
# Evaluate on the un-augmented hold-out split. The result holds the loss first
# and then the accuracy metric the model was compiled with.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Confusion matrix

In [None]:
# Convert one-hot targets and predicted score vectors to class indices with a
# single vectorised argmax over the class axis instead of a Python loop per row.
y_true = np.argmax(y_test, axis=1)
predictions = model.predict(X_test)
y_pred = np.argmax(predictions, axis=1)

# Rows of the confusion matrix are the true classes, columns the predicted ones.
plt.figure(figsize=(15, 8))
ax = sns.heatmap(confusion_matrix(y_true, y_pred), cmap="coolwarm", annot=True, fmt="d")
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');

accuracy vs. validation accuracy

In [None]:
# Training vs. validation accuracy per epoch, read from the fit history.
fig, ax = plt.subplots()
for metric_name in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[metric_name], label=metric_name)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_ylim([0.5, 1])
ax.legend(loc='lower right')

Let us take a look at the first prediction

In [None]:
predictions[0]

We can see that the prediction is an array of 10 elements, each element representing the model "confidence" value for the corresponding class.
and to get the correct prediction we need to get the Max "confidence" value for each input

In [None]:
np.argmax(predictions[0])

The model says that the first digit is ```3```; let's see the actual class

In [None]:
np.argmax(y_test[0])

Cool the model is right !

Let us see the first 10 prediction images

In [None]:
def plot_image(i, predictions_array, true_label, img):
    """Draw image ``i`` on the current axes with a prediction caption.

    Parameters
    ----------
    i : int
        Position used to index ``true_label`` and ``img``.
    predictions_array : array-like
        Scores for this single image, one entry per class.
    true_label : sequence
        True class of every image; only element ``i`` is used.
    img : sequence
        Images; only element ``i`` is drawn.

    The caption reads "<predicted> <top score as percent>% (<true>)" and is
    blue when the predicted class equals the true class, red otherwise.
    """
    actual = true_label[i]
    picture = img[i]

    # Bare image: no grid, no ticks.
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(picture, cmap=plt.cm.binary)

    guess = np.argmax(predictions_array)
    caption = "{} {:2.0f}% ({})".format(guess,
                                        100*np.max(predictions_array),
                                        actual)
    plt.xlabel(caption, color='blue' if guess == actual else 'red')

def plot_value_array(i, predictions_array, true_label):
    """Draw the per-class scores of one prediction as a bar chart on the current axes.

    Parameters
    ----------
    i : int
        Position used to index ``true_label``.
    predictions_array : array-like
        Scores for this single image, one entry per class. The number of bars
        follows its length instead of being fixed at 10.
    true_label : sequence
        True class of every image; only element ``i`` is used.

    All bars are grey, the bar of the predicted class is coloured red and the
    bar of the true class blue. The true class is coloured last, so a correct
    prediction shows a single blue bar.
    """
    true_class = true_label[i]
    class_ids = range(len(predictions_array))

    plt.grid(False)
    plt.xticks(class_ids)
    plt.yticks([])
    bars = plt.bar(class_ids, predictions_array, color="#777777")
    plt.ylim([0, 1])

    predicted_class = np.argmax(predictions_array)
    bars[predicted_class].set_color('red')
    bars[true_class].set_color('blue')

In [None]:
# Show the first num_rows*num_cols test images; each image is followed by the
# bar chart of its predicted scores.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols

# Loop-invariant inputs, computed once instead of twice per iteration.
true_labels = np.argmax(y_test, axis=1)      # one-hot rows -> class indices
test_images = X_test.reshape(-1, 28, 28)     # drop the channel axis for imshow

plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
    plt.subplot(num_rows, 2*num_cols, 2*i+1)
    plot_image(i, predictions[i], true_labels, test_images)
    plt.subplot(num_rows, 2*num_cols, 2*i+2)
    plot_value_array(i, predictions[i], true_labels)
plt.tight_layout()
plt.show()

Let's display the model errors

In [None]:
errors = pd.DataFrame(np.argmax(y_test, axis=1), columns=['label'])

In [None]:
errors.reset_index(inplace=True)

In [None]:
errors

In [None]:
errors['predictions'] = y_pred

In [None]:
# 1 where the prediction disagrees with the label, 0 otherwise.
# The whole column is assigned at once: setting only the mismatching rows left
# NaN everywhere else and kept stale flags when the cell was re-run after the
# predictions changed. Filters of the form errors['error'] == 1 work unchanged.
errors['error'] = (errors['label'] != errors['predictions']).astype(int)

In [None]:
errors[errors['error']==1]

In [None]:
num_errors = len(errors[errors['error']==1].index)

In [None]:
print("number of errors is: {}".format(num_errors))

In [None]:
err_index = errors[errors['error']==1].index

In [None]:
# Show up to ten misclassified hold-out images with their true and predicted digit.
# The filter is evaluated once, and the loop is bounded by the number of errors
# actually present so a model with fewer than ten mistakes does not raise IndexError.
misclassified = errors.index[errors['error'] == 1]
num_shown = min(10, len(misclassified))

plt.figure(figsize=(15,10))
for i in range(num_shown):
    err_index = misclassified[i]
    plt.subplot(2,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_test[err_index].reshape(28,28))
    plt.xlabel("true is {}, predicted as {}".format(np.argmax(y_test[err_index]), y_pred[err_index]))

In [None]:

# Scale and reshape the competition test set exactly like the training images:
# divide by 255, then reshape to (n_samples, 28, 28, 1).
# Done in one expression so `test` is only rebound when both steps succeed; the
# two-step version left `test` scaled but not reshaped (and scaled twice) when
# the cell was re-run.
test = (test / 255).values.reshape(-1, 28, 28, 1)

In [None]:
final_predictions = model.predict(test)

In [None]:
final_predictions

In [None]:
# Pick the most probable class for every image.
# The scores must NOT be rounded first: whenever the highest probability is at
# most 0.5 every entry rounds to 0 and argmax then returns class 0 regardless
# of which class the model actually preferred.
final_predictions = np.argmax(final_predictions, axis=1)

In [None]:
final_predictions[:10]

In [None]:
# Build the two-column submission table (ImageId counted from 1, then Label)
# and write it without the DataFrame index.
image_id = pd.Series(range(1, len(final_predictions) + 1), name="ImageId")
predicted_labels = pd.Series(final_predictions, name="Label")

results = image_id.to_frame().join(predicted_labels)

results.to_csv("MNIST.csv", index=False)