# Intro
Welcome to the famous [MNIST](https://www.kaggle.com/c/digit-recognizer) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/3004/logos/header.png)
This notebook is starter code aimed at beginners and is meant to be easy to understand. A simple CNN is used to predict the test data.

<span style="color: royalblue;">Please vote the notebook up if it helps you. Thank you. </span>

# Load Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
# Print the contents of the ../input directory so the available data files
# are visible in the cell output.
print(os.listdir("../input"))

In [None]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop,Adam
from keras.preprocessing.image import ImageDataGenerator

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Load Data

In [None]:
# Read the competition CSV files into DataFrames.
# train_data carries a 'label' column (split off below) next to the pixel
# columns; test_data is used as-is as the model input.
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

# Define X_train, y_train and X_test

In [None]:
# Targets: one-hot encode the digit labels into 10 classes.
y_train = to_categorical(train_data['label'], num_classes=10)

# Features: every column except the label (drop returns a new frame, so
# train_data itself stays untouched).
X_train = train_data.drop(columns='label')
X_test = test_data.copy()

In [None]:
# Turn each flat pixel row into a 28x28 single-channel image; -1 lets NumPy
# infer the number of samples.
X_train = np.reshape(X_train.values, (-1, 28, 28, 1))
X_test = np.reshape(X_test.values, (-1, 28, 28, 1))

# Distribution Of Pixel Color
For every digit in the training dataset we calculate the mean frequency of each pixel color value (0-255) per image.

In [None]:
# Build df_numbers: rows are pixel values 0..255, columns are digits 0..9,
# and each cell holds the mean number of pixels per image with that value.
#
# The previous per-image value_counts() loop fixed the set of pixel values
# from the first image of each digit (later columns were aligned to that
# index), averaged only over images that contained a value, and inserted one
# DataFrame column per image, which is very slow. A single bincount over all
# images of a digit gives the true per-image mean.
# Assumes pixel values are non-negative integers in 0..255 and that the
# label is the first column (as the original slicing did).
pixel_columns = train_data.columns[1:]
df_numbers = pd.DataFrame(index=range(0, 256))
for i in range(10):
    digit_pixels = train_data.loc[train_data['label'] == i, pixel_columns].values
    # Total occurrences of every pixel value across all images of digit i.
    value_totals = np.bincount(
        digit_pixels.astype(np.int64, copy=False).ravel(), minlength=256
    )[:256]
    # Divide by the image count; guard against a digit with no samples.
    df_numbers[i] = value_totals / max(len(digit_pixels), 1)

# Plot one bar chart per pixel value 250..255, showing that value's mean
# frequency for every digit. The duplicate fillna(0) that used to open this
# section was dropped: df_numbers is already NaN-free at this point.
first_color = 250  # first pixel value to plot; six consecutive values are shown
fig, axs = plt.subplots(2, 3, figsize=(22, 6))
fig.subplots_adjust(hspace=.5, wspace=.5)
axs = axs.ravel()
for ax, color in zip(axs, range(first_color, first_color + 6)):
    ax.bar(df_numbers.columns, df_numbers.iloc[color])
    ax.grid()
    ax.set_title('color: ' + str(color))
    ax.set_xticks(df_numbers.columns)
    ax.set_xlabel('number')
    ax.set_ylabel('mean frequence')

As we can see, the distributions of the non-zero pixel colors (0 is black) for the digits 8 and 9 are similar.

# Scale data

In [None]:
# Rescale both image arrays to float32 by dividing every pixel by 255.
X_train, X_test = (
    pixels.astype('float32') / 255 for pixels in (X_train, X_test)
)

# Some Examples

In [None]:
# Show the first training image of every digit 0..9 in a 2x5 grid.
fig, axs = plt.subplots(2, 5, figsize=(15, 6))
fig.subplots_adjust(hspace=.5, wspace=.5)
axs = axs.ravel()
for digit, ax in enumerate(axs):
    # Index label of the first row with this digit; it is used as a position
    # into X_train, which presumably relies on train_data having the default
    # 0..n-1 index - verify if the frame is ever re-indexed.
    idx = train_data.index[(train_data['label'] == digit).values][0]
    ax.imshow(X_train[idx, :, :, 0], cmap='gray')
    # Title is the class decoded from the one-hot label.
    ax.set_title(y_train[idx].argmax())
    ax.set_xticklabels([])
    ax.set_yticklabels([])

# Split train data to get val data

In [None]:
# Hold out 10% of the training samples for validation; the fixed seed keeps
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=2020,
)

# Define model
We use a simple CNN model:

In [None]:
# Simple CNN: two convolution/pooling stages followed by a dense classifier.
model = Sequential([
    # Stage 1: two 5x5 convolutions with 32 filters, 2x2 max-pooling, dropout.
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu', input_shape=(28, 28, 1)),
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Stage 2: two 3x3 convolutions with 64 filters, 2x2 max-pooling, dropout.
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
           activation='relu'),
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.25),

    # Classifier head: flatten, one hidden dense layer, 10-way softmax output.
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax'),
])

In [None]:
# RMSprop optimizer. `learning_rate` replaces the deprecated `lr` alias, and
# the `decay=0.0` argument (a no-op, since 0.0 means no decay) is dropped;
# newer Keras optimizers reject both `lr` and `decay`.
optimizer = RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-08)

In [None]:
# Configure training: categorical cross-entropy on the one-hot labels,
# tracking accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the model defined above.
model.summary()

In [None]:
# Number of training epochs passed to the training call below.
epochs = 50
# Batch size used by the augmentation generator; also determines
# steps_per_epoch (number of training samples // batch_size).
batch_size = 378

# Define the ImageDataGenerator

In [None]:
# Augmentation pipeline: small random rotations, zooms and shifts only.
datagen = ImageDataGenerator(
    # Geometric augmentations that are switched on.
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    # Flips stay off.
    horizontal_flip=False,
    vertical_flip=False,
    # No centering, std-normalization or whitening.
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
)
# NOTE(review): fit() presumably only matters for the featurewise/ZCA options,
# which are all disabled here - confirm against the Keras docs before removing.
datagen.fit(X_train)

# Train model

In [None]:
# Train on augmented batches and validate on the untouched hold-out split.
# Model.fit accepts the generator directly; fit_generator is deprecated and
# no longer available in current Keras releases.
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=batch_size),
    # One epoch = one pass over the training samples in full batches.
    steps_per_epoch=X_train.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

# Predict test data

In [None]:
# Model output for every test image; the class with the highest score per row
# is extracted in the next step.
y_test = model.predict(X_test)

In [None]:
# Predicted digit per test image = index of the highest score in each row.
y_test_classes = y_test.argmax(axis=1)

# Write Output for Submission

In [None]:
# Build the submission table: 1-based ImageId next to the predicted Label,
# then write it without the DataFrame index.
num = range(1, len(y_test) + 1)
output = pd.DataFrame({'Label': y_test_classes})
output.insert(0, 'ImageId', num)
output.to_csv('submission.csv', index=False)

# Analyse results

In [None]:
# Plot training vs. validation loss per epoch.
loss = history.history['loss']
loss_val = history.history['val_loss']
# Use a dedicated name for the x-axis: rebinding `epochs` to a range would
# clobber the hyperparameter and break a re-run of the training cell.
epoch_axis = range(1, len(loss) + 1)
plt.plot(epoch_axis, loss, 'bo', label='loss_train')
plt.plot(epoch_axis, loss_val, 'b', label='loss_val')
plt.title('value of the loss function')
plt.xlabel('epochs')
plt.ylabel('value of the loss function')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Plot training vs. validation accuracy per epoch.
acc = history.history['accuracy']
acc_val = history.history['val_accuracy']
# Derive the x-axis from `acc` itself so this cell does not depend on `loss`
# from the previous cell, and keep the `epochs` hyperparameter intact instead
# of rebinding it to a range.
epoch_axis = range(1, len(acc) + 1)
plt.plot(epoch_axis, acc, 'bo', label='accuracy_train')
plt.plot(epoch_axis, acc_val, 'b', label='accuracy_val')
plt.title('accuracy')
plt.xlabel('epochs')
plt.ylabel('value of accuracy')
plt.legend()
plt.grid()
plt.show()

# Analyse Wrong Predictions
We want to analyse the wrong predictions on the validation dataset.

In [None]:
# Model output for the validation images; compared against y_val in the
# confusion matrix below.
y_val_pred = model.predict(X_val)

In [None]:
# Confusion matrix on the validation split. Both the one-hot labels and the
# model outputs are reduced to class indices via argmax first.
val_true_classes = y_val.argmax(axis=1)
val_pred_classes = y_val_pred.argmax(axis=1)
conf_mat = confusion_matrix(val_true_classes, val_pred_classes)

# Show row-normalised rates only (no absolute counts).
fig, ax = plot_confusion_matrix(conf_mat=conf_mat,
                                show_normed=True,
                                show_absolute=False,
                                figsize=(8, 8))
# plt.show() matches the other plotting cells; fig.show() only warns on
# non-GUI (notebook) backends instead of displaying the figure.
plt.show()