In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt 
import seaborn as sns

import random

import tensorflow as tf
print("TF_VERSION:", tf.__version__)

# Seed every random number generator used here (TensorFlow, Python's
# `random`, NumPy) with the same value so repeated runs are comparable.
for seed_setter in (tf.random.set_seed, random.seed, np.random.seed):
    seed_setter(0)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively print the full path of every file found under the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Dataset

In [None]:
# Read the training and test CSVs; the trailing tuple is the cell's displayed
# value (the shapes of both frames).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/digit-recognizer/train.csv',
        '/kaggle/input/digit-recognizer/test.csv',
    )
)
train_df.shape, test_df.shape

#### Check for NULL values

In [None]:
# Total number of missing cells in each frame, shown as a (train, test) pair.
tuple(frame.isnull().sum().sum() for frame in (train_df, test_df))

#### Creating Numpy array from dataframe
- Image: (-1, 784)
- Label: (-1,)



In [None]:
# Convert the frames to NumPy arrays: every column from 'pixel0' onwards is
# image data, and the training frame's `label` column is the target.
pixel_columns = slice('pixel0', None)
X_train = np.array(train_df.loc[:, pixel_columns])
y_train = np.array(train_df.label)
X_test = np.array(test_df.loc[:, pixel_columns])
print(X_train.shape, y_train.shape, X_test.shape)

#### Reshape each image from 1D to 2D (the whole dataset goes from 2D to 3D)
- Image: (-1, 28, 28)
- Label: (-1,)

In [None]:
# Turn every flat pixel row into a 28x28 grid; the labels are left untouched.
X_train, X_test = (
    flat_images.reshape(-1, 28, 28) for flat_images in (X_train, X_test)
)
print(X_train.shape, y_train.shape, X_test.shape)

## Plotting

#### Check for number of samples per target variable

In [None]:
# Count how many training samples carry each label and draw one bar per label.
count_array = np.unique(y_train, return_counts=True)
label_values, sample_counts = count_array

fig, ax = plt.subplots()
ax.bar(label_values, sample_counts)
ax.set_xticks(label_values)
ax.set_xlabel("Target Variable")
ax.set_ylabel("Samples")
ax.set_title("Samples per target variable")
plt.show()

#### Let's plot some sample images

In [None]:
# Show the first 25 training images in a 5x5 grid, each with its own colour
# bar and its label printed underneath; axis ticks are hidden.
plt.figure(figsize=(10, 10))
first_samples = zip(X_train[:25], y_train[:25])
for grid_position, (image, label) in enumerate(first_samples, start=1):
    plt.subplot(5, 5, grid_position)
    plt.imshow(image)
    plt.colorbar()
    plt.xlabel(label)
    plt.xticks([])
    plt.yticks([])
plt.show()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout, BatchNormalization

from tensorflow.keras.callbacks import ReduceLROnPlateau

from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split 

In [None]:
# Append a trailing axis of length 1 so each image is 28x28x1, matching the
# input_shape=(28,28,1) of the first Conv2D layer.
X_train, X_test = (
    images.reshape(-1, 28, 28, 1) for images in (X_train, X_test)
)
print(X_train.shape, y_train.shape, X_test.shape)

In [None]:
# Preprocessing: divide by 255 so pixel values move from (0, 255) to (0, 1).
# NOTE: this rebinds the same names, so re-running the cell divides again.
X_train, X_test = X_train / 255.0, X_test / 255.0

In [None]:
# Cross-check the preprocessing: display the min and max of BOTH image sets.
# The fourth value used to repeat X_train.max(), so the test set's maximum
# was never actually inspected; it now reports X_test.max().
X_train.min(), X_train.max(), X_test.min(), X_test.max()

In [None]:
# One-hot encode the labels into 10 columns, then display the resulting shape.
# NOTE: this rebinds y_train, so the cell must be run only once per kernel.
n_label_classes = 10
y_train = to_categorical(y_train, num_classes=n_label_classes)
y_train.shape

In [None]:
# Hold out 33% of the training data for validation, with a fixed
# random_state for the shuffle.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    random_state=42,
)
X_train, X_valid, y_train, y_valid = split_parts

## Model Training 

In [None]:
# A simple CNN: two convolutional stages followed by a dense classifier.

# Feature-learning part. Each stage is two 3x3 same-padded ReLU convolutions,
# batch normalisation and 2x2 max pooling (8 filters, then 16), followed by
# dropout and flattening.
feature_layers = [
    Conv2D(8, (3, 3), padding='same', input_shape=(28, 28, 1), activation='relu'),
    Conv2D(8, (3, 3), padding='same', activation='relu'),
    BatchNormalization(),
    MaxPool2D(2, 2),
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    BatchNormalization(),
    MaxPool2D(2, 2),
    Dropout(0.2),
    Flatten(),
]

# Classification part: two 128-unit ReLU layers, dropout, and a 10-way softmax.
classifier_layers = [
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
]

model = Sequential(feature_layers + classifier_layers)
model.summary()

# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Callback configured to watch validation loss, with a reduction factor of 0.5
# and a patience of 3 epochs; verbose=1 logs each learning-rate change.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    verbose=1,
)

In [None]:
# Train for 20 epochs in mini-batches of 32, evaluating on the held-out
# validation split each epoch, with the learning-rate callback attached.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_valid, y_valid),
    callbacks=[reduce_lr],
)

## Model validation

#### checking model accuracy

In [None]:
# Class probabilities for the validation images, then the index of the most
# probable class for each image.
prediction = model.predict(X_valid, verbose=1)
prediction_arg_max = np.argmax(prediction, axis=1)

In [None]:
# Undo the one-hot encoding of the validation labels to get class indices.
y_valid_arg_max = np.argmax(y_valid, axis=1)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix on the validation split. confusion_matrix(y_true, y_pred)
# puts the true classes on the rows and the predicted classes on the columns,
# so the heatmap's y-axis is the true label and its x-axis the prediction.
# The axes are labelled so the figure can be read on its own.
cm = confusion_matrix(y_valid_arg_max, prediction_arg_max)
plt.figure(figsize=(10,5))
sns.heatmap(pd.DataFrame(cm), annot=True, cmap='Blues', fmt='g')
plt.title("Confusion Matrix")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

## Save submission

In [None]:
# Predict class probabilities for the test images and keep only the index of
# the most probable class for each one.
pred = model.predict(X_test, verbose=1).argmax(axis=1)

In [None]:
# Load the sample submission to use as a template for the output file.
# It is deliberately NOT written to "submission.csv" at this point: doing so
# would save the sample file's own labels rather than model predictions, and
# that placeholder would be left on disk if a later cell failed. The file is
# written once, at the end of the notebook, after `Label` has been replaced.
sample_df = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')

In [None]:
# Preview the first five rows of the submission frame.
sample_df.head(5)

In [None]:
# Rows per label in the sample file as loaded, i.e. before any model
# predictions have been written into the `Label` column.
sample_df.groupby('Label').size()

In [None]:
# Replace the `Label` column with the model's predicted classes for the test set.
sample_df = sample_df.assign(Label=pred)

In [None]:
# Rows per label again, now that `Label` holds the model's predictions.
sample_df.groupby('Label').size()

In [None]:
# Write the final submission (without the DataFrame index) to the working directory.
submission_path = "submission.csv"
sample_df.to_csv(submission_path, index=False)