In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading dataset

In [None]:
# Load the three competition CSVs from the Kaggle input directory:
# train.csv (contains the `label` column used further down), test.csv and
# the sample submission file.
train_df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test_df = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
submission_df = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')

#### Getting general idea about the dataset

In [None]:
# (rows, columns) of the training frame
train_df.shape

In [None]:
# Preview the first rows of the training frame
train_df.head()

In [None]:
# (rows, columns) of the test frame
test_df.shape

In [None]:
# Preview the first rows of the test frame
test_df.head()

In [None]:
# (rows, columns) of the sample submission frame
submission_df.shape

In [None]:
# Preview the first rows of the sample submission frame
submission_df.head()

#### Splitting training data into training and validation data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; the fixed seed makes the split repeatable.
# Note that `train_df` is rebound to the remaining 80%.
train_df, val_df = train_test_split(
    train_df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Separate the pixel columns (features) from the `label` column (target)
# for both the training and the validation split.
train_x = train_df.drop('label', axis=1)
train_y = train_df.label
val_x = val_df.drop('label', axis=1)
val_y = val_df.label

# A bare expression is only rendered when it is the last line of a cell, so the
# target preview is sent through display() explicitly; the feature preview
# stays as the cell's final expression.
display(train_y.head())
train_x.head()

# Scaling the dataset

In [None]:
# Divide every feature frame by 255 in a single pass.
train_x, val_x, test_df = (frame / 255 for frame in (train_x, val_x, test_df))

# Simple Neural network With no hidden layers

In [None]:
# Single-layer classifier: 784 pixel inputs mapped directly onto 10 output units.
# Softmax (rather than sigmoid) makes the 10 outputs a single probability
# distribution over mutually exclusive classes, which is what the
# sparse categorical cross-entropy loss expects.
model = keras.Sequential([
    keras.layers.Dense(10, input_shape=(784,), activation='softmax')
])

### Compiling the model

Before the model is ready for training, it needs a few more settings. These are added during the model's compile step:

* Loss function —This measures how accurate the model is during training. You want to minimize this function to "steer" the model in the right direction.
* Optimizer —This is how the model is updated based on the data it sees and its loss function.
* Metrics —Used to monitor the training and testing steps. The following example uses accuracy, the fraction of the images that are correctly classified.

In [None]:
# Adam optimiser, cross-entropy on integer class labels, accuracy as the metric.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# Fit for 20 epochs. Passing the held-out split makes Keras report
# val_loss / val_accuracy after every epoch, so over-fitting is visible
# while training instead of going unmeasured.
model.fit(train_x, train_y, epochs=20, validation_data=(val_x, val_y))

In [None]:
# Let's check whether our model predicts the first digit correctly


In [None]:
# To visualise the image it has to be reshaped into a (28,28) pixel grid.
# Positional access (.iloc) is used on purpose: after the shuffled
# train/validation split the index label 0 is not guaranteed to be present in
# train_x, and the first row by position is the sample that row 0 of
# model.predict(train_x) refers to.
plt.matshow(np.array(train_x.iloc[0]).reshape(28,28))

In [None]:
# Run the model on every training row
prediction = model.predict(train_x)
# Output scores for the first training row
prediction[0]

In [None]:
# Turn each row of class scores into its highest-scoring class index with one
# vectorised call (the result is a NumPy array rather than a Python list).
prediction_label = np.argmax(prediction, axis=1)
# Predicted class for the first training row. It is the last expression of the
# cell so that the notebook actually displays it.
prediction_label[0]

In [None]:
# We see that our model has predicted 1 and it matches the input

In [None]:
import seaborn as sns

# Visualising the accuracy of the model: confusion matrix of the true training
# labels against the predicted labels, drawn as an annotated heatmap.
cm = tf.math.confusion_matrix(labels=train_y, predictions=prediction_label)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Truth')

# Using Hidden Layer

In [None]:
# Take the layers from tensorflow.keras, the same package that provides
# `keras.Sequential` here, instead of the standalone `keras` distribution;
# mixing the two can break when their versions differ.
from tensorflow.keras.layers import Dense, Dropout

# One hidden layer of 100 ReLU units, a Dropout(0.2) layer, and a 10-unit softmax output.
model = keras.Sequential([
    Dense(100, input_shape=(784,), activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])

The network consists of a sequence of two tf.keras.layers.Dense layers with a Dropout layer between them. Dense layers are densely connected, or fully connected, neural layers. The first Dense layer has 100 nodes (or neurons); the Dropout layer then randomly drops 20% of its outputs during training. The second (and last) Dense layer is a 10-node softmax layer that returns an array of 10 probability scores that sum to 1. Each node contains a score that indicates the probability that the current image belongs to one of the 10 classes.

In [None]:
# Configure training: Adam optimiser, cross-entropy on integer labels, accuracy metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)

In [None]:
# Train for 10 epochs and keep the returned History object in `r`.
# Supplying the held-out split adds val_loss / val_accuracy to every epoch
# (and to r.history), so training and validation curves can be compared.
r = model.fit(train_x, train_y, epochs=10, validation_data=(val_x, val_y))

In [None]:
# Score the trained model on the held-out validation split
model.evaluate(val_x,val_y)

In [None]:
import seaborn as sns

# Class scores for every training row from the hidden-layer model.
prediction = model.predict(train_x)
# Vectorised argmax along the class axis replaces the per-row Python loop
# (the result is a NumPy array rather than a list).
prediction_label = prediction.argmax(axis=1)

# Visualising the accuracy of the model: confusion matrix on the training data
cm = tf.math.confusion_matrix(labels=train_y, predictions=prediction_label)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

# Submission

In [None]:
# Disabled: submission export for the dense model. Every line is commented
# out, so this cell writes no submission.csv.
# NOTE(review): if re-enabled, the result of Out.rename(...) is discarded and
# therefore has no effect; the column names come from the `header` argument
# passed to to_csv.
# predict = model.predict(test_df)
# predict_label = predict.argmax(axis=1)
# ImageID = np.arange(len(predict))+1
# Out = pd.DataFrame([ImageID,predict_label]).T
# Out.rename(columns = {0:'ImageId', 1:'Label'})
# #Out
# Out.to_csv('submission.csv', header =  ['ImageId', 'Label' ], index = None)

# DOING IT THE CNN WAY

### I have created the CNN notebook as a separate unit, but I am still adding it here to maintain continuity. I have repeated some introductory steps; this is just to demonstrate how the CNN differs and how to build it.

In [None]:
# Loading the dataset for the CNN section.
# These paths are relative, so they resolve against the notebook's working directory.
train = pd.read_csv('../input/digit-recognizer/train.csv')
test = pd.read_csv('../input/digit-recognizer/test.csv')

In [None]:
# (rows, columns) of the CNN training frame
train.shape

In [None]:
# Column dtypes, non-null counts and memory usage summary
train.info()


In [None]:
# Preview the first rows of the CNN training frame
train.head()


In [None]:
# Defining the target and features for the dataset:
# every column except `label` is a feature; `label` itself is the target.
x_train = train.drop(columns='label')
y_train = train['label']

In [None]:
# Preview the feature columns after removing `label`
x_train.head()

# Modifying the dataset for CNN


Train and test images (28px x 28px) are stored in a pandas.DataFrame as 1D vectors of 784 values. We reshape all data to 28x28x1 3D matrices.<br>
Keras requires an extra dimension at the end, which corresponds to channels. MNIST images are grayscale, so they use only one channel. For RGB images there are 3 channels, so we would have reshaped vectors of 2352 values (784 x 3) to 28x28x3 3D matrices.

In [None]:
# Rescale the values from the 0-255 range to the 0-1 range.
x_train, test = (frame / 255.0 for frame in (x_train, test))


In [None]:
# Reshape each flat row into a 28x28 grid with one trailing channel axis.
# np.asarray accepts both a DataFrame and an already-converted ndarray, so
# re-running this cell no longer fails with an AttributeError on `.values`.
x_train = np.asarray(x_train).reshape(-1, 28, 28, 1)
test = np.asarray(test).reshape(-1, 28, 28, 1)

In [None]:
# Show the fifth training image. imshow documents (M, N), (M, N, 3) and
# (M, N, 4) inputs, so the single channel axis is dropped explicitly to pass
# a plain 28x28 grid.
plt.imshow(x_train[4][:, :, 0], cmap='Greys')

# Defining the CNN Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPool2D

In [None]:
# CNN stack, declared in one list: two 5x5 convolutions (32 filters each),
# 2x2 max-pooling, dropout, flatten, a 256-unit dense layer, dropout, and a
# 10-unit softmax output.
model = Sequential([
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu', input_shape=(28, 28, 1)),
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(10, activation="softmax"),
])

The first layer is the convolutional layer. I set 32 filters for the two layers with a kernel size of (5 x 5).

The CNN can isolate features that are useful everywhere from these transformed images (feature maps).

The next layer after these two is the pooling layer. Pooling layers are used in CNNs to consolidate the features learned in the convolutional layers' feature maps. They help reduce overfitting during training by compressing, or generalizing, the features in the feature map.

Combining convolutional and pooling layers, CNN are able to combine local features and learn more global features of the image.

Next is the dropout layer. Dropout is a regularization method that approximates training a large number of neural networks with different architectures in parallel.

During training, some number of layer outputs are randomly ignored or “dropped out.” This has the effect of making the layer look-like and be treated-like a layer with a different number of nodes and connectivity to the prior layer. In effect, each update to a layer during training is performed with a different “view” of the configured layer.

In a neural network, the activation function is responsible for transforming the summed weighted input from the node into the activation of the node or output for that input.

The rectified linear activation function or ReLU for short is a piecewise linear function that will output the input directly if it is positive, otherwise, it will output zero. It has become the default activation function for many types of neural networks because a model that uses it is easier to train and often achieves better performance.

Flattening is converting the data into a 1-dimensional array for inputting it to the next layer. We flatten the output of the convolutional layers to create a single long feature vector. And it is connected to the final classification model, which is called a fully-connected layer.

In [None]:
# Configure the CNN (Adam, cross-entropy on integer labels, accuracy metric),
# then train it for 10 epochs.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)
model.fit(x_train, y_train, epochs=10)

In [None]:
# Predict the class of the first test image: slicing with [:1] keeps the
# leading batch axis, and argmax picks the highest-scoring class index.
np.argmax(model.predict(test[:1]))

In [None]:
# Show the first test image. imshow documents (M, N), (M, N, 3) and (M, N, 4)
# inputs, so the single channel axis is dropped explicitly to pass a plain
# 28x28 grid.
plt.imshow(test[0][:, :, 0], cmap='Greys')

### We clearly see the CNN model performing better, as was expected in the case of images. A CNN is essentially a set of convolutional layers placed in front of the already existing ANN, which lets it create the feature vectors in a much better, more reliable and more efficient way.

# UPVOTE If you liked the work.