In [None]:
# to do OHE labels
from keras.utils import to_categorical
# to show numbers
import matplotlib.pyplot as plt
# the only python lib we really need
import numpy as np

In [None]:
# load the MNIST train and test splits from their CSV files
import pandas as pd

mnist_train = pd.read_csv("../input/mnist-in-csv/mnist_train.csv")
mnist_test = pd.read_csv("../input/mnist-in-csv/mnist_test.csv")

In [None]:
# preview the first five rows of the training frame
mnist_train.head(5)

In [None]:
# split each frame into a feature matrix (X: every column except 'label')
# and a label vector (Y: the 'label' column), both as numpy arrays
Xtrain = mnist_train.drop(columns=['label']).values
Ytrain = mnist_train['label'].values
Xtest = mnist_test.drop(columns=['label']).values
Ytest = mnist_test['label'].values

In [None]:
# array shapes, one per line: Xtrain, Ytrain, Xtest, Ytest
print(Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape, sep='\n')

In [None]:
# view the first training row as a 28x28 image
number_example = np.reshape(Xtrain[0], (28, 28))

In [None]:
# render the reshaped example digit with a grayscale colormap
plt.imshow(number_example, cmap='gray')

In [None]:
# one hot encoded Y: one column per class
Ytrain_ohe = to_categorical(Ytrain)
Ytest_ohe = to_categorical(Ytest)

# compare each raw label vector with its one-hot counterpart
print(Ytrain.shape)
print(Ytrain_ohe.shape)
print(Ytest.shape)  # fixed: previously printed Xtest.shape, which is not a label array
print(Ytest_ohe.shape)

# first training label, as an integer and as a one-hot row
print(Ytrain[0])
print(Ytrain_ohe[0])

<h3>Network math</h3>
<br>
$$
\begin{aligned}
z^{(1)} & = xW^{(1)} + b^{(1)} \\
a^{(1)} & = \tanh(z^{(1)}) \\
z^{(2)} & = a^{(1)}W^{(2)} + b^{(2)} \\
a^{(2)} & = \hat{y} = \mathrm{softmax}(z^{(2)})
\end{aligned}
$$

In [None]:
# layer sizes: input features, hidden units, output classes
input_layer, hidden_layer, output_layer = 784, 100, 10

In [None]:
# initial weights and bias, drawn from a standard normal distribution.
# The fixed seed makes the draw reproducible under Restart & Run All, and
# re-running this cell resets the parameters to the same starting point.
np.random.seed(42)
W1 = np.random.randn(input_layer, hidden_layer)   # W1: input -> hidden weights
b1 = np.random.randn(1, hidden_layer)             # b1: hidden bias, a single row
W2 = np.random.randn(hidden_layer, output_layer)  # W2: hidden -> output weights
b2 = np.random.randn(1, output_layer)             # b2: output bias, a single row

In [None]:
# shape of the first weight matrix: (input_layer, hidden_layer)
np.shape(W1)

<h3>Activation functions</h3>
<br>
hyperbolic tangent - tanh<br>
$$
\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}
$$
<br>
Softmax<br>
$$
s(x)_i = \frac{e^{x_i}}{\sum_{j=1}^{C} e^{x_j}}
$$
<br>

In [None]:
# activation functions
def tanh(x):
    """Hyperbolic tangent activation, applied element-wise via ``np.tanh``."""
    activated = np.tanh(x)
    return activated

# for the last layer (output)
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row of ``x`` is turned into a probability distribution: all entries
    are non-negative and every row sums to 1.

    The row maximum is subtracted before exponentiating. Softmax is invariant
    to adding a constant to a row, so the result is mathematically unchanged,
    but the largest exponent becomes 0 and ``np.exp`` can no longer overflow
    to ``inf`` (which previously produced ``nan`` for large scores).
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

In [None]:
# first layer: affine pre-activation of the training inputs
Z1 = np.dot(Xtrain, W1) + b1
print(np.shape(Z1))

In [None]:
# hidden-layer output: tanh applied to the first-layer pre-activation
A1 = tanh(Z1)
print(np.shape(A1))

In [None]:
# second layer: affine pre-activation on top of the hidden output
Z2 = np.dot(A1, W2) + b2
print(np.shape(Z2))

In [None]:
# network output (yhat): softmax over the second-layer pre-activation
A2 = softmax(Z2)
print(np.shape(A2))

In [None]:
# predicted class = column index of the largest output in each row
predictions = A2.argmax(axis=1)
print(np.shape(predictions))

<h3>Loss function used in multiclass classification</h3>
<br>
Cross entropy<br>
$$
L(y,\hat{y}) = -\sum_{j=1}^{C} y_j\log\hat{y}_j
$$
<br>
Cross entropy loss<br>
$$
J(W, b) = \frac{1}{N}\sum_{i=1}^{N} L(y_i,\hat{y}_i) = - \frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{i,j} \log\hat{y}_{i,j}
$$

In [None]:
# forward propagation
def forward(X):
    """Run ``X`` through both layers and return the softmax output.

    Uses the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``:
    a tanh hidden layer followed by a softmax output layer.
    """
    hidden = tanh(X.dot(W1) + b1)
    scores = hidden.dot(W2) + b2
    return softmax(scores)

In [None]:
def loss(y, X):
    """Mean cross-entropy of the network output on ``X`` against labels ``y``.

    ``y`` holds integer class indices; they are used to pick, for every row
    of ``forward(X)``, the probability assigned to the true class.
    """
    yhat = forward(X)
    N = len(y)
    # probability the network gives to each sample's true class
    true_class_probs = yhat[np.arange(N), y]
    logs = np.log(true_class_probs).sum()
    return -1.0 / N * logs

In [None]:
def loss_alt(y, X):
    """Mean negative log-probability of the true class, computed with ``np.mean``.

    ``y`` holds integer class indices used to select one entry per row of
    ``forward(X)``.
    """
    yhat = forward(X)
    rows = range(len(yhat))
    picked = yhat[rows, y]
    return -np.mean(np.log(picked))

In [None]:
# compare the two loss implementations on the training set
print(loss(Ytrain, Xtrain), loss_alt(Ytrain, Xtrain), sep='\n')

<h3>Backpropagation and fitting</h3><br>
tanh derivative<br><br>
$$
\frac{\mathrm d}{\mathrm d x} \tanh x = (1 - \tanh^2x)
$$
<br>
Loss function gradient for weights and bias computations:<br><br>
$$
\begin{aligned}
& \delta^{(2)} = \frac{\hat{y} - y}{ N } \\
\end{aligned}
$$
<BR>
$$
\begin{aligned}
& \delta^{(1)} = \delta^{(2)}W^{(2)T} \circ \frac{\mathrm d}{\mathrm d x} \tanh z^{(1)}  \\
\end{aligned}
$$
<BR>
$$
\begin{aligned}
& \frac{\partial{J}}{\partial{W^{(2)}}} = a^{(1)T} \delta^{(2)} \hspace{10mm} \frac{\partial{J}}{\partial{b^{(2)}}} = \sum_{i=1}^{N} \delta^{(2)}_{i}\\ 
\end{aligned}
$$
<BR>
$$
\begin{aligned}  
& \frac{\partial{J}}{\partial{W^{(1)}}} = x^T \delta^{(1)} \hspace{10mm} \frac{\partial{J}}{\partial{b^{(1)}}} = \sum_{i=1}^{N} \delta^{(1)}_{i} \\
\end{aligned}
$$

In [None]:
# tanh derivative
def tanh_dev(x):
    """Derivative of tanh evaluated at ``x``: ``1 - tanh(x)**2``, element-wise."""
    squared = np.tanh(x) ** 2
    return 1.0 - squared

In [None]:
# step size used when subtracting the gradients from the parameters
learning_rate = 1e-1

In [None]:
# backpropagation: one full-batch gradient-descent step
# Forward pass with the *current* parameters, so the gradients below always
# match W1/b1/W2/b2 even when this cell is executed more than once
# (the parameters are updated in place at the end of the cell).
Z1 = Xtrain.dot(W1) + b1
A1 = tanh(Z1)
Z2 = A1.dot(W2) + b2
A2 = softmax(Z2)

# delta 2: output error, divided by the number of training samples
delta2 = (A2 - Ytrain_ohe) / len(Ytrain)

# to compute delta1 we need the tanh derivative at Z1
dZ1 = tanh_dev(Z1)

# delta 1: output error propagated back through W2 and the tanh
delta1 = delta2.dot(W2.T) * dZ1

# partial derivatives for weights
dev_W2 = A1.T.dot(delta2)
dev_W1 = Xtrain.T.dot(delta1)

# partial derivatives for bias: summed over samples, kept as single rows
dev_b2 = np.sum(delta2, axis=0, keepdims=True)
dev_b1 = np.sum(delta1, axis=0, keepdims=True)

# update weights and bias
W1 -= learning_rate * dev_W1
b1 -= learning_rate * dev_b1
W2 -= learning_rate * dev_W2
b2 -= learning_rate * dev_b2

In [None]:
# training-set loss after the parameter update
print(loss(Ytrain, Xtrain))