In [24]:
import numpy as np
import pandas as pd
np.random.seed(42)

In [25]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
train.shape

(42000, 785)

In [27]:
test.shape

(28000, 784)

In [28]:
X_train = np.array(train.iloc[:,1:]).T
y_train = train['label'].to_numpy().reshape(-1,1)
X_train.shape

(784, 42000)

In [29]:
y_train.shape

(42000, 1)

In [30]:
practice = y_train.copy()

def label_encoder(y):
    zeros = np.zeros((y.shape[0], np.max(y)+1))
    for i, num in enumerate(y):
        zeros[i][np.squeeze(num)] = 1
    return zeros

label_encoder(practice).shape

(42000, 10)

In [31]:
def initialize_parameters(n_x, n_h=10, n_y=10): # (number of inputs, number of units in hidden, number of outputs)
    # Need 2 sets of parameters for 2-layer network
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    parameters = {
        'W1': W1,
        'b1': b1,
        'W2': W2,
        'b2': b2
    }
    return parameters

In [32]:
def relu(Z):
    return np.maximum(0,Z)

In [33]:
def sigmoid(Z):
    sigmoid = 1 / (1 + np.exp(-Z))
    return sigmoid

In [53]:
def softmax(Z):
    softmax = np.exp(Z) / sum(np.exp(Z))
    return softmax

In [35]:
def forward_propagation(X, params):
    W1, b1, W2, b2 = params.values()
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = softmax(Z2)
    cache = {
        'Z1': Z1,
        'A1': A1,
        'Z2': Z2,
        'A2': A2
      }
    return A2, cache

In [36]:
params = initialize_parameters(X_train.shape[0])
A2, cache = forward_propagation(X_train, params)
A2.shape

(10, 1)

In [40]:
def compute_cost(A2, y):
    m = y.shape[0] 
    cost = -np.sum(label_encoder(y) * np.log(A2.T)) / m

    return cost

In [41]:
compute_cost(A2, y_train)

-8.186119788827666

### Back Propagation

In [73]:
def relu_deriv(Z):
    return Z > 0

In [74]:
def back_propagation(params, cache, X, y):
    W1, b1, W2, b2 = params.values()
    Z1, A1, Z2, A2 = cache.values()

    m = X.shape[1]

    y = label_encoder(y).T
    
    dZ2 = A2 - y
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = np.dot(W2.T, dZ2) * relu_deriv(Z1)
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    grad_params = {
        'dZ2': dZ2,
        'dW2': dW2,
        'db2': db2,
        'dZ1': dZ1,
        'dW1': dW1,
        'db1': db1
    }

    return grad_params

In [75]:
def update_parameters(params, gradients, learning_rate):
    
    params['W1'] -= learning_rate * gradients['dW1']
    params['b1'] -= learning_rate * gradients['db1']
    params['W2'] -= learning_rate * gradients['dW2']
    params['b2'] -= learning_rate * gradients['db2']

### Putting together the model

In [76]:
def model(X, y, learning_rate = 0.01, num_iterations = 3000):
    parameters = initialize_parameters(X.shape[0])
    costs = []
    for i in range(num_iterations):  
        A2, cache = forward_propagation(X, parameters)
        curr_cost = compute_cost(A2,y)

        gradients = back_propagation(parameters, cache, X, y)
        update_parameters(parameters, gradients, learning_rate)
        if i % 100 == 0:
            costs.append(curr_cost)
            print(f'Cost for iteration {i}: {curr_cost}')
    return parameters, costs

In [77]:
parameters, costs = model(X_train, y_train, num_iterations = 2500)

Cost for iteration 0: 2.4944104156178533
Cost for iteration 100: 2.302396730512482
Cost for iteration 200: 2.302166897443555
Cost for iteration 300: 2.3019771846543073
Cost for iteration 400: 2.3017891536421793
Cost for iteration 500: 2.3016792087709543
Cost for iteration 600: 2.301580162815391
Cost for iteration 700: 2.3014963242791704
Cost for iteration 800: 2.301423360939602
Cost for iteration 900: 2.3013342556558625
Cost for iteration 1000: 2.301528670024918
Cost for iteration 1100: 2.301464063694734
Cost for iteration 1200: 2.3014107729753035
Cost for iteration 1300: 2.3013668255607054
Cost for iteration 1400: 2.3013305903779147
Cost for iteration 1500: 2.3013007191886077
Cost for iteration 1600: 2.3012760980312432
Cost for iteration 1700: 2.301255806880615
Cost for iteration 1800: 2.301239086162089
Cost for iteration 1900: 2.301225308977603
Cost for iteration 2000: 2.3012139580860986
Cost for iteration 2100: 2.301204606837594
Cost for iteration 2200: 2.3011969033918955
Cost for i

In [79]:
def predict(parameters, X):
    A2, cache = forward_propagation(X, parameters)
    return A2

def accuracy(A2, Y):
    return np.sum(predictions == label_encode(Y).T) / label_encode(Y).T.size[1]

In [None]:
y_hat = predict(A2, X_tetx

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
