In [1]:
import numpy as np
import mnist

In [2]:
def one_hot(a, num_classes):
  return np.squeeze(np.eye(num_classes)[a.reshape(-1)])

In [3]:
train_images = mnist.train_images().reshape(-1,784)
train_labels = mnist.train_labels()
train_labels = one_hot(train_labels,10)

test_images = mnist.test_images().reshape(-1,784)
test_labels = mnist.test_labels()

In [4]:
def cross_entropy_loss(targets, predictions , epsilon=1e-12):
    predictions = np.clip(predictions, epsilon, 1. - epsilon)
    batch_size = predictions.shape[0]
    ce = -np.sum(targets*np.log(predictions+epsilon))/batch_size   
    return ce

def softmax_cross_entropy_grad(targets, predictions):
    batch_size = predictions.shape[0]
    grad = -(targets-predictions)/batch_size
    return grad

def softmax(predictions):
    predictions=predictions-np.max(predictions,axis=1,keepdims=True)  # dodging numerical overflows
    predictions=np.exp(predictions)
    return predictions/(np.sum(predictions,axis=1,keepdims=True))

def relu(x):
    return np.maximum(0,x)


In [5]:
train_images_batch =  [train_images[i:i + 256] for i in range(0, len(train_images), 256)] #  batch len = 256 
train_labels_batch = train_img =  [train_labels[i:i + 256] for i in range(0, len(train_labels), 256)]

In [48]:
input_shape = 784
layers_width=[256,256,10]
W = []
b = []
for i in range(len(layers_width)):
    if len(W)==0:
        W.append(0.01 * np.random.rand(input_shape,layers_width[i]))
    else:
        W.append(0.01 * np.random.rand(layers_width[i-1],layers_width[i]))
    b.append(np.zeros((1,layers_width[i])))


In [49]:
from time import sleep
from tqdm import tqdm
epochs=10
lr = 0.00001
for i in tqdm(range(epochs)):
    for batch_idx in range(len(train_images_batch)):
        img_batch=train_images_batch[i]
        label_batch=train_labels_batch[i]

        Z=[]  #pre-activation values
        A=[]  #post-activation values

        for i in range(len(W)):
            if i == 0:
                z=img_batch @ W[i] +b[i]
            else:
                z = A[i-1] @ W[i] + b[i]
            
            if i!=len(W)-1:
                a = relu(z)
            else:
                a = softmax(z)
            Z.append(z)
            A.append(a)
        # print(Z)
        dW=len(W)*[None]
        db = len(b) *[None]
        dL_dz = softmax_cross_entropy_grad(label_batch,A[-1])

        for i in range(len(W)-1,-1,-1):
            db[i] = np.sum(dL_dz, axis=0, keepdims=True)
            if i==0:
                dW[i] =  img_batch.T @ dL_dz
                break
            else:
                dW[i] =  A[i-1].T @ dL_dz

            dL_da = dL_dz @ W[i].T
            dL_da[Z[i-1]<=0]=0           #dL/dz = dL/da * da/dz    #da/dz = (0 if z<0) (1 if z>0)
            dL_dz=dL_da    
        
        for i in range(len(W)):
            W[i]+= -lr*dW[i] + 1e-3*W[i]
            b[i]+= -lr*db[i] + 1e-3*b[i]
    print(cross_entropy_loss(label_batch,  A[-1]))
        

 10%|█         | 1/10 [00:03<00:27,  3.01s/it]

22.36499092145624


 20%|██        | 2/10 [00:06<00:25,  3.22s/it]

23.21546660655528


 30%|███       | 3/10 [00:09<00:21,  3.11s/it]

24.096770043747696


 40%|████      | 4/10 [00:12<00:19,  3.30s/it]

24.833352534167933


 50%|█████     | 5/10 [00:17<00:18,  3.67s/it]

24.27503439403027


 60%|██████    | 6/10 [00:21<00:14,  3.74s/it]

24.096770043747696


 70%|███████   | 7/10 [00:25<00:11,  3.96s/it]

22.998645619007185


 80%|████████  | 8/10 [00:29<00:07,  3.94s/it]

23.378264643204812


 90%|█████████ | 9/10 [00:33<00:03,  3.83s/it]

21.676570516363473


100%|██████████| 10/10 [00:36<00:00,  3.66s/it]

13.944012679763881





In [43]:
a=test_images
for i in range(len(W)):
    z = a @ W[i] +b[i]
    if i!=len(W)-1:
        a = relu(z)
    else:
        a= softmax(z)
preds = np.argmax(a,axis=1)

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(test_labels,preds)

0.794

In [46]:
preds

array([7, 0, 1, ..., 4, 8, 6], dtype=int64)

In [47]:
test_labels[0]

7