In [59]:
import wandb
import numpy as np

# Hyper-parameters for the run. The whole dict is also handed to wandb as the
# run configuration, so every key here ends up in the experiment record.
config = dict(
    lr=0.15,                  # initial learning rate
    lr_reduce=0.99992,        # factor the learning rate is multiplied by after each epoch
    epochs=500,
    normalization="max_div",  # selects the divide-by-255 input scaling
    batch_size=2,
    # One entry per layer, in forward order; both layers share the same
    # weight/bias initialisation scheme and differ only in width and activation.
    layers=[
        {
            "nodes": width,
            "activation": activation_name,
            "W_init_type": "random_small",
            "b_init_type": "zeros",
        }
        for width, activation_name in ((60, "tanh"), (10, "sigmoid"))
    ],
)

# Preview of the learning-rate decay: apply the per-epoch multiplier for the
# configured number of epochs and print the value at every 100th epoch.
lr = config['lr']
decay_factor = config['lr_reduce']
for epoch_idx in range(config['epochs']):
    lr = lr * decay_factor
    if epoch_idx % 100 != 0:
        continue
    print(lr)

0.149988
0.14879283522632714
0.14760719400678016
0.14643100045384516
0.14526417928471075


In [60]:
from common import load_MNIST_data, sigmoid, relu, norm_max_pix_div, backward_relu, initialize_weights, Y_to_y, backward_sigmoid, backward_tanh, tanh

In [61]:
# Load the MNIST splits and, when the config asks for "max_div" normalisation,
# divide the inputs by 255 (presumably the maximum 8-bit pixel value, which
# would map inputs into [0, 1] -- confirm against load_MNIST_data).
X_train, y_train, X_test, y_test = load_MNIST_data()
use_max_div = config['normalization'] == 'max_div'
if use_max_div:
    X_train, X_test = X_train / 255, X_test / 255
# Display the shapes as a sanity check.
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((784, 60000), (1, 60000), (784, 10000), (1, 10000))

In [62]:
# Build the initial parameters: one {'W', 'b'} entry per configured layer,
# sized from the number of input rows of X_train.
n_input_features = X_train.shape[0]
weights_and_bais = initialize_weights(n_input_features, config['layers'], mul=0.0001)
# Display each layer's (W, b) shapes as a sanity check.
[[params['W'].shape, params['b'].shape] for params in weights_and_bais]

(60, 784) random_small
(60, 1) zeros
(10, 60) random_small
(10, 1) zeros


[[(60, 784), (60, 1)], [(10, 60), (10, 1)]]

In [63]:
# Convert the integer labels into the target matrix used by the loss
# (the displayed first column is a one-hot vector for the first label).
Y = Y_to_y(y_train)
# Show the first sample's target next to its raw label.
(Y[:, 0], y_train[:, 0])

(array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), array([5], dtype=uint8))

In [64]:
def accuracy(a0, weights, y):
    """Return the classification accuracy, in percent, of the network on ``a0``.

    Runs a forward pass through every layer listed in ``config['layers']``
    using the parameters passed in ``weights`` and compares the index of the
    largest output per sample with ``y``.

    Parameters
    ----------
    a0 : ndarray
        Network input with one sample per column (``a0.shape[1]`` samples).
    weights : sequence of dict
        One ``{'W': ..., 'b': ...}`` entry per layer, in forward order.
    y : ndarray
        Integer labels, compared element-wise against the predictions, which
        are shaped ``(1, a0.shape[1])``.

    Returns
    -------
    float
        Percentage of samples whose predicted class equals the label.

    Raises
    ------
    ValueError
        If a layer names an activation other than 'relu', 'sigmoid' or 'tanh'.
    """
    activation_fns = {'relu': relu, 'sigmoid': sigmoid, 'tanh': tanh}
    output = a0
    for layer_n, layer in enumerate(config['layers']):
        activation = layer['activation']
        if activation not in activation_fns:
            # Fail loudly: continuing with a stale `output` would silently
            # report a meaningless accuracy.
            raise ValueError(f'unknown activation {activation!r} in layer {layer_n}')
        # Use the parameters that were passed in rather than a notebook global,
        # so the function works on any set of weights and on a fresh kernel.
        W = weights[layer_n]['W']
        b = weights[layer_n]['b']
        output = activation_fns[activation](W.dot(output) + b)

    # Predicted class = row index of the largest output for each sample (column).
    predictions = np.argmax(output, axis=0).reshape(1, a0.shape[1])
    return np.mean(predictions == y) * 100

In [65]:
import math
# Cut the training set into equally sized mini-batches. Samples that do not
# fill a complete batch are dropped. The result is indexed as
# (batch, feature, sample-in-batch), keeping one sample per column.
bs = config['batch_size']
n_batches = X_train.shape[1] // bs
n_used_samples = n_batches * bs
X_train_batches = np.array(np.array_split(X_train[:, :n_used_samples], n_batches, axis=1))
y_train_batches = np.array(np.array_split(Y[:, :n_used_samples], n_batches, axis=1))
# Display the resulting shapes as a sanity check.
(X_train_batches.shape, y_train_batches.shape)

((30000, 784, 2), (30000, 10, 2))

In [None]:
import numpy as np
from copy import deepcopy
# Train on a private copy so the initial parameters in `weights_and_bais`
# are left untouched and this cell can be re-run from the same starting point.
weights_and_bais_local = deepcopy(weights_and_bais)
wandb.init(project="mnist", entity="kaizen", config = config)
# model
train_accuracy = None
test_accuracy = None

# Activation name -> forward function, and -> the function applied to the
# cached pre-activation Z whose result scales dA into dZ in the backward pass.
forward_activations = {'relu': relu, 'sigmoid': sigmoid, 'tanh': tanh}
backward_activations = {
    'relu': backward_relu,
    'sigmoid': backward_sigmoid,
    'tanh': backward_tanh,
}

layers = config['layers']  # loop-invariant, so looked up once
# Validate the architecture before training starts instead of printing a
# message mid-loop and failing later on a None multiplication.
unknown_activations = [
    layer['activation'] for layer in layers
    if layer['activation'] not in forward_activations
]
if unknown_activations:
    raise ValueError(f'unknown activation(s) in config: {unknown_activations}')

last_epoch = config['epochs'] - 1
lr = config['lr']
for epoch in range(config['epochs']):
    cost = 0
    for batch_n in range(len(X_train_batches)):
        batch = X_train_batches[batch_n]

        # ---- forward pass: keep per-layer values needed for backprop ----
        output = batch
        caches = []
        for layer_n in range(len(layers)):
            W = weights_and_bais_local[layer_n]['W']
            b = weights_and_bais_local[layer_n]['b']
            Z = W.dot(output) + b
            output = forward_activations[layers[layer_n]['activation']](Z)
            caches.append({'W': W, 'b': b, 'Z': Z, 'A': output})

        # Squared-error loss averaged over the samples in the batch.
        delta = output - y_train_batches[batch_n]
        loss = np.sum(delta ** 2)/batch.shape[1]
        cost += loss

        # ---- backward pass with an immediate gradient-descent update ----
        last_dA = 2*delta  # derivative of delta**2 w.r.t. the network output
        for layer_n in reversed(range(len(layers))):
            cache = caches[layer_n]
            dZ = last_dA * backward_activations[layers[layer_n]['activation']](cache['Z'])
            # Input to this layer: previous layer's activation, or the batch itself.
            previous_activation = caches[layer_n - 1]['A'] if layer_n > 0 else batch
            m = previous_activation.shape[1]
            dW = dZ.dot(previous_activation.T)/m
            db = np.sum(dZ, axis=1, keepdims=True)/m
            weights_and_bais_local[layer_n]['W'] = weights_and_bais_local[layer_n]['W'] - dW*lr
            weights_and_bais_local[layer_n]['b'] = weights_and_bais_local[layer_n]['b'] - db*lr
            # cache['W'] still refers to the pre-update weights (the update above
            # rebinds a new array), so the gradient is propagated through the
            # same weights that were used in the forward pass.
            last_dA = cache['W'].T.dot(dZ)

    # Mean per-batch loss for this epoch.
    cost /= X_train_batches.shape[0]
    metrics = {
        'loss': cost,
        'learning_rate': lr,
        'epoch': epoch
    }
    # Evaluate every 10th epoch and also on the final epoch, so that
    # train_accuracy / test_accuracy describe the finished model.
    if epoch%10 == 0 or epoch == last_epoch:
        train_accuracy = accuracy(X_train, weights_and_bais_local, y_train)
        test_accuracy = accuracy(X_test, weights_and_bais_local, y_test)
        metrics['train_accuracy'] = train_accuracy
        metrics['test_accuracy'] = test_accuracy
    # A single log call per epoch keeps all of the epoch's metrics on the same
    # wandb step (each wandb.log call advances the step by default).
    wandb.log(metrics)
    if epoch%10 == 0:
        print(cost)
    lr *= config['lr_reduce']
        
        
from datetime import timedelta
from wandb import AlertLevel

# Send a wandb alert announcing that the run is done, carrying the most
# recently computed train/test accuracies in the message body.
completion_text = f'Accuracy train:{train_accuracy} test:{test_accuracy}'
no_wait = timedelta(minutes=0)
wandb.alert(
    title='Experiment Completed',
    text=completion_text,
    level=AlertLevel.WARN,
    wait_duration=no_wait,
)


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁
loss,█▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁███████████████████████████████████████
train_accuracy,▁███████████████████████████████████████

0,1
epoch,499.0
learning_rate,0.14413
loss,0.01252
test_accuracy,96.46
train_accuracy,99.035


0.1574693372746708
0.04927198501382612
0.03641803051473059
0.029514955326080276
0.024695874471798766
0.021088985264942576
0.01959526377098124
0.0165391111735661
0.014324372604367168
0.014738092772414783
0.01261515565762205
0.012927127385706507
0.013428411821795012
0.013837371292257058
0.011212702852733169
0.010667810964431767
