In [323]:
import numpy as np
import copy
import time

In [351]:
# Locations of the training / test arrays on disk.
X_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_train.npy"
y_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_train.npy"
X_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_test.npy"
y_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_test.npy"

# Load the four arrays, one statement per array (same order as the files above).
X_train = np.load(X_train_file)
y_train = np.load(y_train_file)
X_test = np.load(X_test_file)
y_test = np.load(y_test_file)

In [342]:
def one_hot_encode(y, num_classes=None):
    """
        One-hot encode integer class labels.

        Arguments:
            y: array of non-negative integer labels. It is flattened first, so
               a column vector of shape (m, 1) is encoded the same way as a
               flat vector of shape (m,) instead of broadcasting inside the
               fancy index and setting the wrong cells.
            num_classes: number of columns of the encoding. When None it is
               inferred as y.max() + 1; pass it explicitly when a split may
               not contain the highest class.
        Returns:
            ohe: float array of shape (y.size, num_classes) with
                 ohe[i, y[i]] == 1 and 0 everywhere else
    """
    labels = np.asarray(y).ravel()
    if num_classes is None:
        num_classes = labels.max() + 1

    ohe = np.zeros((labels.size, num_classes))
    ohe[np.arange(labels.size), labels] = 1
    return ohe

In [360]:
# Problem dimensions and network architecture (read as globals by the cells below).
m = X_train.shape[0]              # number of training examples
n = 28 * 28                       # number of input features per example
hidden_layers = [1]               # units in each hidden layer
r = 10                            # units in the output layer
layers = hidden_layers + [r]      # hidden layers followed by the output layer

# Flatten every example to n features and scale to 0-1, then one-hot encode the labels.
X_train = X_train.reshape((m, n)) / 255
y_train = one_hot_encode(y_train)

In [5]:
def sigmoid(z):
    """
        Numerically stable logistic function 1 / (1 + exp(-z)), elementwise.

        The direct formula evaluates exp(-z), which overflows (and emits a
        RuntimeWarning) for large negative z. Here only exp(-|z|) is ever
        computed, which lies in [0, 1], and the algebraically equivalent form
        exp(z) / (1 + exp(z)) is used for negative inputs.

        Arguments:
            z: scalar or array of pre-activations
        Returns:
            sigmoid of z, with the same shape as z (a scalar for scalar input)
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)

    decay = np.exp(-np.abs(z))  # never overflows: the exponent is <= 0
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # np.where returns a 0-d array for scalar input; unwrap it to a scalar
    return out[()] if out.ndim == 0 else out

In [None]:
def ReLU(z):
    """
        Rectified linear unit: elementwise maximum of 0 and z.
    """
    floor = 0
    return np.maximum(floor, z)

def ReLU_derivative(z):
    """
        Elementwise derivative of ReLU: 1 where z > 0, otherwise 0.
    """
    is_positive = z > 0
    return is_positive * 1  # boolean mask -> integer 0/1

In [330]:
def net(theta, x):
    """
        Weighted input of a layer.

        Arguments:
            theta: weights of the layer
            x: inputs to the layer
        Returns:
            theta.dot(x), i.e. net_j = sum(theta_j . x_j) for each neuron j
    """
    weighted_input = theta.dot(x)
    return weighted_input

In [332]:
def forward_propagation(x, theta, only_output_layer, use_relu):
    """
        Forward pass of one example through the network.

        The number of layers is taken from theta itself, so the function does
        not depend on the global `layers` list. The last layer always uses
        sigmoid -- including when it is the only layer (no hidden layers) --
        which is what back_propagation assumes for the output delta.

        Arguments:
            x: input example
            theta: parameters, one weight matrix per layer
            only_output_layer: if True, return only output of last layer else return output for all layers
            use_relu: if True, hidden layers use ReLU, otherwise sigmoid
        Returns:
            o: outputs of each layer l and neuron j by forward propagation
               (or just the last layer's outputs when only_output_layer is True)
    """
    hidden_activation = ReLU if use_relu else sigmoid
    output_index = len(theta) - 1

    o = []
    layer_input = x  # first layer: input is x from the data
    for l, weights in enumerate(theta):
        # output layer: use sigmoid always; hidden layers: configurable
        activation = sigmoid if l == output_index else hidden_activation
        layer_input = activation(net(weights, layer_input))
        o.append(layer_input)  # becomes the input of the next layer

    if only_output_layer:
        return o[-1]
    return o

In [345]:
def back_propagation(y, o, theta, use_relu):
    """
        Backward pass for one example.

        Arguments:
            y: class labels
            o: outputs of each layer
            theta: parameters
            use_relu: if True, hidden layers are differentiated as ReLU,
                      otherwise as sigmoid
        Returns:
            deltas: deltas[l][j] for each layer l and neuron j by backpropagation
    """
    deltas = [np.zeros(width) for width in layers]

    # output layer: error times o * (1 - o)
    out = o[-1]
    deltas[-1] = (y - out) * out * (1 - out)

    # hidden layers, walked from the last one back to the first
    for l in range(len(hidden_layers) - 1, -1, -1):
        activation = o[l]
        # o * (1 - o) is the derivative of sigmoid(net_j) expressed through its output
        slope = ReLU_derivative(activation) if use_relu else activation * (1 - activation)

        # sum over downstream neurons of deltas[l+1][k] * theta[l+1][k, j]
        downstream = theta[l + 1].T @ deltas[l + 1]
        deltas[l] = downstream * slope

    return deltas

In [334]:
def get_cost(theta, X, Y, use_relu):
    """
        Squared-error cost of the network over a set of examples.

        Arguments:
            theta: parameters, one weight matrix per layer
            X: examples, one per row
            Y: target outputs, one row per example
            use_relu: forwarded to forward_propagation
        Returns:
            sum((output - Y) ** 2) / (2 * m), where m is the number of rows of X
    """
    m = X.shape[0]
    # forward_propagation is the forward pass defined in this notebook; the
    # previously referenced `forward_propagation2` is not defined anywhere here
    # and raises NameError on a fresh kernel.
    outputs = np.apply_along_axis(forward_propagation, 1, X, theta, True, use_relu)
    return np.sum((outputs - Y) ** 2) / (2 * m)

In [335]:
def init_theta(n, layers):
    """
        He initialization of the weights.

        Arguments:
            n: number of input features
            layers: number of units in each layer
        Returns:
            theta: list with one (units, fan_in) matrix per layer, drawn from a
                   standard normal and scaled by sqrt(2 / fan_in)
    """
    # first layer is fed by the n input features
    theta = [np.random.randn(layers[0], n) * np.sqrt(2 / n)]

    # every later layer is fed by the layer before it
    for fan_in, units in zip(layers, layers[1:]):
        scale = np.sqrt(2 / fan_in)
        theta.append(np.random.randn(units, fan_in) * scale)

    return theta

In [336]:
def gradient_descent(X_train, y_train, M, learning_rate, epsilon, max_epochs, adaptive_learning, use_relu):
    """
        Train the network with mini-batch SGD.

        The architecture is read from the global `layers`; the number of
        examples and of input features are taken from X_train itself, so the
        function also works on a subset of the data.

        Arguments:
            X_train: training examples, one per row
            y_train: target outputs, one row per example
            M: mini-batch size (a trailing partial batch is skipped)
            learning_rate: step size; with adaptive_learning it is the initial
                           step size eta_0
            epsilon: threshold on the change of the mini-batch cost
            max_epochs: maximum number of passes over the data
            adaptive_learning: if True, use eta_0 / sqrt(epoch) in each epoch
            use_relu: if True, hidden layers use ReLU, otherwise sigmoid
        Returns:
            theta: learned parameters, returned once max_epochs is exceeded or
                   the mini-batch cost changed by at most epsilon on
                   k_repeats_limit consecutive batches
    """
    num_examples, num_features = X_train.shape[0], X_train.shape[1]
    initial_learning_rate = learning_rate

    epoch = 0
    k_repeats = 0
    k_repeats_limit = 2
    theta = init_theta(num_features, layers)
    prev_cost = np.inf

    t0 = time.time()

    while True:
        epoch += 1
        if epoch > max_epochs:
            return theta

        if adaptive_learning:
            # decay from the caller's learning rate instead of a hard-coded 0.5
            learning_rate = initial_learning_rate / np.sqrt(epoch)

        print("epoch", epoch, get_cost(theta, X_train, y_train, use_relu), learning_rate, time.time() - t0)

        # shuffle at each epoch
        indices = np.arange(num_examples)
        np.random.shuffle(indices)
        X_train_e = X_train[indices]
        y_train_e = y_train[indices]

        for b in range(num_examples // M):
            X_batch = X_train_e[b * M: (b + 1) * M]
            y_batch = y_train_e[b * M: (b + 1) * M]

            # sum of the J(theta) derivatives over the batch, one matrix per layer
            sum_J_theta_derivatives = [np.zeros_like(weights) for weights in theta]

            for x, y in zip(X_batch, y_batch):
                o = forward_propagation(x, theta, only_output_layer=False, use_relu=use_relu)
                deltas = back_propagation(y, o, theta, use_relu)

                # dJ/dtheta[l][j] = -deltas[l][j] * (input of layer l); the outer
                # product fills in all neurons j of the layer at once
                layer_input = x
                for l, layer_deltas in enumerate(deltas):
                    sum_J_theta_derivatives[l] -= np.outer(layer_deltas, layer_input)
                    layer_input = o[l]

            # calculating cost over the examples seen in the latest batch before updating theta
            cost = get_cost(theta, X_batch, y_batch, use_relu)
            if abs(prev_cost - cost) <= epsilon:
                k_repeats += 1
            else:
                k_repeats = 0

            if k_repeats >= k_repeats_limit:
                print("converged")
                return theta
            prev_cost = cost

            # update theta with the batch-averaged gradient
            for l in range(len(theta)):
                theta[l] = theta[l] - learning_rate * (sum_J_theta_derivatives[l] / M)



In [356]:
# Train the network and report the wall-clock time of the whole run.
t0 = time.time()

theta_opt = gradient_descent(
    X_train,
    y_train,
    M=100,
    learning_rate=0.5,
    epsilon=1e-4,
    max_epochs=100,
    adaptive_learning=False,
    use_relu=True,
)

print("done", time.time() - t0)

epoch 1 1.251868796491821 0.2 0.7239954471588135
epoch 2 0.09626699136686201 0.2 7.4801716804504395
epoch 3 0.06708203804556807 0.2 14.044877052307129
epoch 4 0.05719477509986197 0.2 20.333900928497314
epoch 5 0.05154458581180813 0.2 26.603447437286377
epoch 6 0.04790138041116837 0.2 32.909809589385986
epoch 7 0.045337861077174586 0.2 39.25278639793396
epoch 8 0.04325231084283853 0.2 46.45357155799866
epoch 9 0.041800947637161696 0.2 53.55356407165527
epoch 10 0.04016288265537348 0.2 60.78118085861206
epoch 11 0.038850821389720194 0.2 70.68149065971375
epoch 12 0.03792558625924073 0.2 77.76149034500122
epoch 13 0.036788570797042786 0.2 84.7254958152771
epoch 14 0.03583285780447631 0.2 91.55201292037964
epoch 15 0.03519570124087757 0.2 98.39052367210388
epoch 16 0.034315934810453745 0.2 105.34255409240723
epoch 17 0.033888507299502095 0.2 112.38655519485474
epoch 18 0.033023811449058214 0.2 119.33883595466614
epoch 19 0.03254031252073965 0.2 126.2627854347229
epoch 20 0.0319562657390083

In [358]:
# Cost of the trained parameters over the full training set.
get_cost(theta=theta_opt, X=X_train, Y=y_train, use_relu=True)

0.019688736245311384

In [339]:
def predict(theta, X, use_relu):
    """
        Predict a class for every example.

        Arguments:
            theta: parameters, one weight matrix per layer
            X: examples, one per row
            use_relu: forwarded to forward_propagation
        Returns:
            predictions: for each row of X, the index of the output unit with
                         the largest activation
    """
    # forward_propagation is the forward pass defined in this notebook; the
    # previously referenced `forward_propagation2` is not defined anywhere here
    # and raises NameError on a fresh kernel.
    outputs = np.apply_along_axis(forward_propagation, 1, X, theta, True, use_relu)
    predictions = np.argmax(outputs, axis=1)
    return predictions

In [359]:
# Reload the raw arrays: the preprocessing cell replaced y_train with its
# one-hot encoding, while accuracy is computed against the integer labels.
X_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_train.npy"
y_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_train.npy"
X_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_test.npy"
y_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_test.npy"

X_train, y_train = np.load(X_train_file), np.load(y_train_file)
X_test, y_test = np.load(X_test_file), np.load(y_test_file)

# Same preprocessing as for training: flatten to n features and scale to 0-1.
m = X_train.shape[0]
X_train = X_train.reshape((m, n)) / 255

m_test = X_test.shape[0]
X_test = X_test.reshape((m_test, n)) / 255

# Labels are flattened before comparing so that a column-vector label array
# cannot broadcast against the 1-D predictions into an (m, m) comparison.
acc = np.sum(predict(theta_opt, X_train, use_relu=True) == y_train.ravel())

print("train", acc/m)

acc = np.sum(predict(theta_opt, X_test, use_relu=True) == y_test.ravel())

print("test", acc/m_test)

train 0.9801333333333333
test 0.9108
