In [1]:
import numpy as np
import copy

In [59]:
# Directory holding the Kannada-digits .npy files; edit this single line to relocate the dataset.
DATA_DIR = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada"

# Paths are built from DATA_DIR so the directory is written only once.
X_train_file = DATA_DIR + "/X_train.npy"
y_train_file = DATA_DIR + "/y_train.npy"
X_test_file = DATA_DIR + "/X_test.npy"
y_test_file = DATA_DIR + "/y_test.npy"

# Load the raw train/test arrays from disk.
X_train, y_train = np.load(X_train_file), np.load(y_train_file)
X_test, y_test = np.load(X_test_file), np.load(y_test_file)

In [55]:
# keep only the first 5000 training examples together with their labels
X_train, y_train = X_train[:5000], y_train[:5000]

In [4]:
def one_hot_encode(y, num_classes=None):
    """
        One-hot encode integer class labels.

        Arguments:
            y: array of non-negative integer labels; any shape, it is flattened first
            num_classes: number of columns in the result. When None (default) it is
                inferred as max(y) + 1, which is too small if the largest class
                happens to be absent from y - pass it explicitly in that case.
        Returns:
            ohe: float array of shape (y.size, num_classes) with a single 1 per row
    """
    # Flatten so that column-vector labels of shape (m, 1) do not broadcast
    # against the row indices below.
    y = np.asarray(y).ravel()

    if num_classes is None:
        # int() keeps the +1 from overflowing narrow dtypes such as uint8;
        # an empty label array yields zero columns instead of raising.
        num_classes = int(y.max()) + 1 if y.size else 0

    ohe = np.zeros((y.size, num_classes))
    if y.size:
        ohe[np.arange(y.size), y] = 1
    return ohe

In [60]:
# network dimensions
m = X_train.shape[0]          # number of training examples
n = 28 * 28                   # values per example once flattened
hidden_layers = [15]          # size of each hidden layer
r = 10                        # size of the output layer
layers = hidden_layers + [r]  # hidden layer sizes followed by the output layer size

# flatten every example to n values and scale to 0-1
X_train = X_train.reshape((m, n)) / 255
y_train = one_hot_encode(y_train)

In [6]:
def g(z):
    """
        sigmoid(z) = 1 / (1 + exp(-z)), evaluated in a numerically stable way.

        Arguments:
            z: scalar or array
        Returns:
            sigmoid of z; a scalar for scalar input, otherwise an array shaped like z
    """
    z = np.asarray(z)
    # exp() is only ever applied to non-positive values, so it cannot overflow
    # (the naive form overflows in exp(-z) for large negative z).
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) - the same function.
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # [()] turns a 0-d result back into a scalar and leaves real arrays as they are.
    return out[()]

In [7]:
def netj(theta_j, x_j):
    """
        net input of a unit: dot product of the transposed theta_j with x_j
    """
    weights_t = theta_j.T
    return weights_t.dot(x_j)

In [47]:
def add_x0(x):
    """
        Bias hook: hands x back untouched, i.e. no constant x0 = 1 is prepended.
    """
    # the variant that prepends the bias input is switched off:
    #   np.hstack((np.ones(1), x))
    return x

In [34]:
def forward_propagation(x, theta):
    """
        Forward pass through the fully connected sigmoid network.

        Arguments:
            x: input vector for one example
            theta: list of weight matrices, one per layer; row j of theta[l]
                holds the weights of unit j in layer l
        Returns:
            o: list with the output vector of every layer, o[-1] being the
                network output
    """
    o = []
    layer_input = x
    # The layer structure is read from theta itself, so the function does not
    # rely on a notebook-level `layers` variable being in sync.
    for weights in theta:
        # One matrix-vector product gives the net input of every unit in the
        # layer at once (the network is fully connected).
        layer_output = g(np.dot(weights, add_x0(layer_input)))
        o.append(layer_output)
        layer_input = layer_output

    return o

In [9]:
def back_propagation(y, o, theta):
    """
        Compute the delta of every unit for one example.

        Arguments:
            y: target vector for the example (same length as o[-1])
            o: outputs of each layer, as returned by the forward pass
            theta: parameters; theta[l][k, j] is the weight from unit j of
                layer l-1 to unit k of layer l
        Returns:
            deltas: deltas[l][j] for each layer l and perceptron j
    """
    # Layer count and sizes come from `o`, so no notebook-level globals are needed.
    deltas = [None] * len(o)

    # output layer: (y - o) times the sigmoid derivative o * (1 - o)
    out = o[-1]
    deltas[-1] = (y - out) * out * (1 - out)

    # hidden layers, last to first: every unit collects the deltas of its
    # downstream neighbours weighted by the connecting weights, then is
    # scaled by its own sigmoid derivative.
    for l in reversed(range(len(o) - 1)):
        downstream = np.dot(theta[l + 1].T, deltas[l + 1])
        deltas[l] = downstream * o[l] * (1 - o[l])

    return deltas

In [10]:
def total_cost(theta, X, Y):
    """
        Half the mean, over the rows of X, of the squared error between the
        target Y[i] and the network output for X[i].
    """
    num_examples = X.shape[0]
    squared_error = sum(
        np.sum((Y[i] - forward_propagation(X[i], theta)[-1]) ** 2)
        for i in range(num_examples)
    )
    return squared_error / (2 * num_examples)

In [77]:
def gradient_descent(X_train, y_train, M, learning_rate, epsilon, max_epochs):
    """
        mini-batch SGD for the fully connected sigmoid network whose layer
        sizes are given by the notebook-level `layers` list.

        Arguments:
            X_train: 2-D array with one flattened example per row
            y_train: target vectors, one row per example
            M: mini-batch size
            learning_rate: step size of every parameter update
            epsilon: two consecutive batch costs count as "unchanged" when
                they differ by at most this much
            max_epochs: maximum number of passes over the data
        Returns:
            theta: list of weight matrices, one per layer
        Raises:
            ValueError: if M is not positive

        Training stops early, printing "converged", once the batch cost has
        been unchanged more than two times in a row. Each epoch uses
        floor(m / M) full batches of the freshly shuffled data; the leftover
        examples of that shuffle are skipped.
    """
    if M <= 0:
        raise ValueError("batch size M must be a positive integer")

    # Take the data dimensions from the arguments instead of notebook-level
    # globals, which can be stale when cells are re-run out of order.
    num_examples, num_features = X_train.shape[0], X_train.shape[1]
    num_batches = num_examples // M

    # He-style initialisation: standard normal scaled by sqrt(2 / fan_in); no bias weights.
    fan_in = [num_features] + list(layers[:-1])
    theta = [np.random.randn(layers[l], fan_in[l]) * np.sqrt(2 / fan_in[l]) for l in range(len(layers))]

    prev_cost = np.inf
    k_repeats = 0
    epoch = 0

    while epoch < max_epochs:
        epoch += 1
        print("epoch", epoch, total_cost(theta, X_train, y_train))

        # shuffle
        indices = np.arange(num_examples)
        np.random.shuffle(indices)
        X_train_e = X_train[indices]
        y_train_e = y_train[indices]

        for b in range(num_batches):
            sum_J_theta_derivatives = [np.zeros_like(weights) for weights in theta]
            batch_error = 0

            for i in range(b * M, (b + 1) * M):
                x, y = X_train_e[i], y_train_e[i]
                o = forward_propagation(x, theta)
                deltas = back_propagation(y, o, theta)

                # Squared error of this example under the current theta; reuses the
                # forward pass above instead of running a second one for the cost.
                batch_error += np.sum((y - o[-1]) ** 2)

                # accumulate J(theta) derivatives: dJ/dtheta[l][j] = -deltas[l][j] * (input of layer l)
                layer_input = x
                for l in range(len(layers)):
                    sum_J_theta_derivatives[l] -= np.outer(deltas[l], layer_input)
                    layer_input = o[l]

            # cost over the examples of the latest batch, before theta is updated
            cost = batch_error / (2 * M)
            if abs(prev_cost - cost) <= epsilon:
                k_repeats += 1
            else:
                k_repeats = 0

            if k_repeats > 2:
                print("converged")
                return theta
            prev_cost = cost

            # update theta with the batch-averaged gradient
            for l in range(len(layers)):
                theta[l] -= learning_rate * (sum_J_theta_derivatives[l] / M)

    return theta


In [78]:
# train with batch size 100, learning rate 0.5, convergence threshold 1e-5, at most 15 epochs
theta_opt = gradient_descent(
    X_train,
    y_train,
    M=100,
    learning_rate=0.5,
    epsilon=1e-5,
    max_epochs=15,
)

epoch 1 1.2480930098810668
epoch 2 0.16601747991950103
epoch 3 0.09399524419852992
epoch 4 0.07181912906889298
epoch 5 0.06096475562185019
epoch 6 0.05444664585790814
epoch 7 0.049872758603950025
epoch 8 0.046602157939669965
epoch 9 0.04415451451126207
epoch 10 0.04202747502025329
epoch 11 0.04038629142855408
epoch 12 0.03899186531686826
epoch 13 0.0378164943150461
epoch 14 0.03671045051623328
epoch 15 0.03585942189879667


In [79]:
# cost of the returned parameters over the whole training set
total_cost(theta=theta_opt, X=X_train, Y=y_train)

0.03496685118310934

In [80]:
# Reload the raw arrays from the paths set in the data-loading cell: X_train / y_train
# were rescaled and one-hot encoded in place for training, while the accuracy check
# below compares against the integer class labels.
X_train, y_train = np.load(X_train_file), np.load(y_train_file)
X_test, y_test = np.load(X_test_file), np.load(y_test_file)

m = X_train.shape[0]
X_train = X_train.reshape((m, n)) # reshape
X_train = X_train / 255 # scale to 0-1

m_test = X_test.shape[0]
X_test = X_test.reshape((m_test, n)) # reshape
X_test = X_test / 255 # scale to 0-1


def accuracy(X, y, theta):
    """
        Fraction of rows of X for which the index of the largest network
        output equals the label in y.
    """
    num_rows = X.shape[0]
    correct = 0
    for i in range(num_rows):
        o_pred = forward_propagation(X[i], theta)
        correct += int(np.argmax(o_pred[-1]) == y[i])
    return correct / num_rows


print("train", accuracy(X_train, y_train, theta_opt))

print("test", accuracy(X_test, y_test, theta_opt))

train 0.9646666666666667
test 0.8897
