In [60]:
import numpy as np
import copy

In [346]:
X_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_train.npy"
y_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_train.npy"
X_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_test.npy"
y_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_test.npy"

X_train, y_train = np.load(X_train_file), np.load(y_train_file)
X_test, y_test = np.load(X_test_file), np.load(y_test_file)

In [347]:
X_train = X_train[:15000]
y_train = y_train[:15000]

In [288]:
def one_hot_encode(y):
    ohe = np.zeros((y.size, y.max()+1))
    ohe[np.arange(y.size), y] = 1
    return ohe

In [348]:
M = 1 # batch size
m = X_train.shape[0]
n = 28 * 28
hidden_layers = [10]
r = 10

X_train = X_train.reshape((m, n)) # reshape
# X_train = X_train / 255 # scale to 0-1
y_train = one_hot_encode(y_train)

In [297]:
# shuffle
indices = np.arange(m)
np.random.shuffle(indices)
X_train = X_train[indices]
y_train = y_train[indices]

In [298]:
layers = hidden_layers.copy()
layers.append(r)

In [114]:
def g(z):
    """
        sigmoid(z)
    """
    return 1/(1 + np.exp(-z))

In [115]:
def netj(theta_j, x_j):
    return np.dot(theta_j.T, x_j)

In [285]:
def forward_propagation(x, theta):
    o = [np.zeros(l) for l in layers]

    for l in range(len(layers)):
        for j in range(layers[l]):
            if l == 0:
                o[l][j] = g(netj(theta[l][j], np.hstack((np.ones(1), x))))
            else:
                o[l][j] = g(netj(theta[l][j], np.hstack((np.ones(1), o[l-1]))))
    
    return o

In [70]:
def back_propagation(y, o, theta):
    """
        Arguments:
            y: class labels
            o: outputs of each layer
            theta: parameters
        Returns:
            deltas: deltas[l][j] for each layer l and perceptron j
    """
    deltas = [np.zeros(l) for l in layers]

    # output layer
    output_layer = -1
    delta = (y - o[output_layer]) * o[output_layer] * (1 - o[output_layer])
    deltas[output_layer] = delta

    # hidden layers
    for l in reversed(range(len(hidden_layers))):
        for j in range(hidden_layers[l]):
            deltas[l][j] = sum(deltas[l+1][dwn_nbr] * theta[l+1][dwn_nbr, j] * o[l][j] * (1 - o[l][j]) for dwn_nbr in range(layers[l+1]))
        
    return deltas

In [291]:
def total_cost(theta, X, Y):
    m = X.shape[0]
    error = 0
    for i in range(m):
        x, y = X[i], Y[i]
        o = forward_propagation(x, theta)
        error += np.sum((y - o[-1]) ** 2)
    return error / (2 * m)

In [292]:
total_cost(theta, X_train, y_train)

4.4177357821552805

In [350]:
def gradient_descent(X_train, y_train, M, learning_rate, epsilon, max_epochs):
    """
        mini-batch SGD
    """
    epoch = 0
    t = 0
    k_repeats = 0
    theta = [np.random.rand(layers[0], n+1)] + [np.random.rand(layers[l], layers[l-1]+1) for l in range(1, len(layers))]
    # prev_theta = copy.deepcopy(theta)
    # prev_cost = -1
    prev_cost = total_cost(theta, X_train, y_train)

    # theta = [np.zeros((layers[0], n+1))] + [np.zeros((layers[l], layers[l-1]+1)) for l in range(1, len(layers))]
    # print("theta", theta)

    while True:
        epoch += 1
        if epoch > max_epochs:
            return theta

        print("epoch", epoch, total_cost(theta, X_train, y_train))

        # shuffle
        indices = np.arange(m)
        np.random.shuffle(indices)
        X_train_e = X_train[indices]
        y_train_e = y_train[indices]

        for b in range(int(m/M)):
            t += 1
            # print("b", b, epoch, t)
            sum_J_theta_derivatives = [np.zeros((layers[0], n+1))] + [np.zeros((layers[l], layers[l-1]+1)) for l in range(1, len(layers))]

            for i in range(b * M, (b+1) * M):
                x, y = X_train_e[i], y_train_e[i]
                o = forward_propagation(x, theta)
                deltas = back_propagation(y, o, theta)

                # print("c", y, o[-1], y - o[-1])
                # cost += np.sum((y - o[-1]) ** 2)
                # print("x", x.shape, x)
                # print("o", o)
                # print("deltas", deltas)

                # calculate J(theta) derivatives
                for l in range(len(layers)):
                    if l == 0:
                        x_j = np.hstack((np.ones(1), x))
                    else:
                        x_j = np.hstack((np.ones(1), o[l-1]))
                    for j in range(layers[l]):
                        J_theta_derivative = - deltas[l][j] * x_j
                        sum_J_theta_derivatives[l][j] += J_theta_derivative

            # prev_theta = copy.deepcopy(theta)
            # print("sumJ", sum_J_theta_derivatives)
            # print("err", total_cost(theta, X_train, y_train))

            # update theta
            for l in range(len(layers)):
                for j in range(layers[l]):
                    theta[l][j] = theta[l][j] - learning_rate * (sum_J_theta_derivatives[l][j] / M)

            cost = total_cost(theta, X_train, y_train)
            # print("cost", prev_cost, cost, abs(prev_cost - cost))
            if abs(prev_cost - cost) <= epsilon:
                k_repeats += 1
            else:
                k_repeats = 0

            if k_repeats > 1:
                print("converged")
                return theta
            prev_cost = cost

            # print("w", theta)

            # break when stopping criteria meets
            # converged = True
            # theta_diff_all = 0
            # for l in range(len(layers)):
            #     # print("theta", theta[l], prev_theta[l])
            #     # print("theta_diff", abs(theta[l] - prev_theta[l]))
            #     theta_diff_all += np.sum(abs(theta[l] - prev_theta[l]))
            #     if not (abs(theta[l] - prev_theta[l]) <= epsilon).all():
            #         converged = False
            #         break
            # # print("stop", coverged, theta_diff_all)
            # if converged:
            #     k_repeats += 1
            # else:
            #     k_repeats = 0

            # if k_repeats > 2:
            #     print("converged!!")
            #     return theta


In [351]:
theta_opt = gradient_descent(X_train, y_train, 100, 0.5, 1e-6, 40)

epoch 1 4.433368953545143
epoch 2 0.84473238719306
epoch 3 0.45008193093226584
epoch 4 0.4500479464755232
converged


In [352]:
total_cost(theta_opt, X_train, y_train)

0.4500666523720425

In [248]:
m_test = X_test.shape[0]
X_test = X_test.reshape((m_test, n)) # reshape
X_test = X_test / 255 # scale to 0-1
y_test = one_hot_encode(y_test)
y_test

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [344]:
theta_opt

[array([[0.86991288, 0.32321739, 0.83377991, ..., 0.82080913, 0.07864687,
         0.35536597],
        [0.7883652 , 0.19921964, 0.96623474, ..., 0.38366965, 0.5132148 ,
         0.38671487],
        [0.43544278, 0.20834028, 0.73324651, ..., 0.11509007, 0.03154932,
         0.67572304],
        ...,
        [0.08513006, 0.41104814, 0.51339887, ..., 0.89236081, 0.8689392 ,
         0.7074392 ],
        [0.99745042, 0.05653553, 0.45763151, ..., 0.42595098, 0.15559207,
         0.27000851],
        [0.82311161, 0.05872331, 0.72825314, ..., 0.70812267, 0.65214924,
         0.53505052]]),
 array([[ 0.17288016, -0.01834767,  0.05160397, -0.27903105, -0.63207329,
         -0.46659137,  0.16931675, -0.172058  , -0.37038378,  0.0104089 ,
         -0.58379329],
        [-0.19192866,  0.11521075,  0.0814054 ,  0.12284444, -0.56885953,
         -0.30784747, -0.10268008, -0.33978089, -0.49283215, -0.64688701,
          0.13491257],
        [ 0.14332961, -0.20412825, -0.00567166, -0.64390761, -0.349

In [353]:
tt = 4357

o_pred = forward_propagation(X_train[tt], theta_opt)

print(o_pred[-1], y_train[tt])
# o_pred[-1]

dd = y_train[tt] - o_pred[-1]

# print(y_train[230], o_pred[-1])

print(dd, dd ** 2, np.sum(dd ** 2))

[0.10523046 0.09570368 0.10130242 0.10370809 0.10464016 0.09657548
 0.10038259 0.09881762 0.09730301 0.09452589] [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[-0.10523046 -0.09570368 -0.10130242 -0.10370809 -0.10464016 -0.09657548
 -0.10038259  0.90118238 -0.09730301 -0.09452589] [0.01107345 0.00915919 0.01026218 0.01075537 0.01094956 0.00932682
 0.01007666 0.81212968 0.00946787 0.00893514] 0.9021359403644441
