In [91]:
import numpy as np
import copy

In [153]:
X_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_train.npy"
y_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_train.npy"
X_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_test.npy"
y_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_test.npy"

X_train, y_train = np.load(X_train_file), np.load(y_train_file)
X_test, y_test = np.load(X_test_file), np.load(y_test_file)

In [93]:
def one_hot_encode(y):
    ohe = np.zeros((y.size, y.max()+1))
    ohe[np.arange(y.size), y] = 1
    return ohe

In [159]:
M = 1 # batch size
m = X_train.shape[0]
n = 28 * 28
hidden_layers = [10]
r = 10

X_train = X_train.reshape((m, n)) # reshape
X_train = X_train / 255 # scale to 0-1
y_train = one_hot_encode(y_train)
X_train.shape

(60000, 784)

In [95]:
layers = hidden_layers.copy()
layers.append(r)

In [96]:
theta = [np.random.rand(layers[0], n+1)] + [np.random.rand(layers[l], layers[l-1]+1) for l in range(1, len(layers))]
theta[0].shape

(10, 785)

In [97]:
def g(z):
    """
        sigmoid(z)
    """
    return 1/(1 + np.exp(-z))

In [98]:
def netj(theta_j, x_j):
    return np.dot(theta_j.T, x_j)

In [99]:
def forward_propagation(x, theta):
    o = [np.zeros(l) for l in layers]

    for l in range(len(layers)):
        for j in range(layers[l]):
            if l == 0:
                o[l][j] = g(netj(theta[l][j], np.hstack((np.ones(1), x))))
            else:
                o[l][j] = g(netj(theta[l][j], np.hstack((np.ones(1), o[l-1]))))
    
    return o

In [100]:
def back_propagation(y, o, theta):
    """
        Arguments:
            y: class labels
            o: outputs of each layer
            theta: parameters
        Returns:
            deltas: deltas[l][j] for each layer l and perceptron j
    """
    deltas = [np.zeros(l) for l in layers]

    # output layer
    output_layer = -1
    delta = (y - o[output_layer]) * o[output_layer] * (1 - o[output_layer])
    deltas[output_layer] = delta

    # hidden layers
    for l in reversed(range(len(hidden_layers))):
        for j in range(hidden_layers[l]):
            deltas[l][j] = sum(deltas[l+1][dwn_nbr] * theta[l+1][dwn_nbr, j] * o[l][j] * (1 - o[l][j]) for dwn_nbr in range(layers[l+1]))
        
    return deltas

In [157]:
def gradient_descent(M, learning_rate, epsilon):
    """
        mini-batch SGD
    """
    epoch = 0
    t = 0
    prev_cost = -1
    k_repeats = 0
    # theta = [np.random.rand(layers[0], n+1)] + [np.random.rand(layers[l], layers[l-1]+1) for l in range(1, len(layers))]
    theta = [np.zeros((layers[0], n+1))] + [np.zeros((layers[l], layers[l-1]+1)) for l in range(1, len(layers))]

    while True:
        epoch += 1
        if epoch > 8:
            return theta

        for b in range(int(m/M)):
            if b > 1000:
                return -1
            t += 1
            print("b", b, epoch, t)
            sum_J_theta_derivatives = [np.zeros((layers[0], n+1))] + [np.zeros((layers[l], layers[l-1]+1)) for l in range(1, len(layers))]

            cost = 0
            for i in range(b * M, (b+1) * M):
                x, y = X_train[i], y_train[i]
                o = forward_propagation(x, theta)
                deltas = back_propagation(y, o, theta)

                cost += np.sum((y - o[-1]) ** 2)
                # print("x", x.shape, x)
                # print("o", o)
                # print("deltas", deltas)

                # calculate J(theta) derivatives
                for l in range(len(layers)):
                    if l == 0:
                        x_j = np.hstack((np.ones(1), x))
                    else:
                        x_j = np.hstack((np.ones(1), o[l-1]))
                    for j in range(layers[l]):
                        J_theta_derivative = - deltas[l][j] * x_j
                        sum_J_theta_derivatives[l][j] += J_theta_derivative

            prev_theta = copy.deepcopy(theta)

            cost = cost / (2 * M)
            print("cost", cost)
            if prev_cost == -1:
                prev_cost = cost
            else:
                print("cost_diff", abs(prev_cost - cost))
                if abs(prev_cost - cost) <= epsilon:
                    k_repeats += 1
                else:
                    k_repeats = 0

                if k_repeats > 2:
                    print("converged")
                    return theta
                prev_cost = cost


            # print("sumJ", sum_J_theta_derivatives)

            # update theta
            for l in range(len(layers)):
                for j in range(layers[l]):
                    theta[l][j] = theta[l][j] - learning_rate * (sum_J_theta_derivatives[l][j] / M)

            # break when stopping criteria meets
            # stop = True
            # theta_diff_all = 0
            # for l in range(len(layers)):
            #     # print("theta", theta[l], prev_theta[l])
            #     # print("theta_diff", abs(theta[l] - prev_theta[l]))
            #     theta_diff_all += np.sum(abs(theta[l] - prev_theta[l]))
            #     if not (abs(theta[l] - prev_theta[l]) <= epsilon).all():
            #         stop = False
            #         break
            # print("stop", stop, theta_diff_all)
            # if stop:
            #     return theta


In [160]:
theta_opt = gradient_descent(100, 0.001, 1e-8)

b 0 1 1
cost 1.25
b 1 1 2
cost 1.249650038284824
cost_diff 0.00034996171517609476
b 2 1 3
cost 1.249300229635455
cost_diff 0.0003498086493689545
b 3 1 4
cost 1.2489505739116198
cost_diff 0.00034965572383516275
b 4 1 5
cost 1.2486010709806925
cost_diff 0.00034950293092728124
b 5 1 6
cost 1.248251720689289
cost_diff 0.00034935029140359397
b 6 1 7
cost 1.2479025228917173
cost_diff 0.00034919779757158764
b 7 1 8
cost 1.2475534774461274
cost_diff 0.0003490454455898906
b 8 1 9
cost 1.2472045843786506
cost_diff 0.0003488930674768742
b 9 1 10
cost 1.2468558436104746
cost_diff 0.0003487407681759258
b 10 1 11
cost 1.2465072544592908
cost_diff 0.0003485891511838535
b 11 1 12
cost 1.2461588170709277
cost_diff 0.0003484373883630987
b 12 1 13
cost 1.2458105313425383
cost_diff 0.0003482857283894081
b 13 1 14
cost 1.2454623983999789
cost_diff 0.0003481329425594204
b 14 1 15
cost 1.2451144155762872
cost_diff 0.00034798282369163935
b 15 1 16
cost 1.2447665858093742
cost_diff 0.0003478297669130548
b 16 1

In [163]:
m_test = X_test.shape[0]
X_test = X_test.reshape((m_test, n)) # reshape
X_test = X_test / 255 # scale to 0-1
y_test = one_hot_encode(y_test)
y_test

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [161]:
theta_opt

[array([[7.62353649e-02, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 9.28398155e-07, 0.00000000e+00],
        [4.47471888e-02, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 5.19511938e-07, 0.00000000e+00],
        [4.30329033e-02, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 4.98600540e-07, 0.00000000e+00],
        ...,
        [4.29969167e-02, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 4.98161513e-07, 0.00000000e+00],
        [4.29969167e-02, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 4.98161513e-07, 0.00000000e+00],
        [4.29969167e-02, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 4.98161513e-07, 0.00000000e+00]]),
 array([[-0.26113835, -0.15287051, -0.14307098, -0.14276276, -0.14275832,
         -0.14275829, -0.14275829, -0.14275829, -0.14275829, -0.14275829,
         -0.14275829],
        [-0.25981684, -0.15453163, -0.1438712 , -0.14352943, -0.1435245 ,
         -0.14352446

In [165]:
o_pred = forward_propagation(X_test[384], theta_opt)
np.argmax(o_pred[-1])

0

In [155]:
np.bincount(y_train)

array([6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000],
      dtype=int64)