In [1]:
import numpy as np
import copy

In [10]:
X_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_train.npy"
y_train_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_train.npy"
X_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/X_test.npy"
y_test_file = "C:/IITD/sem5/col774-ml/datasets/kannada_digits/neural_network_kannada/y_test.npy"

X_train, y_train = np.load(X_train_file), np.load(y_train_file)
X_test, y_test = np.load(X_test_file), np.load(y_test_file)

In [3]:
def one_hot_encode(y):
    ohe = np.zeros((y.size, y.max()+1))
    ohe[np.arange(y.size), y] = 1
    return ohe

In [11]:
M = 1 # batch size
m = X_train.shape[0]
n = 28 * 28
hidden_layers = [10]
r = 10

X_train = X_train.reshape((m, n)) # reshape
X_train = X_train / 255 # scale to 0-1
y_train = one_hot_encode(y_train)

# shuffle
indices = np.arange(m)
np.random.shuffle(indices)
X_train = X_train[indices]
y_train = y_train[indices]

In [12]:
layers = hidden_layers.copy()
layers.append(r)

In [13]:
theta = [np.random.rand(layers[0], n+1)] + [np.random.rand(layers[l], layers[l-1]+1) for l in range(1, len(layers))]
theta[0].shape

(10, 785)

In [14]:
def g(z):
    """
        sigmoid(z)
    """
    return 1/(1 + np.exp(-z))

In [15]:
def netj(theta_j, x_j):
    return np.dot(theta_j.T, x_j)

In [16]:
def forward_propagation(x, theta):
    o = [np.zeros(l) for l in layers]

    for l in range(len(layers)):
        for j in range(layers[l]):
            if l == 0:
                o[l][j] = g(netj(theta[l][j], np.hstack((np.ones(1), x))))
            else:
                o[l][j] = g(netj(theta[l][j], np.hstack((np.ones(1), o[l-1]))))
    
    return o

In [17]:
def back_propagation(y, o, theta):
    """
        Arguments:
            y: class labels
            o: outputs of each layer
            theta: parameters
        Returns:
            deltas: deltas[l][j] for each layer l and perceptron j
    """
    deltas = [np.zeros(l) for l in layers]

    # output layer
    output_layer = -1
    delta = (y - o[output_layer]) * o[output_layer] * (1 - o[output_layer])
    deltas[output_layer] = delta

    # hidden layers
    for l in reversed(range(len(hidden_layers))):
        for j in range(hidden_layers[l]):
            deltas[l][j] = sum(deltas[l+1][dwn_nbr] * theta[l+1][dwn_nbr, j] * o[l][j] * (1 - o[l][j]) for dwn_nbr in range(layers[l+1]))
        
    return deltas

In [79]:
def gradient_descent(M, learning_rate, epsilon):
    """
        mini-batch SGD
    """
    epoch = 0
    t = 0
    prev_cost = -1
    k_repeats = 0
    # theta = [np.random.rand(layers[0], n+1)] + [np.random.rand(layers[l], layers[l-1]+1) for l in range(1, len(layers))]
    theta = [np.zeros((layers[0], n+1))] + [np.zeros((layers[l], layers[l-1]+1)) for l in range(1, len(layers))]
    print("theta", theta)

    while True:
        epoch += 1
        if epoch > 8:
            return theta

        for b in range(int(m/M)):
            t += 1
            print("b", b, epoch, t)
            sum_J_theta_derivatives = [np.zeros((layers[0], n+1))] + [np.zeros((layers[l], layers[l-1]+1)) for l in range(1, len(layers))]

            cost = 0
            for i in range(b * M, (b+1) * M):
                x, y = X_train[i], y_train[i]
                o = forward_propagation(x, theta)
                deltas = back_propagation(y, o, theta)

                print("c", y, o[-1], y - o[-1])
                cost += np.sum((y - o[-1]) ** 2)
                # print("x", x.shape, x)
                # print("o", o)
                # print("deltas", deltas)

                # calculate J(theta) derivatives
                for l in range(len(layers)):
                    if l == 0:
                        x_j = np.hstack((np.ones(1), x))
                    else:
                        x_j = np.hstack((np.ones(1), o[l-1]))
                    for j in range(layers[l]):
                        J_theta_derivative = - deltas[l][j] * x_j
                        sum_J_theta_derivatives[l][j] += J_theta_derivative

            prev_theta = copy.deepcopy(theta)

            cost = cost / (2 * M)
            print("cost", cost)
            if prev_cost == -1:
                prev_cost = cost
            else:
                print("cost_diff", abs(prev_cost - cost))
                if abs(prev_cost - cost) <= epsilon:
                    k_repeats += 1
                else:
                    k_repeats = 0

                if k_repeats > 2:
                    print("converged")
                    return theta
                prev_cost = cost


            # print("sumJ", sum_J_theta_derivatives)

            # update theta
            for l in range(len(layers)):
                for j in range(layers[l]):
                    theta[l][j] = theta[l][j] - learning_rate * (sum_J_theta_derivatives[l][j] / M)

            # break when stopping criteria meets
            # stop = True
            # theta_diff_all = 0
            # for l in range(len(layers)):
            #     # print("theta", theta[l], prev_theta[l])
            #     # print("theta_diff", abs(theta[l] - prev_theta[l]))
            #     theta_diff_all += np.sum(abs(theta[l] - prev_theta[l]))
            #     if not (abs(theta[l] - prev_theta[l]) <= epsilon).all():
            #         stop = False
            #         break
            # print("stop", stop, theta_diff_all)
            # if stop:
            #     return theta


In [78]:
theta_opt = gradient_descent(100, 0.001, 1e-7)

theta [array([[0.80903161, 0.12906522, 0.49489552, ..., 0.9293648 , 0.27943917,
        0.25058952],
       [0.18313831, 0.20935472, 0.38695125, ..., 0.38538855, 0.34106078,
        0.2957163 ],
       [0.52972281, 0.87856907, 0.234109  , ..., 0.9784931 , 0.91903609,
        0.10498746],
       ...,
       [0.64502984, 0.2696028 , 0.64886614, ..., 0.94101996, 0.67868091,
        0.03004592],
       [0.65832776, 0.94056994, 0.95609108, ..., 0.97516941, 0.40012643,
        0.2619641 ],
       [0.21621508, 0.05380787, 0.03204319, ..., 0.1972469 , 0.22126481,
        0.41581646]]), array([[0.96816134, 0.73802811, 0.82751551, 0.25536976, 0.63116236,
        0.84754177, 0.93965417, 0.32543112, 0.41880811, 0.37579071,
        0.80171154],
       [0.43060755, 0.42620427, 0.58408956, 0.56129657, 0.65285415,
        0.84157626, 0.15495038, 0.92508703, 0.01244827, 0.5425902 ,
        0.48826465],
       [0.18488911, 0.97459749, 0.42663863, 0.1358867 , 0.19081594,
        0.75086005, 0.73261126, 0

KeyboardInterrupt: 

In [27]:
m_test = X_test.shape[0]
X_test = X_test.reshape((m_test, n)) # reshape
X_test = X_test / 255 # scale to 0-1
y_test = one_hot_encode(y_test)
y_test

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [55]:
theta_opt

[array([[0.8983768 , 0.89354809, 0.16856843, ..., 0.16482788, 0.71728925,
         0.7951078 ],
        [0.85842325, 0.09148342, 0.97794681, ..., 0.63028074, 0.84748423,
         0.50002625],
        [0.82945916, 0.85497264, 0.17852684, ..., 0.21572216, 0.33094666,
         0.43530654],
        ...,
        [0.08779692, 0.59418405, 0.38972125, ..., 0.61208763, 0.32395492,
         0.73807391],
        [0.82741787, 0.94958411, 0.761047  , ..., 0.96900833, 0.29796601,
         0.77157297],
        [0.35601509, 0.56539402, 0.38060999, ..., 0.00141187, 0.19196607,
         0.6197561 ]]),
 array([[0.7750718 , 0.83246727, 0.85055511, 0.02711427, 0.83854702,
         0.79856006, 0.50768414, 0.6343088 , 0.40877504, 0.61151284,
         0.08072968],
        [0.08275195, 0.23828266, 0.90784493, 0.06780219, 0.55238097,
         0.22962959, 0.11666116, 0.28212419, 0.93011742, 0.6489661 ,
         0.20197574],
        [0.2179453 , 0.6276273 , 0.688323  , 0.83465417, 0.58426554,
         0.21103362,

In [66]:
o_pred = forward_propagation(X_train[0], theta_opt)
print(o_pred[-1], y_train[0])

[0.99828277 0.98605425 0.99650682 0.99478414 0.99835403 0.98887731
 0.99715039 0.99358469 0.99890484 0.99704683] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [68]:
theta_opt[0].shape

(10, 785)