In [1]:
import numpy as np
import tensorflow as tf
import sys
import utils
import backward
import forward
from tensorflow import keras
from tensorflow.keras import datasets

In [2]:
class myCNN(object):
    
    def __init__(self):
        pass

    
    ######################################
    ############## Forward ###############
    ######################################   
    
    def forwardPass(self, X, y, params):
        
        k1 ,k2  ,w3  ,w4 ,b1 ,b2 ,b3 ,b4 = params 
        
        z1 = forward.convolve(self, X, k1, b1, stride = 1)
        c1 = utils.relu(self, z1)
        
        p1 = forward.pool(self, c1, filSize = (2,2), stride = 2)
        
        z2 = forward.convolve(self, p1, k2, b2, stride = 1)
        c2 = utils.relu(self, z2)
        
        p2 = forward.pool(self, c2, filSize = (3,3), stride = 2)
        p2Flat = p2.flatten().reshape(1,1600)
        
        z3 = (w3.dot(p2Flat.T)).reshape(64,1)+b3
        f1 = utils.relu(self, z3)

        z4 = w4.dot(z3).reshape(10,1)+b4
        f2 = utils.softmax(self,z4)  
        
        cost = forward.loss(self, f2, y)
        
        fp = [z1,c1,p1,z2,c2,p2,p2Flat,z3,f1,z4,f2]
        
        return (cost, fp)
    
     ######################################
     ############# Backward ###############
     ######################################
    
    def backwardPass(self, params, cost, fp, X, y):
        z1,c1,p1,z2,c2,p2,p2Flat,z3,f1,z4,f2 = fp
        k1 ,k2  ,w3  ,w4 ,b1 ,b2 ,b3 ,b4 = params
        
        grads = []
        
        # fc second gradient
        df2, dw4, db4 = backward.fc_grad_second(self, y, f2, f1)

        # fc first gradient
        df1, dw3, db3 = backward.fc_grad_first(self, df2, dw4, p2Flat, z1)

        # flattened pool 2 gradient
        dp2Flat = dw3.T.dot(df1)
        dp2 = np.reshape(dp2Flat, p2.shape)

        # conv 2 gradient
        dc2 = backward.maxpool_gradient(self, c2, dp2, filSize = (3, 3), stride = 2)
        dc2[c2<=0] = 0

        # pool 1 gradient
        dp1, dk2, db2 = backward.convolution_gradient(self, dc2, p1, k2, b2, filSize = (3,3), stride = 1)

        # conv 1 gradient
        dc1 = backward.maxpool_gradient(self, c1, dp1, filSize = (2, 2), stride = 2)
        dc1[c1<=0] = 0

        # image gradient 
        dX, dk1, db1 = backward.convolution_gradient(self, dc1, X, k1, b1, filSize = (3,3), stride = 1)
        
        grads = [df2, dw4, db4,df1, dw3, db3,dp2,dc2,dp1, dk2, db2,dc1,dX, dk1, db1]
        return grads

    ######################################
    ########### Optimization #############
    ######################################

    def optimize(self, alpha, beta1, beta2, epsilon, moments, grads, params, t, batchSize):
        """
        1 - Initialize a step size, alpha
        2 - Initialize exponential decay rates for first and second order moment estimates of the gradients, 
            beta1 & beta2
        3 - Obtain the stochastic functions (the loss) from the forward pass
        4 - Obtain parameters (kn, wn, bn) from the forward pass
        5 - Initialize first order moment, mo
        6 - Initialize second order moment, vo
        7 - Initialize timestep
        """
        v1,m1,bv1,bm1,v2,m2,bv2,bm2,v3,m3,bv3,bm3,v4,m4,bv4,bm4 = moments
        df2, dw4, db4,df1, dw3, db3,dp2,dc2,dp1, dk2, db2,dc1,dX, dk1, db1 = grads
        k1,k2  ,w3  ,w4 ,b1 ,b2 ,b3 ,b4 = params
                
        # Get first and second moment estimates
        m1 = beta1 * m1 + (1-beta1) * dk1/batchSize
        v1 = beta2 * v1 + (1-beta2) * (dk1/batchSize)**2
        bm1 = beta1 * bm1 + (1-beta1) * db1/batchSize
        bv1 = beta2 * bv1 + (1-beta2) * (db1/batchSize)**2

        m2 = beta1 * m2 + (1-beta1) * dk2/batchSize
        v2 = beta2 * v2 + (1-beta2) * (dk2/batchSize)**2
        bm2 = beta1 * bm2 + (1-beta1) * db2/batchSize
        bv2 = beta2 * bv2 + (1-beta2) * (db2/batchSize)**2

        m3 = beta1 * m3 + (1-beta1) * dw3/batchSize
        v3 = beta2 * v3 + (1-beta2) * (dw3/batchSize)**2
        bm3 = beta1 * bm3 + (1-beta1) * db3/batchSize
        bv3 = beta2 * bv3 + (1-beta2) * (db3/batchSize)**2

        m4 = beta1 * m4 + (1-beta1) * dw4/batchSize
        v4 = beta2 * v4 + (1-beta2) * (dw4/batchSize)**2
        bm4 = beta1 * bm4 + (1-beta1) * db4/batchSize
        bv4 = beta2 * bv4 + (1-beta2) * (db4/batchSize)**2

        # Correct estimates for zero bias
        
        v1 = v1 / (1-beta2**t)
        bm1 = bm1 / (1-beta1**t)
        bv1 = bv1 / (1-beta2**t)

        m2 = m2 / (1-beta1**t)
        v2 = v2 / (1-beta2**t)
        bm2 = bm2 / (1-beta1**t)
        bv2 = bv2 / (1-beta2**t)

        m3 = m3 / (1-beta1**t)
        v3 = v3 / (1-beta2**t)
        bm3 = bm3 / (1-beta1**t)
        bv3 = bv3 / (1-beta2**t)

        m4 = m4 / (1-beta1**t)
        v4 = v4 / (1-beta2**t)
        bm4 = bm4 / (1-beta1**t)
        bv4 = bv4 / (1-beta2**t)

        # Update parameters
        k1 -= alpha*m1/(np.sqrt(v1 + epsilon))
        b1 -= alpha*bm1/(np.sqrt(bv1 + epsilon))

        k2 -= alpha*m2/(np.sqrt(v2 + epsilon))
        b2 -= alpha*bm2/(np.sqrt(bv2 + epsilon))

        w3 -= alpha*m3/(np.sqrt(v3 + epsilon))
        b3 -= alpha*bm3/(np.sqrt(bv3 + epsilon))

        w4 -= alpha*m4/(np.sqrt(v4 + epsilon))
        b4 -= alpha*bm4/(np.sqrt(bv4 + epsilon))
        
        # return updated parameters
        newParams = [k1,k2,w3,w4,b1,b2,b3,b4]
        
        return newParams

In [7]:

"""
This whole cell will later go into a class or function
"""

########################################## Initialization step ##########################################

# load the data
(train_images, train_labels), (test_images, test_labels) = datasets.mnist.load_data()
train_images = train_images.reshape((1, 28, 28, 60000)) # (channels, rows, cols, imgs)
test_images = test_images.reshape((1, 28, 28, 10000)) # (channels, rows, cols, imgs)

# normalization of the image pixel values
train_images, test_images = train_images / 255.0, test_images / 255.0

# break up data, first 10 images 
batchSize = 10
train_image = train_images[:,:,:,0:batchSize].reshape(1,28,28,batchSize) # batchsize x 28 pixels x 28 pixels x 1
train_label = train_labels[0:batchSize].reshape(batchSize)


# initialize parameters (2 convolution, 2 pool, 2 fully connected)
"""
kn = conv. kernels
wn = fc. weights
"""

k1 = np.random.randn(32, 1, 3, 3)
k2 = np.random.randn(64, 32, 3, 3)
w3 = np.random.randn(64, 1600) * 0.01
w4 = np.random.randn(10, 64) * 0.01

b1 = np.zeros((k1.shape[0],1))
b2 = np.zeros((k2.shape[0],1))
b3 = np.zeros((w3.shape[0],1))
b4 = np.zeros((w4.shape[0],1))

# Initialize optimization moments
v1 = np.zeros(k1.shape)
m1 = np.zeros(k1.shape)
bv1 = np.zeros(b1.shape)
bm1 = np.zeros(b1.shape)

v2 = np.zeros(k2.shape)
m2 = np.zeros(k2.shape)
bv2 = np.zeros(b2.shape)
bm2 = np.zeros(b2.shape)

v3 = np.zeros(w3.shape)
m3 = np.zeros(w3.shape)
bv3 = np.zeros(b3.shape)
bm3 = np.zeros(b3.shape)

v4 = np.zeros(w4.shape)
m4 = np.zeros(w4.shape)
bv4 = np.zeros(b4.shape)
bm4 = np.zeros(b4.shape)

params = [k1, k2, w3, w4, b1, b2, b3, b4]
moments = [v1,m1,bv1,bm1,v2,m2,bv2,bm2,v3,m3,bv3,bm3,v4,m4,bv4,bm4]

########################################## Training step #########################################

"""
To train, specify number of epochs. Batch size was determined earlier to break up the data. This method isn't permanent.
"""

Y = np.zeros((batchSize,numLabels,1))
numEpochs = 10
numLabels = 10 

# for each image in batch, one iteration = forward, backward, and optimization
for epoch in range(numEpochs):
    cost = []
    for img in range(batchSize):
        Y[img,train_label[img]] = 1.                                         # one hot vector labels
        image, label = train_image[:,:,:,img],Y[img]
        loss, fp = myCNN().forwardPass(image, label, params)                     # this returns the loss and forward pass
        grads =  myCNN().backwardPass(params, loss, fp, image, label)            # this returns the gradiets w.r.t the loss
        cost.append(loss)
        if (img+1) % batchSize == 0:
            print("now optimizing: epoch ", epoch+1)
            params = myCNN().optimize(0.001, 0.9, 0.999, 1E-7, moments, grads, params, img, batchSize)
            print("average cost: ", sum(cost)/batchSize)


now optimizing: epoch  1
average cost:  2.4020782075067157
now optimizing: epoch  2
average cost:  2.377868983514366
now optimizing: epoch  3
average cost:  2.373417179883282
now optimizing: epoch  4
average cost:  2.374710771120108
now optimizing: epoch  5
average cost:  2.3690391516507026
now optimizing: epoch  6
average cost:  2.3562317444532437
now optimizing: epoch  7
average cost:  2.3562317444532437
now optimizing: epoch  8
average cost:  2.3562317444532437
now optimizing: epoch  9
average cost:  2.3562317444532437
now optimizing: epoch  10
average cost:  2.3562317444532437
