# In [None]:
import numpy as np
import tensorflow as tf
import random
from tensorflow.examples.tutorials.mnist import input_data

# Load the MNIST dataset through the TensorFlow tutorial helper, passing
# "MNIST_data/" as its directory argument. one_hot=True matters: the code below
# reads the number of classes from label.shape[1], so labels must be 2-D
# one-hot rows rather than integer class ids.
# NOTE(review): presumably this helper is unavailable in newer TensorFlow
# releases -- confirm the TensorFlow version this notebook is pinned to.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# In [None]:
class neuralNetwork:
    """Feed-forward classifier with two ReLU hidden layers and a sigmoid output.

    Forward pass (see ``forwardProp``)::

        a1 = relu(x @ l1W)
        a2 = relu([1, a1] @ l2W)
        a3 = sigmoid([1, a2] @ oW)      # clipped to [0.0001, 0.9999]

    The first layer has no bias term. The second hidden layer and the output
    layer store their bias weights in row 0 of ``l2W`` and ``oW``.
    Weights are created by ``randInit``; every update is full-batch.
    """

    def __init__(self, l1Size, l2Size):
        """Store the number of units in the first and second hidden layer."""
        self.l1Size = l1Size
        self.l2Size = l2Size

    @staticmethod
    def _addBias(a):
        """Return ``a`` with a leading column of ones (the bias input)."""
        return np.append(np.ones((a.shape[0], 1)), a, axis=1)

    def _weights(self):
        """Return the weight matrices in the order used by ``_gradients``."""
        return (self.l1W, self.l2W, self.oW)

    def randInit(self, x, y):
        """Draw all weights uniformly from [-0.5, 0.5).

        Only the shapes of the arguments are used: ``x.shape[1]`` is the
        number of input features and ``y.shape[1]`` the number of classes.
        """
        n = x.shape[1]  # features
        classes = y.shape[1]  # classes
        self.l1W = np.random.rand(n, self.l1Size) - 0.5
        self.l2W = np.random.rand(self.l1Size + 1, self.l2Size) - 0.5
        self.oW = np.random.rand(self.l2Size + 1, classes) - 0.5

    def forwardProp(self, x):
        """Run the network on ``x`` and return ``(a3, a2, a1)``.

        ``a3`` is the clipped sigmoid output; ``a2`` and ``a1`` are the ReLU
        activations of the second and first hidden layer (without bias
        column).
        """
        z1 = x @ self.l1W
        a1 = z1 * (z1 > 0)
        z2 = self._addBias(a1) @ self.l2W
        a2 = z2 * (z2 > 0)
        z3 = self._addBias(a2) @ self.oW
        # Keep the output strictly inside (0, 1) so the logs in costFunc stay
        # finite.
        a3 = np.clip(1 / (1 + np.exp(-z3)), 0.0001, 0.9999)
        return (a3, a2, a1)

    def costFunc(self, x, y):
        """Return the mean (per sample) sigmoid cross-entropy of ``x`` vs ``y``."""
        a3 = self.forwardProp(x)[0]
        return np.sum(y * np.log(a3) + (1 - y) * np.log(1 - a3)) / -y.shape[0]

    def _gradients(self, x, y):
        """Return ``(l1D, l2D, oD)``, the gradient of ``costFunc`` w.r.t. each weight.

        The error is pushed back through the non-bias rows of each weight
        matrix and multiplied by the ReLU derivative (1 where the activation
        is positive, 0 elsewhere). Clipping of ``a3`` is ignored, i.e. the
        output error is taken as ``a3 - y`` everywhere.
        """
        a3, a2, a1 = self.forwardProp(x)

        a3Err = (a3 - y) / x.shape[0]
        # a = z * (z > 0), so (a > 0) is exactly the ReLU derivative mask.
        a2Err = (a3Err @ self.oW[1:].T) * (a2 > 0)
        a1Err = (a2Err @ self.l2W[1:].T) * (a1 > 0)

        oD = self._addBias(a2).T @ a3Err
        l2D = self._addBias(a1).T @ a2Err
        l1D = x.T @ a1Err
        return (l1D, l2D, oD)

    def optFunc(self, x, y, algo = "gradientdescent",
                alpha = 0.01, beta = 0.9, beta2 = 0.999,
                batchSize = 128):
        """Build a zero-argument function that performs one weight update.

        Each call of the returned function computes the full-batch gradient
        on ``(x, y)`` and updates ``l1W``, ``l2W`` and ``oW`` in place.

        Parameters:
            algo: optimizer name; case and spaces are ignored. One of
                "gradientdescent"/"gd", "gdwmomentum"/"momentum", "rmsprop",
                "adam". Unknown names fall back to plain gradient descent.
            alpha: learning rate.
            beta: decay of the gradient average (momentum, Adam first moment)
                and of the squared-gradient average in RMSProp.
            beta2: decay of the squared-gradient average in Adam.
            batchSize: accepted for interface compatibility; currently unused.

        Running averages are seeded with the gradient at construction time,
        so no bias correction is applied.
        """
        epsilon = 0.0000001  # avoids division by zero where a gradient is 0

        def gradientDescent():
            def optimize():
                for w, g in zip(self._weights(), self._gradients(x, y)):
                    w -= alpha * g
            return optimize

        def GDWMomentum():
            velocity = list(self._gradients(x, y))

            def optimize():
                grads = self._gradients(x, y)
                for i, (w, g) in enumerate(zip(self._weights(), grads)):
                    velocity[i] = beta * velocity[i] + (1 - beta) * g
                    w -= alpha * velocity[i]
            return optimize

        def RMSProp():
            squares = [g * g for g in self._gradients(x, y)]

            def optimize():
                grads = self._gradients(x, y)
                for i, (w, g) in enumerate(zip(self._weights(), grads)):
                    squares[i] = beta * squares[i] + (1 - beta) * g * g
                    w -= alpha * g / (np.sqrt(squares[i]) + epsilon)
            return optimize

        def ADAM():
            first = list(self._gradients(x, y))
            second = [g * g for g in first]

            def optimize():
                grads = self._gradients(x, y)
                for i, (w, g) in enumerate(zip(self._weights(), grads)):
                    first[i] = beta * first[i] + (1 - beta) * g
                    second[i] = beta2 * second[i] + (1 - beta2) * g * g
                    w -= alpha * first[i] / (np.sqrt(second[i]) + epsilon)
            return optimize

        # Map names to factories and build only the requested optimizer:
        # every factory except gradientDescent runs a full forward/backward
        # pass to seed its state.
        factories = {
            "gradientdescent": gradientDescent,
            "gd": gradientDescent,
            "gdwmomentum": GDWMomentum,
            "momentum": GDWMomentum,
            "rmsprop": RMSProp,
            "adam": ADAM,
        }
        factory = factories.get(algo.replace(" ", "").lower(), gradientDescent)
        return factory()

    def train(self, x, y, iters = 500):
        """Run ``iters`` Adam updates (alpha 0.002) on ``(x, y)``.

        The cost on the training data is printed every 20th iteration.
        """
        optimize = self.optFunc(x, y, "adam", 0.002)
        for i in range(1, iters + 1):
            optimize()
            if i % 20 == 0:
                print("Iter: ", i, " JVal: ", self.costFunc(x, y))

    def predict(self, x):
        """Return the output-layer activations for ``x`` (one row per sample)."""
        return self.forwardProp(x)[0]
            
            
def testAccuracy(predict, label):
    """Return the fraction of rows whose predicted class matches the label.

    Parameters:
        predict: 2-D array of per-class scores, one row per sample.
        label: 2-D one-hot array with the same shape as ``predict``.

    The predicted class of a row is the index of its largest score. When
    several scores tie for the maximum, the lowest index wins, so every row
    counts as exactly right or exactly wrong and the result stays in [0, 1].
    """
    predicted = np.argmax(np.asarray(predict), axis=1)
    expected = np.argmax(np.asarray(label), axis=1)
    return float(np.mean(predicted == expected))


# The name starts with "test", so tell pytest this helper is not a test case.
testAccuracy.__test__ = False

        
# Train a 100/100 hidden-unit network on the first 10,000 training digits.
trainX, trainY = mnist.train.images[:10000], mnist.train.labels[:10000]
n = neuralNetwork(l1Size=100, l2Size=100)
n.randInit(trainX, trainY)
n.train(trainX, trainY, iters=500)

# Score the trained network on the first 2,000 test digits.
testX, testY = mnist.test.images[:2000], mnist.test.labels[:2000]
print(testAccuracy(n.predict(testX), testY))



# In [None]:
# Disabled helper, kept inside a string literal so it never executes.
# It is a finite-difference gradient checker written as a method (it takes
# `self`): for every entry of oW, l2W and l1W it shifts the weight by +0.00001
# and -0.00001, evaluates costFunc both times, stores
# (upper - lower) / 0.00002, and restores the weight. It returns the three
# numeric gradients as (l1WDer, l2WDer, oWDer).
# NOTE(review): the `def` line is indented five spaces while its body uses
# eight; it needs re-indenting before it can be pasted into a class.
"""
#testing code
     def checkDerivative(self, x, y):
        oWDer = np.zeros((self.oW.shape[0], self.oW.shape[1]))
        for i in range(self.oW.shape[0]):
            for j in range(self.oW.shape[1]):
                self.oW[i][j] += 0.00001
                upper = self.costFunc(x, y)
                self.oW[i][j] -= 0.00002
                lower = self.costFunc(x, y)
                oWDer[i][j] = (upper - lower) / 0.00002
                self.oW[i][j] += 0.00001
                
        l2WDer = np.ones((self.l2W.shape[0], self.l2W.shape[1]))
        for i in range(self.l2W.shape[0]):
            for j in range(self.l2W.shape[1]):
                self.l2W[i][j] += 0.00001
                upper = self.costFunc(x, y)
                self.l2W[i][j] -= 0.00002
                lower = self.costFunc(x, y)
                l2WDer[i][j] = (upper - lower) / 0.00002
                self.l2W[i][j] += 0.00001
                
        l1WDer = np.ones((self.l1W.shape[0], self.l1W.shape[1]))
        for i in range(self.l1W.shape[0]):
            for j in range(self.l1W.shape[1]):
                self.l1W[i][j] += 0.00001
                upper = self.costFunc(x, y)
                self.l1W[i][j] -= 0.00002
                lower = self.costFunc(x, y)
                l1WDer[i][j] = (upper - lower) / 0.00002
                self.l1W[i][j] += 0.00001
                
        print("Done")
        return (l1WDer, l2WDer,oWDer)
"""

# In [None]:
# Disabled work-in-progress, kept inside a string literal so it never executes:
# a copy of neuralNetwork that normalises the first hidden layer's activations
# (scaled by gamma2, shifted by beta2) before the second layer, plus a
# finite-difference gradient checker.
# NOTE(review): the draft cannot run as written. Issues visible in the text:
#   * randInit reads `self.l1Szie` (typo for l1Size).
#   * gamma2/beta2 have length l1Size + 1 but are combined with zNorm2, which
#     has l1Size columns (the bias column is only added afterwards).
#   * np.mean/np.std are called without an axis, so the statistics are taken
#     over the whole activation matrix rather than per feature.
#   * forwardProp returns `Zorm2`, a name that is never defined.
#   * getDerivative contains `np.(a2Err * zNorm2)` (syntax error), never
#     returns gamma2D/beta2D, and its own TODO says l1D still has to be
#     reworked for the normalisation step.
#   * In checkDerivative the gamma2/beta2 `for` loops are indented deeper than
#     the statement before them, store their results in l1WDer instead of
#     gamma2Der/beta2Der, and the beta2 loop perturbs self.l1W[i] before
#     stepping self.beta2[i]; neither new array is returned.
"""
#if you really want to implement batch norm
#https://chrisyeh96.github.io/2017/08/28/deriving-batchnorm-backprop.html

class neuralNetwork:
    def __init__(self, l1Size, l2Size):
        self.l1Size = l1Size
        self.l2Size = l2Size
        
    def randInit(self, x, y):
        n = x.shape[1] #features
        classes = y.shape[1] #classes
        self.l1W = np.random.rand(n, self.l1Size) - 0.5
        self.l2W = np.random.rand(self.l1Size + 1, self.l2Size) - 0.5
        self.oW = np.random.rand(self.l2Size + 1, classes) - 0.5
        self.gamma2 = np.ones(self.l1Size + 1)
        self.beta2 = np.zeros(self.l1Szie + 1)

        
    def forwardProp(self, x):
        def addBias(x):
            return np.append(np.ones((x.shape[0], 1)), x, axis=1)
        
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))
        
        z1 = x @ self.l1W
        a1 = z1 * (z1 > 0)
        u1 = np.mean(a1)
        s1 = np.std(a1)
        zNorm2 = (a1-u1)/s1
        z2 = addBias(self.gamma2 * zNorm2 + self.beta2) @ self.l2W
        a2 = z2 * (z2 > 0)
        z3 = addBias(a2) @ self.oW
        a3 = np.minimum(0.9999, np.maximum(0.0001, sigmoid(z3)))
        return (a3, a2, a1, Zorm2)
    
    def costFunc(self, x, y):   
        a3 = self.forwardProp(x)[0]
        return sum(sum(y * np.log(a3) + (1 - y) * np.log(1 - a3))) / -y.shape[0]          
        
    def optFunc(self, x, y, algo = "gradientdescent", 
                alpha = 0.01, beta = 0.9, beta2 = 0.999, 
                batchSize = 128):
        def addBias(x):
            return np.append(np.ones((x.shape[0], 1)), x, axis=1)
        
        def getDerivative():
            a3, a2, a1, zNorm2 = self.forwardProp(x)
            
            #Backprop
            a3Err = (a3 - y) / x.shape[0]
            a2Err = (a3Err @ self.oW.T)
            a1Err = (a2Err[:,1:] @ self.l2W.T)

            oD = addBias(a2).T @ a3Err 
            l2D = addBias(a1).T @ a2Err[:,1:]
            gamma2D = np.(a2Err * zNorm2)
            beta2D = np.mean(a2Err)
            #
            # Todo is here
            #
            ###l1D needs to be changed
            l1D = x.T @ a1Err[:,1:]
            return (l1D, l2D, oD)
        
        def gradientDescent():
            def optimize():
                ders = getDerivative()
                self.l1W -= alpha * ders[0]
                self.l2W -= alpha * ders[1]
                self.oW -= alpha * ders[2]
            return optimize
        
        def GDWMomentum():
            dersO = getDerivative()
            l1WB = dersO[0]
            l2WB = dersO[1]
            oWB = dersO[2]
            def optimize():
                nonlocal l1WB
                nonlocal l2WB
                nonlocal oWB
                ders = getDerivative()
                l1WB = beta * l1WB + (1 - beta) * ders[0]
                l2WB = beta * l2WB + (1 - beta) * ders[1]
                oWB = beta * oWB + (1 - beta) * ders[2]
                self.l1W -= alpha * l1WB
                self.l2W -= alpha * l2WB
                self.oW -= alpha * oWB
            return optimize
        
        def RMSProp():
            dersO = getDerivative()
            l1WS = dersO[0] * dersO[0]
            l2WS = dersO[1] * dersO[1]
            oWS = dersO[2] * dersO[2]
            epsilon = 0.0000001
            def optimize():
                nonlocal l1WS
                nonlocal l2WS
                nonlocal oWS
                ders = getDerivative()
                l1WS = beta * l1WS + (1 - beta) * ders[0] * ders[0]
                l2WS = beta * l2WS + (1 - beta) * ders[1] * ders[1]
                oWS = beta * oWS + (1 - beta) * ders[2] * ders[2]
                self.l1W -= alpha * ders[0] / (np.sqrt(l1WS) + epsilon)
                self.l2W -= alpha * ders[1] / (np.sqrt(l2WS) + epsilon)
                self.oW -= alpha * ders[2] / (np.sqrt(oWS) + epsilon)
            return optimize
        
        def ADAM():
            dersO = getDerivative()
            l1WB = dersO[0]
            l2WB = dersO[1]
            oWB = dersO[2]
            l1WS = dersO[0] * dersO[0]
            l2WS = dersO[1] * dersO[1]
            oWS = dersO[2] * dersO[2]
            epsilon = 0.0000001
            def optimize():
                nonlocal l1WB
                nonlocal l2WB
                nonlocal oWB
                nonlocal l1WS
                nonlocal l2WS
                nonlocal oWS
                ders = getDerivative()
                l1WB = beta * l1WB + (1 - beta) * ders[0]
                l2WB = beta * l2WB + (1 - beta) * ders[1]
                oWB = beta * oWB + (1 - beta) * ders[2]
                l1WS = beta * l1WS + (1 - beta) * ders[0] * ders[0]
                l2WS = beta * l2WS + (1 - beta) * ders[1] * ders[1]
                oWS = beta * oWS + (1 - beta) * ders[2] * ders[2]
                self.l1W -= alpha * l1WB / (np.sqrt(l1WS) + epsilon)
                self.l2W -= alpha * l2WB / (np.sqrt(l2WS) + epsilon)
                self.oW -= alpha * oWB / (np.sqrt(oWS) + epsilon)
            return optimize
            
        def switch(x):
            return {
                "gradientdescent": gradientDescent(),
                "gd": gradientDescent(),
                "gdwmomentum": GDWMomentum(),
                "momentum": GDWMomentum(),
                "rmsprop": RMSProp(),
                "adam": ADAM()
            }.get(x.replace(" ", "").lower(), gradientDescent) 
        return switch(algo)
    
    def train(self, x, y, iters = 500):
        optimize = self.optFunc(x, y, "adam", 0.002)
        for i in range(1,iters):
            optimize()
            if(i % 20 == 0):
                print("Iter: ", i, " JVal: ", self.costFunc(x, y))
        
    def predict(self, x):
        return self.forwardProp(x)[0]
            
    def checkDerivative(self, x, y):
        oWDer = np.zeros((self.oW.shape[0], self.oW.shape[1]))
        for i in range(self.oW.shape[0]):
            for j in range(self.oW.shape[1]):
                self.oW[i][j] += 0.00001
                upper = self.costFunc(x, y)
                self.oW[i][j] -= 0.00002
                lower = self.costFunc(x, y)
                oWDer[i][j] = (upper - lower) / 0.00002
                self.oW[i][j] += 0.00001
                
        l2WDer = np.ones((self.l2W.shape[0], self.l2W.shape[1]))
        for i in range(self.l2W.shape[0]):
            for j in range(self.l2W.shape[1]):
                self.l2W[i][j] += 0.00001
                upper = self.costFunc(x, y)
                self.l2W[i][j] -= 0.00002
                lower = self.costFunc(x, y)
                l2WDer[i][j] = (upper - lower) / 0.00002
                self.l2W[i][j] += 0.00001
                
        l1WDer = np.ones((self.l1W.shape[0], self.l1W.shape[1]))
        for i in range(self.l1W.shape[0]):
            for j in range(self.l1W.shape[1]):
                self.l1W[i][j] += 0.00001
                upper = self.costFunc(x, y)
                self.l1W[i][j] -= 0.00002
                lower = self.costFunc(x, y)
                l1WDer[i][j] = (upper - lower) / 0.00002
                self.l1W[i][j] += 0.00001
                
        gamma2Der = np.ones(self.l1Size + 1)
            for i in range(self.l1Size + 1):
                self.gamma2[i] += 0.00001
                upper = self.costFunc(x, y)
                self.gamma2[i] -= 0.00002
                lower = self.costFunc(x, y)
                l1WDer[i] = (upper - lower) / 0.00002
                self.gamma2[i] += 0.00001
        
        beta2Der = np.ones(self.l1Size + 1)
            for i in range(self.l1Size + 1):
                self.l1W[i] += 0.00001
                upper = self.costFunc(x, y)
                self.beta2[i] -= 0.00002
                lower = self.costFunc(x, y)
                l1WDer[i] = (upper - lower) / 0.00002
                self.beta2[i] += 0.00001
            
        return (l1WDer, l2WDer,oWDer)
"""