# Description

In this notebook, I am going to code a 3 layer (input, hidden, output), customizable neural network from scratch - including back propagation. It will be a fully-connected binary classification neural network to start.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [166]:
class NNModel:
    def __init__(self, input_dim, hidden_dim, learning_rate=0.01, regularization=5):
        self.weights = dict()
        self.biases = dict()
        
        self.weights['L1'] = np.random.rand(hidden_dim, input_dim)
        self.weights['L2'] = np.random.rand(1, hidden_dim)
        
        self.biases['L1'] = np.random.rand(hidden_dim, 1)
        self.biases['L2'] = np.random.rand(1,1)
        
        self.learning_rate = learning_rate
        self.regularization = regularization
        
        next;
        
    def sigmoid(self, data):
#         bigger_data = np.array(data,dtype=np.float128)
        data = np.clip(data,a_min=-100, a_max=100)
        if np.sum(np.isnan(data)) > 0:
            print('nan:',np.sum(np.isnan(data)))
            print(data)
        return 1/(1+np.exp(-data))
    
    def sigmoidPrime(self, data):
        oz = self.sigmoid(data)
        return oz * (1-oz)
    
    def crossEntropyPrime(self, a2, target):
        epsilon = 10e-7
        return (-target+a2)/(a2*(1-a2)+epsilon)
    
    def crossEntropyError(self, a2, target):
        epsilon = 10e-8
        return np.sum(-target*np.log(a2+epsilon)-(1-target)*np.log(1-a2+epsilon))
    
    def normalize(self, data):
        # assumes samples are given as columns
        X = data.transpose()
        X_norm = (X - X.mean(axis=0))/X.std(axis=0)
        return X_norm.transpose()
    
    def is_converged(self, old_weights, old_biases, new_weights, new_biases, threshold=0.01) -> bool:
        max_relative_change = 0
        epsilon = 10e-6
        for key, val in old_weights.items():
            rel_changes_w = np.max(np.absolute(new_weights[key] - old_weights[key])/(old_weights[key]+epsilon))
            rel_changes_b = np.max(np.absolute(new_biases[key] - old_biases[key])/(old_biases[key]+epsilon))
            max_relative_change = max(max_relative_change, rel_changes_w, rel_changes_b)
        return max_relative_change < threshold
    
    # WX + B, where each column of X is a sample
    def forward(self, data):
        self.data = data.transpose()
        self.data = self.normalize(self.data)  # normalize
        self.z1 = np.dot(self.weights.get('L1'), self.data) + self.biases.get('L1')
        self.a1 = self.sigmoid(self.z1)
        self.a1 = self.normalize(self.a1)  # normalize
        self.z2 = np.dot(self.weights.get('L2'), self.a1) + self.biases.get('L2')
        self.a2 = self.sigmoid(self.z2)
#         print('a2 shape:',self.a2.shape)
        return self.a2
        
    
    def MSE(self, data, target):
        return np.mean((self.forward(data) - target)**2)
    
    def backprop(self, a2, target):
        new_weights = dict()
        new_biases = dict()
        
        num_samples = a2.shape[1]
#         print('num_samples',num_samples)
#         print('a2:',a2.shape)
#         print('target:',target.shape)
        dJda2 = self.crossEntropyPrime(a2, target)
#         print('dJda2:',dJda2.shape)
        da2dz2 = self.sigmoidPrime(a2)
#         print('da2dz2:',da2dz2.shape)
        dJdz2 = dJda2 * da2dz2  # delta last layer
#         print('dJdz2:',dJdz2.shape)
        dz2dW2 = self.a2.transpose()
#         print('dz2dW2:',dz2dW2.shape)
        regularization_2 = self.regularization*self.weights['L2']
        dJdW2 = np.dot(dJdz2, dz2dW2) + regularization_2
#         print('dJdW2:',dJdW2.shape)

    
        # update W2, b2
        new_weights['L2'] = self.weights['L2'] - self.learning_rate*dJdW2/num_samples
        new_biases['L2'] = self.biases['L2'] - self.learning_rate*np.sum(dJdz2, axis=1)/num_samples
#         print('bias shape after', new_biases['L2'])
        
        dJda1 = np.dot(self.weights['L2'].transpose(), dJdz2)
        dJdz1 = self.sigmoidPrime(self.z1)
        dJdz1 = dJda1*dJdz1  # delta hidden layer
        regularization_1 = self.regularization*self.weights['L1']
#         print('dJdz1 shape:',dJdz1.shape)
        dJdW1 = np.dot(dJdz1, self.data.transpose()) + regularization_1

    
#         print('layer 1')
#         print('delta sum shape',np.sum(dJdz1, axis=1).shape)
#         print('bias shape before',self.biases['L1'])
        
        # update W1, b1
        new_weights['L1'] = self.weights['L1'] - self.learning_rate*dJdW1/num_samples
        new_biases['L1'] = self.biases['L1'] - self.learning_rate*np.sum(dJdz1, axis=1).reshape((dJdz1.shape[0],1))/num_samples
#         print('bias shape after', new_biases['L1'])
        
        return new_weights, new_biases
        
        
    def train(self, data, target, epochs=5, batch_size=10):
        num_batches = np.ceil(data.shape[0]/batch_size)
        x_batches = np.array_split(data, num_batches)
        t_batches = np.array_split(target, num_batches, axis=1)
        # stopping condition < 1% change
        stop_thrsh = 0.0002
        converged = False
        for i in range(epochs):
#             print('epoch:',i)

            for j, batch in enumerate(x_batches):
#                 print('batch:',j)
#                 if i == 12 and j == 27:
#                     print('weights:')
#                     print(weights)
#                     print('biases:')
#                     print(biases)
#                 print('forward')
                a2 = self.forward(batch)
#                 print('backward')
                weights, biases = self.backprop(a2, t_batches[j])
                # check convergence
                converged = self.is_converged(self.weights, self.biases, weights, biases, stop_thrsh)
                if converged:
                    break
                
                self.weights = weights
                self.biases = biases
    #             print('epoch',i)
    #             print('weights:')
    #             print(self.weights)
    #             print('biases:')
    #             print(self.biases)
            # check convergence
            if converged:
                break
            a2 = self.forward(data)
            print('mean cross entropy error:', self.crossEntropyError(a2, target)/a2.shape[1])
            print('L2 weights:', self.weights['L2'])
#         print(self.weights)
#         print(self.biases)
        return None
        
        
    
        
        
        
    
        
    
        

In [45]:
model = NNModel(2,3,0.01)
data = np.array([[1,3],[2,4],[3,1],[4,1]])
targets = np.array([[0,0,1,1]])
model.train(data, targets)

mean cross entropy error: 1.3919377919326879
mean cross entropy error: 1.3744573063513847
mean cross entropy error: 1.3578751930700232
mean cross entropy error: 1.342113794118843
mean cross entropy error: 1.3271048698146937
{'L2': array([[0.39019367, 0.4188895 , 0.379056  ]]), 'L1': array([[0.20584178, 0.25366756],
       [0.02314041, 0.79365522],
       [0.27452474, 0.69370368]])}
{'L2': array([[0.84405153]]), 'L1': array([[0.52164269],
       [0.4219409 ],
       [0.90372831]])}


In [6]:
np.array_split(data, 3)
np.array_split(targets, 3, axis=1)

[array([[0, 0]]), array([[1]]), array([[1]])]

In [7]:
df = pd.read_csv('pima-indians-diabetes.txt')
df.head()
X = df.iloc[:,:-1].values
y = np.array([df.iloc[:,-1].values])

print(X.shape)
print(y.shape)

(767, 8)
(1, 767)


In [8]:
print(X[:10])
print(y[0][:10])
print(np.array_split(X,100)[0])
print(np.array_split(y,80,axis=1)[0])

[[1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
  3.100e+01]
 [8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
  3.200e+01]
 [1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01]
 [5.000e+00 1.160e+02 7.400e+01 0.000e+00 0.000e+00 2.560e+01 2.010e-01
  3.000e+01]
 [3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
  2.600e+01]
 [1.000e+01 1.150e+02 0.000e+00 0.000e+00 0.000e+00 3.530e+01 1.340e-01
  2.900e+01]
 [2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
  5.300e+01]
 [8.000e+00 1.250e+02 9.600e+01 0.000e+00 0.000e+00 0.000e+00 2.320e-01
  5.400e+01]
 [4.000e+00 1.100e+02 9.200e+01 0.000e+00 0.000e+00 3.760e+01 1.910e-01
  3.000e+01]]
[0 1 0 1 0 1 0 1 1 0]
[[1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
  3.100e+01]
 [8.000e+00 1.830e+02 6.400e+01 0.000e+00 

In [167]:
np.random.seed(23)
pid_model = NNModel(8,10,0.003)
pid_model.train(X,y,40)
pred = pid_model.forward(X) > 0.5
print('cross entropy error:',pid_model.crossEntropyError(pred, y))
# print('accuracy:', (y==pred)/len(y[0]))


mean cross entropy error: 0.7175027298137092
L2 weights: [[ 0.48451598  0.11555915 -0.1439925  -0.11371988  0.24927041  0.2251468
   0.22179052 -0.11382516  0.47800752  0.30416694]]
mean cross entropy error: 0.6670588393219025
L2 weights: [[ 0.38302037  0.0545022  -0.17660189 -0.14964723  0.17355835  0.15207875
   0.14909033 -0.14974097  0.37722526  0.22243807]]
mean cross entropy error: 0.6660758294502231
L2 weights: [[ 0.309982    0.0174703  -0.18830416 -0.1643038   0.12347758  0.1043522
   0.10169132 -0.16438726  0.30482204  0.16699995]]
mean cross entropy error: 0.6861823140913532
L2 weights: [[ 0.25068221 -0.00976942 -0.19299044 -0.17162059  0.08461918  0.06758999
   0.06522075 -0.1716949   0.2460878   0.12337138]]
mean cross entropy error: 0.7188525409722354
L2 weights: [[ 0.19974197 -0.03216346 -0.19530296 -0.1762753   0.05187989  0.03671715
   0.03460758 -0.17634147  0.19565112  0.08638475]]
mean cross entropy error: 0.7614389911370003
L2 weights: [[ 0.15425006 -0.05223791 -0.1

In [168]:
# print(y)
# print(pred)
print(np.sum(y==pred)/len(pred[0]))

0.7627118644067796


In [54]:
# trying to standardize the data
print('max values:',X.max(axis=0))
print('min values:',X.min(axis=0))
print(X.std(axis=0))
print(X.mean(axis=0))
X_norm = (X - X.mean(axis=0))/X.std(axis=0)
print(X_norm)


max values: [ 17.   199.   122.    99.   846.    67.1    2.42  81.  ]
min values: [ 0.     0.     0.     0.     0.     0.     0.078 21.   ]
[  3.36867837  31.9576152   19.35552462  15.94365537 115.20792869
   7.8839464    0.33128119  11.74463189]
[  3.8422425  120.85919166  69.10169492  20.51760104  79.90352021
  31.9904824    0.47167405  33.2190352 ]
[[-0.84372629 -1.12208597 -0.16024856 ... -0.68372895 -0.36426474
  -0.18894038]
 [ 1.23423997  1.94447577 -0.26357823 ... -1.10230105  0.60470064
  -0.1037951 ]
 [-0.84372629 -0.99692019 -0.16024856 ... -0.49346891 -0.91968415
  -1.0403932 ]
 ...
 [ 0.343683    0.0044061   0.14974046 ... -0.73446496 -0.68423462
  -0.27408566]
 [-0.84372629  0.16086333 -0.47023757 ... -0.23978884 -0.37030191
   1.17338414]
 [-0.84372629 -0.8717544   0.04641078 ... -0.20173684 -0.47293375
  -0.87010264]]


In [89]:
import keras as k
from keras.models import Sequential
from keras.layers import Dense

In [97]:
# check performance of Keras network
# https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
# https://keras.io/api/optimizers/
model = Sequential()
model.add(Dense(10, input_dim=8, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# opt = keras.optimizers.Adam(learning_rate=0.01)
# model.compile(loss='categorical_crossentropy', optimizer=opt)
model.fit(X,y.reshape(len(y[0]),-1), epochs = 20, batch_size=10)
preds = model.predict(X)
pid_model.crossEntropyError(preds, y)





Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


390251.22928100824