In [1]:
import numpy as np
import pandas as pd
#from keras.datasets import fashion_mnist
#import matplotlib.pyplot as plt
import wandb

In [None]:
(X_train, Y_train) , (X_test, Y_test) = fashion_mnist.load_data()

In [None]:
done = set()
fig, ax = plt.subplots(5, 2, figsize = (25, 25))
fig.tight_layout()
for x, y in zip(X_train, Y_train):
    if y not in done:
        done.add(y)
        ax[y % 5, y // 5].imshow(x)

In [2]:
class FNNClassifier:
    def __init__(self, layer_size, num_layers, activation='ReLU', optimizer='adam', weight_decay=0.0001, batch_size=200, learning_rate=0.001, num_epochs=200, weight_init='Xavier'):
        self.activation = activation # 'identity', 'logistic', 'tanh', 'relu'
        self.optimizer = optimizer # 'normal', 'sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam'
        self.weight_decay = weight_decay # L2 regularization hyperparameter
        self.batch_size = batch_size # Batch size
        self.learning_rate = learning_rate # Learning Rate
        self.num_epochs = num_epochs # Number of epochs
        self.weight_init = weight_init # 'random', 'xavier'
        self.n = 100
        self.K = 10
        self.L = num_layers
        self.N = layer_size
        self.layer_sizes = np.zeros((num_layers + 2))
        self.layer_sizes[1 : num_layers + 1] = layer_size
        self.W = []
        self.b = []
        self.a = []
        self.h = []
        wandb.log({'layer_sizes': layer_size})
        wandb.log({'num_layers': num_layers})
        wandb.log({'activation': activation})
        wandb.log({'optimizer': optimizer})
        wandb.log({'weight_decay': weight_decay})
        wandb.log({'batch_size': batch_size})
        wandb.log({'learning_rate': learning_rate})
        wandb.log({'weight_init': weight_init})
    
    def act(self, z):
        if(self.activation == 'sigmoid'):
            return 1 / (1 + np.exp(-z))
        elif(self.activation == 'tanh'):
            return np.tanh(z)
        elif(self.activation == 'ReLU'):
            return np.maximum(z, np.zeros(z.shape)) 
    
    def deriv_act(self, z):
        if(self.activation == 'sigmoid'):
            return 1 / (2 + np.exp(-z) + np.exp(z))
        elif(self.activation == 'tanh'):
            return np.cosh(z) ** -2
        elif(self.activation == 'ReLU'):
            return np.maximum(np.sign(z), np.zeros(z.shape))
    
    def oact(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0)
    
    def forward_prop(self, X):
        self.a = [ np.zeros((1, 1))]
        self.h = [X]
        
        for i in range(1, self.L + 1):
            self.a.append( (self.b[i].T + (self.W[i].T @ self.h[i - 1]).T ).T)
            self.h.append(self.act(self.a[i]))
        self.h[-1] = self.oact(self.a[-1])
        return self.h[-1]
    
    def back_prop(self, Y_pred, ey):
        self.gradW, self.gradB, self.grada = [], [], []
        self.gradh = [ np.zeros((1, 1))]
        self.grada.append(-(ey - Y_pred))
        for i in range (self.L, 0, -1):
            self.gradW.append((self.grada[self.L - i] @ self.h[i - 1].T).T)
            self.gradB.append(self.grada[self.L - i].sum(axis=1))
            self.gradh.append(self.W[i] @ self.grada[self.L - i])
            self.grada.append( self.gradh[self.L - i + 1] * self.deriv_act(self.a[i - 1]))
        
        self.gradW.append(np.zeros((self.N, self.N)))
        self.gradB.append(np.zeros(self.N))
            
        return
    
    def wb_init(self, num):
        L = 10
        if(self.weight_init == 'random'):
            for i in range(self.L):
                self.W.append(np.random.randn(self.N, self.N))
                self.b.append(np.random.randn(self.N))
            self.W.append(np.random.randn(self.N, self.K))
            self.W[1] = np.random.randn(num, self.W[1].shape[1])
            self.b.append(np.random.randn(self.K))
            
        elif (self.weight_init == 'Xavier'):
            for i in range(self.L):
                self.W.append(np.random.normal(0, np.sqrt(1 / self.N), (self.N, self.N)))
                self.b.append(np.random.normal(0, np.sqrt(1 / self.N), self.N))
            self.W.append(np.random.normal(0, np.sqrt(2 / (self.N + self.K)), (self.N, self.K)))
            self.W[1] = np.random.normal(0, np.sqrt(2 / (num + self.W[1].shape[1])), (num, self.W[1].shape[1]))
            self.b.append(np.random.normal(0, np.sqrt(1 / self.K), self.K))
    
    def grad_desc(self, X, Y):
        
        self.wb_init(X.shape[0])
        
        update_W, update_b = self.W, self.b
        v_W, v_b = self.W, self.b
        m_W, m_b = self.W, self.b
        
        ey = np.zeros((self.K, self.n))
        rows = np.arange(self.n)
        ey[Y.T - 1, rows] = 1
        
        for t in range(1, self.num_epochs + 1):
            
            if(self.optimizer == 'sgd'):
                for tt in range(0, ((self.n + self.batch_size - 1) // self.batch_size)):
                    idx = np.random.randint(self.n, size = self.batch_size)
                    ey = np.zeros((self.K, self.batch_size))
                    rows = np.arange(self.batch_size)
                    ey[Y.T[idx] - 1, rows] = 1
                    Y_pred = self.forward_prop(X.T[idx, :].T)

                    self.back_prop(Y_pred, ey)
                    dW = self.gradW[::-1]
                    db = self.gradB[::-1]

                    update_W = [self.learning_rate * u for u in dW]
                    update_b = [self.learning_rate * u for u in db]
                    update_W = [u + v * self.weight_decay for u, v in zip(update_W, dW)]
                    self.W = [u - v for u, v in zip(self.W, update_W)]
                    self.b = [u - v for u, v in zip(self.b, update_b)]
                continue
                
            Y_pred = self.forward_prop(X)
            
            if(self.optimizer == 'nesterov'):
                W , b = self.W, self.b
                ngamma = 0.9 # Hyperparameter
                self.W = [u - ngamma * v for u, v in zip(self.W, update_W)]
                self.b = [u - ngamma * v for u, v in zip(self.b, update_b)]
            
            self.back_prop(Y_pred, ey)
            
            
            
            dW = self.gradW[::-1]
            db = self.gradB[::-1]
            
            if(self.optimizer == 'normal'):
                update_W = [self.learning_rate * u for u in dW]
                update_b = [self.learning_rate * u for u in db]

                
            elif(self.optimizer == 'momentum'):
                mgamma = 0.9 # Hyperparameter
                update_W = [mgamma * u + self.learning_rate * v for u, v in zip(update_W, dW)]
                update_b = [mgamma * u + self.learning_rate * v for u, v in zip(update_b, db)]
            
            elif(self.optimizer == 'nesterov'):
                self.W, self.b = W, b
                update_W = [ngamma * u + self.learning_rate * v for u, v in zip(update_W, dW)]
                update_b = [ngamma * u + self.learning_rate * v for u, v in zip(update_b, db)]
            
            elif(self.optimizer == 'rmsprop'):
                rbeta = 0.9
                epsilon = 0.1
                v_W = [rbeta * u + (1 - rbeta) * (v ** 2) for u, v in zip(v_W, dW)]
                v_b = [rbeta * u + (1 - rbeta) * (v ** 2) for u, v in zip(v_b, db)]
                update_W = [(self.learning_rate * u) / np.sqrt(v + epsilon) for u, v in zip(dW, v_W)]
                update_b = [(self.learning_rate * u) / np.sqrt(v + epsilon) for u, v in zip(db, v_b)]
                
            elif(self.optimizer == 'adam'):
                abeta1 = 0.99
                abeta2 = 0.999
                epsilon = 0.1
                m_W = [abeta1 * u + (1 - abeta1) * v for u, v in zip(m_W, dW)]
                m_b = [abeta1 * u + (1 - abeta1) * v for u, v in zip(m_b, db)]
                v_W = [abeta2 * u + (1 - abeta2) * (v ** 2) for u, v in zip(v_W, dW)]
                v_b = [abeta2 * u + (1 - abeta2) * (v ** 2) for u, v in zip(v_b, db)]
                update_W = [( (self.learning_rate * u) / (1 - abeta1 ** t) ) / np.sqrt( (v / (1 - abeta2 ** t)) + epsilon) for u, v in zip(m_W, v_W)]
                update_b = [( (self.learning_rate * u) / (1 - abeta1 ** t) ) / np.sqrt( (v / (1 - abeta2 ** t)) + epsilon) for u, v in zip(m_b, v_b)]
                 
            elif(self.optimizer == 'nadam'):
                nbeta1 = 0.99
                nbeta2 = 0.999
                epsilon = 0.1
                m_W = [nbeta1 * u + (1 - nbeta1) * v for u, v in zip(m_W, dW)]
                m_b = [nbeta1 * u + (1 - nbeta1) * v for u, v in zip(m_b, db)]
                v_W = [nbeta2 * u + (1 - nbeta2) * (v ** 2) for u, v in zip(v_W, dW)]
                v_b = [nbeta2 * u + (1 - nbeta2) * (v ** 2) for u, v in zip(v_b, db)]
                update_W = [(self.learning_rate * (nbeta1 * u + (1 - nbeta1) * v)) / (1 - nbeta1 ** t) for u, v in zip(m_W, dW)]
                update_W = [ (u) / np.sqrt( (v / (1 - nbeta2 ** t)) + epsilon) for u, v in zip(m_W, v_W)]
                update_b = [(self.learning_rate * (nbeta1 * u + (1 - nbeta1) * v)) / (1 - nbeta1 ** t) for u, v in zip(m_b, db)]
                update_b = [ (u) / np.sqrt( (v / (1 - nbeta2 ** t)) + epsilon) for u, v in zip(m_b, v_b)]
            
            
            update_W = [u + v * self.weight_decay for u, v in zip(update_W, dW)]
            
            self.W = [u - v for u, v in zip(self.W, update_W)]
            self.b = [u - v for u, v in zip(self.b, update_b)]
    
    def fit(self, X_train, Y_train):
        self.n = X_train.shape[0]
        self.batch_size = min(self.batch_size, self.n)
        K = np.max(Y_train) # Y_train must have values from 1 to K
        self.layer_sizes[self.L - 1] = K
        self.K = K
        self.grad_desc(X_train.T, Y_train.T)
        
    def predict_proba(self, X_test):
        return self.forward_prop(X_test.T).T
    
    def predict(self, X_test):
        return (self.predict_proba(X_test).argmax(axis=1) + 1)

In [None]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
X.shape, y.shape

In [3]:
X_train = np.array([[ 1.,  2.,  0.,  8.],
 [ 6.,  3., 10.,  5.],
 [ 5.,  2.,  9.,  4.],
 [ 5.,  6.,  3.,  9.],
 [ 4.,  9.,  5.,  5.],
 [ 4.,  5.,  2.,  1.],
 [ 8.,  2.,  8.,  0.],
 [ 3.,  3.,  7.,  6.],
 [ 4.,  6.,  3.,  6.],
 [ 7.,  8.,  2.,  2.],
 [ 5.,  5., 10.,  2.]])
X_test = np.array([[ 3.,  7.,  8.,  7.],
 [ 2.,  5.,  3.,  2.],
 [ 0.,  5.,  2.,  3.],
 [ 7.,  0.,  6.,  8.]])
Y_train = np.array([3, 6, 5, 6, 6, 3, 4, 5, 5, 5, 6])
Y_test = np.array([6, 3, 2, 5])

#model = FNNClassifier(10, 2, activation = 'ReLU', optimizer = 'normal', num_epochs = 100)
#model.fit(X, y)
#print(model.predict_proba(X))
#print(model.predict(X))

In [4]:
sweep_config = {
    'method': 'grid', #'random',
    'metric': {
        'name': 'accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'layer_size': { 
            'values': [128]#[32, 64, 128] 
        },
        'num_layers': { 
            'values': [5]#[3, 4, 5] 
        },
        'activation': { 
            'values': ['sigmoid']#, 'tanh', 'ReLU'] 
        },
        'optimizer': { 
            'values': ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam'] 
        },
        'weight_decay': { 
            'values': [0]#[0, 0.0005, 0.5] 
        },
        'batch_size': { 
            'values': [16]#[16, 32, 64] 
        },
        'num_epochs': { 
            'values': [100]#[5, 10] 
        },
        'learning_rate': { 
            'values': [0.001]#[0.001, 0.0001] 
        },
        'weight_init': { 
            'values': ['Xavier']#['random', 'Xavier'] 
        }
    }
}


In [5]:
sweep_id = wandb.sweep(sweep_config, entity = 'raghuraman2000', project = 'cs6910_a1_0740')

Create sweep with ID: ss3uuml7
Sweep URL: https://wandb.ai/raghuraman2000/cs6910_a1_0740/sweeps/ss3uuml7


In [6]:
def train():
    def_params = {
        'layer_size' : 32,
        'num_layers' : 4,
        'activation' : 'ReLU',
        'optimizer' : 'adam',
        'weight_decay' : 0.0005,
        'batch_size' : 32,
        'num_epochs' : 5,
        'learning_rate' : 0.001,
        'weight_init' : 'random'
    }
    wandb.init(config = def_params)
    config = wandb.config
    model = FNNClassifier(**config)
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    wandb.log({'accuracy': np.sum(Y_predict == Y_test) / Y_test.shape[0]})
    

In [7]:
wandb.agent(sweep_id, train)

wandb: Agent Starting Run: mmtd9yur with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 16
wandb: 	layer_size: 128
wandb: 	learning_rate: 0.001
wandb: 	num_epochs: 100
wandb: 	num_layers: 5
wandb: 	optimizer: sgd
wandb: 	weight_decay: 0
wandb: 	weight_init: Xavier
wandb: Currently logged in as: raghuraman2000 (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.22 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
layer_sizes,128
_runtime,7
_timestamp,1615401465
_step,8
num_layers,5
activation,sigmoid
optimizer,sgd
weight_decay,0
batch_size,16
learning_rate,0.001


0,1
layer_sizes,▁
_runtime,▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁
_step,▁▂▃▄▅▅▆▇█
num_layers,▁
weight_decay,▁
batch_size,▁
learning_rate,▁
accuracy,▁


wandb: Agent Starting Run: t899sytg with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 16
wandb: 	layer_size: 128
wandb: 	learning_rate: 0.001
wandb: 	num_epochs: 100
wandb: 	num_layers: 5
wandb: 	optimizer: momentum
wandb: 	weight_decay: 0
wandb: 	weight_init: Xavier
wandb: wandb version 0.10.22 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
layer_sizes,128
_runtime,7
_timestamp,1615401477
_step,8
num_layers,5
activation,sigmoid
optimizer,momentum
weight_decay,0
batch_size,16
learning_rate,0.001


0,1
layer_sizes,▁
_runtime,▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁
_step,▁▂▃▄▅▅▆▇█
num_layers,▁
weight_decay,▁
batch_size,▁
learning_rate,▁
accuracy,▁


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: m24cj0je with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 16
wandb: 	layer_size: 128
wandb: 	learning_rate: 0.001
wandb: 	num_epochs: 100
wandb: 	num_layers: 5
wandb: 	optimizer: nesterov
wandb: 	weight_decay: 0
wandb: 	weight_init: Xavier
wandb: wandb version 0.10.22 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
layer_sizes,128
_runtime,6
_timestamp,1615401502
_step,8
num_layers,5
activation,sigmoid
optimizer,nesterov
weight_decay,0
batch_size,16
learning_rate,0.001


0,1
layer_sizes,▁
_runtime,▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁
_step,▁▂▃▄▅▅▆▇█
num_layers,▁
weight_decay,▁
batch_size,▁
learning_rate,▁
accuracy,▁


wandb: Agent Starting Run: tzyi3aab with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 16
wandb: 	layer_size: 128
wandb: 	learning_rate: 0.001
wandb: 	num_epochs: 100
wandb: 	num_layers: 5
wandb: 	optimizer: rmsprop
wandb: 	weight_decay: 0
wandb: 	weight_init: Xavier
wandb: wandb version 0.10.22 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


  update_W = [(self.learning_rate * u) / np.sqrt(v + epsilon) for u, v in zip(dW, v_W)]
  update_b = [(self.learning_rate * u) / np.sqrt(v + epsilon) for u, v in zip(db, v_b)]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
layer_sizes,128
_runtime,7
_timestamp,1615401515
_step,8
num_layers,5
activation,sigmoid
optimizer,rmsprop
weight_decay,0
batch_size,16
learning_rate,0.001


0,1
layer_sizes,▁
_runtime,▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁
_step,▁▂▃▄▅▅▆▇█
num_layers,▁
weight_decay,▁
batch_size,▁
learning_rate,▁
accuracy,▁


wandb: Agent Starting Run: orrvibz4 with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 16
wandb: 	layer_size: 128
wandb: 	learning_rate: 0.001
wandb: 	num_epochs: 100
wandb: 	num_layers: 5
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: Xavier
wandb: wandb version 0.10.22 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


  update_W = [( (self.learning_rate * u) / (1 - abeta1 ** t) ) / np.sqrt( (v / (1 - abeta2 ** t)) + epsilon) for u, v in zip(m_W, v_W)]
  update_b = [( (self.learning_rate * u) / (1 - abeta1 ** t) ) / np.sqrt( (v / (1 - abeta2 ** t)) + epsilon) for u, v in zip(m_b, v_b)]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
layer_sizes,128
_runtime,7
_timestamp,1615401527
_step,8
num_layers,5
activation,sigmoid
optimizer,adam
weight_decay,0
batch_size,16
learning_rate,0.001


0,1
layer_sizes,▁
_runtime,▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁
_step,▁▂▃▄▅▅▆▇█
num_layers,▁
weight_decay,▁
batch_size,▁
learning_rate,▁
accuracy,▁


wandb: Agent Starting Run: ff63h109 with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 16
wandb: 	layer_size: 128
wandb: 	learning_rate: 0.001
wandb: 	num_epochs: 100
wandb: 	num_layers: 5
wandb: 	optimizer: nadam
wandb: 	weight_decay: 0
wandb: 	weight_init: Xavier
wandb: wandb version 0.10.22 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


  update_W = [ (u) / np.sqrt( (v / (1 - nbeta2 ** t)) + epsilon) for u, v in zip(m_W, v_W)]
  update_b = [ (u) / np.sqrt( (v / (1 - nbeta2 ** t)) + epsilon) for u, v in zip(m_b, v_b)]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
layer_sizes,128
_runtime,6
_timestamp,1615401538
_step,8
num_layers,5
activation,sigmoid
optimizer,nadam
weight_decay,0
batch_size,16
learning_rate,0.001


0,1
layer_sizes,▁
_runtime,▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁
_step,▁▂▃▄▅▅▆▇█
num_layers,▁
weight_decay,▁
batch_size,▁
learning_rate,▁
accuracy,▁


wandb: Sweep Agent: Waiting for job.
wandb: Sweep Agent: Exiting.
