In [3]:
import numpy as np
import pandas as pd
from keras.datasets import fashion_mnist
import matplotlib.pyplot as plt

In [None]:
# The loader returns a (train, test) pair; each half unpacks into (images, labels).
(X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()

In [None]:
# Show the first training image found for every label on a 5 x 2 grid.
# The panel for label y sits at row y % 5, column y // 5.
done = set()
fig, ax = plt.subplots(5, 2, figsize = (25, 25))
for x, y in zip(X_train, Y_train):
    if y not in done:
        done.add(y)
        panel = ax[y % 5, y // 5]
        panel.imshow(x)
        panel.set_title(f"label {y}")
        # Every panel is filled: stop instead of scanning the rest of the training set.
        if len(done) == ax.size:
            break
# Lay the figure out after drawing so the titles and tick labels are taken into account.
fig.tight_layout()

In [11]:
class FNNClassifier:
    """Fully connected feed-forward classifier with a softmax output, written in NumPy.

    The network has ``L`` weight layers: ``L - 1`` hidden layers of ``N`` units
    followed by a softmax layer with one unit per class. Parameters are stored in
    1-based lists: ``W[i]`` has shape ``(units in layer i-1, units in layer i)`` and
    ``b[i]`` has shape ``(units in layer i,)``; slot 0 of both lists is an unused
    placeholder. Gradients are summed (not averaged) over the samples of a batch.

    Parameters
    ----------
    N : int
        Number of units in every hidden layer.
    L : int
        Number of weight layers (hidden layers plus the output layer).
    activation : str
        'sigmoid' (alias 'logistic'), 'tanh', 'ReLU' or 'identity'; case-insensitive.
    optimizer : str
        'normal' (full-batch gradient descent), 'sgd' (mini-batches), 'momentum',
        'nesterov', 'rmsprop', 'adam' or 'nadam'; case-insensitive.
    weight_decay : float
        L2 regularisation strength; ``weight_decay * W`` is added to the weight gradient.
    batch_size : int
        Mini-batch size used by the 'sgd' optimizer.
    learning_rate : float
        Step size.
    num_epochs : int
        Number of passes over the training data.
    weight_init : str
        'random' or 'xavier'; case-insensitive.
    """

    # Optimizer hyperparameters.
    MOMENTUM_GAMMA = 0.9   # momentum / Nesterov velocity decay
    RMSPROP_BETA = 0.9     # RMSprop second-moment decay
    ADAM_BETA1 = 0.99      # Adam / Nadam first-moment decay
    ADAM_BETA2 = 0.999     # Adam / Nadam second-moment decay
    EPSILON = 0.1          # added to the second-moment estimate inside the square root

    OPTIMIZERS = ('normal', 'sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam')

    def __init__(self, N, L, activation='ReLU', optimizer='adam', weight_decay=0.0001, batch_size=200, learning_rate=0.001, num_epochs=200, weight_init='random'):
        self.activation = activation # 'sigmoid' / 'logistic', 'tanh', 'ReLU', 'identity'
        self.optimizer = optimizer # 'normal', 'sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam'
        self.weight_decay = weight_decay # L2 regularization hyperparameter
        self.batch_size = batch_size # Batch size
        self.learning_rate = learning_rate # Learning Rate
        self.num_epochs = num_epochs # Number of epochs
        self.weight_init = weight_init # 'random', 'xavier'
        self.n = 100 # number of training samples, overwritten by fit
        self.K = 10 # number of classes, overwritten by fit
        self.label_offset = 1 # smallest label value (1 or 0), overwritten by fit
        self.L = L
        self.N = N
        # Bookkeeping array only; the network code never reads it.
        self.layer_sizes = np.zeros((L + 2))
        self.layer_sizes[1 : L + 1] = N
        self.W = []
        self.b = []
        self.a = []
        self.h = []

    def _activation_name(self):
        """Return the configured activation as a lower-case canonical name."""
        name = str(self.activation).lower()
        if name == 'logistic':
            name = 'sigmoid'
        if name not in ('sigmoid', 'tanh', 'relu', 'identity'):
            raise ValueError("unknown activation %r; expected 'sigmoid', 'tanh', 'ReLU' or 'identity'" % (self.activation,))
        return name

    def act(self, z):
        """Apply the hidden-layer activation element-wise to ``z``."""
        name = self._activation_name()
        if name == 'sigmoid':
            # exp(-log(1 + exp(-z))) equals 1 / (1 + exp(-z)) without overflowing.
            return np.exp(-np.logaddexp(0, -z))
        if name == 'tanh':
            return np.tanh(z)
        if name == 'relu':
            return np.maximum(z, 0.0)
        return z

    def deriv_act(self, z):
        """Return the element-wise derivative of the hidden-layer activation at ``z``."""
        name = self._activation_name()
        if name == 'sigmoid':
            s = np.exp(-np.logaddexp(0, -z))
            return s * (1 - s)
        if name == 'tanh':
            # Same value as cosh(z) ** -2, but cannot overflow.
            return 1 - np.tanh(z) ** 2
        if name == 'relu':
            return (np.asarray(z) > 0).astype(float)
        return np.ones(np.shape(z))

    def oact(self, z):
        """Softmax over axis 0 (the class axis)."""
        # Subtracting the per-column maximum leaves the result unchanged and keeps exp finite.
        e = np.exp(z - np.max(z, axis=0, keepdims=True))
        return e / e.sum(axis=0, keepdims=True)

    def forward_prop(self, X):
        """Run the network on ``X`` (features x samples) and return class probabilities.

        Stores the pre-activations in ``self.a`` and the layer outputs in ``self.h``
        (``self.h[0]`` is the input) for use by ``back_prop``. The result has shape
        (classes x samples).
        """
        self.a = [np.zeros((1, 1))]  # placeholder so that a[i] belongs to layer i
        self.h = [X]

        for i in range(1, self.L + 1):
            # Affine map W[i].T @ h[i-1] + b[i], with the bias broadcast over samples.
            self.a.append((self.b[i] + (self.W[i].T @ self.h[i - 1]).T).T)
            if i < self.L:
                self.h.append(self.act(self.a[i]))
            else:
                self.h.append(self.oact(self.a[i]))
        return self.h[-1]

    def back_prop(self, Y_pred, ey):
        """Back-propagate the softmax cross-entropy loss.

        ``Y_pred`` are the probabilities from the latest ``forward_prop`` call and
        ``ey`` the one-hot targets (both classes x samples). Fills ``self.gradW`` and
        ``self.gradB`` ordered from the output layer down to layer 1, followed by a
        zero entry for the unused slot 0, so that reversing them aligns with
        ``self.W`` and ``self.b``.
        """
        self.gradW, self.gradB = [], []
        self.gradh = [np.zeros((1, 1))]
        # Gradient of the loss w.r.t. the output pre-activation.
        self.grada = [Y_pred - ey]
        for i in range(self.L, 0, -1):
            delta = self.grada[-1]
            self.gradW.append(self.h[i - 1] @ delta.T)
            self.gradB.append(delta.sum(axis=1))
            if i > 1:
                # The gradient w.r.t. the network input is never used, so stop at layer 1.
                self.gradh.append(self.W[i] @ delta)
                self.grada.append(self.gradh[-1] * self.deriv_act(self.a[i - 1]))

        self.gradW.append(np.zeros_like(self.W[0]))
        self.gradB.append(np.zeros_like(self.b[0]))

        return

    def wb_init(self, num):
        """Create weights and biases for a network with ``num`` input features.

        'random' draws every weight and bias uniformly from [1, 11).
        'xavier' draws weights uniformly from +/- sqrt(6 / (fan_in + fan_out)) and
        sets the biases to zero.
        """
        scheme = str(self.weight_init).lower()
        if scheme not in ('random', 'xavier'):
            raise ValueError("unknown weight_init %r; expected 'random' or 'xavier'" % (self.weight_init,))

        sizes = [num] + [self.N] * (self.L - 1) + [self.K]
        # Slot 0 is a placeholder so that W[i] / b[i] belong to layer i.
        self.W = [np.zeros((self.N, self.N))]
        self.b = [np.zeros(self.N)]
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            if scheme == 'random':
                self.W.append(1 + 10 * np.random.rand(fan_in, fan_out))
                self.b.append(1 + 10 * np.random.rand(fan_out))
            else:
                limit = np.sqrt(6.0 / (fan_in + fan_out))
                self.W.append(limit * (2 * np.random.rand(fan_in, fan_out) - 1))
                self.b.append(np.zeros(fan_out))

    def _one_hot(self, labels):
        """Return the (classes x samples) one-hot encoding of ``labels``."""
        idx = np.asarray(labels).ravel().astype(int) - self.label_offset
        if idx.size and (idx.min() < 0 or idx.max() >= self.K):
            raise ValueError("labels must lie in [%d, %d]" % (self.label_offset, self.label_offset + self.K - 1))
        ey = np.zeros((self.K, idx.size))
        ey[idx, np.arange(idx.size)] = 1
        return ey

    def _gradients(self, X, ey):
        """Return (dW, db) at the current parameters, aligned with ``self.W`` / ``self.b``."""
        self.back_prop(self.forward_prop(X), ey)
        dW = self.gradW[::-1]
        db = self.gradB[::-1]
        # L2 regularisation: the penalty gradient is weight_decay * W (biases are not penalised).
        dW = [g + self.weight_decay * w for g, w in zip(dW, self.W)]
        return dW, db

    def grad_desc(self, X, Y):
        """Initialise the parameters and train them on ``X`` (features x samples) and labels ``Y``."""
        opt = str(self.optimizer).lower()
        if opt not in self.OPTIMIZERS:
            raise ValueError("unknown optimizer %r; expected one of %s" % (self.optimizer, ', '.join(self.OPTIMIZERS)))

        self.wb_init(X.shape[0])
        n = X.shape[1]
        ey = self._one_hot(Y)
        lr = self.learning_rate
        gamma = self.MOMENTUM_GAMMA
        beta1, beta2, eps = self.ADAM_BETA1, self.ADAM_BETA2, self.EPSILON

        # Velocity and moment estimates all start at zero.
        update_W = [np.zeros_like(w) for w in self.W]
        update_b = [np.zeros_like(v) for v in self.b]
        m_W = [np.zeros_like(w) for w in self.W]
        m_b = [np.zeros_like(v) for v in self.b]
        v_W = [np.zeros_like(w) for w in self.W]
        v_b = [np.zeros_like(v) for v in self.b]

        for t in range(1, self.num_epochs + 1):

            if opt == 'sgd':
                for _ in range((n + self.batch_size - 1) // self.batch_size):
                    # Mini-batch sampled with replacement.
                    idx = np.random.randint(n, size=self.batch_size)
                    dW, db = self._gradients(X[:, idx], ey[:, idx])
                    self.W = [w - lr * g for w, g in zip(self.W, dW)]
                    self.b = [v - lr * g for v, g in zip(self.b, db)]
                continue

            if opt == 'nesterov':
                # Evaluate the gradient at the look-ahead point, then restore the parameters.
                W_saved, b_saved = self.W, self.b
                self.W = [w - gamma * u for w, u in zip(W_saved, update_W)]
                self.b = [v - gamma * u for v, u in zip(b_saved, update_b)]
                dW, db = self._gradients(X, ey)
                self.W, self.b = W_saved, b_saved
            else:
                dW, db = self._gradients(X, ey)

            if opt == 'normal':
                update_W = [lr * g for g in dW]
                update_b = [lr * g for g in db]

            elif opt in ('momentum', 'nesterov'):
                update_W = [gamma * u + lr * g for u, g in zip(update_W, dW)]
                update_b = [gamma * u + lr * g for u, g in zip(update_b, db)]

            elif opt == 'rmsprop':
                rbeta = self.RMSPROP_BETA
                v_W = [rbeta * v + (1 - rbeta) * g ** 2 for v, g in zip(v_W, dW)]
                v_b = [rbeta * v + (1 - rbeta) * g ** 2 for v, g in zip(v_b, db)]
                update_W = [lr * g / np.sqrt(v + eps) for g, v in zip(dW, v_W)]
                update_b = [lr * g / np.sqrt(v + eps) for g, v in zip(db, v_b)]

            else:  # 'adam' or 'nadam'
                m_W = [beta1 * m + (1 - beta1) * g for m, g in zip(m_W, dW)]
                m_b = [beta1 * m + (1 - beta1) * g for m, g in zip(m_b, db)]
                v_W = [beta2 * v + (1 - beta2) * g ** 2 for v, g in zip(v_W, dW)]
                v_b = [beta2 * v + (1 - beta2) * g ** 2 for v, g in zip(v_b, db)]
                # Bias corrections for the zero-initialised moment estimates.
                c1 = 1 - beta1 ** t
                c2 = 1 - beta2 ** t
                if opt == 'adam':
                    num_W, num_b = m_W, m_b
                else:
                    # Nadam: blend the first moment with the current gradient.
                    num_W = [beta1 * m + (1 - beta1) * g for m, g in zip(m_W, dW)]
                    num_b = [beta1 * m + (1 - beta1) * g for m, g in zip(m_b, db)]
                update_W = [(lr * u / c1) / np.sqrt(v / c2 + eps) for u, v in zip(num_W, v_W)]
                update_b = [(lr * u / c1) / np.sqrt(v / c2 + eps) for u, v in zip(num_b, v_b)]

            self.W = [w - u for w, u in zip(self.W, update_W)]
            self.b = [v - u for v, u in zip(self.b, update_b)]

    def fit(self, X_train, Y_train):
        """Train on ``X_train`` (samples first) with integer labels ``Y_train``.

        Inputs with more than two dimensions (e.g. images) are flattened per sample.
        Labels are read as 1..K; if the smallest label is 0 they are read as 0..K-1.
        """
        X_train = np.asarray(X_train)
        if X_train.ndim > 2:
            X_train = X_train.reshape(X_train.shape[0], -1)
        labels = np.asarray(Y_train).ravel().astype(int)

        self.n = X_train.shape[0]
        self.batch_size = min(self.batch_size, self.n)
        self.label_offset = 0 if labels.min() == 0 else 1
        K = int(labels.max()) + 1 - self.label_offset
        self.layer_sizes[self.L - 1] = K
        self.K = K
        self.grad_desc(X_train.T, labels)

    def predict_proba(self, X_test):
        """Return class probabilities for ``X_test``, one row per sample.

        Column ``k`` corresponds to label ``k + self.label_offset``.
        """
        X_test = np.asarray(X_test)
        if X_test.ndim > 2:
            X_test = X_test.reshape(X_test.shape[0], -1)
        return self.forward_prop(X_test.T).T

In [None]:
from sklearn.datasets import load_iris
# Fetch the iris data as a (features, labels) pair and display both array shapes.
X, y = load_iris(return_X_y=True)
tuple(arr.shape for arr in (X, y))

In [13]:
# Small hand-written data set: 15 samples with 4 features each.
X = np.array(
    [
        [1, 2, 0, 8],
        [6, 3, 10, 5],
        [5, 2, 9, 4],
        [5, 6, 3, 9],
        [4, 9, 5, 5],
        [4, 5, 2, 1],
        [8, 2, 8, 0],
        [3, 3, 7, 6],
        [4, 6, 3, 6],
        [7, 8, 2, 2],
        [5, 5, 10, 2],
        [3, 7, 8, 7],
        [2, 5, 3, 2],
        [0, 5, 2, 3],
        [7, 0, 6, 8],
    ],
    dtype=float,
)
# One integer label per sample.
y = np.array([3, 6, 5, 6, 6, 3, 4, 5, 5, 5, 6, 6, 3, 2, 5])

# Single weight layer, tanh activation, full-batch gradient descent for 10 epochs.
model = FNNClassifier(
    N=1,
    L=1,
    activation='tanh',
    optimizer='normal',
    batch_size=25,
    num_epochs=10,
)
model.fit(X, y)
model.predict_proba(X)

array([[3.59099535e-18, 2.34229116e-10, 6.08696077e-13, 9.99944458e-01,
        3.16884025e-28, 5.55421397e-05],
       [9.62028555e-05, 9.99903637e-01, 1.57574863e-07, 5.69567968e-15,
        4.60078333e-35, 2.35288953e-09],
       [1.42428641e-04, 9.99857458e-01, 1.13358619e-07, 4.58868951e-15,
        2.66564040e-30, 9.89525855e-11],
       [7.43774594e-18, 2.98326877e-13, 2.80494525e-09, 3.19101077e-02,
        1.00802482e-42, 9.68089890e-01],
       [7.44309628e-08, 1.16343406e-20, 8.16261450e-11, 4.03693545e-07,
        1.44496099e-39, 9.99999522e-01],
       [3.51667363e-08, 2.35829469e-14, 9.94969318e-01, 3.96167381e-06,
        1.63423510e-24, 5.02668470e-03],
       [3.47974667e-10, 9.01498317e-06, 9.99990985e-01, 1.53858831e-21,
        1.23132261e-32, 2.78935977e-15],
       [2.96113675e-02, 9.69001058e-01, 3.51131225e-08, 5.62544691e-06,
        1.07591844e-26, 1.38191396e-03],
       [8.91550826e-12, 1.03775590e-13, 4.48998829e-07, 3.73692233e-03,
        1.12959523e-34, 

In [None]:
import wandb

# Random-search sweep that maximises the logged 'accuracy' metric.
# Each hyperparameter is declared as a dict with a 'values' list, which is the
# form the sweep configuration expects for a discrete set of choices.
sweep_config = {
    'method' : 'random',
    'metric' : {
        'name' : 'accuracy',
        'goal' : 'maximize'
    },
    'parameters' : {
        'N' : {'values' : [32, 64, 128]},
        'L' : {'values' : [3, 4, 5]},
        'activation' : {'values' : ['sigmoid', 'tanh', 'ReLU']},
        'optimizer' : {'values' : ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam']},
        'weight_decay' : {'values' : [0, 0.0005, 0.5]},
        'batch_size' : {'values' : [16, 32, 64]},
        'num_epochs' : {'values' : [5, 10]},
        'learning_rate' : {'values' : [0.001, 0.0001]},
        'weight_init' : {'values' : ['random', 'Xavier']}
    }
}

In [None]:
# Register the sweep and keep its id for the agent.
sweep_id = wandb.sweep(
    sweep_config,
    entity='sweep',
    project='cs6910a12021',
)

In [None]:
def train():
    """Run one sweep trial: build a classifier from the run config, train it on the
    training split and log its accuracy on the test split as 'accuracy'."""
    def_params = {
        'N' : 32,
        'L' : 4,
        'activation' : 'ReLU',
        'optimizer' : 'adam',
        'weight_decay' : 0.0005,
        'batch_size' : 32,
        'num_epochs' : 5,
        'learning_rate' : 0.001,
        'weight_init' : 'random'
    }
    wandb.init(config = def_params)
    config = wandb.config
    model = FNNClassifier(**config)

    # The classifier works on one feature vector per sample, so flatten each image.
    train_images = X_train.reshape(X_train.shape[0], -1)
    test_images = X_test.reshape(X_test.shape[0], -1)

    # The classifier documents labels as 1..K while the data set labels start at 0,
    # so shift them by one for training.
    model.fit(train_images, Y_train.astype(int) + 1)

    # Probability column k belongs to shifted label k + 1, i.e. original label k.
    predicted = np.argmax(model.predict_proba(test_images), axis=1)
    wandb.log({'accuracy' : float(np.mean(predicted == Y_test))})
    

In [None]:
# Start an agent that calls train() for each configuration of the sweep.
wandb.agent(sweep_id, function=train)