In [221]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder

In [222]:
df = pd.read_csv('./data/iris.csv')
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [223]:
len(pd.unique(df['Name']))

3

In [224]:
data = np.array(df)
np.random.shuffle(data)
data.shape

(150, 5)

In [225]:
X = data[:, 0:4]
y = data[:, -1]


In [226]:
y

array(['Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-s

In [227]:
X.shape


(150, 4)

In [228]:
y.shape

(150,)

In [229]:
X_train = X[0: 120, :]
X_train.shape

(120, 4)

In [230]:
X_test = X[120:, :]
X_test.shape

(30, 4)

In [231]:
y_train = y[0: 120,]
y_train.shape

(120,)

In [232]:
y_test = y[120:, ]
y_test.shape

(30,)

In [233]:
encoder = OneHotEncoder(sparse_output=False)
y_train_oh = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_oh = encoder.transform(y_test.reshape(-1, 1))

In [234]:
y_test_oh.shape

(30, 3)

In [235]:
y_train_oh.shape

(120, 3)

In [367]:
print(np.unique(y_train, return_counts=True))
print(encoder.categories_)
print(y_train_oh[:5])

(array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object), array([37, 41, 42]))
[array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)]
[[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [280]:
def init_weights():
    # Xavier/He init 
    input, out = 4, 4
    limit = np.sqrt(6 / (input + out))
    W1 = np.random.uniform(-limit, limit, size=(4, 4))
    b1 = np.expand_dims(np.random.randn(4,), axis=1)
    
    input, out = 4, 3
    limit = np.sqrt(6 / (input + out))
    W2 = np.random.uniform(-limit, limit, size=(3, 4)) 
    b2 = np.expand_dims(np.random.randn(3,), axis=1)
    return W1, b1, W2, b2


In [289]:
W1, b1, W2, b2 = init_weights()
W1

array([[-0.77651004,  0.05430781,  0.0703821 ,  0.23803557],
       [ 0.39160168,  0.82419998,  0.02823303, -0.30664838],
       [ 0.51127749, -0.39693018, -0.1057046 , -0.73013497],
       [-0.82211663,  0.80133056,  0.58193464,  0.33943728]])

In [290]:
b1.shape

(4, 1)

In [291]:
def relu(X):
    return np.maximum(0, X)

In [292]:
def softmax(X):
    X = np.array(X, dtype=float)
    exp_X = np.exp(X - np.max(X, axis=0, keepdims=True))
    return exp_X / np.sum(exp_X, axis=0, keepdims=True)

In [293]:
def deriv_relu(X):
    '''
    If X > 1 => d relu/d x = 1
    else 0
    '''
    #will be converted from boolean to number
    return X > 0

In [294]:
X = [1, 2, 4]
softmax(X)

array([0.04201007, 0.1141952 , 0.84379473])

In [295]:
x = [1, -2, 3, 4]
print(relu(x))

[1 0 3 4]


In [296]:
def foward_pass(W1, b1, W2, b2, X_train):
    Z1 = W1.dot(X_train.T) + b1
    A1 = relu(Z1)

    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return Z1, A1, Z2, A2

In [297]:
'''
m = batch size 

! MUST DIVIDE BY BATCH SIZE DUE TO BATCH LEARNING !
'''

def backward(Z1, A1, Z2, A2, W2, X, Y):
    m = X.shape[0]
    dZ2 = A2 - Y.T
    dW2 = dZ2.dot(A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    dZ1 = deriv_relu(Z1) * W2.T.dot(dZ2)
    dW1 = dZ1.dot(X) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    return dW1, db1, dW2, db2

In [298]:
def gradient_descent(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2

    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1

    return W1, b1, W2, b2

In [299]:
'''
Used for visualization
'''
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    predictions = np.expand_dims(predictions, axis=1)
    return np.sum(predictions == Y) / Y.size


In [300]:
'''
To be called with X_train, y_train_oh
'''


def fit(X, Y, epochs, alpha):
    W1, b1, W2, b2 = init_weights()

    for i in range(epochs):
        Z1, A1, Z2, A2 = foward_pass(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward(Z1, A1, Z2, A2, W2, X, Y)
        W1, b1, W2, b2 = gradient_descent(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        if i % 10 == 0:
            print('Iteration: ', i)
            print('Accuracy: ', get_accuracy(get_predictions(A2), Y))

    return W1, b1, W2, b2

In [301]:
X_train

array([[-1.20898997,  0.10496967, -1.35834557, -1.49513612],
       [ 0.52699563, -0.3530798 ,  1.00650234,  0.72471664],
       [-0.34099717, -0.12405506,  0.37203095,  0.33297791],
       [ 0.27899768, -0.12405506,  0.42971017,  0.20239834],
       [ 2.26298122,  1.7081428 ,  1.64097373,  1.24703493],
       [ 0.6509946 , -0.81112926,  0.83346469,  0.85529621],
       [-0.2169982 ,  3.0822912 , -1.35834557, -1.1033974 ],
       [ 1.02299151,  0.10496967,  0.48738939,  0.33297791],
       [ 0.89899254, -0.12405506,  0.31435173,  0.20239834],
       [ 1.02299151,  0.56301914,  1.06418156,  1.63877366],
       [-0.71299408,  1.47911807, -1.35834557, -1.36455655],
       [-0.58899511,  0.79204387, -1.24298713, -1.36455655],
       [ 1.27098945,  0.10496967,  0.71810625,  1.37761451],
       [ 0.27899768, -0.58210453,  0.48738939, -0.05876081],
       [ 0.27899768, -0.3530798 ,  0.48738939,  0.20239834],
       [-0.96099202,  1.0210686 , -1.41602478, -1.23397698],
       [-1.20898997,  0.

In [302]:
X_test

array([[ 0.15499871, -1.95625293,  0.08363486, -0.31991996],
       [ 1.64298636,  0.3339944 ,  1.23721921,  0.72471664],
       [-1.08499099,  0.79204387, -1.35834557, -1.36455655],
       [ 1.76698533, -0.3530798 ,  1.41025686,  0.72471664],
       [-0.96099202,  1.0210686 , -1.41602478, -1.36455655],
       [ 0.52699563, -1.7272282 ,  0.31435173,  0.07181877],
       [-1.82898482, -0.3530798 , -1.41602478, -1.36455655],
       [ 0.03099974,  0.3339944 ,  0.5450686 ,  0.72471664],
       [-0.34099717, -0.12405506,  0.14131408,  0.07181877],
       [-0.34099717, -0.3530798 , -0.147082  ,  0.07181877],
       [-0.96099202,  1.7081428 , -1.1276287 , -1.1033974 ],
       [ 1.27098945,  0.10496967,  0.89114391,  1.11645536],
       [-1.08499099,  1.0210686 , -1.473704  , -1.23397698],
       [ 0.52699563, -0.81112926,  0.60274782,  0.72471664],
       [-0.96099202,  0.56301914, -1.24298713, -0.97281783],
       [ 1.27098945,  0.10496967,  0.60274782,  0.33297791],
       [ 0.77499357,  0.

In [303]:
'''
STANDARD SCALING

X_test must be scaled with the mean and std of X_train to prevent "data leaks"
'''
X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

mean = X_train.mean(axis=0)
std = X_train.std(axis=0)

X_train = (X_train - mean) / std

X_test = (X_test - mean) / std


In [304]:
print(np.isnan(X_train).any(), np.isnan(X_test).any())
print(np.isinf(X_train).any(), np.isinf(X_test).any())

False False
False False


In [363]:
W1, b1, W2, b2 = fit(X_train, y_train_oh, 500, 0.001)

Iteration:  0
Accuracy:  0.6666666666666666
Iteration:  10
Accuracy:  0.6666666666666666
Iteration:  20
Accuracy:  0.6666666666666666
Iteration:  30
Accuracy:  0.6666666666666666
Iteration:  40
Accuracy:  0.6666666666666666
Iteration:  50
Accuracy:  0.6666666666666666
Iteration:  60
Accuracy:  0.6666666666666666
Iteration:  70
Accuracy:  0.6666666666666666
Iteration:  80
Accuracy:  0.6666666666666666
Iteration:  90
Accuracy:  0.6666666666666666
Iteration:  100
Accuracy:  0.6666666666666666
Iteration:  110
Accuracy:  0.6666666666666666
Iteration:  120
Accuracy:  0.6666666666666666
Iteration:  130
Accuracy:  0.6666666666666666
Iteration:  140
Accuracy:  0.6666666666666666
Iteration:  150
Accuracy:  0.6666666666666666
Iteration:  160
Accuracy:  0.6666666666666666
Iteration:  170
Accuracy:  0.6666666666666666
Iteration:  180
Accuracy:  0.6666666666666666
Iteration:  190
Accuracy:  0.6666666666666666
Iteration:  200
Accuracy:  0.6666666666666666
Iteration:  210
Accuracy:  0.6666666666666666

In [254]:
# print(f"Layer 1 weights \n"
#       f"{W1}\n"
#       f""
#       f""
#       f"Layer 1 biases \n"
#       f"{b1} \n"
#       f""
#       f""
#       f"Layer 2 weights \n"
#       f"{W2} \n"
#       f""
#       f""
#       f"Layer 2 biases \n"
#       f"{b2} \n"
#       )

In [364]:
'''
To be called with X_test, y_test_oh and learned weights + biases
'''

def predict(W1, b1, W2, b2, X):
    Z1 = W1.dot(X.T) + b1
    A1 = relu(Z1)

    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return A2

In [366]:
y_pred = get_predictions(predict(W1, b1, W2, b2, X_test))
y_true = np.argmax(y_test_oh, axis=1)
test_acc = get_accuracy(y_pred, y_true)
test_acc

np.float64(13.0)

In [185]:
print(np.unique(y_pred, return_counts=True))

(array([0, 1]), array([28,  2]))
