In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
X = np.vstack([(np.random.rand(10, 2)*5), (np.random.rand(10, 2)*10)])
Y = np.hstack([[0]*10, [1]*10])
df = pd.DataFrame(X, columns={"X1", "X2"})
df["Y"] = Y

In [12]:
df

Unnamed: 0,X1,X2,Y
0,2.113504,2.184921,0
1,1.797302,1.056425,0
2,4.8007,0.081964,0
3,3.631863,3.607469,0
4,3.732544,4.013074,0
5,4.952091,0.521308,0
6,3.565606,3.518831,0
7,4.342235,2.501519,0
8,0.587853,0.532752,0
9,0.880891,3.927227,0


In [13]:
Z = np.zeros((20, 2))
for i in range(20):
    Z[i, Y[i]] = 1

In [14]:
X.shape

(20, 2)

In [15]:
Wi_1 = np.random.randn(3, 2)
Bi_1 = np.random.randn(3)
Wi_2 = np.random.randn(3, 2)
Bi_2 = np.random.randn(2)

In [18]:
print(Wi_1)
print("---")
print(Wi_2)
print("---")
print(Bi_1)
print("---")
print(Bi_2)
print("---")

[[-0.85315766 -1.21346041]
 [ 2.32628583 -0.73595807]
 [-0.60724462 -0.21752671]]
---
[[-0.77809779 -0.23264868]
 [ 2.23246867  0.61945548]
 [-0.20748072 -1.48405893]]
---
[ 0.59600879 -0.68271908  0.25547627]
---
[0.79136625 0.5297001 ]
---


In [19]:
print(Wi_1.T) #transpose of matrix

[[-0.85315766  2.32628583 -0.60724462]
 [-1.21346041 -0.73595807 -0.21752671]]


## Forward Propagation

In [21]:
def forward_prop(X, Wi_1, Bi_1, Wi_2, Bi_2):
    # first layer
    M = 1 / (1 + np.exp(-(X.dot(Wi_1.T) + Bi_1)))
    # second layer
    A = M.dot(Wi_2) + Bi_2
    expA = np.exp(A)
    Y = expA / expA.sum(axis=1, keepdims=True) # softmax, generates forward propagation
    return Y, M

In [22]:
forward_prop(X, Wi_1, Bi_1, Wi_2, Bi_2)

(array([[0.87940554, 0.12059446],
        [0.88581393, 0.11418607],
        [0.87459484, 0.12540516],
        [0.87460126, 0.12539874],
        [0.87343216, 0.12656784],
        [0.87389734, 0.12610266],
        [0.87498152, 0.12501848],
        [0.87409651, 0.12590349],
        [0.82567084, 0.17432916],
        [0.70205616, 0.29794384],
        [0.8616316 , 0.1383684 ],
        [0.87094923, 0.12905077],
        [0.86795084, 0.13204916],
        [0.61297248, 0.38702752],
        [0.82772657, 0.17227343],
        [0.86739006, 0.13260994],
        [0.86896014, 0.13103986],
        [0.86732714, 0.13267286],
        [0.78134161, 0.21865839],
        [0.86765981, 0.13234019]]),
 array([[2.06644576e-02, 9.32509095e-01, 1.81946261e-01],
        [9.80303691e-02, 9.38244975e-01, 2.56216607e-01],
        [2.66175633e-02, 9.99970313e-01, 6.43109914e-02],
        [1.02702330e-03, 9.94006603e-01, 6.09580926e-02],
        [5.76391542e-04, 9.93611146e-01, 5.29480878e-02],
        [1.39051530e-02, 9.9

# Backpropagation with Gradient Descent

In [24]:
# actual minus predicted value
def diffWi_2(H, Z, Y):
    return H.T.dot(Z - Y)

In [25]:
def diffWi_1(X, H, Z, output, Wi_2):
    dZ = (Z - output).dot(Wi_2.T) * H * (1-H)
    return X.T.dot(dZ)

In [26]:
def diffB2(Z, Y):
    return (Z-Y).sum(axis=0)

def diffB1(Z, Y, Wi_2, H):
    return ((Z-Y).dot(Wi_2.T) * H * (1-H)).sum(axis=0)

In [27]:
learning_rate = 1e-3
for epoch in range(10000):
    output, hidden = forward_prop(X, Wi_1, Bi_1, Wi_2, Bi_2)
    Wi_2 += learning_rate * diffWi_2(hidden, Z, output)
    Bi_2 += learning_rate * diffB2(Z, output)
    Wi_1 += learning_rate * diffWi_1(X, hidden, Z, output, Wi_2).T
    Bi_1 += learning_rate * diffB1(Z, output, Wi_2, hidden)

# Generating Predictions

In [32]:
X_test = np.array([1,3])

hidden_output = 1 / (1 + np.exp(-X_test.dot(Wi_1.T)-Bi_1))
outer_layer_output = hidden_output.dot(Wi_2) + Bi_2
expA = np.exp(outer_layer_output)
Y = expA / expA.sum()
print(f"prob of 0: {Y[0]}")
print(f"prob of 1: {Y[1]}")

prob of 0: 0.6125144665206897
prob of 1: 0.3874855334793103
