# Pure Python Implementation

In [2]:
import math as m 

In [None]:
# Initial setup: one fixed sample, starting parameters and class labels.

# Sample fed through the network while testing.
input = [1, -1]

# First layer: w1[i][j] multiplies input i into unit j (2 inputs, 3 units).
w1 = [[1.0] * 3, [-1.0] * 3]
b1 = [0.0] * 3

# Second layer: w2[i][j] multiplies hidden unit i into output j (3 units,
# 2 outputs). Each row is built as its own list so in-place updates to one
# row never leak into another.
w2 = [[1.0] * 2] + [[-1.0] * 2 for _ in range(2)]
b2 = [0.0] * 2

# Class label associated with each output position.
target_class = [1, 0]

In [None]:
# support functions 

def sigmoid(values: list):
    """Apply the logistic function 1 / (1 + e^-x) to each element.

    Args:
        values: Sequence of real numbers.

    Returns:
        A new list with one float in [0, 1] per input element.
    """
    activations = []
    for x in values:
        if x >= 0:
            # exp(-x) <= 1 here, so this form cannot overflow.
            activations.append(1 / (1 + m.exp(-x)))
        else:
            # For negative x, exp(-x) can exceed the float range and raise
            # OverflowError. The algebraically equal form e^x / (1 + e^x)
            # only ever exponentiates a negative number.
            exp_x = m.exp(x)
            activations.append(exp_x / (1 + exp_x))
    return activations

def softmax(values: list):
    """Normalize *values* into a probability distribution via softmax.

    Args:
        values: Sequence of real-valued scores.

    Returns:
        A new list of floats that sum to 1 (up to rounding), or an empty
        list when *values* is empty.
    """
    if not values:
        return []

    # Subtracting the largest score leaves the result mathematically
    # unchanged but keeps every exponent <= 0, so exp() cannot overflow.
    peak = max(values)
    exps = [m.exp(x - peak) for x in values]
    exp_sum = sum(exps)

    return [e / exp_sum for e in exps]

def weighted_sum(input: list, weights: list, bias=None):
    """Compute the linear combination ``input . weights (+ bias)``.

    Args:
        input: Input values; element ``i`` is paired with ``weights[i]``.
        weights: Row-per-input weight matrix; ``weights[i][j]`` scales
            input ``i`` into output ``j``. Must contain at least one row.
        bias: Optional per-output offsets. When omitted the result is the
            plain weighted sum, as in the original two-argument form.

    Returns:
        A new list with ``len(weights[0])`` entries.

    Raises:
        ValueError: If *bias* is given and its length differs from the
            number of outputs.
    """
    width = len(weights[0])

    if bias is None:
        output = [0] * width
    else:
        if len(bias) != width:
            raise ValueError(
                f"bias has {len(bias)} entries but weights produce {width} outputs"
            )
        # Copy so the caller's bias list is never mutated.
        output = list(bias)

    # Accumulate each input's contribution to every output.
    for idx1, val in enumerate(input):
        row = weights[idx1]
        for idx2 in range(width):
            output[idx2] += val * row[idx2]

    return output

def predict(smax_output):
    """Return the class label whose output score is the largest.

    The position of the first maximal entry of *smax_output* is looked up
    in the module-level ``target_class`` list.
    """
    top_score = max(smax_output)
    winner = smax_output.index(top_score)
    return target_class[winner]

def forwardpass(input, w1, b1, w2, b2, target):
    """Run one sample through the two-layer network.

    Args:
        input: Input values for the first layer.
        w1: First-layer weights, one row per input.
        b1: First-layer biases, one per hidden unit.
        w2: Second-layer weights, one row per hidden unit.
        b2: Second-layer biases, one per output.
        target: Not used by the forward pass; kept in the signature so
            existing callers keep working.

    Returns:
        Tuple ``(k1, h1, k2, h2, y)``: first linear combination, its
        sigmoid activation, second linear combination, its softmax
        output, and the predicted class label.
    """
    # weighted_sum is called with (input, weights) only and the bias is
    # added here; passing the bias as a third positional argument raised
    # TypeError against its two-parameter signature.
    k1 = [s + b for s, b in zip(weighted_sum(input, w1), b1)]  # first linear combination
    h1 = sigmoid(k1)  # sigmoid activation function
    k2 = [s + b for s, b in zip(weighted_sum(h1, w2), b2)]  # second linear combination
    h2 = softmax(k2)  # softmax activation function
    y = predict(h2)

    return k1, h1, k2, h2, y

def elemwise_mult(values1: list, values2: list):
    """Multiply two sequences position by position.

    Pairs are formed with ``zip``, so the result is as long as the shorter
    of the two inputs.
    """
    products = []
    for left, right in zip(values1, values2):
        products.append(left * right)
    return products

def deriv_loss(values: list, target_idx):
    """Derivative of ``-log(values[target_idx])`` with respect to each value.

    Args:
        values: Predicted probabilities.
        target_idx: Position of the target class within *values*.

    Returns:
        A list with ``-1 / values[target_idx]`` at *target_idx* and ``0``
        everywhere else.
    """
    # Compare positions, not values: looking an element up with
    # values.index() returns the first match, which picks the wrong slot
    # whenever two probabilities are equal (e.g. [0.5, 0.5]).
    return [-1 / p if idx == target_idx else 0 for idx, p in enumerate(values)]

def deriv_sigmoid(values: list):
    """Return the sigmoid derivative ``s * (1 - s)`` for each element.

    *values* are pre-activation inputs; ``sigmoid`` is applied to them
    first and the derivative is formed from its outputs.
    """
    gradients = []
    for activation in sigmoid(values):
        gradients.append(activation * (1 - activation))
    return gradients

def deriv_softmax(values: list, target_idx):
    """Derivative terms of the target softmax output.

    For the entry at *target_idx* the term is ``v * (1 - v)``; for every
    other entry it is ``-values[target_idx] * v``.

    Returns:
        A new list with one term per element of *values*.
    """
    return [
        v * (1 - v) if pos == target_idx else -1 * values[target_idx] * v
        for pos, v in enumerate(values)
    ]

def backprop(input, w1, b1, w2, b2, target):
    """Compute the loss gradients of the two-layer network for one sample.

    The network is ``softmax(sigmoid(input . w1 + b1) . w2 + b2)`` and the
    loss is the negative log of the probability assigned to *target*.

    Args:
        input: Input values for the first layer.
        w1: First-layer weights, one row per input.
        b1: First-layer biases, one per hidden unit.
        w2: Second-layer weights, one row per hidden unit.
        b2: Second-layer biases, one per output.
        target: Class label; its position in the module-level
            ``target_class`` list selects the target output.

    Returns:
        Tuple ``(dL_dw1, dL_db1, dL_dw2, dL_db2)``. Each gradient has the
        same shape as the parameter it belongs to, so it can be indexed
        like ``w1`` / ``b1`` / ``w2`` / ``b2``.

    Raises:
        ValueError: If *target* is not in ``target_class``.
    """
    # Feedforward pass. weighted_sum is used in its (input, weights) form
    # and the bias is added here.
    o1 = [s + b for s, b in zip(weighted_sum(input, w1), b1)]  # first linear combination
    h = sigmoid(o1)  # sigmoid activation function
    o2 = [s + b for s, b in zip(weighted_sum(h, w2), b2)]  # second linear combination
    y = softmax(o2)  # softmax activation function

    target_idx = target_class.index(target)

    # Backward pass.
    # Softmax followed by negative log-likelihood collapses to
    # dL/do2_j = y_j - [j == target]. Using the closed form avoids the
    # 1 / y_target factor and sums over the full softmax Jacobian, which an
    # element-wise product of the two partial derivatives would not.
    dL_do2 = [p - 1 if j == target_idx else p for j, p in enumerate(y)]

    # o2_j = sum_i h_i * w2[i][j] + b2_j
    dL_dw2 = [[h_i * d for d in dL_do2] for h_i in h]
    dL_db2 = list(dL_do2)

    # Propagate into the hidden activations, then through the sigmoid,
    # whose derivative in terms of its own output is h * (1 - h).
    dL_dh = [sum(w * d for w, d in zip(w2_row, dL_do2)) for w2_row in w2]
    dL_do1 = [g * a * (1 - a) for g, a in zip(dL_dh, h)]

    # o1_j = sum_i input_i * w1[i][j] + b1_j
    dL_dw1 = [[x * d for d in dL_do1] for x in input]
    dL_db1 = list(dL_do1)

    return dL_dw1, dL_db1, dL_dw2, dL_db2

def gradient_desent(w1, b1, w2, b2, dL_w1, dL_b1, dL_w2, dL_b2, alpha):
    """Apply one gradient-descent step to all weights and biases.

    Every parameter ``p`` is replaced by ``p - alpha * dL/dp``. The lists
    are updated in place and also returned.

    Args:
        w1, b1: First-layer weights (list of rows) and biases.
        w2, b2: Second-layer weights (list of rows) and biases.
        dL_w1, dL_b1, dL_w2, dL_b2: Gradients, indexed like the
            corresponding parameter.
        alpha: Learning rate.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` of the updated parameters.
    """
    # update weights in first layer
    for i, row in enumerate(w1):
        for idx, w in enumerate(row):
            row[idx] = w - alpha * dL_w1[i][idx]

    # update biases in first layer (previously left untouched even though
    # their gradient was passed in)
    for idx, b in enumerate(b1):
        b1[idx] = b - alpha * dL_b1[idx]

    # update weights in second layer
    for i, row in enumerate(w2):
        for idx, w in enumerate(row):
            row[idx] = w - alpha * dL_w2[i][idx]

    # update biases in second layer
    for idx, b in enumerate(b2):
        b2[idx] = b - alpha * dL_b2[idx]

    return w1, b1, w2, b2
 

def loss(prediction, target):
    """Negative log of the probability predicted for *target*.

    The position of *target* in the module-level ``target_class`` list
    selects which entry of *prediction* is used.
    """
    target_idx = target_class.index(target)
    target_prob = prediction[target_idx]
    return -m.log(target_prob)