### Logbook
1. Read the data.
2. Visualise the data.
    1. Found that the features are all between 1 and 10.
    2. Found missing data. Decided to just remove the row.
    3. Thought of either using Neural Network or SVM.
3. Develop a Neural Network.
    - Stage 1: 
        1. Thought about the flow.
        2. First made a one layer NN with linear activation.
        3. Created one input/output pair with 5 features. 
        4. Trained the model and found that the weights were not what I want, but it worked. Then I realised that there was no way to find 5 unknowns with 1 equation.
        5. Added more input/output pairs. 
        6. Trained the model and succeed! Finally created a one layer model with MSE loss and linear activation!! <-rubbish
    - Stage 2:
        1. Added one more layer, i.e. created a (5-3-1) model.
        2. Found that I also need dL/dX but not juse dL/dW. So add that.
        3. Found that I need to make use of matrices, instead of just multiplying values like a 1-output node layer.
        4. Spent time to do the d_/d_, d_/d_, and d_/d_ on paper, and tried to figure out what the size of each matrix is.
        5. Struggled with dot products but finally made it.
        6. Created a two-layer regression model with MSE loss and linear activation!! <-which is equal to one layer NN with linear activation.
        7. Just decided try to add one more layer (3 layers in total) but it didn't work.
        8. Fixed the problem by dealing with the dot products again...
        9. Successfully created a 3-layer regression model with MSE loss and linear activation!! <-function-wise it is still same as one layer
    - Stage 3: 
        1. As we are supposed to make a classification NN but not a regression NN, so it's time to add sigmoid activation.
        2. Added sigmoid activation function.
        3. Worked on the derivative of the sigmoid function, and added d_activation_fucntion.
        4. Refactored the code as I was ignoring this part due to the d(linear)/dx is 1.
        5. Successfully created a 3-layer regression model with MSE loss and sigmoid-sigmoid-linear activations!! <-Due to the random weight initialisation, sometimes it never converges. 
    - Stage 4: 
        1. Classification NN uses cross-entropy instead of MSE. So time to add cross-entropy as a loss function.
        
        


### Import

In [1]:
import numpy as np
import pandas as pd
from numpy import log as ln

### Fetch the data from the Internet and save it as csv
The following cell only needs to run once. 

In [2]:
# from ucimlrepo import fetch_ucirepo 

# # fetch dataset 
# breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# # data (as pandas dataframes) 
# X = breast_cancer_wisconsin_original.data.features 
# y = breast_cancer_wisconsin_original.data.targets 

# data = pd.concat([y, X], axis=1)
# data.to_csv("data.csv", index=False)

### Read the data to X and y, and replace B and M (2&4) with 0 and 1

Note that there are missing values. As we got a large dataset, we will just drop the records.

In [2]:
data = pd.read_csv("data.csv")
data.dropna(inplace=True)
X = data.drop("Class", axis=1)
y = data['Class']
y.replace({2: 0, 4: 1}, inplace=True)
print(X.shape, y.shape)

(683, 9) (683,)


In [250]:
from numpy import random
import matplotlib.pyplot as plt


class Layer:
    num_of_layers = 0
    def __init__(self, input_size, output_size, activation="linear", random_state=None):
        random.seed(random_state)
        self.weights = random.rand(input_size+1, output_size)-0.5   # weight[0, x] is bias's weight
        # self.weights[:,:] = 0.6
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation
        if activation == "linear":
            self.activation_function = lambda x: x
            self.d_activation_function = lambda x: x * 1
        elif activation == "sigmoid":
            self.activation_function = lambda x: 1/(1+np.exp(-x))
            self.d_activation_function = lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        else:
            raise Exception("Wrong activation function")
        Layer.num_of_layers += 1

    def forward_propagation(self, input):   # input should be an array
        input = np.concatenate(([[1], input]))    # add bias = 1
        self.input = input
        output = self.activation_function(input.dot(self.weights))
        self.output = output
        return output
    
    def backward_propagation(self, dl_dy, learning_rate = 0.005):
        # print(f"dl_dy.shape: {dl_dy.shape}")
        # print(f"self.weights.shape: {self.weights.shape}")

        # dActivation/dWeights
        da_dw = self.input.reshape(-1, 1)
        # print(f"da_dw.shape: {da_dw.shape}")

        dy_da = self.d_activation_function(self.input.dot(self.weights))
        # print(f"dy_da.shape: {dy_da.shape}")

        # dLoss/dWeights = dL/dy(previous) * dy/dActivation * dActivation/dWeights
        # print(f"dl_dy: {dl_dy}, dy_da: {dy_da}")
        dl_da = dl_dy * dy_da.reshape(-1, 1)
        # print(f"dl_da.shape: {dl_da.shape}")
        # dl_dw = da_dw.dot(self.d_activation_function(dl_dy.T))
        dl_dw = da_dw.dot(dl_da.T)
        # print(f"dl_dw.shape: {dl_dw.shape}")

        # The previous layer needs this to do the back propagation
        # dLoss/dX = dLoss/dy(previous) * dy/dActivation * dActivation/dX
        # where dActivation/dX is just the weights without the bias' weight
        dl_dx = (self.weights[1:]).dot(self.d_activation_function(dl_dy))
        # print(f"dl_dx.shape: {dl_dx.shape}")

        # update the weight using the gradient
        self.weights -= learning_rate * dl_dw

        # print()
        # return the dL/dX, which will become the previous layer's dL/dy
        return dl_dx
        

class Model:
    def __init__(self, loss_function):
        self.layers = []

        # The reason of including MSE is I first developed a regression model
        # to find the coefficients of a polynomial with linear activation.
        if loss_function not in ['mse', 'cross_entropy']:
            raise Exception("The provided loss function is not available.")
        
        if loss_function == "mse":
            self.d_loss_function = lambda y, pred: -2*(y-pred)
        elif loss_function == "cross_entropy":
            self.d_loss_function = lambda y, pred: -(y-pred)/(pred * (1-pred))

    def add(self, layer):
        self.layers.append(layer)

    def show_layers(self):
        for i, l in enumerate(self.layers):
            print(f"Layer {i+1}: ", end="")
            print(f"input size: {l.input_size}, output size: {l.output_size}")

    def predict(self, input):
        prev_output = input
        # loop through all the layers
        for layer in self.layers:
            # actually this should be output=fp(previous_output) --> previous_output=output
            # but just want to make it a bit shorter
            prev_output = layer.forward_propagation(prev_output)
            # print(prev_output)
        return prev_output

    def train_one_cycle(self, input, expected_output, learning_rate = 1):
        # Just caused me trouble once, so I add this checking
        if len(input) != len(expected_output):
            raise Exception("Input and Expected output have different length")
                
        # Loop through each pair of input/output
        for X, y in zip(input, expected_output):
            # predict the output
            pred = self.predict(X)

            # print(self.layers[0].weights)

            # error = (y - pred)
            # derror_dpred = np.array([-2 * error])

            # # The following is d(squared error)/d(prediction)
            derror_dpred = np.array([self.d_loss_function(y, pred)])

            # dl_dy means dLoss/dOutput, where output is the layer's output
            dl_dy = derror_dpred
            for layer in reversed(self.layers):
                dl_dy = layer.backward_propagation(dl_dy, learning_rate)
            # print(f"X: {X}, y: {y}, pred: {pred}")

    def mean_squared_error(self, input, expected_output):
        if len(input) != len(expected_output):
            raise Exception("Input and Expected output have different length")                
        squared_error = 0
        for X, y in zip(input, expected_output):    
            squared_error += (y - self.predict(X))**2
        mse = squared_error / len(input)
        return mse







#### Classification test

In [267]:
input = np.array([[2, 3], [-2, -3]])
expected_output = [0, 1]

layer1 = Layer(2, 1, "sigmoid")
layer2 = Layer(5, 1, "sigmoid")

model = Model("cross_entropy")
model.add(layer1)
# model.add(layer2)
model.show_layers()

for i in range(500):
    model.train_one_cycle(input, expected_output, 0.01)

print(f"Input: {input[1]}, Expected output: {expected_output[1]}, Predicted: {model.predict(input[1])}")
print(f"Input: {input[0]}, Expected output: {expected_output[0]}, Predicted: {model.predict(input[0])}")
print(f"{model.layers[0].weights}")

Layer 1: input size: 2, output size: 1
Input: [-2 -3], Expected output: 1, Predicted: [0.98991573]
Input: [2 3], Expected output: 0, Predicted: [0.00552911]
[[-0.30277075]
 [-1.06843708]
 [-0.91751318]]


#### Regression test

In [328]:
layer1 = Layer(5, 5, activation="sigmoid")
layer2 = Layer(5, 1, activation="linear")


model = Model(loss_function='mse')
model.add(layer1)
model.add(layer2)
model.show_layers()

input = np.array([[5, 8, 3, 2, 1], [4, 2, 1, 5, 6], [6, 1, 6, 8, 9], [7, 4, 3, 2, 1], [3, 6, 7, 8, 8]])
expected_output = [(sum(i*[3, 4, 6, 2, 1])+5) for i in input]
print(expected_output)

print()
for i in range(50):
    model.train_one_cycle(input, expected_output, 0.001)
    # print(model.mean_squared_error(input, expected_output))

for i in range(len(input)):
    print(f"Input: {input[i]} Expected output: {expected_output[i]} prediction: {model.predict(input[i])}")
    # print(f"Input: {input[i]} Expected output: {expected_output[i]} prediction: {model.predict(input[i])}")

Layer 1: input size: 5, output size: 5
Layer 2: input size: 5, output size: 1
[np.int64(75), np.int64(47), np.int64(88), np.int64(65), np.int64(104)]

Input: [5 8 3 2 1] Expected output: 75 prediction: [87.69141924]
Input: [4 2 1 5 6] Expected output: 47 prediction: [87.69543731]
Input: [6 1 6 8 9] Expected output: 88 prediction: [87.69520404]
Input: [7 4 3 2 1] Expected output: 65 prediction: [87.69517587]
Input: [3 6 7 8 8] Expected output: 104 prediction: [87.69520356]


  self.activation_function = lambda x: 1/(1+np.exp(-x))
