In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class DenseLayer:
    def __init__(self, neurons):
        # Initialize the layer with the given number of neurons
        self.neurons = neurons
        
    def relu(self, inputs):
        # Implement the ReLU activation function on the given inputs
        return np.maximum(0, inputs)

    def softmax(self, inputs):
        # Implement the softmax activation function on the given inputs
        exp_scores = np.exp(inputs)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
    def relu_derivative(self, dA, N):
        # Calculate the derivative of the cost function with respect to the inputs of the layer using the derivative of the ReLU activation function
        # dA is the derivative of the cost function with respect to the output of the layer
        # N is the weighted sum of the inputs to the layer
        # First create a binary vector that is 1 where the input to the ReLU function was positive and 0 where it was negative or zero
        dN = (N <= 0).astype(float)
        # Multiply dN element-wise with dA to obtain the derivative of the cost function with respect to the output of the ReLU function
        return dN * dA
    
    def forward(self, inputs, weights, bias, activation):
        N = (inputs @ weights.T) + bias
        
        if activation == 'relu':
            A = self.relu(N)
        elif activation == 'softmax':
            A = self.softmax(N)
            
        return A, N
    
    def backward(self, dA_curr, W_curr, Z_curr, A_prev, activation):
        if activation == 'softmax':
            dW = A_prev.T @ dA_curr
            db = np.sum(dA_curr, axis=0, keepdims=True)
            dA = dA_curr @ W_curr
        else:
            dZ = self.relu_derivative(dA_curr, Z_curr)
            dW = A_prev.T @ dZ
            db = np.sum(dZ, axis=0, keepdims=True)
            dA = dZ @ W_curr
            
        return dA, dW, db

class Network:
    def __init__(self):
        self.network = []
        self.architecture = []
        self.params = []
        self.memory = []
        self.gradients = []
        
    def add(self, layer):
        self.network.append(layer)
            
    def _compile(self, data):
        for idx, layer in enumerate(self.network):
            if idx == 0:
                self.architecture.append({'input_dim':data.shape[1], 'output_dim':self.network[idx].neurons,
                                         'activation':'relu'})
            elif idx > 0 and idx < len(self.network)-1:
                self.architecture.append({'input_dim':self.network[idx-1].neurons, 'output_dim':self.network[idx].neurons,
                                         'activation':'relu'})
            else:
                self.architecture.append({'input_dim':self.network[idx-1].neurons, 'output_dim':self.network[idx].neurons,
                                         'activation':'softmax'})
        return self
    
    def _init_weights(self, data):
        self._compile(data)
        
        np.random.seed(99)
        
        for layers in self.architecture:
            self.params.append({
                'W':np.random.uniform(low=-1, high=1, size=(layers['output_dim'], layers['input_dim'])),
                'b':np.zeros((1, layers['output_dim']))})
        return self
    
    def _forwardprop(self, data):
        A_curr = data.copy()
        
        for i in range(len(self.params)):
            A_prev = A_curr
            A_curr, Z_curr = self.network[i].forward(inputs=A_prev, weights=self.params[i]['W'], 
                                           bias=self.params[i]['b'], activation=self.architecture[i]['activation'])
            
            self.memory.append({'inputs':A_prev, 'Z':Z_curr})
            
        return A_curr
    
    def _backprop(self, predicted, actual):
        num_samples = len(actual)
        
        dscores = predicted
        dscores[range(num_samples),actual] -= 1
        dscores /= num_samples
        
        dA_prev = dscores
        
        for idx, layer in reversed(list(enumerate(self.network))):
            dA_curr = dA_prev
            
            A_prev = self.memory[idx]['inputs']
            Z_curr = self.memory[idx]['Z']
            W_curr = self.params[idx]['W']
            
            activation = self.architecture[idx]['activation']

            dA_prev, dW_curr, db_curr = layer.backward(dA_curr, W_curr, Z_curr, A_prev, activation)

            self.gradients.append({'dW':dW_curr, 'db':db_curr})
            
    def _update(self, lr=0.01):
        for idx, _ in enumerate(self.network):
            self.params[idx]['W'] -= lr * list(reversed(self.gradients))[idx]['dW'].T  
            self.params[idx]['b'] -= lr * list(reversed(self.gradients))[idx]['db']
    
    def _get_accuracy(self, predicted, actual):
        return np.mean(np.argmax(predicted, axis=1)==actual)
    
    def _calculate_loss(self, predicted, actual):
        samples = len(actual)
        
        correct_logprobs = -np.log(predicted[range(samples),actual])
        data_loss = np.sum(correct_logprobs)/samples

        return data_loss
    
    def train(self, X_train, y_train, epochs):
        self.loss = []
        self.accuracy = []
        
        self._init_weights(X_train)
        
        for i in range(epochs):
            yhat = self._forwardprop(X_train)
            self.accuracy.append(self._get_accuracy(predicted=yhat, actual=y_train))
            self.loss.append(self._calculate_loss(predicted=yhat, actual=y_train))
            
            self._backprop(predicted=yhat, actual=y_train)
            
            self._update()
            
            if i % 20 == 0:
                s = 'EPOCH: {}, ACCURACY: {}, LOSS: {}'.format(i, self.accuracy[-1], self.loss[-1])
                print(s)

if __name__ == '__main__':
    def get_data(path):
        data = pd.read_csv(path, index_col=0)

        cols = list(data.columns)
        target = cols.pop()

        X = data[cols].copy()
        y = data[target].copy()

        y = LabelEncoder().fit_transform(y)

        return np.array(X), np.array(y)

    X, y = get_data("Iris.csv")

    model = Network()
    model.add(DenseLayer(6))
    model.add(DenseLayer(8))
    model.add(DenseLayer(10))
    model.add(DenseLayer(3))

    model.train(X_train=X, y_train=y, epochs=200)
    

FileNotFoundError: [Errno 2] No such file or directory: 'Iris.csv'

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

# Define the relu function
def relu(x):
    return np.maximum(0, x)

#f'(x) = 1 if x > 0
#f'(x) = 0 if x <= 0
def relu_derivative(x):
    return np.where(x > 0, 1, 0)

# Define the sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

#f'(x) = f(x) * (1 - f(x))
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Define the residual sum of squares loss function
def loss(Y_hat, Y):
    return np.sum(np.square(Y_hat - Y))

# Define the cross-entropy loss function
#def loss(Y_hat, Y):
#    return -np.mean(np.sum(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat), axis=1))

# Initialize the weights and biases
def initialize_params(num_features, num_classes):
    W1 = np.random.randn(num_features, 4) * 0.01
    b1 = np.zeros((1, 4))
    W2 = np.random.randn(4, num_classes) * 0.01
    b2 = np.zeros((1, num_classes))
    return W1, b1, W2, b2

# Define the forward propagation function
def forward_prop(X, W1, b1, W2, b2):
    # Calculate the weighted sum of the input features and the weights of the first layer
    Z1 = np.dot(X, W1) + b1
    # Apply the rectified linear unit (ReLU) activation function to the first layer's output
    A1 = relu(Z1)
    # Calculate the weighted sum of the first layer's output and the weights of the output layer
    Z2 = np.dot(A1, W2) + b2
    # Apply the sigmoid activation function to the output layer's output
    A2 = sigmoid(Z2)
    # Return the output of the output layer (A2), as well as the output and weighted sum of the first layer (A1 and Z1)
    return A2, A1, Z1

# Define the backpropagation function
def backprop(X, Y, Y_hat, lr, A1, Z1, W1, W2, b1, b2):
    # Calculate the error in the output layer using the derivative of the sigmoid activation function
    delta2 = (Y_hat - Y) * (Y_hat * (1 - Y_hat))
    # Calculate the gradients of the weights and biases in the output layer using the error and the output of the first hidden layer
    dW2 = np.dot(A1.T, delta2)
    db2 = np.sum(delta2, axis=0, keepdims=True)
    # Calculate the error in the first hidden layer using the derivative of the ReLU activation function and the gradients of the weights in the output layer
    delta1 = np.dot(delta2, W2.T) * (Z1 > 0)
    # Calculate the gradients of the weights and biases in the first hidden layer using the error and the input data
    dW1 = np.dot(X.T, delta1)
    db1 = np.sum(delta1, axis=0)
    # Update weights and biases using the calculated gradients and the learning rate
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1
    # Return the updated weights and biases
    return W1, b1, W2, b2

# Load the iris dataset
iris = load_iris()
X, Y = iris.data, iris.target.reshape(-1, 1)

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False)
Y = encoder.fit_transform(Y)

# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the weights and biases
W1, b1, W2, b2 = initialize_params(X.shape[1], Y.shape[1])

# Train the model using backpropagation
num_epochs = 10000
lr = 0.01
print_freq = 500
W1_shapes = []
b1_shapes = []
W2_shapes = []
b2_shapes = []
loss_curve = []
for i in range(num_epochs):
    # Forward propagation
    Y_hat, A1, Z1 = forward_prop(X_train, W1, b1, W2, b2)
    cost = loss(Y_hat, Y_train)
    loss_curve.append(cost)
    # Backpropagation
    W1, b1, W2, b2 = backprop(X_train, Y_train, Y_hat, lr, A1, Z1, W1, W2, b1, b2)
    # Store shapes of weights and biases
    if i % print_freq == 0:
        W1_shapes.append(W1)
        b1_shapes.append(b1)
        W2_shapes.append(W2)
        b2_shapes.append(b2)
        print(f"Epoch {i}: Loss = {cost:.4f}")
        print(f"W1 = {W1}")
        print(f"b1 = {b1}")
        print(f"W2 = {W2}")
        print(f"b2 = {b2}")


In [None]:
# Print final cost
final_cost = loss_curve[-1]
print(f"Final cost: {final_cost:.4f}")

# Plot loss curve
plt.plot(range(num_epochs), loss_curve)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Curve")
plt.show()

In [None]:
# Plot the ReLU activation function and its gradient
x = np.linspace(-5, 5, 100)
plt.plot(x, relu(x), label='ReLU')
plt.plot(x, relu_derivative(x), label='ReLU gradient')
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
plt.title('ReLU activation function and its gradient')
plt.show()

In [None]:
x = np.linspace(-5, 5, 100)
y = sigmoid(x)
dy = sigmoid_derivative(x)

plt.plot(x, y, label="sigmoid")
plt.plot(x, dy, label="sigmoid derivative")
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
plt.title('ReLU activation function and its gradient')
plt.show()