##


### Network Architecture
#### 4 inputs => 4 neurons => ReLU => 3 neurons => softmax => 3 outputs

In [7]:
import numpy as np 
import pandas as pd
from sklearn import datasets 
from sklearn.model_selection import train_test_split


In [8]:
# Load the Iris dataset that ships with scikit-learn
iris_data = datasets.load_iris()

In [9]:
# Put the feature matrix into a DataFrame; no column names are given,
# so the columns get pandas' default integer labels
iris_df = pd.DataFrame(iris_data.data)

In [10]:
# Attach the class labels as a 'target' column next to the features
iris_df['target'] = iris_data.target

In [11]:
# Hold out 33% of the rows for testing; the fixed random_state keeps
# the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    iris_df[[0, 1, 2, 3]],
    iris_df['target'],
    test_size=0.33,
    random_state=42,
)

In [25]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,0,1,2,3
96,5.7,2.9,4.2,1.3
105,7.6,3.0,6.6,2.1
66,5.6,3.0,4.5,1.5
0,5.1,3.5,1.4,0.2
122,7.7,2.8,6.7,2.0


In [26]:
# Preview the first training labels
y_train.head()

96     1
105    2
66     1
0      0
122    2
Name: target, dtype: int64

In [16]:
# Creating Dense Layer Class
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        Weights are sampled from a standard normal distribution and scaled
        by 0.01; biases start at zero with shape ``(1, n_neurons)``.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros(shape=(1, n_neurons))

    def forward(self, inputs):
        """Remember ``inputs`` and store the layer result in ``self.output``."""
        # The inputs are kept because the backward pass needs them
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients from the upstream gradient ``dvalues``.

        Fills ``self.dweights`` and ``self.dbiases`` (parameter gradients)
        and ``self.dinputs`` (gradient passed to the previous layer).
        """
        inputs_t = self.inputs.T
        weights_t = self.weights.T
        # Parameter gradients
        self.dweights = np.dot(inputs_t, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient with respect to the layer inputs
        self.dinputs = np.dot(dvalues, weights_t)
        
# Implementing activation functions
# ReLu implementation
class Activation_ReLU:
    """Rectified linear unit activation: ``max(0, x)`` element-wise."""

    def forward(self, inputs):
        """Remember ``inputs`` and store the rectified values in ``self.output``."""
        # The inputs are kept because the backward pass masks on them
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` the upstream gradient, zeroed where input <= 0."""
        blocked = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient
        
# Softmax implementation
class Activation_Softmax:
    """Softmax activation turning each row of scores into probabilities."""

    # Forward pass
    def forward(self, inputs):
        """Remember ``inputs`` and store row-wise softmax in ``self.output``.

        ``inputs`` is reduced along ``axis=1``, so it must be 2-D
        (one row per sample).
        """
        self.inputs = inputs
        # Subtract the row maximum before exponentiating so the largest
        # exponent is 0; this avoids overflow without changing the result
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # Normalize each row so it sums to 1
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    # Backward pass
    def backward(self, dvalues):
        """Store in ``self.dinputs`` the gradient with respect to the inputs.

        For one sample with softmax output ``s`` and upstream gradient ``d``
        the Jacobian product is ``(diag(s) - s s^T) d``, which simplifies to
        ``s * (d - sum(s * d))``. That form is evaluated for all rows at
        once instead of building a Jacobian matrix per sample.

        The result takes its dtype from the arithmetic rather than from
        ``dvalues``, so integer upstream gradients are not truncated.
        """
        weighted = self.output * dvalues
        row_totals = np.sum(weighted, axis=1, keepdims=True)
        self.dinputs = weighted - self.output * row_totals
        
# Common loss class
class Loss:
    """Base class for losses; subclasses supply ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``.

        ``output`` is the model output and ``y`` the ground-truth values;
        both are passed unchanged to ``self.forward``.
        """
        return np.mean(self.forward(output, y))

# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot labels."""

    # Smallest probability used in log / division, to avoid log(0) and x/0
    _eps = 1e-7

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        ``y_true`` may hold class indices (1-D) or one-hot rows (2-D).

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Number of samples in a batch
        samples = len(y_pred)
        # Clip both sides so log(0) cannot occur and the mean is not
        # dragged towards either end
        y_pred_clipped = np.clip(y_pred, self._eps, 1 - self._eps)
        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted probability of the true class
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_rank == 2:
            # One-hot labels: the mask keeps only the true-class probability
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got {label_rank} dimensions"
            )
        return -np.log(correct_confidences)

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store in ``self.dinputs`` the loss gradient w.r.t. the predictions.

        ``dvalues`` holds the predicted probabilities. The gradient is
        ``-y_true / dvalues`` divided by the number of samples.
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels per sample, counted on the first sample
        labels = len(dvalues[0])
        # If labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]
        # Only a lower bound is applied: a probability of exactly 0 would
        # otherwise give inf (or NaN from 0/0), while every value
        # >= _eps is left unchanged
        safe_dvalues = np.clip(dvalues, self._eps, None)
        # Gradient, normalized by the number of samples
        self.dinputs = -y_true / safe_dvalues
        self.dinputs = self.dinputs / samples
        
# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():
    """Softmax activation fused with categorical cross-entropy loss."""

    def __init__(self):
        """Create the softmax activation and the cross-entropy loss objects."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, expose it as ``self.output``, return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Store the combined softmax + cross-entropy gradient in ``self.dinputs``.

        ``dvalues`` holds the softmax output. The gradient is that output
        with 1 subtracted at each sample's true class, divided by the
        number of samples.
        """
        n_samples = len(dvalues)
        # One-hot labels are reduced to class indices first
        is_one_hot = len(y_true.shape) == 2
        if is_one_hot:
            y_true = np.argmax(y_true, axis=1)
        # Work on a copy so the caller's array is left untouched
        gradient = dvalues.copy()
        gradient[range(n_samples), y_true] -= 1
        self.dinputs = gradient / n_samples
        
class Optimizer_SGD:
    """Plain stochastic gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate=0.1):
        """Store the step size; 0.1 is this optimizer's default."""
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Step ``layer.weights`` and ``layer.biases`` against their gradients.

        Both arrays are updated in place using ``layer.dweights`` and
        ``layer.dbiases``.
        """
        step_size = self.learning_rate
        layer.weights -= step_size * layer.dweights
        layer.biases -= step_size * layer.dbiases


In [28]:
# Build the network: 4 inputs -> 4 hidden neurons (ReLU) -> 3 outputs (softmax)
dense1 = Layer_Dense(4, 4)  # hidden layer: 4 inputs, 4 neurons
activation1 = Activation_ReLU()  # ReLU activation for the hidden layer
dense2 = Layer_Dense(4, 3)  # output layer: 3 neurons, one per class
# Softmax activation and cross-entropy loss combined in one object
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_SGD()

# Accuracy compares predicted class indices with class indices, so one-hot
# labels are reduced with argmax. The labels do not change between epochs,
# so this is done once, before the loop.
y_train_labels = y_train
if len(y_train.shape) == 2:
    y_train_labels = np.argmax(y_train, axis=1)

print('During Training')
# Training session
for epoch in range(1500):
    # Forward pass: dense1 -> ReLU -> dense2 -> softmax + loss
    dense1.forward(X_train)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y_train)

    # The predicted class is the one with the highest probability
    predictions = np.argmax(loss_activation.output, axis=1)
    accuracy = np.mean(predictions == y_train_labels)
    # Report progress every 100 epochs
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + f'acc: {accuracy:.3f}, ' + f'loss: {loss:.3f}')

    # Backward pass, in reverse order of the forward pass
    loss_activation.backward(loss_activation.output, y_train)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)

# Testing part: a single forward pass over the held-out data
print('During Testing')
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y_test)
predictions = np.argmax(loss_activation.output, axis=1)
# Same label handling as for training; y_test itself is left unmodified
y_test_labels = y_test
if len(y_test.shape) == 2:
    y_test_labels = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test_labels)
print(f'acc: {accuracy:.3f}, ' + f'loss: {loss:.3f} ')

During Training
epoch: 0, acc: 0.310, loss: 1.099
epoch: 100, acc: 0.940, loss: 0.392
epoch: 200, acc: 0.790, loss: 0.347
epoch: 300, acc: 0.890, loss: 0.250
epoch: 400, acc: 0.950, loss: 0.139
epoch: 500, acc: 0.980, loss: 0.090
epoch: 600, acc: 0.980, loss: 0.080
epoch: 700, acc: 0.980, loss: 0.075
epoch: 800, acc: 0.980, loss: 0.072
epoch: 900, acc: 0.970, loss: 0.087
epoch: 1000, acc: 0.980, loss: 0.071
epoch: 1100, acc: 0.990, loss: 0.068
epoch: 1200, acc: 0.990, loss: 0.066
epoch: 1300, acc: 0.990, loss: 0.065
epoch: 1400, acc: 0.990, loss: 0.064
During Testing
acc: 0.980, loss: 0.063 
