In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import time as timer

In [3]:
#SELU parameters: selu(z) = scale*z for z > 0 and scale*alpha*(exp(z) - 1) otherwise
#The literals carry more digits than a Python float stores; they are rounded to the nearest double
alpha = 1.6732632423543772848170429916717 #From Pytorch Docs; multiplier on the exponential (non-positive input) branch
scale = 1.0507009873554804934193349852946 #From Pytorch Docs; multiplier applied to the whole output

#Activation Functions

#relu returns 0 if the input (Z) is negative, otherwise returns the input as is (no changes)
def relu(Z):
  """Element-wise rectifier: negative entries become 0, everything else passes through unchanged."""
  rectified = np.maximum(0, Z)
  return rectified

#unlike relu, selu keeps the mean and variances of activations close to 0 and 1
#this helps keep activations standardized and can prevent vanishing or exploding gradients
def selu(Z):
  """Scaled exponential linear unit, applied element-wise.

  Returns scale*Z where Z > 0 and scale*alpha*(exp(Z) - 1) elsewhere, using the
  module-level `alpha` and `scale` constants.
  """
  #np.where evaluates both branches for every element, so the exponential must be
  #safe for large positive Z too: clamp to the non-positive range first (those
  #entries are discarded by the where anyway). expm1 keeps precision for Z near 0.
  negative_branch = alpha*np.expm1(np.minimum(Z, 0))
  return scale*np.where(Z>0, Z, negative_branch)

#the softmax activation returns an output vector that is N entries long, with the entry at index i corresponding to the probability of a particular input belonging to the class i
def softmax(Z):
  """Row-wise softmax of a 2-D array of scores.

  Each row of the result is non-negative and sums to 1. The row maximum is
  subtracted before exponentiating so large scores do not overflow.
  """
  row_max = np.max(Z, axis=1, keepdims=True)
  shifted = Z - row_max
  exps = np.exp(shifted)
  row_totals = np.sum(exps, axis=1, keepdims=True)
  return exps/row_totals

#Activation Derivatives
#derivatives of activation functions are crucial for calculating the gradients of the loss function with respect to the network's weights and biases, which backpropagation uses to optimize the network

#derivative of relu is 1 if the input is positive and 0 otherwise (including at exactly 0)
#by doing this the weights and biases for some neurons (when gradient is 0) are not updated
def relu_derivative(Z):
  """Element-wise slope of relu: 1 where Z is strictly positive, 0 everywhere else."""
  is_positive = Z>0
  return np.where(is_positive, 1, 0)

def selu_derivative(Z):
  """Element-wise slope of selu.

  Returns `scale` where Z > 0 and scale*alpha*exp(Z) elsewhere, using the
  module-level `alpha` and `scale` constants.
  """
  #np.where evaluates both branches for every element; clamping the exponent to
  #the non-positive range avoids overflow for large positive Z, whose exponential
  #is discarded anyway. Values for Z <= 0 are unchanged by the clamp.
  negative_slope = scale*alpha*np.exp(np.minimum(Z, 0))
  return np.where(Z>0, scale, negative_slope)

def softmax_derivative(Z):
  """Element-wise p*(1 - p), where p = softmax(Z).

  This is only the diagonal of the softmax Jacobian (d p_i / d z_i); the
  cross terms between different classes are not included.
  """
  probs = softmax(Z)
  complement = 1 - probs
  return probs*complement

In [4]:
#Networks Class
class BasicNetwork:
  """Three-layer fully connected classifier trained with full-batch gradient descent.

  Layers: input -> SELU hidden layer -> ReLU hidden layer -> softmax output.
  The loss is the mean cross-entropy over integer class labels.
  """

  def __init__(self, input_size, hidden1_size, hidden2_size, output_size, lr=1e-3, seed=None, init_scale=0.01):
    """Store the layer sizes and create the weights and biases.

    Args:
      input_size: number of input features.
      hidden1_size: width of the first (SELU) hidden layer.
      hidden2_size: width of the second (ReLU) hidden layer.
      output_size: number of classes.
      lr: learning rate used by fit().
      seed: optional integer. When given, the initial weights come from a private
        generator seeded with it, so two networks built with the same seed start
        identical. When None, weights are drawn from NumPy's global generator.
      init_scale: standard deviation of the normally distributed initial weights.
    """
    #Initialize Network parameters
    self.input_size = input_size
    self.hidden1_size = hidden1_size
    self.hidden2_size = hidden2_size
    self.output_size = output_size
    self.lr = lr

    #With no seed, keep drawing from the global generator (so np.random.seed still controls it)
    rng = np.random if seed is None else np.random.RandomState(seed)

    #Initialize Weights and Biases: small random weights, zero biases
    self.W1 = rng.randn(input_size,  hidden1_size)*init_scale
    self.b1 = np.zeros((1, hidden1_size))
    self.W2 = rng.randn(hidden1_size, hidden2_size)*init_scale
    self.b2 = np.zeros((1, hidden2_size))
    self.W3 = rng.randn(hidden2_size, output_size)*init_scale
    self.b3 = np.zeros((1, output_size))

  def forward(self, X):
    """Run X of shape (m, input_size) through the network.

    Caches every pre-activation (Z1..Z3) and activation (A1..A3) on the instance
    for backward(), and returns A3: the (m, output_size) class probabilities.
    """
    #Layer 1 is Selu Activation
    self.Z1 = X.dot(self.W1) + self.b1
    self.A1 = selu(self.Z1)
    #Layer 2 is Relu Activation
    self.Z2 = self.A1.dot(self.W2) + self.b2
    self.A2 = relu(self.Z2)
    #Layer 3 is Softmax Activation
    self.Z3 = self.A2.dot(self.W3) + self.b3
    self.A3 = softmax(self.Z3)
    return self.A3

  def loss(self, y_pred, y_true):
    """Mean cross-entropy of predicted probabilities against integer labels.

    Args:
      y_pred: (m, n_classes) array of probabilities.
      y_true: NumPy array of m integer class indices (flattened before use).
    """
    m = y_pred.shape[0]
    y_flat = y_true.reshape(-1)
    eps = 1e-8 #keeps log() finite when a predicted probability is exactly 0
    correct_logprobs = -np.log(y_pred[np.arange(m), y_flat] + eps)
    return np.sum(correct_logprobs)/m

  def backward(self, X, y):
    """Gradients of the mean cross-entropy loss with respect to every parameter.

    Reads the activations cached by the most recent forward() call, so forward()
    must have been run on this same X first.

    Returns:
      (dW1, db1, dW2, db2, dW3, db3), each shaped like the parameter it belongs to.
    """
    m = X.shape[0]
    y_flat = y.reshape(-1)
    #Output Layer, Softmax and Cross-Entropy: the combined gradient is (probabilities - one_hot)/m
    dZ3 = self.A3.copy() #copy so the cached probabilities stay intact
    dZ3[np.arange(m), y_flat] -= 1
    dZ3 /= m
    dW3 = self.A2.T.dot(dZ3)
    db3 = np.sum(dZ3, axis=0, keepdims=True)
    #Layer 2 is Relu Activation
    dA2 = dZ3.dot(self.W3.T)
    dZ2 = dA2*relu_derivative(self.Z2)
    dW2 = self.A1.T.dot(dZ2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)
    #Layer 1 is Selu Activation
    dA1 = dZ2.dot(self.W2.T)
    dZ1 = dA1*selu_derivative(self.Z1)
    dW1 = X.T.dot(dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)
    return dW1, db1, dW2, db2, dW3, db3

  def fit(self, epochs, X_train, y_train, log_every=100):
    """Train with full-batch gradient descent for `epochs` passes over the data.

    Args:
      epochs: number of gradient steps (one per pass over X_train).
      X_train: (m, input_size) feature array.
      y_train: NumPy array of m integer class labels.
      log_every: print the training loss every this many epochs; 0 or None
        disables printing.
    """
    for epoch in range(1, epochs+1):
      self.forward(X_train) #forward pass
      dW1, db1, dW2, db2, dW3, db3 = self.backward(X_train, y_train) #calculate gradients
      #the loss is only needed for reporting, so compute it just on logging epochs
      #(it is taken from this epoch's forward pass, i.e. before the update below)
      if log_every and epoch % log_every==0:
        loss = self.loss(self.A3, y_train)
        print(f"Epoch [{epoch:04d}/{epochs}], Train Loss: {loss:.4f}")
      #update weights and biases
      self.W1 -= self.lr*dW1
      self.b1 -= self.lr*db1
      self.W2 -= self.lr*dW2
      self.b2 -= self.lr*db2
      self.W3 -= self.lr*dW3
      self.b3 -= self.lr*db3
    return

  def predict(self, X):
    """Return the most probable class index for each row of X."""
    probs = self.forward(X)
    return np.argmax(probs, axis=1)

  def score(self, X_test, y_test):
    """Return the accuracy: the fraction of rows whose predicted class equals the label."""
    preds = self.predict(X_test)
    y_flat = y_test.reshape(-1)
    return np.mean(preds==y_flat)

In [6]:
#Load Dataset
url = "https://raw.githubusercontent.com/dvasiliu/AML/refs/heads/main/Data%20Sets/mobile.csv"
df = pd.read_csv(url)
#Features: every column except the target, converted to a float array
X = df.drop(columns=["price_range"]).to_numpy().astype(float)
#Target: the price_range column, converted to an integer array
y = df["price_range"].to_numpy().astype(int)

In [7]:
#Start Timer
print() #emits a blank line in the cell output
t1 = timer.time() #wall-clock seconds at the moment this cell runs (timer is the time module)




In [9]:
#Setup KFold Cross-Validation
#A fixed seed makes both the fold assignment and the weight initialisation repeatable
SEED = 42
N_SPLITS = 5
epochs = 1000 #training epochs per fold
np.random.seed(SEED) #BasicNetwork draws its initial weights from NumPy's global generator
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
accs = []

#Restart the timer here so the report covers only this cell's work,
#not the idle time since an earlier cell was run
t1 = timer.time()
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
  #Split Data and Scale
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  #Fit the scaler on the training fold only so no test-fold statistics leak into training
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  #Initialize the Network
  net = BasicNetwork(input_size=X.shape[1], hidden1_size=64, hidden2_size=32, output_size=4, lr=0.1)

  #Train and Score the Network
  net.fit(epochs=epochs, X_train=X_train, y_train=y_train)
  acc = net.score(X_test, y_test)
  print(f"Fold {fold} Accuracy: {acc:.4f}\n")
  accs.append(acc)

#End Timer and Output Stats
t2 = timer.time()
deltat = t2-t1
#Every fold trains for `epochs` epochs, so average over all epochs that were actually run
epochdeltat = deltat/(epochs*N_SPLITS)
print(f'\nTraining Took: {int(deltat//60)} min {deltat%60:.2f} s')
print(f'Time Per Epoch: {epochdeltat:.4f} s')

#Output Results
print(f'\nFold Accuracies: {np.round(accs,4)}')
print(f"Average {N_SPLITS}-fold Accuracy: {np.mean(accs):.4f}")

Epoch [0100/1000], Train Loss: 1.3858
Epoch [0200/1000], Train Loss: 1.3802
Epoch [0300/1000], Train Loss: 0.8428
Epoch [0400/1000], Train Loss: 0.4759
Epoch [0500/1000], Train Loss: 0.3448
Epoch [0600/1000], Train Loss: 0.2703
Epoch [0700/1000], Train Loss: 0.2182
Epoch [0800/1000], Train Loss: 0.1764
Epoch [0900/1000], Train Loss: 0.1425
Epoch [1000/1000], Train Loss: 0.1158
Fold 1 Accuracy: 0.9500

Epoch [0100/1000], Train Loss: 1.3861
Epoch [0200/1000], Train Loss: 1.3855
Epoch [0300/1000], Train Loss: 1.3757
Epoch [0400/1000], Train Loss: 0.7436
Epoch [0500/1000], Train Loss: 0.4623
Epoch [0600/1000], Train Loss: 0.3418
Epoch [0700/1000], Train Loss: 0.2703
Epoch [0800/1000], Train Loss: 0.2185
Epoch [0900/1000], Train Loss: 0.1753
Epoch [1000/1000], Train Loss: 0.1431
Fold 2 Accuracy: 0.9750

Epoch [0100/1000], Train Loss: 1.3857
Epoch [0200/1000], Train Loss: 1.3822
Epoch [0300/1000], Train Loss: 1.0101
Epoch [0400/1000], Train Loss: 0.5068
Epoch [0500/1000], Train Loss: 0.3626


Sources:

[Understanding and implementing relu](https://www.digitalocean.com/community/tutorials/relu-function-in-python)

[Understanding and implementing selu](https://medium.com/@iitkarthik/selu-function-in-keras-the-secret-to-self-normalizing-neural-networks-with-practical-examples-f0b3cacc0775)

[Understanding and implementing selu derivative](https://neuralthreads.medium.com/selu-and-elu-exponential-linear-units-a826d5eeb99c)

[Understanding and implementing softmax and derivative](https://medium.com/intuitionmath/how-to-implement-the-softmax-derivative-independently-from-any-loss-function-ae6d44363a9d)

[deciding what weights to initialize](https://medium.com/@shaomukherjee/demystifying-weight-initialization-methods-in-neural-networks-96042d2447f1)

[random.normal parameters](https://numpy.org/doc/stable/reference/random/generated/numpy.random.normal.html)

[for building the network class](https://www.geeksforgeeks.org/implementation-of-neural-network-from-scratch-using-numpy/)

[cross-entropy loss](https://www.geeksforgeeks.org/how-to-implement-softmax-and-cross-entropy-in-python-and-pytorch/)

[backpropagation](https://www.geeksforgeeks.org/backpropagation-in-neural-network/)