<a href="https://colab.research.google.com/github/sameph/Icog_labs/blob/main/Simple_neural.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np

# Activation functions
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Return a mask that is True wherever ``x`` is strictly positive.

    Multiplying the mask into a numeric array treats it as the ReLU slope
    (1 where True, 0 where False).
    """
    is_positive = x > 0
    return is_positive

def softmax(x):
    """Softmax along axis 1 of ``x``.

    The maximum of each row is subtracted before exponentiating, so the
    largest exponent in every row is 0 and ``np.exp`` cannot overflow.
    The result is then normalised so that each row sums to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(x - row_max)
    row_total = np.sum(exp_x, axis=1, keepdims=True)
    return exp_x / row_total

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The naive formula computes ``exp(-x)``, which overflows to ``inf`` (and
    emits a RuntimeWarning) for large negative ``x``. Here only
    ``exp(-|x|)`` is evaluated, which always lies in (0, 1]; the two algebraically
    equivalent forms below are selected by the sign of ``x``.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``, element-wise; a NumPy scalar for scalar input,
        otherwise an array of the same shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and is a no-op
    # (a view) for arrays with one or more dimensions.
    return result[()]

def sigmoid_derivative(x):
    """Evaluate ``sigmoid(x) * (1 - sigmoid(x))``.

    ``x`` is the value the sigmoid is applied to; this function applies
    ``sigmoid`` itself before forming the product.
    """
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    """Hyperbolic tangent activation; a thin wrapper around ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Evaluate ``1 - tanh(x)**2``, the derivative of tanh at ``x``."""
    t = np.tanh(x)
    return 1 - t**2

# Loss function
def cross_entropy_loss(y_true, y_pred):
    """Mean cross-entropy between target rows and predicted rows.

    ``1e-9`` is added to ``y_pred`` before taking the logarithm so that a
    predicted value of exactly 0 does not produce ``log(0)``. The products
    are summed along axis 1 and the negated mean over rows is returned.
    """
    log_probs = np.log(y_pred + 1e-9)
    per_row = np.sum(y_true * log_probs, axis=1)
    return -np.mean(per_row)

# Forward propagation
def forward_propagation(X, weights, biases, activation):
    """Run the forward pass of the two-hidden-layer network.

    Args:
        X: input array; it is matrix-multiplied with ``weights[0]``.
        weights: sequence of three weight arrays (indices 0, 1, 2).
        biases: sequence of three bias arrays matching ``weights``.
        activation: ``"relu"``, ``"sigmoid"`` or ``"tanh"``; applied to both
            hidden layers. The output layer always uses softmax.

    Returns:
        Tuple ``(layer1, layer2, output)``: the post-activation values of the
        two hidden layers and the softmax output.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    if activation == "relu":
        act_function = relu
    elif activation == "sigmoid":
        act_function = sigmoid
    elif activation == "tanh":
        act_function = tanh
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported activation {activation!r}; "
            "expected 'relu', 'sigmoid' or 'tanh'"
        )

    layer1 = act_function(np.dot(X, weights[0]) + biases[0])
    layer2 = act_function(np.dot(layer1, weights[1]) + biases[1])
    output = softmax(np.dot(layer2, weights[2]) + biases[2])
    return layer1, layer2, output

# Backpropagation
def backpropagation(X, y_true, layer1, layer2, output, weights, activation):
    """Compute the error terms of the output layer and both hidden layers.

    ``layer1`` and ``layer2`` are the post-activation values returned by
    ``forward_propagation``, so the activation slope is expressed in terms of
    the activation *output* ``a``: ``a > 0`` for ReLU, ``a * (1 - a)`` for
    sigmoid and ``1 - a**2`` for tanh. Re-applying the activation to these
    values (as the pre-activation derivative helpers would) gives wrong
    gradients for sigmoid and tanh.

    Args:
        X: input batch. Unused; kept so existing callers keep working.
        y_true: target rows, same shape as ``output``.
        layer1: post-activation output of the first hidden layer.
        layer2: post-activation output of the second hidden layer.
        output: softmax output of the network.
        weights: sequence of three weight arrays; indices 1 and 2 are used.
        activation: ``"relu"``, ``"sigmoid"`` or ``"tanh"``.

    Returns:
        Tuple ``(d_output, d_layer2, d_layer1)`` of error terms. They are not
        divided by the batch size.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    if activation == "relu":
        # relu(z) > 0 exactly when z > 0, so the mask can be read off the output.
        def act_slope(a):
            return a > 0
    elif activation == "sigmoid":
        def act_slope(a):
            return a * (1 - a)
    elif activation == "tanh":
        def act_slope(a):
            return 1 - a**2
    else:
        raise ValueError(
            f"Unsupported activation {activation!r}; "
            "expected 'relu', 'sigmoid' or 'tanh'"
        )

    # Softmax combined with cross-entropy has this gradient with respect to
    # the pre-softmax scores.
    d_output = output - y_true
    d_layer2 = np.dot(d_output, weights[2].T) * act_slope(layer2)
    d_layer1 = np.dot(d_layer2, weights[1].T) * act_slope(layer1)

    return d_output, d_layer2, d_layer1
# Gradient descent optimization
def gradient_descent(weights, biases, gradients, learning_rate):
    """Apply one plain gradient-descent step to every layer.

    For layer number ``k`` (1-based) the entries ``gradients['dWk']`` and
    ``gradients['dBk']`` are scaled by ``learning_rate`` and subtracted from
    ``weights[k - 1]`` and ``biases[k - 1]`` using ``-=``. Nothing is
    returned.
    """
    for layer_no in range(1, len(weights) + 1):
        slot = layer_no - 1
        weights[slot] -= learning_rate * gradients[f'dW{layer_no}']
        biases[slot] -= learning_rate * gradients[f'dB{layer_no}']

# Adam optimizer implementation
class AdamOptimizer:
    """Adam optimiser that keeps first/second moment estimates per gradient key.

    State is stored in ``self.m`` and ``self.v``, keyed by the same
    ``'dW<k>'`` / ``'dB<k>'`` names used in the gradients dict, and
    ``self.t`` counts how many times ``update`` has been called.
    """

    def __init__(self, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.t = 0
        self.m = {}
        self.v = {}

    def update(self, weights, biases, gradients):
        """Apply one Adam step to ``weights`` and ``biases`` using ``-=``.

        ``gradients`` must contain ``'dW<k>'`` and ``'dB<k>'`` for every
        layer number ``k`` from 1 to ``len(weights)``.
        """
        self.t += 1
        # The bias-correction denominators depend only on the step count.
        m_correction = 1 - self.beta1**self.t
        v_correction = 1 - self.beta2**self.t

        for layer_no in range(1, len(weights) + 1):
            slot = layer_no - 1
            w_key, b_key = f'dW{layer_no}', f'dB{layer_no}'
            layer_keys = (w_key, b_key)

            # Lazily create zeroed moment buffers the first time a layer is seen.
            if w_key not in self.m:
                for key in layer_keys:
                    self.m[key] = np.zeros_like(gradients[key])
                    self.v[key] = np.zeros_like(gradients[key])

            # Biased first moment estimates.
            for key in layer_keys:
                self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * gradients[key]

            # Biased second raw moment estimates.
            for key in layer_keys:
                self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (gradients[key]**2)

            # Bias-correct the moments and step the matching parameter.
            for params, key in ((weights, w_key), (biases, b_key)):
                m_hat = self.m[key] / m_correction
                v_hat = self.v[key] / v_correction
                params[slot] -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

# Neural network training
def train(X, y, epochs=100, batch_size=32, learning_rate=0.01, activation="relu", optimizer="adam", hidden_dim=3):
    """Train the two-hidden-layer softmax classifier with mini-batches.

    The global NumPy random state is seeded with 42 on every call, so weight
    initialisation and shuffling are reproducible.

    Args:
        X: input array; ``X.shape[0]`` rows are samples and ``X.shape[1]`` is
            the input width.
        y: target array with one row per sample (e.g. one-hot rows);
            ``y.shape[1]`` is the number of output units.
        epochs: number of passes over the data.
        batch_size: nominal mini-batch size; the last batch of an epoch may
            be smaller.
        learning_rate: step size passed to the chosen optimizer.
        activation: hidden-layer activation name, forwarded to
            ``forward_propagation`` and ``backpropagation``.
        optimizer: ``"adam"`` or ``"gradient_descent"``.
        hidden_dim: width of both hidden layers (default 3).

    Returns:
        Tuple ``(weights, biases)``: two lists of three arrays each.

    Raises:
        ValueError: if ``optimizer`` is not a supported name. Previously an
            unknown name ran every epoch without updating any parameter.
    """
    if optimizer not in ("adam", "gradient_descent"):
        raise ValueError(
            f"Unsupported optimizer {optimizer!r}; "
            "expected 'adam' or 'gradient_descent'"
        )

    np.random.seed(42)
    input_dim = X.shape[1]
    output_dim = y.shape[1]

    weights = [np.random.randn(input_dim, hidden_dim),
               np.random.randn(hidden_dim, hidden_dim),
               np.random.randn(hidden_dim, output_dim)]
    biases = [np.zeros((1, hidden_dim)),
              np.zeros((1, hidden_dim)),
              np.zeros((1, output_dim))]

    # Adam carries state between steps; plain gradient descent needs none.
    opt = AdamOptimizer(learning_rate) if optimizer == "adam" else None

    for epoch in range(epochs):
        # Fancy indexing builds shuffled copies, so the caller's arrays are
        # never modified; only the local names are rebound.
        indices = np.random.permutation(X.shape[0])
        X, y = X[indices], y[indices]

        for start in range(0, X.shape[0], batch_size):
            X_batch = X[start:start + batch_size]
            y_batch = y[start:start + batch_size]
            # Average over the rows actually present: the final batch of an
            # epoch can hold fewer than batch_size samples.
            n_batch = X_batch.shape[0]

            layer1, layer2, output = forward_propagation(X_batch, weights, biases, activation)
            d_output, d_layer2, d_layer1 = backpropagation(X_batch, y_batch, layer1, layer2, output, weights, activation)

            gradients = {
                'dW1': np.dot(X_batch.T, d_layer1) / n_batch,
                'dB1': np.sum(d_layer1, axis=0, keepdims=True) / n_batch,
                'dW2': np.dot(layer1.T, d_layer2) / n_batch,
                'dB2': np.sum(d_layer2, axis=0, keepdims=True) / n_batch,
                'dW3': np.dot(layer2.T, d_output) / n_batch,
                'dB3': np.sum(d_output, axis=0, keepdims=True) / n_batch,
            }

            if opt is None:
                gradient_descent(weights, biases, gradients, learning_rate)
            else:
                opt.update(weights, biases, gradients)

        # Report the loss over the full (shuffled) dataset once per epoch.
        _, _, output = forward_propagation(X, weights, biases, activation)
        loss = cross_entropy_loss(y, output)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

    return weights, biases

# Example usage

# Toy dataset
X = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])  # Four input rows with two binary features each
# NOTE(review): these labels do not encode an AND gate. Column 0 is set
# exactly for the rows whose first input is 1 ([1, 0] and [1, 1]); column 1
# is set for the other two rows.
y = np.array([[1, 0], [0, 1], [1, 0], [0, 1]])  # Outputs (One-hot encoded)

# Train the model: 500 epochs, all 4 samples in a single batch, ReLU hidden
# layers, Adam with learning rate 0.1. One loss line is printed per epoch.
trained_weights, trained_biases = train(X, y, epochs=500, batch_size=4, learning_rate=0.1, activation="relu", optimizer="adam")

# Test the model on the training inputs; only the softmax output of the
# forward pass is kept, the hidden-layer activations are discarded.
_, _, predictions = forward_propagation(X, trained_weights, trained_biases, activation="relu")
print("\nPredictions:")
print(predictions)


Epoch 1/500, Loss: 0.7142
Epoch 2/500, Loss: 0.6522
Epoch 3/500, Loss: 0.6813
Epoch 4/500, Loss: 0.6982
Epoch 5/500, Loss: 0.6895
Epoch 6/500, Loss: 0.6622
Epoch 7/500, Loss: 0.6204
Epoch 8/500, Loss: 0.5700
Epoch 9/500, Loss: 0.5111
Epoch 10/500, Loss: 0.4663
Epoch 11/500, Loss: 0.4407
Epoch 12/500, Loss: 0.4167
Epoch 13/500, Loss: 0.3773
Epoch 14/500, Loss: 0.3198
Epoch 15/500, Loss: 0.2604
Epoch 16/500, Loss: 0.2086
Epoch 17/500, Loss: 0.1846
Epoch 18/500, Loss: 0.1615
Epoch 19/500, Loss: 0.1402
Epoch 20/500, Loss: 0.1212
Epoch 21/500, Loss: 0.1046
Epoch 22/500, Loss: 0.0904
Epoch 23/500, Loss: 0.0782
Epoch 24/500, Loss: 0.0680
Epoch 25/500, Loss: 0.0592
Epoch 26/500, Loss: 0.0518
Epoch 27/500, Loss: 0.0455
Epoch 28/500, Loss: 0.0402
Epoch 29/500, Loss: 0.0356
Epoch 30/500, Loss: 0.0318
Epoch 31/500, Loss: 0.0284
Epoch 32/500, Loss: 0.0256
Epoch 33/500, Loss: 0.0231
Epoch 34/500, Loss: 0.0210
Epoch 35/500, Loss: 0.0192
Epoch 36/500, Loss: 0.0176
Epoch 37/500, Loss: 0.0162
Epoch 38/5