# 02 - Neural Network Fundamentals

This notebook covers the neural network concepts essential for understanding Large Language Models.

## Topics Covered:
- Perceptrons and basic neurons
- Feedforward neural networks
- Activation functions
- Loss functions
- Backpropagation
- Gradient descent and optimization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Callable, List, Tuple
import math

# Set random seed for reproducibility
np.random.seed(42)

## 1. Perceptrons

The perceptron is the simplest neural network unit.

In [None]:
class Perceptron:
    """Single linear unit with a hard threshold, trained by the perceptron rule.

    Attributes:
        weights: One weight per input feature, initialised to small random values.
        bias: Scalar offset added to the weighted sum; starts at 0.0.
        learning_rate: Step size applied to every weight and bias correction.
    """

    def __init__(self, input_size: int, learning_rate: float = 0.01):
        self.learning_rate = learning_rate
        self.bias = 0.0
        # Standard-normal draws scaled down by 0.1
        self.weights = 0.1 * np.random.randn(input_size)

    def forward(self, x: np.ndarray) -> float:
        """Return the raw (pre-threshold) score: bias plus weighted input sum."""
        weighted_sum = np.dot(x, self.weights)
        return weighted_sum + self.bias

    def predict(self, x: np.ndarray) -> int:
        """Return 1 when the raw score is strictly positive, otherwise 0."""
        if self.forward(x) > 0:
            return 1
        return 0

    def train_step(self, x: np.ndarray, y: int) -> float:
        """Apply one perceptron-rule correction for the sample ``(x, y)``.

        Returns:
            The absolute prediction error (0 when the sample was already
            classified correctly, in which case nothing changes).
        """
        error = y - self.predict(x)
        step = self.learning_rate * error

        # Move the weights along x and the bias by the same signed step
        self.weights += step * x
        self.bias += step

        return abs(error)

# Learn the logical AND function with a single perceptron
# Truth table: the four input pairs and the expected output for each
X_and = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_and = np.array([0, 0, 0, 1])

perceptron = Perceptron(input_size=2, learning_rate=0.1)

# Sweep over the four samples repeatedly; stop at the first mistake-free epoch
epochs = 100
for epoch in range(epochs):
    total_error = sum(
        perceptron.train_step(sample, target)
        for sample, target in zip(X_and, y_and)
    )

    if total_error == 0:
        print(f"Converged after {epoch + 1} epochs")
        break

# Show the trained perceptron's prediction for every row of the truth table
print("\nAND Gate Results:")
for x, y_true, y_pred in zip(X_and, y_and, map(perceptron.predict, X_and)):
    print(f"Input: {x}, True: {y_true}, Predicted: {y_pred}")

## 2. Activation Functions

Activation functions introduce non-linearity to neural networks.

In [None]:
class ActivationFunctions:
    """Stateless collection of element-wise activations and some derivatives."""

    @staticmethod
    def sigmoid(x: np.ndarray) -> np.ndarray:
        """Logistic sigmoid, 1 / (1 + exp(-x))."""
        # Limit the argument to [-500, 500] so exp() cannot overflow
        bounded = np.clip(x, -500, 500)
        denominator = 1 + np.exp(-bounded)
        return 1 / denominator

    @staticmethod
    def sigmoid_derivative(x: np.ndarray) -> np.ndarray:
        """Sigmoid slope at x, expressed through the sigmoid value itself."""
        sig = ActivationFunctions.sigmoid(x)
        return sig * (1 - sig)

    @staticmethod
    def tanh(x: np.ndarray) -> np.ndarray:
        """Hyperbolic tangent (delegates to NumPy)."""
        return np.tanh(x)

    @staticmethod
    def tanh_derivative(x: np.ndarray) -> np.ndarray:
        """Tanh slope at x: 1 - tanh(x)^2."""
        t = np.tanh(x)
        return 1 - t ** 2

    @staticmethod
    def relu(x: np.ndarray) -> np.ndarray:
        """Rectified linear unit: element-wise maximum of 0 and x."""
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x: np.ndarray) -> np.ndarray:
        """ReLU slope: 1.0 where x is strictly positive, 0.0 elsewhere."""
        is_positive = x > 0
        return is_positive.astype(float)

    @staticmethod
    def gelu(x: np.ndarray) -> np.ndarray:
        """GELU via the tanh-based formula (used in transformers)."""
        cubic_term = 0.044715 * x**3
        inner = np.sqrt(2 / np.pi) * (x + cubic_term)
        return 0.5 * x * (1 + np.tanh(inner))

    @staticmethod
    def softmax(x: np.ndarray) -> np.ndarray:
        """Softmax over the last axis; each slice along it sums to 1."""
        # Shifting by the per-slice maximum keeps exp() in a safe range
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(shifted)
        normaliser = np.sum(exp_x, axis=-1, keepdims=True)
        return exp_x / normaliser

# Input range shared by all four plots
x = np.linspace(-5, 5, 100)
activations = ActivationFunctions()

# (title, callable) pairs, in the order they fill the 2x2 grid
functions = [
    ('Sigmoid', activations.sigmoid),
    ('Tanh', activations.tanh),
    ('ReLU', activations.relu),
    ('GELU', activations.gelu)
]

plt.figure(figsize=(15, 10))

# One panel per activation function
for panel, (name, func) in enumerate(functions, start=1):
    plt.subplot(2, 2, panel)
    plt.plot(x, func(x), label=name, linewidth=2)
    plt.title(f'{name} Activation Function')
    plt.xlabel('Input')
    plt.ylabel('Output')
    plt.grid(True, alpha=0.3)
    plt.legend()

plt.tight_layout()
plt.show()

# Print each function's output on a handful of integer inputs
test_input = np.array([-2, -1, 0, 1, 2])
print("Activation Function Properties:")
for name, func in functions:
    output = func(test_input)
    print(f"{name:8}: {output}")

## 3. Feedforward Neural Networks

In [None]:
class FeedforwardNetwork:
    """Fully connected feedforward classifier with a softmax output layer.

    Hidden layers use the activation chosen at construction time; the last
    layer always applies softmax. ``backward`` performs one full-batch
    gradient-descent step on the cross-entropy loss.
    """

    _SUPPORTED_ACTIVATIONS = ('sigmoid', 'tanh', 'relu')

    def __init__(self, layer_sizes: List[int], activation: str = 'sigmoid'):
        """Create the network and randomly initialise its parameters.

        Args:
            layer_sizes: Units per layer, input layer first, output layer last.
            activation: Hidden-layer activation: 'sigmoid', 'tanh' or 'relu'.

        Raises:
            ValueError: If ``activation`` is not one of the supported names.
        """
        # Fail early: previously an unknown name left self.activation unset
        # and only surfaced as an AttributeError during the first forward pass.
        if activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self._SUPPORTED_ACTIVATIONS}"
            )

        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes)

        # Initialize weights and biases
        self.weights = []
        self.biases = []

        for i in range(self.num_layers - 1):
            # He initialization: standard normal scaled by sqrt(2 / fan_in)
            w = np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * np.sqrt(2.0 / layer_sizes[i])
            b = np.zeros((1, layer_sizes[i + 1]))
            self.weights.append(w)
            self.biases.append(b)

        # Hidden-layer activation and its derivative, looked up by name
        self.activations = ActivationFunctions()
        self.activation = getattr(self.activations, activation)
        self.activation_derivative = getattr(self.activations, f"{activation}_derivative")

    def _forward_with_cache(self, X: np.ndarray) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Run the forward pass, keeping both activations and pre-activations.

        Returns:
            ``(activations, pre_activations)`` where ``activations[0]`` is X
            and ``activations[k + 1]`` was computed from ``pre_activations[k]``.
        """
        activations = [X]
        pre_activations = []
        output_index = self.num_layers - 2

        for i in range(self.num_layers - 1):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            pre_activations.append(z)

            # Softmax on the output layer, the chosen activation elsewhere
            if i == output_index:
                a = self.activations.softmax(z)
            else:
                a = self.activation(z)

            activations.append(a)

        return activations, pre_activations

    def forward(self, X: np.ndarray) -> Tuple[np.ndarray, List[np.ndarray]]:
        """Forward pass through the network.

        Returns:
            ``(output, activations)``: the softmax output of the last layer and
            the list of every layer's activation, starting with the input X.
        """
        activations, _ = self._forward_with_cache(X)
        return activations[-1], activations

    def backward(self, X: np.ndarray, y: np.ndarray, learning_rate: float = 0.01) -> float:
        """Run one backpropagation step and update the parameters in place.

        Args:
            X: Input batch, one sample per row.
            y: One-hot targets with the same shape as the network output.
            learning_rate: Gradient-descent step size.

        Returns:
            The mean cross-entropy loss measured before the update.
        """
        m = X.shape[0]

        activations, pre_activations = self._forward_with_cache(X)
        output = activations[-1]

        # Cross-entropy; the small constant guards against log(0)
        loss = -np.mean(np.sum(y * np.log(output + 1e-15), axis=1))

        # Softmax + cross-entropy gives this simple output-layer error
        delta = output - y
        deltas = [delta]

        # Propagate the error back through the hidden layers. The activation
        # derivative must be evaluated at the PRE-activation z that produced
        # activations[i], i.e. pre_activations[i - 1]; evaluating it at the
        # activation itself yields wrong gradients for sigmoid and tanh.
        for i in range(self.num_layers - 2, 0, -1):
            delta = np.dot(delta, self.weights[i].T) * self.activation_derivative(pre_activations[i - 1])
            deltas.append(delta)
        deltas.reverse()  # deltas[i] now belongs to weights[i] / biases[i]

        # Gradient-descent update, averaged over the batch
        for i in range(self.num_layers - 1):
            self.weights[i] -= learning_rate * np.dot(activations[i].T, deltas[i]) / m
            self.biases[i] -= learning_rate * np.mean(deltas[i], axis=0, keepdims=True)

        return loss

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the index of the highest-probability class for each row of X."""
        output, _ = self.forward(X)
        return np.argmax(output, axis=1)

# Create a simple classification dataset
def create_spiral_dataset(n_points: int = 100, n_classes: int = 3) -> Tuple[np.ndarray, np.ndarray]:
    """Build a 2-D dataset of noisy spiral arms, one arm per class.

    Returns:
        ``(X, y)``: X holds ``n_points * n_classes`` rows of 2-D coordinates
        grouped by class; y holds the matching integer class labels.
    """
    total = n_points * n_classes
    X = np.zeros((total, 2))
    y = np.zeros(total, dtype=int)

    # Every arm uses the same radii, growing from the origin out to 1
    radius = np.linspace(0.0, 1, n_points)

    for class_num in range(n_classes):
        start = n_points * class_num
        rows = slice(start, start + n_points)

        # Each class sweeps its own 4-unit angle interval, jittered with noise
        angle = np.linspace(class_num * 4, (class_num + 1) * 4, n_points)
        angle = angle + np.random.randn(n_points) * 0.2

        X[rows] = np.column_stack((radius * np.sin(angle), radius * np.cos(angle)))
        y[rows] = class_num

    return X, y

# Three spiral arms with 50 points each
X, y = create_spiral_dataset(n_points=50, n_classes=3)

# One-hot targets: row k of the 3x3 identity matrix encodes class k
y_onehot = np.eye(3)[y]

# 2 inputs -> two hidden ReLU layers of 10 units -> 3 output classes
network = FeedforwardNetwork([2, 10, 10, 3], activation='relu')

# Full-batch training, recording the loss of every epoch
epochs = 1000
losses = []

for epoch in range(epochs):
    loss = network.backward(X, y_onehot, learning_rate=0.1)
    losses.append(loss)

    if epoch % 100 != 0:
        continue

    # Progress report on every 100th epoch
    predictions = network.predict(X)
    accuracy = np.mean(predictions == y)
    print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

# Classify a dense grid (step h) covering the data plus a margin of 1
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = network.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure(figsize=(10, 4))

# Left panel: loss curve over the training epochs
plt.subplot(1, 2, 1)
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)

# Right panel: predicted class regions with the training points on top
plt.subplot(1, 2, 2)
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='black')
plt.title('Decision Boundary')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.tight_layout()
plt.show()

# Accuracy on the training set after the last epoch
final_accuracy = np.mean(network.predict(X) == y)
print(f"\nFinal Accuracy: {final_accuracy:.4f}")

## 4. Loss Functions

In [None]:
class LossFunctions:
    """Stateless collection of regression and classification losses."""

    @staticmethod
    def mean_squared_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Average of the squared differences between targets and predictions."""
        residual = y_true - y_pred
        return np.mean(residual ** 2)

    @staticmethod
    def cross_entropy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Categorical cross-entropy, summed over axis 1 and averaged over rows."""
        # Keep probabilities inside [1e-15, 1 - 1e-15] so log() stays finite
        safe_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        per_sample = np.sum(y_true * np.log(safe_pred), axis=1)
        return -np.mean(per_sample)

    @staticmethod
    def binary_cross_entropy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Binary cross-entropy averaged over all elements."""
        # Keep probabilities inside [1e-15, 1 - 1e-15] so log() stays finite
        safe_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        positive_term = y_true * np.log(safe_pred)
        negative_term = (1 - y_true) * np.log(1 - safe_pred)
        return -np.mean(positive_term + negative_term)

    @staticmethod
    def huber_loss(y_true: np.ndarray, y_pred: np.ndarray, delta: float = 1.0) -> float:
        """Huber loss: quadratic for |error| <= delta, linear beyond it."""
        error = y_true - y_pred
        magnitude = np.abs(error)
        quadratic = 0.5 * error ** 2
        linear = delta * magnitude - 0.5 * delta ** 2
        return np.mean(np.where(magnitude <= delta, quadratic, linear))

# Binary labels with one confident-and-right and one confident-and-wrong forecast
y_true = np.array([0, 1, 1, 0, 1])
y_pred_good = np.array([0.1, 0.9, 0.8, 0.2, 0.9])
y_pred_bad = np.array([0.9, 0.1, 0.2, 0.8, 0.1])

loss_funcs = LossFunctions()

print("Loss Function Comparison:")
print(f"Good predictions: {y_pred_good}")
print(f"Bad predictions:  {y_pred_bad}")
print(f"True labels:      {y_true}")
print()

# Score both forecasts with binary cross-entropy ...
bce_good, bce_bad = (
    loss_funcs.binary_cross_entropy(y_true, forecast)
    for forecast in (y_pred_good, y_pred_bad)
)
print(f"Binary Cross-Entropy - Good: {bce_good:.4f}, Bad: {bce_bad:.4f}")

# ... and with mean squared error
mse_good, mse_bad = (
    loss_funcs.mean_squared_error(y_true, forecast)
    for forecast in (y_pred_good, y_pred_bad)
)
print(f"Mean Squared Error   - Good: {mse_good:.4f}, Bad: {mse_bad:.4f}")

## 5. Optimization Algorithms

In [None]:
class Optimizers:
    """Namespace grouping simple gradient-based parameter update rules."""

    class SGD:
        """Gradient descent with an optional momentum term."""

        def __init__(self, learning_rate: float = 0.01, momentum: float = 0.0):
            self.learning_rate = learning_rate
            self.momentum = momentum
            self.velocity = None  # created on the first call to update()

        def update(self, params: np.ndarray, gradients: np.ndarray) -> np.ndarray:
            """Return the updated parameters and refresh the stored velocity."""
            if self.velocity is None:
                previous = np.zeros_like(params)
            else:
                previous = self.velocity

            # Decayed previous velocity minus the scaled gradient
            self.velocity = self.momentum * previous - self.learning_rate * gradients
            return params + self.velocity

    class Adam:
        """Adam: per-element steps from running first and second moments."""

        def __init__(self, learning_rate: float = 0.001, beta1: float = 0.9,
                     beta2: float = 0.999, epsilon: float = 1e-8):
            self.learning_rate = learning_rate
            self.beta1 = beta1
            self.beta2 = beta2
            self.epsilon = epsilon
            self.t = 0     # number of updates performed so far
            self.m = None  # running mean of the gradients
            self.v = None  # running mean of the squared gradients

        def update(self, params: np.ndarray, gradients: np.ndarray) -> np.ndarray:
            """Return the updated parameters and advance the moment estimates."""
            if self.m is None:
                self.m, self.v = np.zeros_like(params), np.zeros_like(params)

            self.t += 1
            beta1, beta2 = self.beta1, self.beta2

            # Exponential moving averages of the gradient and its square
            self.m = beta1 * self.m + (1 - beta1) * gradients
            self.v = beta2 * self.v + (1 - beta2) * (gradients ** 2)

            # Both averages start at zero; dividing by (1 - beta^t) corrects that bias
            m_hat = self.m / (1 - beta1 ** self.t)
            v_hat = self.v / (1 - beta2 ** self.t)

            denominator = np.sqrt(v_hat) + self.epsilon
            return params - self.learning_rate * m_hat / denominator

# Demonstrate optimization on a simple quadratic function
def quadratic_function(x: np.ndarray) -> float:
    """Evaluate f(x) = x^2 + 2x + 1 element-wise and sum over all elements."""
    squared = x**2
    linear = 2*x
    return np.sum(squared + linear + 1)

def quadratic_gradient(x: np.ndarray) -> np.ndarray:
    """Element-wise derivative of x^2 + 2x + 1, i.e. f'(x) = 2x + 2."""
    doubled = 2*x
    return doubled + 2

# Compare optimizers
def compare_optimizers(initial_x: np.ndarray, steps: int = 100):
    """Minimise the quadratic with SGD, SGD + momentum and Adam.

    Every optimizer starts from its own copy of ``initial_x`` and uses a
    learning rate of 0.1.

    Returns:
        Dict mapping each optimizer's label to the list of function values
        recorded after each of its ``steps`` updates.
    """
    contenders = {
        'SGD': Optimizers.SGD(learning_rate=0.1),
        'SGD + Momentum': Optimizers.SGD(learning_rate=0.1, momentum=0.9),
        'Adam': Optimizers.Adam(learning_rate=0.1),
    }

    histories = {}

    # The runs are independent, so each optimizer is driven to completion in turn
    for label, optimizer in contenders.items():
        position = initial_x.copy()
        trace = []

        for _ in range(steps):
            position = optimizer.update(position, quadratic_gradient(position))
            trace.append(quadratic_function(position))

        histories[label] = trace

    return histories

# Start every optimizer at x = 5 and take 50 steps
initial_point = np.array([5.0])
histories = compare_optimizers(initial_point, steps=50)

# One curve per optimizer; log scale on the y-axis
plt.figure(figsize=(10, 6))
for name in histories:
    plt.plot(histories[name], label=name, linewidth=2)

plt.title('Optimizer Comparison on Quadratic Function')
plt.xlabel('Iteration')
plt.ylabel('Function Value')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

# Last recorded function value per optimizer, followed by the optimum
print("Final function values:")
for name, history in histories.items():
    print(f"{name:15}: {history[-1]:.6f}")
print(f"{'Optimal':15}: {0.000000:.6f}")

## 6. Gradient Descent Variants

In [None]:
def batch_gradient_descent(X: np.ndarray, y: np.ndarray, learning_rate: float = 0.01,
                          epochs: int = 100) -> Tuple[np.ndarray, List[float]]:
    """Batch gradient descent for linear regression.

    Args:
        X: Design matrix of shape (m, n), one sample per row.
        y: Targets, either a column of shape (m, 1) or a 1-D vector of length m.
        learning_rate: Step size of each update.
        epochs: Number of full passes (one update per pass).

    Returns:
        ``(theta, costs)``: the learned (n, 1) parameter column and the
        half-mean-squared-error cost recorded before each update.
    """
    m, n = X.shape

    # A 1-D y would broadcast `predictions - y` to (m, m) and make the in-place
    # theta update fail, so turn it into a column first.
    y = np.asarray(y)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    theta = np.random.randn(n, 1) * 0.01
    costs = []

    for _ in range(epochs):
        # Forward pass: residuals of the current linear fit
        residuals = X.dot(theta) - y
        costs.append(np.mean(residuals ** 2) / 2)

        # Backward pass: gradient averaged over all m samples
        gradients = X.T.dot(residuals) / m
        theta -= learning_rate * gradients

    return theta, costs

def stochastic_gradient_descent(X: np.ndarray, y: np.ndarray, learning_rate: float = 0.01,
                               epochs: int = 100) -> Tuple[np.ndarray, List[float]]:
    """Fit linear regression by updating the parameters one sample at a time.

    Each epoch visits all m samples once, in a freshly shuffled order.

    Returns:
        ``(theta, costs)``: the learned (n, 1) parameter column and, per
        epoch, the mean of the half-squared errors seen during that epoch.
    """
    m, n = X.shape
    theta = np.random.randn(n, 1) * 0.01
    costs = []

    for _ in range(epochs):
        running_cost = 0

        # New random visiting order for this epoch
        for idx in np.random.permutation(m):
            # Length-1 slices keep the sample and target as arrays, not scalars
            sample = X[idx:idx + 1]
            target = y[idx:idx + 1]

            # Forward pass: error of the current fit on this one sample
            residual = sample.dot(theta) - target
            running_cost += (residual ** 2 / 2)[0, 0]

            # Backward pass: immediate update from the single-sample gradient
            theta -= learning_rate * sample.T.dot(residual)

        costs.append(running_cost / m)

    return theta, costs

def mini_batch_gradient_descent(X: np.ndarray, y: np.ndarray, batch_size: int = 32,
                               learning_rate: float = 0.01, epochs: int = 100) -> Tuple[np.ndarray, List[float]]:
    """Mini-batch gradient descent for linear regression.

    Args:
        X: Design matrix of shape (m, n), one sample per row.
        y: Targets, either a column of shape (m, 1) or a 1-D vector of length m.
        batch_size: Samples per update; the last batch of an epoch may be smaller.
        learning_rate: Step size of each update.
        epochs: Number of shuffled passes over the data.

    Returns:
        ``(theta, costs)``: the learned (n, 1) parameter column and, per
        epoch, the average of the batch costs (half mean squared error).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    m, n = X.shape

    # A 1-D y would broadcast `predictions - y_batch` to (b, b) and make the
    # in-place theta update fail, so turn it into a column first.
    y = np.asarray(y)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    theta = np.random.randn(n, 1) * 0.01
    costs = []

    for _ in range(epochs):
        epoch_cost = 0
        num_batches = 0

        # Shuffle once per epoch, then walk the permutation in batch-sized slices
        indices = np.random.permutation(m)

        for start in range(0, m, batch_size):
            batch_indices = indices[start:start + batch_size]
            X_batch = X[batch_indices]
            y_batch = y[batch_indices]

            # Forward pass on this batch
            residuals = X_batch.dot(theta) - y_batch
            epoch_cost += np.mean(residuals ** 2) / 2
            num_batches += 1

            # Backward pass: gradient averaged over the actual batch length
            gradients = X_batch.T.dot(residuals) / len(X_batch)
            theta -= learning_rate * gradients

        costs.append(epoch_cost / num_batches)

    return theta, costs

# Generate synthetic linear regression data
np.random.seed(42)
m = 1000  # Number of samples
X = np.random.randn(m, 1)
y = 4 + 3 * X + np.random.randn(m, 1) * 0.5  # y = 4 + 3x + noise

# Add bias term (a leading column of ones)
X_with_bias = np.c_[np.ones((m, 1)), X]

# Compare gradient descent variants
methods = {
    'Batch GD': lambda: batch_gradient_descent(X_with_bias, y, learning_rate=0.1, epochs=100),
    'Stochastic GD': lambda: stochastic_gradient_descent(X_with_bias, y, learning_rate=0.01, epochs=100),
    'Mini-batch GD': lambda: mini_batch_gradient_descent(X_with_bias, y, batch_size=32, learning_rate=0.1, epochs=100)
}

# Train every variant exactly once and reuse the result for both the plot and
# the printed summary. Training again for the printout would double the work
# and, for the randomly shuffled variants, report parameters from a different
# run than the one shown in the plot titles.
results = {name: method() for name, method in methods.items()}

plt.figure(figsize=(12, 4))

for i, (name, (theta, costs)) in enumerate(results.items(), 1):
    plt.subplot(1, 3, i)
    plt.plot(costs, linewidth=2)
    plt.title(f'{name}\nFinal θ = [{theta[0,0]:.2f}, {theta[1,0]:.2f}]')
    plt.xlabel('Epoch')
    plt.ylabel('Cost')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("True parameters: θ₀ = 4.00, θ₁ = 3.00")
print("\nLearned parameters:")
for name, (theta, _) in results.items():
    print(f"{name:15}: θ₀ = {theta[0,0]:.2f}, θ₁ = {theta[1,0]:.2f}")

## 7. Exercises

Try these exercises to deepen your understanding:

1. **Custom Activation Function**: Implement the Swish activation function (x * sigmoid(x)) and compare its performance with ReLU on the spiral dataset.

2. **Learning Rate Scheduling**: Implement different learning rate schedules (exponential decay, cosine annealing) and observe their effects on training.

3. **Regularization**: Add L1 and L2 regularization to the feedforward network and study their effects on overfitting.

4. **Batch Normalization**: Implement a simplified version of batch normalization and observe its effects on training stability.

5. **Universal Approximation**: Experiment with different network architectures to approximate complex functions like sin(x) or x².

## Summary

In this notebook, we covered the fundamental neural network concepts:

- **Perceptrons**: The basic building blocks of neural networks
- **Activation Functions**: Non-linear functions that enable learning complex patterns
- **Feedforward Networks**: Multi-layer networks for classification and regression
- **Loss Functions**: Measures of prediction error for different tasks
- **Backpropagation**: The algorithm for computing gradients
- **Optimization**: Different algorithms for updating network parameters
- **Gradient Descent Variants**: Batch, stochastic, and mini-batch approaches

These concepts form the foundation for understanding more complex architectures like RNNs and Transformers, which we'll explore in subsequent notebooks. The principles of forward propagation, backpropagation, and optimization remain consistent across all neural network architectures.