In [3]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# ==============================================================================
# STEP 1: CREATE AND PREPARE THE DATA
# ==============================================================================

# Generate a reproducible synthetic classification problem:
# 5000 samples, 10 features (6 of them informative), 3 target classes.
X, y = make_classification(
    n_samples=5000,
    n_features=10,
    n_classes=3,
    n_informative=6,
    random_state=42,
)

# One-hot encode the integer labels by picking rows out of an identity matrix.
# Example: y = [0, 1, 2] becomes [[1,0,0], [0,1,0], [0,0,1]]
num_samples = y.size
num_classes = y.max() + 1
y_onehot = np.eye(num_classes)[y]

# First carve off 40% of the data, then halve that remainder:
# the result is a 60% / 20% / 20% train / validation / test split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_onehot, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"Training data: {len(X_train)} samples")
print(f"Validation data: {len(X_val)} samples")
print(f"Test data: {len(X_test)} samples")

# ==============================================================================
# STEP 2: DEFINE HELPER FUNCTIONS
# ==============================================================================

def softmax(raw_scores):
    """
    Convert raw scores to probabilities that sum to 1 across the classes.

    Formula: softmax(z_i) = exp(z_i) / sum_j(exp(z_j))

    The per-sample maximum is subtracted before exponentiating. This keeps
    exp() from overflowing on large scores and does not change the result,
    because the common factor cancels between numerator and denominator.

    Args:
        raw_scores: Array-like of scores. For input with two or more
            dimensions the classes are taken along axis 1 (one row per
            sample). A 1-D vector is treated as the scores of one sample.

    Returns:
        Array with the same shape as ``raw_scores`` holding the
        probabilities.
    """
    scores = np.asarray(raw_scores)

    # Classes live on axis 1 for batched input; a bare 1-D vector is a
    # single sample whose classes are on its only axis.
    class_axis = 1 if scores.ndim > 1 else 0

    # Shift so the largest score per sample is 0 (numerical stability trick).
    stable_scores = scores - np.max(scores, axis=class_axis, keepdims=True)

    exp_scores = np.exp(stable_scores)
    return exp_scores / np.sum(exp_scores, axis=class_axis, keepdims=True)
   

def calculate_loss(true_labels, predicted_probs):
    """
    Mean cross-entropy between one-hot labels and predicted probabilities.

    L = -(1/N) * sum(y * log(y_hat + 1e-9)), where N is the number of rows
    in ``true_labels``. The 1e-9 offset keeps log() away from log(0).
    A smaller value means the predictions are closer to the labels.
    """
    tiny = 1e-9  # guards against log(0) = -infinity
    log_probs = np.log(predicted_probs + tiny)

    # Only the log-probability of the true class survives the multiplication
    # when the labels are one-hot.
    summed = np.sum(true_labels * log_probs)
    return -summed / true_labels.shape[0]

def calculate_accuracy(true_labels, predicted_probs):
    """Return the fraction (0 to 1) of rows whose predicted class is correct.

    Accuracy = (1/N) * sum(I(y_i == y_hat_i)), where each class is the
    arg-max along axis 1 of the corresponding row.
    """
    hits = np.argmax(predicted_probs, axis=1) == np.argmax(true_labels, axis=1)
    return np.mean(hits)

# ==============================================================================
# STEP 3: INITIALIZE THE MODEL PARAMETERS
# ==============================================================================

# Our model is: predictions = softmax(X * weights + bias)
num_features = X_train.shape[1]  # one weight row per input feature
num_classes = y_train.shape[1]   # one weight column per one-hot class

# Draw the initial weights from a seeded generator so that training is
# reproducible from run to run, like the seeded dataset and splits.
# Scaling by 0.01 keeps the starting weights small.
rng = np.random.default_rng(42)
weights = rng.standard_normal((num_features, num_classes)) * 0.01

# Initialize bias to zero (shape (1, num_classes) so it broadcasts over rows)
bias = np.zeros((1, num_classes))

print(f"Model initialized with {num_features} features and {num_classes} classes")

# ==============================================================================
# STEP 4: SET TRAINING HYPERPARAMETERS
# ==============================================================================

# Optimisation settings
learning_rate = 0.1   # step size used for each gradient-descent update
max_epochs = 1000     # upper bound on the number of training iterations
patience = 10         # epochs without validation improvement before stopping

# Early-stopping bookkeeping: the lowest validation loss seen so far, how many
# epochs have passed since it last improved, and the parameters at that point.
best_validation_loss = float("inf")
patience_counter = 0
best_weights = best_bias = None

# ==============================================================================
# STEP 5: TRAINING LOOP
# ==============================================================================

print("Starting training...")

# The training-set size never changes, so compute it once rather than on
# every epoch.
num_training_samples = X_train.shape[0]

for epoch in range(max_epochs):

    # === FORWARD PASS: Make predictions ===
    # Raw scores: X * weights + bias, then softmax to get probabilities
    raw_scores = np.dot(X_train, weights) + bias
    predicted_probs = softmax(raw_scores)

    # NOTE: the training loss (and the training accuracy printed below) are
    # measured with the parameters *before* this epoch's update, while the
    # validation metrics further down use the parameters *after* it.
    training_loss = calculate_loss(y_train, predicted_probs)

    # === BACKWARD PASS: Calculate gradients ===
    # For softmax + cross-entropy the gradient with respect to the raw
    # scores is simply (predicted - actual).
    error = predicted_probs - y_train

    # Gradient for weights: X^T * (predicted - actual) / num_samples
    weight_gradient = np.dot(X_train.T, error) / num_training_samples

    # Gradient for bias: average error across all samples
    bias_gradient = np.sum(error, axis=0, keepdims=True) / num_training_samples

    # === UPDATE PARAMETERS ===
    # Step in the opposite direction of the gradient
    weights = weights - learning_rate * weight_gradient
    bias = bias - learning_rate * bias_gradient

    # === VALIDATION CHECK ===
    # Evaluate the updated parameters on data not used for the gradients
    val_raw_scores = np.dot(X_val, weights) + bias
    val_predicted_probs = softmax(val_raw_scores)
    validation_loss = calculate_loss(y_val, val_predicted_probs)

    # === EARLY STOPPING LOGIC ===
    if validation_loss < best_validation_loss:
        # New best: snapshot the parameters. Copies are taken so the saved
        # arrays are independent of whatever `weights`/`bias` become later.
        best_validation_loss = validation_loss
        best_weights = weights.copy()
        best_bias = bias.copy()
        patience_counter = 0
    else:
        # No improvement this epoch; stop once that has happened
        # `patience` times in a row.
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch} - no improvement for {patience} epochs")
            break

    # Print progress every 50 epochs
    if epoch % 50 == 0:
        train_acc = calculate_accuracy(y_train, predicted_probs)
        val_acc = calculate_accuracy(y_val, val_predicted_probs)
        print(f"Epoch {epoch:4d} | Train Loss: {training_loss:.4f} | Train Acc: {train_acc:.4f} | "
              f"Val Loss: {validation_loss:.4f} | Val Acc: {val_acc:.4f}")

# ==============================================================================
# STEP 6: TEST THE FINAL MODEL
# ==============================================================================

print("\nTraining complete! Testing on unseen data...")

# Prefer the parameters snapshotted at the lowest validation loss. If no
# snapshot was ever taken (no epoch ran, or the validation loss was never
# below infinity, e.g. it was NaN), fall back to the current parameters
# instead of crashing on None.
final_weights = weights if best_weights is None else best_weights
final_bias = bias if best_bias is None else best_bias

test_raw_scores = np.dot(X_test, final_weights) + final_bias
test_predicted_probs = softmax(test_raw_scores)
test_accuracy = calculate_accuracy(y_test, test_predicted_probs)

print(f"Final Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.1f}%)")

Training data: 3000 samples
Validation data: 1000 samples
Test data: 1000 samples
Model initialized with 10 features and 3 classes
Starting training...
Epoch    0 | Train Loss: 1.0960 | Train Acc: 0.3877 | Val Loss: 1.0290 | Val Acc: 0.5960
Epoch   50 | Train Loss: 0.8325 | Train Acc: 0.6093 | Val Loss: 0.8199 | Val Acc: 0.6360
Epoch  100 | Train Loss: 0.8169 | Train Acc: 0.6277 | Val Loss: 0.8076 | Val Acc: 0.6450
Epoch  150 | Train Loss: 0.8135 | Train Acc: 0.6363 | Val Loss: 0.8054 | Val Acc: 0.6450
Epoch  200 | Train Loss: 0.8126 | Train Acc: 0.6390 | Val Loss: 0.8050 | Val Acc: 0.6440
Early stopping at epoch 220 - no improvement for 10 epochs

Training complete! Testing on unseen data...
Final Test Accuracy: 0.6410 (64.1%)
