# Mixture of Experts Approach

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Toy regression set: six evenly spaced inputs with targets on the line y = 2x + 1
x_data = np.arange(6)  # Inputs 0, 1, ..., 5
y_data = 2 * x_data + 1  # Targets 1, 3, ..., 11

# How many experts the mixture contains
num_experts = 3

In [None]:
# Expert parameters: one row per expert holding [weight for x, bias weight],
# drawn from a standard normal -> shape (num_experts, 2)
experts = np.random.standard_normal((num_experts, 2))

# Gating parameters: a single logit per expert, also standard-normal
gate_weights = np.random.standard_normal(num_experts)

# Training hyperparameters
lr = 0.01  # Step size for the gradient updates
epochs = 100  # Number of passes over the data

# Average loss recorded once per epoch
loss_history = []

In [None]:
def softmax(z):
    """Return the softmax of `z`, normalised over all of its elements.

    The maximum of `z` is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    shifted = z - np.max(z)
    unnormalised = np.exp(shifted)
    return unnormalised / np.sum(unnormalised)

In [None]:
# Full-batch gradient descent on the mixture-of-experts model
#
#     y_pred = sum_j p_j * e_j,   p = softmax(gate_weights),   e_j = experts[j] . [x, 1]
#
# with the mean squared error over the whole data set as the loss.

# Design matrix with a constant bias column, built once: shape (n_samples, 2)
inputs = np.column_stack([x_data, np.ones(len(x_data))])
n_samples = len(x_data)

for epoch in range(epochs):
    # Forward pass -----------------------------------------------------------
    # The gate does not depend on x, so its probabilities are the same for
    # every sample and only need to be computed once per epoch.
    gate_probs = softmax(gate_weights)  # Shape: (num_experts,)

    # Linear prediction of every expert for every sample
    expert_outputs = inputs @ experts.T  # Shape: (n_samples, num_experts)

    # Mixture output: gate-weighted sum of the expert predictions
    y_pred = expert_outputs @ gate_probs  # Shape: (n_samples,)

    residual = y_pred - y_data  # Shape: (n_samples,)
    mean_loss = np.mean(residual ** 2)

    # Backward pass (gradients averaged over the samples) ---------------------
    # d loss / d experts[j] = 2 * residual * p_j * [x, 1]
    grad_experts = 2 * np.outer(gate_probs, residual @ inputs) / n_samples

    # d loss / d gate_weights[j] = 2 * residual * p_j * (e_j - y_pred).
    # This follows from the softmax Jacobian d p_k / d g_j = p_k * (delta_kj - p_j):
    # summing e_k * p_k * (delta_kj - p_j) over k gives p_j * (e_j - y_pred).
    # No additional (1 - p_j) factor belongs here.
    centred_outputs = expert_outputs - y_pred[:, np.newaxis]
    grad_gate_weights = 2 * gate_probs * (residual @ centred_outputs) / n_samples

    # Parameter update ---------------------------------------------------------
    experts -= lr * grad_experts
    gate_weights -= lr * grad_gate_weights

    # Store average loss per epoch
    loss_history.append(mean_loss)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")

In [None]:
# Plot the recorded average loss per epoch.
# The x-axis is derived from loss_history itself rather than from `epochs`:
# if the training cell has been run more than once, loss_history holds more
# than `epochs` entries and plotting against range(epochs) would fail with
# mismatched x/y lengths.
plt.plot(range(len(loss_history)), loss_history, label="Loss")
plt.xlabel("Epochs")
plt.ylabel("MSE Loss")
plt.title("Loss Over Epochs")
plt.legend()
plt.show()