# In [2]:
import numpy as np

class MLPClassifier:
    """Multi-class classifier built from a single sigmoid hidden layer.

    The forward pass is ``softmax(sigmoid(X @ W1 + b1) @ W2 + b2)``.
    Training is full-batch gradient descent: every epoch runs one forward
    pass and one parameter update over the whole training set, using the
    gradient of the mean softmax cross-entropy loss.

    Parameters
    ----------
    hidden_dim
        Number of units in the hidden layer.
    num_classes
        Number of output classes; labels index columns ``0..num_classes-1``.
    learning_rate
        Step size applied to every gradient update.
    num_epochs
        Number of full-batch updates performed by ``fit``.
    """

    def __init__(self, hidden_dim, num_classes, learning_rate=0.1, num_epochs=1000):
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def fit(self, X, y):
        """Initialise the parameters and train them on ``(X, y)``.

        Parameters
        ----------
        X
            2-D array-like of shape ``(num_samples, input_dim)``.
        y
            Integer class labels, one per row of ``X``.
        """
        # Accept any array-like; ``.shape`` and ``.T`` below need an ndarray.
        X = np.asarray(X)
        self.num_samples, self.input_dim = X.shape

        # Weights start from a standard normal draw, biases from zero.
        self.W1 = np.random.randn(self.input_dim, self.hidden_dim)
        self.b1 = np.zeros((1, self.hidden_dim))
        self.W2 = np.random.randn(self.hidden_dim, self.num_classes)
        self.b2 = np.zeros((1, self.num_classes))

        for _ in range(self.num_epochs):
            hidden_output, output_probs = self.forward_propagation(X)
            self.backward_propagation(X, hidden_output, output_probs, y)

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        _, output_probs = self.forward_propagation(X)
        return np.argmax(output_probs, axis=1)

    def forward_propagation(self, X):
        """Run the network on ``X``.

        Returns
        -------
        tuple
            ``(hidden_output, output_probs)``: the sigmoid activations of the
            hidden layer and the softmax class probabilities.
        """
        hidden_input = np.dot(X, self.W1) + self.b1
        hidden_output = self.sigmoid(hidden_input)

        output_input = np.dot(hidden_output, self.W2) + self.b2
        output_probs = self.softmax(output_input)

        return hidden_output, output_probs

    def backward_propagation(self, X, hidden_output, output_probs, y):
        """Apply one gradient-descent update to all weights and biases.

        Parameters
        ----------
        X
            Inputs that produced ``hidden_output`` and ``output_probs``.
        hidden_output
            Hidden-layer activations returned by ``forward_propagation``.
        output_probs
            Class probabilities returned by ``forward_propagation``. This
            array is left unmodified.
        y
            Integer class labels, one per row of ``X``. A column vector of
            shape ``(num_samples, 1)`` is flattened before use.
        """
        X = np.asarray(X)
        num_samples = X.shape[0]
        # Flatten so an (n, 1) label column cannot broadcast in the fancy
        # index below and touch an (n, n) block of entries.
        labels = np.asarray(y).reshape(-1)

        # Copy first: subtracting in place on ``output_probs`` itself would
        # overwrite the caller's probabilities with gradients.
        d_output = np.array(output_probs, dtype=float)
        d_output[np.arange(num_samples), labels] -= 1
        d_output /= num_samples

        dW2 = np.dot(hidden_output.T, d_output)
        db2 = np.sum(d_output, axis=0, keepdims=True)

        # Uses W2 from before this step's update.
        d_hidden = np.dot(d_output, self.W2.T) * self.sigmoid_derivative(hidden_output)

        dW1 = np.dot(X.T, d_hidden)
        db1 = np.sum(d_hidden, axis=0, keepdims=True)

        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1

    def sigmoid(self, x):
        """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

        Only ``exp`` of non-positive values is evaluated, so large-magnitude
        inputs cannot overflow.
        """
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def sigmoid_derivative(self, x):
        """Return the sigmoid derivative given the sigmoid *output* ``x``.

        ``x`` must already be an activation (``sigmoid(z)``), not the
        pre-activation ``z``.
        """
        return x * (1 - x)

    def softmax(self, x):
        """Return row-wise softmax probabilities for a 2-D score array.

        Each row is shifted by its maximum before exponentiating. This leaves
        the result unchanged mathematically but keeps ``exp`` from overflowing
        to ``inf`` (which would turn the row into ``nan``).
        """
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
