In [29]:
import numpy as np

def softmax(z, axis=1):
    """Compute a numerically stable softmax along one axis.

    Parameters
    ----------
    z : array_like
        Scores (logits). With the default ``axis=1`` the input must have at
        least two dimensions; each slice along ``axis`` is normalised
        independently.
    axis : int, optional
        Axis along which the outputs sum to 1. Defaults to 1, which matches
        the behaviour this function had when the axis was hard-coded. Pass
        ``axis=0`` or ``axis=-1`` to normalise a 1-D score vector.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``z``; entries are non-negative and sum
        to 1 along ``axis``.
    """
    # Subtracting the per-slice maximum does not change the result
    # mathematically, but it keeps np.exp from overflowing on large scores.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)

In [31]:
def cross_entropy_loss(X, y_onehot, theta):
    """Return the mean categorical cross-entropy of the softmax model.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``X.shape[0]`` is used as the number of samples to
        average over. Assumed to be 2-D (samples x features) -- TODO confirm
        against callers.
    y_onehot : numpy.ndarray
        One-hot targets, multiplied element-wise with the log-probabilities,
        so it must broadcast against ``softmax(X @ theta)``.
    theta : numpy.ndarray
        Weight matrix such that ``X @ theta`` yields one score per class.

    Returns
    -------
    float
        ``-(1/m) * sum(y_onehot * log(p))`` where ``p`` are the clipped
        predicted probabilities.
    """
    n_samples = X.shape[0]
    probs = softmax(X @ theta)
    # Keep probabilities strictly inside (0, 1) so the logarithm stays finite.
    eps = 1e-15
    log_probs = np.log(np.clip(probs, eps, 1 - eps))
    weighted_sum = np.sum(y_onehot * log_probs)
    return -(1 / n_samples) * weighted_sum

In [33]:
def cross_entropy_gradient(X, y_onehot, theta):
    """Return ``(1/m) * X.T @ (softmax(X @ theta) - y_onehot)``.

    This is the gradient of the mean softmax cross-entropy with respect to
    ``theta`` (without the probability clipping applied in the loss).

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``X.shape[0]`` is the number of samples ``m``.
    y_onehot : numpy.ndarray
        One-hot targets with the same shape as ``softmax(X @ theta)``.
    theta : numpy.ndarray
        Current weight matrix.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``X.T @ (X @ theta)``, i.e. the same shape as
        ``theta`` for 2-D inputs.
    """
    n_samples = X.shape[0]
    residual = softmax(X @ theta) - y_onehot
    # Scale the transposed features first, then multiply: the same evaluation
    # order as writing (1 / m) * X.T @ residual on one line.
    scaled_features_t = (1 / n_samples) * X.T
    return scaled_features_t @ residual

In [35]:
def one_encoding(y, num_classes=None):
    """One-hot encode an array of integer class labels.

    Parameters
    ----------
    y : array_like of int
        Class labels in ``0 .. num_classes - 1``.
    num_classes : int, optional
        Number of columns in the encoding. When omitted it is inferred as
        ``max(y) + 1``, which requires ``y`` to be non-empty.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``y.shape + (num_classes,)`` with a single 1.0
        per label at the label's index.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``num_classes`` is not given, or if any label
        is negative.
    IndexError
        Raised by NumPy when a label is ``>= num_classes``.
    """
    labels = np.asarray(y)

    if labels.size == 0:
        if num_classes is None:
            raise ValueError(
                "cannot infer num_classes from an empty label array; "
                "pass num_classes explicitly"
            )
        # Nothing to index with: return the empty encoding directly.
        return np.zeros(labels.shape + (num_classes,))

    # Negative indices would silently wrap around in the lookup below
    # (e.g. -1 selects the last class), producing a wrong encoding.
    if np.min(labels) < 0:
        raise ValueError("class labels must be non-negative integers")

    if num_classes is None:
        num_classes = np.max(labels) + 1

    # Row i of the identity matrix is the one-hot vector for class i.
    return np.eye(num_classes)[labels]

In [37]:
def gradient_descent(X, y, lr=1e-2, n_steps=1000):
    """Fit softmax-regression weights with fixed-step batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix (its shape is unpacked into exactly two values).
    y : array_like of int
        Class labels; one-hot encoded via ``one_encoding`` with the number of
        classes inferred from the labels.
    lr : float, optional
        Step size multiplied into each gradient.
    n_steps : int, optional
        Number of full-batch updates to perform.

    Returns
    -------
    numpy.ndarray
        Weight matrix of shape ``(n_features, n_classes)``, starting from all
        zeros and updated ``n_steps`` times.
    """
    targets = one_encoding(y)
    _, n_features = X.shape
    n_classes = targets.shape[1]

    # Start from the all-zero weight matrix.
    weights = np.zeros((n_features, n_classes))

    for _step in range(n_steps):
        weights -= lr * cross_entropy_gradient(X, targets, weights)

    return weights

In [39]:
def predict(X, theta):
    """Return the most probable class index for each row of ``X``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix compatible with ``X @ theta``.
    theta : numpy.ndarray
        Weight matrix, e.g. as returned by ``gradient_descent``.

    Returns
    -------
    numpy.ndarray
        Integer array holding, per row, the column index of the largest
        softmax probability.
    """
    class_probs = softmax(X @ theta)
    return class_probs.argmax(axis=1)

In [45]:
# Example data: every row is [1.0, value]. The constant first column is
# presumably the intercept/bias feature -- confirm with the author.
X = np.array([[1.0, value] for value in (2.0, 3.0, 5.0, 2.5)])
y = np.array([0, 1, 2, 1])  # 3 classes

# Train model (n_steps is left at its default)
theta = gradient_descent(X, y, lr=0.2)

# Predict on the same rows the model was trained on
y_pred = predict(X, theta)
print("Predictions:", y_pred)

Predictions: [0 1 2 1]
