In [121]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the penguins dataset
df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")

# Drop the rows with missing values
df.dropna(inplace=True)

# Encode the species column to integers
label_encoder = LabelEncoder()
df['species'] = label_encoder.fit_transform(df['species'])

# Numeric feature columns fed to the model (named once instead of repeated)
feature_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# Split the data into training and testing sets BEFORE standardizing, so that
# no statistic of the test rows can influence the preprocessing
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df['species'], test_size=0.2, random_state=42)

# Standardize the numeric columns: mean/std are estimated on the training
# split only and then re-used unchanged for the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep df's numeric columns standardized as well (as before), now using the
# training-set statistics held by the fitted scaler
df[feature_cols] = scaler.transform(df[feature_cols])

# Add a bias column to X_train and X_test
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

# Convert the target variables to one-hot encoding.
# Indexing an identity matrix with the integer labels always yields
# n_classes float columns, even if a class is absent from one of the splits.
n_classes = len(label_encoder.classes_)
y_train_one_hot = np.eye(n_classes)[y_train.to_numpy()]
y_test_one_hot = np.eye(n_classes)[y_test.to_numpy()]


In [122]:
def softmax(X, theta):
    """Return the row-wise softmax of the linear scores ``X @ theta``.

    Parameters
    ----------
    X : 2-D array
        Input matrix; each row is one sample.
    theta : 2-D array
        Weight matrix; ``X @ theta`` must be a valid matrix product.

    Returns
    -------
    ndarray
        Array with the shape of ``X @ theta`` whose rows are non-negative
        and sum to 1.
    """
    logits = X @ theta
    # Softmax is unchanged by subtracting a per-row constant. Shifting by the
    # row maximum makes the largest exponent 0, so np.exp cannot overflow to
    # inf (which would otherwise give inf / inf = nan).
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)

def softmax_derivative(X, y, theta):
    """Gradient term ``X.T @ (softmax(X, theta) - y)`` averaged over the rows of X.

    The result has the same shape as ``theta`` and is what
    ``batch_gradient_descent`` subtracts (scaled by the learning rate)
    from the weights.
    """
    n_samples = X.shape[0]
    # Difference between predicted probabilities and the targets
    residual = softmax(X, theta) - y
    return X.T @ residual / n_samples


In [123]:
def batch_gradient_descent(X, y, alpha, epochs, early_stop_tol=None, random_state=None):
    """Fit softmax-regression weights with full-batch gradient descent.

    Parameters
    ----------
    X : array-like, 2-D
        Input matrix, one sample per row.
    y : array-like, 2-D
        Target matrix with one row per sample and one column per class
        (the loss is ``-sum(y * log(p)) / n_samples``).
    alpha : float
        Learning rate multiplying the gradient in each update.
    epochs : int
        Maximum number of weight updates.
    early_stop_tol : int or None, optional
        If given, training stops once the loss has failed to improve on the
        best loss seen for this many consecutive epochs, and the weights
        with the lowest loss are returned. If ``None``, all epochs are run
        and the final weights are returned.
    random_state : int or None, optional
        Seed for the random weight initialisation. ``None`` (default) draws
        from NumPy's global random state, exactly as before.

    Returns
    -------
    ndarray
        Weight matrix of shape ``(X.shape[1], y.shape[1])``.
    """
    # Work on float arrays so bool/uint8 one-hot targets behave uniformly
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Initialize the weights
    if random_state is None:
        theta = np.random.randn(X.shape[1], y.shape[1])
    else:
        theta = np.random.RandomState(random_state).randn(X.shape[1], y.shape[1])

    # Initialize variables for early stopping. best_theta starts as a copy of
    # the initial weights so the function never returns None, even when no
    # epoch runs or no epoch ever improves on best_loss.
    best_theta = theta.copy()
    best_loss = np.inf
    early_stop_count = 0

    # Smallest positive normal float: clipping probabilities to it keeps
    # log() finite, avoiding 0 * -inf = nan in the loss.
    min_prob = np.finfo(float).tiny

    # Iterate over epochs
    for i in range(epochs):
        # Compute the gradient
        grad = softmax_derivative(X, y, theta)

        # Update the weights
        theta -= alpha * grad

        # Compute the loss of the updated weights
        probs = np.clip(softmax(X, theta), min_prob, 1.0)
        loss = -np.sum(y * np.log(probs)) / X.shape[0]

        # Check for early stopping
        if early_stop_tol is not None:
            if loss < best_loss:
                # theta is updated in place, so keep a snapshot of the best one
                best_theta = theta.copy()
                best_loss = loss
                early_stop_count = 0
            else:
                early_stop_count += 1
                if early_stop_count >= early_stop_tol:
                    print(f"Stopping early after {i} epochs")
                    break

        # Print the loss every 100 epochs
        if i % 100 == 0:
            print(f"Epoch {i}: loss = {loss:.4f}")

    if early_stop_tol is not None:
        return best_theta
    else:
        return theta


In [124]:
# Seed NumPy's global random state: batch_gradient_descent draws its initial
# weights from it, so without a seed the loss curve and accuracy below would
# differ on every run
np.random.seed(42)

# Train the model
theta = batch_gradient_descent(X_train, y_train_one_hot, alpha=0.01, epochs=1000, early_stop_tol=10)

# Make predictions on the test set (class with the highest probability per row)
y_pred_one_hot = softmax(X_test, theta)
y_pred = np.argmax(y_pred_one_hot, axis=1)

# Print the accuracy on the test set.
# y_test is converted to a plain array so the comparison is explicitly
# positional rather than relying on how an ndarray compares with a Series.
accuracy = np.mean(y_pred == np.asarray(y_test))
print(f"Accuracy on test set: {accuracy:.2%}")


Epoch 0: loss = 0.5382
Epoch 100: loss = 0.3828
Epoch 200: loss = 0.3095
Epoch 300: loss = 0.2638
Epoch 400: loss = 0.2314
Epoch 500: loss = 0.2069
Epoch 600: loss = 0.1877
Epoch 700: loss = 0.1723
Epoch 800: loss = 0.1597
Epoch 900: loss = 0.1491
Accuracy on test set: 98.51%
