In [6]:
import optuna
import torch
import pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess_agnews(csv_file):
    '''Load and preprocess AG News data'''
    df = pd.read_csv(csv_file, header = None)
    df = df[df[0] !=1]
    df['label'] = df[0] - 2
    df['text'] = [" ".join((title, body)) for title, body in zip(df[1], df[2])]
    return df[['text', 'label']]

class TextDataset(Dataset):
    '''Class for creating the vectoriser and features'''
    def __init__(self, dataframe, max_features=1500, vectoriser=None):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        
        # TF-IDF: fit on training, reuse for testing
        if vectoriser is None:
            # Training: create and fit
            self.vectoriser = TfidfVectorizer(max_features=max_features, stop_words='english')
            self.features = self.vectoriser.fit_transform(self.texts)
        else:
            # Testing: use provided vectoriser
            self.vectoriser = vectoriser
            self.features = self.vectoriser.transform(self.texts)
        
        # Convert sparse matrix to dense tensor (TF-IDF returns sparse)
        self.features = torch.tensor(self.features.toarray(), dtype=torch.float32)
        self.labels = torch.tensor(self.labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # Just return pre-computed vectors
        return self.features[idx], self.labels[idx]

def create_model(trial):
    """Let Optuna choose the architecture."""
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []
    
    in_features = 1500  # Your TF-IDF features
    for i in range(n_layers):
        out_features = trial.suggest_int(f"n_units_l{i}", 64, 512)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        in_features = out_features
    
    layers.append(nn.Linear(in_features, 3))  # Output layer
    return nn.Sequential(*layers)


def objective(trial):
    """Optuna calls this to evaluate each configuration."""
    
    # Hyperparameters to search
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    optimizer_name = trial.suggest_categorical("optimizer", ["SGD", "Adam", "RMSprop"])
    
    # Create model with suggested architecture
    model = create_model(trial)
    
    # Create dataloaders with suggested batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Select optimizer
    if optimizer_name == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    elif optimizer_name == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
    
    loss_fn = nn.CrossEntropyLoss()
    
    # Train for a few epochs
    for epoch in range(5):
        model.train()
        for X, y in train_loader:
            optimizer.zero_grad()
            loss = loss_fn(model(X), y)
            loss.backward()
            optimizer.step()
    
    # Evaluate and return accuracy (Optuna maximizes this)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X, y in test_loader:
            pred = model(X).argmax(1)
            correct += (pred == y).sum().item()
            total += len(y)
    
    return correct / total  # Return accuracy

# Dataset
train_df = preprocess_agnews('../W2_feedfoward_neural_networks/agnews_train.csv')
test_df = preprocess_agnews('../W2_feedfoward_neural_networks/agnews_test.csv')

train_dataset = TextDataset(train_df, max_features=1500)
test_dataset = TextDataset(test_df, vectoriser=train_dataset.vectoriser)  # Passing the fitted vectoriser

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Run the search
study = optuna.create_study(direction="maximize")  # Maximize accuracy
study.optimize(objective, n_trials=50)  # Try 50 configurations

# Best results
print(f"Best accuracy: {study.best_value}")
print(f"Best hyperparameters: {study.best_params}")

[32m[I 2026-01-30 10:48:37,173][0m A new study created in memory with name: no-name-f4e988a7-05b1-4e87-8003-9fe3525af864[0m
[32m[I 2026-01-30 10:48:42,285][0m Trial 0 finished with value: 0.905438596491228 and parameters: {'lr': 0.03537734821644287, 'batch_size': 128, 'optimizer': 'Adam', 'n_layers': 2, 'n_units_l0': 70, 'n_units_l1': 365}. Best is trial 0 with value: 0.905438596491228.[0m
[32m[I 2026-01-30 10:48:52,260][0m Trial 1 finished with value: 0.9007017543859649 and parameters: {'lr': 0.04249611927659274, 'batch_size': 32, 'optimizer': 'RMSprop', 'n_layers': 1, 'n_units_l0': 223}. Best is trial 0 with value: 0.905438596491228.[0m
[32m[I 2026-01-30 10:48:57,635][0m Trial 2 finished with value: 0.3385964912280702 and parameters: {'lr': 0.0012419208707766783, 'batch_size': 128, 'optimizer': 'SGD', 'n_layers': 2, 'n_units_l0': 277, 'n_units_l1': 339}. Best is trial 0 with value: 0.905438596491228.[0m
[32m[I 2026-01-30 10:49:02,901][0m Trial 3 finished with value: 0.9

Best accuracy: 0.92
Best hyperparameters: {'lr': 0.0014122902270844123, 'batch_size': 128, 'optimizer': 'RMSprop', 'n_layers': 1, 'n_units_l0': 347}


In [7]:
# Get best params
best_params = study.best_params
print(best_params)
# {'n_layers': 2, 'n_units_l0': 256, 'n_units_l1': 128, 'lr': 0.003, 'batch_size': 64, 'optimizer': 'Adam'}

# Build final model with best architecture
# Train for more epochs with best hyperparameters

{'lr': 0.0014122902270844123, 'batch_size': 128, 'optimizer': 'RMSprop', 'n_layers': 1, 'n_units_l0': 347}
