# Baseline Models

This notebook trains and evaluates baseline models (LSTM, CNN, BiLSTM+Attention).

## Objectives
- Train baseline models on the IMDB dataset
- Compare model performance
- Visualize training progress
- Analyze errors


In [None]:
import sys
import os
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np

# Make the parent of the current working directory importable so that the
# `src.*` imports below resolve. The membership check keeps sys.path free of
# duplicate entries when this cell is executed more than once.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data.dataset_loader import load_preprocessed_data
from src.data.preprocess import clean_text, create_vocabulary
from src.models.lstm_model import LSTMModel, BiLSTMWithAttention
from src.models.cnn_model import CNNModel
from src.utils.seed_everything import seed_everything
from src.utils.config_loader import load_config

# Seed the run through the project helper before any data or model code
# executes, then load the project configuration file.
seed_everything(42)
config = load_config('../config.yaml')


## 1. Load and Prepare Data


In [None]:
# Load preprocessed IMDB data (from notebook 02) - already split into train/val/test.
# `load_preprocessed_data` is imported in the setup cell at the top of the notebook.
PREPROCESSED_DATA_DIR = '../intermediate/data'

print("Loading preprocessed IMDB data (train/val/test splits)...")
train_texts, train_labels = load_preprocessed_data('imdb_train', data_dir=PREPROCESSED_DATA_DIR)
val_texts, val_labels = load_preprocessed_data('imdb_val', data_dir=PREPROCESSED_DATA_DIR)
test_texts, test_labels = load_preprocessed_data('imdb_test', data_dir=PREPROCESSED_DATA_DIR)

print("✅ Loaded preprocessed data (already split)")
print(f"Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")


## 2. Create Vocabulary

Note: Vocabulary creation uses the cleaned/preprocessed texts from the previous step.


## 3. Train LSTM Model

This section uses the preprocessed data loaded above, so the texts do not need to be cleaned again.


In [None]:
# Build the vocabulary before the model: LSTMModel takes `vocab_size` as a
# constructor argument, so it must be defined first for the notebook to run
# top to bottom on a fresh kernel.
VOCAB_SAMPLE_SIZE = 5000  # Use subset for speed
VOCAB_MIN_FREQ = 2

vocab = create_vocabulary(train_texts[:VOCAB_SAMPLE_SIZE], min_freq=VOCAB_MIN_FREQ)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Note: this is a simplified version - converting the texts to sequences of
# word indices (tokenization) is not done in this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize LSTM model and move it to the selected device
lstm_model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=100,
    hidden_dim=128,
    num_layers=2,
    num_classes=2,
    dropout=0.2
).to(device)

# Total number of elements across all model parameters
print(f"LSTM Model parameters: {sum(p.numel() for p in lstm_model.parameters()):,}")
