In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Opening banner: the notebook title framed by two 80-character rules.
banner_rule = "=" * 80
print(banner_rule, "PREPROCESSING: Tokenize and save datasets", banner_rule, sep="\n")

In [None]:
print("\n1. Loading raw data...")

# Load IMDB CSV
df = pd.read_csv("../data/raw/IMDB Dataset.csv")

# Convert sentiment text to numerical labels (0=negative, 1=positive).
# Series.map() turns every value that is missing from the dict into NaN, which
# would silently yield float labels with holes; fail fast on such rows instead.
label_map = {'negative': 0, 'positive': 1}
df['labels'] = df['sentiment'].map(label_map)
unmapped = df['labels'].isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'sentiment'].unique().tolist()
    raise ValueError(
        f"Unexpected sentiment values (cannot map to a label): {bad_values}"
    )
# Every row mapped cleanly, so the column can be pinned to an integer dtype.
df['labels'] = df['labels'].astype('int64')

# Convert to Hugging Face Dataset format
full_dataset = Dataset.from_pandas(df)

# Split into train (80%) and test (20%); the fixed seed keeps the split reproducible.
# `dataset` is the name later cells use for the split result.
dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

print(f"   ✓ Train set: {len(dataset['train'])} reviews")
print(f"   ✓ Test set: {len(dataset['test'])} reviews")

In [None]:
print("\n2. Tokenizing for BERT...")

# AutoTokenizer picks the matching tokenizer from the checkpoint name.
bert_checkpoint = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

def tokenize_bert(examples):
    """
    Encode a batch of reviews with the BERT tokenizer at a fixed length.

    Args:
        examples: Batch mapping whose "review" entry holds the raw texts.

    Returns:
        The output of ``bert_tokenizer`` for the batch, with every sequence
        padded or truncated to 512 tokens.
    """
    reviews = examples["review"]
    fixed_length_options = {
        "padding": "max_length",   # always pad up to max_length
        "truncation": True,        # cut anything longer than max_length
        "max_length": 512,         # BERT's limit
    }
    return bert_tokenizer(reviews, **fixed_length_options)

# Destination for the tokenized splits; persisting them avoids re-tokenizing later.
bert_output_dir = "../data/processed/bert_tokenized"

# batched=True passes the reviews to tokenize_bert in batches, not one by one.
bert_tokenized = dataset.map(tokenize_bert, batched=True)
bert_tokenized.save_to_disk(bert_output_dir)

print(f"   ✓ Saved to {bert_output_dir}")

In [None]:
print("\n3. Tokenizing for RoBERTa...")

# AutoTokenizer picks the matching tokenizer from the checkpoint name.
roberta_checkpoint = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)

def tokenize_roberta(examples):
    """
    Encode a batch of reviews with the RoBERTa tokenizer at a fixed length.

    Args:
        examples: Batch mapping whose "review" entry holds the raw texts.

    Returns:
        The output of ``roberta_tokenizer`` for the batch, with every sequence
        padded or truncated to 512 tokens.
    """
    reviews = examples["review"]
    encoded = roberta_tokenizer(
        reviews,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Destination for the tokenized splits.
roberta_output_dir = "../data/processed/roberta_tokenized"

# batched=True passes the reviews to tokenize_roberta in batches, not one by one.
roberta_tokenized = dataset.map(tokenize_roberta, batched=True)
roberta_tokenized.save_to_disk(roberta_output_dir)

print("   ✓ Saved to " + roberta_output_dir)

In [None]:
# Optional: LLaMA tokenization. Remove the leading "# " from the code lines
# below to enable it when fine-tuning LLaMA. The block is kept as line
# comments rather than a triple-quoted string because a bare string at the end
# of a cell is echoed as that cell's output.
#
# llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# llama_tokenizer.pad_token = llama_tokenizer.eos_token  # LLaMA needs explicit pad token
#
# def tokenize_llama(examples):
#     return llama_tokenizer(
#         examples["review"],
#         padding="max_length",
#         truncation=True,
#         max_length=512  # Can go higher (2048+) but keep consistent
#     )
#
# llama_tokenized = dataset.map(tokenize_llama, batched=True)
# llama_tokenized.save_to_disk("../data/processed/llama_tokenized")
# print("   ✓ Saved to ../data/processed/llama_tokenized")

In [None]:
# Closing summary: list where the tokenized datasets were written.
closing_rule = "=" * 80
summary_lines = (
    "\n" + closing_rule,
    "PREPROCESSING COMPLETE",
    closing_rule,
    "\nTokenized datasets saved:",
    "  • ../data/processed/bert_tokenized/",
    "  • ../data/processed/roberta_tokenized/",
    "\nYou can now run training scripts without re-tokenizing.",
    "This saves 5-10 minutes per experiment.",
    closing_rule,
)
for summary_line in summary_lines:
    print(summary_line)