# Sentiment Analysis - Model Training on Google Colab

This notebook trains DistilBERT and RoBERTa models for sentiment analysis.

**Before running:**
1. Runtime → Change runtime type → GPU (T4)
2. Upload your `amazon_polarity_20k.csv` file (or we'll download sample data)

**What this notebook does:**
- Train DistilBERT (~90 minutes on T4 GPU)
- Train RoBERTa (~120 minutes on T4 GPU)
- Save both models for download
- Generate evaluation metrics

## Step 1: Check GPU and Install Dependencies

In [None]:
# Check GPU availability.
# Fine-tuning below is written for a CUDA GPU runtime; this cell reports what
# the current runtime offers before any long-running work starts.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Query the first visible device once and reuse the result.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.1f} GB")
else:
    # Warning text repaired: the emoji and arrows were mis-encoded (mojibake).
    print("⚠️ WARNING: GPU not available! Go to Runtime → Change runtime type → GPU")

In [None]:
# Install required packages
!pip install -q transformers datasets scikit-learn pandas tqdm accelerate

## Step 2: Upload Training Data

**Option A:** Upload your `amazon_polarity_20k.csv` file using the file upload button on the left

**Option B:** Use sample data from Hugging Face (we'll download it)

In [None]:
import os

import pandas as pd

# Single source of truth for the training CSV; train_model() reads this global.
DATA_PATH = 'amazon_polarity_20k.csv'
# Columns that train_model() selects from the CSV.
REQUIRED_COLUMNS = {'full_text', 'label'}

# Check if user uploaded file
if os.path.exists(DATA_PATH):
    print(f"✅ Using uploaded file: {DATA_PATH}")
else:
    print("📥 Downloading sample data from Hugging Face...")
    # Imported lazily so an uploaded CSV can be used without loading `datasets`.
    from datasets import load_dataset

    # Download the first 20,000 rows of the amazon_polarity training split
    dataset = load_dataset('amazon_polarity', split='train[:20000]')

    # Convert to DataFrame using the column names the training code expects
    df = pd.DataFrame({
        'full_text': dataset['content'],
        'label': dataset['label']
    })

    # Save to CSV so later cells (and re-runs) read the same file
    df.to_csv(DATA_PATH, index=False)
    print(f"✅ Downloaded {len(df)} samples")

# Fail fast with a clear message instead of a KeyError deep inside training.
# nrows=0 reads only the header row.
missing_columns = REQUIRED_COLUMNS - set(pd.read_csv(DATA_PATH, nrows=0).columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

## Step 3: Define Training Functions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import json
from datetime import datetime

In [None]:
def train_model(model_name, output_dir, epochs=3, batch_size=16, learning_rate=2e-5):
    """
    Fine-tune a pretrained transformer for binary sentiment classification.

    Reads the CSV at the notebook-level ``DATA_PATH`` (columns ``full_text`` and
    ``label``), makes a stratified 60/20/20 train/val/test split, fine-tunes
    ``model_name`` with the HuggingFace ``Trainer``, evaluates on the held-out
    test split, and writes the model, tokenizer and ``metrics.json`` into
    ``output_dir`` (checkpoints and logs go to sub-directories of it).

    Args:
        model_name: HuggingFace model name (e.g., 'distilbert-base-uncased')
        output_dir: Directory to save the trained model
        epochs: Number of training epochs
        batch_size: Per-device batch size used for both training and evaluation
        learning_rate: Learning rate for optimizer

    Returns:
        Tuple ``(test_metrics, training_time)``: ``test_metrics`` is a dict with
        keys 'accuracy', 'f1', 'precision', 'recall' and 'roc_auc' computed on
        the test split; ``training_time`` is the wall-clock duration of
        ``trainer.train()`` in minutes.
    """
    # Local imports keep this cell self-contained regardless of which earlier
    # cells have been run in the current kernel.
    import inspect

    import torch

    print(f"\n{'='*70}")
    print(f"Training {model_name}")
    print(f"{'='*70}\n")

    # Load data
    print("📂 Loading data...")
    df = pd.read_csv(DATA_PATH)
    # With pandas defaults an empty text field is read back from CSV as NaN,
    # which the tokenizer cannot process. Drop incomplete rows and normalise
    # dtypes so text is always str and labels are always integer class ids.
    df = df.dropna(subset=['full_text', 'label'])
    df = df.assign(
        full_text=df['full_text'].astype(str),
        label=df['label'].astype(int),
    )
    print(f"   Total samples: {len(df)}")

    # Split data: 60% train, 20% val, 20% test
    # (0.25 of the remaining 80% yields the 20% validation share.)
    train_val_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )
    train_df, val_df = train_test_split(
        train_val_df, test_size=0.25, random_state=42, stratify=train_val_df['label']
    )

    print(f"   Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

    # Convert to HuggingFace Dataset
    train_dataset = Dataset.from_pandas(train_df[['full_text', 'label']].reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_df[['full_text', 'label']].reset_index(drop=True))
    test_dataset = Dataset.from_pandas(test_df[['full_text', 'label']].reset_index(drop=True))

    # Load tokenizer and model
    print(f"\n🤗 Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        ignore_mismatched_sizes=True
    )

    # Tokenize datasets
    print("🔤 Tokenizing...")
    def tokenize_function(examples):
        # No padding here: DataCollatorWithPadding pads each batch at collate time.
        return tokenizer(
            examples['full_text'],
            truncation=True,
            padding=False,
            max_length=128
        )

    train_tokenized = train_dataset.map(tokenize_function, batched=True)
    val_tokenized = val_dataset.map(tokenize_function, batched=True)
    test_tokenized = test_dataset.map(tokenize_function, batched=True)

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metrics function
    def compute_metrics(eval_pred):
        """Compute classification metrics from a (logits, labels) pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # Numerically stable softmax: subtracting the row max leaves the result
        # unchanged but keeps np.exp from overflowing to inf (and then NaN).
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        exp_shifted = np.exp(shifted)
        probs = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

        # float() yields plain Python floats so the metrics serialise to JSON cleanly.
        return {
            'accuracy': float(accuracy_score(labels, predictions)),
            'f1': float(f1_score(labels, predictions, average='weighted')),
            'precision': float(precision_score(labels, predictions, average='weighted')),
            'recall': float(recall_score(labels, predictions, average='weighted')),
            'roc_auc': float(roc_auc_score(labels, probs[:, 1]))
        }

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"{output_dir}/checkpoints",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        save_total_limit=2,
        # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
        # runtimes instead of failing when the arguments are validated.
        fp16=torch.cuda.is_available(),
        report_to="none"
    )

    # Trainer
    # Newer transformers releases take the tokenizer via `processing_class` and
    # deprecate `tokenizer`; pick whichever keyword the installed Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )

    # Train
    print("\n🚀 Starting training...\n")
    start_time = datetime.now()
    trainer.train()
    training_time = (datetime.now() - start_time).total_seconds() / 60

    # Evaluate on test set
    print("\n📊 Evaluating on test set...")
    test_results = trainer.predict(test_tokenized)
    test_metrics = compute_metrics((test_results.predictions, test_results.label_ids))

    print(f"\n{'='*70}")
    print(f"TEST RESULTS - {model_name}")
    print(f"{'='*70}")
    print(f"Accuracy:  {test_metrics['accuracy']:.4f}")
    print(f"F1-Score:  {test_metrics['f1']:.4f}")
    print(f"Precision: {test_metrics['precision']:.4f}")
    print(f"Recall:    {test_metrics['recall']:.4f}")
    print(f"ROC-AUC:   {test_metrics['roc_auc']:.4f}")
    print(f"\nTraining time: {training_time:.1f} minutes")
    print(f"{'='*70}\n")

    # Save model
    print(f"💾 Saving model to {output_dir}/...")
    model.save_pretrained(output_dir, safe_serialization=True)
    tokenizer.save_pretrained(output_dir)

    # Save metrics alongside the model so each run is self-describing
    metrics_dict = {
        'model_name': model_name,
        'test_metrics': test_metrics,
        'training_time_minutes': training_time,
        'training_config': {
            'epochs': epochs,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
    }

    with open(f"{output_dir}/metrics.json", 'w') as f:
        json.dump(metrics_dict, f, indent=2)

    print("✅ Training complete!\n")
    return test_metrics, training_time

## Step 4: Train DistilBERT

Expected time: ~90 minutes on T4 GPU

In [None]:
# Fine-tune DistilBERT; results are kept for the comparison table later on.
distilbert_metrics, distilbert_time = train_model(
    'distilbert-base-uncased',
    'distilbert_sentiment',
    epochs=3, batch_size=16, learning_rate=2e-5,
)

## Step 5: Train RoBERTa

Expected time: ~120 minutes on T4 GPU

In [None]:
# Fine-tune RoBERTa; results are kept for the comparison table later on.
roberta_metrics, roberta_time = train_model(
    'roberta-base',
    'roberta_sentiment',
    epochs=3, batch_size=16, learning_rate=2e-5,
)

## Step 6: Compare Results

In [None]:
import os

# Get model sizes
def get_model_size(directory, exclude_dirs=('checkpoints', 'logs')):
    """
    Return the total size of the files under ``directory`` in megabytes (MiB).

    train_model() writes intermediate checkpoints and logs into
    ``<output_dir>/checkpoints`` and ``<output_dir>/logs``. Counting those
    would inflate the reported model size, so they are skipped by default.

    Args:
        directory: Root directory to measure.
        exclude_dirs: Directory names to skip at any depth. Pass ``()`` to
            count every file under ``directory``.

    Returns:
        Size in MiB as a float (0.0 if the directory is empty or missing).
    """
    excluded = set(exclude_dirs)
    total_bytes = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        # Prune in place so os.walk never descends into excluded directories.
        dirnames[:] = [name for name in dirnames if name not in excluded]
        total_bytes += sum(
            os.path.getsize(os.path.join(dirpath, name)) for name in filenames
        )
    return total_bytes / (1024 * 1024)  # Convert to MB

distilbert_size = get_model_size('distilbert_sentiment')
roberta_size = get_model_size('roberta_sentiment')

# Create comparison table.
# One record per model keeps each model's numbers together, so a value cannot
# end up in the wrong row the way it can with parallel per-column lists.
model_runs = [
    ('DistilBERT', distilbert_metrics, distilbert_time, distilbert_size),
    ('RoBERTa', roberta_metrics, roberta_time, roberta_size),
]
comparison = pd.DataFrame([
    {
        'Model': label,
        'Accuracy': f"{metrics['accuracy']:.4f}",
        'F1-Score': f"{metrics['f1']:.4f}",
        'ROC-AUC': f"{metrics['roc_auc']:.4f}",
        'Training Time (min)': f"{minutes:.1f}",
        'Model Size (MB)': f"{size_mb:.1f}",
    }
    for label, metrics, minutes, size_mb in model_runs
])

print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)
print(comparison.to_string(index=False))
print("="*80 + "\n")

# Save comparison so it can be downloaded together with the models
comparison.to_csv('model_comparison.csv', index=False)
print("💾 Saved comparison to model_comparison.csv")

## Step 7: Download Models

Zip and download the trained models to your local machine.

In [None]:
# Zip models for download
!zip -r distilbert_sentiment.zip distilbert_sentiment/
!zip -r roberta_sentiment.zip roberta_sentiment/

print("\n‚úÖ Models zipped!")
print("\nTo download:")
print("1. Click the folder icon on the left")
print("2. Right-click on 'distilbert_sentiment.zip' ‚Üí Download")
print("3. Right-click on 'roberta_sentiment.zip' ‚Üí Download")
print("4. Right-click on 'model_comparison.csv' ‚Üí Download")

# Also provide direct download links
from google.colab import files
print("\nOr download directly:")
files.download('distilbert_sentiment.zip')
files.download('roberta_sentiment.zip')
files.download('model_comparison.csv')

## Next Steps

After downloading:

1. **Extract models** to your local `models/` directory:
   ```bash
   unzip distilbert_sentiment.zip -d models/
   unzip roberta_sentiment.zip -d models/
   ```

2. **Compare with your BERT model** (91.875% accuracy)

3. **Select the winner** based on:
   - Accuracy (most important)
   - Model size (for deployment)
   - Inference speed (test locally)

4. **Build deployment** with the best model:
   - FastAPI REST API
   - Gradio web demo
   - Docker container