# AlexNet Training Pipeline - iFood 2019

Train four AlexNet variants (Baseline, Modified1, Modified2, Combined) sequentially on the iFood 2019 dataset.

In [None]:
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import yaml
import json
from datetime import datetime
import os

# Report the runtime environment; the GPU name is queried only when CUDA is
# actually usable, because get_device_name() needs a CUDA device.
print(
    f"PyTorch version: {torch.__version__}",
    f"GPU Available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")

## Load Configuration

In [None]:
# Load the YAML configuration that the rest of the notebook reads from.
config_path = Path.cwd().parent / 'src' / 'config' / 'config.yaml'
# Decode explicitly as UTF-8: without an encoding argument open() uses the
# platform locale, so the same file could parse differently (or fail) on
# another machine.
with open(config_path, encoding='utf-8') as f:
    # safe_load builds plain Python objects only; it never instantiates
    # arbitrary classes from YAML tags.
    config = yaml.safe_load(f)

print("Configuration loaded:")
print(f"  - Batch size: {config['training']['batch_size']}")
print(f"  - Learning rate: {config['training']['learning_rate']}")
print(f"  - Number of classes: {config['data']['num_classes']}")
print(f"  - Image size: {config['data']['image_size']}")

# Select the compute device: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## Import Custom Modules

In [None]:
from data.dataset import IFoodDataset
from data.transforms import get_transforms
from models.alexnet_baseline import AlexNetBaseline
from models.alexnet_modified1 import AlexNetModified1
from models.alexnet_modified2 import AlexNetModified2
from models.alexnet_combined import AlexNetCombined
from training.train import train_epoch, validate
from training.utils import save_checkpoint

# Reached only when every project import in this cell succeeded; a failing
# import raises before this line runs.
print("‚úì Custom modules imported successfully")

## Create Directories

In [None]:
# Output folders live one level above the current working directory.
checkpoints_dir = Path.cwd().parent.joinpath('checkpoints')
plots_dir = Path.cwd().parent.joinpath('results', 'plots')
logs_dir = Path.cwd().parent.joinpath('results', 'logs')

# Create each folder together with any missing parents; folders that already
# exist are left untouched.
for directory in (checkpoints_dir, plots_dir, logs_dir):
    directory.mkdir(parents=True, exist_ok=True)

print(
    f"‚úì Checkpoints dir: {checkpoints_dir}",
    f"‚úì Plots dir: {plots_dir}",
    f"‚úì Logs dir: {logs_dir}",
    sep="\n",
)

## Check Data Structure

In [None]:
# Locate the dataset folder one level above the current working directory.
data_dir = Path.cwd().parent.joinpath('data')
print(f"Data directory: {data_dir}")
print(f"Data dir exists: {data_dir.exists()}")

if data_dir.exists():
    print(f"\nContents of data directory:")
    # Sorted listing of the immediate children; sub-directories are shown
    # with a trailing slash, everything else as a plain file name.
    for item in sorted(data_dir.iterdir()):
        print(f"  üìÅ {item.name}/" if item.is_dir() else f"  üìÑ {item.name}")

## Load Data

In [None]:
# Transforms for each split, as returned by get_transforms for the loaded config.
train_transform = get_transforms(split='train', config=config)
val_transform = get_transforms(split='val', config=config)

# Label files and image folders expected underneath data_dir.
train_csv = data_dir.joinpath('annotations', 'train_labels.csv')
val_csv = data_dir.joinpath('annotations', 'val_labels.csv')
train_img_dir = data_dir.joinpath('train_set')
val_img_dir = data_dir.joinpath('val_set')

print(f"Training CSV exists: {train_csv.exists()}")
print(f"Validation CSV exists: {val_csv.exists()}")

# A split is built only when both its label CSV and its image folder exist;
# otherwise its dataset is set to None so that later cells can skip the work.
if not (train_csv.exists() and train_img_dir.exists()):
    print("‚ö†Ô∏è Training data not found.")
    train_dataset = None
else:
    train_dataset = IFoodDataset(
        csv_file=str(train_csv),
        root_dir=str(train_img_dir),
        transform=train_transform,
    )
    print(f"‚úì Training dataset: {len(train_dataset)} samples")

if not (val_csv.exists() and val_img_dir.exists()):
    print("‚ö†Ô∏è Validation data not found.")
    val_dataset = None
else:
    val_dataset = IFoodDataset(
        csv_file=str(val_csv),
        root_dir=str(val_img_dir),
        transform=val_transform,
    )
    print(f"‚úì Validation dataset: {len(val_dataset)} samples")

In [None]:
# Loaders are built only when both datasets are truthy (None marks a split
# whose files were not found).
if not (train_dataset and val_dataset):
    print("‚ö†Ô∏è Cannot create dataloaders")
else:
    # Training batches are reshuffled; settings come from config['training'].
    train_loader = DataLoader(
        dataset=train_dataset,
        shuffle=True,
        batch_size=config['training']['batch_size'],
        num_workers=config['training']['num_workers'],
        pin_memory=config['training']['pin_memory'],
    )

    # Validation keeps dataset order. Batch size and worker count come from
    # config['evaluation'], while pin_memory is read from config['training'].
    val_loader = DataLoader(
        dataset=val_dataset,
        shuffle=False,
        batch_size=config['evaluation']['batch_size'],
        num_workers=config['evaluation']['num_workers'],
        pin_memory=config['training']['pin_memory'],
    )

    print(f"‚úì Training loader: {len(train_loader)} batches")
    print(f"‚úì Validation loader: {len(val_loader)} batches")

## Define Training Function

In [None]:
def train_model(model, train_loader, val_loader, model_name, checkpoint_dir, num_epochs=10):
    """Train one model and return its per-epoch metric history.

    Optimizer and scheduler hyper-parameters are read from the notebook-level
    ``config`` mapping, and training runs on the notebook-level ``device``.

    Args:
        model: Network to train. It is moved to ``device`` in place.
        train_loader: Loader handed to ``train_epoch`` once per epoch.
        val_loader: Loader handed to ``validate`` once per epoch.
        model_name: Label shown in the progress banner.
        checkpoint_dir: Directory (``str`` or ``Path``) that receives the
            periodic checkpoints and the final weights. It is created when
            missing.
        num_epochs: Number of epochs to run.

    Returns:
        A dict with the keys ``train_loss``, ``train_acc``, ``val_loss`` and
        ``val_acc``; each maps to a list holding one float per epoch.
    """
    # Accept plain strings as well and make sure torch.save has a target folder.
    checkpoint_dir = Path(checkpoint_dir)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Move the model before constructing the optimizer, so the optimizer is
    # built from the parameters as they exist on the target device.
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config['training']['learning_rate'],
        momentum=config['training']['momentum'],
        weight_decay=config['training']['weight_decay']
    )

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config['training']['scheduler']['step_size'],
        gamma=config['training']['scheduler']['gamma']
    )

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    print(f"\n{'='*60}")
    print(f"Training {model_name}")
    print(f"{'='*60}")

    for epoch in range(num_epochs):
        # Train. Metrics are coerced to built-in floats so the history stays
        # JSON-serialisable even if the helpers return tensors or numpy scalars.
        train_metrics = train_epoch(model, train_loader, criterion, optimizer, device, epoch, use_wandb=False)
        history['train_loss'].append(float(train_metrics['train_loss']))
        history['train_acc'].append(float(train_metrics['train_accuracy']))

        # Validate
        val_metrics = validate(model, val_loader, criterion, device, epoch, use_wandb=False)
        history['val_loss'].append(float(val_metrics['val_loss']))
        history['val_acc'].append(float(val_metrics['val_accuracy']))

        # Advance the learning-rate schedule once per epoch.
        scheduler.step()

        # Save checkpoint every 5 epochs. The file name uses the 1-based epoch
        # number, while the stored 'epoch' field is the 0-based loop index.
        if (epoch + 1) % 5 == 0:
            checkpoint_path = checkpoint_dir / f'checkpoint_epoch_{epoch+1}.pt'
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                # Needed to resume with the learning rate the schedule had reached.
                'scheduler_state_dict': scheduler.state_dict(),
            }, checkpoint_path)
            print(f"  ‚úì Checkpoint: {checkpoint_path.name}")

    # Save final model (weights only, no optimizer or scheduler state).
    final_path = checkpoint_dir / 'final_model.pt'
    torch.save(model.state_dict(), final_path)
    print(f"‚úì Final model saved: {final_path.name}")

    return history

## Configure Models

In [None]:
# Model registry, expanded from a compact table of
# (key, module name, model class, checkpoint sub-folder, extra constructor kwargs).
# Every variant shares num_classes from the config and a dropout of 0.5.
models_config = {
    key: {
        'name': name,
        'class': model_class,
        'checkpoint_dir': checkpoints_dir / subdir,
        'config': {
            'num_classes': config['data']['num_classes'],
            'dropout': 0.5,
            **extra_kwargs,
        },
    }
    for key, name, model_class, subdir, extra_kwargs in (
        ('Model_A', 'alexnet_baseline', AlexNetBaseline, 'model_a', {}),
        ('Model_B', 'alexnet_modified1', AlexNetModified1, 'model_b',
         {'use_batch_norm': True}),
        ('Model_C', 'alexnet_modified2', AlexNetModified2, 'model_c',
         {'use_leaky_relu': True}),
        ('Model_D', 'alexnet_combined', AlexNetCombined, 'model_d',
         {'use_batch_norm': True, 'use_leaky_relu': True}),
    )
}

# Give every model its own checkpoint folder up front.
for model_config in models_config.values():
    model_config['checkpoint_dir'].mkdir(parents=True, exist_ok=True)

print("Models configured:")
for model_name, model_config in models_config.items():
    print(f"  ‚úì {model_name}: {model_config['name']}")

## Train All Models

In [None]:
# Training needs both datasets; without them the cell only reports the problem.
if not (train_dataset and val_dataset):
    print("‚ö†Ô∏è Cannot train - data not available")
else:
    all_histories = {}

    # Train the variants one after another, each from a freshly built instance.
    for model_key, model_config in models_config.items():
        model = model_config['class'](**model_config['config'])

        history = train_model(
            model,
            train_loader,
            val_loader,
            model_key,
            model_config['checkpoint_dir'],
            num_epochs=10,
        )
        all_histories[model_key] = history

        # Drop the reference and release cached GPU memory before the next model.
        del model
        torch.cuda.empty_cache()

    print(f"\n{'='*60}")
    print("‚úì All models trained successfully!")
    print(f"{'='*60}")

## Save Results

In [None]:
if train_dataset and val_dataset:
    # Copy the four tracked series of every model into a fresh mapping for JSON.
    histories_json = {
        model_name: {
            metric: history[metric]
            for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc')
        }
        for model_name, history in all_histories.items()
    }

    histories_path = logs_dir / 'training_histories.json'
    with histories_path.open('w') as f:
        json.dump(histories_json, f, indent=2)

    print(f"‚úì Histories saved: {histories_path}")

    # Last-epoch metrics of each model as a fixed-width table.
    print("\nFinal Metrics:")
    print(f"{'Model':<15} {'Train Loss':<15} {'Train Acc':<15} {'Val Loss':<15} {'Val Acc':<15}")
    print("-" * 75)
    for model_name, history in all_histories.items():
        print(f"{model_name:<15} {history['train_loss'][-1]:<15.4f} {history['train_acc'][-1]:<15.2f} {history['val_loss'][-1]:<15.4f} {history['val_acc'][-1]:<15.2f}")

## Visualize Results

In [None]:
if train_dataset and val_dataset:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('AlexNet Models Training Results', fontsize=16, fontweight='bold')

    # One panel per tracked series, filled row by row:
    # (history key, panel title, y-axis label).
    panels = (
        ('train_loss', 'Training Loss', 'Loss'),
        ('val_loss', 'Validation Loss', 'Loss'),
        ('train_acc', 'Training Accuracy', 'Accuracy (%)'),
        ('val_acc', 'Validation Accuracy', 'Accuracy (%)'),
    )
    for ax, (metric, title, ylabel) in zip(axes.flat, panels):
        # One curve per model so the variants can be compared in the same panel.
        for model_name, history in all_histories.items():
            ax.plot(history[metric], label=model_name, marker='o', markersize=3)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plot_path = plots_dir / 'training_curves.png'
    # Write the figure to disk before showing it.
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"‚úì Plot saved: {plot_path}")
    plt.show()

## Summary

In [None]:
# Closing banner: where the artefacts were written and the suggested next steps.
for line in (
    "\n" + "="*60,
    "TRAINING PIPELINE COMPLETE",
    "="*60,
    f"\n‚úì Checkpoints: {checkpoints_dir}",
    f"‚úì Plots: {plots_dir}",
    f"‚úì Results: {logs_dir}",
    "\nNext:",
    "1. Download results from RunPod",
    "2. Analyze performance",
    "3. Write final report",
    "="*60,
):
    print(line)