# Batch Training on Google Colab

This notebook runs batch training using the functions from `batch_runner.py`.

## Setup Steps:
1. Mount Google Drive
2. Copy and extract data from Drive
3. Install local packages
4. Run batch training

## 1. Mount Google Drive

In [None]:
from google.colab import drive
import os
from pathlib import Path

# Attach Google Drive at /content/drive (Colab asks for authorization).
drive.mount('/content/drive')

# Locations used by the remaining cells:
#   DRIVE_DATA_PATH - directory on Drive holding the dataset archives
#   LOCAL_DATA_PATH - directory on the Colab VM the archives are extracted into
#   LIBS_PATH       - directory on the Colab VM the local packages are installed from
DRIVE_DATA_PATH = Path('/content/drive/MyDrive/semester/data')
LOCAL_DATA_PATH = Path('/content/data')
LIBS_PATH = Path('/content/libs')

# Echo the configured locations so a wrong path is easy to spot early.
for label, location in (
    ("Drive data path", DRIVE_DATA_PATH),
    ("Local data path", LOCAL_DATA_PATH),
    ("Libraries path", LIBS_PATH),
):
    print(f"{label}: {location}")

## 2. Copy Data from Google Drive to Colab Environment

In [None]:
import shutil
import tarfile
from tqdm import tqdm

# Make sure the extraction target exists before anything is copied into it.
LOCAL_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Archive file names (relative to DRIVE_DATA_PATH) to copy and extract.
# Modify this list based on the datasets you need.
datasets = [
    'anaheim_L.tar.gz',
    'anaheim_L_lhs.tar.gz',
    'anaheim_LM_lhs.tar.gz',
    'chicago_LM_lhs.tar.gz',
    'sioux_falls_L.tar.gz',
    'sioux_falls_L_lhs.tar.gz',
    'sioux_falls_LM_lhs.tar.gz',
]

# Extraction filters exist on Python 3.12+ and on patched older releases.
# Use the 'data' filter when it is available so archive members cannot be
# written outside LOCAL_DATA_PATH and newer Pythons do not emit a
# DeprecationWarning for an unfiltered extractall().
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

extracted = []  # archives that were copied and unpacked successfully
missing = []    # archives that were not present on Drive

print("Copying and extracting datasets...")
for dataset in tqdm(datasets):
    source_file = DRIVE_DATA_PATH / dataset

    if not source_file.exists():
        print(f"Warning: {dataset} not found in Drive, skipping...")
        missing.append(dataset)
        continue

    local_tar = LOCAL_DATA_PATH / dataset
    try:
        # Copy the archive to local storage first, then unpack the local copy.
        print(f"\nCopying {dataset}...")
        shutil.copy2(source_file, local_tar)

        print(f"Extracting {dataset}...")
        with tarfile.open(local_tar, 'r:gz') as tar:
            tar.extractall(path=LOCAL_DATA_PATH, **extract_kwargs)
    finally:
        # Always drop the local archive copy, even if the copy or extraction
        # failed, so a partial file never keeps occupying disk space.
        if local_tar.exists():
            local_tar.unlink()

    extracted.append(dataset)
    print(f"✓ {dataset} extracted and cleaned up")

# Only claim full success when nothing was skipped.
if missing:
    print(
        f"\n⚠ Extracted {len(extracted)} of {len(datasets)} datasets; "
        f"not found in Drive: {', '.join(missing)}"
    )
else:
    print("\n✓ All datasets copied and extracted!")

## 3. Verify Data Structure

In [None]:
# Print each directory directly under LOCAL_DATA_PATH (sorted), followed by
# the names of its immediate child directories when it has any.
print("Extracted data directories:")
for entry in sorted(LOCAL_DATA_PATH.iterdir()):
    if not entry.is_dir():
        continue  # plain files at the top level are not listed

    print(f"  - {entry.name}")

    # Collect only child directories; files inside the dataset are ignored.
    child_dirs = []
    for child in entry.iterdir():
        if child.is_dir():
            child_dirs.append(child.name)

    if child_dirs:
        print("    Subdirs: " + ", ".join(child_dirs))

## 4. Copy and Install Local Packages

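We need to install the local packages from the `libs/` folder. The install cell in Step 5 reads them from `/content/libs` (`LIBS_PATH`, defined in Step 1), so make sure the folder has been copied there first: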
- `static-assignment`
- `ml-static`

## 5. Install Required Packages

Install the `uv` package manager, then use it to install the two local packages in editable mode.

In [None]:
# Install uv package manager (fast pip alternative); -q keeps pip's output quiet
!pip install -q uv

# Install static-assignment package in editable mode (-e) from LIBS_PATH.
# IPython evaluates the expression in braces and substitutes the resulting
# path into the shell command before running it.
# NOTE(review): no cell in this notebook copies the packages into LIBS_PATH —
# confirm /content/libs is populated before running this cell.
print("Installing static-assignment package...")
!uv pip install -e {LIBS_PATH / 'static-assignment'}

# Install ml-static package (includes batch_runner), also in editable mode
print("\nInstalling ml-static package...")
!uv pip install -e {LIBS_PATH / 'ml-static'}

# NOTE(review): this message is printed unconditionally; it does not check
# whether the install commands above actually succeeded.
print("\n✓ All packages installed!")

## 6. Setup Working Directory and Copy Configs

In [None]:
# Make /content the current working directory for the rest of the session.
os.chdir('/content')

# The configs live inside the ml-static package directory under LIBS_PATH;
# they are mirrored to /content/configs, which the training cells read from.
CONFIG_SOURCE = LIBS_PATH / 'ml-static' / 'configs'
CONFIG_DEST = Path('/content/configs')

if not CONFIG_SOURCE.exists():
    print(f"Warning: {CONFIG_SOURCE} not found")
else:
    # dirs_exist_ok=True allows copying over an already existing destination,
    # so this cell can be re-run.
    shutil.copytree(CONFIG_SOURCE, CONFIG_DEST, dirs_exist_ok=True)
    print(f"✓ Configs copied to {CONFIG_DEST}")

    # Show what ended up in the destination directory.
    print("\nAvailable config files:")
    for config_entry in CONFIG_DEST.iterdir():
        print(f"  - {config_entry.name}")

## 7. Configure MLflow (Optional)

Set up MLflow tracking for experiment management.

In [None]:
import mlflow

import os

# Create the local MLflow store directory before pointing MLflow at it.
mlruns_dir = Path('/content/mlruns')
mlruns_dir.mkdir(parents=True, exist_ok=True)

# Set tracking URI to the local directory for code running in this kernel.
tracking_uri = 'file:///content/mlruns'
mlflow.set_tracking_uri(tracking_uri)

# set_tracking_uri() only affects the current Python process. Export the same
# URI through the environment so that commands launched from this notebook
# (such as the `!batch-train` CLI in Option A) inherit it instead of falling
# back to MLflow's default location.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri

print("MLflow tracking URI:", mlflow.get_tracking_uri())

## 8. Check GPU Availability

In [None]:
import torch

# Report the first CUDA device visible to PyTorch, or explain how to enable
# a GPU runtime when none is available.
if not torch.cuda.is_available():
    print("⚠ No GPU available, training will use CPU (slower)")
    print("Tip: Runtime > Change runtime type > Hardware accelerator > GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✓ GPU is available: {gpu_name}")

    # total_memory is reported in bytes; divide by 1e9 to show decimal GB.
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"  Memory: {gpu_memory_gb:.2f} GB")

## 9. Run Batch Training

### Option A: Using the `batch-train` CLI command

In [None]:
# Run batch training through the `batch-train` command-line tool.
#   --suite   path to the experiment suite YAML
#   --config  path to the run configuration YAML
# Both paths point into /content/configs; adjust them if your files live
# elsewhere.
# NOTE(review): `batch-train` is presumably installed by the ml-static
# package — confirm the install cell ran successfully first.

!batch-train --suite /content/configs/experiment_suite.yaml --config /content/configs/conf_run.yaml

### Option B: Using Python directly

In [None]:
from ml_static.batch_runner import batch_train, load_experiment_suite, run_batch_experiments, ConfigManager
from pathlib import Path

# Input files: the experiment suite and the run configuration.
suite_path = Path('/content/configs/experiment_suite.yaml')
config_path = Path('/content/configs/conf_run.yaml')

print(f"Loading experiments from {suite_path}...")
experiments = load_experiment_suite(suite_path)

# List what is about to run; 'description' is optional for each entry.
print(f"Found {len(experiments)} experiments:")
for experiment in experiments:
    experiment_name = experiment['name']
    summary = experiment.get('description', 'No description')
    print(f"  - {experiment_name}: {summary}")

# The config manager is built from the run configuration file.
config_manager = ConfigManager(config_path)

# Execute every experiment in the suite.
print("\nStarting batch training...")
results = run_batch_experiments(experiments, config_manager)

# Final report: one line per experiment, plus the error text when present.
banner = "=" * 70
print("\n" + banner)
print("BATCH TRAINING COMPLETE")
print(banner)
for exp_name, outcome in results.items():
    status = outcome['status']
    if status == 'SUCCESS':
        status_icon = "✓"
    else:
        status_icon = "✗"
    print(f"{status_icon} {exp_name}: {status}")

    error = outcome.get('error')
    if error:
        print(f"  Error: {error}")

## 10. Save Results Back to Google Drive (Optional)

In [None]:
# Destination folder on Drive for the MLflow output of this session.
DRIVE_OUTPUT = Path('/content/drive/MyDrive/semester/colab_results')
DRIVE_OUTPUT.mkdir(parents=True, exist_ok=True)

# (local directory, word used in the progress messages) for each MLflow
# output tree that should be mirrored to Drive when it exists.
output_dirs = [
    (Path('/content/mlruns'), 'results'),
    (Path('/content/mlartifacts'), 'artifacts'),
]

saved = []  # names of the directories that were actually copied
for local_dir, label in output_dirs:
    if not local_dir.exists():
        continue  # nothing was produced in this location

    print(f"Copying MLflow {label} to Drive...")
    # dirs_exist_ok=True allows copying over results from an earlier run.
    shutil.copytree(local_dir, DRIVE_OUTPUT / local_dir.name, dirs_exist_ok=True)
    print(f"✓ {label.capitalize()} saved to Drive")
    saved.append(local_dir.name)

# Only report success when something was actually copied.
if saved:
    print(f"\n✓ All results saved to {DRIVE_OUTPUT}")
else:
    print("\n⚠ Nothing to save: neither /content/mlruns nor /content/mlartifacts exists")

## 11. View MLflow Results (Optional)

In [None]:
import pandas as pd
import mlflow

# List every experiment in the configured MLflow tracking store and, for each
# one, display a table with one row per run.
# NOTE: this rebinds `experiments`, replacing the suite list loaded in Step 9.
client = mlflow.tracking.MlflowClient()
experiments = client.search_experiments()

print("MLflow Experiments:")
for exp in experiments:
    print(f"\nExperiment: {exp.name} (ID: {exp.experiment_id})")

    runs = client.search_runs(exp.experiment_id)
    if not runs:
        print("  No runs found")
        continue

    print(f"  Found {len(runs)} runs")

    # One dict per run: fixed columns first, then params, then metrics.
    # On a name clash the later entry replaces the earlier one.
    rows = [
        {
            'run_id': run.info.run_id[:8],  # shortened id for readability
            'status': run.info.status,
            'start_time': pd.to_datetime(run.info.start_time, unit='ms'),
            **run.data.params,
            **run.data.metrics,
        }
        for run in runs
    ]

    df = pd.DataFrame(rows)
    display(df)

## Notes

- **Runtime**: Make sure to select GPU runtime for faster training (Runtime > Change runtime type)
- **Session timeout**: Colab sessions can time out after a period of inactivity. For long training runs, consider using Colab Pro or keeping the tab active
- **Storage**: Colab has limited disk space (~100GB). Monitor disk usage and clean up extracted data if needed
- **Data location**: Adjust the `datasets` list in Step 2 based on which datasets you need
- **Experiment suite**: Edit `/content/configs/experiment_suite.yaml` to customize experiments
- **Config file**: Edit `/content/configs/conf_run.yaml` for baseline configuration