# NASA CloudML - Colab Training

Train deep learning models for cloud optical depth prediction from satellite IR imagery.

## Quick Start

1. Run setup cells to install dependencies and mount Drive
2. Run diagnostics to determine whether the task is learnable
3. Choose your training configuration (configs/colab_optimized_full_tuned.yaml recommended)
4. Run training cell
5. View results in Google Drive under CloudML/plots/ and CloudML/logs/ (trained models are saved under CloudML/models/trained/)

**Current Status:** Phase 1 fixes applied (variance-preserving loss, proper initialization)

---

## Setup (Run Once Per Session)

In [None]:
# ============================================================================
# STEP 1: Mount Google Drive
# ============================================================================
# Mounts Drive at /content/drive and makes sure the project folders exist.
from google.colab import drive
import os

drive.mount('/content/drive')

# Create project directories.
# os.makedirs is used instead of `!mkdir -p` so that a failure raises and
# stops the cell; a failing shell command would only print an error and the
# success messages below would still be shown.
project_root = '/content/drive/MyDrive/CloudML'
for subdir in ('data', 'models', 'plots', 'logs'):
    os.makedirs(os.path.join(project_root, subdir), exist_ok=True)

print("OK: Google Drive mounted successfully")
print("OK: Project directories created")

In [None]:
# ============================================================================
# STEP 2: Clone/Update Repository
# ============================================================================
%cd /content

if not os.path.exists('/content/repo'):
    print('Cloning repository...')
    !git clone https://github.com/rylanmalarchick/cloudMLPublic.git repo
else:
    print('Repository exists. Pulling latest changes...')
    %cd /content/repo
    !git pull origin main

%cd /content/repo
print("OK: Repository ready")

In [None]:
# ============================================================================
# STEP 3: Install Dependencies
# ============================================================================
# Installs the pinned packages with pip, then imports torch and prints its
# version, CUDA availability and GPU details as a sanity check.
print("Installing dependencies (this may take 5-10 minutes)...\n")

# Install PyTorch with CUDA 12.1 support
# (wheels come from the cu121 index of download.pytorch.org)
!pip install --quiet torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121

# Clean up potential conflicts
# (stderr is sent to /dev/null, so error output from this step is not shown)
!pip uninstall -y -q mamba-ssm causal-conv1d 2>/dev/null

# Install core dependencies
# (h5py, netCDF4 and pyhdf are pinned; the remaining packages are unpinned)
!pip install --quiet h5py==3.14.0 netCDF4==1.7.2 pyhdf==0.11.6 scikit-learn matplotlib plotly pyyaml pandas

# Install advanced components
# NOTE(review): causal-conv1d is installed before mamba-ssm — presumably the
# order matters; confirm before reordering.
!pip install --quiet torch_geometric==2.5.3
!pip install --quiet causal-conv1d==1.4.0
!pip install --quiet mamba-ssm==2.2.2

# This message is printed unconditionally; pip failures above are not checked.
print("\nOK: All dependencies installed successfully")
print("\nVerifying installation...")

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # GPU details are only queried when CUDA is available.
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9, so the figure is in decimal gigabytes.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# ============================================================================
# STEP 4: Verify Data
# ============================================================================
# A flight counts as complete when its folder contains at least one file for
# each of the .h5, .hdf5 and .hdf extensions. Every other flight is reported
# and collected in `missing_data`.
import os

data_dir = '/content/drive/MyDrive/CloudML/data/'

# Expected flights
flights = ['10Feb25', '30Oct24', '04Nov24', '23Oct24', '18Feb25', '12Feb25']

print("Checking data availability...\n")
missing_data = []

for flight in flights:
    flight_path = os.path.join(data_dir, flight)

    # No folder at all for this flight.
    if not os.path.exists(flight_path):
        print(f"Not found: {flight}: Folder not found")
        missing_data.append(flight)
        continue

    # One presence flag per required extension.
    names = os.listdir(flight_path)
    has_h5, has_hdf5, has_hdf = (
        any(name.endswith(suffix) for name in names)
        for suffix in ('.h5', '.hdf5', '.hdf')
    )

    # Folder exists but at least one file type is absent.
    if not (has_h5 and has_hdf5 and has_hdf):
        print(f"WARNING: {flight}: Missing files (h5={has_h5}, hdf5={has_hdf5}, hdf={has_hdf})")
        missing_data.append(flight)
        continue

    print(f"OK: {flight}: All files present")

if not missing_data:
    print("\nOK: All data verified successfully!")
else:
    print(f"\nWARNING: {len(missing_data)} flight(s) missing data")
    print("Training will proceed with available flights only.")

## Diagnostics (Determine if Task is Learnable)

In [None]:
# ============================================================================
# DIAGNOSTIC 1: Correlation Analysis (30 min)
# ============================================================================
# This checks if ANY features correlate with optical depth
# If no correlations, the task may not be learnable from this data
%cd /content/repo

print("Running Correlation Analysis...")
print("This will extract 28 hand-crafted features and compute correlations")
print("Expected runtime: ~30 minutes")
print("="*80)

!python diagnostics/1_correlation_analysis.py

print("\n" + "="*80)
print("CORRELATION ANALYSIS COMPLETE")
print("="*80)
print("Check results in diagnostics/results/correlation_summary.json")
print("If max r² < 0.05, the task may not be learnable from this data")

In [None]:
# ============================================================================
# DIAGNOSTIC 2: Simple Baseline Models (1 hour)
# ============================================================================
# This tests if Ridge, Random Forest, etc. can beat mean baseline
# If simple models get R² > 0, deep learning should work too
%cd /content/repo

print("Running Simple Baseline Models...")
print("This will test 7 classical ML models on hand-crafted features")
print("Expected runtime: ~1 hour")
print("="*80)

!python diagnostics/2_simple_baselines.py

print("\n" + "="*80)
print("BASELINE MODELS COMPLETE")
print("="*80)
print("Check results in diagnostics/results/baseline_summary.json")
print("If best R² > 0, signal exists - proceed to neural network training")
print("If best R² < 0, no model can learn - need different data/features")

## Training (Run After Diagnostics)

In [None]:
# ============================================================================
# PHASE 1 EXPERIMENT: VARIANCE COLLAPSE FIX (SHORT VALIDATION RUN)
# ============================================================================

# STEP 1: Pull latest code
print("Pulling latest code from main branch...")
%cd /content/repo
!git pull origin main
print("OK: Code updated\n")

# STEP 2: Start Phase 1 experiment
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_name = f"phase1_variance_fix_{timestamp}"

print("\n" + "="*80)
print("PHASE 1: VARIANCE COLLAPSE FIX EXPERIMENT")
print("="*80)
print(f"Experiment ID: {experiment_name}")
print(f"Config: phase_1_variance_fix.yaml")
print(f"\nEXPERIMENT OBJECTIVE:")
print(f"  Validate that variance loss can prevent model collapse.")
print(f"  This is a SHORT run (10 epochs) to test the fix.")
print(f"\nKEY SETTINGS:")
print(f"  - Epochs: 10 (validation run only)")
print(f"  - variance_lambda: 0.5 (Phase 1b: reduced from 2.0 to prevent loss explosion)")
print(f"  - min_variance_ratio: 0.1 (safety net - stops if variance collapses)")
print(f"\nPHASE 1B IMPROVEMENTS:")
print(f"  - Reduced variance_lambda to prevent loss explosion")
print(f"  - Fixed R² calculation to use unscaled (km) values, not z-scores")
print(f"  - Added debug logging for prediction ranges")
print(f"\nBASELINE PERFORMANCE (from diagnostics):")
print(f"  - GradientBoosting: R² = 0.777 (simple model on hand-crafted features)")
print(f"  - Previous NN runs: R² < 0 (all negative - model collapsed)")
print(f"\nEXPECTED OUTCOME:")
print(f"  - Loss values should be reasonable (~0.5-2.0, not thousands)")
print(f"  - R² should be calculated on real km values (not z-scores)")
print(f"  - If fix works: R² should be POSITIVE and INCREASING over epochs")
print(f"  - Target for this short run: R² > 0.1")
print(f"\nExpected Runtime: ~15-20 minutes")
print(f"Expected GPU Usage: ~10-12GB")
print("="*80)
print("\nTraining started... Monitor GPU with: !nvidia-smi\n")

%cd /content/repo
!python main.py \
    --config configs/phase_1_variance_fix.yaml \
    --save_name {experiment_name} \
    --epochs 10

print("\n" + "="*80)
print("PHASE 1 EXPERIMENT COMPLETE!")
print("="*80)
print(f"\nNEXT STEPS:")
print(f"  1. Check the DEBUG output showing scaled/unscaled prediction ranges")
print(f"  2. Verify loss values are reasonable (~0.5-2.0, not thousands)")
print(f"  3. Check if R² is positive and increasing")
print(f"  4. If R² > 0.1: GREAT! Scale up to 50 epochs with hyperparameter tuning")
print(f"  5. If R² still negative: Try Phase 2 (simpler architecture without attention)")
print(f"\nModel saved to: /content/drive/MyDrive/CloudML/models/trained/{experiment_name}.pth")
print(f"Results saved to: /content/drive/MyDrive/CloudML/plots/")
print(f"Logs saved to: /content/drive/MyDrive/CloudML/logs/")
print("\nCheck TensorBoard: %load_ext tensorboard")
print("                   %tensorboard --logdir /content/drive/MyDrive/CloudML/logs/tensorboard/")

In [None]:
# ============================================================================
# PHASE 2 EXPERIMENT: SIMPLE CNN BASELINE (NO ATTENTION)
# ============================================================================

# STEP 1: Pull latest code
print("Pulling latest code from main branch...")
%cd /content/repo
!git pull origin main
print("OK: Code updated\n")

# STEP 2: Start Phase 2 experiment
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_name = f"phase2_simple_cnn_{timestamp}"

print("\n" + "="*80)
print("PHASE 2: SIMPLE CNN BASELINE EXPERIMENT")
print("="*80)
print(f"Experiment ID: {experiment_name}")
print(f"Config: phase_2_simple_cnn.yaml")
print(f"\nEXPERIMENT OBJECTIVE:")
print(f"  Test if the complex transformer/attention architecture is preventing learning.")
print(f"  Use a simple CNN baseline to establish if the task is learnable at all.")
print(f"\nKEY SETTINGS:")
print(f"  - Architecture: SimpleCNNModel (no attention, no transformer)")
print(f"  - Epochs: 10 (validation run only)")
print(f"  - variance_lambda: 0.5 (keep variance loss)")
print(f"  - min_variance_ratio: 0.1 (safety net)")
print(f"\nPHASE 1 RESULTS:")
print(f"  - Variance ratio: GOOD (88-111% - not collapsed)")
print(f"  - R²: BAD (still negative: -0.89 to -1.16)")
print(f"  - Loss: BAD (600-900, should be <2.0)")
print(f"  - Predictions: OFF (some negative values, wrong scale)")
print(f"\nHYPOTHESIS:")
print(f"  Complex architecture cannot learn. Try simpler CNN first.")
print(f"\nEXPECTED OUTCOME:")
print(f"  - Loss should drop significantly (< 2.0 for scaled data)")
print(f"  - R² should be POSITIVE and INCREASING")
print(f"  - Predictions should be in valid range (0.1-2.0 km, no negatives)")
print(f"\nExpected Runtime: ~15-20 minutes")
print(f"Expected GPU Usage: ~8-10GB (less than Phase 1)")
print("="*80)
print("\nTraining started... Monitor GPU with: !nvidia-smi\n")

%cd /content/repo
!python main.py \
    --config configs/phase_2_simple_cnn.yaml \
    --save_name {experiment_name} \
    --epochs 10

print("\n" + "="*80)
print("PHASE 2 EXPERIMENT COMPLETE!")
print("="*80)
print(f"\nNEXT STEPS:")
print(f"  1. Check the DEBUG output and loss values")
print(f"  2. If R² is positive: GREAT! Simple CNN works.")
print(f"     -> Gradually add complexity (spatial attention first, then temporal)")
print(f"  3. If R² still negative: Problem is deeper.")
print(f"     -> Check data pipeline, preprocessing, or data quality")
print(f"\nModel saved to: /content/drive/MyDrive/CloudML/models/trained/{experiment_name}.pth")
print(f"Results saved to: /content/drive/MyDrive/CloudML/plots/")
print(f"Logs saved to: /content/drive/MyDrive/CloudML/logs/")
print("\nCheck TensorBoard: %load_ext tensorboard")
print("                   %tensorboard --logdir /content/drive/MyDrive/CloudML/logs/tensorboard/")

In [None]:
# ============================================================================
# OPTION A: FULL MODEL + OPTIMIZATIONS (ORIGINAL)
# ============================================================================
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
baseline_name = f"baseline_full_{timestamp}"

print("="*80)
print("TRAINING FULL MODEL WITH OPTIMIZATIONS")
print("="*80)
print(f"Experiment ID: {baseline_name}")
print(f"Config: colab_optimized_full.yaml")
print(f"Model: 64/128/256 channels (FULL)")
print(f"Optimizations: Gradient Checkpointing + torch.compile('default' mode)")
print(f"Expected Runtime: 2-2.5 hours (faster with compile)")
print(f"Expected GPU Usage: ~9-10GB (batch_size=20)")
print(f"CUDA Graph Fix: Using 'default' compile mode (compatible with checkpointing)")
print("="*80)
print("\nTraining started... Monitor GPU with: !nvidia-smi\n")

!python main.py \
    --config configs/colab_optimized_full.yaml \
    --save_name {baseline_name} \
    --epochs 50

print("\n" + "="*80)
print("FULL MODEL TRAINING COMPLETE!")
print("="*80)
print(f"Model saved to: /content/drive/MyDrive/CloudML/models/trained/{baseline_name}.pth")
print(f"Results saved to: /content/drive/MyDrive/CloudML/logs/")
print(f"Logs saved to: /content/drive/MyDrive/CloudML/plots/")
print("\nCheck TensorBoard: %load_ext tensorboard")
print("                   %tensorboard --logdir /content/drive/MyDrive/CloudML/logs/tensorboard/")

In [None]:
# ============================================================================
# OPTION B: FULL MODEL - MAXIMUM STABILITY (NEW - NO torch.compile)
# ============================================================================
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
baseline_name = f"baseline_full_stable_{timestamp}"

print("="*80)
print("TRAINING FULL MODEL - STABLE MODE")
print("="*80)
print(f"Experiment ID: {baseline_name}")
print(f"Config: colab_full_stable.yaml")
print(f"Model: 64/128/256 channels (FULL)")
print(f"Optimizations: Gradient Checkpointing only (NO torch.compile)")
print(f"Expected Runtime: ~3 hours (no compile speedup)")
print(f"Expected GPU Usage: ~8-9GB (batch_size=16)")
print(f"Stability: MAXIMUM (no CUDA graph issues)")
print("="*80)
print("\nTraining started... Monitor GPU with: !nvidia-smi\n")

!python main.py \
    --config configs/colab_full_stable.yaml \
    --save_name {baseline_name} \
    --epochs 50

print("\n" + "="*80)
print("FULL MODEL STABLE TRAINING COMPLETE!")
print("="*80)
print(f"Model saved to: /content/drive/MyDrive/CloudML/models/trained/{baseline_name}.pth")
print(f"Results saved to: /content/drive/MyDrive/CloudML/plots/")
print(f"Logs saved to: /content/drive/MyDrive/CloudML/logs/")
print("\nCheck TensorBoard: %load_ext tensorboard")
print("                   %tensorboard --logdir /content/drive/MyDrive/CloudML/logs/tensorboard/")

In [None]:
# ============================================================================
# OPTION C: MEMORY-OPTIMIZED MODEL (FALLBACK IF OOM)
# ============================================================================
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
baseline_name = f"baseline_memopt_{timestamp}"

print("="*80)
print("TRAINING MEMORY-OPTIMIZED MODEL")
print("="*80)
print(f"Experiment ID: {baseline_name}")
print(f"Config: colab_optimized.yaml")
print(f"Model: 32/64/128 channels (memory-optimized)")
print(f"Expected Runtime: 2.5-3 hours")
print(f"Expected GPU Usage: ~7-8GB (batch_size=16)")
print("="*80)
print("\nTraining started... Monitor GPU with: !nvidia-smi\n")

!python main.py \
    --config configs/colab_optimized.yaml \
    --save_name {baseline_name} \
    --epochs 50

print("\n" + "="*80)
print("MEMORY-OPTIMIZED TRAINING COMPLETE!")
print("="*80)
print(f"Model saved to: /content/drive/MyDrive/CloudML/models/trained/{baseline_name}.pth")
print(f"Results saved to: /content/drive/MyDrive/CloudML/plots/")
print(f"Logs saved to: /content/drive/MyDrive/CloudML/logs/")

## Utilities & Analysis

In [None]:
# ============================================================================
# AGGREGATE RESULTS FOR PAPER
# ============================================================================
# Concatenates every CSV under logs/csv/ into one file and prints the
# per-experiment mean of the headline metrics.
import pandas as pd
import glob
import os

# Metric columns averaged in the summary table.
METRIC_COLUMNS = ('mae', 'rmse', 'r2')


def csv_experiment_name(csv_path):
    """Return the experiment label for a result file: its name without the extension.

    Only the final suffix is stripped, so a name such as ``run.csv_v2.csv``
    keeps its inner ``.csv`` instead of losing every occurrence of it.
    """
    return os.path.splitext(os.path.basename(csv_path))[0]


def summarize_metrics(results, metrics=METRIC_COLUMNS):
    """Return the per-experiment mean of each metric column present in *results*.

    Args:
        results: DataFrame containing an ``experiment`` column.
        metrics: Column names to average. Names missing from *results* are
            skipped rather than raising ``KeyError``.

    Returns:
        DataFrame indexed by experiment with values rounded to 4 decimals,
        or an empty DataFrame when none of the metric columns exist.
    """
    available = [name for name in metrics if name in results.columns]
    if not available:
        return pd.DataFrame()
    return results.groupby('experiment')[available].mean().round(4)


print("Aggregating results...\n")

# Find all CSV result files (sorted so the combined row order is reproducible)
results_dir = '/content/drive/MyDrive/CloudML/logs/csv/'
csv_files = sorted(glob.glob(os.path.join(results_dir, '*.csv')))

if csv_files:
    all_results = []

    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        df['experiment'] = csv_experiment_name(csv_file)
        all_results.append(df)

    # Combine all results
    combined = pd.concat(all_results, ignore_index=True)

    # Save combined results (outside results_dir, so a re-run does not read it back in)
    output_path = '/content/drive/MyDrive/CloudML/all_results_combined.csv'
    combined.to_csv(output_path, index=False)

    print(f"OK: Combined {len(csv_files)} result files")
    print(f"OK: Saved to: {output_path}")
    print("\nSummary Statistics by Experiment:")
    print("="*80)

    # Group by experiment and show key metrics
    summary = summarize_metrics(combined)

    if summary.empty:
        print(f"WARNING: None of the metric columns {list(METRIC_COLUMNS)} were found in the result files.")
    else:
        print(summary)
        print("\nOK: Use this table for your paper!")

else:
    print("No result files found. Make sure experiments have completed.")

In [None]:
# ============================================================================
# GPU MONITORING (Run in parallel with training)
# ============================================================================
import time
from IPython.display import clear_output

def monitor_gpu(duration=300, interval=5):
    """Monitor GPU usage for specified duration"""
    for i in range(duration // interval):
        clear_output(wait=True)
        print(f"GPU Monitoring (updating every {interval}s, {i*interval}/{duration}s elapsed)\n")
        !nvidia-smi --query-gpu=timestamp,memory.used,memory.total,utilization.gpu,temperature.gpu --format=csv
        time.sleep(interval)

# Run for 5 minutes
monitor_gpu(duration=300, interval=10)

In [None]:
# ============================================================================
# VERIFY TENSORBOARD LOGS EXIST
# ============================================================================
# Lists each run folder under the TensorBoard log directory together with the
# number of event files it holds, or explains what to check when none exist.
import os

tb_dir = "/content/drive/MyDrive/CloudML/logs/tensorboard/"

if not os.path.exists(tb_dir):
    print(f"Not found: TensorBoard directory not found: {tb_dir}")
    print("  Make sure:")
    print("  1. You've run a training session")
    print("  2. Files are saving to Google Drive (check config paths)")
    print("  3. Google Drive is mounted")
else:
    # Only sub-directories count as runs.
    runs = [d for d in os.listdir(tb_dir) if os.path.isdir(os.path.join(tb_dir, d))]
    if not runs:
        print(f"Not found: TensorBoard directory exists but is empty: {tb_dir}")
        print("  Run a training session first.")
    else:
        print(f"OK: Found {len(runs)} TensorBoard run(s):")
        for run in sorted(runs):
            # Count entries whose name contains the event-file marker.
            event_count = sum(
                'events.out.tfevents' in entry
                for entry in os.listdir(os.path.join(tb_dir, run))
            )
            print(f"  - {run}: {event_count} event file(s)")
        print("\nOK: Ready to launch TensorBoard!")

In [None]:
# ============================================================================
# TENSORBOARD (View training curves)
# ============================================================================
# Run this cell to launch TensorBoard in the notebook.
# It loads the tensorboard extension and points it at the log directory on
# Drive that the verification cell above inspects.
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/CloudML/logs/tensorboard/

# If you see "No dashboards are active", it means:
# 1. No training runs have been completed yet, OR
# 2. Files are not saving to Drive (check verification cell above)

In [None]:
# ============================================================================
# LIST ALL TRAINED MODELS
# ============================================================================
# Prints one row per entry in the trained-models folder: name, size in MB and
# last-modified time.
import os
from datetime import datetime

models_dir = '/content/drive/MyDrive/CloudML/models/trained/'

if not os.path.exists(models_dir):
    print("Models directory not found.")
else:
    models = sorted(os.listdir(models_dir))
    rule = "=" * 100

    print(f"\nTrained Models ({len(models)} total)")
    print(rule)
    print(f"{'Model Name':<60} {'Size (MB)':<15} {'Modified':<20}")
    print(rule)

    for model in models:
        # One stat call supplies both the size and the modification time.
        info = os.stat(os.path.join(models_dir, model))
        size_mb = info.st_size / (1024 * 1024)
        modified = datetime.fromtimestamp(info.st_mtime).strftime('%Y-%m-%d %H:%M')
        print(f"{model:<60} {size_mb:>10.1f} MB   {modified}")

    print(rule)

In [None]:
# ============================================================================
# DOWNLOAD RESULTS (Optional - already in Drive)
# ============================================================================
# Packs plots, CSV logs, trained models and top-level JSON/CSV files from the
# CloudML folder into results_export.zip and starts a browser download.
from google.colab import files

# Zip and download results
print("Zipping results for download...")

# The archive is written inside the CloudML folder (the shell changes into it
# first), which is the same path handed to files.download below.
# zip's stderr is discarded (2>/dev/null).
!cd /content/drive/MyDrive/CloudML && \
    zip -r results_export.zip plots/ logs/csv/ models/trained/ *.json *.csv 2>/dev/null

# These messages are printed unconditionally; zip's exit status is not checked.
print("\nOK: Results zipped")
print("Downloading... (this may take a few minutes)")

files.download('/content/drive/MyDrive/CloudML/results_export.zip')

print("\nOK: Download complete!")