## 1️⃣ Install Dependencies

In [None]:
# Install required packages
!pip install torch torchvision opencv-python-headless pillow pandas scikit-learn matplotlib seaborn tqdm -q

print("‚úÖ Dependencies installed")

# Check GPU availability
import torch
print(f"\nGPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## 2️⃣ Setup Project Structure

In [None]:
# Create the working folders that later cells read from and write to
import os

# exist_ok=True keeps this cell safe to re-run
for folder in ('src', 'weights', 'outputs', 'cache'):
    os.makedirs(folder, exist_ok=True)

print("‚úÖ Project structure created")
print("\nNext: Upload the source code files to Colab or mount Google Drive")

## 3️⃣ Mount Google Drive (Optional)

If your dataset and code are in Google Drive:

In [None]:
# Attach Google Drive to the Colab filesystem
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# To work from a project folder stored in Drive, uncomment:
# %cd /content/drive/MyDrive/aero-gauge

print("‚úÖ Google Drive mounted")

## 4️⃣ Upload or Download Dataset

### Option A: Upload from local

In [None]:
# Upload a dataset zip file from the local machine and extract it into the working directory
from google.colab import files
import zipfile

# Set to True to run the upload; left False so that "Run All" does not stop here
UPLOAD_DATASET = False

if UPLOAD_DATASET:
    uploaded = files.upload()

    # Unzip every uploaded archive (extension matched case-insensitively)
    for filename in uploaded.keys():
        if filename.lower().endswith('.zip'):
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall('.')
            print(f"Extracted {filename}")

    print("Upload completed")
else:
    print("Upload skipped (set UPLOAD_DATASET = True to upload a dataset zip)")

### Option B: Download from Kaggle

In [None]:
# Install Kaggle
!pip install kaggle -q

# Upload your kaggle.json file first
# Or manually create it:
# from google.colab import files
# files.upload()  # Upload kaggle.json

# Setup Kaggle credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d deadcardassian/pm25vision -p dataset/ --unzip

print("‚úÖ Dataset downloaded from Kaggle")

## 5️⃣ Verify Dataset Structure

In [None]:
import pandas as pd
import os

# Check dataset structure
print("Dataset structure:")
!ls -lh dataset/

# Check metadata
if os.path.exists('dataset/train/metadata.csv'):
    train_df = pd.read_csv('dataset/train/metadata.csv')
    print(f"\nTrain samples: {len(train_df)}")
    print("\nFirst few rows:")
    print(train_df.head())
    print("\nColumns:", train_df.columns.tolist())
    
    # Check PM2.5 distribution
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    train_df['pm2_5'].hist(bins=30)
    plt.xlabel('PM2.5 (¬µg/m¬≥)')
    plt.ylabel('Count')
    plt.title('PM2.5 Distribution')
    
    plt.subplot(1, 2, 2)
    train_df['pm2_5'].plot(kind='box')
    plt.ylabel('PM2.5 (¬µg/m¬≥)')
    plt.title('PM2.5 Box Plot')
    plt.tight_layout()
    plt.show()
else:
    print("‚ö†Ô∏è metadata.csv not found. Please check dataset path.")

## 6️⃣ Upload Source Code Files

Upload the following files from your local `src/` folder:

In [None]:
# Option 1: Upload files manually
from google.colab import files

print("Upload these files to 'src/' folder:")
print("- features.py")
print("- dataset.py")
print("- model.py")
print("- train.py")
print("- utils.py")
print("- inference.py")

# Uncomment to upload:
# uploaded = files.upload()
# for filename in uploaded.keys():
#     !mv {filename} src/

# Option 2: If in Google Drive, copy from there
# !cp /content/drive/MyDrive/aero-gauge/src/*.py src/

print("\n‚úÖ Source files ready")
!ls src/

## 7️⃣ Test Feature Extraction

Quick test to ensure physics features work:

In [None]:
import sys

# Make the modules in 'src/' importable; guarded so that re-running the cell
# does not keep prepending duplicate entries
if 'src' not in sys.path:
    sys.path.insert(0, 'src')

from features import extract_all_features
import numpy as np

# Test with a dummy image drawn from a seeded generator so the input is identical on every run.
# Pixel values fall in [100, 200), the same range as before.
rng = np.random.default_rng(0)
test_img = rng.integers(100, 200, (224, 224, 3), dtype=np.uint8)
features = extract_all_features(test_img)

print(f"✅ Extracted {len(features)} physics features")
print("\nSample features:")
# Show only the first five entries
for key, value in list(features.items())[:5]:
    print(f"  {key}: {value:.4f}")

## 8️⃣ Start Training

### Training Configuration

In [None]:
# Hyperparameters passed to the training command in the next cell
BATCH_SIZE = 32  # Increase if GPU memory allows
EPOCHS = 30
LEARNING_RATE = 1e-4
NUM_WORKERS = 2

# Echo the settings together with the device that is available
_device_label = 'GPU' if torch.cuda.is_available() else 'CPU'
_config_report = (
    ("Batch size", BATCH_SIZE),
    ("Epochs", EPOCHS),
    ("Learning rate", LEARNING_RATE),
    ("Device", _device_label),
)

print("Training configuration:")
for _label, _setting in _config_report:
    print(f"  {_label}: {_setting}")

### Run Training

In [None]:
# Run training script
!python src/train.py \
    --data_dir ./dataset \
    --checkpoint_dir ./weights \
    --output_dir ./outputs \
    --cache_dir ./cache \
    --batch_size {BATCH_SIZE} \
    --epochs {EPOCHS} \
    --learning_rate {LEARNING_RATE} \
    --num_workers {NUM_WORKERS} \
    --patience 5 \
    --freeze_backbone

print("\n‚úÖ Training completed!")

## 9️⃣ View Training Results

In [None]:
import json
import os
from PIL import Image
import matplotlib.pyplot as plt

SUMMARY_PATH = 'outputs/training_summary.json'
RESULTS_PLOT_PATH = 'outputs/training_results.png'
BEST_MODEL_PATH = 'weights/best_model.pt'

# Load training summary (guarded: the file is absent when training did not finish)
if os.path.exists(SUMMARY_PATH):
    with open(SUMMARY_PATH, 'r') as f:
        summary = json.load(f)

    print("=" * 60)
    print("TRAINING SUMMARY")
    print("=" * 60)
    print(f"\nBest Validation MAE: {summary['best_val_mae']:.2f} µg/m³")
    print("\nTest Metrics:")
    for key, value in summary['test_metrics'].items():
        print(f"  {key.upper()}: {value:.4f}")
else:
    print("⚠️ Training summary not found. Run the training cell first.")

# Display training curves
if os.path.exists(RESULTS_PLOT_PATH):
    # The context manager closes the image file once the figure has been rendered
    with Image.open(RESULTS_PLOT_PATH) as img:
        fig, ax = plt.subplots(figsize=(14, 10))
        ax.imshow(img)
        ax.axis('off')
        ax.set_title('Training Results', fontsize=16, fontweight='bold')
        plt.show()
else:
    print("\n⚠️ Results plot not found")

# Only claim the checkpoint exists when it is actually on disk
if os.path.exists(BEST_MODEL_PATH):
    print("\n✅ Model saved to: weights/best_model.pt")
else:
    print("\n⚠️ weights/best_model.pt not found")

## 🔟 Test Inference

In [None]:
import os

import torch
from inference import create_predictor
from PIL import Image
import matplotlib.pyplot as plt

TEST_IMAGE_DIR = 'dataset/test/images'
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
NUM_SAMPLE_IMAGES = 3

# Load predictor
device = 'cuda' if torch.cuda.is_available() else 'cpu'
predictor = create_predictor(
    checkpoint_dir='./weights',
    output_dir='./outputs',
    device=device
)

print("✅ Predictor loaded")

# Test on sample images from the test set.
# Sorting makes the selection deterministic (os.listdir order is arbitrary);
# extensions are matched case-insensitively.
if os.path.isdir(TEST_IMAGE_DIR):
    test_images = sorted(
        f for f in os.listdir(TEST_IMAGE_DIR) if f.lower().endswith(IMAGE_EXTENSIONS)
    )[:NUM_SAMPLE_IMAGES]
else:
    print(f"\n⚠️ {TEST_IMAGE_DIR} not found. Please check dataset path.")
    test_images = []

print(f"\nTesting on {len(test_images)} sample images...\n")

for img_file in test_images:
    img_path = os.path.join(TEST_IMAGE_DIR, img_file)
    # convert() returns an independent image, so the file handle can be closed right away
    with Image.open(img_path) as raw_image:
        image = raw_image.convert('RGB')

    # Predict
    result = predictor.predict(image)

    # Display the input next to the heatmap returned by the predictor
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    axes[0].imshow(image)
    axes[0].set_title(f"{img_file}", fontsize=12)
    axes[0].axis('off')

    axes[1].imshow(result['dark_channel_heatmap'])
    axes[1].set_title('Dark Channel Heatmap', fontsize=12)
    axes[1].axis('off')

    fig.suptitle(
        f"PM2.5: {result['pm25']:.1f} µg/m³ | AQI: {result['aqi_index']} ({result['aqi_category']})",
        fontsize=14,
        fontweight='bold'
    )
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

    print(f"\nTop features for {img_file}:")
    for feat in result['top_features'][:3]:
        print(f"  - {feat['name']}: {feat['raw_value']:.3f}")
    print("-" * 60)

print("\n✅ Inference test completed")

## 📥 Download Trained Model

Download the model and normalization stats to use locally or in the Streamlit app:

In [None]:
from google.colab import files
import shutil

# Create zips with the model and the stats.
# root_dir='.' with base_dir='weights' / 'outputs' means every archive entry keeps its
# 'weights/' or 'outputs/' prefix, so the zips must be extracted in the project root.
shutil.make_archive('aerogauge_model', 'zip', '.', 'weights')
shutil.make_archive('aerogauge_outputs', 'zip', '.', 'outputs')

# Download
files.download('aerogauge_model.zip')
files.download('aerogauge_outputs.zip')

print("✅ Model files downloaded")
print("\nExtract these files locally:")
print("  - aerogauge_model.zip → extract in your project root (creates the 'weights/' folder)")
print("  - aerogauge_outputs.zip → extract in your project root (creates the 'outputs/' folder)")

## 🎉 Next Steps

1. ✅ Training completed on Colab GPU
2. ✅ Model and stats downloaded
3. ✅ Ready to deploy Streamlit app

### To run the Streamlit app locally:

```bash
# Place downloaded files:
# - weights/best_model.pt
# - outputs/feature_normalization.json

# Run app
streamlit run src/app.py
```

### Performance Tips:

- **Improve accuracy**: Increase epochs, try different learning rates
- **Reduce overfitting**: Add more augmentation, increase dropout
- **Faster training**: Increase batch size (if GPU memory allows)
- **Better generalization**: Collect more diverse training data

---

**Happy training! 🚀**