# QAGNN Phase 2: Deep Learning Model Training
## 🎯 Target: R² > 0.92 accuracy prediction

This notebook trains a neural network to predict genetic circuit accuracy from design parameters.

## 1. Setup and Verification

In [None]:
# Import libraries
import sys
sys.path.append('..')

import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
from pathlib import Path

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Setup
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory Free: {torch.cuda.mem_get_info()[0]/1e9:.1f} GB")

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

## 2. Verify Phase 1 Data

In [None]:
def verify_phase1_data(data_dir='../data/processed') -> bool:
    """Check that the Phase 1 HDF5 splits exist and print summary statistics.

    For each of the ``train``, ``val`` and ``test`` splits this opens
    ``<data_dir>/<split>_dataset.h5``, reads the ``X`` and ``y`` datasets into
    memory and prints the shape of ``X`` and the range of ``y``. Afterwards it
    prints feature/label statistics for the training split.

    Args:
        data_dir: Directory containing the ``*_dataset.h5`` files. Defaults to
            the location the notebook originally hard-coded.

    Returns:
        True when every split was found and is non-empty; False as soon as a
        split is missing or empty (later splits are then not checked).
    """
    print("üîç Verifying Phase 1 data...")

    data_dir = Path(data_dir)
    train_arrays = None
    for name in ('train', 'val', 'test'):
        path = data_dir / f'{name}_dataset.h5'
        if not path.exists():
            print(f"‚ùå {name}: Missing!")
            return False

        with h5py.File(str(path), 'r') as f:
            X = f['X'][:]
            y = f['y'][:]

        # min()/max() raise on zero-size arrays, so report empties explicitly
        if X.size == 0 or y.size == 0:
            print(f"‚ùå {name}: Empty dataset!")
            return False

        print(f"‚úÖ {name}: {X.shape} samples, y-range: {y.min():.3f}-{y.max():.3f}")

        # Keep the training arrays so the file is not read a second time below
        if name == 'train':
            train_arrays = (X, y)

    X_train, y_train = train_arrays

    print("\nüìä Data statistics:")
    print(f"   Features per sample: {X_train.shape[1]}")
    print(f"   Feature range: {X_train.min():.3f} to {X_train.max():.3f}")
    print(f"   Label range: {y_train.min():.3f} to {y_train.max():.3f}")
    print(f"   Feature mean: {X_train.mean():.3f}, std: {X_train.std():.3f}")
    print(f"   Label mean: {y_train.mean():.3f}, std: {y_train.std():.3f}")

    return True

# Run the Phase 1 check and report the outcome (failure branch first)
if not verify_phase1_data():
    print("\n‚ùå Phase 1 data verification failed!")
    print("   Restore from backup: ~/project_backups/qagnn_phase1_complete_*/")
else:
    print("\n‚úÖ Phase 1 data verified successfully!")

## 3. Load and Explore Data

In [None]:
# Load data
from src.ai.data_loader import load_datasets

train_loader, val_loader, test_loader, (X_test, y_test) = load_datasets(batch_size=32)

# Get one batch.
# Later cells reuse X_batch / y_batch, so fail here with a clear message
# instead of leaving them undefined when the loader yields nothing.
first_batch = next(iter(train_loader), None)
if first_batch is None:
    raise RuntimeError("train_loader yielded no batches; check the Phase 1 training data.")
X_batch, y_batch = first_batch

print(f"First batch shape: X={X_batch.shape}, y={y_batch.shape}")
print("Batch statistics:")
print(f"  X range: {X_batch.min():.3f} to {X_batch.max():.3f}")
print(f"  y range: {y_batch.min():.3f} to {y_batch.max():.3f}")

## 4. Test Model Architecture

In [None]:
from src.ai.model import CircuitPredictor, test_model

# Test model
model = test_model()

# Test forward pass with actual data
X_sample = X_batch[:8]  # First 8 samples (fewer if the batch is smaller)
y_sample = y_batch[:8]

# Inference only: no gradients needed for a smoke test
with torch.no_grad():
    predictions = model(X_sample)

print("\nüß™ Sample predictions:")
# Never index past the end of a short batch
n_shown = min(4, len(y_sample), len(predictions))
for i in range(n_shown):
    # .item() yields a plain float for both 0-dim and single-element tensors,
    # so the ':.3f' format spec works regardless of the label's trailing shape
    actual = y_sample[i].item()
    predicted = predictions[i].item()
    print(f"  Sample {i}: Actual={actual:.3f}, Predicted={predicted:.3f}, Error={abs(actual - predicted):.3f}")

## 5. Train the Model

**Options:**
1. **Run full training** (50 epochs, ~2-3 hours; commented out in the next cell — uncomment to run)
2. **Quick test** (5 epochs, ~15 minutes; runs by default in the next cell)
3. **Load pre-trained** (if already trained)

In [None]:
# OPTION 1: Full training (50 epochs, ~2-3 hours)
# Uncomment the three lines below to run
# from src.ai.train import Trainer
# trainer = Trainer()
# best_r2 = trainer.train()

# OPTION 2: Quick test (5 epochs, ~15 minutes)
from src.ai.train import Trainer

# Settings handed to Trainer for the short run
quick_config = dict(
    epochs=5,
    batch_size=32,
    learning_rate=0.001,
    weight_decay=0.0001,
    dropout_rate=0.2,
    patience=3,
    early_stopping_patience=5,
    checkpoint_frequency=2,
)

print("üöÄ Starting quick training (5 epochs)...")
trainer = Trainer(quick_config)
best_r2 = trainer.train()

## 6. Evaluate Trained Model

In [None]:
# Evaluate model
from src.ai.evaluate import ModelEvaluator

# Only run the evaluator when the model file is present on disk
model_path = Path("../models/final/circuit_predictor_latest.pt")
if not model_path.exists():
    print(f"‚ùå No trained model found at {model_path}")
    print("   Run the training cell above first.")
else:
    print(f"‚úÖ Found trained model: {model_path}")
    evaluator = ModelEvaluator()
    evaluator.main()

## 7. Analyze Training Results

In [None]:
# Load training history
import pandas as pd

TARGET_R2 = 0.92  # validation R² the model has to exceed


def summarize_history(history_df, target=TARGET_R2):
    """Summarise the best validation R² in a training history.

    Args:
        history_df: DataFrame with a ``val_r2`` column containing at least one
            non-NaN value (assumed to hold one row per epoch — confirm against
            the trainer's logger).
        target: Threshold that the best R² must strictly exceed.

    Returns:
        tuple: ``(best_r2, best_epoch, achieved)``. ``best_epoch`` is the index
        label of the best row plus one, i.e. 1-based for the default
        ``RangeIndex`` produced by ``pd.read_csv``.
    """
    best_row = history_df['val_r2'].idxmax()
    best_r2 = float(history_df['val_r2'].max())
    return best_r2, int(best_row) + 1, bool(best_r2 > target)


history_path = Path("../results/logs/phase2/training_history.csv")
if not history_path.exists():
    print("‚ùå No training history found. Train the model first.")
else:
    history_df = pd.read_csv(history_path)

    if history_df.empty or history_df['val_r2'].isna().all():
        # idxmax() cannot pick a best epoch from an empty / all-NaN column
        print("‚ùå Training history contains no validation scores yet. Train the model first.")
    else:
        print("üìä Training History:")
        print(history_df.tail())  # Last few epochs

        best_r2, best_epoch, achieved = summarize_history(history_df)

        # Resolve the status text first: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        status = "ACHIEVED" if achieved else "NOT ACHIEVED"

        print(f"\nüèÜ Best Validation R¬≤: {best_r2:.4f} at epoch {best_epoch}")
        print(f"üéØ Target: R¬≤ > 0.92")
        print(f"‚úÖ Status: {status}")

        # Plot progress: loss on the left, R² against the target on the right
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))

        axes[0].plot(history_df['epoch'], history_df['val_loss'], 'b-', label='Validation Loss')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss (MSE)')
        axes[0].set_title('Validation Loss over Time')
        axes[0].grid(True, alpha=0.3)
        axes[0].legend()  # the curve carries a label, so show it like the R² panel does

        axes[1].plot(history_df['epoch'], history_df['val_r2'], 'g-', label='Validation R¬≤')
        axes[1].axhline(y=TARGET_R2, color='r', linestyle='--', label='Target (0.92)')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('R¬≤ Score')
        axes[1].set_title('Validation R¬≤ over Time')
        axes[1].grid(True, alpha=0.3)
        axes[1].legend()

        plt.tight_layout()
        plt.show()

## 8. Create Backup

In [None]:
# Create backup of Phase 2 progress
import shutil
from datetime import datetime

def create_phase2_backup(backup_root="../../project_backups"):
    """Copy the Phase 2 artefacts into a timestamped backup directory.

    Each source is copied to ``<backup>/<path below the project root>`` (for
    example ``../results/logs/phase2`` goes to ``<backup>/results/logs/phase2``).
    Keeping the relative path matters because three of the sources are all
    named ``phase2``; copying by bare directory name would merge them into one
    folder and let same-named files overwrite each other.

    Args:
        backup_root: Directory in which the timestamped backup folder is
            created. Defaults to the location the notebook originally used.

    Returns:
        pathlib.Path: The backup directory that was created.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    backup_name = f"qagnn_phase2_progress_{timestamp}"
    backup_path = Path(backup_root) / backup_name

    # Create backup directory
    backup_path.mkdir(parents=True, exist_ok=True)

    # Copy important files (paths are relative to the notebook's directory)
    directories_to_backup = [
        "../models",
        "../results/figures/process/phase2",
        "../results/logs/phase2",
        "../results/tables",
        "../data/processed/phase2",
        "../src/ai"
    ]

    print(f"üíæ Creating backup: {backup_path}")

    for dir_path in directories_to_backup:
        source = Path(dir_path)
        if not source.exists():
            print(f"  ‚ö†Ô∏è  {source}: Not found")
            continue

        # Drop the leading "../" but keep the rest of the path so that
        # identically named leaf directories land in distinct destinations.
        rel_dest = Path(*[part for part in source.parts if part not in ('..', '.')])
        dest = backup_path / rel_dest

        if source.is_dir():
            shutil.copytree(source, dest, dirs_exist_ok=True)
        else:
            # copy2 does not create missing parent directories
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source, dest)
        print(f"  ‚úÖ {rel_dest.as_posix()}: Copied")

    # Also copy this notebook
    notebook_path = Path("./02_deep_learning.ipynb")
    if notebook_path.exists():
        shutil.copy2(notebook_path, backup_path / "02_deep_learning.ipynb")
        print(f"  ‚úÖ Notebook: Copied")

    print(f"\n‚úÖ Backup complete: {backup_path}")
    return backup_path

# Create backup (optional)
# backup_path = create_phase2_backup()

## 9. Next Steps

In [None]:
print("üéØ NEXT STEPS FOR PHASE 2:")
print("=" * 40)
print("""
If R¬≤ > 0.92 achieved:
  1. ‚úÖ Run full 50-epoch training
  2. ‚úÖ Complete all evaluations
  3. ‚úÖ Generate all visualizations
  4. ‚úÖ Create backup
  5. üöÄ Proceed to Phase 3: Quantum Optimization

If R¬≤ < 0.92:
  1. üîß Adjust model architecture
  2. üìà Train for more epochs
  3. üéõÔ∏è  Tune hyperparameters
  4. üîç Check data quality
  5. üîÑ Re-train with improvements
""")

# Report where the project currently stands, based on the saved history file
history_path = Path("../results/logs/phase2/training_history.csv")
if not history_path.exists():
    print("\nüîß No training completed yet. Run training cell above.")
else:
    history_df = pd.read_csv(history_path)
    best_r2 = history_df['val_r2'].max()

    print(f"\nüìä CURRENT STATUS:")
    print(f"   Best R¬≤: {best_r2:.4f}")
    print(f"   Target: >0.92")
    print(f"   Gap: {0.92 - best_r2:.4f}")

    # One verdict line: ready only when the best score beats the target
    print(
        "\n‚úÖ READY FOR PHASE 3!"
        if best_r2 > 0.92
        else "\n‚ö†Ô∏è  NEEDS IMPROVEMENT - Adjust and re-train."
    )