# GPU Cluster: Parallel Training of PyTorch Deep Learning Models with Ray

This notebook trains **10 PyTorch MLP models** in parallel on a GPU cluster. Each model gets:
- A neural network architecture chosen from several candidate hidden-layer layouts
- Its own feature subset
- Bayesian hyperparameter optimization (Optuna TPE, 20 trials per model)

**Cluster Configuration**: Single node with 4 GPUs. Each training task reserves one GPU, so up to 4 models train at the same time.

**Note**: Run this notebook on the GPU cluster. Results for successfully trained models are appended to `ryuta.ray.model_training_results`.

## 1. Setup and Imports

In [None]:
# Core libraries
import numpy as np
import pandas as pd
from datetime import datetime
import json
import time

# Ray imports
import ray

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif, mutual_info_classif

# Optuna for Bayesian optimization
import optuna
from optuna.samplers import TPESampler

# Keep Optuna quiet: only warnings and above are logged
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Report the runtime environment this notebook is executing in
print("All imports successful!")
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
print("Notebook type: GPU CLUSTER")

## 2. Configuration

In [None]:
# Notebook-wide settings. Key order is kept stable because the dict is
# printed as JSON below.
CONFIG = dict(
    # Source data and destination for the per-model results
    table_name='ryuta.ray.synthetic_data',
    results_table='ryuta.ray.model_training_results',
    # Hold-out fraction and the seed shared by every random component
    test_size=0.2,
    random_state=42,
    # Number of Bayesian optimization trials run for each model
    n_trials_per_model=20,
    # Cluster configuration
    cluster_type='gpu',
    n_gpus=4,
    # How many models of each type to train (GPU models only)
    model_distribution={'pytorch_mlp': 10},
    # First model ID assigned in this notebook (GPU models: 90-99)
    model_id_start=90,
)

# Derived value: total number of models across all model types
CONFIG.update(n_models_total=sum(CONFIG['model_distribution'].values()))

print("Configuration loaded successfully!")
print(json.dumps(CONFIG, indent=2))

## 3. Load Data from Delta Table

In [None]:
# Read the source table through the notebook's Spark session
print(f"Loading data from {CONFIG['table_name']}...")
df_spark = spark.table(CONFIG['table_name'])
print(f"Total rows: {df_spark.count()}")

# Materialize the whole table on the driver as a pandas DataFrame
df = df_spark.toPandas()
print(f"Data loaded successfully! Shape: {df.shape}")

# Features are every column named feature_*; the target is the 'label' column
feature_columns = [name for name in df.columns if name.startswith('feature_')]
X, y = df[feature_columns].values, df['label'].values

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Class distribution: {np.bincount(y)}")

# Stratified hold-out split so train and test keep the same class balance
split_options = dict(
    test_size=CONFIG['test_size'],
    random_state=CONFIG['random_state'],
    stratify=y,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 4. Feature Subset Selection

In [None]:
def generate_feature_subsets(n_features, n_subsets):
    """
    Generate diverse feature subsets for GPU models.

    Subsets cycle through five strategies by position (``i % 5``):

    * ``all``          - every feature
    * ``random``       - a uniform random sample of features
    * ``top_f``        - the highest-ranked features by ``f_classif`` score
    * ``top_mi``       - the highest-ranked features by mutual information
    * ``f_random_mix`` - half top ``f_classif`` features, half random picks
      from the remaining features

    Feature scores are computed on the notebook-level ``X_train`` /
    ``y_train`` arrays, and the seed is derived from
    ``CONFIG['random_state']``.

    Args:
        n_features: Total number of available feature columns.
        n_subsets: Number of subsets to generate.

    Returns:
        A list of ``n_subsets`` dicts with keys ``feature_indices`` (sorted
        list of plain-int column indices), ``n_features`` (subset length) and
        ``strategy`` (one of the names above).
    """
    strategy_names = ['all', 'random', 'top_f', 'top_mi', 'f_random_mix']

    # Private generator with a different seed for GPU models. It yields the
    # same draw sequence as seeding the global NumPy RNG with this value,
    # without disturbing random state used elsewhere in the notebook.
    rng = np.random.RandomState(CONFIG['random_state'] + 100)

    # Rank features from most to least informative under each score
    f_scores, _ = f_classif(X_train, y_train)
    mi_scores = mutual_info_classif(X_train, y_train, random_state=CONFIG['random_state'])
    top_f_features = np.argsort(f_scores)[::-1]
    top_mi_features = np.argsort(mi_scores)[::-1]

    all_features = np.arange(n_features)
    # At least 30 features for deep learning, but never more than exist;
    # without the clamp randint raises when n_features < 30.
    min_subset_size = min(30, n_features)

    subsets = []
    for i in range(n_subsets):
        # Drawn for every subset (even 'all') so the random stream does not
        # depend on which strategy is in use.
        subset_size = rng.randint(min_subset_size, n_features + 1)
        strategy = i % len(strategy_names)

        if strategy == 0:
            chosen = all_features
        elif strategy == 1:
            chosen = rng.choice(n_features, subset_size, replace=False)
        elif strategy == 2:
            chosen = top_f_features[:subset_size]
        elif strategy == 3:
            chosen = top_mi_features[:subset_size]
        else:
            n_top = subset_size // 2
            top_features = top_f_features[:n_top]
            # Sorted indices not already taken by the top-F half
            remaining = np.setdiff1d(all_features, top_features)
            random_features = rng.choice(remaining, subset_size - n_top, replace=False)
            chosen = np.concatenate([top_features, random_features])

        # Plain Python ints keep the result uniform across strategies
        subset = np.sort(chosen).tolist()
        subsets.append({
            'feature_indices': subset,
            'n_features': len(subset),
            'strategy': strategy_names[strategy]
        })

    return subsets

# Generate feature subsets
n_features = X.shape[1]
feature_subsets = generate_feature_subsets(n_features, CONFIG['n_models_total'])

print(f"Generated {len(feature_subsets)} feature subsets for GPU models")

## 5. PyTorch Model Definition

In [None]:
class FlexibleMLP(nn.Module):
    """Feed-forward binary classifier with a configurable hidden stack.

    Each hidden width contributes ``Linear -> ReLU`` (followed by ``Dropout``
    when ``dropout > 0``). The head is ``Linear(…, 1) -> Sigmoid``, so the
    forward pass returns one value in ``[0, 1]`` per input row.
    """

    def __init__(self, input_size, hidden_sizes, dropout=0.0):
        super().__init__()

        # Consecutive pairs of widths give each hidden layer's (in, out) sizes
        widths = [input_size, *hidden_sizes]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out), nn.ReLU()]
            if dropout > 0:
                modules.append(nn.Dropout(dropout))

        # Single-unit output squashed to a probability
        modules += [nn.Linear(widths[-1], 1), nn.Sigmoid()]

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        return self.network(x)

print("PyTorch MLP model defined!")

## 6. Hyperparameter Space

In [None]:
# Search space for the PyTorch MLP models.
# Lists are categorical choices; tuples are (low, high) numeric ranges, with an
# optional third element 'log' marking a log-scale range.
HYPERPARAMETER_SPACE = dict(
    # Hidden-layer widths, from one to three layers
    hidden_sizes=[
        (64,), (128,), (256,),
        (64, 32), (128, 64), (256, 128),
        (128, 64, 32), (256, 128, 64),
    ],
    learning_rate=(1e-4, 1e-2, 'log'),
    batch_size=[32, 64, 128, 256],
    dropout=(0.0, 0.5),
    epochs=[50, 100],
    weight_decay=(1e-6, 1e-3, 'log'),
)

print("Hyperparameter space defined!")

## 7. Training Function

In [None]:
def train_pytorch_model(hyperparams, X_tr, y_tr, X_val, y_val, device='cuda'):
    """Fit a FlexibleMLP on (X_tr, y_tr) and score it on (X_val, y_val).

    Features are standardized with statistics from the training rows only.
    The network is trained with Adam and binary cross-entropy for
    ``hyperparams['epochs']`` passes over shuffled mini-batches, then
    evaluated once on the validation rows with a 0.5 decision threshold.

    ``hyperparams`` must provide ``hidden_sizes``, ``dropout``,
    ``batch_size``, ``learning_rate``, ``weight_decay`` and ``epochs``.

    Returns a dict with ``accuracy``, ``roc_auc``, ``f1``, ``precision`` and
    ``recall`` computed on the validation rows.
    """

    def as_float_tensor(array, as_column=False):
        # Float tensor on the target device; labels become an (n, 1) column
        tensor = torch.FloatTensor(array)
        if as_column:
            tensor = tensor.unsqueeze(1)
        return tensor.to(device)

    # Standardize, then move everything to the training device
    scaler = StandardScaler()
    train_features = as_float_tensor(scaler.fit_transform(X_tr))
    train_targets = as_float_tensor(y_tr, as_column=True)
    val_features = as_float_tensor(scaler.transform(X_val))
    val_targets = as_float_tensor(y_val, as_column=True)

    # Shuffled mini-batches over the training rows
    batches = DataLoader(
        TensorDataset(train_features, train_targets),
        batch_size=hyperparams['batch_size'],
        shuffle=True,
    )

    # Network sized to the number of input columns
    net = FlexibleMLP(
        input_size=X_tr.shape[1],
        hidden_sizes=hyperparams['hidden_sizes'],
        dropout=hyperparams['dropout'],
    ).to(device)

    # Binary cross-entropy on the sigmoid outputs, optimized with Adam
    loss_fn = nn.BCELoss()
    adam = optim.Adam(
        net.parameters(),
        lr=hyperparams['learning_rate'],
        weight_decay=hyperparams['weight_decay'],
    )

    # Optimization passes
    net.train()
    for _ in range(hyperparams['epochs']):
        for features, targets in batches:
            adam.zero_grad()
            loss_fn(net(features), targets).backward()
            adam.step()

    # Single forward pass over the validation rows, without gradients
    net.eval()
    with torch.no_grad():
        probabilities = net(val_features).cpu().numpy().flatten()
    predictions = (probabilities > 0.5).astype(int)
    truth = val_targets.cpu().numpy().flatten()

    # ROC AUC uses the raw probabilities; the rest use thresholded labels
    return {
        'accuracy': accuracy_score(truth, predictions),
        'roc_auc': roc_auc_score(truth, probabilities),
        'f1': f1_score(truth, predictions),
        'precision': precision_score(truth, predictions),
        'recall': recall_score(truth, predictions)
    }

print("Training function defined!")

## 8. Bayesian Optimization

In [None]:
def suggest_hyperparameters(trial):
    """Suggest one hyperparameter set from HYPERPARAMETER_SPACE using Optuna.

    List entries in the space are sampled as categorical choices. Tuple
    entries are sampled as float ranges ``(low, high)``; the range is
    log-scaled when the tuple carries a third element ``'log'``. The scale is
    read from the space itself so the declaration and the sampler cannot
    drift apart.

    Args:
        trial: The Optuna trial to draw suggestions from.

    Returns:
        Dict with ``hidden_sizes``, ``learning_rate``, ``batch_size``,
        ``dropout``, ``epochs`` and ``weight_decay``.
    """

    def suggest_choice(name):
        return trial.suggest_categorical(name, HYPERPARAMETER_SPACE[name])

    def suggest_range(name):
        spec = HYPERPARAMETER_SPACE[name]
        use_log = len(spec) > 2 and spec[2] == 'log'
        return trial.suggest_float(name, spec[0], spec[1], log=use_log)

    # Parameters are requested in a fixed order, which keeps seeded studies
    # reproducible.
    return {
        'hidden_sizes': suggest_choice('hidden_sizes'),
        'learning_rate': suggest_range('learning_rate'),
        'batch_size': suggest_choice('batch_size'),
        'dropout': suggest_range('dropout'),
        'epochs': suggest_choice('epochs'),
        'weight_decay': suggest_range('weight_decay'),
    }


def optimize_hyperparameters(X_tr, y_tr, X_val, y_val, n_trials=20, device='cuda'):
    """Optimize hyperparameters using Bayesian optimization (Optuna TPE).

    Each trial trains a model on (X_tr, y_tr) and is scored by its ROC AUC on
    (X_val, y_val). A trial whose training raises is reported and scored 0.5
    so one bad configuration does not abort the whole study.

    Args:
        X_tr, y_tr: Training features and labels.
        X_val, y_val: Validation features and labels used for scoring.
        n_trials: Number of trials to run.
        device: Device string forwarded to ``train_pytorch_model``.

    Returns:
        ``(best_params, best_value)`` from the finished study. If every trial
        failed, ``best_value`` is 0.5 and ``best_params`` belongs to a failed
        trial.
    """

    def objective(trial):
        hyperparams = suggest_hyperparameters(trial)

        try:
            metrics = train_pytorch_model(hyperparams, X_tr, y_tr, X_val, y_val, device)
        except Exception as exc:
            # Best effort: keep the study going, but say why this trial failed
            # instead of letting it pass as a silent 0.5.
            print(
                f"Warning: trial {trial.number} failed with "
                f"{type(exc).__name__}: {exc}; scoring it as 0.5"
            )
            return 0.5
        return metrics['roc_auc']

    study = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=CONFIG['random_state'])
    )

    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    return study.best_params, study.best_value

print("Bayesian optimization functions defined!")

## 9. Ray Remote Training Function

In [None]:
@ray.remote(num_gpus=1)
def train_model_with_optimization(
    model_id,
    feature_subset,
    X_train_full,
    y_train_full,
    X_test_full,
    y_test_full,
    n_trials=20
):
    """
    Ray remote function to tune and train one PyTorch model on GPU.

    The task reserves one GPU. It restricts the data to the columns in
    ``feature_subset['feature_indices']``, carves a stratified 20% validation
    split out of the training rows, runs the hyperparameter search on that
    split, then retrains with the best hyperparameters on all training rows
    and scores on the test rows.

    Args:
        model_id: Identifier copied into the result record.
        feature_subset: Dict with ``feature_indices`` and ``strategy``.
        X_train_full, y_train_full: Full training features and labels.
        X_test_full, y_test_full: Full test features and labels.
        n_trials: Number of optimization trials.

    Returns:
        A result dict. On success ``status`` is ``'success'`` and the dict
        carries the test metrics, best hyperparameters (JSON string),
        validation score, device and timing. On any exception ``status`` is
        ``'failed'`` and the dict carries ``error``, ``error_type`` and
        ``traceback`` instead of metrics. The exception is never re-raised.
    """
    # Local import: only the failure path needs it
    import traceback

    start_time = time.time()

    try:
        # Determine which device to use
        if torch.cuda.is_available():
            device = 'cuda'
            # Ray will handle GPU assignment
        else:
            device = 'cpu'
            print(f"Warning: CUDA not available for model {model_id}, using CPU")

        # Extract feature subset
        feature_indices = feature_subset['feature_indices']
        X_train = X_train_full[:, feature_indices]
        X_test = X_test_full[:, feature_indices]

        # Split for validation
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train_full,
            test_size=0.2,
            random_state=CONFIG['random_state'],
            stratify=y_train_full
        )

        # Optimize hyperparameters
        best_hyperparams, best_val_score = optimize_hyperparameters(
            X_tr, y_tr, X_val, y_val, n_trials, device
        )

        # Train final model on full training set and evaluate on the test set
        test_metrics = train_pytorch_model(
            best_hyperparams, X_train, y_train_full, X_test, y_test_full, device
        )

        training_time = time.time() - start_time

        result = {
            'model_id': model_id,
            'model_type': 'pytorch_mlp',
            'cluster_type': 'gpu',
            'n_features_used': len(feature_indices),
            'feature_strategy': feature_subset['strategy'],
            # Values are stringified so tuples such as hidden_sizes serialize
            'best_hyperparams': json.dumps({k: str(v) for k, v in best_hyperparams.items()}),
            'best_val_score': best_val_score,
            'accuracy': test_metrics['accuracy'],
            'roc_auc': test_metrics['roc_auc'],
            'f1': test_metrics['f1'],
            'precision': test_metrics['precision'],
            'recall': test_metrics['recall'],
            'training_time': training_time,
            'device': device,
            'status': 'success'
        }

        return result

    except Exception as e:
        # Report the failure as data so one bad model does not fail the batch.
        # str(e) alone can be empty, so the exception type and full traceback
        # are recorded as well.
        return {
            'model_id': model_id,
            'model_type': 'pytorch_mlp',
            'cluster_type': 'gpu',
            'status': 'failed',
            'error': str(e),
            'training_time': time.time() - start_time,
            'error_type': type(e).__name__,
            'traceback': traceback.format_exc()
        }

print("Ray remote training function defined!")

## 10. Initialize Ray

In [None]:
# Initialize Ray cluster using Databricks utilities.
# First try to start a Ray-on-Spark cluster and connect to it; if that raises,
# fall back to a plain ray.init() in this process.
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

# Shutdown any existing Ray instance so the setup below starts from a clean state
if ray.is_initialized():
    ray.shutdown()

try:
    # For GPU cluster, setup Ray to use single-node with multiple GPUs
    print("Setting up Ray cluster on GPU node...")
    setup_ray_cluster(
        num_worker_nodes=1,  # Single-node GPU cluster
        num_cpus_worker_node=16,  # Adjust based on your GPU cluster's CPU count
        num_gpus_worker_node=CONFIG['n_gpus'],  # GPU count taken from CONFIG (4 as configured)
        # Per-user workspace path where Ray logs are collected
        collect_log_to_path="/Workspace/Users/ryuta.yoshimatsu@databricks.com/ray_logs"
    )
    
    # Explicitly initialize Ray after cluster setup
    ray.init(address='auto', ignore_reinit_error=True, logging_level='ERROR')
    
except Exception as e:
    # Any failure above (cluster setup or connecting to it) lands here
    print(f"Warning: setup_ray_cluster failed with: {e}")
    print("Falling back to standard Ray initialization...")
    # Fallback to standard initialization
    # NOTE(review): on this path no Ray-on-Spark cluster was started, so the
    # resources reported below are whatever ray.init() detects - confirm they
    # match the expected GPU count before launching training.
    ray.init(ignore_reinit_error=True, logging_level='ERROR')

# Printed on both the primary and the fallback path
print("\nRay initialized successfully on GPU cluster!")
print(f"Available CPUs: {ray.cluster_resources().get('CPU', 0)}")
print(f"Available GPUs: {ray.cluster_resources().get('GPU', 0)}")
print(f"Expected GPUs: {CONFIG['n_gpus']}")

# Verify cluster setup
print(f"\nCluster nodes connected: {len(ray.nodes())}")
print(f"Total cluster resources: {ray.cluster_resources()}")

## 11. Generate Model Configurations

In [None]:
# One configuration per model: consecutive IDs starting at the configured
# offset, each paired with the feature subset at the same position.
model_configs = [
    {
        'model_id': CONFIG['model_id_start'] + i,
        'feature_subset': feature_subsets[i]
    }
    for i in range(CONFIG['n_models_total'])
]

# Next unused ID, i.e. one past the last ID assigned above
model_id = CONFIG['model_id_start'] + CONFIG['n_models_total']

print(f"Generated {len(model_configs)} GPU model configurations")
print(f"Model IDs: {CONFIG['model_id_start']} to {model_id - 1}")
print("Each model will use 1 GPU")

## 12. Launch Parallel Training

In [None]:
# Store each array once in the Ray object store; tasks receive the references
# instead of a fresh copy of the data per submission.
X_train_ref, y_train_ref, X_test_ref, y_test_ref = (
    ray.put(array) for array in (X_train, y_train, X_test, y_test)
)

print("Data stored in Ray object store")

In [None]:
# Submit one remote training task per model configuration
print(f"\nLaunching {len(model_configs)} PyTorch training jobs on GPU cluster...\n")
print("=" * 80)
start_time = time.time()  # wall-clock start for the whole batch

# .remote() returns immediately with a future; results are collected later
futures = [
    train_model_with_optimization.remote(
        model_id=config['model_id'],
        feature_subset=config['feature_subset'],
        X_train_full=X_train_ref,
        y_train_full=y_train_ref,
        X_test_full=X_test_ref,
        y_test_full=y_test_ref,
        n_trials=CONFIG['n_trials_per_model']
    )
    for config in model_configs
]

print(f"All {len(futures)} jobs submitted. Waiting for results...\n")

In [None]:
# Collect results as they finish, printing one progress line per model
results = []
completed = 0
remaining_futures = futures.copy()

while remaining_futures:
    # Block until at least one more task is done
    ready_futures, remaining_futures = ray.wait(remaining_futures, num_returns=1)

    for future in ready_futures:
        result = ray.get(future)
        results.append(result)
        completed += 1

        if result['status'] == 'success':
            print(f"[{completed}/{len(futures)}] Model {result['model_id']} (PyTorch MLP) - "
                  f"ROC AUC: {result['roc_auc']:.4f} ({result['training_time']:.1f}s) [{result['device'].upper()}]")
        else:
            # Show why the model failed instead of dropping the reported error
            print(f"[{completed}/{len(futures)}] Model {result['model_id']} FAILED: "
                  f"{result.get('error', 'no error message reported')}")

total_time = time.time() - start_time
print(f"\n{'='*80}")
print("GPU CLUSTER TRAINING COMPLETE")
print(f"{'='*80}")
print(f"Total time: {total_time:.2f}s ({total_time/60:.2f} minutes)")
print(f"Successful: {sum(1 for r in results if r['status'] == 'success')}")
print(f"Failed: {sum(1 for r in results if r['status'] == 'failed')}")

## 13. Cleanup

In [None]:
# Shutdown Ray cluster
try:
    shutdown_ray_cluster()
except RuntimeError as e:
    # No Ray-on-Spark cluster is active, e.g. Ray was started with a plain
    # ray.init() rather than setup_ray_cluster(); nothing to tear down here.
    print(f"Note: no Ray-on-Spark cluster to shut down ({e})")
finally:
    # Disconnect this notebook's driver in every case so a plain ray.init()
    # session is not left running.
    if ray.is_initialized():
        ray.shutdown()
print("Ray cluster shut down successfully!")

## 14. Save Results to Delta Table

In [None]:
# Only successful runs are persisted; failed runs carry no metrics
successful_results = [r for r in results if r['status'] == 'success']

if not successful_results:
    print("No successful results to save.")
else:
    # One row per model, all stamped with the same save-time timestamp
    results_df = pd.DataFrame(successful_results)
    results_df['training_timestamp'] = datetime.now().isoformat()

    # Hand the frame to Spark and append it to the results Delta table
    results_spark_df = spark.createDataFrame(results_df)

    print(f"\nSaving {len(results_df)} results to {CONFIG['results_table']}...")
    (
        results_spark_df.write
        .format("delta")
        .mode("append")
        .saveAsTable(CONFIG['results_table'])
    )
    print("Results saved successfully!")

    # Headline numbers across the models saved above
    roc_auc_col = results_df['roc_auc']
    accuracy_col = results_df['accuracy']
    time_col = results_df['training_time']

    print("\nSummary Statistics:")
    print(f"  Mean ROC AUC: {roc_auc_col.mean():.4f}")
    print(f"  Best ROC AUC: {roc_auc_col.max():.4f}")
    print(f"  Mean Accuracy: {accuracy_col.mean():.4f}")
    print(f"  Best Accuracy: {accuracy_col.max():.4f}")
    print(f"  Total training time: {time_col.sum():.2f}s")
    print(f"  Average training time per model: {time_col.mean():.2f}s")

## Summary

This notebook successfully trained **10 PyTorch MLP models** on the GPU cluster:
- Efficient parallel execution using Ray with GPU allocation
- Bayesian hyperparameter optimization for each model
- Various neural network architectures
- Different feature selection strategies
- Results saved to shared Delta table: `ryuta.ray.model_training_results`

**Next Steps**:
1. Use the analysis notebook to compare CPU and GPU results
2. Identify the best performing models overall