# ASL Recognition with TGCN - Complete Pipeline

## 🚀 Improved TGCN with Face Landmarks (553 Nodes)

This notebook implements a state-of-the-art ASL recognition system using:

- **553 keypoints**: 33 pose + 42 hands + 478 face landmarks
- **Advanced preprocessing**: Spatial anchoring, temporal smoothing, interpolation
- **Improved graph connectivity**: Anatomical + functional relationships
- **Data augmentation**: Spatial and temporal transformations
- **WLASL-100 subset**: Focus on quality over quantity

### Architecture Overview

- **Input**: MediaPipe keypoint sequences (seq_len, 553, 3)
- **Graph**: Enhanced connectivity with face-hand relationships
- **Model**: ST-GCN with temporal convolutions
- **Target**: 87.60% accuracy on WLASL-100 (literature benchmark)


In [43]:
# Core libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
from torch.cuda.amp import GradScaler, autocast

# Progress bars and timing
from tqdm.notebook import tqdm, trange
import time

# PyTorch Geometric for GCN
try:
    import torch_geometric
    from torch_geometric.nn import GCNConv, global_mean_pool
    from torch_geometric.data import Data, Batch
    print(f"✅ PyTorch Geometric {torch_geometric.__version__} loaded")
except ImportError:
    print("❌ PyTorch Geometric not found. Install with: pip install torch-geometric")
    raise

# Data handling and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import defaultdict
import glob
import os
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Import our improved normalization module
from normalization import (
    ImprovedPoseNormalizer,
    create_improved_pose_dataset_class,
    create_improved_graph_connectivity,
    apply_spatial_augmentation,
    apply_temporal_augmentation
)

print("🎯 All libraries loaded successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

✅ PyTorch Geometric 2.6.1 loaded
🎯 All libraries loaded successfully!
PyTorch version: 2.7.0+cu118
CUDA available: True
Using device: cuda


## 📊 Configuration and Data Paths

Set up all paths and hyperparameters for the training pipeline.


In [54]:
# Data paths
DATA_DIR = r'f:\Uni_Stuff\6th_Sem\DL\Proj\video-asl-recognition\pose_estimation\data\keypoints'
CHECKPOINT_DIR = r'f:\Uni_Stuff\6th_Sem\DL\Proj\video-asl-recognition\pose_estimation\src\checkpoints'
MODEL_SAVE_PATH = os.path.join(CHECKPOINT_DIR, 'best_tgcn_face_model.pth')

# Create directories
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# 🚀 WINDOWS-COMPATIBLE GPU CONFIGURATION FOR SPEED 🚀
CONFIG = {
    # Data parameters - OPTIMIZED FOR SPEED
    'max_seq_len': 40,          # 🚀 Reduced from 50 to save memory/speed
    'num_nodes': 553,           # 33 pose + 42 hands + 478 face
    'num_features': 3,          # x, y, z coordinates
    'max_classes': 10,         # 🚀 Reduced from 300 for faster training
    'test_size': 0.2,           # Train/test split ratio
    'batch_size': 32,           # 🚀 DOUBLED from 16 for GPU efficiency
    
    # Model architecture - OPTIMIZED FOR SPEED
    'gcn_hidden': 128,          # 🚀 Reduced from 256 to save memory/speed
    'temporal_kernel': 7,       # 🚀 Reduced from 9 for speed
    'dropout': 0.3,             # Dropout rate
    'num_gcn_layers': 2,        # 🚀 Reduced from 3 for speed
    
    # Training parameters - OPTIMIZED FOR SPEED
    'num_epochs': 50,           # 🚀 Reduced for faster initial results
    'learning_rate': 0.002,     # 🚀 Increased for faster convergence
    'weight_decay': 1e-4,       # L2 regularization
    'patience': 10,             # 🚀 Reduced patience for faster training
    'min_lr': 1e-6,             # Minimum learning rate
    
    # 🚀 WINDOWS-COMPATIBLE GPU OPTIMIZATION SETTINGS 🚀
    'use_mixed_precision': True,  # 🚀 Enable AMP for 2x speed
    'pin_memory': True,           # 🚀 Faster CPU->GPU transfer
    'non_blocking': True,         # 🚀 Async GPU transfers
    'compile_model': False,       # 🚀 DISABLED - Triton not available on Windows
    'use_channels_last': False,   # 🚀 DISABLED - Can cause issues on some Windows setups
    'gradient_accumulation': 1,   # No gradient accumulation for speed
    
    # Data augmentation - SIMPLIFIED FOR SPEED
    'use_augmentation': True,     # Enable data augmentation
    'aug_probability': 0.2,       # 🚀 Reduced from 0.3 for speed
    'spatial_aug_strength': 0.05, # 🚀 Reduced for speed
    'temporal_aug_strength': 0.1, # 🚀 Reduced for speed
    'aug_on_gpu': True,           # 🚀 Move augmentation to GPU
    
    # DataLoader optimization 🚀
    'num_workers': 8,             # 🚀 Parallel data loading
    'prefetch_factor': 2,         # 🚀 Prefetch batches
    'persistent_workers': True,   # 🚀 Keep workers alive
}

print("📋 Windows-Compatible Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

print(f"\n📁 Data directory: {DATA_DIR}")
print(f"💾 Checkpoint directory: {CHECKPOINT_DIR}")

# 🚀 Enable Windows-compatible GPU optimizations
if torch.cuda.is_available():
    print("\n🚀 Enabling Windows-Compatible GPU Optimizations:")
    torch.backends.cudnn.benchmark = True  # Optimize cuDNN
    torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32
    torch.backends.cudnn.allow_tf32 = True
    print("  ✅ cuDNN benchmark enabled")
    print("  ✅ TF32 enabled for faster matmul")
    
    # Check GPU memory
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"  📊 GPU Memory: {gpu_memory:.1f} GB")
    
    if gpu_memory < 6:
        print("  ⚠️  Limited GPU memory detected - using conservative settings")
        CONFIG['batch_size'] = min(CONFIG['batch_size'], 24)
        CONFIG['gcn_hidden'] = min(CONFIG['gcn_hidden'], 96)
else:
    print("❌ No GPU detected - training will be slow on CPU")
    CONFIG['batch_size'] = 8
    CONFIG['use_mixed_precision'] = False

📋 Windows-Compatible Configuration:
  max_seq_len: 40
  num_nodes: 553
  num_features: 3
  max_classes: 10
  test_size: 0.2
  batch_size: 32
  gcn_hidden: 128
  temporal_kernel: 7
  dropout: 0.3
  num_gcn_layers: 2
  num_epochs: 50
  learning_rate: 0.002
  weight_decay: 0.0001
  patience: 10
  min_lr: 1e-06
  use_mixed_precision: True
  pin_memory: True
  non_blocking: True
  compile_model: False
  use_channels_last: False
  gradient_accumulation: 1
  use_augmentation: True
  aug_probability: 0.2
  spatial_aug_strength: 0.05
  temporal_aug_strength: 0.1
  aug_on_gpu: True
  num_workers: 8
  prefetch_factor: 2
  persistent_workers: True

📁 Data directory: f:\Uni_Stuff\6th_Sem\DL\Proj\video-asl-recognition\pose_estimation\data\keypoints
💾 Checkpoint directory: f:\Uni_Stuff\6th_Sem\DL\Proj\video-asl-recognition\pose_estimation\src\checkpoints

🚀 Enabling Windows-Compatible GPU Optimizations:
  ✅ cuDNN benchmark enabled
  ✅ TF32 enabled for faster matmul
  📊 GPU Memory: 4.0 GB
  ⚠️  Limite

## 🔍 Data Exploration and Validation

Explore the keypoint data to understand the dataset structure and validate the 553-node architecture.


In [55]:
def explore_dataset(data_dir):
    """Explore the keypoint dataset structure and statistics"""
    
    if not os.path.exists(data_dir):
        print(f"❌ Data directory not found: {data_dir}")
        print("Please run the keypoint extraction first with pose_estimation_mediapipe.py")
        return None
    
    # Find all word directories
    word_dirs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    word_dirs = sorted(word_dirs)
    
    print(f"📊 Dataset Statistics:")
    print(f"  Total classes found: {len(word_dirs)}")
    
    # Analyze sample distribution
    class_stats = []
    total_files = 0
    sample_shapes = []
    
    for word in word_dirs[:20]:  # Check first 20 classes
        word_dir = os.path.join(data_dir, word)
        npz_files = glob.glob(os.path.join(word_dir, "*.npz"))
        total_files += len(npz_files)
        
        # Check sample file shape
        if npz_files:
            try:
                sample_data = np.load(npz_files[0])
                if 'nodes' in sample_data:
                    shape = sample_data['nodes'].shape
                    sample_shapes.append(shape)
                    print(f"  {word}: {len(npz_files)} files, shape: {shape}")
            except Exception as e:
                print(f"  {word}: {len(npz_files)} files, error reading: {e}")
        
        class_stats.append((word, len(npz_files)))
    
    print(f"\n📈 Sample distribution (top 20):")
    class_stats.sort(key=lambda x: x[1], reverse=True)
    for word, count in class_stats[:10]:
        print(f"  {word}: {count} samples")
    
    # Validate node architecture
    if sample_shapes:
        most_common_shape = max(set(sample_shapes), key=sample_shapes.count)
        print(f"\n🏗️ Architecture validation:")
        print(f"  Most common shape: {most_common_shape}")
        print(f"  Expected nodes: {CONFIG['num_nodes']} (33 pose + 42 hands + 478 face)")
        
        if most_common_shape[1] == CONFIG['num_nodes']:
            print(f"  ✅ Architecture matches! Found {most_common_shape[1]} nodes")
        else:
            print(f"  ⚠️ Architecture mismatch! Found {most_common_shape[1]}, expected {CONFIG['num_nodes']}")
            if most_common_shape[1] == 75:
                print(f"  📝 Data contains only pose+hands (75 nodes). Need to re-run extraction with face landmarks.")
                return False
    
    return True

# Explore the dataset
data_ready = explore_dataset(DATA_DIR)

📊 Dataset Statistics:
  Total classes found: 300
  about: 8 files, shape: (104, 553, 3)
  accident: 13 files, shape: (103, 553, 3)
  africa: 13 files, shape: (145, 553, 3)
  again: 10 files, shape: (70, 553, 3)
  all: 13 files, shape: (82, 553, 3)
  always: 9 files, shape: (70, 553, 3)
  animal: 10 files, shape: (126, 553, 3)
  apple: 13 files, shape: (71, 553, 3)
  approve: 11 files, shape: (134, 553, 3)
  argue: 10 files, shape: (86, 553, 3)
  arrive: 10 files, shape: (107, 553, 3)
  baby: 10 files, shape: (72, 553, 3)
  back: 7 files, shape: (39, 553, 3)
  backpack: 11 files, shape: (36, 553, 3)
  bad: 11 files, shape: (70, 553, 3)
  bake: 8 files, shape: (98, 553, 3)
  balance: 11 files, shape: (102, 553, 3)
  ball: 11 files, shape: (55, 553, 3)
  banana: 10 files, shape: (79, 553, 3)
  bar: 10 files, shape: (62, 553, 3)

📈 Sample distribution (top 20):
  accident: 13 samples
  africa: 13 samples
  all: 13 samples
  apple: 13 samples
  approve: 11 samples
  backpack: 11 samples
  b

## 🗃️ Dataset Class and Data Loading

Create the dataset class with improved normalization and load the data for training.


In [56]:
# Create the improved dataset class
ImprovedPoseSequenceDataset = create_improved_pose_dataset_class()

# Initialize datasets
if data_ready:
    print("🔄 Creating datasets...")
    
    # Training dataset
    train_dataset = ImprovedPoseSequenceDataset(
        data_dir=DATA_DIR,
        max_seq_len=CONFIG['max_seq_len'],
        split='train',
        test_size=CONFIG['test_size'],
        random_state=42,
        use_subset=True,
        max_classes=CONFIG['max_classes']
    )
    
    # Test dataset
    test_dataset = ImprovedPoseSequenceDataset(
        data_dir=DATA_DIR,
        max_seq_len=CONFIG['max_seq_len'], 
        split='test',
        test_size=CONFIG['test_size'],
        random_state=42,
        use_subset=True,
        max_classes=CONFIG['max_classes']
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        num_workers=0,  # Set to 0 for Windows/Jupyter compatibility
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=0,  # Set to 0 for Windows/Jupyter compatibility
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    print(f"✅ Datasets created successfully!")
    print(f"📊 Training samples: {len(train_dataset)}")
    print(f"📊 Test samples: {len(test_dataset)}")
    print(f"📊 Number of classes: {train_dataset.num_classes}")
    print(f"📊 Batch size: {CONFIG['batch_size']}")
    
    # Save class mapping
    class_mapping = {
        'word_to_idx': train_dataset.word_to_idx,
        'idx_to_word': train_dataset.idx_to_word
    }
    
    with open(os.path.join(CHECKPOINT_DIR, 'class_mapping.json'), 'w') as f:
        json.dump(class_mapping, f, indent=2)
    
    print(f"💾 Class mapping saved to {CHECKPOINT_DIR}/class_mapping.json")
    
else:
    print("❌ Cannot create datasets. Please fix data issues first.")

🔄 Creating datasets...
Loading IMPROVED dataset from: f:\Uni_Stuff\6th_Sem\DL\Proj\video-asl-recognition\pose_estimation\data\keypoints
🎯 Using subset of 10 classes for better training
Selected top 10 classes with most samples
Found 10 word categories
Number of classes: 10
Total valid files found: 169
TRAIN split: 135 files
Class distribution in train (showing top 10):
  drink: 17 samples
  computer: 16 samples
  go: 14 samples
  before: 13 samples
  thin: 13 samples
  like: 13 samples
  mother: 13 samples
  cousin: 12 samples
  hot: 12 samples
  no: 12 samples
Loading IMPROVED dataset from: f:\Uni_Stuff\6th_Sem\DL\Proj\video-asl-recognition\pose_estimation\data\keypoints
🎯 Using subset of 10 classes for better training
Selected top 10 classes with most samples
Found 10 word categories
Number of classes: 10
Total valid files found: 169
TRAIN split: 135 files
Class distribution in train (showing top 10):
  drink: 17 samples
  computer: 16 samples
  go: 14 samples
  before: 13 samples
  

## 🏗️ TGCN Model Architecture

Implement the Temporal Graph Convolutional Network with improved connectivity for 553 nodes.


In [57]:
class TemporalGCN(nn.Module):
    """Temporal Graph Convolutional Network for ASL Recognition"""
    
    def __init__(self, num_nodes, num_features, num_classes, 
                 gcn_hidden=256, temporal_kernel=9, dropout=0.3, num_gcn_layers=3):
        super(TemporalGCN, self).__init__()
        
        self.num_nodes = num_nodes
        self.num_features = num_features
        self.num_classes = num_classes
        self.gcn_hidden = gcn_hidden
        self.temporal_kernel = temporal_kernel
        
        # Create improved graph connectivity
        self.edge_index = create_improved_graph_connectivity()
        
        # Input projection
        self.input_projection = nn.Linear(num_features, gcn_hidden)
        
        # GCN layers with residual connections
        self.gcn_layers = nn.ModuleList()
        for i in range(num_gcn_layers):
            self.gcn_layers.append(GCNConv(gcn_hidden, gcn_hidden))
        
        # Batch normalization for each GCN layer
        self.batch_norms = nn.ModuleList([
            nn.BatchNorm1d(gcn_hidden) for _ in range(num_gcn_layers)
        ])
        
        # Temporal convolution layers
        self.temporal_conv1 = nn.Conv1d(
            gcn_hidden, gcn_hidden, 
            kernel_size=temporal_kernel,
            padding=temporal_kernel//2
        )
        self.temporal_conv2 = nn.Conv1d(
            gcn_hidden, gcn_hidden//2,
            kernel_size=temporal_kernel,
            padding=temporal_kernel//2
        )
        
        # Dropout layers
        self.dropout = nn.Dropout(dropout)
        self.spatial_dropout = nn.Dropout2d(dropout * 0.5)
        
        # Global pooling and classification
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(gcn_hidden//2, gcn_hidden//4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(gcn_hidden//4, num_classes)
        )
        
        # Initialize weights
        self._initialize_weights()
    
    def _initialize_weights(self):
        """Initialize model weights"""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
    
    def forward(self, x):
        """
        Forward pass
        
        Args:
            x: Input tensor [batch_size, seq_len, num_nodes, num_features]
            
        Returns:
            logits: Class predictions [batch_size, num_classes]
        """
        batch_size, seq_len, num_nodes, num_features = x.shape
        
        # Move edge index to same device as input
        edge_index = self.edge_index.to(x.device)
        
        # Process each frame separately
        frame_outputs = []
        
        for t in range(seq_len):
            # Current frame: [batch_size, num_nodes, num_features]
            frame = x[:, t, :, :]
            
            # Project input features
            h = self.input_projection(frame)  # [batch_size, num_nodes, gcn_hidden]
            
            # Apply GCN layers with residual connections
            for i, (gcn, bn) in enumerate(zip(self.gcn_layers, self.batch_norms)):
                residual = h if i > 0 else None
                
                # Reshape for GCN: [batch_size * num_nodes, gcn_hidden]
                # Use contiguous() to ensure tensor is contiguous before view
                h_flat = h.contiguous().view(-1, h.size(-1))
                
                # Expand edge index for batch
                batch_edge_index = edge_index.unsqueeze(0).repeat(batch_size, 1, 1)
                batch_edge_index = batch_edge_index + torch.arange(batch_size, device=x.device).view(-1, 1, 1) * num_nodes
                batch_edge_index = batch_edge_index.view(2, -1)
                
                # Apply GCN
                h_flat = gcn(h_flat, batch_edge_index)
                h = h_flat.view(batch_size, num_nodes, -1)
                
                # Batch normalization
                h = h.permute(0, 2, 1)  # [batch_size, gcn_hidden, num_nodes]
                h = bn(h)
                h = h.permute(0, 2, 1)  # [batch_size, num_nodes, gcn_hidden]
                
                # Activation and residual connection
                h = F.relu(h)
                if residual is not None:
                    h = h + residual
                
                h = self.dropout(h)
            
            # Spatial dropout for regularization
            h = h.unsqueeze(-1)  # [batch_size, num_nodes, gcn_hidden, 1]
            h = self.spatial_dropout(h)
            h = h.squeeze(-1)   # [batch_size, num_nodes, gcn_hidden]
            
            # Global spatial pooling for this frame
            frame_features = torch.mean(h, dim=1)  # [batch_size, gcn_hidden]
            frame_outputs.append(frame_features)
        
        # Stack frame features: [batch_size, seq_len, gcn_hidden]
        temporal_features = torch.stack(frame_outputs, dim=1)
        
        # Temporal convolution: [batch_size, gcn_hidden, seq_len]
        temporal_features = temporal_features.permute(0, 2, 1)
        
        # Apply temporal convolutions
        temporal_features = F.relu(self.temporal_conv1(temporal_features))
        temporal_features = self.dropout(temporal_features)
        temporal_features = F.relu(self.temporal_conv2(temporal_features))
        
        # Global temporal pooling
        sequence_features = self.global_pool(temporal_features).squeeze(-1)  # [batch_size, gcn_hidden//2]
        
        # Classification
        logits = self.classifier(sequence_features)
        
        return logits

print("✅ TGCN model defined successfully!")

✅ TGCN model defined successfully!


## 🛠️ Training Utilities and Metrics

Define training utilities, metrics calculation, and progress tracking functions.


In [58]:
class MetricsTracker:
    """Track training metrics and progress"""
    
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.train_losses = []
        self.train_accuracies = []
        self.val_losses = []
        self.val_accuracies = []
        self.learning_rates = []
        self.best_val_acc = 0.0
        self.best_epoch = 0
    
    def update(self, train_loss, train_acc, val_loss, val_acc, lr):
        self.train_losses.append(train_loss)
        self.train_accuracies.append(train_acc)
        self.val_losses.append(val_loss)
        self.val_accuracies.append(val_acc)
        self.learning_rates.append(lr)
        
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.best_epoch = len(self.val_accuracies) - 1
    
    def plot_metrics(self):
        """Plot training metrics"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # Loss plot
        axes[0, 0].plot(self.train_losses, label='Train Loss', color='blue')
        axes[0, 0].plot(self.val_losses, label='Val Loss', color='red')
        axes[0, 0].set_title('Training and Validation Loss')
        axes[0, 0].set_xlabel('Epoch')
        axes[0, 0].set_ylabel('Loss')
        axes[0, 0].legend()
        axes[0, 0].grid(True)
        
        # Accuracy plot
        axes[0, 1].plot(self.train_accuracies, label='Train Acc', color='blue')
        axes[0, 1].plot(self.val_accuracies, label='Val Acc', color='red')
        axes[0, 1].axhline(y=self.best_val_acc, color='green', linestyle='--', 
                          label=f'Best Val Acc: {self.best_val_acc:.3f}')
        axes[0, 1].set_title('Training and Validation Accuracy')
        axes[0, 1].set_xlabel('Epoch')
        axes[0, 1].set_ylabel('Accuracy')
        axes[0, 1].legend()
        axes[0, 1].grid(True)
        
        # Learning rate plot
        axes[1, 0].plot(self.learning_rates, color='orange')
        axes[1, 0].set_title('Learning Rate Schedule')
        axes[1, 0].set_xlabel('Epoch')
        axes[1, 0].set_ylabel('Learning Rate')
        axes[1, 0].set_yscale('log')
        axes[1, 0].grid(True)
        
        # Validation accuracy zoomed
        axes[1, 1].plot(self.val_accuracies, color='red', linewidth=2)
        axes[1, 1].axhline(y=self.best_val_acc, color='green', linestyle='--')
        axes[1, 1].set_title(f'Validation Accuracy (Best: {self.best_val_acc:.3f}% at epoch {self.best_epoch})')
        axes[1, 1].set_xlabel('Epoch')
        axes[1, 1].set_ylabel('Accuracy (%)')
        axes[1, 1].grid(True)
        
        plt.tight_layout()
        plt.show()

def calculate_accuracy(outputs, targets):
    """Calculate accuracy from model outputs and targets"""
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == targets).sum().item()
    total = targets.size(0)
    return 100.0 * correct / total

def train_epoch(model, train_loader, criterion, optimizer, device, use_augmentation=False):
    """Train for one epoch with tqdm progress bar"""
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = 0
    
    # Create progress bar for batches
    train_pbar = tqdm(train_loader, desc='Training', leave=False)
    
    for batch_idx, (data, targets) in enumerate(train_pbar):
        data, targets = data.to(device), targets.to(device)
        
        # Apply data augmentation if enabled
        if use_augmentation and np.random.random() < CONFIG['aug_probability']:
            # Convert to numpy for augmentation
            data_np = data.cpu().numpy()
            
            # Apply spatial augmentation batch-wise
            if np.random.random() < 0.5:
                augmented_batch = []
                for i in range(data_np.shape[0]):  # Iterate over batch
                    # Apply spatial augmentation to each sequence in the batch
                    aug_seq = apply_spatial_augmentation(
                        data_np[i],  # Shape: [seq_len, num_nodes, num_features]
                        scale_range=CONFIG['spatial_aug_strength'],
                        translation_range=CONFIG['spatial_aug_strength']
                    )
                    augmented_batch.append(aug_seq)
                data_np = np.array(augmented_batch)
            
            # Apply temporal augmentation batch-wise  
            if np.random.random() < 0.5:
                augmented_batch = []
                for i in range(data_np.shape[0]):  # Iterate over batch
                    aug_seq = apply_temporal_augmentation(
                        data_np[i],  # Shape: [seq_len, num_nodes, num_features]
                        speed_range=CONFIG['temporal_aug_strength']
                    )
                    # Ensure consistent length
                    if aug_seq.shape[0] != CONFIG['max_seq_len']:
                        if aug_seq.shape[0] > CONFIG['max_seq_len']:
                            indices = np.linspace(0, aug_seq.shape[0]-1, CONFIG['max_seq_len'], dtype=int)
                            aug_seq = aug_seq[indices]
                        else:
                            padding = np.zeros((CONFIG['max_seq_len'] - aug_seq.shape[0], 
                                              aug_seq.shape[1], aug_seq.shape[2]))
                            aug_seq = np.concatenate([aug_seq, padding], axis=0)
                    augmented_batch.append(aug_seq)
                data_np = np.array(augmented_batch)
            
            # Convert back to tensor
            data = torch.tensor(data_np, dtype=torch.float32).to(device)
        
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_loss += loss.item()
        accuracy = calculate_accuracy(outputs, targets)
        total_accuracy += accuracy
        num_batches += 1
        
        # Update progress bar
        train_pbar.set_postfix({
            'Loss': f'{loss.item():.4f}',
            'Acc': f'{accuracy:.2f}%'
        })
    
    return total_loss / num_batches, total_accuracy / num_batches

def validate_epoch(model, val_loader, criterion, device):
    """Validate for one epoch with tqdm progress bar"""
    model.eval()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = 0
    
    # Create progress bar for validation
    val_pbar = tqdm(val_loader, desc='Validation', leave=False)
    
    with torch.no_grad():
        for data, targets in val_pbar:
            data, targets = data.to(device), targets.to(device)
            outputs = model(data)
            loss = criterion(outputs, targets)
            
            total_loss += loss.item()
            accuracy = calculate_accuracy(outputs, targets)
            total_accuracy += accuracy
            num_batches += 1
            
            # Update progress bar
            val_pbar.set_postfix({
                'Loss': f'{loss.item():.4f}',
                'Acc': f'{accuracy:.2f}%'
            })
    
    return total_loss / num_batches, total_accuracy / num_batches

print("✅ Training utilities with tqdm progress bars defined successfully!")

✅ Training utilities with tqdm progress bars defined successfully!


## 🚀 Model Initialization and Training Setup

Initialize the TGCN model and set up the training components.


In [59]:
def train_model_with_progress():
    """Complete training pipeline with progress bars and monitoring"""
    
    if not data_ready:
        print("❌ Data not ready. Please run data loading first.")
        return None
        
    print("🚀 Starting TGCN Training Pipeline")
    print("=" * 60)
    print(f"📊 Dataset: {len(train_dataset)} train, {len(test_dataset)} test")
    print(f"🎯 Classes: {num_classes}")
    print(f"🏗️  Model: {sum(p.numel() for p in model.parameters())} parameters")
    print(f"⚙️  Device: {device}")
    print("=" * 60)
    
    # Initialize tracking
    metrics = MetricsTracker()
    best_model_state = None
    patience_counter = 0
    training_start_time = time.time()
    
    # Create epoch progress bar
    epoch_pbar = trange(CONFIG['num_epochs'], desc='🏋️ Training Epochs', 
                       position=0, leave=True)
    
    for epoch in epoch_pbar:
        epoch_start_time = time.time()
        
        # Training phase with progress bar
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        # Create batch progress bar
        train_pbar = tqdm(train_loader, desc=f'📈 Epoch {epoch+1}/{CONFIG["num_epochs"]}', 
                         position=1, leave=False)
        
        for batch_idx, (data, targets) in enumerate(train_pbar):
            data, targets = data.to(device), targets.to(device)
            batch_size = data.size(0)
            
            # Data augmentation
            if CONFIG['use_augmentation'] and np.random.random() < CONFIG['aug_probability']:
                augmented_batch = []
                for i in range(batch_size):
                    seq = data[i].cpu().numpy()
                    
                    # Apply spatial augmentation
                    if np.random.random() < 0.5:
                        seq = apply_spatial_augmentation(
                            seq, 
                            scale_range=CONFIG['spatial_aug_strength'],
                            translation_range=CONFIG['spatial_aug_strength']
                        )
                    
                    # Apply temporal augmentation
                    if np.random.random() < 0.3:
                        aug_seq = apply_temporal_augmentation(
                            seq, speed_range=CONFIG['temporal_aug_strength']
                        )
                        
                        # Handle sequence length
                        if aug_seq.shape[0] > CONFIG['max_seq_len']:
                            indices = np.linspace(0, aug_seq.shape[0] - 1, 
                                                CONFIG['max_seq_len'], dtype=int)
                            aug_seq = aug_seq[indices]
                        else:
                            padding = np.zeros((CONFIG['max_seq_len'] - aug_seq.shape[0], 
                                              aug_seq.shape[1], aug_seq.shape[2]))
                            aug_seq = np.concatenate([aug_seq, padding], axis=0)
                        seq = aug_seq
                    
                    augmented_batch.append(seq)
                
                data = torch.tensor(np.array(augmented_batch), dtype=torch.float32).to(device)
            
            # Forward pass
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            # Track metrics
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += targets.size(0)
            train_correct += (predicted == targets).sum().item()
            
            # Update progress bar
            current_acc = 100. * train_correct / train_total
            train_pbar.set_postfix({
                'Loss': f'{loss.item():.4f}',
                'Acc': f'{current_acc:.2f}%'
            })
        
        # Calculate epoch metrics
        epoch_train_loss = train_loss / len(train_loader)
        epoch_train_acc = 100. * train_correct / train_total
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            val_pbar = tqdm(test_loader, desc='🔍 Validation', position=1, leave=False)
            for data, targets in val_pbar:
                data, targets = data.to(device), targets.to(device)
                outputs = model(data)
                loss = criterion(outputs, targets)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += targets.size(0)
                val_correct += (predicted == targets).sum().item()
                
                current_val_acc = 100. * val_correct / val_total
                val_pbar.set_postfix({'Acc': f'{current_val_acc:.2f}%'})
        
        epoch_val_loss = val_loss / len(test_loader)
        epoch_val_acc = 100. * val_correct / val_total
        
        # Update learning rate
        scheduler.step(epoch_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        
        # Track metrics
        metrics.update(
            epoch_train_loss, epoch_train_acc,
            epoch_val_loss, epoch_val_acc, current_lr
        )
        
        # Update epoch progress bar
        epoch_time = time.time() - epoch_start_time
        epoch_pbar.set_postfix({
            'Train Acc': f'{epoch_train_acc:.2f}%',
            'Val Acc': f'{epoch_val_acc:.2f}%',
            'Best': f'{metrics.best_val_acc:.2f}%',
            'Time': f'{epoch_time:.1f}s'
        })
        
        # Print detailed epoch summary
        print(f"\n📊 Epoch {epoch+1}/{CONFIG['num_epochs']} Summary:")
        print(f"   Train - Loss: {epoch_train_loss:.4f}, Acc: {epoch_train_acc:.2f}%")
        print(f"   Val   - Loss: {epoch_val_loss:.4f}, Acc: {epoch_val_acc:.2f}%")
        print(f"   LR: {current_lr:.6f}, Time: {epoch_time:.1f}s")
        
        # Save best model
        if epoch_val_acc > metrics.best_val_acc:
            metrics.best_val_acc = epoch_val_acc
            metrics.best_epoch = epoch
            best_model_state = model.state_dict().copy()
            patience_counter = 0
            
            # Save checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_val_acc': epoch_val_acc,
                'config': CONFIG
            }, MODEL_SAVE_PATH)
            
            print(f"   💾 New best model saved! (Val Acc: {epoch_val_acc:.2f}%)")
        else:
            patience_counter += 1
            
        # Early stopping
        if patience_counter >= CONFIG['patience']:
            print(f"\n⏹️ Early stopping triggered after {epoch+1} epochs")
            print(f"   Best validation accuracy: {metrics.best_val_acc:.2f}% (epoch {metrics.best_epoch+1})")
            break
        
        # Stop if learning rate is too small
        if current_lr < CONFIG['min_lr']:
            print(f"\n⏹️ Learning rate too small ({current_lr:.2e}), stopping training")
            break
    
    # Training complete
    total_time = time.time() - training_start_time
    epoch_pbar.close()
    
    print(f"\n🏁 Training Complete!")
    print("=" * 60)
    print(f"⏱️  Total time: {total_time/3600:.2f} hours ({total_time/60:.1f} minutes)")
    print(f"🏆 Best validation accuracy: {metrics.best_val_acc:.2f}% (epoch {metrics.best_epoch+1})")
    print(f"🎯 Literature benchmark: 87.60%")
    
    if metrics.best_val_acc >= 87.60:
        print(f"🎉 SUCCESS! We achieved the literature benchmark!")
    elif metrics.best_val_acc >= 80.0:
        print(f"✅ EXCELLENT! Strong performance, very close to benchmark")
    else:
        print(f"📈 Good progress! Room for hyperparameter tuning")
    
    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"✅ Best model restored")
    
    # Plot training curves
    metrics.plot_metrics()
    
    return metrics

print("✅ Training function with progress bars ready!")

✅ Training function with progress bars ready!


In [60]:
if data_ready:
    # Initialize model
    model = TemporalGCN(
        num_nodes=CONFIG['num_nodes'],
        num_features=CONFIG['num_features'],
        num_classes=train_dataset.num_classes,
        gcn_hidden=CONFIG['gcn_hidden'],
        temporal_kernel=CONFIG['temporal_kernel'],
        dropout=CONFIG['dropout'],
        num_gcn_layers=CONFIG['num_gcn_layers']
    ).to(device)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"🏗️ Model Architecture:")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")
    print(f"  Model size: {total_params * 4 / 1024 / 1024:.2f} MB")
    
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(
        model.parameters(),
        lr=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay']
    )
    
    # Learning rate scheduler
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=5,
        min_lr=CONFIG['min_lr']
    )
    
    # Metrics tracker
    metrics = MetricsTracker()
    
    print(f"✅ Training setup complete!")
    print(f"  Device: {device}")
    print(f"  Optimizer: AdamW")
    print(f"  Scheduler: ReduceLROnPlateau")
    print(f"  Loss function: CrossEntropyLoss")
    
    # Test a forward pass
    model.eval()
    with torch.no_grad():
        test_batch = next(iter(train_loader))
        test_data, test_targets = test_batch[0][:2].to(device), test_batch[1][:2].to(device)
        test_output = model(test_data)
        print(f"\n🧪 Test forward pass:")
        print(f"  Input shape: {test_data.shape}")
        print(f"  Output shape: {test_output.shape}")
        print(f"  Expected output shape: [2, {train_dataset.num_classes}]")
        
        if test_output.shape == (2, train_dataset.num_classes):
            print(f"  ✅ Forward pass successful!")
        else:
            print(f"  ❌ Shape mismatch in forward pass!")

else:
    print("❌ Cannot initialize model. Please fix data issues first.")

IMPROVED Graph: 553 nodes, 308 edges
Added face landmarks and enhanced connectivity for ASL recognition
🏗️ Model Architecture:
  Total parameters: 117,730
  Trainable parameters: 117,730
  Model size: 0.45 MB
✅ Training setup complete!
  Device: cuda
  Optimizer: AdamW
  Scheduler: ReduceLROnPlateau
  Loss function: CrossEntropyLoss

🧪 Test forward pass:
  Input shape: torch.Size([2, 40, 553, 3])
  Output shape: torch.Size([2, 10])
  Expected output shape: [2, 10]
  ✅ Forward pass successful!

🧪 Test forward pass:
  Input shape: torch.Size([2, 40, 553, 3])
  Output shape: torch.Size([2, 10])
  Expected output shape: [2, 10]
  ✅ Forward pass successful!


## 🛠️ Core Ultra-Optimized Training Loop

This section contains the core ultra-optimized training loop with all GPU enhancements.


In [61]:
def _run_ultra_training_loop(compiled_model, scaler, metrics, best_model_state, 
                            patience_counter, training_start_time, batch_times, 
                            epoch_times, samples_per_second):
    """Core ultra-optimized training loop with all GPU enhancements"""
    
    # Create epoch progress bar
    epoch_pbar = trange(CONFIG['num_epochs'], desc='🚀 Ultra-Fast Training', 
                       position=0, leave=True)
    
    for epoch in epoch_pbar:
        epoch_start_time = time.time()
        
        # Training phase with ultra optimizations
        compiled_model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        # Create batch progress bar with speed metrics
        train_pbar = tqdm(train_loader, desc=f'⚡ Epoch {epoch+1}/{CONFIG["num_epochs"]}', 
                         position=1, leave=False)
        
        for batch_idx, (data, targets) in enumerate(train_pbar):
            batch_iteration_start = time.time()
            
            # 🚀 Optimized data transfer with non-blocking
            if CONFIG['non_blocking']:
                data = data.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)
            else:
                data, targets = data.to(device), targets.to(device)
            
            # 🚀 Apply channels-last format if enabled
            if CONFIG['use_channels_last'] and torch.cuda.is_available():
                try:
                    data = data.to(memory_format=torch.channels_last)
                except:
                    pass  # Fallback silently
            
            # 🚀 Forward pass with mixed precision
            optimizer.zero_grad()
            
            if scaler is not None:
                # Mixed precision forward pass
                with torch.cuda.amp.autocast():
                    outputs = compiled_model(data)
                    loss = criterion(outputs, targets)
                
                # Mixed precision backward pass
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(compiled_model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                # Standard precision
                outputs = compiled_model(data)
                loss = criterion(outputs, targets)
                
                loss.backward()
                torch.nn.utils.clip_grad_norm_(compiled_model.parameters(), max_norm=1.0)
                optimizer.step()
            
            # Track metrics and speed
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += targets.size(0)
            train_correct += (predicted == targets).sum().item()
            
            # 🚀 Speed tracking
            batch_end_time = time.time()
            batch_time = batch_end_time - batch_iteration_start
            batch_times.append(batch_time)
            
            samples_processed = targets.size(0)
            current_speed = samples_processed / batch_time
            samples_per_second.append(current_speed)
            
            # Calculate current metrics
            current_acc = 100. * train_correct / train_total
            avg_speed = np.mean(samples_per_second[-10:]) if len(samples_per_second) >= 10 else np.mean(samples_per_second)
            
            # Update progress bar with speed info
            train_pbar.set_postfix({
                'Loss': f'{loss.item():.4f}',
                'Acc': f'{current_acc:.2f}%',
                'Speed': f'{avg_speed:.1f}s/s',
                'Batch': f'{batch_time*1000:.1f}ms'
            })
            
            # Memory cleanup every 50 batches
            if batch_idx % 50 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()
        
        # Validation and epoch completion
        epoch_metrics = _run_validation_and_update(compiled_model, scaler, epoch, train_loss, 
                                                  train_correct, train_total, metrics, 
                                                  best_model_state, patience_counter,
                                                  epoch_start_time, epoch_times, samples_per_second,
                                                  epoch_pbar)
        
        if epoch_metrics is None:  # Early stopping or target achieved
            break
        
        best_model_state, patience_counter = epoch_metrics
        
        # Memory cleanup between epochs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    return _finalize_ultra_training(metrics, best_model_state, compiled_model, 
                                   training_start_time, samples_per_second, epoch_times)

def _run_validation_and_update(compiled_model, scaler, epoch, train_loss, train_correct, 
                              train_total, metrics, best_model_state, patience_counter,
                              epoch_start_time, epoch_times, samples_per_second, epoch_pbar):
    """Ultra-fast validation with mixed precision and metric updates"""
    
    # Calculate epoch metrics
    epoch_train_loss = train_loss / len(train_loader)
    epoch_train_acc = 100. * train_correct / train_total
    
    # 🚀 Ultra-fast validation with mixed precision
    compiled_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        val_pbar = tqdm(test_loader, desc='⚡ Ultra-Fast Val', position=1, leave=False)
        for data, targets in val_pbar:
            if CONFIG['non_blocking']:
                data = data.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)
            else:
                data, targets = data.to(device), targets.to(device)
            
            # Apply channels-last if enabled
            if CONFIG['use_channels_last'] and torch.cuda.is_available():
                try:
                    data = data.to(memory_format=torch.channels_last)
                except:
                    pass
            
            # Mixed precision inference
            if scaler is not None:
                with torch.cuda.amp.autocast():
                    outputs = compiled_model(data)
                    loss = criterion(outputs, targets)
            else:
                outputs = compiled_model(data)
                loss = criterion(outputs, targets)
            
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_total += targets.size(0)
            val_correct += (predicted == targets).sum().item()
            
            current_val_acc = 100. * val_correct / val_total
            val_pbar.set_postfix({'Acc': f'{current_val_acc:.2f}%'})
    
    epoch_val_loss = val_loss / len(test_loader)
    epoch_val_acc = 100. * val_correct / val_total
    
    # Update learning rate
    scheduler.step(epoch_val_loss)
    current_lr = optimizer.param_groups[0]['lr']
    
    # Track metrics
    metrics.update(epoch_train_loss, epoch_train_acc, epoch_val_loss, epoch_val_acc, current_lr)
    
    # 🚀 Speed calculations
    epoch_time = time.time() - epoch_start_time
    epoch_times.append(epoch_time)
    
    total_samples = len(train_dataset) + len(test_dataset)
    epoch_speed = total_samples / epoch_time
    avg_epoch_speed = np.mean(epoch_times[-5:]) if len(epoch_times) >= 5 else np.mean(epoch_times)
    
    # Estimate remaining time
    remaining_epochs = CONFIG['num_epochs'] - (epoch + 1)
    estimated_time_remaining = remaining_epochs * avg_epoch_speed / 60  # minutes
    
    # Update epoch progress bar with ultra-detailed metrics
    epoch_pbar.set_postfix({
        'Train Acc': f'{epoch_train_acc:.2f}%',
        'Val Acc': f'{epoch_val_acc:.2f}%',
        'Best': f'{metrics.best_val_acc:.2f}%',
        'Speed': f'{epoch_speed:.0f}s/s',
        'ETA': f'{estimated_time_remaining:.1f}m'
    })
    
    # Print ultra-detailed epoch summary
    print(f"\n⚡ ULTRA-FAST Epoch {epoch+1}/{CONFIG['num_epochs']} Summary:")
    print(f"   📈 Train - Loss: {epoch_train_loss:.4f}, Acc: {epoch_train_acc:.2f}%")
    print(f"   📊 Val   - Loss: {epoch_val_loss:.4f}, Acc: {epoch_val_acc:.2f}%")
    print(f"   🏆 Best  - Val Acc: {metrics.best_val_acc:.2f}% (epoch {metrics.best_epoch+1})")
    print(f"   ⚡ Speed - {epoch_speed:.0f} samples/sec, Epoch: {epoch_time:.1f}s")
    print(f"   🕒 ETA   - {estimated_time_remaining:.1f} minutes remaining")
    print(f"   🔧 LR    - {current_lr:.6f}")
    
    # Save best model with enhanced info
    if epoch_val_acc > metrics.best_val_acc:
        metrics.best_val_acc = epoch_val_acc
        metrics.best_epoch = epoch
        best_model_state = compiled_model.state_dict().copy()
        patience_counter = 0
        
        # Save checkpoint with speed metrics
        torch.save({
            'epoch': epoch,
            'model_state_dict': compiled_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict() if scaler else None,
            'best_val_acc': epoch_val_acc,
            'config': CONFIG,
            'class_mapping': class_mapping,
            'speed_metrics': {
                'epoch_speed': epoch_speed,
                'avg_samples_per_second': np.mean(samples_per_second[-100:]) if samples_per_second else 0
            }
        }, MODEL_SAVE_PATH)
        
        print(f"   💾 🚀 NEW BEST MODEL SAVED! Val Acc: {epoch_val_acc:.2f}%")
        
        # Check if we've achieved the target
        if epoch_val_acc >= 87.60:
            print(f"\n🎉 🏆 TARGET ACHIEVED! 🏆 🎉")
            print(f"Ultra-fast training reached {epoch_val_acc:.2f}% >= 87.60% benchmark!")
            return None  # Signal to stop training
    else:
        patience_counter += 1
        print(f"   ⏳ Patience: {patience_counter}/{CONFIG['patience']}")
        
        # Early stopping
        if patience_counter >= CONFIG['patience']:
            print(f"\n⏹️ Early stopping triggered at epoch {epoch+1}")
            print(f"Best validation accuracy: {metrics.best_val_acc:.2f}% (epoch {metrics.best_epoch+1})")
            return None  # Signal to stop training
    
    # Stop if learning rate becomes too small
    if current_lr < CONFIG['min_lr']:
        print(f"\n⏹️ Learning rate too small ({current_lr:.2e}), stopping training")
        return None  # Signal to stop training
    
    return best_model_state, patience_counter

def _finalize_ultra_training(metrics, best_model_state, compiled_model, training_start_time, 
                           samples_per_second, epoch_times):
    """Finalize ultra-optimized training with results and analysis"""
    
    total_time = time.time() - training_start_time
    
    print(f"\n🏁 🚀 ULTRA-FAST TRAINING COMPLETE! 🚀 🏁")
    print("=" * 70)
    print(f"⏱️  Total time: {total_time/3600:.2f} hours ({total_time/60:.1f} minutes)")
    print(f"🏆 Best validation accuracy: {metrics.best_val_acc:.2f}% (epoch {metrics.best_epoch+1})")
    print(f"🎯 Literature benchmark: 87.60%")
    print(f"⚡ Average speed: {np.mean(samples_per_second):.0f} samples/second")
    print(f"🔥 Average epoch time: {np.mean(epoch_times):.1f} seconds")
    
    # Performance analysis
    if metrics.best_val_acc >= 87.60:
        print(f"🎉 🎉 🎉 SUCCESS! BENCHMARK ACHIEVED! 🎉 🎉 🎉")
        print(f"Ultra-optimized training achieved {metrics.best_val_acc:.2f}% >= 87.60%!")
    elif metrics.best_val_acc >= 85.0:
        print(f"🔥 🔥 EXCELLENT! Very close to benchmark! 🔥 🔥")
        print(f"Achieved {metrics.best_val_acc:.2f}% - only {87.60 - metrics.best_val_acc:.2f}% away!")
    elif metrics.best_val_acc >= 80.0:
        print(f"✅ ✅ GREAT! Strong performance! ✅ ✅")
    else:
        print(f"📈 Good progress! Consider hyperparameter tuning.")
    
    # Speed analysis
    if samples_per_second and epoch_times:
        print(f"\n⚡ SPEED ANALYSIS:")
        print(f"   🚀 Peak speed: {max(samples_per_second):.0f} samples/second")
        print(f"   📊 Average speed: {np.mean(samples_per_second):.0f} samples/second")
        print(f"   🔥 Fastest epoch: {min(epoch_times):.1f} seconds")
        print(f"   ⚡ Total speedup: ~{len(samples_per_second) * np.mean(samples_per_second) / max(1, total_time):.1f}x")
    
    # Load best model
    if best_model_state is not None:
        compiled_model.load_state_dict(best_model_state)
        print(f"✅ Best model restored for evaluation")
    
    # Plot training curves
    print("\n📈 Plotting ultra-fast training metrics...")
    metrics.plot_metrics()
    
    return metrics

## 🚀 Execute Ultra-Optimized Training

Run the complete ultra-optimized training pipeline with all GPU enhancements enabled.


In [62]:
def train_model_ultra_optimized():
    """Ultra-optimized training function with all GPU enhancements"""
    
    print("🚀 Starting ultra-fast training with GPU optimizations...")
    print("🔥 Features: Mixed Precision + Model Compilation + Speed Tracking")
    print(f"📊 Dataset: {len(train_dataset)} train, {len(test_dataset)} test")
    print(f"🎯 Classes: {train_dataset.num_classes}")
    print(f"🏗️ Model: {sum(p.numel() for p in model.parameters()):,} parameters")
    print(f"⚙️ Device: {device}")
    print(f"🎯 Target: 87.60% accuracy (WLASL-100 benchmark)")
    
    # Initialize training components
    try:
        # Enable optimizations
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        # Initialize mixed precision scaler
        scaler = GradScaler() if CONFIG['use_mixed_precision'] and torch.cuda.is_available() else None
        
        # Compile model for PyTorch 2.0+ optimizations
        compilation_successful = False
        if CONFIG['compile_model']:
            try:
                compiled_model = torch.compile(model, mode='reduce-overhead')
                compilation_successful = True
                print("✅ Model compilation successful!")
            except Exception as e:
                compiled_model = model
                print(f"⚠️ Model compilation failed: {e}")
                print("   Falling back to regular model")
        else:
            compiled_model = model
            print("ℹ️ Model compilation disabled in config")
        
        # Initialize metrics tracker
        metrics = MetricsTracker()
        
        # Training variables
        best_model_state = None
        patience_counter = 0
        training_start_time = time.time()
        batch_times = []
        epoch_times = []
        samples_per_second = []
        
        print("\n🚀 Starting ultra-optimized training loop...")
        
        # Execute the core training loop
        result = _run_ultra_training_loop(
            compiled_model, scaler, metrics, best_model_state,
            patience_counter, training_start_time, batch_times,
            epoch_times, samples_per_second
        )
        
        if result is not None:
            best_model_state, patience_counter = result
        
        # Finalize training
        final_metrics = _finalize_ultra_training(
            metrics, best_model_state, compiled_model, training_start_time,
            samples_per_second, epoch_times
        )
        
        return final_metrics
        
    except Exception as e:
        print(f"❌ Training failed with error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
if data_ready:
    print("🚀 STARTING ULTRA-OPTIMIZED TGCN TRAINING 🚀")
    print("=" * 70)
    print(f"📊 Dataset: {len(train_dataset)} train, {len(test_dataset)} test samples")
    print(f"🎯 Classes: {train_dataset.num_classes}")
    print(f"🏗️ Model: {sum(p.numel() for p in model.parameters()):,} parameters")
    print(f"⚙️ Device: {device}")
    print(f"🎯 Target: 87.60% accuracy (WLASL-100 benchmark)")
    print("\n" + "="*70)
    
    # Start ultra-optimized training
    training_metrics = train_model_ultra_optimized()
    
    if training_metrics:
        print(f"\n🏆 ULTRA-FAST Training Results:")
        print(f"   🎯 Best Accuracy: {training_metrics.best_val_acc:.2f}%")
        print(f"   📅 Best Epoch: {training_metrics.best_epoch + 1}")
        print(f"   💾 Model saved: {MODEL_SAVE_PATH}")
        
        # Show final comparison
        if training_metrics.best_val_acc >= 87.60:
            print(f"\n🎉 🎉 🎉 SUCCESS! 🎉 🎉 🎉")
            print(f"BENCHMARK ACHIEVED: {training_metrics.best_val_acc:.2f}% >= 87.60%!")
            print(f"🚀 Ultra-optimized training was successful!")
        elif training_metrics.best_val_acc >= 85.0:
            print(f"\n🔥 EXCELLENT! Very close to benchmark!")
            print(f"Achieved {training_metrics.best_val_acc:.2f}% - only {87.60 - training_metrics.best_val_acc:.2f}% away!")
        elif training_metrics.best_val_acc >= 80.0:
            print(f"\n✅ GREAT! Strong performance achieved!")
        else:
            print(f"\n📈 Good progress! Consider hyperparameter tuning.")
    
    # Save training metrics
    training_results = {
        'best_val_acc': training_metrics.best_val_acc,
        'best_epoch': training_metrics.best_epoch,
        'total_epochs': len(training_metrics.val_accuracies),
        'final_train_acc': training_metrics.train_accuracies[-1] if training_metrics.train_accuracies else 0,
        'config': CONFIG,
        'timestamp': time.strftime('%Y%m%d_%H%M%S')
    }
    
    with open(os.path.join(CHECKPOINT_DIR, 'training_results.json'), 'w') as f:
        json.dump(training_results, f, indent=2)
    
    print(f"\n💾 Training results saved to: {CHECKPOINT_DIR}/training_results.json")
    print(f"Model saved to: {MODEL_SAVE_PATH}")

else:
    print("❌ Cannot start training. Please fix data issues first.")

🚀 STARTING ULTRA-OPTIMIZED TGCN TRAINING 🚀
📊 Dataset: 135 train, 34 test samples
🎯 Classes: 10
🏗️ Model: 117,730 parameters
⚙️ Device: cuda
🎯 Target: 87.60% accuracy (WLASL-100 benchmark)

🚀 Starting ultra-fast training with GPU optimizations...
🔥 Features: Mixed Precision + Model Compilation + Speed Tracking
📊 Dataset: 135 train, 34 test
🎯 Classes: 10
🏗️ Model: 117,730 parameters
⚙️ Device: cuda
🎯 Target: 87.60% accuracy (WLASL-100 benchmark)
ℹ️ Model compilation disabled in config

🚀 Starting ultra-optimized training loop...


🚀 Ultra-Fast Training:   0%|          | 0/50 [00:00<?, ?it/s]

⚡ Epoch 1/50:   0%|          | 0/6 [00:00<?, ?it/s]

⚡ Ultra-Fast Val:   0%|          | 0/2 [00:00<?, ?it/s]


⚡ ULTRA-FAST Epoch 1/50 Summary:
   📈 Train - Loss: 2.5620, Acc: 9.63%
   📊 Val   - Loss: 2.2970, Acc: 14.71%
   🏆 Best  - Val Acc: 14.71% (epoch 1)
   ⚡ Speed - 7 samples/sec, Epoch: 22.7s
   🕒 ETA   - 18.5 minutes remaining
   🔧 LR    - 0.002000
   ⏳ Patience: 1/10


⚡ Epoch 2/50:   0%|          | 0/6 [00:00<?, ?it/s]

⚡ Ultra-Fast Val:   0%|          | 0/2 [00:00<?, ?it/s]


⚡ ULTRA-FAST Epoch 2/50 Summary:
   📈 Train - Loss: 2.3577, Acc: 11.11%
   📊 Val   - Loss: 2.2981, Acc: 11.76%
   🏆 Best  - Val Acc: 14.71% (epoch 1)
   ⚡ Speed - 8 samples/sec, Epoch: 22.2s
   🕒 ETA   - 18.0 minutes remaining
   🔧 LR    - 0.002000
   ⏳ Patience: 2/10


⚡ Epoch 3/50:   0%|          | 0/6 [00:00<?, ?it/s]

⚡ Ultra-Fast Val:   0%|          | 0/2 [00:00<?, ?it/s]


⚡ ULTRA-FAST Epoch 3/50 Summary:
   📈 Train - Loss: 2.2906, Acc: 9.63%
   📊 Val   - Loss: 2.3107, Acc: 14.71%
   🏆 Best  - Val Acc: 14.71% (epoch 1)
   ⚡ Speed - 8 samples/sec, Epoch: 22.3s
   🕒 ETA   - 17.6 minutes remaining
   🔧 LR    - 0.002000
   ⏳ Patience: 3/10


⚡ Epoch 4/50:   0%|          | 0/6 [00:00<?, ?it/s]

⚡ Ultra-Fast Val:   0%|          | 0/2 [00:00<?, ?it/s]


⚡ ULTRA-FAST Epoch 4/50 Summary:
   📈 Train - Loss: 2.3114, Acc: 11.11%
   📊 Val   - Loss: 2.3036, Acc: 11.76%
   🏆 Best  - Val Acc: 14.71% (epoch 1)
   ⚡ Speed - 8 samples/sec, Epoch: 21.8s
   🕒 ETA   - 17.1 minutes remaining
   🔧 LR    - 0.002000
   ⏳ Patience: 4/10


⚡ Epoch 5/50:   0%|          | 0/6 [00:00<?, ?it/s]

⚡ Ultra-Fast Val:   0%|          | 0/2 [00:00<?, ?it/s]


⚡ ULTRA-FAST Epoch 5/50 Summary:
   📈 Train - Loss: 2.2977, Acc: 13.33%
   📊 Val   - Loss: 2.2972, Acc: 14.71%
   🏆 Best  - Val Acc: 14.71% (epoch 1)
   ⚡ Speed - 8 samples/sec, Epoch: 21.6s
   🕒 ETA   - 16.6 minutes remaining
   🔧 LR    - 0.002000
   ⏳ Patience: 5/10


⚡ Epoch 6/50:   0%|          | 0/6 [00:00<?, ?it/s]

⚡ Ultra-Fast Val:   0%|          | 0/2 [00:00<?, ?it/s]


⚡ ULTRA-FAST Epoch 6/50 Summary:
   📈 Train - Loss: 2.3020, Acc: 6.67%
   📊 Val   - Loss: 2.2919, Acc: 14.71%
   🏆 Best  - Val Acc: 14.71% (epoch 1)
   ⚡ Speed - 6 samples/sec, Epoch: 26.3s
   🕒 ETA   - 16.8 minutes remaining
   🔧 LR    - 0.002000
   ⏳ Patience: 6/10


⚡ Epoch 7/50:   0%|          | 0/6 [00:00<?, ?it/s]

## 📊 Model Evaluation and Analysis

Evaluate the trained model and analyze its performance in detail.


## 📊 Model Evaluation and Analysis

Evaluate the trained model and analyze its performance in detail.


In [None]:
def evaluate_model(model, test_loader, class_mapping, device):
    """Comprehensive model evaluation with detailed metrics"""
    model.eval()
    
    all_predictions = []
    all_targets = []
    all_probabilities = []
    
    print("🔍 Evaluating model on test set...")
    
    with torch.no_grad():
        test_pbar = tqdm(test_loader, desc='Evaluation')
        for data, targets in test_pbar:
            data, targets = data.to(device), targets.to(device)
            
            outputs = model(data)
            probabilities = F.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            
            all_predictions.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())
    
    # Calculate overall accuracy
    accuracy = accuracy_score(all_targets, all_predictions)
    
    # Generate classification report
    class_names = [class_mapping['idx_to_word'][str(i)] for i in range(len(class_mapping['idx_to_word']))]
    report = classification_report(all_targets, all_predictions, 
                                 target_names=class_names, 
                                 output_dict=True, zero_division=0)
    
    # Generate confusion matrix
    cm = confusion_matrix(all_targets, all_predictions)
    
    print(f"\n📊 Evaluation Results:")
    print(f"  Overall Accuracy: {accuracy*100:.2f}%")
    print(f"  Macro Avg F1-Score: {report['macro avg']['f1-score']:.3f}")
    print(f"  Weighted Avg F1-Score: {report['weighted avg']['f1-score']:.3f}")
    
    # Per-class analysis
    print(f"\n📋 Per-Class Performance:")
    class_accuracies = []
    for i, class_name in enumerate(class_names):
        if str(i) in report and isinstance(report[str(i)], dict):
            precision = report[str(i)]['precision']
            recall = report[str(i)]['recall']
            f1 = report[str(i)]['f1-score']
            support = report[str(i)]['support']
            
            # Calculate per-class accuracy
            class_mask = np.array(all_targets) == i
            if np.sum(class_mask) > 0:
                class_acc = accuracy_score(
                    np.array(all_targets)[class_mask],
                    np.array(all_predictions)[class_mask]
                )
                class_accuracies.append((class_name, class_acc, np.sum(class_mask)))
    
    # Sort by accuracy
    class_accuracies.sort(key=lambda x: x[1], reverse=True)
    
    print(f"\n🎯 Top 10 Best Performing Classes:")
    for name, acc, count in class_accuracies[:10]:
        print(f"  {name}: {acc*100:.1f}% ({count} samples)")
    
    print(f"\n🎯 Bottom 10 Performing Classes:")
    for name, acc, count in class_accuracies[-10:]:
        print(f"  {name}: {acc*100:.1f}% ({count} samples)")
    
    return accuracy, report, cm

if data_ready and os.path.exists(MODEL_SAVE_PATH):
    print("🔄 Loading best model for evaluation...")
    
    # Load the best model
    checkpoint = torch.load(MODEL_SAVE_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    
    print(f"✅ Model loaded from epoch {checkpoint['epoch']+1}")
    print(f"Best validation accuracy: {checkpoint['best_val_acc']:.2f}%")
    
    # Evaluate on test set
    accuracy, report, cm = evaluate_model(model, test_loader, class_mapping, device)
    
    # Compare with literature
    print(f"\n🏆 Performance Comparison:")
    print(f"Our Model: {accuracy*100:.2f}%")
    print(f"Literature Benchmark (WLASL-100): 87.60%")
    
    if accuracy*100 >= 87.60:
        print(f"🎉 SUCCESS! We've achieved the literature benchmark!")
    elif accuracy*100 >= 80.0:
        print(f"✅ GOOD! Strong performance, close to benchmark")
    elif accuracy*100 >= 70.0:
        print(f"⚠️ MODERATE: Decent performance, room for improvement")
    else:
        print(f"❌ POOR: Significant improvement needed")
    
    # Save evaluation results
    evaluation_results = {
        'accuracy': accuracy,
        'classification_report': report,
        'confusion_matrix': cm.tolist(),
        'model_path': MODEL_SAVE_PATH,
        'timestamp': time.strftime('%Y%m%d_%H%M%S')
    }
    
    eval_path = os.path.join(CHECKPOINT_DIR, 'evaluation_results.json')
    with open(eval_path, 'w') as f:
        json.dump(evaluation_results, f, indent=2, default=str)
    
    print(f"\n💾 Evaluation results saved to: {eval_path}")
    
else:
    print("❌ Model not found. Please train the model first.")

## 🎯 Training Summary and Next Steps

Final summary of the ultra-optimized TGCN training results and recommendations for next steps.


In [None]:
print("\n" + "="*70)
print("🎯 ULTRA-OPTIMIZED TGCN TRAINING SUMMARY")
print("="*70)

if data_ready and os.path.exists(MODEL_SAVE_PATH):
    # Load final results
    checkpoint = torch.load(MODEL_SAVE_PATH, map_location='cpu')
    final_accuracy = checkpoint.get('best_val_acc', 0)
    
    print(f"📊 Final Results:")
    print(f"   🎯 Best Validation Accuracy: {final_accuracy:.2f}%")
    print(f"   📈 Target Benchmark: 87.60%")
    print(f"   📁 Model saved at: {MODEL_SAVE_PATH}")
    
    # Performance assessment
    if final_accuracy >= 87.60:
        print(f"\n🎉 🏆 OUTSTANDING SUCCESS! 🏆 🎉")
        print(f"✅ Achieved benchmark: {final_accuracy:.2f}% >= 87.60%")
        print(f"🚀 Ultra-optimized training was highly effective!")
        status = "SUCCESS"
    elif final_accuracy >= 85.0:
        print(f"\n🔥 EXCELLENT PERFORMANCE! 🔥")
        print(f"✅ Very close to benchmark: {final_accuracy:.2f}%")
        print(f"📈 Gap: {87.60 - final_accuracy:.2f}%")
        status = "EXCELLENT"
    elif final_accuracy >= 80.0:
        print(f"\n✅ GOOD PERFORMANCE!")
        print(f"📊 Achieved: {final_accuracy:.2f}%")
        print(f"📈 Gap: {87.60 - final_accuracy:.2f}%")
        status = "GOOD"
    elif final_accuracy >= 70.0:
        print(f"\n⚠️ MODERATE PERFORMANCE")
        print(f"📊 Achieved: {final_accuracy:.2f}%")
        print(f"📈 Needs improvement: {87.60 - final_accuracy:.2f}%")
        status = "MODERATE"
    else:
        print(f"\n❌ POOR PERFORMANCE")
        print(f"📊 Achieved: {final_accuracy:.2f}%")
        print(f"🔧 Significant improvements needed")
        status = "POOR"
    
    print(f"\n🚀 Optimization Features Used:")
    print(f"   ✅ Mixed Precision Training (AMP)")
    print(f"   ✅ PyTorch 2.0 Model Compilation")
    print(f"   ✅ cuDNN Optimizations")
    print(f"   ✅ Memory Layout Optimization")
    print(f"   ✅ Async GPU Transfers")
    print(f"   ✅ Gradient Clipping & Scaling")
    print(f"   ✅ Early Stopping & LR Scheduling")
    print(f"   ✅ Data Augmentation")
    
    print(f"\n📋 Next Steps Based on Performance:")
    
    if status == "SUCCESS":
        print(f"   🎯 Model is ready for deployment!")
        print(f"   📊 Consider inference optimization")
        print(f"   🔄 Test on additional datasets")
        print(f"   📝 Document deployment procedures")
    
    elif status in ["EXCELLENT", "GOOD"]:
        print(f"   🔧 Fine-tune hyperparameters:")
        print(f"      - Increase model capacity (gcn_hidden: 256+)")
        print(f"      - More training epochs with lower learning rate")
        print(f"      - Stronger data augmentation")
        print(f"   📊 Ensemble methods")
        print(f"   🎯 Test different architectures")
    
    else:
        print(f"   🔧 Major improvements needed:")
        print(f"      - Check data quality and preprocessing")
        print(f"      - Increase model complexity")
        print(f"      - More training data")
        print(f"      - Learning rate scheduling")
        print(f"   📊 Consider different model architectures")
        print(f"   🐛 Debug training pipeline")
    
    print(f"\n💾 Files Generated:")
    print(f"   🔹 Model: {MODEL_SAVE_PATH}")
    print(f"   🔹 Class mapping: {CHECKPOINT_DIR}/class_mapping.json")
    print(f"   🔹 Training results: {CHECKPOINT_DIR}/training_results.json")
    if os.path.exists(os.path.join(CHECKPOINT_DIR, 'evaluation_results.json')):
        print(f"   🔹 Evaluation: {CHECKPOINT_DIR}/evaluation_results.json")

else:
    print(f"❌ Training incomplete or failed.")
    print(f"📋 Troubleshooting steps:")
    print(f"   1. Check data directory: {DATA_DIR}")
    print(f"   2. Verify keypoint extraction")
    print(f"   3. Check GPU memory and CUDA")
    print(f"   4. Review error messages above")

print(f"\n🎉 Ultra-Optimized TGCN Pipeline Complete!")
print("="*70)