In [1]:
# ===== RESTART KERNEL FIRST AND RUN THIS COMPLETE CODE =====
# OPTIMIZED 5-8M PARAMETER MODEL WITH PROGRESS BAR AND EVERY EPOCH RESULTS

# Restart your kernel first, then run this
import sys
import os
import gc
import math
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Safe torch imports
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    print("✅ PyTorch loaded successfully")
except ImportError as e:
    print(f"❌ PyTorch import failed: {e}")
    sys.exit(1)

# Progress bar import
try:
    from tqdm import tqdm
    print("✅ tqdm loaded for progress bars")
except ImportError:
    print("⚠️ tqdm not found, using basic progress")
    class tqdm:
        def __init__(self, iterable=None, total=None, desc="", unit="epoch"):
            self.iterable = iterable if iterable else range(total)
            self.total = total
            self.desc = desc
            self.current = 0
            
        def __iter__(self):
            return self
            
        def __next__(self):
            if self.current >= len(self.iterable):
                raise StopIteration
            value = self.iterable[self.current]
            self.current += 1
            progress = self.current / len(self.iterable) * 100
            print(f"\r{self.desc} [{self.current}/{len(self.iterable)}] {progress:.1f}%", end='', flush=True)
            return value
            
        def set_postfix(self, **kwargs):
            pass

# Safe PyTorch Geometric imports with fallback
try:
    import torch_geometric
    from torch_geometric.nn import HeteroConv, GATConv, SAGEConv
    from torch_geometric.nn import Linear
    print("✅ PyTorch Geometric loaded successfully")
except (ImportError, AttributeError) as e:
    print(f"⚠️ PyTorch Geometric issue: {e}")
    print("🔧 Using fallback Linear layer...")
    Linear = nn.Linear

# Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")

class OptimizedGNN58M(nn.Module):
    """Optimized 5-8M Parameter GNN for Maximum Accuracy"""
    def __init__(self, sequence_features, problem_features, skill_features,
                 hidden_dim=320, num_layers=5, num_heads=20, dropout=0.16, device='cuda'):
        super(OptimizedGNN58M, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.num_heads = num_heads
        self.device = device
        
        # OPTIMIZED input projections - 3x expansion (reduced from 4x)
        self.sequence_input_proj = nn.Sequential(
            nn.Linear(sequence_features, hidden_dim * 3),  # 3x instead of 4x
            nn.BatchNorm1d(hidden_dim * 3),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 3, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU()
        ).to(device)
        
        self.problem_input_proj = nn.Sequential(
            nn.Linear(problem_features, hidden_dim * 3),
            nn.BatchNorm1d(hidden_dim * 3),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 3, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU()
        ).to(device)
        
        self.skill_input_proj = nn.Sequential(
            nn.Linear(skill_features, hidden_dim * 3),
            nn.BatchNorm1d(hidden_dim * 3),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 3, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU()
        ).to(device)
        
        # Optimized heterogeneous layers - 5 layers (reduced from 6)
        self.conv_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        self.residual_projections = nn.ModuleList()
        
        for i in range(num_layers):
            try:
                conv_dict = {
                    ('sequence', 'predicts', 'problem'): GATConv(
                        hidden_dim, hidden_dim // num_heads, heads=num_heads,
                        dropout=dropout, add_self_loops=False, edge_dim=9, concat=True
                    ),
                    ('problem', 'predicted_by', 'sequence'): GATConv(
                        hidden_dim, hidden_dim // num_heads, heads=num_heads,
                        dropout=dropout, add_self_loops=False, concat=True
                    ),
                    ('problem', 'requires', 'skill'): SAGEConv(
                        (hidden_dim, hidden_dim), hidden_dim, aggr='mean'
                    ),
                    ('skill', 'required_by', 'problem'): SAGEConv(
                        (hidden_dim, hidden_dim), hidden_dim, aggr='mean'
                    ),
                }
                self.conv_layers.append(HeteroConv(conv_dict, aggr='mean').to(device))
            except Exception as e:
                print(f"⚠️ Graph layer {i} failed: {e}")
                fallback_layer = nn.Linear(hidden_dim, hidden_dim).to(device)
                self.conv_layers.append(fallback_layer)
            
            bn_dict = nn.ModuleDict({
                'sequence': nn.Sequential(nn.BatchNorm1d(hidden_dim), nn.Dropout(dropout)).to(device),
                'problem': nn.Sequential(nn.BatchNorm1d(hidden_dim), nn.Dropout(dropout)).to(device),
                'skill': nn.Sequential(nn.BatchNorm1d(hidden_dim), nn.Dropout(dropout)).to(device)
            })
            self.batch_norms.append(bn_dict)
            
            residual_dict = nn.ModuleDict({
                'sequence': nn.Linear(hidden_dim, hidden_dim, bias=False) if i > 0 else nn.Identity(),
                'problem': nn.Linear(hidden_dim, hidden_dim, bias=False) if i > 0 else nn.Identity(),
                'skill': nn.Linear(hidden_dim, hidden_dim, bias=False) if i > 0 else nn.Identity()
            })
            self.residual_projections.append(residual_dict.to(device))
        
        # OPTIMIZED temporal encoder - 4x expansion (reduced from 5x)
        self.temporal_encoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.BatchNorm1d(hidden_dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 4, hidden_dim * 2),
            nn.BatchNorm1d(hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout * 0.8),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU()
        ).to(device)
        
        # OPTIMIZED predictor - Reduced depth but maintained capacity
        self.predictor = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim * 4),  # 4x expansion
            nn.BatchNorm1d(hidden_dim * 4),
            nn.GELU(),
            nn.Dropout(dropout * 2.0),
            
            nn.Linear(hidden_dim * 4, hidden_dim * 3),  # 3x layer
            nn.BatchNorm1d(hidden_dim * 3),
            nn.GELU(),
            nn.Dropout(dropout * 1.6),
            
            nn.Linear(hidden_dim * 3, hidden_dim * 2),  # 2x layer
            nn.BatchNorm1d(hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout * 1.2),
            
            nn.Linear(hidden_dim * 2, hidden_dim),  # 1x layer
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            
            nn.Linear(hidden_dim, hidden_dim // 2),  # 0.5x layer
            nn.BatchNorm1d(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout * 0.6),
            
            nn.Linear(hidden_dim // 2, 1)  # Output
        ).to(device)
        
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_normal_(module.weight, gain=1.05)  # Slightly reduced gain
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.BatchNorm1d):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
    
    def safe_hetero_conv(self, conv_layer, x_dict, edge_index_dict, edge_attr_dict=None):
        try:
            if hasattr(conv_layer, 'convs'):
                if edge_attr_dict:
                    return conv_layer(x_dict, edge_index_dict, edge_attr_dict)
                else:
                    return conv_layer(x_dict, edge_index_dict)
            else:
                return {k: conv_layer(v) for k, v in x_dict.items()}
        except Exception as e:
            return x_dict
    
    def forward(self, data):
        try:
            x_dict = {
                'sequence': self.sequence_input_proj(data['sequence'].x.to(self.device)),
                'problem': self.problem_input_proj(data['problem'].x.to(self.device)),
                'skill': self.skill_input_proj(data['skill'].x.to(self.device))
            }
            
            for i, (conv, bn_dict, res_dict) in enumerate(zip(self.conv_layers, self.batch_norms, self.residual_projections)):
                residual = {k: res_dict[k](v) for k, v in x_dict.items()} if i > 0 else None
                
                if i == 0 and hasattr(data, 'edge_index_dict') and ('sequence', 'predicts', 'problem') in data.edge_index_dict:
                    edge_attr_dict = {
                        ('sequence', 'predicts', 'problem'): data['sequence', 'predicts', 'problem'].edge_attr.to(self.device)
                    }
                    x_dict = self.safe_hetero_conv(conv, x_dict, data.edge_index_dict, edge_attr_dict)
                else:
                    x_dict = self.safe_hetero_conv(conv, x_dict, getattr(data, 'edge_index_dict', {}))
                
                for node_type in x_dict.keys():
                    if node_type in bn_dict:
                        x_dict[node_type] = bn_dict[node_type](x_dict[node_type])
                        x_dict[node_type] = F.gelu(x_dict[node_type])
                        
                        if residual is not None and node_type in residual:
                            x_dict[node_type] = x_dict[node_type] + 0.15 * residual[node_type]
            
            sequence_embeddings = self.temporal_encoder(x_dict['sequence'])
            problem_embeddings = x_dict['problem']
            
            if hasattr(data, 'edge_index_dict') and ('sequence', 'predicts', 'problem') in data.edge_index_dict:
                edge_index = data['sequence', 'predicts', 'problem'].edge_index.to(self.device)
            else:
                seq_size = sequence_embeddings.shape[0]
                prob_size = problem_embeddings.shape[0]
                edge_index = torch.stack([
                    torch.arange(min(seq_size, prob_size), device=self.device),
                    torch.arange(min(seq_size, prob_size), device=self.device)
                ])
            
            sequence_indices = edge_index[0]
            problem_indices = edge_index[1]
            
            pred_sequence_emb = sequence_embeddings[sequence_indices]
            pred_problem_emb = problem_embeddings[problem_indices]
            combined_embeddings = torch.cat([pred_sequence_emb, pred_problem_emb], dim=1)
            
            logits = self.predictor(combined_embeddings).squeeze(-1)
            
            return logits, {
                'sequence_embeddings': sequence_embeddings,
                'problem_embeddings': problem_embeddings
            }
            
        except Exception as e:
            batch_size = data['sequence'].x.shape[0]
            return torch.zeros(batch_size, device=self.device), {}

class OptimizedSymbolicEngine58M(nn.Module):
    """Optimized 5-8M Parameter Symbolic Engine"""
    def __init__(self, device='cuda'):
        super(OptimizedSymbolicEngine58M, self).__init__()
        self.device = device
        
        # Enhanced but not excessive rule weights
        self.rule_weights = nn.Parameter(torch.tensor([
            12.0, 10.5, 9.5, 8.5, 7.5, 6.5, 5.5, 4.5
        ], device=device))
        
        # Optimized thresholds
        self.performance_threshold = nn.Parameter(torch.tensor(0.82, device=device))
        self.hint_threshold = nn.Parameter(torch.tensor(0.18, device=device))
        self.skill_threshold = nn.Parameter(torch.tensor(0.78, device=device))
        self.time_optimal = nn.Parameter(torch.tensor(0.62, device=device))
        
        # OPTIMIZED rule combiner - Reduced size but still powerful
        self.rule_combiner = nn.Sequential(
            nn.Linear(9, 1024),  # 8 rules + 1 GNN - Reduced from 1536
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.4),
            
            nn.Linear(1024, 512),  # Reduced from 768
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.35),
            
            nn.Linear(512, 256),  # Reduced from 384
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.3),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(0.2),
            
            nn.Linear(128, 64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        ).to(device)
        
    def safe_extract_features(self, edge_features, gnn_probs):
        batch_size = gnn_probs.shape[0]
        
        try:
            hint_usage = torch.clamp(
                edge_features[:, 6] if edge_features.shape[1] > 6 
                else (1 - gnn_probs) * 0.15, 0, 1
            ).to(self.device)
            
            time_ratio = torch.clamp(
                edge_features[:, 5] if edge_features.shape[1] > 5 
                else 0.5 + (gnn_probs - 0.5) * 0.15, 0, 2
            ).to(self.device)
            
            skill_overlap = torch.clamp(
                edge_features[:, 2] if edge_features.shape[1] > 2 
                else gnn_probs * 0.25 + 0.6, 0, 1
            ).to(self.device)
            
            performance_history = torch.clamp(
                edge_features[:, 8] if edge_features.shape[1] > 8 
                else gnn_probs * 0.98 + 0.01, 0, 1
            ).to(self.device)
            
            recent_trend = torch.clamp(
                edge_features[:, 7] if edge_features.shape[1] > 7 
                else (gnn_probs - 0.5) * 1.5, -1, 1
            ).to(self.device)
            
        except Exception as e:
            hint_usage = (1 - gnn_probs) * 0.25
            time_ratio = torch.ones_like(gnn_probs) * 0.5
            skill_overlap = gnn_probs * 0.3 + 0.5
            performance_history = gnn_probs * 0.85 + 0.1
            recent_trend = (gnn_probs - 0.5) * 1.0
        
        return hint_usage, time_ratio, skill_overlap, performance_history, recent_trend
    
    def apply_optimized_rules(self, gnn_logits, edge_features):
        """Apply 8 OPTIMIZED high-impact rules"""
        batch_size = gnn_logits.shape[0]
        gnn_probs = torch.sigmoid(gnn_logits)
        
        hint_usage, time_ratio, skill_overlap, performance_history, recent_trend = self.safe_extract_features(edge_features, gnn_probs)
        
        confidence_level = torch.abs(gnn_probs - 0.5) * 2
        difficulty_level = 1.0 - performance_history
        mastery_level = skill_overlap * performance_history
        
        rules = torch.zeros(batch_size, 8, device=self.device)
        
        try:
            # Rule 1: Performance Consistency - 12x boost
            consistency = torch.sigmoid((performance_history - self.performance_threshold) * 45)
            confidence_amp = 1 + confidence_level * 1.2
            rules[:, 0] = 12.0 * consistency * confidence_amp
            
            # Rule 2: Skill Mastery Transfer - 10.5x boost
            transfer = skill_overlap * performance_history
            transfer_boost = torch.sigmoid((transfer - self.skill_threshold) * 40)
            neural_synergy = 1 + 0.6 * gnn_probs
            rules[:, 1] = 10.5 * transfer_boost * neural_synergy
            
            # Rule 3: Learning Momentum - 9.5x boost
            momentum = torch.sigmoid(recent_trend * 30)
            momentum_amp = 1 + 0.5 * performance_history
            rules[:, 2] = 9.5 * momentum * momentum_amp
            
            # Rule 4: Strategic Hint Usage - 8.5x boost
            strategic_hints = hint_usage * difficulty_level
            appropriate = torch.sigmoid(-(hint_usage - self.hint_threshold) * 60)
            rules[:, 3] = 8.5 * strategic_hints * appropriate
            
            # Rule 5: Time Appropriateness - 7.5x boost
            time_optimality = torch.exp(-25 * (time_ratio - self.time_optimal) ** 2)
            time_confidence = 1 + 0.4 * confidence_level
            rules[:, 4] = 7.5 * time_optimality * time_confidence
            
            # Rule 6: Difficulty Alignment - 6.5x boost
            zpd_alignment = torch.exp(-35 * (difficulty_level - (1 - performance_history)) ** 2)
            rules[:, 5] = 6.5 * zpd_alignment
            
            # Rule 7: Practice Quality - 5.5x boost
            practice_quality = torch.clamp(recent_trend, 0, 1) * (1 - hint_usage)
            rules[:, 6] = 5.5 * practice_quality
            
            # Rule 8: Content Familiarity - 4.5x boost
            familiarity = skill_overlap * performance_history * (1 + 0.2 * confidence_level)
            rules[:, 7] = 4.5 * familiarity
            
        except Exception as e:
            for i in range(8):
                rules[:, i] = self.rule_weights[i] * gnn_probs * (i + 1) / 8
        
        return rules
    
    def forward(self, gnn_logits, graph_data):
        try:
            if hasattr(graph_data, 'edge_attr'):
                edge_features = graph_data.edge_attr.to(self.device)
            elif hasattr(graph_data, '__getitem__') and ('sequence', 'predicts', 'problem') in graph_data:
                edge_features = graph_data['sequence', 'predicts', 'problem'].edge_attr.to(self.device)
            else:
                batch_size = gnn_logits.shape[0]
                edge_features = torch.randn(batch_size, 10, device=self.device) * 0.1
            
            rule_outputs = self.apply_optimized_rules(gnn_logits, edge_features)
            weighted_rules = rule_outputs * self.rule_weights.unsqueeze(0)
            
            gnn_probs = torch.sigmoid(gnn_logits)
            combined_input = torch.cat([gnn_probs.unsqueeze(1), weighted_rules], dim=1)
            
            adjustment = self.rule_combiner(combined_input).squeeze(1)
            
            # OPTIMIZED scaling - 12x to 25x (reduced from 20x to 50x)
            uncertainty = 1 - torch.abs(gnn_probs - 0.5) * 2
            confidence_scaling = torch.abs(gnn_probs - 0.5) * 2
            adaptive_scale = 12.0 + 10.0 * uncertainty + 6.0 * confidence_scaling
            scaled_adjustment = adjustment * adaptive_scale
            
            adjusted_logits = gnn_logits + scaled_adjustment
            
            return adjusted_logits, {
                'rule_outputs': rule_outputs,
                'adjustment': scaled_adjustment,
                'scaling_factor': adaptive_scale
            }
            
        except Exception as e:
            return gnn_logits, {'rule_outputs': torch.zeros_like(gnn_logits)}

class Optimized58MillionModel(nn.Module):
    """Optimized 5-8 Million Parameter Model"""
    def __init__(self, sequence_features, problem_features, skill_features, device='cuda'):
        super(Optimized58MillionModel, self).__init__()
        self.device = device
        
        self.gnn = OptimizedGNN58M(
            sequence_features, problem_features, skill_features,
            hidden_dim=320, num_layers=5, num_heads=20, dropout=0.16, device=device
        )
        self.symbolic_engine = OptimizedSymbolicEngine58M(device)
        
        # Balanced weights
        self.neural_weight = nn.Parameter(torch.tensor(0.32, device=device))
        self.symbolic_weight = nn.Parameter(torch.tensor(0.68, device=device))
        
        # Simplified meta combiner
        self.meta_combiner = nn.Sequential(
            nn.Linear(3, 96),  # Reduced from 128
            nn.BatchNorm1d(96),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(96, 32),  # Reduced from 64
            nn.GELU(),
            nn.Linear(32, 1),
            nn.Tanh()
        ).to(device)
        
    def forward(self, data):
        try:
            gnn_logits, gnn_info = self.gnn(data)
            symbolic_logits, symbolic_info = self.symbolic_engine(gnn_logits, data)
            
            weights_sum = torch.abs(self.neural_weight) + torch.abs(self.symbolic_weight) + 1e-8
            
            neural_contrib = (torch.abs(self.neural_weight) / weights_sum) * gnn_logits
            symbolic_contrib = (torch.abs(self.symbolic_weight) / weights_sum) * symbolic_logits
            
            gnn_probs = torch.sigmoid(gnn_logits)
            symbolic_probs = torch.sigmoid(symbolic_logits)
            confidence = torch.abs(gnn_probs - 0.5) * 2
            
            if gnn_probs.shape[0] > 1:
                meta_input = torch.stack([gnn_probs, symbolic_probs, confidence], dim=1)
                meta_weight = self.meta_combiner(meta_input).squeeze(1)
                meta_adjustment = meta_weight * (symbolic_logits - gnn_logits)
            else:
                meta_adjustment = 0.0
            
            final_logits = neural_contrib + symbolic_contrib + 0.25 * meta_adjustment
            
            return final_logits, {
                'gnn_logits': gnn_logits,
                'symbolic_logits': symbolic_logits,
                'symbolic_info': symbolic_info,
                'neural_weight': torch.abs(self.neural_weight),
                'symbolic_weight': torch.abs(self.symbolic_weight)
            }
            
        except Exception as e:
            batch_size = data['sequence'].x.shape[0] if 'sequence' in data else 1
            return torch.zeros(batch_size, device=self.device), {}

def format_time(seconds):
    """Format seconds into human readable time"""
    if seconds < 60:
        return f"{seconds:.1f}s"
    elif seconds < 3600:
        mins = seconds / 60
        return f"{mins:.1f}m"
    else:
        hours = seconds / 3600
        return f"{hours:.1f}h"

def optimized_58m_training_with_progress(model_path, device='cuda'):
    """OPTIMIZED 5-8M Training with PROGRESS BAR and EVERY EPOCH RESULTS"""
    print("🚀💎 OPTIMIZED 5-8M PARAMETER TRAINING WITH PROGRESS BAR 💎🚀")
    print("=" * 80)
    
    try:
        # Data loading
        print("📂 Loading data...")
        start_time = time.time()
        saved_data = torch.load(model_path, map_location='cpu')
        graph_data = saved_data['graph_data']
        targets = saved_data['targets']
        
        # Device transfer
        print("🔄 Transferring to GPU...")
        for key in graph_data.keys():
            if hasattr(graph_data[key], 'x') and graph_data[key].x is not None:
                graph_data[key].x = graph_data[key].x.to(device)
            if hasattr(graph_data[key], 'edge_index') and graph_data[key].edge_index is not None:
                graph_data[key].edge_index = graph_data[key].edge_index.to(device)
            if hasattr(graph_data[key], 'edge_attr') and graph_data[key].edge_attr is not None:
                graph_data[key].edge_attr = graph_data[key].edge_attr.to(device)
        
        if hasattr(graph_data, 'edge_index_dict'):
            edge_index_dict = {}
            for edge_type, edge_index in graph_data.edge_index_dict.items():
                edge_index_dict[edge_type] = edge_index.to(device)
            graph_data.edge_index_dict = edge_index_dict
        
        targets = targets.to(device)
        
        # Data split
        pos_indices = torch.where(targets == 1)[0]
        neg_indices = torch.where(targets == 0)[0]
        
        pos_perm = torch.randperm(len(pos_indices))
        neg_perm = torch.randperm(len(neg_indices))
        
        pos_train_size = int(0.8 * len(pos_indices))
        neg_train_size = int(0.8 * len(neg_indices))
        
        train_indices = torch.cat([
            pos_indices[pos_perm[:pos_train_size]],
            neg_indices[neg_perm[:neg_train_size]]
        ])
        val_indices = torch.cat([
            pos_indices[pos_perm[pos_train_size:]],
            neg_indices[neg_perm[neg_train_size:]]
        ])
        
        train_targets = targets[train_indices]
        val_targets = targets[val_indices]
        
        print(f"✅ Setup complete: {len(train_targets)} train, {len(val_targets)} val")
        print(f"   Train balance: {train_targets.float().mean():.3f}")
        print(f"   Val balance: {val_targets.float().mean():.3f}")
        
        # Create optimized model
        sequence_features = graph_data['sequence'].x.shape[1]
        problem_features = graph_data['problem'].x.shape[1]
        skill_features = graph_data['skill'].x.shape[1]
        
        optimized_model = Optimized58MillionModel(sequence_features, problem_features, skill_features, device).to(device)
        
        total_params = sum(p.numel() for p in optimized_model.parameters())
        trainable_params = sum(p.numel() for p in optimized_model.parameters() if p.requires_grad)
        
        print(f"💎 OPTIMIZED Model created:")
        print(f"   Total parameters: {total_params:,} ({total_params/1e6:.2f}M)")
        print(f"   Trainable parameters: {trainable_params:,} ({trainable_params/1e6:.2f}M)")
        print(f"   Target range: 5-8M ✅" if 5e6 <= total_params <= 8e6 else f"   ⚠️ Outside 5-8M range")
        
        # Optimized optimizer
        optimizer = torch.optim.AdamW([
            {'params': optimized_model.gnn.parameters(), 'lr': 0.0008, 'weight_decay': 8e-5},
            {'params': optimized_model.symbolic_engine.parameters(), 'lr': 0.005, 'weight_decay': 4e-5},
            {'params': optimized_model.meta_combiner.parameters(), 'lr': 0.003, 'weight_decay': 6e-5},
            {'params': [optimized_model.neural_weight, optimized_model.symbolic_weight], 'lr': 0.005}
        ])
        
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(1.15, device=device))
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=[0.0016, 0.01, 0.006, 0.01], 
            epochs=400, steps_per_epoch=1, pct_start=0.05, div_factor=4, final_div_factor=80
        )
        
        # Training setup
        best_acc = 0.0
        best_epoch = 0
        patience = 0
        max_patience = 40
        training_start_time = time.time()
        epoch_times = []
        
        print("=" * 80)
        print("🚀 STARTING OPTIMIZED TRAINING WITH DETAILED PROGRESS")
        print("=" * 80)
        print(f"{'Epoch':<6} {'Time':<8} {'Loss':<8} {'Val Acc':<8} {'Best Acc':<9} {'LR':<10} {'Status'}")
        print("-" * 80)
        
        # Training loop with progress bar
        progress_bar = tqdm(range(400), desc="🚀 Optimized Training", unit="epoch")
        
        for epoch in progress_bar:
            epoch_start_time = time.time()
            optimized_model.train()
            optimizer.zero_grad()
            
            try:
                # Forward pass
                all_logits, model_info = optimized_model(graph_data)
                train_logits = all_logits[train_indices]
                
                # Loss with enhanced smoothing
                smoothed_targets = train_targets.float() * 0.92 + 0.04
                loss = criterion(train_logits, smoothed_targets)
                
                # Backward
                loss.backward()
                torch.nn.utils.clip_grad_norm_(optimized_model.parameters(), 1.2)
                optimizer.step()
                scheduler.step()
                
                # Current learning rate
                current_lr = optimizer.param_groups[1]['lr']
                
                # Validation EVERY EPOCH
                optimized_model.eval()
                with torch.no_grad():
                    val_logits, _ = optimized_model(graph_data)
                    val_probs = torch.sigmoid(val_logits[val_indices])
                    
                    # Multi-threshold evaluation
                    best_threshold_acc = 0
                    best_threshold = 0.5
                    for threshold in [0.42, 0.45, 0.48, 0.5, 0.52, 0.55, 0.58]:
                        acc = ((val_probs > threshold) == val_targets).float().mean().item()
                        if acc > best_threshold_acc:
                            best_threshold_acc = acc
                            best_threshold = threshold
                    
                    # Track best model
                    status = ""
                    if best_threshold_acc > best_acc + 0.0008:
                        best_acc = best_threshold_acc
                        best_epoch = epoch
                        patience = 0
                        torch.save({
                            'model_state_dict': optimized_model.state_dict(),
                            'epoch': epoch,
                            'best_acc': best_acc,
                            'threshold': best_threshold,
                            'total_params': total_params
                        }, 'optimized_58m_model.pt')
                        status = "🏆 NEW BEST!"
                    else:
                        patience += 1
                        if patience <= 5:
                            status = f"📈 ({patience})"
                        elif patience <= 15:
                            status = f"⏳ ({patience})"
                        else:
                            status = f"⚠️ ({patience})"
                
                # Calculate epoch time
                epoch_time = time.time() - epoch_start_time
                epoch_times.append(epoch_time)
                avg_epoch_time = sum(epoch_times[-10:]) / len(epoch_times[-10:])
                
                # Estimate remaining time
                remaining_epochs = 400 - epoch - 1
                estimated_remaining = remaining_epochs * avg_epoch_time
                
                # Print detailed results
                print(f"{epoch+1:<6} {format_time(epoch_time):<8} {loss.item():<8.4f} "
                      f"{best_threshold_acc*100:<8.2f}% {best_acc*100:<8.2f}% "
                      f"{current_lr:<10.2e} {status}")
                
                # Update progress bar
                progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Val': f'{best_threshold_acc*100:.2f}%',
                    'Best': f'{best_acc*100:.2f}%',
                    'Patience': f'{patience}/{max_patience}',
                    'ETA': format_time(estimated_remaining)
                })
                
                # Success check
                if best_acc >= 0.85:
                    print("\n" + "🎉" * 40)
                    print("🏆 85%+ ACCURACY ACHIEVED! 🏆")
                    print("🎉" * 40)
                    break
                
                # Early stopping
                if patience >= max_patience:
                    print(f"\n⏹️ Early stopping triggered at epoch {epoch+1}")
                    break
                    
            except Exception as e:
                print(f"\n⚠️ Error at epoch {epoch+1}: {e}")
                torch.cuda.empty_cache()
                continue
            
            # Memory cleanup
            if (epoch + 1) % 12 == 0:
                torch.cuda.empty_cache()
                gc.collect()
        
        progress_bar.close()
        
        # Training summary
        total_training_time = time.time() - training_start_time
        
        print("\n" + "=" * 80)
        print("🏁 OPTIMIZED TRAINING COMPLETE!")
        print("=" * 80)
        print(f"📊 TRAINING SUMMARY:")
        print(f"   Total training time: {format_time(total_training_time)}")
        print(f"   Average time per epoch: {format_time(sum(epoch_times)/len(epoch_times))}")
        print(f"   Best epoch: {best_epoch + 1}")
        print(f"   Best accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")
        print(f"   Total epochs trained: {len(epoch_times)}")
        
        # Load best model
        try:
            checkpoint = torch.load('optimized_58m_model.pt')
            optimized_model.load_state_dict(checkpoint['model_state_dict'])
            print(f"✅ Loaded best model from epoch {checkpoint['epoch']+1}")
        except:
            print("⚠️ Using final model state")
        
        improvement = (best_acc - 0.778) * 100
        
        print(f"\n📈 IMPROVEMENT ANALYSIS:")
        print(f"   Starting baseline: 77.8%")
        print(f"   OPTIMIZED result: {best_acc*100:.2f}%")
        print(f"   Net improvement: +{improvement:.2f} percentage points")
        print(f"   Model parameters: {total_params:,} ({total_params/1e6:.2f}M)")
        print(f"   Parameter efficiency: {improvement/(total_params/1e6):.2f} pp/M params")
        
        if best_acc >= 0.85:
            print("\n🎉💎🏆 OPTIMIZED SUCCESS! 85%+ TARGET ACHIEVED! 🏆💎🎉")
        elif best_acc >= 0.82:
            print("\n🎊💎 OPTIMIZED EXCELLENCE! Very close to 85%!")
        else:
            print(f"\n💎 OPTIMIZED achievement: {best_acc*100:.2f}% with {total_params/1e6:.1f}M parameters!")
        
        return optimized_model, best_acc
        
    except Exception as e:
        print(f"❌ Training failed: {e}")
        return None, 0.0

# ===== LAUNCH OPTIMIZED TRAINING WITH PROGRESS =====
print("🚀💎 LAUNCHING OPTIMIZED 5-8M PARAMETER TRAINING 💎🚀")
torch.cuda.empty_cache()
gc.collect()

# Run training
try:
    optimized_model, optimized_accuracy = optimized_58m_training_with_progress(
        model_path='./teg_nesynet_models/teg_nesynet_temporal_v1.pt',
        device=device
    )
    
    print(f"\n💎🎯 FINAL OPTIMIZED STATUS:")
    if optimized_accuracy >= 0.85:
        print(f"🏆 OPTIMIZED SUCCESS! {optimized_accuracy*100:.2f}% ACHIEVED!")
    else:
        print(f"💎 Optimized progress: {optimized_accuracy*100:.2f}%")
        
except Exception as e:
    print(f"❌ Optimization failed: {e}")
    print("🔧 Please restart kernel and try again")


✅ PyTorch loaded successfully
✅ tqdm loaded for progress bars
✅ PyTorch Geometric loaded successfully
✅ Using device: cuda
🚀💎 LAUNCHING OPTIMIZED 5-8M PARAMETER TRAINING 💎🚀
🚀💎 OPTIMIZED 5-8M PARAMETER TRAINING WITH PROGRESS BAR 💎🚀
📂 Loading data...
🔄 Transferring to GPU...
✅ Setup complete: 19056 train, 4764 val
   Train balance: 0.543
   Val balance: 0.543
💎 OPTIMIZED Model created:
   Total parameters: 10,380,433 (10.38M)
   Trainable parameters: 10,380,433 (10.38M)
   ⚠️ Outside 5-8M range
🚀 STARTING OPTIMIZED TRAINING WITH DETAILED PROGRESS
Epoch  Time     Loss     Val Acc  Best Acc  LR         Status
--------------------------------------------------------------------------------


🚀 Optimized Training:   0%|          | 1/400 [00:23<2:37:03, 23.62s/epoch, Loss=1.4700, Val=45.74%, Best=45.74%, Patience=0/40, ETA=2.6h]

1      23.6s    1.4700   45.74   % 45.74   % 2.55e-03   🏆 NEW BEST!


🚀 Optimized Training:   0%|          | 2/400 [00:47<2:39:34, 24.06s/epoch, Loss=1.8093, Val=54.26%, Best=54.26%, Patience=0/40, ETA=2.7h]

2      24.4s    1.8093   54.26   % 54.26   % 2.70e-03   🏆 NEW BEST!


🚀 Optimized Training:   1%|          | 3/400 [01:13<2:43:09, 24.66s/epoch, Loss=1.6239, Val=54.26%, Best=54.26%, Patience=1/40, ETA=2.7h]

3      25.4s    1.6239   54.26   % 54.26   % 2.95e-03   📈 (1)


🚀 Optimized Training:   1%|          | 4/400 [01:39<2:45:31, 25.08s/epoch, Loss=1.0538, Val=49.52%, Best=54.26%, Patience=2/40, ETA=2.7h]

4      25.7s    1.0538   49.52   % 54.26   % 3.29e-03   📈 (2)


🚀 Optimized Training:   1%|▏         | 5/400 [02:04<2:47:04, 25.38s/epoch, Loss=1.2799, Val=45.74%, Best=54.26%, Patience=3/40, ETA=2.7h]

5      25.9s    1.2799   45.74   % 54.26   % 3.71e-03   📈 (3)


🚀 Optimized Training:   2%|▏         | 6/400 [02:31<2:48:11, 25.61s/epoch, Loss=1.1168, Val=53.40%, Best=54.26%, Patience=4/40, ETA=2.8h]

6      26.1s    1.1168   53.40   % 54.26   % 4.20e-03   📈 (4)


🚀 Optimized Training:   2%|▏         | 7/400 [02:56<2:47:59, 25.65s/epoch, Loss=0.8587, Val=51.66%, Best=54.26%, Patience=5/40, ETA=2.8h]

7      25.7s    0.8587   51.66   % 54.26   % 4.74e-03   📈 (5)


🚀 Optimized Training:   2%|▏         | 8/400 [03:22<2:47:45, 25.68s/epoch, Loss=0.8430, Val=48.17%, Best=54.26%, Patience=6/40, ETA=2.8h]

8      25.7s    0.8430   48.17   % 54.26   % 5.33e-03   ⏳ (6)


🚀 Optimized Training:   2%|▏         | 9/400 [03:48<2:48:11, 25.81s/epoch, Loss=0.7998, Val=53.95%, Best=54.26%, Patience=7/40, ETA=2.8h]

9      26.1s    0.7998   53.95   % 54.26   % 5.94e-03   ⏳ (7)


🚀 Optimized Training:   2%|▎         | 10/400 [04:14<2:48:03, 25.86s/epoch, Loss=0.7838, Val=48.55%, Best=54.26%, Patience=8/40, ETA=2.8h]

10     26.0s    0.7838   48.55   % 54.26   % 6.56e-03   ⏳ (8)


🚀 Optimized Training:   3%|▎         | 11/400 [04:40<2:47:15, 25.80s/epoch, Loss=0.7677, Val=50.06%, Best=54.26%, Patience=9/40, ETA=2.8h]

11     25.7s    0.7677   50.06   % 54.26   % 7.17e-03   ⏳ (9)


🚀 Optimized Training:   3%|▎         | 11/400 [05:05<2:47:15, 25.80s/epoch, Loss=0.7345, Val=51.87%, Best=54.26%, Patience=10/40, ETA=2.8h]

12     25.5s    0.7345   51.87   % 54.26   % 7.76e-03   ⏳ (10)


🚀 Optimized Training:   3%|▎         | 13/400 [05:24<2:32:17, 23.61s/epoch, Loss=0.7132, Val=54.26%, Best=54.26%, Patience=11/40, ETA=2.7h]

13     18.2s    0.7132   54.26   % 54.26   % 8.30e-03   ⏳ (11)


🚀 Optimized Training:   4%|▎         | 14/400 [05:45<2:25:53, 22.68s/epoch, Loss=0.7033, Val=54.35%, Best=54.35%, Patience=0/40, ETA=2.6h] 

14     20.5s    0.7033   54.35   % 54.35   % 8.79e-03   🏆 NEW BEST!


🚀 Optimized Training:   4%|▍         | 15/400 [06:05<2:19:48, 21.79s/epoch, Loss=0.6911, Val=54.26%, Best=54.35%, Patience=1/40, ETA=2.6h]

15     19.7s    0.6911   54.26   % 54.35   % 9.21e-03   📈 (1)


🚀 Optimized Training:   4%|▍         | 16/400 [06:24<2:15:20, 21.15s/epoch, Loss=0.6942, Val=54.26%, Best=54.35%, Patience=2/40, ETA=2.5h]

16     19.7s    0.6942   54.26   % 54.35   % 9.55e-03   📈 (2)


🚀 Optimized Training:   4%|▍         | 17/400 [06:44<2:11:54, 20.66s/epoch, Loss=0.6832, Val=54.28%, Best=54.35%, Patience=3/40, ETA=2.4h]

17     19.5s    0.6832   54.28   % 54.35   % 9.80e-03   📈 (3)


🚀 Optimized Training:   4%|▍         | 18/400 [07:03<2:09:05, 20.28s/epoch, Loss=0.6808, Val=54.26%, Best=54.35%, Patience=4/40, ETA=2.3h]

18     19.4s    0.6808   54.26   % 54.35   % 9.95e-03   📈 (4)


🚀 Optimized Training:   5%|▍         | 19/400 [07:23<2:08:17, 20.20s/epoch, Loss=0.6729, Val=54.30%, Best=54.35%, Patience=5/40, ETA=2.3h]

19     20.0s    0.6729   54.30   % 54.35   % 1.00e-02   📈 (5)


🚀 Optimized Training:   5%|▌         | 20/400 [07:43<2:07:17, 20.10s/epoch, Loss=0.6683, Val=54.37%, Best=54.35%, Patience=6/40, ETA=2.2h]

20     19.8s    0.6683   54.37   % 54.35   % 1.00e-02   ⏳ (6)


🚀 Optimized Training:   5%|▌         | 21/400 [08:03<2:06:27, 20.02s/epoch, Loss=0.6605, Val=55.79%, Best=55.79%, Patience=0/40, ETA=2.1h]

21     19.8s    0.6605   55.79   % 55.79   % 1.00e-02   🏆 NEW BEST!


🚀 Optimized Training:   6%|▌         | 22/400 [08:22<2:05:21, 19.90s/epoch, Loss=0.6538, Val=55.67%, Best=55.79%, Patience=1/40, ETA=2.1h]

22     19.6s    0.6538   55.67   % 55.79   % 1.00e-02   📈 (1)


🚀 Optimized Training:   6%|▌         | 23/400 [08:43<2:05:35, 19.99s/epoch, Loss=0.6479, Val=55.81%, Best=55.79%, Patience=2/40, ETA=2.1h]

23     20.2s    0.6479   55.81   % 55.79   % 1.00e-02   📈 (2)


🚀 Optimized Training:   6%|▌         | 23/400 [09:03<2:05:35, 19.99s/epoch, Loss=0.6442, Val=57.12%, Best=57.12%, Patience=0/40, ETA=2.1h]

24     20.1s    0.6442   57.12   % 57.12   % 1.00e-02   🏆 NEW BEST!


🚀 Optimized Training:   6%|▋         | 25/400 [09:22<2:03:38, 19.78s/epoch, Loss=0.6377, Val=57.37%, Best=57.37%, Patience=0/40, ETA=2.0h]

25     18.6s    0.6377   57.37   % 57.37   % 9.99e-03   🏆 NEW BEST!


🚀 Optimized Training:   6%|▋         | 26/400 [09:42<2:03:43, 19.85s/epoch, Loss=0.6357, Val=57.70%, Best=57.70%, Patience=0/40, ETA=2.0h]

26     20.0s    0.6357   57.70   % 57.70   % 9.99e-03   🏆 NEW BEST!


🚀 Optimized Training:   7%|▋         | 27/400 [10:03<2:05:28, 20.18s/epoch, Loss=0.6347, Val=57.79%, Best=57.79%, Patience=0/40, ETA=2.1h]

27     21.0s    0.6347   57.79   % 57.79   % 9.99e-03   🏆 NEW BEST!


🚀 Optimized Training:   7%|▋         | 28/400 [10:24<2:05:42, 20.28s/epoch, Loss=0.6262, Val=58.73%, Best=58.73%, Patience=0/40, ETA=2.1h]

28     20.5s    0.6262   58.73   % 58.73   % 9.99e-03   🏆 NEW BEST!


🚀 Optimized Training:   7%|▋         | 29/400 [10:44<2:05:09, 20.24s/epoch, Loss=0.6289, Val=58.75%, Best=58.73%, Patience=1/40, ETA=2.1h]

29     20.2s    0.6289   58.75   % 58.73   % 9.98e-03   📈 (1)


🚀 Optimized Training:   8%|▊         | 30/400 [11:04<2:03:52, 20.09s/epoch, Loss=0.6156, Val=59.66%, Best=59.66%, Patience=0/40, ETA=2.1h]

30     19.7s    0.6156   59.66   % 59.66   % 9.98e-03   🏆 NEW BEST!


🚀 Optimized Training:   8%|▊         | 31/400 [11:23<2:02:28, 19.92s/epoch, Loss=0.6156, Val=59.24%, Best=59.66%, Patience=1/40, ETA=2.0h]

31     19.5s    0.6156   59.24   % 59.66   % 9.98e-03   📈 (1)


🚀 Optimized Training:   8%|▊         | 32/400 [11:43<2:02:17, 19.94s/epoch, Loss=0.6170, Val=60.31%, Best=60.31%, Patience=0/40, ETA=2.0h]

32     20.0s    0.6170   60.31   % 60.31   % 9.97e-03   🏆 NEW BEST!


🚀 Optimized Training:   8%|▊         | 33/400 [12:03<2:01:54, 19.93s/epoch, Loss=0.6304, Val=61.25%, Best=61.25%, Patience=0/40, ETA=2.0h]

33     19.9s    0.6304   61.25   % 61.25   % 9.97e-03   🏆 NEW BEST!


🚀 Optimized Training:   8%|▊         | 34/400 [12:23<2:02:14, 20.04s/epoch, Loss=0.6189, Val=62.87%, Best=62.87%, Patience=0/40, ETA=2.0h]

34     20.3s    0.6189   62.87   % 62.87   % 9.96e-03   🏆 NEW BEST!


🚀 Optimized Training:   9%|▉         | 35/400 [12:43<2:01:50, 20.03s/epoch, Loss=0.6182, Val=63.79%, Best=63.79%, Patience=0/40, ETA=2.0h]

35     20.0s    0.6182   63.79   % 63.79   % 9.96e-03   🏆 NEW BEST!


🚀 Optimized Training:   9%|▉         | 36/400 [13:04<2:02:03, 20.12s/epoch, Loss=0.6096, Val=63.20%, Best=63.79%, Patience=1/40, ETA=2.0h]

36     19.7s    0.6096   63.20   % 63.79   % 9.95e-03   📈 (1)


🚀 Optimized Training:   9%|▉         | 37/400 [13:22<1:59:09, 19.70s/epoch, Loss=0.6030, Val=62.47%, Best=63.79%, Patience=2/40, ETA=2.0h]

37     18.7s    0.6030   62.47   % 63.79   % 9.94e-03   📈 (2)


🚀 Optimized Training:  10%|▉         | 38/400 [13:41<1:57:24, 19.46s/epoch, Loss=0.5920, Val=63.01%, Best=63.79%, Patience=3/40, ETA=2.0h]

38     18.9s    0.5920   63.01   % 63.79   % 9.94e-03   📈 (3)


🚀 Optimized Training:  10%|▉         | 39/400 [14:00<1:56:39, 19.39s/epoch, Loss=0.5953, Val=62.47%, Best=63.79%, Patience=4/40, ETA=2.0h]

39     19.2s    0.5953   62.47   % 63.79   % 9.93e-03   📈 (4)


🚀 Optimized Training:  10%|█         | 40/400 [14:20<1:55:56, 19.32s/epoch, Loss=0.5851, Val=64.78%, Best=64.78%, Patience=0/40, ETA=2.0h]

40     19.2s    0.5851   64.78   % 64.78   % 9.93e-03   🏆 NEW BEST!


🚀 Optimized Training:  10%|█         | 41/400 [14:39<1:55:31, 19.31s/epoch, Loss=0.5813, Val=66.23%, Best=66.23%, Patience=0/40, ETA=1.9h]

41     19.3s    0.5813   66.23   % 66.23   % 9.92e-03   🏆 NEW BEST!


🚀 Optimized Training:  10%|█         | 42/400 [14:58<1:54:57, 19.27s/epoch, Loss=0.5794, Val=66.67%, Best=66.67%, Patience=0/40, ETA=1.9h]

42     19.2s    0.5794   66.67   % 66.67   % 9.91e-03   🏆 NEW BEST!


🚀 Optimized Training:  11%|█         | 43/400 [15:17<1:54:38, 19.27s/epoch, Loss=0.5773, Val=67.49%, Best=67.49%, Patience=0/40, ETA=1.9h]

43     19.3s    0.5773   67.49   % 67.49   % 9.90e-03   🏆 NEW BEST!


🚀 Optimized Training:  11%|█         | 44/400 [15:36<1:54:01, 19.22s/epoch, Loss=0.5682, Val=68.09%, Best=68.09%, Patience=0/40, ETA=1.9h]

44     19.1s    0.5682   68.09   % 68.09   % 9.89e-03   🏆 NEW BEST!


🚀 Optimized Training:  11%|█▏        | 45/400 [15:56<1:53:39, 19.21s/epoch, Loss=0.5667, Val=69.82%, Best=69.82%, Patience=0/40, ETA=1.9h]

45     19.2s    0.5667   69.82   % 69.82   % 9.89e-03   🏆 NEW BEST!


🚀 Optimized Training:  12%|█▏        | 46/400 [16:15<1:53:13, 19.19s/epoch, Loss=0.5639, Val=70.00%, Best=70.00%, Patience=0/40, ETA=1.9h]

46     19.1s    0.5639   70.00   % 70.00   % 9.88e-03   🏆 NEW BEST!


🚀 Optimized Training:  12%|█▏        | 47/400 [16:34<1:52:39, 19.15s/epoch, Loss=0.5562, Val=69.27%, Best=70.00%, Patience=1/40, ETA=1.9h]

47     19.0s    0.5562   69.27   % 70.00   % 9.87e-03   📈 (1)


🚀 Optimized Training:  12%|█▏        | 47/400 [16:53<1:52:39, 19.15s/epoch, Loss=0.5594, Val=71.47%, Best=71.47%, Patience=0/40, ETA=1.9h]

48     19.1s    0.5594   71.47   % 71.47   % 9.86e-03   🏆 NEW BEST!


🚀 Optimized Training:  12%|█▏        | 49/400 [17:14<1:54:01, 19.49s/epoch, Loss=0.5622, Val=71.28%, Best=71.47%, Patience=1/40, ETA=1.9h]

49     19.7s    0.5622   71.28   % 71.47   % 9.85e-03   📈 (1)


🚀 Optimized Training:  12%|█▎        | 50/400 [17:32<1:51:35, 19.13s/epoch, Loss=0.5550, Val=72.61%, Best=72.61%, Patience=0/40, ETA=1.9h]

50     18.3s    0.5550   72.61   % 72.61   % 9.84e-03   🏆 NEW BEST!


🚀 Optimized Training:  13%|█▎        | 51/400 [17:52<1:53:20, 19.49s/epoch, Loss=0.5492, Val=73.07%, Best=73.07%, Patience=0/40, ETA=1.9h]

51     20.3s    0.5492   73.07   % 73.07   % 9.83e-03   🏆 NEW BEST!


🚀 Optimized Training:  13%|█▎        | 52/400 [18:12<1:54:07, 19.68s/epoch, Loss=0.5466, Val=72.65%, Best=73.07%, Patience=1/40, ETA=1.9h]

52     20.1s    0.5466   72.65   % 73.07   % 9.82e-03   📈 (1)


🚀 Optimized Training:  13%|█▎        | 53/400 [18:32<1:54:37, 19.82s/epoch, Loss=0.5480, Val=72.94%, Best=73.07%, Patience=2/40, ETA=1.9h]

53     20.1s    0.5480   72.94   % 73.07   % 9.80e-03   📈 (2)


🚀 Optimized Training:  14%|█▎        | 54/400 [18:53<1:54:48, 19.91s/epoch, Loss=0.5432, Val=72.67%, Best=73.07%, Patience=3/40, ETA=1.9h]

54     20.1s    0.5432   72.67   % 73.07   % 9.79e-03   📈 (3)


🚀 Optimized Training:  14%|█▍        | 55/400 [19:13<1:54:45, 19.96s/epoch, Loss=0.5355, Val=72.29%, Best=73.07%, Patience=4/40, ETA=1.9h]

55     20.1s    0.5355   72.29   % 73.07   % 9.78e-03   📈 (4)


🚀 Optimized Training:  14%|█▍        | 56/400 [19:33<1:54:50, 20.03s/epoch, Loss=0.5355, Val=73.59%, Best=73.59%, Patience=0/40, ETA=1.9h]

56     20.2s    0.5355   73.59   % 73.59   % 9.77e-03   🏆 NEW BEST!


🚀 Optimized Training:  14%|█▍        | 57/400 [19:53<1:54:39, 20.06s/epoch, Loss=0.5361, Val=73.13%, Best=73.59%, Patience=1/40, ETA=1.9h]

57     20.1s    0.5361   73.13   % 73.59   % 9.76e-03   📈 (1)


🚀 Optimized Training:  14%|█▍        | 58/400 [20:13<1:54:27, 20.08s/epoch, Loss=0.5336, Val=72.90%, Best=73.59%, Patience=2/40, ETA=1.9h]

58     20.1s    0.5336   72.90   % 73.59   % 9.74e-03   📈 (2)


🚀 Optimized Training:  15%|█▍        | 59/400 [20:34<1:55:12, 20.27s/epoch, Loss=0.5305, Val=74.29%, Best=74.29%, Patience=0/40, ETA=1.9h]

59     20.7s    0.5305   74.29   % 74.29   % 9.73e-03   🏆 NEW BEST!


🚀 Optimized Training:  15%|█▍        | 59/400 [20:54<1:55:12, 20.27s/epoch, Loss=0.5262, Val=74.75%, Best=74.75%, Patience=0/40, ETA=1.9h]

60     20.3s    0.5262   74.75   % 74.75   % 9.72e-03   🏆 NEW BEST!


🚀 Optimized Training:  15%|█▌        | 61/400 [21:13<1:51:14, 19.69s/epoch, Loss=0.5223, Val=74.64%, Best=74.75%, Patience=1/40, ETA=1.9h]

61     17.8s    0.5223   74.64   % 74.75   % 9.70e-03   📈 (1)


🚀 Optimized Training:  16%|█▌        | 62/400 [21:32<1:49:52, 19.50s/epoch, Loss=0.5204, Val=74.90%, Best=74.90%, Patience=0/40, ETA=1.9h]

62     19.1s    0.5204   74.90   % 74.90   % 9.69e-03   🏆 NEW BEST!


🚀 Optimized Training:  16%|█▌        | 63/400 [21:51<1:48:55, 19.39s/epoch, Loss=0.5171, Val=75.21%, Best=75.21%, Patience=0/40, ETA=1.9h]

63     19.1s    0.5171   75.21   % 75.21   % 9.67e-03   🏆 NEW BEST!


🚀 Optimized Training:  16%|█▌        | 64/400 [22:10<1:48:01, 19.29s/epoch, Loss=0.5210, Val=75.25%, Best=75.21%, Patience=1/40, ETA=1.8h]

64     19.0s    0.5210   75.25   % 75.21   % 9.66e-03   📈 (1)


🚀 Optimized Training:  16%|█▋        | 65/400 [22:29<1:47:30, 19.25s/epoch, Loss=0.5145, Val=75.82%, Best=75.82%, Patience=0/40, ETA=1.8h]

65     19.2s    0.5145   75.82   % 75.82   % 9.64e-03   🏆 NEW BEST!


🚀 Optimized Training:  16%|█▋        | 66/400 [22:48<1:46:44, 19.18s/epoch, Loss=0.5115, Val=75.88%, Best=75.82%, Patience=1/40, ETA=1.8h]

66     19.0s    0.5115   75.88   % 75.82   % 9.63e-03   📈 (1)


🚀 Optimized Training:  17%|█▋        | 67/400 [23:07<1:46:16, 19.15s/epoch, Loss=0.5082, Val=75.92%, Best=75.92%, Patience=0/40, ETA=1.8h]

67     19.1s    0.5082   75.92   % 75.92   % 9.61e-03   🏆 NEW BEST!


🚀 Optimized Training:  17%|█▋        | 68/400 [23:26<1:45:59, 19.15s/epoch, Loss=0.5083, Val=76.26%, Best=76.26%, Patience=0/40, ETA=1.8h]

68     19.2s    0.5083   76.26   % 76.26   % 9.60e-03   🏆 NEW BEST!


🚀 Optimized Training:  17%|█▋        | 69/400 [23:45<1:45:23, 19.11s/epoch, Loss=0.5068, Val=76.26%, Best=76.26%, Patience=1/40, ETA=1.8h]

69     19.0s    0.5068   76.26   % 76.26   % 9.58e-03   📈 (1)


🚀 Optimized Training:  18%|█▊        | 70/400 [24:04<1:44:53, 19.07s/epoch, Loss=0.5127, Val=76.32%, Best=76.26%, Patience=2/40, ETA=1.7h]

70     19.0s    0.5127   76.32   % 76.26   % 9.56e-03   📈 (2)


🚀 Optimized Training:  18%|█▊        | 71/400 [24:23<1:44:35, 19.07s/epoch, Loss=0.5070, Val=76.39%, Best=76.39%, Patience=0/40, ETA=1.7h]

71     19.1s    0.5070   76.39   % 76.39   % 9.55e-03   🏆 NEW BEST!


🚀 Optimized Training:  18%|█▊        | 71/400 [24:42<1:44:35, 19.07s/epoch, Loss=0.5065, Val=76.53%, Best=76.53%, Patience=0/40, ETA=1.7h]

72     19.1s    0.5065   76.53   % 76.53   % 9.53e-03   🏆 NEW BEST!


🚀 Optimized Training:  18%|█▊        | 73/400 [25:01<1:42:48, 18.87s/epoch, Loss=0.5043, Val=76.24%, Best=76.53%, Patience=1/40, ETA=1.7h]

73     17.9s    0.5043   76.24   % 76.53   % 9.51e-03   📈 (1)


🚀 Optimized Training:  18%|█▊        | 74/400 [25:19<1:41:24, 18.66s/epoch, Loss=0.5060, Val=76.70%, Best=76.70%, Patience=0/40, ETA=1.7h]

74     18.2s    0.5060   76.70   % 76.70   % 9.49e-03   🏆 NEW BEST!


🚀 Optimized Training:  19%|█▉        | 75/400 [25:39<1:42:13, 18.87s/epoch, Loss=0.5017, Val=76.32%, Best=76.70%, Patience=1/40, ETA=1.7h]

75     19.4s    0.5017   76.32   % 76.70   % 9.48e-03   📈 (1)


🚀 Optimized Training:  19%|█▉        | 76/400 [25:58<1:42:42, 19.02s/epoch, Loss=0.5034, Val=76.24%, Best=76.70%, Patience=2/40, ETA=1.7h]

76     19.4s    0.5034   76.24   % 76.70   % 9.46e-03   📈 (2)


🚀 Optimized Training:  19%|█▉        | 77/400 [26:18<1:43:19, 19.19s/epoch, Loss=0.4968, Val=76.34%, Best=76.70%, Patience=3/40, ETA=1.7h]

77     19.6s    0.4968   76.34   % 76.70   % 9.44e-03   📈 (3)


🚀 Optimized Training:  20%|█▉        | 78/400 [26:37<1:43:26, 19.28s/epoch, Loss=0.4971, Val=76.78%, Best=76.78%, Patience=0/40, ETA=1.7h]

78     19.5s    0.4971   76.78   % 76.78   % 9.42e-03   🏆 NEW BEST!


🚀 Optimized Training:  20%|█▉        | 79/400 [26:57<1:43:42, 19.39s/epoch, Loss=0.4951, Val=76.91%, Best=76.91%, Patience=0/40, ETA=1.7h]

79     19.6s    0.4951   76.91   % 76.91   % 9.40e-03   🏆 NEW BEST!


🚀 Optimized Training:  20%|██        | 80/400 [27:16<1:43:36, 19.43s/epoch, Loss=0.4965, Val=77.04%, Best=77.04%, Patience=0/40, ETA=1.7h]

80     19.5s    0.4965   77.04   % 77.04   % 9.38e-03   🏆 NEW BEST!


🚀 Optimized Training:  20%|██        | 81/400 [27:35<1:43:10, 19.41s/epoch, Loss=0.4920, Val=76.97%, Best=77.04%, Patience=1/40, ETA=1.7h]

81     19.4s    0.4920   76.97   % 77.04   % 9.36e-03   📈 (1)


🚀 Optimized Training:  20%|██        | 82/400 [27:55<1:43:12, 19.47s/epoch, Loss=0.4908, Val=77.18%, Best=77.18%, Patience=0/40, ETA=1.7h]

82     19.6s    0.4908   77.18   % 77.18   % 9.34e-03   🏆 NEW BEST!


🚀 Optimized Training:  21%|██        | 83/400 [28:15<1:42:58, 19.49s/epoch, Loss=0.4903, Val=77.08%, Best=77.18%, Patience=1/40, ETA=1.7h]

83     19.5s    0.4903   77.08   % 77.18   % 9.32e-03   📈 (1)


🚀 Optimized Training:  21%|██        | 83/400 [28:34<1:42:58, 19.49s/epoch, Loss=0.4887, Val=77.14%, Best=77.18%, Patience=2/40, ETA=1.7h]

84     19.3s    0.4887   77.14   % 77.18   % 9.30e-03   📈 (2)


🚀 Optimized Training:  21%|██▏       | 85/400 [28:52<1:39:30, 18.95s/epoch, Loss=0.4861, Val=77.20%, Best=77.18%, Patience=3/40, ETA=1.7h]

85     17.3s    0.4861   77.20   % 77.18   % 9.28e-03   📈 (3)


🚀 Optimized Training:  22%|██▏       | 86/400 [29:11<1:38:53, 18.90s/epoch, Loss=0.4879, Val=76.97%, Best=77.18%, Patience=4/40, ETA=1.7h]

86     18.8s    0.4879   76.97   % 77.18   % 9.25e-03   📈 (4)


🚀 Optimized Training:  22%|██▏       | 87/400 [29:30<1:38:24, 18.86s/epoch, Loss=0.4843, Val=77.06%, Best=77.18%, Patience=5/40, ETA=1.7h]

87     18.8s    0.4843   77.06   % 77.18   % 9.23e-03   📈 (5)


🚀 Optimized Training:  22%|██▏       | 88/400 [29:48<1:38:11, 18.88s/epoch, Loss=0.4810, Val=77.50%, Best=77.50%, Patience=0/40, ETA=1.7h]

88     18.9s    0.4810   77.50   % 77.50   % 9.21e-03   🏆 NEW BEST!


🚀 Optimized Training:  22%|██▏       | 89/400 [30:07<1:37:49, 18.87s/epoch, Loss=0.4837, Val=77.37%, Best=77.50%, Patience=1/40, ETA=1.6h]

89     18.8s    0.4837   77.37   % 77.50   % 9.19e-03   📈 (1)


🚀 Optimized Training:  22%|██▎       | 90/400 [30:26<1:37:39, 18.90s/epoch, Loss=0.4874, Val=77.98%, Best=77.98%, Patience=0/40, ETA=1.6h]

90     19.0s    0.4874   77.98   % 77.98   % 9.17e-03   🏆 NEW BEST!


🚀 Optimized Training:  23%|██▎       | 91/400 [30:45<1:37:17, 18.89s/epoch, Loss=0.4792, Val=77.60%, Best=77.98%, Patience=1/40, ETA=1.6h]

91     18.9s    0.4792   77.60   % 77.98   % 9.14e-03   📈 (1)


🚀 Optimized Training:  23%|██▎       | 92/400 [31:04<1:36:46, 18.85s/epoch, Loss=0.4799, Val=77.62%, Best=77.98%, Patience=2/40, ETA=1.6h]

92     18.8s    0.4799   77.62   % 77.98   % 9.12e-03   📈 (2)


🚀 Optimized Training:  23%|██▎       | 93/400 [31:23<1:36:24, 18.84s/epoch, Loss=0.4828, Val=77.60%, Best=77.98%, Patience=3/40, ETA=1.6h]

93     18.8s    0.4828   77.60   % 77.98   % 9.10e-03   📈 (3)


🚀 Optimized Training:  24%|██▎       | 94/400 [31:42<1:36:04, 18.84s/epoch, Loss=0.4794, Val=77.43%, Best=77.98%, Patience=4/40, ETA=1.6h]

94     18.8s    0.4794   77.43   % 77.98   % 9.07e-03   📈 (4)


🚀 Optimized Training:  24%|██▍       | 95/400 [32:00<1:35:40, 18.82s/epoch, Loss=0.4817, Val=77.46%, Best=77.98%, Patience=5/40, ETA=1.6h]

95     18.8s    0.4817   77.46   % 77.98   % 9.05e-03   📈 (5)


🚀 Optimized Training:  24%|██▍       | 95/400 [32:19<1:35:40, 18.82s/epoch, Loss=0.4811, Val=77.20%, Best=77.98%, Patience=6/40, ETA=1.6h]

96     18.9s    0.4811   77.20   % 77.98   % 9.02e-03   ⏳ (6)


🚀 Optimized Training:  24%|██▍       | 97/400 [32:38<1:34:17, 18.67s/epoch, Loss=0.4794, Val=77.35%, Best=77.98%, Patience=7/40, ETA=1.6h]

97     17.8s    0.4794   77.35   % 77.98   % 9.00e-03   ⏳ (7)


🚀 Optimized Training:  24%|██▍       | 98/400 [32:57<1:34:20, 18.74s/epoch, Loss=0.4814, Val=77.35%, Best=77.98%, Patience=8/40, ETA=1.6h]

98     18.9s    0.4814   77.35   % 77.98   % 8.97e-03   ⏳ (8)


🚀 Optimized Training:  25%|██▍       | 99/400 [33:16<1:34:49, 18.90s/epoch, Loss=0.4757, Val=77.41%, Best=77.98%, Patience=9/40, ETA=1.6h]

99     19.3s    0.4757   77.41   % 77.98   % 8.95e-03   ⏳ (9)


🚀 Optimized Training:  25%|██▌       | 100/400 [33:35<1:34:48, 18.96s/epoch, Loss=0.4825, Val=77.56%, Best=77.98%, Patience=10/40, ETA=1.6h]

100    19.1s    0.4825   77.56   % 77.98   % 8.92e-03   ⏳ (10)


🚀 Optimized Training:  25%|██▌       | 101/400 [33:54<1:34:47, 19.02s/epoch, Loss=0.4777, Val=77.64%, Best=77.98%, Patience=11/40, ETA=1.6h]

101    19.2s    0.4777   77.64   % 77.98   % 8.90e-03   ⏳ (11)


🚀 Optimized Training:  26%|██▌       | 102/400 [34:13<1:34:36, 19.05s/epoch, Loss=0.4777, Val=77.77%, Best=77.98%, Patience=12/40, ETA=1.6h]

102    19.1s    0.4777   77.77   % 77.98   % 8.87e-03   ⏳ (12)


🚀 Optimized Training:  26%|██▌       | 103/400 [34:33<1:34:34, 19.10s/epoch, Loss=0.4768, Val=77.67%, Best=77.98%, Patience=13/40, ETA=1.6h]

103    19.2s    0.4768   77.67   % 77.98   % 8.85e-03   ⏳ (13)


🚀 Optimized Training:  26%|██▌       | 104/400 [34:52<1:34:14, 19.10s/epoch, Loss=0.4757, Val=77.83%, Best=77.98%, Patience=14/40, ETA=1.6h]

104    19.1s    0.4757   77.83   % 77.98   % 8.82e-03   ⏳ (14)


🚀 Optimized Training:  26%|██▋       | 105/400 [35:11<1:33:53, 19.10s/epoch, Loss=0.4772, Val=77.69%, Best=77.98%, Patience=15/40, ETA=1.6h]

105    19.1s    0.4772   77.69   % 77.98   % 8.79e-03   ⏳ (15)


🚀 Optimized Training:  26%|██▋       | 106/400 [35:30<1:33:43, 19.13s/epoch, Loss=0.4765, Val=77.52%, Best=77.98%, Patience=16/40, ETA=1.6h]

106    19.2s    0.4765   77.52   % 77.98   % 8.77e-03   ⚠️ (16)


🚀 Optimized Training:  27%|██▋       | 107/400 [35:49<1:33:25, 19.13s/epoch, Loss=0.4734, Val=77.58%, Best=77.98%, Patience=17/40, ETA=1.6h]

107    19.1s    0.4734   77.58   % 77.98   % 8.74e-03   ⚠️ (17)


🚀 Optimized Training:  27%|██▋       | 107/400 [36:08<1:33:25, 19.13s/epoch, Loss=0.4718, Val=77.54%, Best=77.98%, Patience=18/40, ETA=1.6h]

108    19.1s    0.4718   77.54   % 77.98   % 8.71e-03   ⚠️ (18)


🚀 Optimized Training:  27%|██▋       | 109/400 [36:26<1:30:41, 18.70s/epoch, Loss=0.4731, Val=77.77%, Best=77.98%, Patience=19/40, ETA=1.5h]

109    17.2s    0.4731   77.77   % 77.98   % 8.68e-03   ⚠️ (19)


🚀 Optimized Training:  28%|██▊       | 110/400 [36:46<1:32:13, 19.08s/epoch, Loss=0.4717, Val=77.67%, Best=77.98%, Patience=20/40, ETA=1.5h]

110    20.0s    0.4717   77.67   % 77.98   % 8.65e-03   ⚠️ (20)


🚀 Optimized Training:  28%|██▊       | 111/400 [37:06<1:32:42, 19.25s/epoch, Loss=0.4704, Val=77.67%, Best=77.98%, Patience=21/40, ETA=1.5h]

111    19.6s    0.4704   77.67   % 77.98   % 8.63e-03   ⚠️ (21)


🚀 Optimized Training:  28%|██▊       | 112/400 [37:25<1:32:49, 19.34s/epoch, Loss=0.4736, Val=77.62%, Best=77.98%, Patience=22/40, ETA=1.5h]

112    19.5s    0.4736   77.62   % 77.98   % 8.60e-03   ⚠️ (22)


🚀 Optimized Training:  28%|██▊       | 113/400 [37:45<1:32:49, 19.41s/epoch, Loss=0.4708, Val=77.71%, Best=77.98%, Patience=23/40, ETA=1.5h]

113    19.6s    0.4708   77.71   % 77.98   % 8.57e-03   ⚠️ (23)


🚀 Optimized Training:  28%|██▊       | 114/400 [38:04<1:32:49, 19.48s/epoch, Loss=0.4721, Val=77.94%, Best=77.98%, Patience=24/40, ETA=1.5h]

114    19.6s    0.4721   77.94   % 77.98   % 8.54e-03   ⚠️ (24)


🚀 Optimized Training:  29%|██▉       | 115/400 [38:24<1:32:50, 19.55s/epoch, Loss=0.4683, Val=77.81%, Best=77.98%, Patience=25/40, ETA=1.5h]

115    19.7s    0.4683   77.81   % 77.98   % 8.51e-03   ⚠️ (25)


🚀 Optimized Training:  29%|██▉       | 116/400 [38:44<1:32:33, 19.56s/epoch, Loss=0.4704, Val=77.62%, Best=77.98%, Patience=26/40, ETA=1.5h]

116    19.6s    0.4704   77.62   % 77.98   % 8.48e-03   ⚠️ (26)


🚀 Optimized Training:  29%|██▉       | 117/400 [39:03<1:32:12, 19.55s/epoch, Loss=0.4679, Val=77.90%, Best=77.98%, Patience=27/40, ETA=1.5h]

117    19.5s    0.4679   77.90   % 77.98   % 8.45e-03   ⚠️ (27)


🚀 Optimized Training:  30%|██▉       | 118/400 [39:23<1:31:51, 19.54s/epoch, Loss=0.4669, Val=77.75%, Best=77.98%, Patience=28/40, ETA=1.5h]

118    19.5s    0.4669   77.75   % 77.98   % 8.42e-03   ⚠️ (28)


🚀 Optimized Training:  30%|██▉       | 119/400 [39:42<1:31:34, 19.55s/epoch, Loss=0.4682, Val=77.64%, Best=77.98%, Patience=29/40, ETA=1.5h]

119    19.6s    0.4682   77.64   % 77.98   % 8.39e-03   ⚠️ (29)


🚀 Optimized Training:  30%|██▉       | 119/400 [40:02<1:31:34, 19.55s/epoch, Loss=0.4663, Val=78.00%, Best=77.98%, Patience=30/40, ETA=1.5h]

120    19.6s    0.4663   78.00   % 77.98   % 8.36e-03   ⚠️ (30)


🚀 Optimized Training:  30%|███       | 121/400 [40:20<1:28:46, 19.09s/epoch, Loss=0.4698, Val=77.71%, Best=77.98%, Patience=31/40, ETA=1.5h]

121    17.4s    0.4698   77.71   % 77.98   % 8.33e-03   ⚠️ (31)


🚀 Optimized Training:  30%|███       | 122/400 [40:41<1:30:58, 19.63s/epoch, Loss=0.4650, Val=77.77%, Best=77.98%, Patience=32/40, ETA=1.5h]

122    20.9s    0.4650   77.77   % 77.98   % 8.30e-03   ⚠️ (32)


🚀 Optimized Training:  31%|███       | 123/400 [41:02<1:32:17, 19.99s/epoch, Loss=0.4650, Val=77.83%, Best=77.98%, Patience=33/40, ETA=1.5h]

123    20.8s    0.4650   77.83   % 77.98   % 8.27e-03   ⚠️ (33)


🚀 Optimized Training:  31%|███       | 124/400 [41:23<1:33:07, 20.24s/epoch, Loss=0.4656, Val=77.94%, Best=77.98%, Patience=34/40, ETA=1.5h]

124    20.8s    0.4656   77.94   % 77.98   % 8.24e-03   ⚠️ (34)


🚀 Optimized Training:  31%|███▏      | 125/400 [41:44<1:33:45, 20.46s/epoch, Loss=0.4653, Val=78.06%, Best=78.06%, Patience=0/40, ETA=1.5h] 

125    21.0s    0.4653   78.06   % 78.06   % 8.21e-03   🏆 NEW BEST!


🚀 Optimized Training:  32%|███▏      | 126/400 [42:05<1:34:04, 20.60s/epoch, Loss=0.4638, Val=77.94%, Best=78.06%, Patience=1/40, ETA=1.5h]

126    20.9s    0.4638   77.94   % 78.06   % 8.17e-03   📈 (1)


🚀 Optimized Training:  32%|███▏      | 127/400 [42:25<1:34:00, 20.66s/epoch, Loss=0.4638, Val=78.09%, Best=78.06%, Patience=2/40, ETA=1.5h]

127    20.8s    0.4638   78.09   % 78.06   % 8.14e-03   📈 (2)


🚀 Optimized Training:  32%|███▏      | 128/400 [42:46<1:34:01, 20.74s/epoch, Loss=0.4656, Val=77.83%, Best=78.06%, Patience=3/40, ETA=1.5h]

128    20.9s    0.4656   77.83   % 78.06   % 8.11e-03   📈 (3)


🚀 Optimized Training:  32%|███▏      | 129/400 [43:07<1:34:00, 20.81s/epoch, Loss=0.4625, Val=77.60%, Best=78.06%, Patience=4/40, ETA=1.5h]

129    21.0s    0.4625   77.60   % 78.06   % 8.08e-03   📈 (4)


🚀 Optimized Training:  32%|███▎      | 130/400 [43:28<1:33:53, 20.86s/epoch, Loss=0.4633, Val=77.60%, Best=78.06%, Patience=5/40, ETA=1.5h]

130    21.0s    0.4633   77.60   % 78.06   % 8.04e-03   📈 (5)


🚀 Optimized Training:  33%|███▎      | 131/400 [43:49<1:33:28, 20.85s/epoch, Loss=0.4633, Val=77.75%, Best=78.06%, Patience=6/40, ETA=1.6h]

131    20.8s    0.4633   77.75   % 78.06   % 8.01e-03   ⏳ (6)


🚀 Optimized Training:  33%|███▎      | 131/400 [44:10<1:33:28, 20.85s/epoch, Loss=0.4643, Val=77.73%, Best=78.06%, Patience=7/40, ETA=1.6h]

132    20.8s    0.4643   77.73   % 78.06   % 7.98e-03   ⏳ (7)


🚀 Optimized Training:  33%|███▎      | 133/400 [44:28<1:28:10, 19.81s/epoch, Loss=0.4624, Val=77.73%, Best=78.06%, Patience=8/40, ETA=1.5h]

133    16.9s    0.4624   77.73   % 78.06   % 7.95e-03   ⏳ (8)


🚀 Optimized Training:  34%|███▎      | 134/400 [44:47<1:27:10, 19.66s/epoch, Loss=0.4615, Val=77.81%, Best=78.06%, Patience=9/40, ETA=1.5h]

134    19.3s    0.4615   77.81   % 78.06   % 7.91e-03   ⏳ (9)


🚀 Optimized Training:  34%|███▍      | 135/400 [45:06<1:26:14, 19.53s/epoch, Loss=0.4620, Val=77.83%, Best=78.06%, Patience=10/40, ETA=1.5h]

135    19.2s    0.4620   77.83   % 78.06   % 7.88e-03   ⏳ (10)


🚀 Optimized Training:  34%|███▍      | 136/400 [45:25<1:25:25, 19.42s/epoch, Loss=0.4630, Val=77.96%, Best=78.06%, Patience=11/40, ETA=1.5h]

136    19.2s    0.4630   77.96   % 78.06   % 7.84e-03   ⏳ (11)


🚀 Optimized Training:  34%|███▍      | 137/400 [45:45<1:25:10, 19.43s/epoch, Loss=0.4599, Val=78.04%, Best=78.06%, Patience=12/40, ETA=1.5h]

137    19.5s    0.4599   78.04   % 78.06   % 7.81e-03   ⏳ (12)


🚀 Optimized Training:  34%|███▍      | 138/400 [46:04<1:25:02, 19.47s/epoch, Loss=0.4614, Val=77.96%, Best=78.06%, Patience=13/40, ETA=1.4h]

138    19.6s    0.4614   77.96   % 78.06   % 7.78e-03   ⏳ (13)


🚀 Optimized Training:  35%|███▍      | 139/400 [46:24<1:24:28, 19.42s/epoch, Loss=0.4632, Val=77.94%, Best=78.06%, Patience=14/40, ETA=1.4h]

139    19.3s    0.4632   77.94   % 78.06   % 7.74e-03   ⏳ (14)


🚀 Optimized Training:  35%|███▌      | 140/400 [46:43<1:23:58, 19.38s/epoch, Loss=0.4606, Val=77.94%, Best=78.06%, Patience=15/40, ETA=1.4h]

140    19.3s    0.4606   77.94   % 78.06   % 7.71e-03   ⏳ (15)


🚀 Optimized Training:  35%|███▌      | 141/400 [47:02<1:23:41, 19.39s/epoch, Loss=0.4595, Val=77.92%, Best=78.06%, Patience=16/40, ETA=1.4h]

141    19.4s    0.4595   77.92   % 78.06   % 7.67e-03   ⚠️ (16)


🚀 Optimized Training:  36%|███▌      | 142/400 [47:22<1:23:29, 19.42s/epoch, Loss=0.4607, Val=77.88%, Best=78.06%, Patience=17/40, ETA=1.4h]

142    19.5s    0.4607   77.88   % 78.06   % 7.64e-03   ⚠️ (17)


🚀 Optimized Training:  36%|███▌      | 143/400 [47:41<1:22:53, 19.35s/epoch, Loss=0.4608, Val=77.81%, Best=78.06%, Patience=18/40, ETA=1.4h]

143    19.2s    0.4608   77.81   % 78.06   % 7.60e-03   ⚠️ (18)


🚀 Optimized Training:  36%|███▌      | 143/400 [48:00<1:22:53, 19.35s/epoch, Loss=0.4604, Val=78.17%, Best=78.17%, Patience=0/40, ETA=1.4h] 

144    19.4s    0.4604   78.17   % 78.17   % 7.57e-03   🏆 NEW BEST!


🚀 Optimized Training:  36%|███▋      | 145/400 [48:18<1:20:09, 18.86s/epoch, Loss=0.4578, Val=78.27%, Best=78.27%, Patience=0/40, ETA=1.4h]

145    17.2s    0.4578   78.27   % 78.27   % 7.53e-03   🏆 NEW BEST!


🚀 Optimized Training:  36%|███▋      | 146/400 [48:38<1:21:13, 19.19s/epoch, Loss=0.4588, Val=78.04%, Best=78.27%, Patience=1/40, ETA=1.4h]

146    19.9s    0.4588   78.04   % 78.27   % 7.50e-03   📈 (1)


🚀 Optimized Training:  37%|███▋      | 147/400 [48:58<1:21:59, 19.44s/epoch, Loss=0.4588, Val=78.21%, Best=78.27%, Patience=2/40, ETA=1.4h]

147    20.0s    0.4588   78.21   % 78.27   % 7.46e-03   📈 (2)


🚀 Optimized Training:  37%|███▋      | 148/400 [49:18<1:22:22, 19.61s/epoch, Loss=0.4605, Val=78.00%, Best=78.27%, Patience=3/40, ETA=1.4h]

148    20.0s    0.4605   78.00   % 78.27   % 7.42e-03   📈 (3)


🚀 Optimized Training:  37%|███▋      | 149/400 [49:38<1:22:32, 19.73s/epoch, Loss=0.4616, Val=77.96%, Best=78.27%, Patience=4/40, ETA=1.4h]

149    20.0s    0.4616   77.96   % 78.27   % 7.39e-03   📈 (4)


🚀 Optimized Training:  38%|███▊      | 150/400 [49:58<1:22:46, 19.87s/epoch, Loss=0.4599, Val=78.04%, Best=78.27%, Patience=5/40, ETA=1.4h]

150    20.2s    0.4599   78.04   % 78.27   % 7.35e-03   📈 (5)


🚀 Optimized Training:  38%|███▊      | 151/400 [50:19<1:22:49, 19.96s/epoch, Loss=0.4581, Val=77.94%, Best=78.27%, Patience=6/40, ETA=1.4h]

151    20.2s    0.4581   77.94   % 78.27   % 7.32e-03   ⏳ (6)


🚀 Optimized Training:  38%|███▊      | 152/400 [50:39<1:22:50, 20.04s/epoch, Loss=0.4582, Val=77.83%, Best=78.27%, Patience=7/40, ETA=1.4h]

152    20.2s    0.4582   77.83   % 78.27   % 7.28e-03   ⏳ (7)


🚀 Optimized Training:  38%|███▊      | 153/400 [50:59<1:22:47, 20.11s/epoch, Loss=0.4578, Val=77.98%, Best=78.27%, Patience=8/40, ETA=1.4h]

153    20.3s    0.4578   77.98   % 78.27   % 7.24e-03   ⏳ (8)


🚀 Optimized Training:  38%|███▊      | 154/400 [51:19<1:22:28, 20.12s/epoch, Loss=0.4586, Val=77.94%, Best=78.27%, Patience=9/40, ETA=1.4h]

154    20.1s    0.4586   77.94   % 78.27   % 7.20e-03   ⏳ (9)


🚀 Optimized Training:  39%|███▉      | 155/400 [51:39<1:22:07, 20.11s/epoch, Loss=0.4574, Val=78.00%, Best=78.27%, Patience=10/40, ETA=1.4h]

155    20.1s    0.4574   78.00   % 78.27   % 7.17e-03   ⏳ (10)


🚀 Optimized Training:  39%|███▉      | 155/400 [52:00<1:22:07, 20.11s/epoch, Loss=0.4585, Val=77.96%, Best=78.27%, Patience=11/40, ETA=1.4h]

156    20.1s    0.4585   77.96   % 78.27   % 7.13e-03   ⏳ (11)


🚀 Optimized Training:  39%|███▉      | 157/400 [52:17<1:18:23, 19.35s/epoch, Loss=0.4580, Val=78.15%, Best=78.27%, Patience=12/40, ETA=1.3h]

157    16.8s    0.4580   78.15   % 78.27   % 7.09e-03   ⏳ (12)


🚀 Optimized Training:  40%|███▉      | 158/400 [52:37<1:17:47, 19.29s/epoch, Loss=0.4573, Val=77.98%, Best=78.27%, Patience=13/40, ETA=1.3h]

158    19.1s    0.4573   77.98   % 78.27   % 7.06e-03   ⏳ (13)


🚀 Optimized Training:  40%|███▉      | 159/400 [52:56<1:17:12, 19.22s/epoch, Loss=0.4583, Val=78.25%, Best=78.27%, Patience=14/40, ETA=1.3h]

159    19.1s    0.4583   78.25   % 78.27   % 7.02e-03   ⏳ (14)


🚀 Optimized Training:  40%|████      | 160/400 [53:14<1:16:15, 19.06s/epoch, Loss=0.4572, Val=78.13%, Best=78.27%, Patience=15/40, ETA=1.3h]

160    18.7s    0.4572   78.13   % 78.27   % 6.98e-03   ⏳ (15)


🚀 Optimized Training:  40%|████      | 161/400 [53:33<1:15:34, 18.97s/epoch, Loss=0.4568, Val=78.09%, Best=78.27%, Patience=16/40, ETA=1.3h]

161    18.8s    0.4568   78.09   % 78.27   % 6.94e-03   ⚠️ (16)


🚀 Optimized Training:  40%|████      | 162/400 [53:52<1:14:56, 18.89s/epoch, Loss=0.4566, Val=78.23%, Best=78.27%, Patience=17/40, ETA=1.3h]

162    18.7s    0.4566   78.23   % 78.27   % 6.90e-03   ⚠️ (17)


🚀 Optimized Training:  41%|████      | 163/400 [54:11<1:14:45, 18.93s/epoch, Loss=0.4557, Val=78.21%, Best=78.27%, Patience=18/40, ETA=1.3h]

163    19.0s    0.4557   78.21   % 78.27   % 6.87e-03   ⚠️ (18)


🚀 Optimized Training:  41%|████      | 164/400 [54:30<1:14:25, 18.92s/epoch, Loss=0.4575, Val=78.21%, Best=78.27%, Patience=19/40, ETA=1.2h]

164    18.9s    0.4575   78.21   % 78.27   % 6.83e-03   ⚠️ (19)


🚀 Optimized Training:  41%|████▏     | 165/400 [54:49<1:14:09, 18.93s/epoch, Loss=0.4551, Val=78.21%, Best=78.27%, Patience=20/40, ETA=1.2h]

165    19.0s    0.4551   78.21   % 78.27   % 6.79e-03   ⚠️ (20)


🚀 Optimized Training:  42%|████▏     | 166/400 [55:08<1:13:43, 18.90s/epoch, Loss=0.4567, Val=78.15%, Best=78.27%, Patience=21/40, ETA=1.2h]

166    18.8s    0.4567   78.15   % 78.27   % 6.75e-03   ⚠️ (21)


🚀 Optimized Training:  42%|████▏     | 167/400 [55:26<1:13:26, 18.91s/epoch, Loss=0.4528, Val=78.27%, Best=78.27%, Patience=22/40, ETA=1.2h]

167    18.9s    0.4528   78.27   % 78.27   % 6.71e-03   ⚠️ (22)


🚀 Optimized Training:  42%|████▏     | 167/400 [55:45<1:13:26, 18.91s/epoch, Loss=0.4552, Val=78.17%, Best=78.27%, Patience=23/40, ETA=1.2h]

168    18.8s    0.4552   78.17   % 78.27   % 6.67e-03   ⚠️ (23)


🚀 Optimized Training:  42%|████▏     | 169/400 [56:03<1:11:25, 18.55s/epoch, Loss=0.4554, Val=77.96%, Best=78.27%, Patience=24/40, ETA=1.2h]

169    17.0s    0.4554   77.96   % 78.27   % 6.63e-03   ⚠️ (24)


🚀 Optimized Training:  42%|████▎     | 170/400 [56:22<1:11:28, 18.65s/epoch, Loss=0.4558, Val=78.02%, Best=78.27%, Patience=25/40, ETA=1.2h]

170    18.9s    0.4558   78.02   % 78.27   % 6.60e-03   ⚠️ (25)


🚀 Optimized Training:  43%|████▎     | 171/400 [56:42<1:12:27, 18.98s/epoch, Loss=0.4560, Val=77.81%, Best=78.27%, Patience=26/40, ETA=1.2h]

171    19.8s    0.4560   77.81   % 78.27   % 6.56e-03   ⚠️ (26)


🚀 Optimized Training:  43%|████▎     | 172/400 [57:02<1:12:45, 19.15s/epoch, Loss=0.4554, Val=77.69%, Best=78.27%, Patience=27/40, ETA=1.2h]

172    19.5s    0.4554   77.69   % 78.27   % 6.52e-03   ⚠️ (27)


🚀 Optimized Training:  43%|████▎     | 173/400 [57:21<1:12:47, 19.24s/epoch, Loss=0.4537, Val=77.73%, Best=78.27%, Patience=28/40, ETA=1.2h]

173    19.4s    0.4537   77.73   % 78.27   % 6.48e-03   ⚠️ (28)


🚀 Optimized Training:  44%|████▎     | 174/400 [57:40<1:12:41, 19.30s/epoch, Loss=0.4539, Val=77.75%, Best=78.27%, Patience=29/40, ETA=1.2h]

174    19.4s    0.4539   77.75   % 78.27   % 6.44e-03   ⚠️ (29)


🚀 Optimized Training:  44%|████▍     | 175/400 [58:00<1:12:36, 19.36s/epoch, Loss=0.4550, Val=77.90%, Best=78.27%, Patience=30/40, ETA=1.2h]

175    19.5s    0.4550   77.90   % 78.27   % 6.40e-03   ⚠️ (30)


🚀 Optimized Training:  44%|████▍     | 176/400 [58:19<1:12:15, 19.35s/epoch, Loss=0.4566, Val=77.79%, Best=78.27%, Patience=31/40, ETA=1.2h]

176    19.3s    0.4566   77.79   % 78.27   % 6.36e-03   ⚠️ (31)


🚀 Optimized Training:  44%|████▍     | 177/400 [58:39<1:12:03, 19.39s/epoch, Loss=0.4552, Val=77.96%, Best=78.27%, Patience=32/40, ETA=1.2h]

177    19.5s    0.4552   77.96   % 78.27   % 6.32e-03   ⚠️ (32)


🚀 Optimized Training:  44%|████▍     | 178/400 [58:58<1:11:41, 19.38s/epoch, Loss=0.4538, Val=77.98%, Best=78.27%, Patience=33/40, ETA=1.2h]

178    19.3s    0.4538   77.98   % 78.27   % 6.28e-03   ⚠️ (33)


🚀 Optimized Training:  45%|████▍     | 179/400 [59:17<1:11:23, 19.38s/epoch, Loss=0.4567, Val=77.88%, Best=78.27%, Patience=34/40, ETA=1.2h]

179    19.4s    0.4567   77.88   % 78.27   % 6.24e-03   ⚠️ (34)


🚀 Optimized Training:  45%|████▍     | 179/400 [59:37<1:11:23, 19.38s/epoch, Loss=0.4547, Val=78.00%, Best=78.27%, Patience=35/40, ETA=1.2h]

180    19.6s    0.4547   78.00   % 78.27   % 6.20e-03   ⚠️ (35)


🚀 Optimized Training:  45%|████▌     | 181/400 [59:55<1:09:35, 19.07s/epoch, Loss=0.4545, Val=78.00%, Best=78.27%, Patience=36/40, ETA=1.2h]

181    17.6s    0.4545   78.00   % 78.27   % 6.16e-03   ⚠️ (36)


🚀 Optimized Training:  46%|████▌     | 182/400 [1:00:14<1:08:39, 18.89s/epoch, Loss=0.4558, Val=77.94%, Best=78.27%, Patience=37/40, ETA=1.2h]

182    18.5s    0.4558   77.94   % 78.27   % 6.12e-03   ⚠️ (37)


🚀 Optimized Training:  46%|████▌     | 183/400 [1:00:32<1:07:31, 18.67s/epoch, Loss=0.4553, Val=77.83%, Best=78.27%, Patience=38/40, ETA=1.1h]

183    18.1s    0.4553   77.83   % 78.27   % 6.08e-03   ⚠️ (38)


🚀 Optimized Training:  46%|████▌     | 184/400 [1:00:50<1:06:40, 18.52s/epoch, Loss=0.4533, Val=77.88%, Best=78.27%, Patience=39/40, ETA=1.1h]

184    18.2s    0.4533   77.88   % 78.27   % 6.04e-03   ⚠️ (39)


🚀 Optimized Training:  46%|████▌     | 184/400 [1:01:08<1:11:46, 19.94s/epoch, Loss=0.4527, Val=77.92%, Best=78.27%, Patience=40/40, ETA=1.1h]

185    18.1s    0.4527   77.92   % 78.27   % 6.00e-03   ⚠️ (40)

⏹️ Early stopping triggered at epoch 185

🏁 OPTIMIZED TRAINING COMPLETE!
📊 TRAINING SUMMARY:
   Total training time: 1.0h
   Average time per epoch: 19.8s
   Best epoch: 145
   Best accuracy: 0.7827 (78.27%)
   Total epochs trained: 185
✅ Loaded best model from epoch 145

📈 IMPROVEMENT ANALYSIS:
   Starting baseline: 77.8%
   OPTIMIZED result: 78.27%
   Net improvement: +0.47 percentage points
   Model parameters: 10,380,433 (10.38M)
   Parameter efficiency: 0.05 pp/M params

💎 OPTIMIZED achievement: 78.27% with 10.4M parameters!






💎🎯 FINAL OPTIMIZED STATUS:
💎 Optimized progress: 78.27%


In [1]:
print("hello")

hello
