In [16]:
"""
PHASE 2: DQN Strategic Layer with Application-Aware Learning
Uses drl_states_actions.npz and application_profiles.csv from Phase 1
"""

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import json
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt

# ============================================================================
# SECTION 1: Load Phase 1 Outputs (CRITICAL CHANGE)
# ============================================================================

print("="*80)
print("Loading Phase 1 Processed Data")
print("="*80)

DATA_PATH = '/content/drive/MyDrive/mythesis/rohit-thesis/datasets/processed'

# Load pre-computed DRL states
print("\n[1/5] Loading DRL state representations...")
drl_data = np.load(f'{DATA_PATH}/drl_states_actions.npz', allow_pickle=True)

strategic_states = drl_data['strategic_states']  # (1.8M, 10)
action_spaces = drl_data['action_spaces'].item()  # Action definitions

print(f" Strategic states shape: {strategic_states.shape}")
print(f" Action spaces: {action_spaces['strategic']}")

# Load application profiles
print("\n[2/5] Loading application profiles...")
app_profiles = pd.read_csv(f'{DATA_PATH}/application_profiles.csv')

print(f" Application profiles: {len(app_profiles)} applications")
print(f"\n  Workload distribution:")
for wtype, count in app_profiles['workload_type'].value_counts().items():
    print(f"    {wtype:15s}: {count:>3} apps")

# Load metadata for splits
print("\n[3/5] Loading train/val/test splits...")
train_df = pd.read_parquet(f'{DATA_PATH}/train_data.parquet')
val_df = pd.read_parquet(f'{DATA_PATH}/val_data.parquet')
test_df = pd.read_parquet(f'{DATA_PATH}/test_data.parquet')

print(f" Train samples: {len(train_df):,}")
print(f" Val samples: {len(val_df):,}")
print(f" Test samples: {len(test_df):,}")

# Split strategic states by dataset split
strategic_states_train = strategic_states[:len(train_df)]
strategic_states_val = strategic_states[len(train_df):len(train_df)+len(val_df)]
strategic_states_test = strategic_states[len(train_df)+len(val_df):]

# Load scaler and metadata
print("\n[4/5] Loading feature scaler...")
with open(f'{DATA_PATH}/robust_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

print("\n[5/5] Loading metadata...")
with open(f'{DATA_PATH}/metadata.json', 'r') as f:
    metadata = json.load(f)

print(f" DRL Config: {metadata['drl_config']}")

# Create application profile lookup dictionary
app_profile_dict = app_profiles.set_index('app').to_dict('index')

print("\n" + "="*80)
print("Phase 1 Data Loaded Successfully")
print("="*80)

Loading Phase 1 Processed Data

[1/5] Loading DRL state representations...
 Strategic states shape: (1264946, 10)
 Action spaces: {'cloud_providers': ['AWS', 'Azure', 'GCP'], 'action_space_size': 3}

[2/5] Loading application profiles...
 Application profiles: 119 applications

  Workload distribution:
    standard       :  75 apps
    bursty         :  44 apps

[3/5] Loading train/val/test splits...
 Train samples: 1,264,946
 Val samples: 271,060
 Test samples: 271,061

[4/5] Loading feature scaler...

[5/5] Loading metadata...
 DRL Config: {'strategic_state_dim': 10, 'tactical_state_dim': 7, 'operational_state_dim': 5, 'strategic_actions': 3, 'tactical_actions': 24, 'operational_actions': 3, 'reward_weights': {'alpha': 0.4, 'beta': 0.4, 'gamma': 0.2}, 'sla_penalty': -5.0}

Phase 1 Data Loaded Successfully


In [17]:
# ============================================================================
# SECTION 2: Enhanced State Representation
# ============================================================================

def create_enhanced_state(strategic_state, row, app_profile_dict):
    """
    Combine strategic state with application-level context

    Args:
        strategic_state: Pre-computed strategic features (10-dim)
        row: Current invocation data
        app_profile_dict: Application profile lookup

    Returns:
        enhanced_state: Combined state vector (14-dim)
    """
    app_id = row['app']
    app_profile = app_profile_dict.get(app_id, {})

    # Application context features (4-dim)
    app_context = np.array([
        app_profile.get('cold_start_rate', 0.0),      # Cold start likelihood
        app_profile.get('sla_violation_rate', 0.0),   # SLA risk
        app_profile.get('avg_invocation_rate', 0.0) / 100.0,  # Normalized traffic
        1.0 if app_profile.get('workload_type') == 'bursty' else 0.0  # Burst indicator
    ], dtype=np.float32)

    # Concatenate: [strategic_state (10) | app_context (4)] = 14-dim
    enhanced_state = np.concatenate([strategic_state, app_context])

    return enhanced_state



In [18]:
# ============================================================================
# SECTION 3: Application-Aware Multi-Cloud Environment
# ============================================================================

class AppAwareMultiCloudEnv:
    """
    Enhanced environment that uses Phase 1 outputs:
    - Pre-computed strategic states
    - Application profile context
    - Workload-aware reward shaping
    """

    def __init__(self, strategic_states, data_df, app_profile_dict):
        self.strategic_states = strategic_states
        self.data_df = data_df
        self.app_profile_dict = app_profile_dict

        self.state_dim = 14  # 10 strategic + 4 app context
        self.action_dim = 3  # AWS, Azure, GCP

        # Cloud provider mapping
        self.providers = ['AWS', 'Azure', 'GCP']

        # Reward weights (from metadata)
        self.alpha = 0.4  # Cost weight
        self.beta = 0.4   # Performance weight
        self.gamma = 0.2  # Carbon weight
        self.sla_penalty = 5.0

    def reset(self, idx):
        """
        Initialize state for a specific invocation

        Args:
            idx: Index in the dataset

        Returns:
            state: Enhanced state vector (14-dim)
            row: Data for the invocation
        """
        row = self.data_df.iloc[idx]

        # Get pre-computed strategic state
        strategic_state = self.strategic_states[idx]

        # Create enhanced state with app context
        state = create_enhanced_state(strategic_state, row, self.app_profile_dict)

        return state, row

    def step(self, action, row):
        """
        Execute action and compute application-aware reward

        Args:
            action: Cloud provider selection (0=AWS, 1=Azure, 2=GCP)
            row: Current invocation data

        Returns:
            reward: Application-aware reward
            done: Episode termination flag
        """
        # Get actual outcomes from data (simulated deployment)
        cost = row['compute_cost']
        latency = row['total_latency_ms']
        carbon = row['carbon_footprint_g']
        sla_violated = row['sla_violation']

        # Normalize rewards (0-1 range)
        cost_reward = 1.0 - min(cost / 1.0, 1.0)  # Lower cost = higher reward
        perf_reward = 1.0 - min(latency / 1000.0, 1.0)  # Lower latency = higher reward
        carbon_reward = 1.0 - min(carbon / 100.0, 1.0)  # Lower carbon = higher reward

        # Application-aware reward shaping
        app_id = row['app']
        app_profile = self.app_profile_dict.get(app_id, {})

        # Penalty for cold starts on bursty applications
        if app_profile.get('workload_type') == 'bursty' and row.get('is_cold_start', False):
            perf_reward -= 0.2

        # Bonus for maintaining SLA on high-risk applications
        if app_profile.get('sla_violation_rate', 0.0) > 0.1:
            if not sla_violated:
                perf_reward += 0.1  # Bonus for SLA compliance

        # Multi-objective reward
        reward = (self.alpha * cost_reward +
                 self.beta * perf_reward +
                 self.gamma * carbon_reward)

        # SLA penalty (asymmetric loss)
        if sla_violated:
            reward -= self.sla_penalty

        return reward, False  # Never truly "done" in this setup

In [19]:

# ============================================================================
# SECTION 4: Enhanced DQN Network Architecture
# ============================================================================

class EnhancedDQNetwork(nn.Module):
    """
    Enhanced DQN with separate encoders for strategic and application context

    Architecture:
        Strategic Encoder: 10 -> 64
        App Context Encoder: 4 -> 32
        Fusion Network: 96 -> 128 -> 64 -> 3
    """

    def __init__(self, state_size=14, action_size=3, hidden_sizes=[64, 32, 128]):
        super(EnhancedDQNetwork, self).__init__()

        # Strategic encoder (10 -> 64)
        self.strategic_encoder = nn.Sequential(
            nn.Linear(10, hidden_sizes[0]),
            nn.ReLU(),
            nn.LayerNorm(hidden_sizes[0]),  # ‚Üê Changed from BatchNorm1d
            nn.Dropout(0.1)
        )

        # App context encoder (4 -> 32)
        self.app_context_encoder = nn.Sequential(
            nn.Linear(4, hidden_sizes[1]),
            nn.ReLU(),
            nn.LayerNorm(hidden_sizes[1]),  # ‚Üê Changed from BatchNorm1d
            nn.Dropout(0.1)
        )

        # Fusion network (96 -> 3)
        fusion_input_size = hidden_sizes[0] + hidden_sizes[1]
        self.fusion = nn.Sequential(
            nn.Linear(fusion_input_size, hidden_sizes[2]),
            nn.ReLU(),
            nn.LayerNorm(hidden_sizes[2]),  # ‚Üê Changed from BatchNorm1d
            nn.Dropout(0.2),
            nn.Linear(hidden_sizes[2], 64),
            nn.ReLU(),
            nn.Linear(64, action_size)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
                nn.init.constant_(module.bias, 0.01)

    def forward(self, state):
        strategic = state[:, :10]
        app_context = state[:, 10:]

        strategic_emb = self.strategic_encoder(strategic)
        app_emb = self.app_context_encoder(app_context)

        combined = torch.cat([strategic_emb, app_emb], dim=1)
        q_values = self.fusion(combined)

        return q_values

In [20]:
# ============================================================================
# SECTION 5: DQN Agent with Experience Replay
# ============================================================================

class DQNAgent:
    """
    DQN Agent with application-aware learning
    """

    def __init__(self, state_size=14, action_size=3, learning_rate=0.0001,  # ‚Üê Lower LR
                 gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=10000,
                 buffer_size=100000, batch_size=64, target_update_freq=1000):

        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma

        # Epsilon-greedy parameters
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.steps = 0

        # Experience replay
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

        # Networks
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.q_network = EnhancedDQNetwork(state_size, action_size).to(self.device)
        self.target_network = EnhancedDQNetwork(state_size, action_size).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())

        # Optimizer with lower learning rate
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.criterion = nn.SmoothL1Loss(beta=1.0)  # Huber loss

        # Target network update
        self.target_update_freq = target_update_freq

        # Tracking
        self.epsilon_history = []
        self.loss_history = []
        self.nan_count = 0

    def act(self, state, training=True):
        """Epsilon-greedy action selection"""
        if training and random.random() < self.epsilon:
            return random.randrange(self.action_size)

        self.q_network.eval()
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        with torch.no_grad():
            q_values = self.q_network(state)

        self.q_network.train()
        return q_values.argmax().item()

    def remember(self, state, action, reward, next_state, done):
        """Store transition with validity check"""
        # Check for NaN/Inf before storing
        if np.isnan(state).any() or np.isnan(next_state).any() or np.isnan(reward):
            return  # Skip invalid transitions

        self.memory.append((state, action, reward, next_state, done))

    def train_step(self):
        """Perform one training step with NaN protection"""
        if len(self.memory) < self.batch_size:
            return None

        # Sample batch
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Convert to tensors
        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # Check for NaN/Inf in inputs
        if torch.isnan(states).any() or torch.isinf(states).any():
            self.nan_count += 1
            return None

        # Ensure network is in train mode
        self.q_network.train()

        # Current Q-values
        current_q = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Check Q-values
        if torch.isnan(current_q).any() or torch.isinf(current_q).any():
            print(f" NaN/Inf in Q-values at step {self.steps}")
            self.nan_count += 1
            return None

        # Target Q-values
        self.target_network.eval()
        with torch.no_grad():
            next_q = self.target_network(next_states).max(1)[0]
            target_q = rewards + (1 - dones) * self.gamma * next_q

        # Clamp targets to prevent explosion
        target_q = torch.clamp(target_q, -10.0, 10.0)

        # Compute loss
        loss = self.criterion(current_q, target_q)

        # Check loss
        if torch.isnan(loss) or torch.isinf(loss):
            print(f" NaN/Inf loss at step {self.steps}")
            self.nan_count += 1
            return None

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()

        # Aggressive gradient clipping
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 0.5)

        self.optimizer.step()

        self.loss_history.append(loss.item())

        # Update target network
        self.steps += 1
        if self.steps % self.target_update_freq == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min,
                          self.epsilon_min + (1.0 - self.epsilon_min) *
                          np.exp(-1.0 * self.steps / self.epsilon_decay))
        self.epsilon_history.append(self.epsilon)

        return loss.item()


In [23]:
# ============================================================================
# Re-extract Strategic States from Full Dataset
# ============================================================================

print("="*80)
print("RE-EXTRACTING STRATEGIC STATES FROM FULL DATASET")
print("="*80)

print("\n[Problem Identified]")
print("  Phase 1 only extracted states from train_data, not full dataset!")
print("  drl_states_actions.npz only has training data (1,264,946 samples)")
print("  But we need ALL data: train + val + test (1,807,067 samples)")

print("\n[Solution]")
print("  Re-extract strategic states from train_df + val_df + test_df")

# Define strategic state features (same as Phase 1)
strategic_state_features = [
    'hour', 'day_of_week', 'is_weekend', 'is_business_hours',
    'invocation_rate', 'is_bursty',
    'avg_duration', 'avg_cost', 'avg_carbon',
    'memory_mb'
]

print(f"\n[Step 1] Concatenating all dataframes...")
print(f"  Train: {len(train_df):,} samples")
print(f"  Val:   {len(val_df):,} samples")
print(f"  Test:  {len(test_df):,} samples")

# Concatenate in order: train -> val -> test
full_df = pd.concat([train_df, val_df, test_df], axis=0, ignore_index=True)
print(f"  Total: {len(full_df):,} samples")

# Check if all features exist
print(f"\n[Step 2] Checking features...")
missing_features = [f for f in strategic_state_features if f not in full_df.columns]
if missing_features:
    print(f"  Missing features: {missing_features}")
    print(f"  Available columns: {list(full_df.columns)}")
else:
    print(f"  ‚úì All strategic features present")

# Extract strategic states from FULL dataset
print(f"\n[Step 3] Extracting strategic states from full dataset...")
strategic_states_full = full_df[strategic_state_features].values

print(f"  ‚úì Extracted: {strategic_states_full.shape}")
print(f"     Features: {strategic_state_features}")

# Verify no NaN values
nan_count = np.isnan(strategic_states_full).sum()
if nan_count > 0:
    print(f"     Warning: {nan_count} NaN values found")
    print(f"     Filling NaN with 0...")
    strategic_states_full = np.nan_to_num(strategic_states_full, nan=0.0)
else:
    print(f"  ‚úì No NaN values")

# Split into train/val/test
print(f"\n[Step 4] Splitting into train/val/test...")
train_size = len(train_df)
val_size = len(val_df)
test_size = len(test_df)

strategic_states_train = strategic_states_full[:train_size]
strategic_states_val = strategic_states_full[train_size:train_size+val_size]
strategic_states_test = strategic_states_full[train_size+val_size:]

print(f"  Train: {strategic_states_train.shape}")
print(f"  Val:   {strategic_states_val.shape}")
print(f"  Test:  {strategic_states_test.shape}")

# Verify splits
assert strategic_states_train.shape[0] == len(train_df), "Train size mismatch!"
assert strategic_states_val.shape[0] == len(val_df), "Val size mismatch!"
assert strategic_states_test.shape[0] == len(test_df), "Test size mismatch!"
print(f"  ‚úì All splits verified!")

# Recreate environments
print(f"\n[Step 5] Recreating environments...")

env = AppAwareMultiCloudEnv(
    strategic_states=strategic_states_train,
    data_df=train_df,
    app_profile_dict=app_profile_dict
)

val_env = AppAwareMultiCloudEnv(
    strategic_states=strategic_states_val,
    data_df=val_df,
    app_profile_dict=app_profile_dict
)

print(f"  ‚úì Training env: {len(env.strategic_states):,} samples")
print(f"  ‚úì Validation env: {len(val_env.strategic_states):,} samples")

# Test both environments
print(f"\n[Step 6] Testing environments...")
test_state_train, _ = env.reset(0)
test_state_val, _ = val_env.reset(0)
print(f"  ‚úì Training env test: {test_state_train.shape}")
print(f"  ‚úì Validation env test: {test_state_val.shape}")

# Show sample values
print(f"\n[Step 7] Sample strategic states:")
print(f"  Train sample [0]: {strategic_states_train[0]}")
print(f"  Val sample [0]:   {strategic_states_val[0]}")

print("\n" + "="*80)
print("FIX COMPLETE!")
print("="*80)

print("\nSummary:")
print(f"  ‚Ä¢ Re-extracted strategic states from FULL dataset")
print(f"  ‚Ä¢ Training env: {len(env.strategic_states):,} samples")
print(f"  ‚Ä¢ Validation env: {len(val_env.strategic_states):,} samples")
print(f"  ‚Ä¢ Both environments tested and working!")

print("\nüéØ You can now restart your training loop!")

# Optional: Save the corrected drl_states_actions.npz for future use

#Save corrected DRL states
drl_data_corrected = {
    'strategic_states': strategic_states_full,
    'action_spaces': action_spaces
}
np.savez_compressed(f'{DATA_PATH}/drl_states_actions_CORRECTED.npz', **drl_data_corrected)
print("‚úì Saved corrected states to drl_states_actions_CORRECTED.npz")

RE-EXTRACTING STRATEGIC STATES FROM FULL DATASET

[Problem Identified]
  Phase 1 only extracted states from train_data, not full dataset!
  drl_states_actions.npz only has training data (1,264,946 samples)
  But we need ALL data: train + val + test (1,807,067 samples)

[Solution]
  Re-extract strategic states from train_df + val_df + test_df

[Step 1] Concatenating all dataframes...
  Train: 1,264,946 samples
  Val:   271,060 samples
  Test:  271,061 samples
  Total: 1,807,067 samples

[Step 2] Checking features...
  ‚úì All strategic features present

[Step 3] Extracting strategic states from full dataset...
  ‚úì Extracted: (1807067, 10)
     Features: ['hour', 'day_of_week', 'is_weekend', 'is_business_hours', 'invocation_rate', 'is_bursty', 'avg_duration', 'avg_cost', 'avg_carbon', 'memory_mb']
     Filling NaN with 0...

[Step 4] Splitting into train/val/test...
  Train: (1264946, 10)
  Val:   (271060, 10)
  Test:  (271061, 10)
  ‚úì All splits verified!

[Step 5] Recreating enviro

In [24]:
# ============================================================================
# TRAINING LOOP
# ============================================================================
'''
print("\n" + "="*80)
print("Initializing Enhanced DQN Training")
print("="*80)

# Create training environment
env = AppAwareMultiCloudEnv(
    strategic_states=strategic_states_train,
    data_df=train_df,
    app_profile_dict=app_profile_dict
)

# Create validation environment (FIX: Create once, reuse)
val_env = AppAwareMultiCloudEnv(
    strategic_states=strategic_states_val,
    data_df=val_df,
    app_profile_dict=app_profile_dict
)

print(f"\n Training environment state dimension: {env.state_dim}")
print(f" Training environment action dimension: {env.action_dim}")
print(f" Validation environment ready: {len(val_env.strategic_states):,} samples")
'''
# Create agent
agent = DQNAgent(
    state_size=14,
    action_size=3,
    learning_rate=0.0001,  # Using fixed learning rate
    epsilon_decay=10000
)

print(f" Agent initialized on device: {agent.device}")
print(f" Network parameters: {sum(p.numel() for p in agent.q_network.parameters()):,}")

# Training configuration
NUM_EPISODES = 50
VALIDATE_EVERY = 5
SAMPLES_PER_EPISODE = 10000

print(f"\n Training Configuration:")
print(f"   Episodes: {NUM_EPISODES}")
print(f"   Samples per episode: {SAMPLES_PER_EPISODE:,}")
print(f"   Validation frequency: every {VALIDATE_EVERY} episodes")

# Training metrics
training_history = {
    'episode': [],
    'train_reward': [],
    'train_loss': [],
    'val_reward': [],
    'epsilon': [],
    'best_val_reward': -float('inf')
}

print("\n" + "="*80)
print("Starting Training...")
print("="*80)

for episode in range(NUM_EPISODES):
    # Sample indices for this episode
    episode_indices = np.random.choice(len(train_df), SAMPLES_PER_EPISODE, replace=False)

    episode_rewards = []
    episode_losses = []

    # Training loop
    for idx in tqdm(episode_indices, desc=f"Episode {episode+1}/{NUM_EPISODES}"):
        # Get state
        state, row = env.reset(idx)

        # Select action
        action = agent.act(state, training=True)

        # Environment step
        reward, done = env.step(action, row)

        # Get next state (if available)
        if idx < len(train_df) - 1:
            next_state, _ = env.reset(idx + 1)
        else:
            next_state = state
            done = True

        # Store transition
        agent.remember(state, action, reward, next_state, done)

        # Train
        loss = agent.train_step()

        episode_rewards.append(reward)
        if loss is not None:
            episode_losses.append(loss)

    # Episode statistics
    avg_reward = np.mean(episode_rewards)
    avg_loss = np.mean(episode_losses) if episode_losses else 0.0

    training_history['episode'].append(episode + 1)
    training_history['train_reward'].append(avg_reward)
    training_history['train_loss'].append(avg_loss)
    training_history['epsilon'].append(agent.epsilon)

    print(f"\n  Train Reward: {avg_reward:.4f} | Loss: {avg_loss:.4f} | Œµ: {agent.epsilon:.4f}")

    # Validation (FIXED)
    if (episode + 1) % VALIDATE_EVERY == 0:
        val_rewards = []
        val_indices = np.random.choice(len(val_df), min(2000, len(val_df)), replace=False)

        for idx in val_indices:
            # FIX: Use val_env instead of creating new environment
            state, row = val_env.reset(idx)
            action = agent.act(state, training=False)
            # FIX: Use val_env.step instead of env.step
            reward, _ = val_env.step(action, row)
            val_rewards.append(reward)

        avg_val_reward = np.mean(val_rewards)
        training_history['val_reward'].append(avg_val_reward)

        print(f"  Validation Reward: {avg_val_reward:.4f}")

        # Save best model
        if avg_val_reward > training_history['best_val_reward']:
            training_history['best_val_reward'] = avg_val_reward
            torch.save(agent.q_network.state_dict(), '/content/drive/MyDrive/mythesis/rohit-thesis/models/dqn_strategic/best_enhanced_dqn.pt')
            print(f"  New best model saved!")

print("\n" + "="*80)
print("Training Complete!")
print("="*80)
print(f"Best validation reward: {training_history['best_val_reward']:.4f}")

# Save final model and training history
torch.save(agent.q_network.state_dict(), '/content/drive/MyDrive/mythesis/rohit-thesis/models/dqn_strategic/final_enhanced_dqn.pt')

import json
with open('/content/training_history.json', 'w') as f:
    json.dump(training_history, f, indent=2)

print("\n Final model saved")
print(" Training history saved")

 Agent initialized on device: cuda
 Network parameters: 22,179

 Training Configuration:
   Episodes: 50
   Samples per episode: 10,000
   Validation frequency: every 5 episodes

Starting Training...


Episode 1/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:48<00:00, 207.87it/s]



  Train Reward: 0.9678 | Loss: 0.0777 | Œµ: 0.3765


Episode 2/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:51<00:00, 194.68it/s]



  Train Reward: 0.9746 | Loss: 0.0461 | Œµ: 0.1448


Episode 3/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:52<00:00, 189.19it/s]



  Train Reward: 0.9659 | Loss: 0.0380 | Œµ: 0.0596


Episode 4/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:53<00:00, 187.73it/s]



  Train Reward: 0.9634 | Loss: 0.0336 | Œµ: 0.0282


Episode 5/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:53<00:00, 185.78it/s]



  Train Reward: 0.9662 | Loss: 0.0291 | Œµ: 0.0167
  Validation Reward: 0.9513
  New best model saved!


Episode 6/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 181.27it/s]



  Train Reward: 0.9659 | Loss: 0.0281 | Œµ: 0.0125


Episode 7/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.16it/s]



  Train Reward: 0.9652 | Loss: 0.0271 | Œµ: 0.0109


Episode 8/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.67it/s]



  Train Reward: 0.9611 | Loss: 0.0270 | Œµ: 0.0103


Episode 9/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.81it/s]



  Train Reward: 0.9637 | Loss: 0.0268 | Œµ: 0.0101


Episode 10/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 180.50it/s]



  Train Reward: 0.9691 | Loss: 0.0264 | Œµ: 0.0100
  Validation Reward: 0.9708
  New best model saved!


Episode 11/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 181.86it/s]



  Train Reward: 0.9637 | Loss: 0.0262 | Œµ: 0.0100


Episode 12/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.20it/s]



  Train Reward: 0.9604 | Loss: 0.0274 | Œµ: 0.0100


Episode 13/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.67it/s]



  Train Reward: 0.9663 | Loss: 0.0282 | Œµ: 0.0100


Episode 14/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.89it/s]



  Train Reward: 0.9703 | Loss: 0.0275 | Œµ: 0.0100


Episode 15/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 180.90it/s]



  Train Reward: 0.9713 | Loss: 0.0271 | Œµ: 0.0100
  Validation Reward: 0.9601


Episode 16/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 183.54it/s]



  Train Reward: 0.9708 | Loss: 0.0258 | Œµ: 0.0100


Episode 17/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.37it/s]



  Train Reward: 0.9644 | Loss: 0.0259 | Œµ: 0.0100


Episode 18/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.95it/s]



  Train Reward: 0.9638 | Loss: 0.0258 | Œµ: 0.0100


Episode 19/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 183.16it/s]



  Train Reward: 0.9675 | Loss: 0.0261 | Œµ: 0.0100


Episode 20/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 183.80it/s]



  Train Reward: 0.9654 | Loss: 0.0258 | Œµ: 0.0100
  Validation Reward: 0.9628


Episode 21/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.56it/s]



  Train Reward: 0.9678 | Loss: 0.0254 | Œµ: 0.0100


Episode 22/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 185.14it/s]



  Train Reward: 0.9642 | Loss: 0.0251 | Œµ: 0.0100


Episode 23/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.95it/s]



  Train Reward: 0.9665 | Loss: 0.0252 | Œµ: 0.0100


Episode 24/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.65it/s]



  Train Reward: 0.9657 | Loss: 0.0248 | Œµ: 0.0100


Episode 25/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 183.03it/s]



  Train Reward: 0.9674 | Loss: 0.0240 | Œµ: 0.0100
  Validation Reward: 0.9537


Episode 26/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 183.60it/s]



  Train Reward: 0.9663 | Loss: 0.0246 | Œµ: 0.0100


Episode 27/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.22it/s]



  Train Reward: 0.9584 | Loss: 0.0261 | Œµ: 0.0100


Episode 28/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 185.09it/s]



  Train Reward: 0.9691 | Loss: 0.0251 | Œµ: 0.0100


Episode 29/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 185.18it/s]



  Train Reward: 0.9640 | Loss: 0.0251 | Œµ: 0.0100


Episode 30/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 183.83it/s]



  Train Reward: 0.9634 | Loss: 0.0254 | Œµ: 0.0100
  Validation Reward: 0.9567


Episode 31/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.02it/s]



  Train Reward: 0.9663 | Loss: 0.0266 | Œµ: 0.0100


Episode 32/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.31it/s]



  Train Reward: 0.9679 | Loss: 0.0252 | Œµ: 0.0100


Episode 33/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 180.49it/s]



  Train Reward: 0.9633 | Loss: 0.0251 | Œµ: 0.0100


Episode 34/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 179.11it/s]



  Train Reward: 0.9733 | Loss: 0.0250 | Œµ: 0.0100


Episode 35/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 180.80it/s]



  Train Reward: 0.9739 | Loss: 0.0251 | Œµ: 0.0100
  Validation Reward: 0.9601


Episode 36/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 180.64it/s]



  Train Reward: 0.9679 | Loss: 0.0238 | Œµ: 0.0100


Episode 37/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 178.74it/s]



  Train Reward: 0.9628 | Loss: 0.0237 | Œµ: 0.0100


Episode 38/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.00it/s]



  Train Reward: 0.9673 | Loss: 0.0249 | Œµ: 0.0100


Episode 39/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.72it/s]



  Train Reward: 0.9682 | Loss: 0.0238 | Œµ: 0.0100


Episode 40/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.94it/s]



  Train Reward: 0.9647 | Loss: 0.0240 | Œµ: 0.0100
  Validation Reward: 0.9581


Episode 41/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.13it/s]



  Train Reward: 0.9698 | Loss: 0.0247 | Œµ: 0.0100


Episode 42/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.11it/s]



  Train Reward: 0.9640 | Loss: 0.0238 | Œµ: 0.0100


Episode 43/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 181.91it/s]



  Train Reward: 0.9630 | Loss: 0.0240 | Œµ: 0.0100


Episode 44/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 181.10it/s]



  Train Reward: 0.9571 | Loss: 0.0250 | Œµ: 0.0100


Episode 45/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.21it/s]



  Train Reward: 0.9600 | Loss: 0.0256 | Œµ: 0.0100
  Validation Reward: 0.9705


Episode 46/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 184.09it/s]



  Train Reward: 0.9716 | Loss: 0.0257 | Œµ: 0.0100


Episode 47/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 181.31it/s]



  Train Reward: 0.9637 | Loss: 0.0263 | Œµ: 0.0100


Episode 48/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:55<00:00, 181.00it/s]



  Train Reward: 0.9590 | Loss: 0.0255 | Œµ: 0.0100


Episode 49/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.45it/s]



  Train Reward: 0.9725 | Loss: 0.0272 | Œµ: 0.0100


Episode 50/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:54<00:00, 182.90it/s]



  Train Reward: 0.9632 | Loss: 0.0272 | Œµ: 0.0100
  Validation Reward: 0.9622

Training Complete!
Best validation reward: 0.9708

 Final model saved
 Training history saved
