In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from torchrl.envs import EnvBase
from torchrl.data.replay_buffers import TensorDictReplayBuffer
from torchrl.objectives import DQNLoss
from torchrl.collectors import SyncDataCollector
from tensordict import TensorDict, TensorDictBase
from tensordict.nn import TensorDictModule
from typing import Optional, List, Tuple, Dict
from torchrl.modules import EGreedyModule, MLP, QValueModule
from torchrl.data import OneHot
from torchrl.data import DiscreteTensorSpec
from tensordict.nn import TensorDictModule, TensorDictSequential
from torch.optim import Adam
from torchrl.objectives import DQNLoss, SoftUpdate
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer
import time
import os
from datetime import datetime
from torchrl.objectives.utils import ValueEstimators

In [2]:
class RobustDQNLoss(DQNLoss):
    def __init__(
        self,
        value_network,
        action_space,
        delay_value=False,
        reduction="mean"
    ):
        """
        Constructor with corrected parameter signature, ensuring compatibility with DQNLoss.
        """
        super().__init__(
            value_network=value_network,
            delay_value=delay_value,
            reduction=reduction
        )

        # Corrected value estimator setup using TD(0)
        self.make_value_estimator(ValueEstimators.TD0, gamma=0.99)

        # Store keys separately instead of passing to super().__init__()
        self.action_key = "action"
        self.reward_key = "reward"
        self.done_key = "done"
        self.terminated_key = "terminated"
        self.target_value_key = None


    def _ensure_key_structure(self, tensordict: TensorDict) -> None:
        """
        Ensures all required keys exist in the tensor dictionary.
        """
        if "next" not in tensordict:
            tensordict["next"] = TensorDict({}, batch_size=tensordict.batch_size)

        for key in [self.action_key, self.reward_key, self.done_key]:
            if key not in tensordict:
                print(f"Warning: Missing '{key}' key. Initializing with zeros.")
                tensordict[key] = torch.zeros(
                    tensordict.batch_size, dtype=torch.float32, device=self._get_safe_device(tensordict)
                )

        if self.done_key not in tensordict["next"]:
            tensordict["next", self.done_key] = torch.zeros(
                tensordict.batch_size, dtype=torch.bool, device=self._get_safe_device(tensordict)
            )

        if self.terminated_key not in tensordict["next"]:
            tensordict["next", self.terminated_key] = tensordict["next", self.done_key].clone()

    def _get_safe_device(self, tensordict: TensorDict) -> torch.device:
        """
        Determine a safe device to use based on tensor dictionary contents.
        """
        for key in tensordict.keys():
            value = tensordict.get(key)
            if torch.is_tensor(value) and hasattr(value, 'device'):
                return value.device
        return torch.device("cpu")

In [3]:
# Set seeds for reproducibility
def set_all_seeds(seed=42):
    """
    Set all random seeds for deterministic results.
    
    Args:
        seed: Seed value for random number generators
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# Realistic Synthetic Data Generator
def generate_synthetic_data(num_samples=1000, seed=42):
    """
    Generate synthetic digital advertising data with realistic correlations and distributions.
    Uses real-world inspired parameters and relationships between metrics.
    
    Args:
        num_samples: Number of data points to generate
        seed: Random seed for reproducibility
        
    Returns:
        pandas.DataFrame: Generated synthetic advertising data
    """
    np.random.seed(seed)
    random.seed(seed)
    
    # Generate realistic keyword data with distributions that match digital advertising metrics
    data = {}
    
    # Keywords with realistic thematic structure
    keyword_categories = ["shoes", "clothes", "electronics", "furniture", "beauty", "sports", "travel", "food"]
    modifier_list = ["buy", "cheap", "best", "online", "discount", "premium", "compare", "review"]
    
    keywords = []
    for i in range(num_samples):
        category = random.choice(keyword_categories)
        modifier = random.choice(modifier_list) if random.random() > 0.5 else ""
        specific = f"item_{i % 50}" if random.random() > 0.7 else ""
        kw = f"{modifier} {category} {specific}".strip()
        keywords.append(kw)
    
    data["keyword"] = keywords
    
    # Competitiveness (follows a beta distribution as in real ad markets)
    data["competitiveness"] = np.random.beta(2, 3, num_samples)
    
    # Difficulty score correlates with competitiveness
    base_difficulty = np.random.beta(2.5, 3.5, num_samples)
    data["difficulty_score"] = 0.7 * data["competitiveness"] + 0.3 * base_difficulty
    
    # Organic rank (higher competitiveness leads to worse ranking)
    data["organic_rank"] = 1 + np.floor(9 * data["difficulty_score"] + np.random.normal(0, 1, num_samples).clip(-2, 2))
    data["organic_rank"] = data["organic_rank"].clip(1, 10).astype(int)
    
    # CTR follows a realistic distribution and correlates negatively with rank
    base_ctr = np.random.beta(1.5, 10, num_samples)  # Realistic CTR distribution
    rank_effect = (11 - data["organic_rank"]) / 10  # Higher ranks have better CTR
    data["organic_ctr"] = (base_ctr * rank_effect * 0.3).clip(0.01, 0.3)
    
    # Organic clicks based on CTR and a base impression count
    base_impressions = np.random.lognormal(8, 1, num_samples).astype(int)
    data["organic_clicks"] = (base_impressions * data["organic_ctr"]).astype(int)
    
    # Paid CTR correlates with organic CTR but with more variance
    data["paid_ctr"] = (data["organic_ctr"] * np.random.normal(1, 0.3, num_samples)).clip(0.01, 0.25)
    
    # Paid clicks
    paid_impressions = np.random.lognormal(7, 1.2, num_samples).astype(int)
    data["paid_clicks"] = (paid_impressions * data["paid_ctr"]).astype(int)
    
    # Cost per click higher for more competitive keywords
    data["cost_per_click"] = (0.5 + 9.5 * data["competitiveness"] * np.random.normal(1, 0.2, num_samples)).clip(0.1, 10)
    
    # Ad spend based on CPC and clicks
    data["ad_spend"] = data["paid_clicks"] * data["cost_per_click"]
    
    # Conversion rate with realistic e-commerce distribution
    data["conversion_rate"] = np.random.beta(1.2, 15, num_samples).clip(0.01, 0.3)
    
    # Ad conversions
    data["ad_conversions"] = (data["paid_clicks"] * data["conversion_rate"]).astype(int)
    
    # Conversion value with variance
    base_value = np.random.lognormal(4, 1, num_samples)
    data["conversion_value"] = data["ad_conversions"] * base_value
    
    # Cost per acquisition
    with np.errstate(divide='ignore', invalid='ignore'):
        data["cost_per_acquisition"] = np.where(
            data["ad_conversions"] > 0, 
            data["ad_spend"] / data["ad_conversions"], 
            500  # Default high CPA for no conversions
        ).clip(5, 500)
    
    # ROAS (Return on Ad Spend)
    with np.errstate(divide='ignore', invalid='ignore'):
        data["ad_roas"] = np.where(
            data["ad_spend"] > 0,
            data["conversion_value"] / data["ad_spend"],
            0
        ).clip(0.5, 5)
    
    # Impression share (competitive keywords have lower share)
    data["impression_share"] = (1 - 0.6 * data["competitiveness"] * np.random.normal(1, 0.2, num_samples)).clip(0.1, 1.0)
    
    # Previous recommendation (binary: increase bid or decrease bid)
    data["previous_recommendation"] = np.random.choice([0, 1], size=num_samples)
    
    # Create DataFrame with all metrics
    return pd.DataFrame(data)

In [5]:
# Define the robust TorchRL Environment
class AdOptimizationEnv(EnvBase):
    """
    Environment for digital advertising bid optimization using reinforcement learning.
    
    The environment simulates the decision-making process for ad bid adjustments
    based on various advertising metrics and performance indicators.
    """
    
    # Class-level declaration for immutable properties
    _has_dynamic_specs = False
    
    def __init__(self, dataset):
        """
        Initialize the environment with a dataset of advertising metrics.
        
        Args:
            dataset: pandas.DataFrame containing advertising metrics
        """
        # Initialize base class before any protected attribute access
        super().__init__()
        
        # Store dataset and define feature space
        self.dataset = dataset
        self.feature_columns = [
            "competitiveness", "difficulty_score", "organic_rank", 
            "organic_clicks", "organic_ctr", "paid_clicks", "paid_ctr", 
            "ad_spend", "ad_conversions", "ad_roas", "conversion_rate", 
            "cost_per_click", "cost_per_acquisition", "impression_share"
        ]
        self.num_features = len(self.feature_columns)
        
        # Define action space: binary decision (decrease/increase bid)
        self.action_spec = OneHot(n=2, dtype=torch.int64)
        
        # Initialize episode state variables
        self.episode_step = 0
        self.max_episode_steps = 100
        self.current_sample = None
        self.rng = None
        
        # Initialize random generator
        self._set_seed(None)
        
        # Reset environment to initialize observation space
        self.reset()

    def _reset(self, tensordict=None):
        """
        Reset the environment to an initial state.
        """
        sample = self.dataset.sample(1)
        state = torch.tensor(sample[self.feature_columns].values, dtype=torch.float32).squeeze()
        
        # Ensure observation is set
        tensordict = TensorDict({
            "observation": state,  # Ensure observation key exists
            "step_count": torch.tensor(self.episode_step, dtype=torch.int64),
            "reward": torch.tensor(0.0, dtype=torch.float32),  # Initial reward is zero
        }, batch_size=[])

        return tensordict

    def _step(self, tensordict):
        """
        Execute one step in the environment.
        """
        action = tensordict["action"].argmax(dim=-1).item() if "action" in tensordict else 0
        
        next_sample = self.dataset.sample(1)
        next_state = torch.tensor(next_sample[self.feature_columns].values, dtype=torch.float32).squeeze()
        
        reward = self._compute_reward(action, next_sample)
        
        self.episode_step += 1
        done = self.episode_step >= self.max_episode_steps
        
        step_td = TensorDict({
            "observation": next_state,  # Ensure observation key exists
            "reward": torch.tensor(reward, dtype=torch.float32),
            "done": torch.tensor(done, dtype=torch.bool),
            "terminated": torch.tensor(done, dtype=torch.bool),
            "step_count": torch.tensor(self.episode_step, dtype=torch.int64),
            "next": TensorDict({
                "observation": next_state.clone(),  # Ensure next observation exists
                "reward": torch.tensor(reward, dtype=torch.float32),
                "done": torch.tensor(done, dtype=torch.bool),
                "terminated": torch.tensor(done, dtype=torch.bool),
            }, batch_size=[])
        }, batch_size=[])

        return step_td

    
    def _compute_reward(self, action, sample):
        """
        Compute reward based on a nuanced digital advertising strategy.
        
        Args:
            action: Binary action (0: decrease bid, 1: increase bid)
            sample: Dataframe row with advertising metrics
            
        Returns:
            float: Computed reward value
        """
        # Extract relevant metrics
        roas = sample["ad_roas"].values[0]
        cpa = sample["cost_per_acquisition"].values[0]
        ad_spend = sample["ad_spend"].values[0]
        ctr = sample["paid_ctr"].values[0]
        conv_rate = sample["conversion_rate"].values[0]
        
        # Conservative action (decrease bid/budget)
        if action == 0:
            # Good when: high CPA, low ROAS, or acceptable performance with high spend
            if cpa > 200 or roas < 1.2:
                return 1.0  # Good decision to be conservative
            elif ad_spend > 5000 and roas < 2.0:
                return 0.5  # Somewhat good decision (high spend, moderate ROAS)
            else:
                return -1.0  # Bad decision (missed opportunity)
        
        # Aggressive action (increase bid/budget)
        else:  # action == 1
            # Good when: low CPA, high ROAS, high CTR, good conversion rate
            if roas > 2.5 or (ctr > 0.15 and conv_rate > 0.1):
                return 1.5  # Very good decision
            elif roas > 1.5 and cpa < 100:
                return 1.0  # Good decision
            elif roas > 1.0:
                return 0.2  # Acceptable decision
            else:
                return -1.5  # Very bad decision (wasting money)

    def _set_seed(self, seed: Optional[int]) -> torch.Generator:
        """
        Set random seed for environment reproducibility.
        
        Args:
            seed: Random seed value
            
        Returns:
            torch.Generator: Seeded random number generator
        """
        if seed is not None:
            # Set all relevant random seeds
            rng = torch.manual_seed(seed)
            random.seed(seed)
            np.random.seed(seed)
            self.rng = torch.Generator().manual_seed(seed)
            return self.rng
        else:
            self.rng = torch.Generator()
            return self.rng

In [6]:
def train_ad_optimization_agent(dataset, num_iterations=500, batch_size=100, sample_size=128, log_interval=10):
    """
    Train an RL agent for digital ad optimization with fixed tensor structures.
    
    Args:
        dataset: DataFrame with advertising metrics
        num_iterations: Number of training iterations
        batch_size: Number of environment interactions per batch
        sample_size: Number of samples from replay buffer per update
        log_interval: Interval for logging training progress
        
    Returns:
        tuple: (trained_policy, training_metrics, environment)
    """
    # Initialize Environment
    env = AdOptimizationEnv(dataset)
    state_dim = env.num_features
    action_dim = env.action_spec.n
    
    # Create neural network value function
    value_mlp = MLP(
        in_features=state_dim, 
        out_features=action_dim, 
        num_cells=[128, 64],
        activation_class=nn.ReLU
    )
    
    # Explicit tensor key mapping for value network
    value_net = TensorDictModule(
        value_mlp, 
        in_keys=["observation"], 
        out_keys=["action_value"]
    )
    
    # Create Q-value module with explicit key configuration
    # Note: QValueModule requires careful configuration of out_keys parameter
    # The error occurs because QValueModule tries to access out_keys[1] when it's not provided
    policy = TensorDictSequential(
        value_net,
        QValueModule(
            action_value_key="action_value",
            out_keys=["action", "action_value", "chosen_action_value"],
            spec=env.action_spec  # Ensures the policy has an action space
        )
    )

    # Explicitly set spec to policy to avoid "action_space" retrieval error
    policy.spec = env.action_spec  # Fixes the issue
    
    # Exploration strategy with epsilon annealing
    exploration_module = EGreedyModule(
        env.action_spec, 
        annealing_num_steps=10000, 
        eps_init=0.9, 
        eps_end=0.05,
        action_key="action"
    )
    
    # Complete policy with exploration
    policy_explore = TensorDictSequential(policy, exploration_module)
    
    # Data collection setup
    init_rand_steps = 1000
    frames_per_batch = batch_size
    optim_steps = 5
    
    # Create data collector
    collector = SyncDataCollector(
        env,
        policy_explore,
        frames_per_batch=frames_per_batch,
        total_frames=-1,
        init_random_frames=init_rand_steps,
        device="cpu",
        storing_device="cpu",
        return_same_td=True,
        split_trajs=False
    )
    
    # Create replay buffer for experience replay
    rb = ReplayBuffer(storage=LazyTensorStorage(max_size=20000))
    
    # Use DQNLoss with standard parameter interface
    loss = RobustDQNLoss(
        value_network=policy, 
        action_space=env.action_spec, 
        delay_value=True
    )
    
    # Optimizer and target network updater
    optim = Adam(loss.parameters(), lr=0.001)
    updater = SoftUpdate(loss, eps=0.95)
    
    # Training metrics tracking
    total_frames = 0
    rewards = []
    avg_losses = []
    epsilon_values = []
    
    print("Starting training...")
    start_time = time.time()
    
    # Training loop
    for i, data in enumerate(collector):
        if i >= num_iterations:
            break
            
        # Validate and fix tensor dictionary structure
        ensure_valid_td_structure(data)
        
        # Track rewards
        if "reward" in data:
            reward = data["reward"].sum().item() / frames_per_batch
            rewards.append(reward)
        else:
            rewards.append(0.0)
        
        # Track exploration rate
        epsilon_values.append(exploration_module.eps.item())
        
        # Store in replay buffer - make a deep copy to avoid reference issues
        rb.extend(data.clone())
        
        batch_losses = []
        if len(rb) > init_rand_steps:
            # Optimization steps
            for _ in range(optim_steps):
                # Sample from replay buffer
                sample = rb.sample(sample_size)
                
                # Ensure proper structure in the sample
                ensure_valid_td_structure(sample)
                
                # Compute loss and update
                loss_vals = loss(sample)
                
                if "loss" in loss_vals:
                    loss_val = loss_vals["loss"].item()
                    batch_losses.append(loss_val)
                    
                    loss_vals["loss"].backward()
                    optim.step()
                    optim.zero_grad()
                else:
                    print("Warning: Loss computation did not return 'loss' key")
                
                # Update exploration and target network
                exploration_module.step(frames_per_batch)
                updater.step()
            
            if batch_losses:
                avg_losses.append(np.mean(batch_losses))
        
        total_frames += frames_per_batch
        
        # Print progress
        if i % log_interval == 0:
            elapsed = time.time() - start_time
            print(f"Iteration: {i}/{num_iterations}, Frames: {total_frames}, "
                  f"Avg Reward: {np.mean(rewards[-10:]) if rewards else 0:.4f}, "
                  f"Epsilon: {exploration_module.eps.item():.4f}, "
                  f"Time: {elapsed:.1f}s")
    
    # Create clean policy without exploration for evaluation
    eval_policy = TensorDictSequential(value_net, QValueModule(spec=env.action_spec))
    eval_policy.load_state_dict(policy.state_dict())
    
    # Compile training metrics
    training_metrics = {
        "rewards": rewards,
        "losses": avg_losses,
        "epsilon_values": epsilon_values,
        "total_frames": total_frames,
        "training_time": time.time() - start_time
    }
    
    return eval_policy, training_metrics, env

In [7]:
def ensure_valid_td_structure(td):
    """
    Ensure tensor dictionary has valid structure for DQN learning.
    """
    # Ensure 'observation' key exists
    if "observation" not in td:
        print("Warning: Missing 'observation' key in tensor dictionary")
        td["observation"] = torch.zeros(td.batch_size, dtype=torch.float32)
        
    # Ensure reward structure
    if "reward" not in td:
        if "next" in td and "reward" in td["next"]:
            td["reward"] = td["next", "reward"].clone()
        else:
            td["reward"] = torch.zeros(td.batch_size, dtype=torch.float32)
            
    # Ensure 'next' exists
    if "next" not in td:
        td["next"] = TensorDict({}, batch_size=td.batch_size)
        
    # Ensure 'next' contains 'observation'
    if "observation" not in td["next"]:
        td["next", "observation"] = torch.zeros(td.batch_size, dtype=torch.float32)

In [8]:
def evaluate_policy(policy, env, num_episodes=100):
    """
    Evaluate a trained policy without exploration.
    
    Args:
        policy: Trained policy without exploration module
        env: Environment to evaluate on
        num_episodes: Number of episodes to evaluate
        
    Returns:
        dict: Dictionary containing evaluation metrics
    """
    total_reward = 0
    episode_lengths = []
    action_counts = {0: 0, 1: 0}  # 0: conservative, 1: aggressive
    decisions = []
    rewards = []
    states = []
    
    for episode in range(num_episodes):
        done = False
        episode_reward = 0
        steps = 0
        
        # Reset environment
        tensordict = env.reset()
        
        while not done:
            # Get action from policy
            with torch.no_grad():
                action_td = policy(tensordict)
                action = action_td["action"].argmax(dim=-1).item()
            
            # Record state and action
            states.append(tensordict["observation"].numpy())
            action_counts[action] += 1
            
            # Step environment
            next_td = env.step(action_td)
            reward = next_td["reward"].item()
            done = next_td["done"].item()
            
            # Update for next step
            tensordict = next_td
            
            # Record results
            decisions.append((action, reward))
            rewards.append(reward)
            episode_reward += reward
            steps += 1
        
        total_reward += episode_reward
        episode_lengths.append(steps)
    
    # Calculate metrics
    avg_reward = total_reward / num_episodes
    avg_episode_length = np.mean(episode_lengths)
    action_distribution = {k: v / sum(action_counts.values()) for k, v in action_counts.items()}
    
    # Calculate success rate (positive reward decisions)
    correct_decisions = sum(1 for a, r in decisions if r > 0)
    success_rate = correct_decisions / len(decisions) if decisions else 0
    
    # Calculate metrics by action type
    conservative_rewards = [r for (a, r) in decisions if a == 0]
    aggressive_rewards = [r for (a, r) in decisions if a == 1]
    
    avg_conservative_reward = np.mean(conservative_rewards) if conservative_rewards else 0
    avg_aggressive_reward = np.mean(aggressive_rewards) if aggressive_rewards else 0
    
    # Compile all metrics
    metrics = {
        "avg_reward": avg_reward,
        "avg_episode_length": avg_episode_length,
        "action_distribution": action_distribution,
        "success_rate": success_rate,
        "avg_conservative_reward": avg_conservative_reward,
        "avg_aggressive_reward": avg_aggressive_reward,
        "states": np.array(states) if states else np.array([]),
        "decisions": decisions,
        "rewards": rewards
    }
    
    return metrics

In [9]:
def visualize_evaluation(metrics, feature_columns, output_dir="plots"):
    """
    Create visualizations for evaluation metrics.
    
    Args:
        metrics: Dictionary of evaluation metrics from evaluate_policy
        feature_columns: List of feature column names
        output_dir: Directory to save plots
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Set visualization style
    sns.set(style="whitegrid")
    
    # Create multi-panel figure
    fig = plt.figure(figsize=(18, 12))
    fig.suptitle("Ad Optimization RL Agent Evaluation", fontsize=16)
    
    # 1. Action Distribution
    ax1 = fig.add_subplot(2, 3, 1)
    actions = ["Conservative", "Aggressive"]
    frequencies = list(metrics["action_distribution"].values())
    ax1.bar(actions, frequencies, color=["skyblue", "coral"])
    ax1.set_title("Action Distribution")
    ax1.set_ylabel("Frequency")
    
    # 2. Reward by Action Type
    ax2 = fig.add_subplot(2, 3, 2)
    ax2.bar(["Conservative", "Aggressive"], 
            [metrics["avg_conservative_reward"], metrics["avg_aggressive_reward"]], 
            color=["skyblue", "coral"])
    ax2.set_title("Average Reward by Action Type")
    ax2.set_ylabel("Average Reward")
    
    # 3. Feature Importance (using action correlation)
    states = metrics["states"]
    decisions = np.array([a for a, _ in metrics["decisions"]])
    
    # Calculate correlation between features and actions
    correlations = []
    for i in range(states.shape[1]):
        if states.size > 0 and decisions.size > 0:
            corr = np.corrcoef(states[:, i], decisions)[0, 1]
            correlations.append(corr)
        else:
            correlations.append(0)
    
    # Plot feature importance
    ax3 = fig.add_subplot(2, 3, 3)
    feature_importance = pd.Series(correlations, index=feature_columns)
    feature_importance.sort_values(ascending=False).plot(kind='bar', ax=ax3)
    ax3.set_title("Feature Correlation with Actions")
    ax3.set_ylabel("Correlation Coefficient")
    plt.xticks(rotation=90)
    
    # 4. Reward Distribution
    ax4 = fig.add_subplot(2, 3, 4)
    sns.histplot(metrics["rewards"], kde=True, ax=ax4)
    ax4.set_title("Reward Distribution")
    ax4.set_xlabel("Reward")
    ax4.set_ylabel("Frequency")
    
    # 5. Decision Quality Matrix (Confusion Matrix style)
    ax5 = fig.add_subplot(2, 3, 5)
    decision_quality = np.zeros((2, 2))
    
    for action, reward in metrics["decisions"]:
        quality = 1 if reward > 0 else 0
        decision_quality[action, quality] += 1
    
    # Normalize by row to prevent division by zero
    row_sums = decision_quality.sum(axis=1, keepdims=True)
    # Handle cases where row sum is zero
    row_sums = np.where(row_sums == 0, 1, row_sums)
    decision_quality_norm = decision_quality / row_sums
    
    sns.heatmap(decision_quality_norm, annot=True, fmt=".2f", cmap="YlGnBu",
                xticklabels=["Poor", "Good"], 
                yticklabels=["Conservative", "Aggressive"],
                ax=ax5)
    ax5.set_title("Decision Quality Matrix")
    ax5.set_ylabel("Action")
    ax5.set_xlabel("Decision Quality")
    
    # 6. Feature Distribution by Action
    ax6 = fig.add_subplot(2, 3, 6)
    
    # Proceed only if we have sufficient data
    if states.size > 0 and decisions.size > 0:
        # Choose the most important feature based on correlation
        important_feature_idx = np.abs(correlations).argmax()
        important_feature_name = feature_columns[important_feature_idx]
        
        # Split the feature values by action
        feature_by_action = {
            "Conservative": states[decisions == 0, important_feature_idx],
            "Aggressive": states[decisions == 1, important_feature_idx]
        }
        
        # Plot only if both actions have some data
        if len(feature_by_action["Conservative"]) > 0 and len(feature_by_action["Aggressive"]) > 0:
            sns.kdeplot(data=feature_by_action, common_norm=False, ax=ax6)
            ax6.set_title(f"Distribution of {important_feature_name} by Action")
            ax6.set_xlabel(important_feature_name)
            ax6.set_ylabel("Density")
        else:
            ax6.text(0.5, 0.5, "Insufficient data for KDE plot", 
                    horizontalalignment='center', verticalalignment='center')
    else:
        ax6.text(0.5, 0.5, "No data available for feature distribution", 
                horizontalalignment='center', verticalalignment='center')
    
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/agent_evaluation.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

In [10]:
def visualize_training_progress(training_metrics, output_dir="plots", window_size=20):
    """
    Visualize the training progress based on collected metrics.
    
    Args:
        training_metrics: Dictionary of training metrics
        output_dir: Directory to save plots
        window_size: Window size for smoothing
    """
    os.makedirs(output_dir, exist_ok=True)
    rewards = training_metrics["rewards"]
    losses = training_metrics["losses"]
    epsilons = training_metrics["epsilon_values"]
    
    # Create figure with 3 subplots
    fig, axes = plt.subplots(3, 1, figsize=(12, 15), sharex=True)
    fig.suptitle("RL Training Progress", fontsize=16)
    
    # 1. Plot rewards
    axes[0].plot(rewards, alpha=0.3, color='blue', label="Episode Rewards")
    
    # Add smoothed rewards
    if len(rewards) >= window_size:
        smoothed_rewards = []
        for i in range(len(rewards) - window_size + 1):
            smoothed_rewards.append(np.mean(rewards[i:i+window_size]))
        axes[0].plot(range(window_size-1, len(rewards)), smoothed_rewards, 
                   color='red', linewidth=2, label=f"Moving Average ({window_size})")
    
    axes[0].set_ylabel("Average Reward")
    axes[0].set_title("Training Rewards")
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    
    # 2. Plot losses
    if losses:
        axes[1].plot(losses, color='purple', alpha=0.5, label="Training Loss")
        
        # Add smoothed losses
        if len(losses) >= window_size:
            smoothed_losses = []
            for i in range(len(losses) - window_size + 1):
                smoothed_losses.append(np.mean(losses[i:i+window_size]))
            axes[1].plot(range(window_size-1, len(losses)), smoothed_losses, 
                       color='darkred', linewidth=2, label=f"Moving Average ({window_size})")
        
        axes[1].set_ylabel("Loss")
        axes[1].set_title("Training Loss")
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
    
    # 3. Plot exploration rate
    axes[2].plot(epsilons, color='green', label="Exploration Rate (ε)")
    axes[2].set_ylim(0, 1)
    axes[2].set_xlabel("Episodes")
    axes[2].set_ylabel("Epsilon (ε)")
    axes[2].set_title("Exploration Rate")
    axes[2].legend()
    axes[2].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(f"{output_dir}/training_progress.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

In [11]:
def analyze_ad_strategy(policy, dataset, feature_columns, output_dir="plots"):
    """
    Analyze the learned advertising strategy for different input scenarios.
    
    Args:
        policy: The trained policy without exploration module
        dataset: The dataset containing ad metrics
        feature_columns: List of feature column names
        output_dir: Directory to save plots
    
    Returns:
        pd.DataFrame: DataFrame with strategy analysis for different scenarios
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Prepare scenarios
    scenarios = [
        {"name": "High ROAS, Low CPA", 
         "filters": {"ad_roas": lambda x: x > 3.0, "cost_per_acquisition": lambda x: x < 50}},
        {"name": "Low ROAS, High CPA", 
         "filters": {"ad_roas": lambda x: x < 1.0, "cost_per_acquisition": lambda x: x > 200}},
        {"name": "High CTR", 
         "filters": {"paid_ctr": lambda x: x > 0.15}},
        {"name": "Low CTR", 
         "filters": {"paid_ctr": lambda x: x < 0.05}},
        {"name": "High Ad Spend", 
         "filters": {"ad_spend": lambda x: x > 5000}},
        {"name": "Low Ad Spend", 
         "filters": {"ad_spend": lambda x: x < 500}},
    ]
    
    results = []
    
    for scenario in scenarios:
        # Filter dataset
        filtered_df = dataset.copy()
        for col, condition in scenario["filters"].items():
            filtered_df = filtered_df[condition(filtered_df[col])]
        
        if len(filtered_df) < 10:
            print(f"Not enough data for scenario: {scenario['name']}")
            continue
        
        # Sample from filtered dataset
        samples = filtered_df.sample(min(100, len(filtered_df)))
        
        # Get policy decisions
        actions = []
        for _, row in samples.iterrows():
            state = torch.tensor(row[feature_columns].values, dtype=torch.float32)
            tensordict = TensorDict({"observation": state}, batch_size=[])
            
            with torch.no_grad():
                action_td = policy(tensordict)
                action = action_td["action"].argmax(dim=-1).item()
            actions.append(action)
        
        # Calculate action distribution
        conservative_ratio = actions.count(0) / len(actions) if actions else 0
        aggressive_ratio = actions.count(1) / len(actions) if actions else 0
        
        # Store results
        results.append({
            "scenario": scenario["name"],
            "sample_size": len(samples),
            "conservative_ratio": conservative_ratio,
            "aggressive_ratio": aggressive_ratio
        })
    
    # Visualize scenario analysis if we have results
    if results:
        plt.figure(figsize=(12, 6))
        scenarios = [r["scenario"] for r in results]
        conservative = [r["conservative_ratio"] * 100 for r in results]
        aggressive = [r["aggressive_ratio"] * 100 for r in results]
        
        x = range(len(scenarios))
        width = 0.35
        
        plt.bar([i - width/2 for i in x], conservative, width, label='Conservative (Decrease)', color='skyblue')
        plt.bar([i + width/2 for i in x], aggressive, width, label='Aggressive (Increase)', color='coral')
        
        plt.xlabel('Scenario')
        plt.ylabel('Action Distribution (%)')
        plt.title('Ad Strategy by Scenario')
        plt.xticks(x, scenarios, rotation=45, ha='right')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"{output_dir}/strategy_analysis.png", dpi=300, bbox_inches="tight")
        plt.close()
    
    # Return the results
    return pd.DataFrame(results)

In [12]:
def feature_importance_analysis(policy, env, feature_columns, num_samples=1000, output_dir="plots"):
    """
    Analyze feature importance using perturbation analysis.
    
    Args:
        policy: The trained policy
        env: The environment
        feature_columns: List of feature column names
        num_samples: Number of samples to use for analysis
        output_dir: Directory to save plots
    
    Returns:
        pd.DataFrame: DataFrame with feature importance scores
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate samples
    samples = []
    for _ in range(num_samples):
        tensordict = env.reset()
        if "observation" in tensordict:
            samples.append(tensordict["observation"].clone())
    
    if not samples:
        print("Warning: No valid samples collected for feature importance analysis")
        return pd.DataFrame({"Feature": feature_columns, "Importance": [0] * len(feature_columns)})
    
    samples = torch.stack(samples)
    
    # Calculate baseline predictions
    with torch.no_grad():
        baseline_actions = []
        for i in range(num_samples):
            tensordict = TensorDict({"observation": samples[i]}, batch_size=[])
            action_td = policy(tensordict)
            action = action_td["action"].argmax(dim=-1).item()
            baseline_actions.append(action)
    
    # Calculate importance for each feature
    importance_scores = []
    
    for feature_idx in range(samples.shape[1]):
        # Create perturbed samples
        perturbed_samples = samples.clone()
        
        # Permute the feature
        permutation = torch.randperm(num_samples)
        perturbed_samples[:, feature_idx] = perturbed_samples[permutation, feature_idx]
        
        # Get predictions on perturbed data
        perturbed_actions = []
        with torch.no_grad():
            for i in range(num_samples):
                tensordict = TensorDict({"observation": perturbed_samples[i]}, batch_size=[])
                action_td = policy(tensordict)
                action = action_td["action"].argmax(dim=-1).item()
                perturbed_actions.append(action)
        
        # Calculate importance score (change in predictions)
        disagreement = sum(b != p for b, p in zip(baseline_actions, perturbed_actions))
        importance = disagreement / num_samples
        
        importance_scores.append(importance)
    
    # Create DataFrame
    importance_df = pd.DataFrame({
        'Feature': feature_columns,
        'Importance': importance_scores
    })
    
    # Visualize
    plt.figure(figsize=(10, 8))
    importance_df = importance_df.sort_values('Importance', ascending=False)
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importance Analysis')
    plt.xlabel('Importance Score (Higher = More Important)')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/feature_importance.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    return importance_df

In [15]:
# Main execution function
def run_complete_ad_optimization_pipeline(num_samples=5000, 
                                         train_iterations=300, 
                                         eval_episodes=100,
                                         output_dir="ad_optimization_results"):
    """
    Run the complete digital advertising optimization pipeline:
    1. Generate synthetic data
    2. Train RL agent
    3. Evaluate agent
    4. Analyze strategy and feature importance
    
    Args:
        num_samples: Number of synthetic data samples to generate
        train_iterations: Number of training iterations
        eval_episodes: Number of episodes for evaluation
        output_dir: Directory to save results and plots
        
    Returns:
        dict: Dictionary containing all results and models
    """
    # Create output directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = f"{output_dir}_{timestamp}"
    os.makedirs(run_dir, exist_ok=True)
    plot_dir = f"{run_dir}/plots"
    os.makedirs(plot_dir, exist_ok=True)
    
    print(f"Starting digital advertising optimization pipeline...")
    print(f"Results will be saved to: {run_dir}")
    
    # Set seeds for reproducibility
    set_all_seeds(42)
    
    # 1. Generate synthetic data
    print(f"Generating synthetic dataset with {num_samples} samples...")
    dataset = generate_synthetic_data(num_samples)
    
    # Save dataset
    dataset.to_csv(f"{run_dir}/synthetic_ad_data.csv", index=False)
    print(f"Synthetic dataset saved to {run_dir}/synthetic_ad_data.csv")
    
    # 2. Train RL agent
    print(f"Training RL agent for {train_iterations} iterations...")
    policy, training_metrics, env = train_ad_optimization_agent(
        dataset, 
        num_iterations=train_iterations, 
        batch_size=100, 
        sample_size=128
    )


In [None]:
# Main execution function
def run_complete_ad_optimization_pipeline(num_samples=5000, 
                                         train_iterations=300, 
                                         eval_episodes=100,
                                         output_dir="ad_optimization_results"):
    """
    Run the complete digital advertising optimization pipeline:
    1. Generate synthetic data
    2. Train RL agent
    3. Evaluate agent
    4. Analyze strategy and feature importance
    
    Args:
        num_samples: Number of synthetic data samples to generate
        train_iterations: Number of training iterations
        eval_episodes: Number of episodes for evaluation
        output_dir: Directory to save results and plots
        
    Returns:
        dict: Dictionary containing all results and models
    """
    # Create output directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = f"{output_dir}_{timestamp}"
    os.makedirs(run_dir, exist_ok=True)
    plot_dir = f"{run_dir}/plots"
    os.makedirs(plot_dir, exist_ok=True)
    
    print(f"Starting digital advertising optimization pipeline...")
    print(f"Results will be saved to: {run_dir}")
    
    # Set seeds for reproducibility
    set_all_seeds(42)
    
    # 1. Generate synthetic data
    print(f"Generating synthetic dataset with {num_samples} samples...")
    dataset = generate_synthetic_data(num_samples)
    
    # Save dataset
    dataset.to_csv(f"{run_dir}/synthetic_ad_data.csv", index=False)
    print(f"Synthetic dataset saved to {run_dir}/synthetic_ad_data.csv")
    
    # 2. Train RL agent
    print(f"Training RL agent for {train_iterations} iterations...")
    policy, training_metrics, env = train_ad_optimization_agent(
        dataset, 
        num_iterations=train_iterations, 
        batch_size=100, 
        sample_size=128
    )
    
    # Save model
    torch.save(policy.state_dict(), f"{run_dir}/ad_optimization_policy.pt")
    print(f"Trained model saved to {run_dir}/ad_optimization_policy.pt")
    
    # 3. Visualize training progress
    print("Visualizing training progress...")
    visualize_training_progress(training_metrics, output_dir=plot_dir)
    
    # 4. Evaluate agent
    print(f"Evaluating agent over {eval_episodes} episodes...")
    eval_metrics = evaluate_policy(policy, env, num_episodes=eval_episodes)
    
    # Save evaluation metrics
    with open(f"{run_dir}/evaluation_metrics.txt", "w") as f:
        f.write(f"Average Reward: {eval_metrics['avg_reward']:.4f}\n")
        f.write(f"Success Rate: {eval_metrics['success_rate']:.4f}\n")
        f.write(f"Action Distribution: Conservative: {eval_metrics['action_distribution'].get(0, 0):.2f}, " + 
                f"Aggressive: {eval_metrics['action_distribution'].get(1, 0):.2f}\n")
        f.write(f"Average Conservative Reward: {eval_metrics['avg_conservative_reward']:.4f}\n")
        f.write(f"Average Aggressive Reward: {eval_metrics['avg_aggressive_reward']:.4f}\n")
    
    # 5. Visualize evaluation
    print("Visualizing evaluation results...")
    visualize_evaluation(eval_metrics, env.feature_columns, output_dir=plot_dir)
    
    # 6. Analyze strategy across scenarios
    print("Analyzing strategy across different scenarios...")
    strategy_analysis = analyze_ad_strategy(policy, dataset, env.feature_columns, output_dir=plot_dir)
    strategy_analysis.to_csv(f"{run_dir}/strategy_analysis.csv", index=False)
    
    # 7. Analyze feature importance
    print("Analyzing feature importance...")
    importance_df = feature_importance_analysis(policy, env, env.feature_columns, 
                                              num_samples=1000, output_dir=plot_dir)
    importance_df.to_csv(f"{run_dir}/feature_importance.csv", index=False)
    
    print(f"Pipeline completed successfully. All results saved to {run_dir}")
    
    return {
        "policy": policy,
        "env": env,
        "dataset": dataset,
        "training_metrics": training_metrics,
        "eval_metrics": eval_metrics,
        "strategy_analysis": strategy_analysis,
        "feature_importance": importance_df,
        "output_dir": run_dir
    }


# Execute if run as a script
if __name__ == "__main__":
    results = run_complete_ad_optimization_pipeline(
        num_samples=5000,
        train_iterations=300,
        eval_episodes=100
    )

In [None]:
#-----

In [16]:
results = run_complete_ad_optimization_pipeline(
        num_samples=5000,
        train_iterations=300,
        eval_episodes=100
    )

Starting digital advertising optimization pipeline...
Results will be saved to: ad_optimization_results_20250307_170304
Generating synthetic dataset with 5000 samples...
Synthetic dataset saved to ad_optimization_results_20250307_170304/synthetic_ad_data.csv
Training RL agent for 300 iterations...
Starting training...




Iteration: 0/300, Frames: 100, Avg Reward: 0.0000, Epsilon: 0.9000, Time: 0.8s


  new_dest = torch.stack(


RuntimeError: shape mismatch: value tensor of shape [100, 14] cannot be broadcast to indexing result of shape [100]

In [None]:
def train_ad_optimization_agent(dataset, num_iterations=500, batch_size=100, sample_size=128, log_interval=10):
    """
    Train an RL agent for digital ad optimization with fixed tensor structures.
    
    Args:
        dataset: DataFrame with advertising metrics
        num_iterations: Number of training iterations
        batch_size: Number of environment interactions per batch
        sample_size: Number of samples from replay buffer per update
        log_interval: Interval for logging training progress
        
    Returns:
        tuple: (trained_policy, training_metrics, environment)
    """
    # Initialize Environment
    env = AdOptimizationEnv(dataset)
    state_dim = env.num_features
    action_dim = env.action_spec.n
    
    # Create neural network value function
    value_mlp = MLP(
        in_features=state_dim, 
        out_features=action_dim, 
        num_cells=[128, 64],
        activation_class=nn.ReLU
    )
    
    # Explicit tensor key mapping for value network
    value_net = TensorDictModule(
        value_mlp, 
        in_keys=["observation"], 
        out_keys=["action_value"]
    )
    
    # Create Q-value module with explicit key configuration
    # Note: QValueModule requires careful configuration of out_keys parameter
    # The error occurs because QValueModule tries to access out_keys[1] when it's not provided
    policy = TensorDictSequential(
        value_net,
        QValueModule(
            action_value_key="action_value",
            out_keys=["action", "action_value", "chosen_action_value"],
            spec=env.action_spec  # Ensures the policy has an action space
        )
    )

    # Explicitly set spec to policy to avoid "action_space" retrieval error
    policy.spec = env.action_spec  # Fixes the issue
    
    # Exploration strategy with epsilon annealing
    exploration_module = EGreedyModule(
        env.action_spec, 
        annealing_num_steps=10000, 
        eps_init=0.9, 
        eps_end=0.05,
        action_key="action"
    )
    
    # Complete policy with exploration
    policy_explore = TensorDictSequential(policy, exploration_module)
    
    # Data collection setup
    init_rand_steps = 1000
    frames_per_batch = batch_size
    optim_steps = 5
    
    # Create data collector
    collector = SyncDataCollector(
        env,
        policy_explore,
        frames_per_batch=frames_per_batch,
        total_frames=-1,
        init_random_frames=init_rand_steps,
        device="cpu",
        storing_device="cpu",
        return_same_td=True,
        split_trajs=False
    )
    
    # Create replay buffer for experience replay
    rb = ReplayBuffer(storage=LazyTensorStorage(max_size=20000))
    
    # Use DQNLoss with standard parameter interface
    loss = RobustDQNLoss(
        value_network=policy, 
        action_space=env.action_spec, 
        delay_value=True
    )
    
    # Optimizer and target network updater
    optim = Adam(loss.parameters(), lr=0.001)
    updater = SoftUpdate(loss, eps=0.95)
    
    # Training metrics tracking
    total_frames = 0
    rewards = []
    avg_losses = []
    epsilon_values = []
    
    print("Starting training...")
    start_time = time.time()
    
    # Training loop
    for i, data in enumerate(collector):
        if i >= num_iterations:
            break
            
        # Validate and fix tensor dictionary structure
        ensure_valid_td_structure(data)
        
        # Track rewards
        if "reward" in data:
            reward = data["reward"].sum().item() / frames_per_batch
            rewards.append(reward)
        else:
            rewards.append(0.0)
        
        # Track exploration rate
        epsilon_values.append(exploration_module.eps.item())
        
        # Store in replay buffer - make a deep copy to avoid reference issues
        rb.extend(data.clone())
        
        batch_losses = []
        if len(rb) > init_rand_steps:
            # Optimization steps
            for _ in range(optim_steps):
                # Sample from replay buffer
                sample = rb.sample(sample_size)
                
                # Ensure proper structure in the sample
                ensure_valid_td_structure(sample)
                
                # Compute loss and update
                loss_vals = loss(sample)
                
                if "loss" in loss_vals:
                    loss_val = loss_vals["loss"].item()
                    batch_losses.append(loss_val)
                    
                    loss_vals["loss"].backward()
                    optim.step()
                    optim.zero_grad()
                else:
                    print("Warning: Loss computation did not return 'loss' key")
                
                # Update exploration and target network
                exploration_module.step(frames_per_batch)
                updater.step()
            
            if batch_losses:
                avg_losses.append(np.mean(batch_losses))
        
        total_frames += frames_per_batch
        
        # Print progress
        if i % log_interval == 0:
            elapsed = time.time() - start_time
            print(f"Iteration: {i}/{num_iterations}, Frames: {total_frames}, "
                  f"Avg Reward: {np.mean(rewards[-10:]) if rewards else 0:.4f}, "
                  f"Epsilon: {exploration_module.eps.item():.4f}, "
                  f"Time: {elapsed:.1f}s")
    
    # Create clean policy without exploration for evaluation
    eval_policy = TensorDictSequential(value_net, QValueModule(spec=env.action_spec))
    eval_policy.load_state_dict(policy.state_dict())
    
    # Compile training metrics
    training_metrics = {
        "rewards": rewards,
        "losses": avg_losses,
        "epsilon_values": epsilon_values,
        "total_frames": total_frames,
        "training_time": time.time() - start_time
    }
    
    return eval_policy, training_metrics, env