In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the local modules imported below
# (runner, agents, ...) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from pathlib import Path
from runner import Runner
import interest_evolution
from full_slate_q_agent import FullSlateQAgent
from ppo  import PPOAgentWrapper
from bandit import BanditAgentWrapper

In [3]:
def create_dqn_agent(env, eval_mode=False, summary_writer=None):
    """Build a FullSlateQAgent for the given environment.

    Args:
        env: Environment object exposing `observation_space` and `action_space`.
        eval_mode: Passed through unchanged as the agent's `eval_mode` argument.
        summary_writer: Passed through unchanged as the agent's
            `summary_writer` argument.

    Returns:
        The newly constructed FullSlateQAgent.
    """
    # Collect the constructor arguments first so the call site stays short.
    agent_kwargs = {
        'observation_space': env.observation_space,
        'action_space': env.action_space,
        'eval_mode': eval_mode,
        'summary_writer': summary_writer,
    }
    return FullSlateQAgent(**agent_kwargs)

In [4]:
# One seed drives both NumPy's global RNG and the environment configuration.
seed = 0
np.random.seed(seed)

# Settings passed to interest_evolution.create_environment by the experiment cells.
env_config = dict(
    num_candidates=10,
    slate_size=2,
    resample_documents=True,
    seed=seed,
)

In [5]:
import shutil
import os
from pathlib import Path

# Directory handed to the Runner as `base_dir` for the DQN experiment.
tmp_base_dir = './logs/dqn'
train_log_dir, eval_log_dir = [
    os.path.join(tmp_base_dir, name) for name in ('train', 'eval')
]

# Start clean: remove the train/eval sub-directories if an earlier run left them.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory itself exists (no-op when it already does).
Path(tmp_base_dir).mkdir(parents=True, exist_ok=True)

# Runner wired to the DQN agent factory and a freshly created environment.
runner_dqn = Runner(
    base_dir=tmp_base_dir,
    create_agent_fn=create_dqn_agent,
    env=interest_evolution.create_environment(env_config),
)

# Train first, then evaluate.
runner_dqn.run_training(max_training_steps=100, num_iterations=10)
runner_dqn.run_evaluation(max_eval_episodes=5)


[TRAIN][Step 165] AvgLen: 82.50 | AvgRew: 148.00 | StdRew: 8.00 | Time/Step: 0.0055
[TRAIN][Step 309] AvgLen: 72.00 | AvgRew: 164.78 | StdRew: 1.03 | Time/Step: 0.0028
[TRAIN][Step 491] AvgLen: 91.00 | AvgRew: 147.45 | StdRew: 1.87 | Time/Step: 0.0027
[TRAIN][Step 651] AvgLen: 80.00 | AvgRew: 155.22 | StdRew: 11.34 | Time/Step: 0.0027
[TRAIN][Step 752] AvgLen: 101.00 | AvgRew: 163.42 | StdRew: 0.00 | Time/Step: 0.0028
[TRAIN][Step 941] AvgLen: 94.50 | AvgRew: 162.55 | StdRew: 5.45 | Time/Step: 0.0028
[TRAIN][Step 1092] AvgLen: 75.50 | AvgRew: 169.67 | StdRew: 0.74 | Time/Step: 0.0028
[TRAIN][Step 1242] AvgLen: 75.00 | AvgRew: 152.60 | StdRew: 0.60 | Time/Step: 0.0028
[TRAIN][Step 1411] AvgLen: 84.50 | AvgRew: 135.64 | StdRew: 2.33 | Time/Step: 0.0028
[TRAIN][Step 1573] AvgLen: 81.00 | AvgRew: 148.39 | StdRew: 8.39 | Time/Step: 0.0028
[EVAL] ckpt_0.pkl | Episode 1 | Reward: 162.81 | Length: 78
[EVAL] ckpt_0.pkl | Episode 2 | Reward: 140.00 | Length: 70
[EVAL] ckpt_0.pkl | Episode 3 | Re

In [8]:
from random_agent import RandomAgent
import shutil
import os
from pathlib import Path

def create_random_agent(env, **kwargs):
    """Build a RandomAgent acting in the environment's action space.

    Args:
        env: Environment object exposing `action_space`.
        **kwargs: Accepted and ignored.

    Returns:
        The newly constructed RandomAgent.
    """
    action_space = env.action_space
    return RandomAgent(action_space=action_space)

# Directory handed to the Runner as `base_dir` for the random-agent baseline.
tmp_base_dir = './logs/random'
train_log_dir, eval_log_dir = [
    os.path.join(tmp_base_dir, name) for name in ('train', 'eval')
]

# Start clean: remove the train/eval sub-directories if an earlier run left them.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory itself exists (no-op when it already does).
Path(tmp_base_dir).mkdir(parents=True, exist_ok=True)

# Runner wired to the RandomAgent factory and a freshly created environment.
runner_random = Runner(
    base_dir=tmp_base_dir,
    create_agent_fn=create_random_agent,
    env=interest_evolution.create_environment(env_config),
)

# Train first, then evaluate.
runner_random.run_training(max_training_steps=100, num_iterations=10)
runner_random.run_evaluation(max_eval_episodes=5)


[TRAIN][Step 170] AvgLen: 85.00 | AvgRew: 153.66 | StdRew: 12.17 | Time/Step: 0.0002
[TRAIN][Step 332] AvgLen: 81.00 | AvgRew: 170.62 | StdRew: 1.38 | Time/Step: 0.0002
[TRAIN][Step 499] AvgLen: 83.50 | AvgRew: 155.64 | StdRew: 3.64 | Time/Step: 0.0002
[TRAIN][Step 641] AvgLen: 71.00 | AvgRew: 159.45 | StdRew: 4.55 | Time/Step: 0.0002
[TRAIN][Step 789] AvgLen: 74.00 | AvgRew: 152.00 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 956] AvgLen: 83.50 | AvgRew: 161.67 | StdRew: 9.67 | Time/Step: 0.0002
[TRAIN][Step 1133] AvgLen: 88.50 | AvgRew: 151.93 | StdRew: 1.16 | Time/Step: 0.0002
[TRAIN][Step 1297] AvgLen: 82.00 | AvgRew: 163.28 | StdRew: 7.28 | Time/Step: 0.0002
[TRAIN][Step 1451] AvgLen: 77.00 | AvgRew: 148.00 | StdRew: 4.00 | Time/Step: 0.0002
[TRAIN][Step 1623] AvgLen: 86.00 | AvgRew: 156.78 | StdRew: 7.22 | Time/Step: 0.0003
[EVAL] CurrentAgent | Episode 1 | Reward: 155.90 | Length: 74
[EVAL] CurrentAgent | Episode 2 | Reward: 148.00 | Length: 86
[EVAL] CurrentAgent | Episode 3

In [19]:
def create_ppo_agent(env, **kwargs):
    """Build a PPOAgentWrapper sized from the environment's spaces.

    `input_dim` is the first dimension of the 'user' observation space's shape
    and `output_dim` is the first entry of `action_space.nvec`.

    Args:
        env: Environment object exposing `observation_space` (indexable by
            'user') and `action_space` (with an `nvec` attribute).
        **kwargs: Accepted and ignored.

    Returns:
        The newly constructed PPOAgentWrapper.
    """
    user_obs_space = env.observation_space['user']
    action_space = env.action_space
    return PPOAgentWrapper(
        input_dim=user_obs_space.shape[0],
        output_dim=action_space.nvec[0],
        action_space=action_space,
    )

# Directory handed to the Runner as `base_dir` for the PPO experiment.
tmp_base_dir = './logs/ppo'
train_log_dir, eval_log_dir = [
    os.path.join(tmp_base_dir, name) for name in ('train', 'eval')
]

# Start clean: remove the train/eval sub-directories if an earlier run left them.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory itself exists (no-op when it already does).
Path(tmp_base_dir).mkdir(parents=True, exist_ok=True)

# Runner wired to the PPO agent factory and a freshly created environment.
runner_ppo = Runner(
    base_dir=tmp_base_dir,
    create_agent_fn=create_ppo_agent,
    env=interest_evolution.create_environment(env_config),
)

# Train first, then evaluate.
runner_ppo.run_training(max_training_steps=1000, num_iterations=100)
runner_ppo.run_evaluation(max_eval_episodes=10)

[TRAIN][Step 1137] AvgLen: 113.70 | AvgRew: 139.81 | StdRew: 8.78 | Time/Step: 0.0006
[TRAIN][Step 2155] AvgLen: 113.11 | AvgRew: 141.30 | StdRew: 11.19 | Time/Step: 0.0006
[TRAIN][Step 3162] AvgLen: 111.89 | AvgRew: 145.33 | StdRew: 12.87 | Time/Step: 0.0006
[TRAIN][Step 4202] AvgLen: 115.56 | AvgRew: 136.17 | StdRew: 12.28 | Time/Step: 0.0006
[TRAIN][Step 5273] AvgLen: 107.10 | AvgRew: 136.13 | StdRew: 9.73 | Time/Step: 0.0006
[TRAIN][Step 6314] AvgLen: 115.67 | AvgRew: 143.00 | StdRew: 10.09 | Time/Step: 0.0006
[TRAIN][Step 7387] AvgLen: 119.22 | AvgRew: 137.24 | StdRew: 10.30 | Time/Step: 0.0006
[TRAIN][Step 8510] AvgLen: 112.30 | AvgRew: 146.76 | StdRew: 8.71 | Time/Step: 0.0006
[TRAIN][Step 9596] AvgLen: 120.67 | AvgRew: 137.20 | StdRew: 10.38 | Time/Step: 0.0006
[TRAIN][Step 10682] AvgLen: 120.67 | AvgRew: 139.74 | StdRew: 11.92 | Time/Step: 0.0006
[TRAIN][Step 11751] AvgLen: 118.78 | AvgRew: 135.64 | StdRew: 12.07 | Time/Step: 0.0006
[TRAIN][Step 12820] AvgLen: 133.62 | AvgRew:

In [12]:
def create_bandit_agent(env, **kwargs):
    """Build a BanditAgentWrapper with one arm per entry of `nvec[0]`.

    Args:
        env: Environment object whose `action_space` has an `nvec` attribute;
            its first entry is used as the number of arms.
        **kwargs: Accepted and ignored, so this factory can be called the same
            way as the other agent factories in this notebook (which accept
            extra keyword arguments such as `eval_mode` / `summary_writer`)
            without raising TypeError.

    Returns:
        The newly constructed BanditAgentWrapper.
    """
    return BanditAgentWrapper(n_arms=env.action_space.nvec[0])

# Directory handed to the Runner as `base_dir` for the bandit experiment.
tmp_base_dir = './logs/bandit'
train_log_dir, eval_log_dir = [
    os.path.join(tmp_base_dir, name) for name in ('train', 'eval')
]

# Start clean: remove the train/eval sub-directories if an earlier run left them.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory itself exists (no-op when it already does).
Path(tmp_base_dir).mkdir(parents=True, exist_ok=True)

# Runner wired to the bandit agent factory and a freshly created environment.
runner_bandit = Runner(
    base_dir=tmp_base_dir,
    create_agent_fn=create_bandit_agent,
    env=interest_evolution.create_environment(env_config),
)

# Train first, then evaluate.
runner_bandit.run_training(max_training_steps=1000, num_iterations=100)
runner_bandit.run_evaluation(max_eval_episodes=10)

[TRAIN][Step 1054] AvgLen: 117.11 | AvgRew: 140.64 | StdRew: 7.61 | Time/Step: 0.0002
[TRAIN][Step 2067] AvgLen: 112.56 | AvgRew: 148.20 | StdRew: 9.34 | Time/Step: 0.0002
[TRAIN][Step 3173] AvgLen: 110.60 | AvgRew: 141.18 | StdRew: 7.71 | Time/Step: 0.0002
[TRAIN][Step 4255] AvgLen: 108.20 | AvgRew: 140.84 | StdRew: 11.62 | Time/Step: 0.0002
[TRAIN][Step 5360] AvgLen: 110.50 | AvgRew: 141.18 | StdRew: 11.39 | Time/Step: 0.0002
[TRAIN][Step 6491] AvgLen: 125.67 | AvgRew: 132.29 | StdRew: 15.64 | Time/Step: 0.0002
[TRAIN][Step 7591] AvgLen: 122.22 | AvgRew: 135.86 | StdRew: 9.83 | Time/Step: 0.0002
[TRAIN][Step 8623] AvgLen: 114.67 | AvgRew: 148.15 | StdRew: 15.06 | Time/Step: 0.0002
[TRAIN][Step 9695] AvgLen: 119.11 | AvgRew: 134.95 | StdRew: 5.53 | Time/Step: 0.0002
[TRAIN][Step 10766] AvgLen: 119.00 | AvgRew: 136.76 | StdRew: 16.29 | Time/Step: 0.0002
[TRAIN][Step 11852] AvgLen: 120.67 | AvgRew: 135.71 | StdRew: 12.44 | Time/Step: 0.0002
[TRAIN][Step 12905] AvgLen: 117.00 | AvgRew: 1

In [16]:
from contextual_bandit import ContextualBanditAgent
import shutil
import os
from pathlib import Path

def create_contextual_bandit_agent(env, **kwargs):
    """Build a ContextualBanditAgent with epsilon fixed at 0.1.

    Args:
        env: Environment object whose `action_space` has an `nvec` attribute;
            its first entry is used as the number of arms.
        **kwargs: Accepted and ignored.

    Returns:
        The newly constructed ContextualBanditAgent.
    """
    # The action space is read through .nvec (MultiDiscrete); take its first entry.
    return ContextualBanditAgent(n_arms=env.action_space.nvec[0], epsilon=0.1)

# Directory handed to the Runner as `base_dir` for the contextual-bandit experiment.
tmp_base_dir = './logs/contextual_bandit'
train_log_dir, eval_log_dir = [
    os.path.join(tmp_base_dir, name) for name in ('train', 'eval')
]

# Start clean: remove the train/eval sub-directories if an earlier run left them.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory itself exists (no-op when it already does).
Path(tmp_base_dir).mkdir(parents=True, exist_ok=True)

# Runner wired to the contextual-bandit factory and a freshly created environment.
runner_cb = Runner(
    base_dir=tmp_base_dir,
    create_agent_fn=create_contextual_bandit_agent,
    env=interest_evolution.create_environment(env_config),
)

# Train first, then evaluate.
runner_cb.run_training(max_training_steps=100, num_iterations=10)
runner_cb.run_evaluation(max_eval_episodes=5)


[TRAIN][Step 104] AvgLen: 104.00 | AvgRew: 162.81 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 231] AvgLen: 127.00 | AvgRew: 126.19 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 430] AvgLen: 99.50 | AvgRew: 146.83 | StdRew: 1.00 | Time/Step: 0.0002
[TRAIN][Step 656] AvgLen: 113.00 | AvgRew: 132.75 | StdRew: 7.25 | Time/Step: 0.0002
[TRAIN][Step 780] AvgLen: 124.00 | AvgRew: 126.66 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 978] AvgLen: 99.00 | AvgRew: 158.35 | StdRew: 5.85 | Time/Step: 0.0002
[TRAIN][Step 1089] AvgLen: 111.00 | AvgRew: 148.00 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 1189] AvgLen: 100.00 | AvgRew: 149.84 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 1397] AvgLen: 104.00 | AvgRew: 159.49 | StdRew: 6.13 | Time/Step: 0.0002
[TRAIN][Step 1583] AvgLen: 93.00 | AvgRew: 142.97 | StdRew: 6.88 | Time/Step: 0.0002
[EVAL] ckpt_0.pkl | Episode 1 | Reward: 148.00 | Length: 95
[EVAL] ckpt_0.pkl | Episode 2 | Reward: 128.00 | Length: 106
[EVAL] ckpt_0.pkl | Episode 