In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import numpy as np
from pathlib import Path
from runner import Runner
import interest_evolution
from full_slate_q_agent import FullSlateQAgent
from ppo  import PPOAgentWrapper
from bandit import EpsilonGreedyBandit

In [11]:
def create_dqn_agent(env, eval_mode=False, summary_writer=None):
    """Build a FullSlateQAgent from the environment's spaces.

    Args:
        env: Environment exposing `observation_space` and `action_space`.
        eval_mode: Forwarded unchanged to the agent constructor.
        summary_writer: Forwarded unchanged to the agent constructor.

    Returns:
        The newly constructed FullSlateQAgent.
    """
    agent_kwargs = {
        'observation_space': env.observation_space,
        'action_space': env.action_space,
        'eval_mode': eval_mode,
        'summary_writer': summary_writer,
    }
    return FullSlateQAgent(**agent_kwargs)

In [12]:
# One seed value feeds both NumPy's global RNG and the environment config.
seed = 0
np.random.seed(seed)

# Settings handed to interest_evolution.create_environment by the run cells.
env_config = dict(
    num_candidates=10,
    slate_size=2,
    resample_documents=True,
    seed=seed,
)

In [22]:
import shutil
import os
from pathlib import Path

# Everything the DQN run writes goes under this directory.
tmp_base_dir = './logs/dqn'
train_log_dir, eval_log_dir = (
    os.path.join(tmp_base_dir, subdir) for subdir in ('train', 'eval')
)

# Drop the train/eval output of any previous run so results do not mix.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory exists (no-op when it is already there).
os.makedirs(tmp_base_dir, exist_ok=True)

# Runner wired to a fresh environment and the DQN agent factory.
runner_dqn = Runner(
    env=interest_evolution.create_environment(env_config),
    create_agent_fn=create_dqn_agent,
    base_dir=tmp_base_dir,
)

# Train first, then evaluate with the same runner.
runner_dqn.run_training(num_iterations=5, max_training_steps=50)
runner_dqn.run_evaluation(max_eval_episodes=2)


[TRAIN][Step 71] AvgLen: 71.00 | AvgRew: 176.00 | StdRew: 0.00 | Time/Step: 0.0007
[TRAIN][Step 145] AvgLen: 74.00 | AvgRew: 132.84 | StdRew: 0.00 | Time/Step: 0.0010
[TRAIN][Step 221] AvgLen: 76.00 | AvgRew: 176.00 | StdRew: 0.00 | Time/Step: 0.0011
[TRAIN][Step 290] AvgLen: 69.00 | AvgRew: 171.84 | StdRew: 0.00 | Time/Step: 0.0014
[TRAIN][Step 380] AvgLen: 90.00 | AvgRew: 142.29 | StdRew: 0.00 | Time/Step: 0.0013
[EVAL] ckpt_0.pkl | Episode 1 | Reward: 159.57 | Length: 79
[EVAL] ckpt_0.pkl | Episode 2 | Reward: 136.19 | Length: 70
[EVAL][Step 71] AvgLen: 74.50 | AvgRew: 147.88 | StdRew: 11.69 | Time/Step: 0.0000
[EVAL] ckpt_1.pkl | Episode 1 | Reward: 194.38 | Length: 82
[EVAL] ckpt_1.pkl | Episode 2 | Reward: 149.23 | Length: 80
[EVAL][Step 145] AvgLen: 81.00 | AvgRew: 171.81 | StdRew: 22.58 | Time/Step: 0.0000
[EVAL] ckpt_2.pkl | Episode 1 | Reward: 161.71 | Length: 103
[EVAL] ckpt_2.pkl | Episode 2 | Reward: 168.44 | Length: 88
[EVAL][Step 221] AvgLen: 95.50 | AvgRew: 165.07 | Std

In [23]:
from random_agent import RandomAgent
import shutil
import os
from pathlib import Path

def create_random_agent(env, **kwargs):
    """Build a RandomAgent over the environment's action space.

    Args:
        env: Environment exposing `action_space`.
        **kwargs: Accepted but not used by this factory.

    Returns:
        The newly constructed RandomAgent.
    """
    action_space = env.action_space
    return RandomAgent(action_space=action_space)

# Everything the random-agent run writes goes under this directory.
tmp_base_dir = './logs/random'
train_log_dir, eval_log_dir = (
    os.path.join(tmp_base_dir, subdir) for subdir in ('train', 'eval')
)

# Drop the train/eval output of any previous run so results do not mix.
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)

# Make sure the base directory exists (no-op when it is already there).
os.makedirs(tmp_base_dir, exist_ok=True)

# Runner wired to a fresh environment and the RandomAgent factory.
runner_random = Runner(
    env=interest_evolution.create_environment(env_config),
    create_agent_fn=create_random_agent,
    base_dir=tmp_base_dir,
)

# Same training/evaluation calls as the other agents, for comparison.
runner_random.run_training(num_iterations=5, max_training_steps=50)
runner_random.run_evaluation(max_eval_episodes=2)


[TRAIN][Step 82] AvgLen: 82.00 | AvgRew: 174.66 | StdRew: 0.00 | Time/Step: 0.0001
[TRAIN][Step 167] AvgLen: 85.00 | AvgRew: 144.00 | StdRew: 0.00 | Time/Step: 0.0001
[TRAIN][Step 266] AvgLen: 99.00 | AvgRew: 172.03 | StdRew: 0.00 | Time/Step: 0.0001
[TRAIN][Step 347] AvgLen: 81.00 | AvgRew: 164.00 | StdRew: 0.00 | Time/Step: 0.0001
[TRAIN][Step 423] AvgLen: 76.00 | AvgRew: 140.93 | StdRew: 0.00 | Time/Step: 0.0001
[EVAL] CurrentAgent | Episode 1 | Reward: 182.09 | Length: 94
[EVAL] CurrentAgent | Episode 2 | Reward: 152.89 | Length: 84
[EVAL][Step 0] AvgLen: 89.00 | AvgRew: 167.49 | StdRew: 14.60 | Time/Step: 0.0000


In [24]:
def create_ppo_agent(env, **kwargs):
    """Build a PPOAgentWrapper sized from the environment's spaces.

    The wrapper's `input_dim` is the first dimension of the 'user' entry of
    the observation space, and `output_dim` is the first entry of
    `action_space.nvec`.

    Args:
        env: Environment exposing `observation_space['user']` and `action_space`.
        **kwargs: Accepted but not used by this factory.

    Returns:
        The newly constructed PPOAgentWrapper.
    """
    user_obs_space = env.observation_space['user']
    wrapper_args = dict(
        input_dim=user_obs_space.shape[0],
        output_dim=env.action_space.nvec[0],
        action_space=env.action_space,
    )
    return PPOAgentWrapper(**wrapper_args)

# Everything the PPO run writes goes under this directory.
tmp_base_dir = './logs/ppo'
train_log_dir, eval_log_dir = (
    os.path.join(tmp_base_dir, subdir) for subdir in ('train', 'eval')
)

# Drop the train/eval output of any previous run, then ensure the base
# directory exists (no-op when it is already there).
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)
os.makedirs(tmp_base_dir, exist_ok=True)

# Runner wired to a fresh environment and the PPO agent factory.
runner_ppo = Runner(
    env=interest_evolution.create_environment(env_config),
    create_agent_fn=create_ppo_agent,
    base_dir=tmp_base_dir,
)

# Train first, then evaluate with the same runner.
runner_ppo.run_training(num_iterations=5, max_training_steps=50)
runner_ppo.run_evaluation(max_eval_episodes=2)

[TRAIN][Step 97] AvgLen: 97.00 | AvgRew: 151.31 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 205] AvgLen: 108.00 | AvgRew: 128.00 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 331] AvgLen: 126.00 | AvgRew: 167.38 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 434] AvgLen: 103.00 | AvgRew: 139.97 | StdRew: 0.00 | Time/Step: 0.0002
[TRAIN][Step 580] AvgLen: 146.00 | AvgRew: 124.00 | StdRew: 0.00 | Time/Step: 0.0002
[EVAL] CurrentAgent | Episode 1 | Reward: 143.54 | Length: 96
[EVAL] CurrentAgent | Episode 2 | Reward: 132.16 | Length: 98
[EVAL][Step 0] AvgLen: 97.00 | AvgRew: 137.85 | StdRew: 5.69 | Time/Step: 0.0000


In [26]:
def create_bandit_agent(env, **kwargs):
    """Build an EpsilonGreedyBandit with `action_space.nvec[0]` arms.

    NOTE(review): the recorded output of the run in this cell ends with
    "AttributeError: 'EpsilonGreedyBandit' object has no attribute
    'begin_episode'". The bandit presumably needs an adapter exposing the
    agent methods Runner calls -- TODO confirm against Runner and the
    EpsilonGreedyBandit API before relying on this factory.

    Args:
        env: Environment exposing `action_space.nvec`.
        **kwargs: Accepted but not used by this factory.

    Returns:
        The newly constructed EpsilonGreedyBandit (epsilon fixed at 0.1).
    """
    n_arms = env.action_space.nvec[0]
    return EpsilonGreedyBandit(n_arms=n_arms, epsilon=0.1)

# Everything the bandit run writes goes under this directory.
tmp_base_dir = './logs/bandit'
train_log_dir, eval_log_dir = (
    os.path.join(tmp_base_dir, subdir) for subdir in ('train', 'eval')
)

# Drop the train/eval output of any previous run, then ensure the base
# directory exists (no-op when it is already there).
for log_dir in (train_log_dir, eval_log_dir):
    if not os.path.exists(log_dir):
        continue
    shutil.rmtree(log_dir)
os.makedirs(tmp_base_dir, exist_ok=True)

# Runner wired to a fresh environment and the bandit agent factory.
runner_bandit = Runner(
    env=interest_evolution.create_environment(env_config),
    create_agent_fn=create_bandit_agent,
    base_dir=tmp_base_dir,
)

# Train first, then evaluate with the same runner.
runner_bandit.run_training(num_iterations=5, max_training_steps=50)
runner_bandit.run_evaluation(max_eval_episodes=2)

AttributeError: 'EpsilonGreedyBandit' object has no attribute 'begin_episode'