In [None]:
!pip install -q tf-agents==0.19.0

In [19]:
import functools
import os

import tensorflow as tf
from tf_agents.bandits.agents import dropout_thompson_sampling_agent as dropout_ts_agent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent as eps_greedy_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import environment_utilities
from tf_agents.bandits.environments import movielens_per_arm_py_environment
from tf_agents.bandits.environments import movielens_py_environment
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network

In [20]:
# --- Training schedule (forwarded to trainer.train) ---
TRAINING_LOOPS = 20_000
STEPS_PER_LOOP = 2

# --- MovieLens environment (forwarded to the environment constructors) ---
BATCH_SIZE = 8
NUM_ACTIONS = 20
RANK_K = 20

# --- Agent hyperparameters ---
LR = 0.005            # learning rate given to the Adam optimizers
LAYERS = (50,) * 3    # layer sizes for the neural agents, i.e. (50, 50, 50)
EPSILON = 0.05        # `epsilon` passed to NeuralEpsilonGreedyAgent
AGENT_ALPHA = 10.0    # `alpha` passed to LinearUCBAgent
DROPOUT_RATE = 0.2    # NOTE(review): not referenced by any visible cell — confirm whether still needed


In [22]:

# Agent names accepted by run_bandit_training.
_SUPPORTED_AGENTS = ('LinUCB', 'LinTS', 'epsGreedy', 'DropoutTS')


def _make_movielens_environment(data_path, per_arm):
    """Build the MovieLens Python environment and wrap it in a TFPyEnvironment."""
    if per_arm:
        py_env = movielens_per_arm_py_environment.MovieLensPerArmPyEnvironment(
            data_path, RANK_K, BATCH_SIZE, num_actions=NUM_ACTIONS, csv_delimiter='\t')
    else:
        py_env = movielens_py_environment.MovieLensPyEnvironment(
            data_path, RANK_K, BATCH_SIZE, num_movies=NUM_ACTIONS, csv_delimiter='\t')
    return tf_py_environment.TFPyEnvironment(py_env)


def _make_bandit_agent(environment, agent_name, per_arm):
    """Construct the bandit agent selected by `agent_name` for `environment`."""
    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    if agent_name == 'LinUCB':
        return lin_ucb_agent.LinearUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            tikhonov_weight=0.001,
            alpha=AGENT_ALPHA,
            dtype=tf.float32,
            accepts_per_arm_features=per_arm,
        )

    if agent_name == 'LinTS':
        return lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dtype=tf.float32,
            accepts_per_arm_features=per_arm,
        )

    if agent_name == 'epsGreedy':
        # The reward network differs between the per-arm and the plain
        # observation layouts; the agent wiring around it is shared.
        if per_arm:
            network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
                time_step_spec.observation,
                global_layers=LAYERS,
                arm_layers=LAYERS,
            )
        else:
            network = q_network.QNetwork(
                input_tensor_spec=time_step_spec.observation,
                action_spec=action_spec,
                fc_layer_params=LAYERS,
            )
        return eps_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            emit_policy_info='predicted_rewards_mean',
            info_fields_to_inherit_from_greedy=['predicted_rewards_mean'],
        )

    if agent_name == 'DropoutTS':
        train_step_counter = tf.compat.v1.train.get_or_create_global_step()

        def dropout_fn():
            # Dropout rate of 1 / (1.01 + global step), floored at 0.0003.
            return tf.math.maximum(
                tf.math.reciprocal_no_nan(1.01 + tf.cast(train_step_counter, tf.float32)),
                0.0003,
            )

        return dropout_ts_agent.DropoutThompsonSamplingAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dropout_rate=dropout_fn,
            network_layers=LAYERS,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        )

    raise ValueError(f"Unknown agent: {agent_name}")


def run_bandit_training(root_dir: str, data_path: str,
                        agent_name: str = "LinUCB", per_arm: bool = False) -> None:
    """Train a bandit agent on a MovieLens environment.

    Builds the MovieLens environment from `data_path`, constructs the agent
    named by `agent_name`, and runs `trainer.train` with a regret metric and a
    suboptimal-arms metric attached.

    Args:
        root_dir: Directory handed to `trainer.train` as its `root_dir`
            (presumably where training output is written — confirm against
            the trainer).
        data_path: Location of the MovieLens data file; parsed with a tab
            delimiter.
        agent_name: One of 'LinUCB', 'LinTS', 'epsGreedy', 'DropoutTS'.
        per_arm: If True, use the per-arm MovieLens environment and, for the
            agents that accept the flag, per-arm features.

    Raises:
        ValueError: If `data_path` is empty or `agent_name` is not supported.
    """
    tf.compat.v1.enable_v2_behavior()

    if not data_path:
        raise ValueError('Please specify the location of the MovieLens data file.')
    # Reject an unknown agent up front instead of after the environment has
    # already been constructed from the data file.
    if agent_name not in _SUPPORTED_AGENTS:
        raise ValueError(f"Unknown agent: {agent_name}")

    environment = _make_movielens_environment(data_path, per_arm)

    # Optimal reward/action functions, bound to this environment, for the metrics.
    optimal_reward_fn = functools.partial(
        environment_utilities.compute_optimal_reward_with_movielens_environment,
        environment=environment)
    optimal_action_fn = functools.partial(
        environment_utilities.compute_optimal_action_with_movielens_environment,
        environment=environment)

    agent = _make_bandit_agent(environment, agent_name, per_arm)

    # Metrics
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(optimal_action_fn)

    # Train the agent
    trainer.train(
        root_dir=root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric],
    )

In [None]:
    run_bandit_training(
    root_dir='/tmp/movielens_bandit_output',
    data_path='u.data',
    agent_name='LinUCB',  # Options: LinUCB, LinTS, epsGreedy, DropoutTS
    per_arm=False
)


In [None]:
# Load the TensorBoard notebook extension and open it on the same directory
# that the training call passes as `root_dir`.
%load_ext tensorboard
%tensorboard --logdir /tmp/movielens_bandit_output
