In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from pprint import pprint
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
import inspect
tf.compat.v1.enable_v2_behavior()

2023-12-12 20:10:13.447068: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Environment

In [2]:
from tf_agents.bandits.environments import bernoulli_py_environment as bern_env
from tf_agents.environments import tf_py_environment
# Reward arms
means = [0.1, 0.2, 0.3, 0.45, 0.5]
batch_size = 1

env = bern_env.BernoulliPyEnvironment(means=means, batch_size=BATCH_SIZE)
environment = tf_py_environment.TFPyEnvironment(env)

NameError: name 'BATCH_SIZE' is not defined

## Agent

In [65]:
from tf_agents.bandits.agents import bernoulli_thompson_sampling_agent as bern_ts_agent

agent = bern_ts_agent.BernoulliThompsonSamplingAgent(
    time_step_spec=environment.time_step_spec(),
    action_spec=environment.action_spec(),
    dtype=tf.float64,
    batch_size=BATCH_SIZE,
)

## Observers (Reward)

In [66]:
print(environment.pyenv._means)

[0.1, 0.2, 0.3, 0.45, 0.5]


In [67]:
from tf_agents.metrics import tf_metrics
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics

def optimal_reward_fn(unused_observation):
    return np.max(environment.pyenv._means)

def optimal_action_fn(unused_observation):
    return np.int32(np.argmax(environment.pyenv._means))


observers = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.AverageEpisodeLengthMetric(batch_size=environment.batch_size),
    tf_metrics.AverageReturnMetric(batch_size=environment.batch_size),
    tf_bandit_metrics.RegretMetric(optimal_reward_fn),
    tf_bandit_metrics.SuboptimalArmsMetric(optimal_action_fn)
  ]

## Driver

In [68]:
from tf_agents.drivers import dynamic_step_driver
steps_per_loop = 1
driver = dynamic_step_driver.DynamicStepDriver(
  env=environment,
  policy=agent.collect_policy,
  num_steps=steps_per_loop * environment.batch_size,
  observers=observers,
)
data_spec = agent.policy.trajectory_spec

In [108]:
pprint(driver.env.)

<tf_agents.environments.tf_py_environment.TFPyEnvironment object at 0x140e79b50>


## Replay buffer

In [74]:
from tf_agents.bandits.replay_buffers import bandit_replay_buffer
replay_buffer = bandit_replay_buffer.BanditReplayBuffer(
      data_spec=data_spec,
      batch_size=batch_size,
      max_length=steps_per_loop
  )

## Training

In [93]:
from tf_agents.eval import metric_utils
from tf_agents.metrics import export_utils
from io import StringIO
import logging
log_stream = StringIO()    
logging.basicConfig(stream=log_stream, level=logging.NOTSET)

def _export_metrics_and_summaries(step, metrics):
    """Exports metrics and tf summaries."""
    metric_utils.log_metrics(metrics)
    export_utils.export_metrics(step=step, metrics=metrics)
    for metric in metrics:
      metric.tf_summaries(train_step=step)

In [94]:
starting_loop = 0
training_loops = 1000

In [107]:
driver.env.

<tf_agents.environments.tf_py_environment.TFPyEnvironment at 0x140e79b50>

In [102]:
driver.run()
dataset_it = iter(
    replay_buffer.as_dataset(
        sample_batch_size=batch_size,
        num_steps=100,
        single_deterministic_pass=True,
    )
)

In [106]:
replay_buffer.as_dataset(
        sample_batch_size=batch_size,
        num_steps=100,
        single_deterministic_pass=True,
    )

<_MapDataset element_spec=(Trajectory(
{'action': TensorSpec(shape=(None, 100), dtype=tf.int32, name=None),
 'discount': TensorSpec(shape=(None, 100), dtype=tf.float32, name=None),
 'next_step_type': TensorSpec(shape=(None, 100), dtype=tf.int32, name=None),
 'observation': TensorSpec(shape=(None, 100), dtype=tf.float32, name=None),
 'policy_info': PolicyInfo(log_probability=(), predicted_rewards_mean=(), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=()),
 'reward': TensorSpec(shape=(None, 100), dtype=tf.float32, name=None),
 'step_type': TensorSpec(shape=(None, 100), dtype=tf.int32, name=None)}), BufferInfo(ids=TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), probabilities=()))>

In [100]:
log_stream.

<_io.StringIO at 0x142ffa050>

In [None]:
def training_loop(train_step, metrics):
    """Returns a function that runs a single training loop and logs metrics."""
    driver.run()
    _export_metrics_and_summaries(
      step=train_step, metrics=metrics
    )
    batch_size = driver.env.batch_size
    dataset_it = iter(
        replay_buffer.as_dataset(
            sample_batch_size=batch_size,
            num_steps=100,
            single_deterministic_pass=True,
        )
    )
    experience, unused_buffer_info = dataset_it.get_next()
    set_expected_shape(experience, steps)
    loss_info = agent.train(experience)
    export_utils.export_metrics(
      step=train_step * async_steps_per_loop + batch_id,
      metrics=[],
      loss_info=loss_info,
    )
    
    replay_buffer.clear()