In [None]:
# System packages used later in the notebook for headless rendering and video
# export (xvfb backs the pyvirtualdisplay call; ffmpeg presumably backs
# imageio's mp4 writer -- confirm).
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
# Python packages. imageio is pinned to 2.4.0; the reason for the pin is not
# visible in this notebook.
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
!pip install tf-agents
!pip install pyglet
!pip install tf-keras

import os
# Set before TensorFlow is imported below so the flag is already in the
# environment at import time. Presumably this makes TensorFlow use the
# tf-keras package installed above -- confirm against the TF-Agents docs.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf

from tf_agents.agents.categorical_dqn import categorical_dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import categorical_q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from google.colab import drive
drive.mount('/content/drive')


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 257 kB in 2s (140 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry miss

## Agent

C51 is a Q-learning algorithm based on DQN. Like DQN, it can be used on any environment with a discrete action space.

The main difference between C51 and DQN is that rather than simply predicting the Q-value for each state-action pair, C51 predicts a histogram model for the probability distribution of the Q-value:

![Example C51 Distribution](https://github.com/tensorflow/agents/blob/master/docs/tutorials/images/c51_distribution.png?raw=1)

By learning the distribution rather than simply the expected value, the algorithm is able to stay more stable during training, leading to improved final performance. This is particularly true in situations with bimodal or even multimodal value distributions, where a single average does not provide an accurate picture.

In order to train on probability distributions rather than on values, C51 must perform some complex distributional computations in order to calculate its loss function. But don't worry, all of this is taken care of for you in TF-Agents!

To create a C51 Agent, we first need to create a `CategoricalQNetwork`. The API of the `CategoricalQNetwork` is the same as that of the `QNetwork`, except that there is an additional argument `num_atoms`. This represents the number of support points in our probability distribution estimates. (The above image includes 10 support points, each represented by a vertical blue bar.) As you can tell from the name, the default number of atoms is 51; this notebook overrides it with `num_atoms = 201`.


We also need an `optimizer` to train the network we just created, and a `train_step_counter` variable to keep track of how many times the network was updated.

Note that one other significant difference from vanilla `DqnAgent` is that we now need to specify `min_q_value` and `max_q_value` as arguments. These specify the most extreme values of the support (in other words, the outermost atoms on either side of the distribution). Make sure to choose these appropriately for your particular environment; this notebook uses `min_q_value = -500` and `max_q_value = 0`.

One last thing to note is that we also added an argument to use n-step updates, here with $n$ = 5 (`n_step_update = 5` in the code below). In single-step Q-learning ($n$ = 1), we only compute the error between the Q-values at the current time step and the next time step using the single-step return (based on the Bellman optimality equation). The single-step return is defined as:

$G_t = R_{t + 1} + \gamma V(s_{t + 1})$

where we define $V(s) = \max_a{Q(s, a)}$.

N-step updates involve expanding the standard single-step return function $n$ times:

$G_t^n = R_{t + 1} + \gamma R_{t + 2} + \gamma^2 R_{t + 3} + \dots + \gamma^n V(s_{t + n})$

N-step updates enable the agent to bootstrap from further in the future, and with the right value of $n$, this often leads to faster learning.

Although C51 and n-step updates are often combined with prioritized replay to form the core of the [Rainbow agent](https://arxiv.org/pdf/1710.02298.pdf), we saw no measurable improvement from implementing prioritized replay. Moreover, we find that when combining our C51 agent with n-step updates alone, our agent performs as well as other Rainbow agents on the sample of Atari environments we've tested.

## Data Collection

As in the DQN tutorial, set up the replay buffer and the initial data collection with the random policy.

C51 tends to do slightly better than DQN on CartPole-v1, but the difference between the two agents becomes more and more significant in increasingly complex environments. For example, on the full Atari 2600 benchmark, C51 demonstrates a mean score improvement of 126% over DQN after normalizing with respect to a random agent. Additional improvements can be gained by including n-step updates.

For a deeper dive into the C51 algorithm, see [A Distributional Perspective on Reinforcement Learning (2017)](https://arxiv.org/pdf/1707.06887.pdf).

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Environment and data collection
env_name = "Acrobot-v1"               # Gym environment id
num_iterations = 600                  # Passes through the main training loop
initial_collect_steps = 2_000         # Random-policy steps used to seed the replay buffer
collect_steps_per_iteration = 10      # Environment steps gathered before each train call
replay_buffer_capacity = 200_000      # Maximum number of items kept in the replay buffer

# Network and optimisation
fc_layer_params = (1000,) * 3         # Three fully connected layers of 1000 units each
batch_size = 2_000                    # Items sampled from the replay buffer per train call
learning_rate = 5e-5                  # Step size handed to the Adam optimizer
gamma = 0.95                          # Discount factor for future rewards
log_interval = 200                    # Print the loss every this many train steps

# Categorical DQN (C51) settings
num_atoms = 201                       # Support points of the value distribution
# NOTE(review): with gamma = 0.95 and per-step rewards of about -1 (assumed from
# the -500 episode totals in the logged output -- confirm), discounted returns
# stay above roughly -1 / (1 - gamma) = -20, so most of the [-500, 0] support
# may go unused. Consider tightening min_q_value.
min_q_value = -500                    # Lower edge of the support
max_q_value = 0                       # Upper edge of the support
n_step_update = 5                     # Number of steps used for the n-step return

# Evaluation
num_eval_episodes = 1                 # Not referenced by the code in this cell
eval_interval = 1                     # Evaluate after every train step

# Epsilon decay function for epsilon-greedy policy
def epsilon_fn(step, initial_epsilon=1.0, final_epsilon=0.2, decay_rate=0.999):
    """Return the exploration rate for training step ``step``.

    Epsilon decays geometrically from ``initial_epsilon`` and is clamped from
    below at ``final_epsilon``:

        epsilon = max(final_epsilon, initial_epsilon * decay_rate ** step)

    Args:
        step: Current training step; a Python number, a tensor or a
            ``tf.Variable``.
        initial_epsilon: Epsilon at step 0.
        final_epsilon: Lower bound that epsilon never drops below.
        decay_rate: Multiplicative decay applied once per step.

    Returns:
        A scalar float32 tensor holding the current epsilon.
    """
    # tf.cast (rather than Python's float()) keeps the computation in TensorFlow
    # ops, so it also works when `step` is a symbolic tensor inside a
    # tf.function, where float() would need a concrete eager value.
    step_f = tf.cast(step, tf.float32)
    decayed = initial_epsilon * tf.pow(tf.cast(decay_rate, tf.float32), step_f)
    return tf.maximum(tf.cast(final_epsilon, tf.float32), decayed)

# --- Environments ----------------------------------------------------------
# Two separate instances of the same Gym environment: one feeds training data,
# the other is used only for evaluation episodes.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# Wrap the Python environments so they can be driven with TensorFlow tensors.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# --- Categorical Q-network -------------------------------------------------
# Built from the training environment's observation/action specs, with
# `num_atoms` support points and the layer sizes in `fc_layer_params`.
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    num_atoms=num_atoms,
    fc_layer_params=fc_layer_params,
)

# --- Optimizer and step counter --------------------------------------------
# Adam with the fixed `learning_rate`; no learning-rate schedule is configured.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Passed to the agent as its train step counter; also drives the epsilon decay.
train_step_counter = tf.Variable(0)


def _current_epsilon():
    """Return epsilon for the current value of `train_step_counter`."""
    return epsilon_fn(train_step_counter)


# --- Categorical DQN agent -------------------------------------------------
agent = categorical_dqn_agent.CategoricalDqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=min_q_value,
    max_q_value=max_q_value,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    train_step_counter=train_step_counter,
    epsilon_greedy=_current_epsilon,  # Callable, so epsilon is re-evaluated on each use
)

agent.initialize()

# Uniform replay buffer: stores items shaped like the agent's collect data spec,
# batched like the training environment, holding up to `replay_buffer_capacity`.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity,
)

# Random policy over the training environment's specs, used for the initial
# data collection.
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec(),
)

# Function to collect a single step of experience
def collect_step(environment, policy):
    """Take one step in `environment` with `policy` and store the transition.

    The transition is written to the module-level `replay_buffer`.

    Args:
        environment: Environment providing `current_time_step()` and `step()`.
        policy: Policy whose `action()` result carries the action to apply.
    """
    before = environment.current_time_step()
    decision = policy.action(before)
    after = environment.step(decision.action)
    # Pack (time step, action, next time step) into one trajectory item.
    replay_buffer.add_batch(trajectory.from_transition(before, decision, after))

# Seed the replay buffer with `initial_collect_steps` random-policy transitions.
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

# Sampling pipeline over the replay buffer. Each sampled item spans
# n_step_update + 1 consecutive steps (presumably what the n-step update needs
# -- confirm against the TF-Agents docs), drawn in batches of `batch_size`.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=n_step_update + 1,
)
dataset = dataset.prefetch(3)
iterator = iter(dataset)

# Wrap the train call with common.function (graph compilation for speed).
agent.train = common.function(agent.train)

# Start counting train steps from zero.
agent.train_step_counter.assign(0)

# Function to run a single episode and collect total rewards
def run_episode(env, policy):
    """Play one episode of `env` with `policy` and return the summed reward.

    The running total starts as the int 0 and has each step's
    `reward.numpy()` added to it, so the result carries the reward's array
    shape (the logged output shows values such as ``[-500.]``).

    Args:
        env: Environment providing `reset()` and `step()`.
        policy: Policy whose `action()` result carries the action to apply.

    Returns:
        The accumulated reward of the episode.
    """
    total = 0
    current = env.reset()

    while True:
        if current.is_last():
            break
        decision = policy.action(current)
        current = env.step(decision.action)
        total += current.reward.numpy()

    return total

# Total reward of every evaluation episode, in the order they were run.
episode_rewards = []

# Main training loop: collect -> train -> (log) -> (evaluate)
for iteration in range(num_iterations):
    # 1) Gather fresh experience with the agent's exploratory collect policy.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, agent.collect_policy)

    # 2) One training update on a batch sampled from the replay buffer.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience)

    # 3) Decide what to report, based on the agent's own step counter.
    step = agent.train_step_counter.numpy()
    should_log = step % log_interval == 0
    should_eval = step % eval_interval == 0

    if should_log:
        print(f'Episode = {step}: loss = {train_loss.loss}')

    if should_eval:
        # One evaluation episode with the agent's (non-collect) policy on the
        # separate evaluation environment.
        total_reward = run_episode(eval_env, agent.policy)
        episode_rewards.append(total_reward)
        print(f'Episode {step}/{num_iterations}: Total Reward = {total_reward}')


Episode 1/600: Total Reward = [-500.]
Episode 2/600: Total Reward = [-500.]
Episode 3/600: Total Reward = [-500.]
Episode 4/600: Total Reward = [-500.]
Episode 5/600: Total Reward = [-500.]
Episode 6/600: Total Reward = [-500.]
Episode 7/600: Total Reward = [-500.]
Episode 8/600: Total Reward = [-500.]
Episode 9/600: Total Reward = [-500.]
Episode 10/600: Total Reward = [-500.]
Episode 11/600: Total Reward = [-500.]
Episode 12/600: Total Reward = [-500.]
Episode 13/600: Total Reward = [-500.]
Episode 14/600: Total Reward = [-500.]
Episode 15/600: Total Reward = [-500.]
Episode 16/600: Total Reward = [-500.]
Episode 17/600: Total Reward = [-500.]
Episode 18/600: Total Reward = [-500.]
Episode 19/600: Total Reward = [-500.]
Episode 20/600: Total Reward = [-500.]
Episode 21/600: Total Reward = [-500.]
Episode 22/600: Total Reward = [-500.]
Episode 23/600: Total Reward = [-500.]
Episode 24/600: Total Reward = [-500.]
Episode 25/600: Total Reward = [-500.]
Episode 26/600: Total Reward = [-5

SAVE RETURNS

In [None]:
# After training loop is finished, save the rewards
import numpy as np

def save_episode_rewards(episode_rewards, file_path):
    """Save the collected episode rewards as a NumPy ``.npy`` file.

    Args:
        episode_rewards: Sequence of per-episode rewards; converted with
            ``np.array`` before saving.
        file_path: Destination path (``str`` or ``os.PathLike``). A missing
            ``.npy`` suffix is appended and missing parent directories are
            created. Any other object (e.g. an open file) is passed to
            ``np.save`` unchanged.

    Raises:
        ValueError: If ``file_path`` is an empty path. Without this check
            ``np.save('')`` would silently write a hidden ``.npy`` file in
            the current directory.
    """
    if isinstance(file_path, (str, os.PathLike)):
        file_path = os.fspath(file_path)
        if not file_path:
            raise ValueError("file_path must be a non-empty path")
        # np.save appends '.npy' itself; do it here so the message printed
        # below names the file that is actually written.
        if not file_path.endswith('.npy'):
            file_path += '.npy'
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    np.save(file_path, np.array(episode_rewards))
    print(f"Episode rewards saved at: {file_path}")
# Placeholder: set this to the target file before running the cell. Despite the
# name, the value is passed as `file_path` (a file, not a directory).
save_dir_name = '' #insert saving path
# Persist the rewards gathered by the training loop.
save_episode_rewards(episode_rewards, save_dir_name)

Episode rewards saved at: /content/drive/My Drive/TESI/CLASSIC_DATA/jhonny/jhonny.1000.2000.npy


VIDEO

In [None]:
# Set up a virtual display for rendering OpenAI gym environments.
# The started display object is kept in `display`; it is created with
# visible=0 and a 1400x900 screen size.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
# Function to embed the video
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook.

    The file is read in full, base64-encoded and inlined as a data URI inside
    an HTML ``<video>`` tag, so the video travels with the notebook output.

    Args:
        filename: Path of the mp4 file to embed.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<video>`` tag.
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(filename, 'rb') as video_file:
        b64 = base64.b64encode(video_file.read())
    tag = '''
    <video width="640" height="480" controls>
      <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return IPython.display.HTML(tag)

# The virtual display started at the top of this cell is reused for rendering;
# starting a second one would only leave an extra display process running.


# Record a late episode (e.g., towards the end of training)
late_episode_filename = 'late_episode.mp4'
with imageio.get_writer(late_episode_filename, fps=60) as video:
    time_step = eval_env.reset()
    video.append_data(eval_py_env.render())  # Capture initial frame
    while not time_step.is_last():
        # Act with the agent's (non-collect) policy and record every frame.
        action_step = agent.policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        video.append_data(eval_py_env.render())  # Capture each frame


# Display the late episode video
print("Late Episode Video:")
IPython.display.display(embed_mp4(late_episode_filename))

# Optional: Save the videos to Google Drive
import os
import shutil

from google.colab import drive
drive.mount('/content/drive')

late_output_path = '/content/drive/My Drive/TESI/video/late2_episode.mp4'

# Copy with shutil instead of an unquoted shell `cp`: the destination contains a
# space ("My Drive"), which the shell would split into two arguments. A failed
# copy now raises instead of being followed by a misleading "saved" message.
os.makedirs(os.path.dirname(late_output_path), exist_ok=True)
shutil.copyfile(late_episode_filename, late_output_path)


print(f"Late episode saved at: {late_output_path}")


