# Deep Convolutional Q-Learning for Pac-Man

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [1]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting shimmy<1.0,>=0.1.0 (from shimmy[atari]<1.0,>=0.1.0; extra == "atari"->gymnasium[accept-rom-license,atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl.metadata (2.3 kB)
Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license,atari])
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting AutoROM.acce

### Importing the libraries

In [2]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
from torch.utils.data import DataLoader, TensorDataset

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [3]:
class Network(nn.Module):
# This line defines a class Network that inherits from nn.Module, which is a base class for all neural network modules in PyTorch.
# The Network class will represent the deep Q-network (DQN) used in our reinforcement learning setup.
  def __init__(self, action_size, seed = 42):
    # The __init__ function is the constructor for the Network class. It initializes the network's architecture.
    # action_size is the number of possible actions the agent can take (e.g., move left, right, up, or down in Pacman).
    # seed=42 sets a default seed for random number generation to ensure reproducibility.
    super(Network, self).__init__() #This line calls the constructor of the parent class nn.Module to ensure that the Network class inherits all the necessary properties and methods from nn.Module.
    self.seed = torch.manual_seed(seed) #This line sets the seed for the random number generator in PyTorch, ensuring that the initialization of weights is consistent across different runs.
    #The network has four convolutional layers, each followed by a batch normalization layer. Convolutional layers are used to extract features from the input image.
    self.conv1 = nn.Conv2d(3, 32, kernel_size = 8, stride = 4)
    self.bn1 = nn.BatchNorm2d(32)
    # self.conv1: The first convolutional layer takes an input with 3 channels (likely representing RGB channels of an image) and applies 32 filters (output channels). The filters are of size 8x8, and the stride is 4, meaning the filter moves 4 pixels at a time.
    # self.bn1: The batch normalization layer normalizes the output of conv1, which helps in stabilizing and speeding up the training process.
    self.conv2 = nn.Conv2d(32, 64, kernel_size = 4, stride = 2)
    self.bn2 = nn.BatchNorm2d(64)
    # self.conv2: The second convolutional layer takes the 32 output channels from the first layer and applies 64 filters of size 4x4 with a stride of 2.
    # self.bn2: Batch normalization is applied to the output of conv2.
    self.conv3 = nn.Conv2d(64, 64, kernel_size = 3, stride = 1)
    self.bn3 = nn.BatchNorm2d(64)
    # self.conv3: The third convolutional layer applies 64 filters of size 3x3 with a stride of 1 to the 64 output channels from the second layer.
    # self.bn3: Batch normalization is applied to the output of conv3.
    self.conv4 = nn.Conv2d(64, 128, kernel_size = 3, stride = 1)
    self.bn4 = nn.BatchNorm2d(128)
    # self.conv4: The fourth convolutional layer applies 128 filters of size 3x3 with a stride of 1 to the 64 output channels from the third layer.
    # self.bn4: Batch normalization is applied to the output of conv4.
    self.fc1 = nn.Linear(10 * 10 * 128, 512)
    #self.fc1: A fully connected layer that takes the flattened output from the last convolutional layer. The output dimension of the convolutional layers is calculated as 10x10x128 (assuming the input image size and convolutional operations result in this dimension). This layer outputs 512 units.
    self.fc2 = nn.Linear(512, 256)
    #self.fc2: A fully connected layer that takes the 512 units from fc1 and outputs 256 units.
    self.fc3 = nn.Linear(256, action_size)
    #self.fc3: The final fully connected layer that outputs action_size units, where each unit corresponds to the Q-value for a possible action in the game.

  def forward(self, state):
    # This forward method defines how the input data (in this case, the game state) is passed through the neural network to produce an output. The method represents the forward pass in a neural network, where the data flows through each layer, undergoing transformations, until it reaches the output layer.
    # state: The input to the network, which is typically a tensor representing the current state of the environment (e.g., an image of the game screen in a Pacman game).
    x = F.relu(self.bn1(self.conv1(state)))
    # self.conv1(state): The input state is passed through the first convolutional layer (conv1). This layer applies a set of filters (kernels) to the input, producing feature maps that capture local patterns (e.g., edges) in the input image.
    # self.bn1(...): The output of conv1 is passed through a batch normalization layer (bn1). This normalizes the feature maps to stabilize and speed up training.
    # F.relu(...): Finally, a ReLU (Rectified Linear Unit) activation function is applied. ReLU introduces non-linearity to the network by setting all negative values to zero, which helps the network learn complex patterns.
    x = F.relu(self.bn2(self.conv2(x)))
    # self.conv2(x): The output from the first layer (x) is passed through the second convolutional layer (conv2). This layer applies more filters, likely detecting more complex patterns or features in the data.
    # self.bn2(...): The output of conv2 is batch normalized using bn2.
    # F.relu(...): ReLU activation is again applied to introduce non-linearity
    x = F.relu(self.bn3(self.conv3(x)))
    # self.conv3(x): The output from the second layer is processed by the third convolutional layer (conv3), which further refines the features extracted from the input.
    # self.bn3(...): Batch normalization is applied to the output of conv3.
    # F.relu(...): ReLU activation is applied again.
    x = F.relu(self.bn4(self.conv4(x)))
    # self.conv4(x): The output from the third layer is processed by the fourth convolutional layer (conv4). This layer further extracts high-level features from the data.
    # self.bn4(...): Batch normalization is applied to the output of conv4.
    # F.relu(...): ReLU activation is applied.
    x = x.view(x.size(0), -1)
    # x.view(x.size(0), -1): The output of the final convolutional layer is a multi-dimensional tensor (e.g., a 4D tensor with dimensions [batch_size, channels, height, width]).
    # Flattening: The view function reshapes this tensor into a 2D tensor where the first dimension is the batch size, and the second dimension flattens all other dimensions into a single vector (e.g., [batch_size, flattened_features]). This step is necessary before feeding the data into fully connected layers, which expect 2D input.
    x = F.relu(self.fc1(x))
    # self.fc1(x): The flattened feature vector is passed through the first fully connected (dense) layer (fc1). This layer combines the features extracted by the convolutional layers to start making decisions.
    # F.relu(...): ReLU activation is applied to introduce non-linearity.
    x = F.relu(self.fc2(x))
    # self.fc2(x): The output from fc1 is passed through the second fully connected layer (fc2). This layer further refines the features and makes more complex decisions.
    # F.relu(...): ReLU activation is applied once again.
    return self.fc3(x)
    # self.fc3(x): The output from fc2 is passed through the final fully connected layer (fc3). This layer outputs the final values corresponding to the possible actions (e.g., moving left, right, up, or down in the Pacman game).
    # No Activation: The final output typically doesn't have an activation function applied because it represents the raw Q-values for each action, which will be used to make decisions.

## Part 2 - Training the AI

### Setting up the environment

In [4]:
import gymnasium as gym
env = gym.make('MsPacmanDeterministic-v0', full_action_space = False)
# gym.make('MsPacmanDeterministic-v0'): This function creates an environment instance for the game "Ms. Pac-Man." The environment provides the game world where the agent (your AI) will interact. The 'MsPacmanDeterministic-v0' environment is a version of Ms. Pac-Man where the game behaves deterministically, meaning the same actions in the same state will always lead to the same outcome, which can be useful for testing and debugging.
# full_action_space=False: This argument restricts the action space to a reduced set of actions that are more relevant for the game, rather than including all possible actions (many of which might be redundant or unnecessary). This simplifies the learning problem for the agent.
state_shape = env.observation_space.shape
# env.observation_space.shape: The observation space defines the type of data the agent receives from the environment. For Ms. Pac-Man, the observation space is typically a 3D array representing the RGB image of the game screen (e.g., height x width x channels).
# state_shape: This stores the dimensions of the state (e.g., (210, 160, 3) for an Atari game like Ms. Pac-Man), where 210 is the height, 160 is the width, and 3 is the number of color channels (RGB).
state_size = env.observation_space.shape[0]
# state_size: This extracts the size of the first dimension of the state (observation) space, which corresponds to the height of the image.
number_actions = env.action_space.n
# env.action_space.n: The action space defines the set of possible actions the agent can take. For Ms. Pac-Man, this includes moves like up, down, left, right, etc. The .n attribute gives the total number of discrete actions available to the agent.
# number_actions: This stores the number of actions, which tells you how many different moves the agent can choose from at any given time.
print('State shape: ', state_shape)
print('State size: ', state_size)
print('Number of actions: ', number_actions)

  logger.deprecation(


State shape:  (210, 160, 3)
State size:  210
Number of actions:  9


### Initializing the hyperparameters

In [5]:
learning_rate = 5e-4
# Purpose: The learning rate controls how much the model's parameters (weights) are updated during each step of the learning process.
# Explanation:
# A higher learning rate might speed up learning but can lead to instability or the model missing the optimal solution.
# A lower learning rate allows for more fine-tuned updates but may slow down the learning process.
# In the context of RL, the learning rate affects how much the Q-values (or other parameters in the policy) are adjusted based on the agent's experiences.
minibatch_size = 64
# Purpose: Minibatch size determines how many samples from the replay buffer (or experience memory) are used to compute the gradient and update the network’s parameters in each iteration.
# Explanation:
# Minibatch Gradient Descent: Instead of updating the model’s parameters after every single experience (which would be stochastic) or after accumulating all experiences (which would be computationally expensive), minibatch gradient descent uses a subset (minibatch) of experiences.
# Stability: Using a minibatch size of 64 helps strike a balance between stability and efficiency, allowing the network to generalize better from a variety of experiences without the noise of single-sample updates.
# Efficiency: It’s computationally more efficient than using the entire dataset and leads to faster convergence than using single samples.
discount_factor = 0.99
# Purpose: The discount factor, denoted by γ, determines the importance of future rewards in the agent’s decision-making process.
# Future Rewards: In RL, agents aim to maximize cumulative rewards. The discount factor specifies how much future rewards are valued relative to immediate rewards.
# Value Ranges: If γ=1, future rewards are valued equally with immediate rewards (the agent is more "far-sighted").
# If γ is close to 0, the agent will prioritize immediate rewards over future rewards (more "short-sighted").
# Choosing 0.99: A discount factor of 0.99 means that future rewards are still important, but slightly less so than immediate rewards. This encourages the agent to consider long-term benefits while still valuing more immediate outcomes.

  and should_run_async(code)


### Preprocessing the frames

In [6]:
from PIL import Image
from torchvision import transforms
# PIL (Python Imaging Library): The Image module from PIL (now maintained as Pillow) is used for image processing tasks. It allows the conversion of numpy arrays into image objects, among other things.
# torchvision.transforms: This module provides common image transformations that can be chained together to preprocess images for use in machine learning models.

def preprocess_frame(frame): #  This function takes a single frame (a game screen or image) as input and preprocesses it so it can be fed into a neural network.
  frame = Image.fromarray(frame)
  # Input: frame is likely a numpy array representing an image, with dimensions like (height, width, channels).
  # Image.fromarray(frame): This converts the numpy array into a PIL Image object. This conversion is necessary because the transforms module expects a PIL Image or a Tensor as input for the transformations.
  preprocess = transforms.Compose([transforms.Resize((128, 128)), transforms.ToTensor()])
  # transforms.Compose([...]): Compose allows you to chain multiple image transformations together into a single pipeline. The image will pass through each transformation in the order they are listed.
  # Transformations in the Pipeline:

  # transforms.Resize((128, 128)):
  # Purpose: Resizes the image to a fixed size of 128x128 pixels. This standardizes the input size for the neural network, which typically requires inputs of a consistent shape.
  # Effect: Regardless of the original dimensions of the frame, the image will be scaled to 128x128, which may involve either cropping or padding the image.

  # transforms.ToTensor():
  # Purpose: Converts the PIL Image to a PyTorch Tensor, which is the data format expected by PyTorch models. The pixel values are scaled from the range [0, 255] to [0, 1] and rearranged into the format (channels, height, width) instead of (height, width, channels).
  # Effect: The image data is now in a format that can be directly used by PyTorch for model training or inference.
  return preprocess(frame).unsqueeze(0)
#   preprocess(frame): The frame (now a PIL Image) is passed through the preprocessing pipeline, which resizes it and converts it to a Tensor.
# .unsqueeze(0): This adds an extra dimension to the Tensor at the 0th position, making it of shape (1, channels, height, width). This extra dimension represents the batch size, which is necessary because PyTorch models typically expect input data to include a batch dimension.
# Effect: If your model expects a batch of images, adding this dimension allows you to treat the single frame as a batch of size 1.

### Implementing the DCQN class

In [7]:
class Agent():

  def __init__(self, action_size):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.action_size = action_size
    self.local_qnetwork = Network(action_size).to(self.device)
    self.target_qnetwork = Network(action_size).to(self.device)
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = deque(maxlen = 10000)

# __init__ Method: This is the constructor for the Agent class. It initializes the key components and variables the agent needs to function.
# self.device: Determines whether to use a GPU (cuda:0) or CPU (cpu) based on the availability of a CUDA-capable GPU. The device will be used to perform computations (e.g., training the network) on the appropriate hardware.
# self.action_size: Stores the number of possible actions the agent can take. This is used to configure the output layer of the neural network, which predicts Q-values for each possible action.
# self.local_qnetwork: Initializes a neural network (using the Network class defined earlier) that will be used to approximate the Q-values for the current state. This network is trained to improve over time. The network is moved to the appropriate device (GPU or CPU).
# self.target_qnetwork: A second neural network that serves as the target for updating the local Q-network. It is a delayed copy of the local Q-network to stabilize learning. This is a standard technique in DQN (Deep Q-Network) to reduce the volatility of updates.
# self.optimizer: Sets up the Adam optimizer, which is used to adjust the weights of the local Q-network based on the computed loss. The learning rate (lr) controls how big of a step the optimizer takes during each update.
# self.memory: Initializes a replay buffer using deque (a double-ended queue) with a maximum length of 10,000. This buffer stores experiences that the agent collects as it interacts with the environment. The buffer allows the agent to learn from past experiences in a more stable way.

  def step(self, state, action, reward, next_state, done):
    state = preprocess_frame(state)
    next_state = preprocess_frame(next_state)
    self.memory.append((state, action, reward, next_state, done))
    if len(self.memory) > minibatch_size:
      experiences = random.sample(self.memory, k = minibatch_size)
      self.learn(experiences, discount_factor)
# Purpose: This method is called every time the agent takes a step in the environment (i.e., every time it interacts with the environment by taking an action and receiving feedback).
# preprocess_frame(state): The current and next states (frames) are preprocessed (e.g., resized, normalized, and converted to tensors) so that they are in the correct format for the neural network.
# self.memory.append(...): The experience tuple (state, action, reward, next_state, done) is added to the replay buffer. The done flag indicates whether the episode has finished (i.e., the game is over).
# Replay Memory Sampling: If the replay buffer contains more than minibatch_size experiences (e.g., 64), a random sample of experiences is drawn from the buffer. This sample is then passed to the learn method to update the agent’s Q-network.

  def act(self, state, epsilon = 0.):
    state = preprocess_frame(state).to(self.device)
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()
    if random.random() > epsilon:
      return np.argmax(action_values.cpu().data.numpy())
    else:
      return random.choice(np.arange(self.action_size))
# Purpose: This method determines the action the agent will take in a given state, balancing between exploration and exploitation.
# preprocess_frame(state).to(self.device): The state is preprocessed and moved to the appropriate device (GPU or CPU) for processing.
# self.local_qnetwork.eval(): The local Q-network is put into evaluation mode (i.e., no training, just inference). This disables certain layers like dropout and batch normalization, which behave differently during training and evaluation.
# with torch.no_grad():: This context manager tells PyTorch not to compute gradients, which saves memory and computation since gradients are only needed during training, not inference.
# action_values = self.local_qnetwork(state): The Q-values for the given state are computed by passing the state through the local Q-network.
# self.local_qnetwork.train(): After making the action selection, the network is put back into training mode to resume normal operation when training resumes.
# Epsilon-Greedy Policy:
# Exploitation: With probability 1−ϵ, the agent selects the action with the highest predicted Q-value (i.e., the "best" action according to the model).
# Exploration: With probability ϵ, the agent selects a random action. This encourages the agent to explore different actions to avoid getting stuck in a local optimum.

  def learn(self, experiences, discount_factor):
    states, actions, rewards, next_states, dones = zip(*experiences)
    states = torch.from_numpy(np.vstack(states)).float().to(self.device)
    actions = torch.from_numpy(np.vstack(actions)).long().to(self.device)
    rewards = torch.from_numpy(np.vstack(rewards)).float().to(self.device)
    next_states = torch.from_numpy(np.vstack(next_states)).float().to(self.device)
    dones = torch.from_numpy(np.vstack(dones).astype(np.uint8)).float().to(self.device)
    next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
    q_targets = rewards + discount_factor * next_q_targets * (1 - dones)
    q_expected = self.local_qnetwork(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
# Purpose: This method updates the Q-network based on the experiences sampled from the replay buffer, using the Bellman equation to compute the target Q-values.
# Experience Tuple:
# The experiences (states, actions, rewards, next_states, dones) are unpacked from the sampled minibatch.
# Each of these is converted into a PyTorch tensor and moved to the appropriate device.
# next_q_targets: The target Q-values for the next states are computed using the target Q-network. The .detach() call ensures that these values are not part of the computation graph, which stabilizes training.
# q_targets: The target Q-values for the current states are calculated using the Bellman equation:
# Q_target = reward + γ × Q_nextstate × (1−done)
# If the done flag is true, it means the episode is over, and future rewards are not considered.
# q_expected: The expected Q-values for the current states and actions are computed using the local Q-network.
# Loss Computation:
# F.mse_loss(q_expected, q_targets): The mean squared error (MSE) loss between the expected and target Q-values is calculated. This loss measures how close the Q-network's predictions are to the target values.
# self.optimizer.zero_grad(): Clears the gradients from the previous step. This is important because, by default, gradients in PyTorch accumulate with each backward() call.
# loss.backward(): Performs backpropagation, computing the gradients of the loss with respect to the network's parameters.
# self.optimizer.step(): Updates the network's parameters based on the computed gradients, completing the training step.

  and should_run_async(code)


### Initializing the DCQN agent

In [8]:
agent = Agent(number_actions)
# The line of code agent = Agent(number_actions) is creating an instance of the Agent class that was defined earlier.
# Agent(number_actions) is calling the __init__ method of the Agent class, which initializes the agent with the necessary components and configurations.
# number_actions is passed as an argument to the Agent class constructor (__init__ method).
# What is number_actions?: In the context of a reinforcement learning environment (like a game), number_actions represents the total number of possible actions the agent can take. For example, in a game like Ms. Pac-Man, actions might include moving up, down, left, right, or staying still, and number_actions would represent the total count of these actions.
# When the Agent class is instantiated with number_actions, the following things happen inside the __init__ method of the Agent class:
# Device Selection: The agent determines whether to use a GPU or CPU for computation (self.device).
# Action Size Assignment: The number of actions (action_size) is stored in self.action_size, which will be used throughout the agent's methods to know how many possible actions there are.
# Neural Network Initialization:
# self.local_qnetwork and self.target_qnetwork are initialized as instances of the Network class, both of which are set up to have an output size equal to number_actions. This means that for any given input state, the networks will produce a set of Q-values, one for each possible action.
# These networks are moved to the selected device (GPU or CPU).
# Optimizer Setup: An optimizer (self.optimizer) is created using the Adam optimization algorithm, which will be used to update the weights of the local_qnetwork during training.
# Replay Memory Initialization: A replay buffer (self.memory) is initialized to store past experiences, which will be used for training the neural network.
# After this line of code runs, you have an agent object that is fully set up and ready to interact with the environment, take actions, store experiences, and learn from them. This agent object has all the methods (step, act, learn, etc.) and attributes (local_qnetwork, target_qnetwork, memory, etc.) defined in the Agent class.

### Training the DCQN agent

In [9]:
number_episodes = 2000
maximum_number_timesteps_per_episode = 10000
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0.01
epsilon_decay_value  = 0.995
epsilon = epsilon_starting_value
scores_on_100_episodes = deque(maxlen = 100)
# number_episodes = 2000: The total number of episodes to train the agent. An episode is one complete playthrough of the game, from the start until it ends (either by winning, losing, or reaching a terminal state).
# maximum_number_timesteps_per_episode = 10000: The maximum number of timesteps the agent can take within a single episode. If the game doesn’t end sooner, the episode will terminate after 10,000 steps.
# epsilon_starting_value = 1.0: The initial value of epsilon (ε) used in the epsilon-greedy policy for action selection. At the start of training, epsilon is set to 1.0, meaning the agent will explore (choose random actions) 100% of the time.
# epsilon_ending_value = 0.01: The minimum value of epsilon during training. Epsilon will decay to this value over time, meaning the agent will always explore at least 1% of the time.
# epsilon_decay_value = 0.995: The factor by which epsilon is multiplied after each episode, causing it to decay over time. This encourages the agent to explore less and exploit more as it learns.
# epsilon = epsilon_starting_value: Initializes epsilon to its starting value.
# scores_on_100_episodes = deque(maxlen=100): A deque (double-ended queue) to store the scores (rewards) from the last 100 episodes. The average score of these episodes will be used to monitor the agent’s performance.

for episode in range(1, number_episodes + 1):
  #for episode in range(1, number_episodes + 1):: This loop runs for each episode, iterating from 1 to number_episodes (2000 in this case). Each iteration represents one episode of gameplay.
  state, _ = env.reset()
  score = 0
  # state, _ = env.reset(): Resets the environment to its initial state at the start of each episode. The initial state of the environment is returned and stored in state. The underscore (_) is used to ignore the second value returned by reset() (which might be additional info or metadata that’s not needed here).
  # score = 0: Initializes the score for this episode to zero. The score will accumulate the rewards obtained during the episode.

  for t in range(maximum_number_timesteps_per_episode):
    action = agent.act(state, epsilon)
    next_state, reward, done, _, _ = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = next_state
    score += reward
    if done:
      break
# for t in range(maximum_number_timesteps_per_episode):: This loop runs for each timestep within the episode, up to the maximum number of timesteps (10000).
# action = agent.act(state, epsilon): The agent selects an action based on the current state using its policy, which involves an epsilon-greedy strategy. The agent either chooses a random action (explore) or the best-known action (exploit) based on the Q-network’s prediction.
# next_state, reward, done, _, _ = env.step(action): The selected action is passed to the environment. The environment returns the following:
# next_state: The new state after taking the action.
# reward: The reward received for taking the action.
# done: A boolean flag indicating whether the episode has ended (e.g., the game is over).
# The underscores (_) ignore any additional information that might be returned by env.step().
# agent.step(state, action, reward, next_state, done): The agent records the experience (state, action, reward, next_state, done) and may learn from it by updating its Q-network, depending on whether the replay buffer has enough experiences.
# state = next_state: The agent updates its current state to the new state returned by the environment. This state will be used in the next timestep.
# score += reward: The reward from this action is added to the total score for the episode.
# if done: break: If the done flag is True, the episode ends early, and the loop breaks. This might happen if the agent wins, loses, or reaches a terminal state.

  scores_on_100_episodes.append(score)
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)), end = "")
  if episode % 100 == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)))
  if np.mean(scores_on_100_episodes) >= 500.0:
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, np.mean(scores_on_100_episodes)))
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break
# scores_on_100_episodes.append(score): The score from this episode is added to the deque, which keeps track of the scores from the last 100 episodes.
# epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon): Epsilon is decayed using the epsilon_decay_value, but it cannot go below the epsilon_ending_value. This gradually reduces exploration as the agent learns.
# print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)), end=""): This prints the current episode number and the average score of the last 100 episodes. The \r character ensures that the print statement overwrites the same line in the console, providing an updating display without scrolling.
# if episode % 100 == 0:: Every 100 episodes, a more permanent printout of the episode number and average score is made. This gives a clear snapshot of progress every 100 episodes.
# if np.mean(scores_on_100_episodes) >= 500.0:: If the average score over the last 100 episodes reaches or exceeds 500, it is assumed that the environment has been "solved" (i.e., the agent is performing well enough to be considered successful).
# torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth'): If the environment is solved, the model weights of the local_qnetwork are saved to a file (checkpoint.pth). This allows you to reload the trained model later.
# break: If the environment is solved, the training loop is terminated early.

Episode 100	Average Score: 282.90
Episode 200	Average Score: 350.50
Episode 300	Average Score: 448.90
Episode 400	Average Score: 450.20
Episode 484	Average Score: 501.30
Environment solved in 384 episodes!	Average Score: 501.30


## Part 3 - Visualizing the results

In [10]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env_name):
    env = gym.make(env_name, render_mode='rgb_array')
    state, _ = env.reset()
    done = False
    frames = []
    while not done:
        frame = env.render()
        frames.append(frame)
        action = agent.act(state)
        state, reward, done, _, _ = env.step(action)
    env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

show_video_of_model(agent, 'MsPacmanDeterministic-v0')

def show_video():
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

show_video()

  logger.deprecation(
  logger.warn(
