# Introduction
In homework assignment 2, we will implement a basic deep Q learning (DQL) algorithm to solve a classic control problem--CartPole V1

# Install the gym environment

In [1]:
!pip install gymnasium

Collecting gymnasium
  Using cached gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Using cached Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


# Load tensorboard for visualizing

In [1]:
%load_ext tensorboard

# Import the required package

In [2]:
from collections import namedtuple
from collections import deque
import torch
import torch.nn as nn
import numpy as np
import gymnasium as gym
from torch.utils.tensorboard import SummaryWriter
import datetime
from typing import Tuple
from numpy.random import binomial
from numpy.random import choice
import numpy.random as nr
import torch.nn.functional as F

Tensor = torch.DoubleTensor
torch.set_default_tensor_type(Tensor)
Transitions = namedtuple('Transitions', ['obs', 'action', 'reward', 'next_obs', 'done'])

  _C._set_default_tensor_type(t)


# Replay buffer to collect transition tuples

In [3]:
class ReplayBuffer:
    def __init__(self, config):
        replay_buffer_size = config['replay_buffer_size']
        seed = config['seed']
        nr.seed(seed)

        self.replay_buffer_size = replay_buffer_size
        self.obs = deque([], maxlen=self.replay_buffer_size)
        self.action = deque([], maxlen=self.replay_buffer_size)
        self.reward = deque([], maxlen=self.replay_buffer_size)
        self.next_obs = deque([], maxlen=self.replay_buffer_size)
        self.done = deque([], maxlen=self.replay_buffer_size)

    def append_memory(self,
                      obs,
                      action,
                      reward,
                      next_obs,
                      done: bool):
        self.obs.append(obs)
        self.action.append(action)
        self.reward.append(reward)
        self.next_obs.append(next_obs)
        self.done.append(done)

    def sample(self, batch_size):
        buffer_size = len(self.obs)

        idx = nr.choice(buffer_size,
                        size=min(buffer_size, batch_size),
                        replace=False)
        t = Transitions
        t.obs = torch.stack(list(map(self.obs.__getitem__, idx)))
        t.action = torch.stack(list(map(self.action.__getitem__, idx)))
        t.reward = torch.stack(list(map(self.reward.__getitem__, idx)))
        t.next_obs = torch.stack(list(map(self.next_obs.__getitem__, idx)))
        t.done = torch.tensor(list(map(self.done.__getitem__, idx)))[:, None]
        return t

    def clear(self):
        self.obs = deque([], maxlen=self.replay_buffer_size)
        self.action = deque([], maxlen=self.replay_buffer_size)
        self.reward = deque([], maxlen=self.replay_buffer_size)
        self.next_obs = deque([], maxlen=self.replay_buffer_size)
        self.done = deque([], maxlen=self.replay_buffer_size)


# Q network

In [4]:
class QNetwork(nn.Module):
    def __init__(self,
                 dim_obs: int,
                 dim_action: int,
                 dims_hidden_neurons: Tuple[int] = (64, 64)):
        if not isinstance(dim_obs, int):
            TypeError('dimension of observation must be int')
        if not isinstance(dim_action, int):
            TypeError('dimension of action must be int')
        if not isinstance(dims_hidden_neurons, tuple):
            TypeError('dimensions of hidden neurons must be tuple of int')

        super(QNetwork, self).__init__()
        self.num_layers = len(dims_hidden_neurons)
        self.dim_action = dim_action

        n_neurons = (dim_obs, ) + dims_hidden_neurons + (dim_action, )
        for i, (dim_in, dim_out) in enumerate(zip(n_neurons[:-2], n_neurons[1:-1])):
            layer = nn.Linear(dim_in, dim_out).double()
            torch.nn.init.xavier_uniform_(layer.weight)
            torch.nn.init.zeros_(layer.bias)
            exec('self.layer{} = layer'.format(i + 1))

        self.output = nn.Linear(n_neurons[-2], n_neurons[-1]).double()
        torch.nn.init.xavier_uniform_(self.output.weight)
        torch.nn.init.zeros_(self.output.bias)

    def forward(self, observation: torch.Tensor):
        x = observation.double()
        for i in range(self.num_layers):
            x = eval('torch.tanh(self.layer{}(x))'.format(i + 1))
        return self.output(x)


# DQN agent
The base code are given in this section. The updates of the neural networks are missing and are left out for you to fill. You may refer to the DQN papaer: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

### Functionality of the `update` Method

The `update` method begins by sampling a batch of experiences from the replay buffer. These experiences consist of tuples containing the state (`s`), action (`a`), reward (`r`), next state (`sp`), and done flag (`done`) which indicates whether the episode has terminated. Sampling from a replay buffer instead of using consecutive samples is crucial as it helps to break the correlation between successive training samples, thereby stabilizing the learning process.

Once a batch is sampled, the method proceeds with the core of the Q-learning algorithm. It computes the Q-values for the current states (`s`) and the actions taken (`a`). These Q-values are derived from the primary Q-network (`self.Q`). Simultaneously, it calculates the target Q-values, which are used as the "ground truth" for training the Q-network. The target Q-values are a combination of the reward received for taking the action (`r`) and the discounted maximum Q-value of the next state (`sp`), obtained from the target Q-network (`self.Q_tar`). This target network's parameters are kept frozen most of the time to provide stable targets, and it's updated from the primary network every `C` steps, ensuring that the learning targets do not fluctuate too drastically.

### Loss Calculation and Network Optimization

The difference between the Q-values predicted by the primary network and the calculated target Q-values forms the basis for the loss calculation, typically using Mean Squared Error (MSE) as the loss function. This loss reflects the accuracy of the Q-network's predictions and its ability to estimate future rewards based on its current policy.

Following the loss calculation, backpropagation is used to update the weights of the primary Q-network, thereby improving its predictions. The optimizer's gradients are first reset to prevent accumulation from previous updates, and then the loss is backpropagated to adjust the network weights in a direction that minimizes the loss. 

### Synchronization of Primary and Target Networks

An essential aspect of this update function is the periodic update of the target Q-network (`self.Q_tar`). This update happens every `C` steps and involves directly copying the weights from the primary Q-network to the target Q-network. This periodic update ensures that the target network slowly tracks the primary network, providing slightly delayed, stable target values for training, which is key to the convergence and stability of the learning algorithm.

In summary, the `update` function is a critical method in reinforcement learning that handles the experience replay, calculates and minimizes the loss for the Q-network, and synchronizes the primary and target Q-networks, all of which are pivotal for effective learning and policy improvement in Q-learning algorithms.


In [12]:
class DQN:
    def __init__(self, config):

        torch.manual_seed(config['seed'])

        self.lr = config['lr']  # learning rate
        self.C = config['C']  # copy steps
        self.eps_len = config['eps_len']  # length of epsilon greedy exploration
        self.eps_max = config['eps_max']
        self.eps_min = config['eps_min']
        self.discount = config['discount']  # discount factor
        self.batch_size = config['batch_size']  # mini batch size

        self.dims_hidden_neurons = config['dims_hidden_neurons']
        self.dim_obs = config['dim_obs']
        self.dim_action = config['dim_action']

        self.Q = QNetwork(dim_obs=self.dim_obs,
                          dim_action=self.dim_action,
                          dims_hidden_neurons=self.dims_hidden_neurons)
        self.Q_tar = QNetwork(dim_obs=self.dim_obs,
                              dim_action=self.dim_action,
                              dims_hidden_neurons=self.dims_hidden_neurons)

        self.optimizer_Q = torch.optim.Adam(self.Q.parameters(), lr=self.lr)
        self.training_step = 0

    def update(self, buffer):
        t = buffer.sample(self.batch_size)

        s = t.obs
        a = t.action
        r = t.reward
        sp = t.next_obs
        done = t.done

        self.training_step += 1

        # TODO: perform a single Q network update step. Also update the target Q network every C Q network update steps
        # Q network update step
        # self.Q.train()
        # self.Q_tar.eval()

        a = a.long()
        # Calculate Q values for current states
        Q_current_value = self.Q(s).gather(1, a)
    
        # Compute target Q values
         
        # Q_target_value = r + self.discount * torch.max(self.Q_tar(sp), dim=1)[0]
        Q_target_value = r + self.discount * self.Q_tar(sp).detach().max(1)[0].unsqueeze(1) * (1 - done.float())

        # print(a)
        # print(Q_current_value)
        # print(self.Q_tar(sp))
        # print(Q_target_value)
        # print()
        # print()
        # # Calculate loss
        loss = F.mse_loss(Q_current_value, Q_target_value)
    
        # Optimize the Q network
        self.optimizer_Q.zero_grad()  # clear gradients since PyTorch accumulates them
        loss.backward()
        self.optimizer_Q.step()
    
        # # Update target Q network every C steps
        if self.training_step % self.C == 0:
            self.Q_tar.load_state_dict(self.Q.state_dict())

        return loss.item()
        

    def act_probabilistic(self, observation: torch.Tensor):
        # epsilon greedy:
        first_term = self.eps_max * (self.eps_len - self.training_step) / self.eps_len
        eps = max(first_term, self.eps_min)

        explore = binomial(1, eps)

        if explore == 1:
            a = choice(self.dim_action)
        else:
            self.Q.eval()
            Q = self.Q(observation)
            val, a = torch.max(Q, axis=1)
            a = a.item()
            self.Q.train()
        return a

    def act_deterministic(self, observation: torch.Tensor):
        self.Q.eval()
        Q = self.Q(observation)
        val, a = torch.max(Q, axis=1)
        self.Q.train()
        return a.item()

# Create the environment

In [42]:
env = gym.make('CartPole-v1')
config = {
    'dim_obs': 4,  # Q network input
    'dim_action': 2,  # Q network output
    'dims_hidden_neurons': (64, 64),  # Q network hidden
    'lr': 0.0001,  # learning rate
    'C': 20,  # copy steps
    'discount': 0.99,  # discount factor
    'batch_size': 64,
    'replay_buffer_size': 100000,
    'eps_min': 0.01,
    'eps_max': 1.0,
    'eps_len': 4000,
    'seed': 2,
}


# Create the DQN agent

In [43]:
dqn = DQN(config)
buffer = ReplayBuffer(config)

from datetime import datetime
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_dir = f'tensorboard/dqn/data_{current_time}_lr-{config["lr"]}_batch_size-{config["batch_size"]}_C-{config["C"]}_seed-{config["seed"]}'
train_writer = SummaryWriter(log_dir=log_dir)

# Start training

In [44]:
steps = 0  # total number of steps

for i_episode in range(500):
    observation = env.reset()
    done = False
    truncated = False
    t = 0  # time steps within each episode
    ret = 0.  # episodic return
    while done is False and truncated is False:
        # env.render()  # render to screen, not working for jupyter

        obs = torch.tensor(env.state)  # observe the environment state

        action = dqn.act_probabilistic(obs[None, :])  # take action

        next_obs, reward, done, truncated, info = env.step(action)  # environment advance to next step

        buffer.append_memory(obs=obs,  # put the transition to memory
                             action=torch.from_numpy(np.array([action])),
                             reward=torch.from_numpy(np.array([reward])),
                             next_obs=torch.from_numpy(next_obs),
                             done=done)

        dqn.update(buffer)  # agent learn

        t += 1
        steps += 1
        
        # print(f"Step: {t}, Total Steps: {steps}, Done: {done}, Truncated: {truncated}, Info: {info}")

        ret += reward  # update episodic return
        if done or truncated:
            print("Episode {} finished after {} timesteps".format(i_episode, t+1))
        train_writer.add_scalar('Performance/episodic_return', ret, i_episode)  # plot

env.close()
train_writer.close()

Episode 0 finished after 15 timesteps
Episode 1 finished after 89 timesteps
Episode 2 finished after 12 timesteps
Episode 3 finished after 16 timesteps
Episode 4 finished after 28 timesteps
Episode 5 finished after 15 timesteps
Episode 6 finished after 24 timesteps
Episode 7 finished after 19 timesteps
Episode 8 finished after 19 timesteps
Episode 9 finished after 29 timesteps
Episode 10 finished after 38 timesteps
Episode 11 finished after 23 timesteps
Episode 12 finished after 24 timesteps
Episode 13 finished after 13 timesteps
Episode 14 finished after 14 timesteps
Episode 15 finished after 22 timesteps
Episode 16 finished after 11 timesteps
Episode 17 finished after 12 timesteps
Episode 18 finished after 39 timesteps
Episode 19 finished after 16 timesteps
Episode 20 finished after 13 timesteps
Episode 21 finished after 21 timesteps
Episode 22 finished after 18 timesteps
Episode 23 finished after 17 timesteps
Episode 24 finished after 23 timesteps
Episode 25 finished after 30 timest

# Visualizing

In [45]:
%tensorboard --logdir='tensorboard/dqn'

Reusing TensorBoard on port 6006 (pid 21176), started 3:38:22 ago. (Use '!kill 21176' to kill it.)