# Deep Q-network Practice

If you run this notebook in a local Jupyter environment instead of Google Colab, set

```
colab = False
```

In [1]:
colab = True
if colab:
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setuptools
  Downloading setuptools-67.8.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 67.7.2
    Uninstalling setuptools-67.7.2:
      Successfully uninstalled setuptools-67.7.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0mSuccessfully installed setuptools-67.8.0


In [2]:
if colab:
    # Mount Google Drive so the notebook can reach the helper modules imported later
    # (buffer.py, utils.py, schedule.py) and persist snapshots / videos.
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE: this path is specific to the original author's Drive layout; change it to the
    # folder that contains this notebook and its helper modules.
    %cd /content/drive/MyDrive/Colab Notebooks/양인순 교수님/day2/day2_dqn
    # List the working directory to confirm the helper files are present.
    !ls

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/양인순 교수님/day2/day2_dqn
buffer.py	dqn.ipynb	 __pycache__  snapshots
chap4_dqn.pdf	learning_curves  schedule.py  utils.py
dqn_full.ipynb	plot.ipynb	 setup.ipynb  video


# -1. Introduction to Gym environment

## -1.1 Prerequisites

# 0. Define the Q-network (critic)

In [3]:
import torch
import torch.nn as nn
from torch.nn import MSELoss
import torch.nn.functional as F
import copy
import os
import csv
import numpy as np
import torch
from torch.optim import Adam
from buffer import ReplayBuffer
from utils import save_snapshot, recover_snapshot, load_model
from schedule import LinearSchedule
import gym

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('current device =', device)

current device = cuda


In [5]:
class Critic(nn.Module):
    """Q-network: a multi-layer perceptron with two ReLU hidden layers.

    Given a state (or a batch of states) it returns one value per discrete
    action, i.e. the vector Q(s, .) of length ``num_action``.
    """

    def __init__(self, state_dim, num_action, hidden_size1, hidden_size2):
        super().__init__()
        # Layer widths: state_dim -> hidden_size1 -> hidden_size2 -> num_action.
        self.fc1 = nn.Linear(state_dim, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        # TODO_1 (solved): output layer with one unit per action.
        self.fc3 = nn.Linear(hidden_size2, num_action)

    def forward(self, state):
        """Return the Q-values of every action for ``state``."""
        hidden = torch.relu(self.fc1(state))
        hidden = torch.relu(self.fc2(hidden))
        # No activation on the output: Q-values are unbounded real numbers.
        return self.fc3(hidden)

  and should_run_async(code)


# 1. Define DQN agent

In [6]:
class DQNAgent:
    """DQN agent: owns the Q-network and selects actions epsilon-greedily."""

    def __init__(self, obs_dim, num_act, hidden1, hidden2):
        """Build the Q-network.

        Args:
            obs_dim: length of the observation vector fed to the network.
            num_act: number of discrete actions (network output size).
            hidden1: width of the first hidden layer.
            hidden2: width of the second hidden layer.
        """
        self.obs_dim = obs_dim
        self.num_act = num_act
        # Q-network, moved to the notebook-level `device`.
        self.critic = Critic(obs_dim, num_act, hidden1, hidden2).to(device)

    def act(self, state, epsilon=0.0):
        """Pick an action with the epsilon-greedy rule.

        Args:
            state: a single observation with ``obs_dim`` elements.
            epsilon: probability of taking a random action; the default 0.0
                gives purely greedy behaviour.

        Returns:
            The chosen action index as a Python ``int`` in ``[0, num_act)``.
        """
        if np.random.rand() < epsilon:
            # TODO_2 (solved): explore by drawing from A = {0, 1, ..., num_act - 1}.
            return int(np.random.randint(self.num_act))

        # Greedy selection.
        self.critic.eval()
        # Action selection never needs gradients, so skip building the autograd graph.
        with torch.no_grad():
            s = torch.as_tensor(state, dtype=torch.float32, device=device).reshape(1, self.obs_dim)
            # TODO_3 (solved): Q-values of state s from the critic network.
            q = self.critic(s)
        return int(np.argmax(q.cpu().numpy()))

  and should_run_async(code)


# 2. Implement the one-step parameter update

In [7]:
def update(agent, replay_buf, gamma, critic_optim, target_critic, tau, batch_size):
    """Take one DQN gradient step on the critic, then softly update the target network.

    Args:
        agent: object whose ``critic`` attribute is the Q-network being trained.
        replay_buf: buffer exposing ``sample_batch(batch_size)``; the returned mapping
            must provide the keys 'state', 'action', 'reward', 'next_state' and 'done'.
        gamma: discount factor applied to the bootstrapped next-state value.
        critic_optim: torch optimizer over the parameters of ``agent.critic``.
        target_critic: target Q-network used to evaluate the next states.
        tau: soft-update coefficient, target <- (1 - tau) * target + tau * critic.
        batch_size: number of transitions to sample.

    Returns:
        None. ``agent.critic``, ``critic_optim`` and ``target_critic`` are modified in place.
    """
    agent.critic.train()

    # Put the batch on the device the critic lives on, so this function does not
    # depend on a notebook-level global.
    dev = next(agent.critic.parameters()).device

    batch = replay_buf.sample_batch(batch_size)

    # Unroll the batch and build the regression target; none of this needs gradients.
    with torch.no_grad():
        observations = torch.as_tensor(batch['state'], dtype=torch.float32, device=dev)
        next_observations = torch.as_tensor(batch['next_state'], dtype=torch.float32, device=dev)
        # Force column vectors (B, 1): they must line up with the (B, 1) outputs of
        # gather / max below. A flat (B,) array would otherwise broadcast to (B, B).
        actions = torch.as_tensor(batch['action'], dtype=torch.long, device=dev).reshape(-1, 1)
        rewards = torch.as_tensor(batch['reward'], dtype=torch.float32, device=dev).reshape(-1, 1)
        terminals = torch.as_tensor(batch['done'], dtype=torch.float32, device=dev).reshape(-1, 1)

        mask = 1.0 - terminals    # If done, mask = 0. If not done, mask = 1.

        # max_a' Q_target(s', a'), kept as a column.
        next_q = target_critic(next_observations).max(1, keepdim=True)[0]
        next_q = mask * next_q    # Terminal transitions do not bootstrap from s'.

        # TODO_4 (solved): target = r + gamma * max_a' Q_target(s', a').
        target = rewards + gamma * next_q

    # Q(s, a) for the actions that were actually taken.
    out = agent.critic(observations).gather(1, actions)

    # TODO_5 (solved): mean-squared TD error.
    loss_ftn = MSELoss()
    loss = loss_ftn(out, target)

    critic_optim.zero_grad()
    loss.backward()
    critic_optim.step()

    # TODO_6 (solved): soft target update. Done under no_grad so the arithmetic on the
    # critic's (grad-requiring) parameters is not recorded by autograd.
    with torch.no_grad():
        for p, targ_p in zip(agent.critic.parameters(), target_critic.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)

  and should_run_async(code)


In [14]:
def evaluate(agent, env, num_episodes=5):
    """Return the average undiscounted episode return of ``agent`` on ``env``.

    Each episode starts with ``env.reset()`` and follows ``agent.act(obs)``
    (called with its default arguments) until the environment reports ``done``.

    Args:
        agent: object exposing ``act(obs)``.
        env: environment with ``reset()`` returning an observation and
            ``step(action)`` returning ``(obs, reward, done, info)``.
        num_episodes: number of episodes to average over.
    """
    episode_returns = []

    for _ in range(num_episodes):
        obs = env.reset()
        episode_return = 0.
        finished = False

        while not finished:
            obs, reward, finished, _info = env.step(agent.act(obs))
            episode_return += reward

        episode_returns.append(episode_return)

    # Same left-to-right float accumulation as a running total started at 0.
    return sum(episode_returns, 0.) / num_episodes

# 3. Combining these, we finally have...

In [15]:
def train(agent, env, gamma,
          lr, tau,
          ep_len, num_updates, batch_size,
          init_buffer=5000, buffer_size=100000,
          start_train=2000, train_interval=50,
          eval_interval=2000, snapshot_interval=10000,
          path=None):
    """Train ``agent`` with DQN, logging evaluation scores and saving snapshots.

    The loop counter ``t`` counts environment steps. Every ``train_interval`` steps
    (once ``t > start_train``) the critic receives ``train_interval`` calls to ``update``.

    Args:
        agent: agent exposing ``critic`` (the Q-network) and ``act(obs, epsilon=...)``.
        env: environment used to collect experience; a deep copy of it is used for evaluation.
        gamma: discount factor forwarded to ``update``.
        lr: learning rate of the Adam optimizer created for ``agent.critic``.
        tau: soft target-update coefficient forwarded to ``update``.
        ep_len: maximum number of steps per training episode. When it is reached the
            environment is reset and that last transition is stored as non-terminal.
        num_updates: index of the last step; the main loop runs ``num_updates + 1`` iterations.
        batch_size: batch size forwarded to ``update``.
        init_buffer: for ``t < init_buffer`` actions come from ``env.action_space.sample()``.
        buffer_size: size argument passed to ``ReplayBuffer``.
        start_train: gradient updates only happen for ``t > start_train``; also the
            ``begin_t`` of the exploration schedule.
        train_interval: number of steps between update bursts, and the burst length.
        eval_interval: evaluate (5 episodes) and log every this many steps.
        snapshot_interval: call ``save_snapshot`` every this many steps.
        path: if not None, passed to ``recover_snapshot`` before training starts
            (presumably restores networks and optimizer state; see utils.py).

    Side effects:
        Creates './snapshots/' and './learning_curves/', overwrites
        './learning_curves/res.csv' with rows ``[t, avg_score]``, and prints each score.
    """
    target_critic = copy.deepcopy(agent.critic)

    # Independent copy of the environment so evaluation never disturbs the training episode.
    test_env = copy.deepcopy(env)

    # Freeze the target network; it is only ever changed by the soft update in `update`.
    for p in target_critic.parameters():
        p.requires_grad_(False)

    critic_optim = Adam(agent.critic.parameters(), lr=lr)

    if path is not None:
        # Load snapshot.
        recover_snapshot(path, agent.critic,
                         target_critic, critic_optim,
                         device=device
                        )

    obs_dim = env.observation_space.shape[0]
    replay_buf = ReplayBuffer(obs_dim, buffer_size)

    max_epsilon = 1.
    min_epsilon = 0.02
    # Epsilon goes from max_epsilon at start_train to min_epsilon at num_updates
    # (presumably interpolated linearly; see schedule.py).
    exploration_schedule = LinearSchedule(begin_t=start_train,
                                          end_t=num_updates,
                                          begin_value=max_epsilon,
                                          end_value=min_epsilon
                                         )
    save_path = './snapshots/'
    os.makedirs(save_path, exist_ok=True)
    os.makedirs('./learning_curves/', exist_ok=True)

    # The context manager guarantees the log is closed (and its rows written out)
    # even if training is interrupted or raises.
    with open('./learning_curves/res.csv',
              'w',
              encoding='utf-8',
              newline=''
             ) as log_file:
        logger = csv.writer(log_file)

        # main loop
        obs = env.reset()
        step_count = 0

        for t in range(num_updates + 1):
            if t < init_buffer:
                # TODO_7 (solved): random actions until enough samples are collected.
                action = env.action_space.sample()
            else:
                # TODO_8 (solved): epsilon-greedy action (exploration & exploitation).
                epsilon = exploration_schedule(t)
                action = agent.act(obs, epsilon=epsilon)

            next_obs, rew, done, _ = env.step(action)

            step_count += 1
            timed_out = step_count == ep_len
            if timed_out:
                # The episode is being cut by the step limit, so next_obs is stored as
                # non-terminal and the target keeps bootstrapping from it.
                done = False

            # TODO_9 (solved): append the experience to the replay buffer.
            replay_buf.append(obs, action, next_obs, rew, done)

            obs = next_obs

            if done or timed_out:
                # Start a new episode after a terminal state or once the step limit is hit.
                obs = env.reset()
                step_count = 0

            if t % eval_interval == 0:
                avg_score = evaluate(agent, test_env, num_episodes=5)
                print('[iter {}] average score = {} (over 5 episodes)'.format(t, avg_score))
                logger.writerow([t, avg_score])
                # Flush so the learning curve on disk is current while training is still running.
                log_file.flush()

            if t % snapshot_interval == 0:
                snapshot_path = save_path + 'iter{}_'.format(t)
                # save weight & training progress
                save_snapshot(snapshot_path, agent.critic, target_critic, critic_optim)

            if t > start_train and t % train_interval == 0:
                # Start training only after a fixed number of steps; this may mitigate
                # overfitting to the few samples collected at the very beginning.
                for _ in range(train_interval):
                    update(agent, replay_buf, gamma, critic_optim, target_critic, tau, batch_size)

# 4. Let's train our agent!

In [16]:
# Create the CartPole-v1 environment and read the two sizes needed to build the agent.
env = gym.make('CartPole-v1')
obs_dim = env.observation_space.shape[0]  # first dimension of the observation space's shape
num_act = env.action_space.n  # size of the discrete action set

print('observation space dim. : {} / # actions : {}'.format(obs_dim, num_act))

observation space dim. : 4 / # actions : 2


In [17]:
import gym
from gym.wrappers.record_video import RecordVideo
import os
from IPython.display import HTML
from base64 import b64encode

In [18]:
# TODO_10 : Define your own agent with arbitrary values of hidden1, hidden2.
# Here both hidden layers of the Q-network are given 256 units.
agent = DQNAgent(obs_dim = obs_dim, num_act = num_act, hidden1 = 256, hidden2 = 256)

In [19]:
# Hyperparameters passed to train() below.
gamma = 0.99  # discount factor in the target r + gamma * max_a' Q_target(s', a')
lr = 1e-3  # learning rate of the Adam optimizer for the critic
tau = 1e-3  # soft target-update coefficient
ep_len = 500  # step limit per training episode before the environment is reset
num_updates = 100000  # the main loop in train() runs num_updates + 1 environment steps
batch_size = 128  # number of transitions sampled from the replay buffer per update

In [20]:
# Train from scratch (path=None, so no snapshot is restored).
# The agent is evaluated and a snapshot is saved every 2000 steps.
train(agent, env, gamma, lr, tau,
      ep_len, num_updates, batch_size,
      init_buffer=5000, buffer_size=100000,
      start_train=2000, train_interval=50,
      eval_interval=2000, snapshot_interval=2000, path=None)

[iter 0] average score = 9.4 (over 5 episodes)
[iter 2000] average score = 9.4 (over 5 episodes)
[iter 4000] average score = 61.8 (over 5 episodes)
[iter 6000] average score = 65.6 (over 5 episodes)
[iter 8000] average score = 37.8 (over 5 episodes)
[iter 10000] average score = 123.0 (over 5 episodes)
[iter 12000] average score = 162.6 (over 5 episodes)
[iter 14000] average score = 130.2 (over 5 episodes)
[iter 16000] average score = 100.2 (over 5 episodes)
[iter 18000] average score = 90.2 (over 5 episodes)
[iter 20000] average score = 233.2 (over 5 episodes)
[iter 22000] average score = 153.2 (over 5 episodes)
[iter 24000] average score = 180.4 (over 5 episodes)
[iter 26000] average score = 75.8 (over 5 episodes)
[iter 28000] average score = 120.6 (over 5 episodes)
[iter 30000] average score = 120.6 (over 5 episodes)
[iter 32000] average score = 163.6 (over 5 episodes)
[iter 34000] average score = 91.4 (over 5 episodes)
[iter 36000] average score = 171.6 (over 5 episodes)
[iter 38000

# 5. Watch the trained agent!

In [21]:
# Install gym's `classic_control` extra, which pulls in pygame (see the pip log below);
# presumably needed to render CartPole frames for the video — TODO confirm.
!pip install gym[classic_control]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0 (from gym[classic_control])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.3.0
    Uninstalling pygame-2.3.0:
      Successfully uninstalled pygame-2.3.0
Successfully installed pygame-2.1.0


In [22]:
# Roll out one greedy episode with the trained agent, record it, and embed the video.
env = gym.make('CartPole-v1')

os.makedirs('./video',exist_ok=True)

env = RecordVideo(env=env,video_folder='./video')

obs = env.reset()

env.start_video_recorder()
done = False
score = 0.
# NOTE(review): train() in this notebook saves snapshots as './snapshots/iter{t}_...';
# make sure 'trained.pth.tar' exists (e.g. a provided or renamed checkpoint) before running.
load_model(agent, path='./snapshots/trained.pth.tar', device=device)
while not done:
    env.render()
    # agent.act uses its default epsilon=0.0 here, i.e. purely greedy actions.
    obs, rew, done, _ = env.step(agent.act(obs))
    score += rew

env.close_video_recorder()
print('score : ', score)
# Read the recording through a context manager so the file handle is closed right away.
with open('./video/rl-video-episode-0.mp4','rb') as video_file:
    mp4 = video_file.read()
# Inline the video as a base64 data URL so it plays inside the notebook output.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


loading pre-trained weight...


If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


score :  465.0
