In [None]:
!pip install -Uq catalyst==20.11 gym==0.17.3

[K     |████████████████████████████████| 489 kB 9.0 MB/s 
[K     |████████████████████████████████| 1.6 MB 49.0 MB/s 
[K     |████████████████████████████████| 125 kB 64.0 MB/s 
[K     |████████████████████████████████| 182 kB 61.6 MB/s 
[K     |████████████████████████████████| 1.0 MB 65.4 MB/s 
[K     |████████████████████████████████| 62 kB 892 kB/s 
[K     |████████████████████████████████| 1.6 MB 49.7 MB/s 
[?25h  Building wheel for gym (setup.py) ... [?25l[?25hdone


# Seminar. RL, DQN.

Hi! In the first part of the seminar, we are going to introduce one of the main algorithms in the Reinforcement Learning domain. Deep Q-Network (DQN) is the pioneering algorithm that amalgamates Q-Learning and Deep Neural Networks. There is also a short review of gym environments, in which our bots will play games.

In [None]:
from collections import deque, namedtuple
import random
import numpy as np
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from catalyst import dl, utils

In the beginning, look at the algorithm:

![DQN algorithm](https://i.stack.imgur.com/Jnyff.jpg)
There are several differences between the usual DL and RL routines. Our bot is trained on the actions that it has taken in the past. We don't have infinite memory, but we can save some of these transitions in a buffer. Let's code it!

In [None]:
# Device picked by Catalyst's helper (presumably CUDA when available, otherwise CPU — confirm
# against the Catalyst docs). NOTE(review): no other cell visible in this notebook reads `device`.
device = utils.get_device()

In [None]:
import numpy as np
from collections import deque, namedtuple

# A single environment step as it is stored in the replay buffer:
# (state, action, reward, done, next_state).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'done', 'next_state'],
)

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Transitions are 5-tuples ``(state, action, reward, done, next_state)``.
    Once ``capacity`` items are stored, appending a new one drops the oldest.
    """

    def __init__(self, capacity: int):
        # deque(maxlen=...) evicts from the left automatically when it is full.
        self.buffer = deque(maxlen=capacity)

    def append(self, transition: "Transition"):
        """Store one transition, evicting the oldest one if the buffer is full."""
        self.buffer.append(transition)

    def sample(self, size: int):
        """Draw ``size`` transitions uniformly at random.

        Sampling is done without replacement while ``size`` does not exceed the
        number of stored transitions, and with replacement otherwise.

        Args:
            size: number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, dones, next_states)`` of numpy
            arrays with dtypes float32, int64, float32, bool and float32, each
            with ``size`` entries along the first axis.

        Raises:
            ValueError: if the buffer is empty.
        """
        if len(self.buffer) == 0:
            raise ValueError("Cannot sample from an empty replay buffer")

        indices = np.random.choice(
            len(self.buffer),
            size,
            # fall back to sampling with replacement when asked for more than is stored
            replace=size > len(self.buffer),
        )
        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])
        return (
            np.array(states, dtype=np.float32),
            np.array(actions, dtype=np.int64),
            np.array(rewards, dtype=np.float32),
            # builtin ``bool``: the ``np.bool`` alias is deprecated since NumPy 1.20
            # and no longer exists in recent NumPy releases
            np.array(dones, dtype=bool),
            np.array(next_states, dtype=np.float32),
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

To work well with Catalyst train loops, we implement an intermediate abstraction.

In [None]:
from torch.utils.data.dataset import IterableDataset

# RL has no predefined dataset, so the length of an "epoch"
# has to be chosen by hand.
class ReplayDataset(IterableDataset):
    """Iterable dataset that serves ``epoch_size`` transitions sampled from a replay buffer."""

    def __init__(self, buffer: ReplayBuffer, epoch_size: int = int(1e3)):
        self.buffer = buffer
        self.epoch_size = epoch_size

    def __len__(self):
        # The reported length is the configured epoch size, not the buffer size.
        return self.epoch_size

    def __iter__(self):
        # The whole epoch is drawn from the buffer in a single call when iteration
        # starts; the samples are then handed out one transition at a time.
        states, actions, rewards, dones, next_states = \
            self.buffer.sample(self.epoch_size)
        columns = (states, actions, rewards, dones, next_states)
        for row in range(len(dones)):
            yield tuple(column[row] for column in columns)

After creating the buffer, we need to gather state-action-reward transitions and save them in the buffer. We create one function that asks the model for an action, and another function that communicates with the environment.

In [None]:
def get_action(env, network, state, epsilon=-1):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        env: environment; only ``env.action_space.sample()`` is used, to draw
            the random (exploration) action.
        network: callable that maps a float32 tensor holding a batch of one
            state to a tensor of per-action Q-values.
        state: a single observation (array-like).
        epsilon: probability of taking a random action. The default ``-1``
            never explores, i.e. the policy is purely greedy.

    Returns:
        int: index of the selected action.
    """
    if np.random.random() < epsilon:
        return int(env.action_space.sample())

    # Build the input on the device that holds the network weights, so the call
    # also works after the network has been moved to a GPU.
    first_param = (
        next(iter(network.parameters()), None)
        if hasattr(network, "parameters")
        else None
    )
    net_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.as_tensor(
        np.asarray(state), dtype=torch.float32, device=net_device
    ).unsqueeze(0)

    # Action selection never needs gradients; skip building the autograd graph.
    with torch.no_grad():
        q_values = network(batch)[0].cpu().numpy()

    return int(np.argmax(q_values))


def generate_session(
    env, 
    network, 
    t_max=1000, 
    epsilon=-1,
    replay_buffer=None,
):
    """Play a single episode and optionally record its transitions.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, done, info)``.
        network: Q-network handed to ``get_action``.
        t_max: maximum number of environment steps to play.
        epsilon: exploration probability handed to ``get_action``.
        replay_buffer: if given, every transition is appended to it.

    Returns:
        Tuple ``(total_reward, num_steps)``: the sum of rewards and the number
        of environment steps actually taken (``0`` when ``t_max`` is ``0``).
    """
    total_reward = 0
    # Counted explicitly: the loop index is one less than the number of steps
    # taken and is undefined when the loop body never runs.
    num_steps = 0
    state = env.reset()

    for _ in range(t_max):
        action = get_action(env, network, state=state, epsilon=epsilon)
        next_state, reward, done, _info = env.step(action)
        num_steps += 1

        if replay_buffer is not None:
            transition = Transition(
                state, action, reward, done, next_state)
            replay_buffer.append(transition)

        total_reward += reward
        state = next_state
        if done:
            break

    return total_reward, num_steps

def generate_sessions(
    env, 
    network, 
    t_max=1000, 
    epsilon=-1,
    replay_buffer=None,
    num_sessions=100,
):
    """Play ``num_sessions`` episodes back to back and add up their results.

    Every argument except ``num_sessions`` is forwarded unchanged to
    ``generate_session``.

    Returns:
        Tuple ``(reward_total, steps_total)``: the two values returned by
        ``generate_session``, each summed over all episodes (totals, not averages).
    """
    reward_total, steps_total = 0, 0
    for _ in range(num_sessions):
        episode_reward, episode_steps = generate_session(
            env=env,
            network=network,
            t_max=t_max,
            epsilon=epsilon,
            replay_buffer=replay_buffer,
        )
        reward_total = reward_total + episode_reward
        steps_total = steps_total + episode_steps
    return reward_total, steps_total

If we look closely at the algorithm, we'll see that we need two networks. They look the same, but the first one updates its weights by gradient descent, while the second one tracks the first through a moving average of its weights. This helps to stabilize DQN training.

In [None]:
def soft_update(target, source, tau):
    """Move every parameter of ``target`` towards ``source`` by a factor ``tau``.

    Each target parameter becomes ``(1 - tau) * target + tau * source``,
    written in place; ``tau=1.0`` therefore copies ``source`` into ``target``.
    Parameters are paired in the order returned by ``parameters()``.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        dst.data.copy_(blended)

To communicate with the buffer, Catalyst's Runner requires an additional Callback.

In [None]:
class GameCallback(dl.Callback):
    """Catalyst callback that plays the environment around the training loop.

    It pre-fills the replay buffer when a stage starts, plays one exploration
    episode every ``session_period`` batches, and at the end of every epoch
    evaluates the greedy policy and writes the ``train_*`` epoch metrics.

    Args:
        env: environment to play in.
        replay_buffer: buffer that receives the exploration transitions.
        session_period: play one exploration episode every this many global
            batch steps.
        epsilon: initial exploration probability.
        epsilon_k: factor ``epsilon`` is multiplied by at the start of every
            epoch (so the first epoch already runs with ``epsilon * epsilon_k``).
        actor_key: key of the acting network inside ``runner.model``.
        num_start_sessions: number of episodes played at stage start to
            pre-fill the replay buffer.
        num_valid_sessions: number of greedy episodes averaged into the
            ``train_v_reward`` metric at the end of every epoch.
    """

    def __init__(
        self,
        *,
        env,
        replay_buffer,
        session_period,
        epsilon,
        epsilon_k,
        actor_key,
        num_start_sessions=1000,
        num_valid_sessions=100,
    ):
        super().__init__(order=0)
        self.env = env
        self.replay_buffer = replay_buffer
        self.session_period = session_period
        self.epsilon = epsilon
        self.epsilon_k = epsilon_k
        self.actor_key = actor_key
        self.num_start_sessions = num_start_sessions
        self.num_valid_sessions = num_valid_sessions
        # Per-epoch counters; reset in ``on_epoch_start``.
        self.session_counter = 0
        self.session_steps = 0

    def _play(self, play_fn, **kwargs):
        """Run ``play_fn`` with the actor in eval mode, then switch it back to train mode."""
        self.actor.eval()
        try:
            return play_fn(env=self.env, network=self.actor, **kwargs)
        finally:
            self.actor.train()

    def on_stage_start(self, runner: dl.IRunner):
        """Look up the actor network and pre-fill the replay buffer."""
        self.actor = runner.model[self.actor_key]
        self._play(
            generate_sessions,
            epsilon=self.epsilon,
            replay_buffer=self.replay_buffer,
            num_sessions=self.num_start_sessions,
        )

    def on_epoch_start(self, runner: dl.IRunner):
        """Decay epsilon and reset the per-epoch session counters."""
        self.epsilon *= self.epsilon_k
        self.session_counter = 0
        self.session_steps = 0

    def on_batch_end(self, runner: dl.IRunner):
        """Every ``session_period`` batches, play one exploration episode into the buffer."""
        if runner.global_batch_step % self.session_period == 0:
            session_reward, session_steps = self._play(
                generate_session,
                epsilon=self.epsilon,
                replay_buffer=self.replay_buffer,
            )

            self.session_counter += 1
            self.session_steps += session_steps

            runner.batch_metrics.update({"s_reward": session_reward})
            runner.batch_metrics.update({"s_steps": session_steps})

    def on_epoch_end(self, runner: dl.IRunner):
        """Evaluate the greedy policy and publish the epoch metrics."""
        # No epsilon / replay buffer here: generate_sessions' defaults are used,
        # so these episodes are greedy and are not stored.
        valid_rewards, _valid_steps = self._play(
            generate_sessions,
            num_sessions=self.num_valid_sessions,
        )
        valid_rewards /= self.num_valid_sessions

        runner.epoch_metrics["train_num_samples"] = self.session_steps
        # Guard against an epoch in which no environment step was recorded
        # (e.g. ``session_period`` larger than the number of batches):
        # dividing by zero would abort training.
        runner.epoch_metrics["train_updates_per_sample"] = \
            runner.loader_sample_step / max(self.session_steps, 1)
        runner.epoch_metrics["train_v_reward"] = valid_rewards
        runner.epoch_metrics["train_epsilon"] = self.epsilon

In [None]:
class CustomRunner(dl.Runner):
    """Runner that performs the DQN update with a soft-updated target network.

    ``self.model`` is expected to be a mapping with the keys ``"origin"``
    (the trained network) and ``"target"`` (its slowly updated copy).

    Args:
        gamma: discount factor applied to the next-state value.
        tau: smoothing coefficient handed to ``soft_update``.
        tau_period: the target network is updated every ``tau_period``
            global batch steps.
        **kwargs: forwarded to ``dl.Runner``.
    """

    def __init__(self, *, gamma, tau, tau_period=1, **kwargs):
        super().__init__(**kwargs)
        self.gamma, self.tau, self.tau_period = gamma, tau, tau_period

    def on_stage_start(self, runner: dl.IRunner):
        super().on_stage_start(runner)
        # tau=1.0 makes the soft update a plain copy: target starts equal to origin.
        soft_update(self.model["target"], self.model["origin"], 1.0)

    def _handle_batch(self, batch):
        """Compute the TD loss for one batch and, on the train loader, optimize."""
        states, actions, rewards, dones, next_states = batch
        online_net = self.model["origin"]
        frozen_net = self.model["target"]

        # Q(s, a) predicted by the online network for the actions actually taken.
        q_all = online_net(states)
        action_index = actions.unsqueeze(-1)
        q_taken = q_all.gather(1, action_index).squeeze(-1)

        # V(s') = max_a' Q_target(s', a'); for terminal transitions s' does not
        # exist, so the value is zeroed and the target reduces to the reward.
        with torch.no_grad():
            next_q_all = frozen_net(next_states)
            next_values = next_q_all.max(1)[0]
            next_values[dones] = 0.0
            next_values = next_values.detach()

        # Bootstrapped target: r + gamma * V(s').
        td_target = next_values * self.gamma + rewards

        # The criterion is whatever was passed to ``runner.train``.
        loss = self.criterion(q_taken, td_target.detach())
        self.batch_metrics.update({"loss": loss})

        if not self.is_train_loader:
            return

        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

        # Let the target network follow the online one every ``tau_period`` steps.
        if self.global_batch_step % self.tau_period == 0:
            soft_update(frozen_net, online_net, self.tau)

In [None]:
def get_network(env, num_hidden=128):
    """Build the Q-network for ``env``: two hidden ReLU layers plus a linear head.

    The input width is ``env.observation_space.shape[0]`` and the output width
    is ``env.action_space.n`` (one value per action). The hidden stack and the
    head are initialised with different Catalyst init functions.

    Returns:
        ``torch.nn.Sequential`` made of the hidden stack followed by the head.
    """
    init_hidden = utils.get_optimal_inner_init(nn.ReLU)
    init_head = utils.outer_init

    obs_dim = env.observation_space.shape[0]
    body = torch.nn.Sequential(
        nn.Linear(obs_dim, num_hidden),
        nn.ReLU(),
        nn.Linear(num_hidden, num_hidden),
        nn.ReLU(),
    )
    num_actions = env.action_space.n
    head = nn.Linear(num_hidden, num_actions)

    # ``Module.apply`` visits every submodule, so each layer gets initialised.
    body.apply(init_hidden)
    head.apply(init_head)

    return torch.nn.Sequential(body, head)

In [None]:
# data
batch_size = 64
# number of transitions ReplayDataset samples per epoch, i.e. 1000 batches of `batch_size`
epoch_size = int(1e3) * batch_size
# capacity of the replay buffer; older transitions are evicted beyond this
buffer_size = int(1e5)
# runner settings, ~training
gamma = 0.99  # discount factor used in the TD target
tau = 0.01  # smoothing coefficient of the target-network soft update
tau_period = 1 # in batches
# callback, ~exploration
session_period = 100 # in batches
epsilon = 0.98  # initial exploration probability
epsilon_k = 0.9  # epsilon is multiplied by this at the start of every epoch
# optimization
lr = 3e-4

# env_name = "LunarLander-v2"
env_name = "CartPole-v1"
env = gym.make(env_name)
replay_buffer = ReplayBuffer(buffer_size)

# Two independently initialised networks; CustomRunner.on_stage_start copies
# "origin" into "target" before training starts.
network, target_network = get_network(env), get_network(env)
# The target network is only ever changed through soft_update, never by the optimizer.
utils.set_requires_grad(target_network, requires_grad=False)

# Keys must match the ones CustomRunner and GameCallback(actor_key=...) look up.
models = {"origin": network, "target": target_network}
criterion = torch.nn.MSELoss()
# Only the online network's parameters are optimized.
optimizer = torch.optim.Adam(network.parameters(), lr=lr)

# Single "train" loader fed from the replay buffer; there is no validation loader.
loaders = {
    "train": DataLoader(
        ReplayDataset(replay_buffer, epoch_size=epoch_size), 
        batch_size=batch_size,
    ),
}


runner = CustomRunner(
    gamma=gamma, 
    tau=tau,
    tau_period=tau_period,
    
)

# NOTE(review): GameCallback stores the validation reward under "train_v_reward";
# main_metric="v_reward" presumably resolves to it through the loader prefix — confirm
# against the Catalyst version in use.
runner.train(
    model=models,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    logdir="./logs_dqn",
    num_epochs=10,
    verbose=True,
    main_metric="v_reward",
    minimize_metric=False,  # higher reward is better
    load_best_on_end=True,
    callbacks=[
        GameCallback(
            env=env, 
            replay_buffer=replay_buffer, 
            session_period=session_period,
            epsilon=epsilon,
            epsilon_k=epsilon_k,
            actor_key="origin",
        )
    ]
)


Attention, there is only one dataloader - train




2/10 * Epoch (train):  56% 562/1000 [00:25<00:19, 22.46it/s, loss=0.843] 

1/10 * Epoch (train):   0% 0/1000 [00:00<?, ?it/s, loss=1.006][A
1/10 * Epoch (train):   0% 1/1000 [00:00<05:41,  2.93it/s, loss=1.006][A
1/10 * Epoch (train):   0% 1/1000 [00:00<05:41,  2.93it/s, loss=0.985][A
1/10 * Epoch (train):   0% 2/1000 [00:00<05:40,  2.93it/s, loss=0.964][A
1/10 * Epoch (train):   0% 3/1000 [00:00<05:40,  2.93it/s, loss=0.943][A
1/10 * Epoch (train):   0% 4/1000 [00:00<05:40,  2.93it/s, loss=0.938][A
1/10 * Epoch (train):   0% 5/1000 [00:00<05:39,  2.93it/s, loss=0.899][A
1/10 * Epoch (train):   1% 6/1000 [00:00<05:39,  2.93it/s, loss=0.892][A
1/10 * Epoch (train):   1% 7/1000 [00:00<00:51, 19.16it/s, loss=0.892][A
1/10 * Epoch (train):   1% 7/1000 [00:00<00:51, 19.16it/s, loss=0.871][A
1/10 * Epoch (train):   1% 8/1000 [00:00<00:51, 19.16it/s, loss=0.855][A


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations




1/10 * Epoch (train):   1% 9/1000 [00:00<00:51, 19.16it/s, loss=0.844][A
1/10 * Epoch (train):   1% 10/1000 [00:00<00:51, 19.16it/s, loss=0.827][A
1/10 * Epoch (train):   1% 11/1000 [00:00<00:51, 19.16it/s, loss=0.820][A
1/10 * Epoch (train):   1% 12/1000 [00:00<00:35, 27.68it/s, loss=0.820][A
1/10 * Epoch (train):   1% 12/1000 [00:00<00:35, 27.68it/s, loss=0.783][A
1/10 * Epoch (train):   1% 13/1000 [00:00<00:35, 27.68it/s, loss=0.773][A
1/10 * Epoch (train):   1% 14/1000 [00:00<00:35, 27.68it/s, loss=0.743][A
1/10 * Epoch (train):   2% 15/1000 [00:00<00:35, 27.68it/s, loss=0.749][A
1/10 * Epoch (train):   2% 16/1000 [00:00<00:35, 27.68it/s, loss=0.752][A
1/10 * Epoch (train):   2% 17/1000 [00:00<00:35, 27.68it/s, loss=0.673][A
1/10 * Epoch (train):   2% 18/1000 [00:00<00:26, 36.66it/s, loss=0.673][A
1/10 * Epoch (train):   2% 18/1000 [00:00<00:26, 36.66it/s, loss=0.646][A
1/10 * Epoch (train):   2% 19/1000 [00:00<00:26, 36.66it/s, loss=0.691][A
1/10 * Epoch (train):   2

INFO:metrics_logger:
1/10 * Epoch 1 (train): epsilon=0.8820 | loss=0.6106 | num_samples=190.0000 | s_reward=20.0000 | s_steps=19.0000 | updates_per_sample=336.8421 | v_reward=316.0900


2/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 113.06it/s, loss=6.057, s_reward=16.000, s_steps=15.000]
[2022-08-01 16:54:46,883] 
2/10 * Epoch 2 (train): epsilon=0.7938 | loss=1.9850 | num_samples=393.0000 | s_reward=40.3000 | s_steps=39.3000 | updates_per_sample=162.8499 | v_reward=325.1800


INFO:metrics_logger:
2/10 * Epoch 2 (train): epsilon=0.7938 | loss=1.9850 | num_samples=393.0000 | s_reward=40.3000 | s_steps=39.3000 | updates_per_sample=162.8499 | v_reward=325.1800


3/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 115.08it/s, loss=2.018, s_reward=26.000, s_steps=25.000]
[2022-08-01 16:54:59,656] 
3/10 * Epoch 3 (train): epsilon=0.7144 | loss=4.6930 | num_samples=386.0000 | s_reward=39.6000 | s_steps=38.6000 | updates_per_sample=165.8031 | v_reward=309.9000


INFO:metrics_logger:
3/10 * Epoch 3 (train): epsilon=0.7144 | loss=4.6930 | num_samples=386.0000 | s_reward=39.6000 | s_steps=38.6000 | updates_per_sample=165.8031 | v_reward=309.9000


4/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 116.90it/s, loss=3.551, s_reward=31.000, s_steps=30.000]
[2022-08-01 16:55:11,831] 
4/10 * Epoch 4 (train): epsilon=0.6430 | loss=9.2790 | num_samples=603.0000 | s_reward=61.3000 | s_steps=60.3000 | updates_per_sample=106.1360 | v_reward=282.4200


INFO:metrics_logger:
4/10 * Epoch 4 (train): epsilon=0.6430 | loss=9.2790 | num_samples=603.0000 | s_reward=61.3000 | s_steps=60.3000 | updates_per_sample=106.1360 | v_reward=282.4200


5/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 117.55it/s, loss=6.013, s_reward=280.000, s_steps=279.000]
[2022-08-01 16:55:24,120] 
5/10 * Epoch 5 (train): epsilon=0.5787 | loss=16.5269 | num_samples=1037.0000 | s_reward=104.7000 | s_steps=103.7000 | updates_per_sample=61.7165 | v_reward=283.6400


INFO:metrics_logger:
5/10 * Epoch 5 (train): epsilon=0.5787 | loss=16.5269 | num_samples=1037.0000 | s_reward=104.7000 | s_steps=103.7000 | updates_per_sample=61.7165 | v_reward=283.6400


6/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 112.21it/s, loss=22.876, s_reward=97.000, s_steps=96.000]
[2022-08-01 16:55:37,411] 
6/10 * Epoch 6 (train): epsilon=0.5208 | loss=25.1538 | num_samples=720.0000 | s_reward=73.0000 | s_steps=72.0000 | updates_per_sample=88.8889 | v_reward=265.8100


INFO:metrics_logger:
6/10 * Epoch 6 (train): epsilon=0.5208 | loss=25.1538 | num_samples=720.0000 | s_reward=73.0000 | s_steps=72.0000 | updates_per_sample=88.8889 | v_reward=265.8100


7/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 115.15it/s, loss=17.042, s_reward=169.000, s_steps=168.000]
[2022-08-01 16:55:49,526] 
7/10 * Epoch 7 (train): epsilon=0.4687 | loss=38.8087 | num_samples=1386.0000 | s_reward=139.6000 | s_steps=138.6000 | updates_per_sample=46.1760 | v_reward=270.4800


INFO:metrics_logger:
7/10 * Epoch 7 (train): epsilon=0.4687 | loss=38.8087 | num_samples=1386.0000 | s_reward=139.6000 | s_steps=138.6000 | updates_per_sample=46.1760 | v_reward=270.4800


8/10 * Epoch (train): 100% 1000/1000 [00:09<00:00, 110.81it/s, loss=101.460, s_reward=183.000, s_steps=182.000]
[2022-08-01 16:56:02,248] 
8/10 * Epoch 8 (train): epsilon=0.4219 | loss=58.9526 | num_samples=1729.0000 | s_reward=173.9000 | s_steps=172.9000 | updates_per_sample=37.0156 | v_reward=273.4900


INFO:metrics_logger:
8/10 * Epoch 8 (train): epsilon=0.4219 | loss=58.9526 | num_samples=1729.0000 | s_reward=173.9000 | s_steps=172.9000 | updates_per_sample=37.0156 | v_reward=273.4900


9/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 115.73it/s, loss=237.098, s_reward=40.000, s_steps=39.000]
[2022-08-01 16:56:14,666] 
9/10 * Epoch 9 (train): epsilon=0.3797 | loss=72.0629 | num_samples=1868.0000 | s_reward=187.8000 | s_steps=186.8000 | updates_per_sample=34.2612 | v_reward=283.5700


INFO:metrics_logger:
9/10 * Epoch 9 (train): epsilon=0.3797 | loss=72.0629 | num_samples=1868.0000 | s_reward=187.8000 | s_steps=186.8000 | updates_per_sample=34.2612 | v_reward=283.5700


10/10 * Epoch (train): 100% 1000/1000 [00:08<00:00, 113.99it/s, loss=89.208, s_reward=192.000, s_steps=191.000]
[2022-08-01 16:56:27,006] 
10/10 * Epoch 10 (train): epsilon=0.3417 | loss=92.2757 | num_samples=2071.0000 | s_reward=208.1000 | s_steps=207.1000 | updates_per_sample=30.9029 | v_reward=271.0400


INFO:metrics_logger:
10/10 * Epoch 10 (train): epsilon=0.3417 | loss=92.2757 | num_samples=2071.0000 | s_reward=208.1000 | s_steps=207.1000 | updates_per_sample=30.9029 | v_reward=271.0400


Top best models:
logs_dqn/checkpoints/train.2.pth	325.1800
=> Loading checkpoint logs_dqn/checkpoints/best_full.pth
loaded state checkpoint logs_dqn/checkpoints/best_full.pth (global epoch 2, epoch 2, stage train)


In [None]:
# Save the weights of the target network (the soft-updated copy, not "origin") to 'res.pth'.
torch.save(target_network.state_dict(), 'res.pth')

And we can watch how our model plays in the games!

\* to run cells below, you should update your python environment. Instruction depends on your system specification.

In [None]:
# record sessions
# `Monitor` is the gym (0.17.x) environment wrapper that records episodes into
# `directory`. `VideoRecorder` is not an environment wrapper (it has no
# reset/step and takes no `directory`/`force` arguments), so it cannot be
# handed to `generate_sessions`.
from gym.wrappers import Monitor

env = Monitor(
    gym.make(env_name),
    directory="videos_dqn",
    force=True)
try:
    generate_sessions(
        env=env,
        network=runner.model["origin"],
        num_sessions=100
    )
finally:
    # Close even if a session fails, so the recorded video files are finalised.
    env.close()

NameError: ignored

In [None]:
# show video
from IPython.display import HTML
import os

# Must be the directory the recording cell writes to.
video_dir = "./videos_dqn/"

# os.listdir returns names in arbitrary order; sort so that [-1] is deterministic.
video_names = sorted(
    filter(lambda s: s.endswith(".mp4"), os.listdir(video_dir)))
if not video_names:
    raise FileNotFoundError(
        "No .mp4 files found in {}; run the recording cell first".format(video_dir))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format(video_dir + video_names[-1]))  # last name in sorted order; try other indices for other episodes