In [None]:
!pip install ufal.pybox2d



In [None]:
import plotly
import plotly.express as px

# DQN

We set up the approximator $Q^\theta$ with an initial parameter vector $\theta$ and an initial exploration probability $\varepsilon = 1$.

For each episode $k$ do:

While episode not done:

- Being in state $S_t$, we take action $A_t \sim \pi(\cdot|S_t)$, where $\pi = \varepsilon\text{-greedy}(Q^\theta)$, receive reward $R_t$ and move to state $S_{t+1}$. Save $(S_t,A_t,R_t,S_{t+1}) \rightarrow Memory$


- Take $\{(s_i,a_i,r_i,s'_i)\}_{i=1}^{n} \leftarrow Memory$, obtain targets:

$$
y_i =
\left\{
\begin{array}{ll}
r_i, &\text{ if } s'_i \text{ is a terminal state},\\[0.0cm]
 r_i + \gamma \max\limits_{a'} Q^\theta(s'_i,a'), &\text{ otherwise}
\end{array}
\right.
$$

Compute the loss function $Loss(\theta) = \frac{1}{n}\sum\limits_{i=1}^n \big(y_i - Q^\theta(s_i,a_i)\big)^2$
and update the parameter vector:

$$
\theta \leftarrow \theta - \alpha \nabla_\theta Loss(\theta)
$$

- Decrease $\varepsilon$

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn

class Qfunction(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The network is two hidden layers of 64 units with ReLU activations,
    followed by a linear output layer with `action_dim` units.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of discrete actions (size of the output).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_dim = 64
        # Layers are created input-to-output so that the parameter order
        # (and therefore the state_dict layout) stays linear_1, linear_2, linear_3.
        self.linear_1 = nn.Linear(state_dim, hidden_dim)
        self.linear_2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear_3 = nn.Linear(hidden_dim, action_dim)
        self.activation = nn.ReLU()

    def forward(self, states):
        """Return the Q-values of every action for the given state(s).

        Args:
            states: Tensor whose last dimension has size `state_dim`.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size `action_dim`.
        """
        first_hidden = self.activation(self.linear_1(states))
        second_hidden = self.activation(self.linear_2(first_hidden))
        return self.linear_3(second_hidden)

In [None]:
class DQN:
    """Deep Q-learning agent with an epsilon-greedy policy and experience replay.

    A single network (`q_function`) is used both for acting and for building
    the bootstrapped targets; there is no separate target network.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        batch_size: Number of transitions sampled from memory per update.
        epsilon_decrease: Amount subtracted from epsilon after every update.
        epilon_min: Lower bound for epsilon (the misspelled name is kept so
            existing keyword callers keep working).
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.01, epilon_min=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function = Qfunction(self.state_dim, self.action_dim)
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epilon_min = epilon_min
        # Replay memory of [state, action, reward, done, next_state]; it is never trimmed.
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function.parameters(), lr=lr)

    @staticmethod
    def _batch_to_tensors(batch):
        """Stack a list of transitions into (states, actions, rewards, dones, next_states) tensors."""
        states, actions, rewards, dones, next_states = zip(*batch)
        # Going through np.array avoids the slow tensor-from-list-of-ndarrays path
        # and pins the dtypes regardless of what the environment returned.
        return (
            torch.as_tensor(np.array(states), dtype=torch.float32),
            torch.as_tensor(np.array(actions), dtype=torch.int64),
            torch.as_tensor(np.array(rewards), dtype=torch.float32),
            torch.as_tensor(np.array(dones), dtype=torch.float32),
            torch.as_tensor(np.array(next_states), dtype=torch.float32),
        )

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy for a single state.

        Args:
            state: State vector (array-like of length `state_dim`).

        Returns:
            Index of the chosen action, as returned by `np.random.choice`.
        """
        # Acting never needs gradients, so no autograd graph is built here.
        with torch.no_grad():
            q_values = self.q_function(torch.as_tensor(state, dtype=torch.float32))
        argmax_action = int(torch.argmax(q_values))
        # Every action gets epsilon / action_dim; the greedy one gets the remaining mass.
        probs = np.full(self.action_dim, self.epsilon / self.action_dim)
        probs[argmax_action] += 1 - self.epsilon
        return np.random.choice(np.arange(self.action_dim), p=probs)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once memory exceeds a batch, do one gradient step.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            done: Whether `next_state` is terminal; terminal transitions do not bootstrap.
            next_state: State reached after the action.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = self._batch_to_tensors(batch)

            # Targets are constants with respect to the loss, so compute them without a graph.
            with torch.no_grad():
                next_q_max = torch.max(self.q_function(next_states), dim=1).values
                targets = rewards + self.gamma * (1 - dones) * next_q_max
            q_values = self.q_function(states)[torch.arange(self.batch_size), actions]

            loss = torch.mean((q_values - targets) ** 2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Clamp so that epsilon never drops below its floor (or below zero).
            self.epsilon = max(self.epilon_min, self.epsilon - self.epsilon_decrease)

In [None]:
import gym
# Run settings: number of episodes and the cap on steps per episode.
episode_n, t_max = 500, 500

# Environment and agent; the network sizes are read from the environment spaces.
env = gym.make('LunarLander-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN(state_dim, action_dim)

# Total (undiscounted) reward of every episode, in order.
rewards_DQN = []

for episode in range(episode_n):
    state, total_reward = env.reset(), 0

    for t in range(t_max):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # Store the transition and let the agent learn before moving on.
        agent.fit(state, action, reward, done, next_state)
        total_reward += reward
        state = next_state

        if done:
            break

    rewards_DQN.append(total_reward)
    print(f'episode: {episode}, total_reward: {total_reward}')

  deprecation(
  deprecation(


episode: 0, total_reward: -84.1259642937659
episode: 1, total_reward: -491.2320308458567
episode: 2, total_reward: -501.3555471175147
episode: 3, total_reward: -379.44638760286585
episode: 4, total_reward: -284.94163137576726
episode: 5, total_reward: -246.88979556653837
episode: 6, total_reward: -154.19232670360572
episode: 7, total_reward: -33.35953973238743
episode: 8, total_reward: -161.34901282150375
episode: 9, total_reward: -42.261304297145784
episode: 10, total_reward: -186.57785072914623
episode: 11, total_reward: -120.78421633077963
episode: 12, total_reward: -271.96141654871525
episode: 13, total_reward: -393.1022819024783
episode: 14, total_reward: -251.07657729052903
episode: 15, total_reward: -84.8583695942867
episode: 16, total_reward: -168.15288348226807
episode: 17, total_reward: -138.1185697229489
episode: 18, total_reward: 28.50779037452744
episode: 19, total_reward: 23.93757505524393
episode: 20, total_reward: -444.9791639014196
episode: 21, total_reward: -146.15687

# DQN with Hard Target Update

* Initialize the target parameters: $\theta' = \theta$

* Do a lot of iterations:

    * $y = r + \gamma \underset{a'}{max} Q^{\theta'} (s', a')$
    * $Loss(\theta) = (y - Q^{\theta} (s, a))^2$
    * $\theta \leftarrow \theta - \alpha \nabla_{\theta} Loss(\theta) $

* Update $\theta' = \theta$

In [None]:
class DQN_hard_target:
    """DQN agent whose targets come from a periodically synchronised target network.

    `q_function_in` is the online network that is trained and used for acting.
    `q_function_out` is the target network: it is frozen between hard updates
    and only used to compute the bootstrapped targets.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        batch_size: Number of transitions sampled from memory per update.
        epsilon_decrease: Amount subtracted from epsilon after every update.
        epilon_min: Lower bound for epsilon (the misspelled name is kept so
            existing keyword callers keep working).
        target_update_period: The target network is overwritten with the online
            weights whenever the step counter passed to `fit` is a multiple of
            this value.

    Raises:
        ValueError: If `target_update_period` is smaller than 1.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.01, epilon_min=0.01, target_update_period=10):
        if target_update_period < 1:
            raise ValueError("target_update_period must be >= 1")
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function_in = Qfunction(self.state_dim, self.action_dim)
        self.q_function_out = Qfunction(self.state_dim, self.action_dim)
        # Start with theta' = theta so the first targets come from the same weights.
        self.q_function_out.load_state_dict(self.q_function_in.state_dict())
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epilon_min = epilon_min
        self.target_update_period = target_update_period
        # Replay memory of [state, action, reward, done, next_state]; it is never trimmed.
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function_in.parameters(), lr=lr)

    @staticmethod
    def _batch_to_tensors(batch):
        """Stack a list of transitions into (states, actions, rewards, dones, next_states) tensors."""
        states, actions, rewards, dones, next_states = zip(*batch)
        # Going through np.array avoids the slow tensor-from-list-of-ndarrays path
        # and pins the dtypes regardless of what the environment returned.
        return (
            torch.as_tensor(np.array(states), dtype=torch.float32),
            torch.as_tensor(np.array(actions), dtype=torch.int64),
            torch.as_tensor(np.array(rewards), dtype=torch.float32),
            torch.as_tensor(np.array(dones), dtype=torch.float32),
            torch.as_tensor(np.array(next_states), dtype=torch.float32),
        )

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy of the online network.

        Args:
            state: State vector (array-like of length `state_dim`).

        Returns:
            Index of the chosen action, as returned by `np.random.choice`.
        """
        # Acting never needs gradients, so no autograd graph is built here.
        with torch.no_grad():
            q_values = self.q_function_in(torch.as_tensor(state, dtype=torch.float32))
        argmax_action = int(torch.argmax(q_values))
        # Every action gets epsilon / action_dim; the greedy one gets the remaining mass.
        probs = np.full(self.action_dim, self.epsilon / self.action_dim)
        probs[argmax_action] += 1 - self.epsilon
        return np.random.choice(np.arange(self.action_dim), p=probs)

    def fit(self, state, action, reward, done, next_state, t):
        """Store one transition and, once memory exceeds a batch, do one gradient step.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            done: Whether `next_state` is terminal; terminal transitions do not bootstrap.
            next_state: State reached after the action.
            t: Step counter supplied by the caller; a hard target update happens
                when it is a multiple of `target_update_period`.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = self._batch_to_tensors(batch)

            # The target network is NOT refreshed here: it has to stay frozen
            # between hard updates, otherwise this is just plain DQN.
            with torch.no_grad():
                next_q_max = torch.max(self.q_function_out(next_states), dim=1).values
                targets = rewards + self.gamma * (1 - dones) * next_q_max
            q_values = self.q_function_in(states)[torch.arange(self.batch_size), actions]

            loss = torch.mean((q_values - targets) ** 2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Hard update: theta' <- theta.
            if t % self.target_update_period == 0:
                self.q_function_out.load_state_dict(self.q_function_in.state_dict())

            # Clamp so that epsilon never drops below its floor (or below zero).
            self.epsilon = max(self.epilon_min, self.epsilon - self.epsilon_decrease)

In [None]:
# Run settings: number of episodes and the cap on steps per episode.
episode_n, t_max = 600, 500

# Environment and agent; the network sizes are read from the environment spaces.
env = gym.make('LunarLander-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN_hard_target(state_dim, action_dim)

# Total (undiscounted) reward of every episode, in order.
rewards_DQN_hard = []

for episode in range(episode_n):
    state, total_reward = env.reset(), 0

    for t in range(t_max):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # The within-episode step index t is forwarded to the agent's fit call.
        agent.fit(state, action, reward, done, next_state, t)
        total_reward += reward
        state = next_state

        if done:
            break

    rewards_DQN_hard.append(total_reward)
    print(f'episode: {episode}, total_reward: {total_reward}')

episode: 0, total_reward: -327.5289959791165
episode: 1, total_reward: -411.23510927718513
episode: 2, total_reward: -473.6920405222256
episode: 3, total_reward: -244.7267974909933
episode: 4, total_reward: -382.086752413358
episode: 5, total_reward: -346.64938809842624
episode: 6, total_reward: -382.29924876455584
episode: 7, total_reward: -111.6977638389413
episode: 8, total_reward: -231.93353497158745
episode: 9, total_reward: -451.94454345432445
episode: 10, total_reward: -255.83928766128184
episode: 11, total_reward: -38.41468005984362
episode: 12, total_reward: -273.1284177512211
episode: 13, total_reward: -192.14612254903074
episode: 14, total_reward: -395.6249880213871
episode: 15, total_reward: -196.55758098002588
episode: 16, total_reward: -218.935896905322
episode: 17, total_reward: -146.0254251820226
episode: 18, total_reward: -176.7105804409003
episode: 19, total_reward: -300.89739605416224
episode: 20, total_reward: -178.8013127929726
episode: 21, total_reward: -320.07489

# DQN with Soft Target Update

* $y = r + \gamma \underset{a'}{max} Q^{\theta'} (s', a')$

* $Loss(\theta) = (y - Q^{\theta} (s, a))^2$

* $\theta \leftarrow \theta - \alpha \nabla_{\theta} Loss(\theta) $

* $\theta ' = \tau \theta + (1-\tau) \theta'$

In [None]:
class DQN_soft_target:
    """DQN agent whose target network tracks the online network by Polyak averaging.

    `q_function_in` is the online network that is trained and used for acting.
    `q_function_out` is the target network used for the bootstrapped targets;
    after every gradient step it moves a fraction `tau` towards the online weights.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        batch_size: Number of transitions sampled from memory per update.
        epsilon_decrease: Amount subtracted from epsilon after every update.
        epilon_min: Lower bound for epsilon (the misspelled name is kept so
            existing keyword callers keep working).
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.01, epilon_min=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function_in = Qfunction(self.state_dim, self.action_dim)
        self.q_function_out = Qfunction(self.state_dim, self.action_dim)
        # Start with theta' = theta; otherwise the first targets come from an
        # unrelated, randomly initialised network.
        self.q_function_out.load_state_dict(self.q_function_in.state_dict())
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epilon_min = epilon_min
        # Replay memory of [state, action, reward, done, next_state]; it is never trimmed.
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function_in.parameters(), lr=lr)

    @staticmethod
    def _batch_to_tensors(batch):
        """Stack a list of transitions into (states, actions, rewards, dones, next_states) tensors."""
        states, actions, rewards, dones, next_states = zip(*batch)
        # Going through np.array avoids the slow tensor-from-list-of-ndarrays path
        # and pins the dtypes regardless of what the environment returned.
        return (
            torch.as_tensor(np.array(states), dtype=torch.float32),
            torch.as_tensor(np.array(actions), dtype=torch.int64),
            torch.as_tensor(np.array(rewards), dtype=torch.float32),
            torch.as_tensor(np.array(dones), dtype=torch.float32),
            torch.as_tensor(np.array(next_states), dtype=torch.float32),
        )

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy of the online network.

        Args:
            state: State vector (array-like of length `state_dim`).

        Returns:
            Index of the chosen action, as returned by `np.random.choice`.
        """
        # Acting never needs gradients, so no autograd graph is built here.
        with torch.no_grad():
            q_values = self.q_function_in(torch.as_tensor(state, dtype=torch.float32))
        argmax_action = int(torch.argmax(q_values))
        # Every action gets epsilon / action_dim; the greedy one gets the remaining mass.
        probs = np.full(self.action_dim, self.epsilon / self.action_dim)
        probs[argmax_action] += 1 - self.epsilon
        return np.random.choice(np.arange(self.action_dim), p=probs)

    def fit(self, state, action, reward, done, next_state, tau=0.05):
        """Store one transition and, once memory exceeds a batch, do one gradient step.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            done: Whether `next_state` is terminal; terminal transitions do not bootstrap.
            next_state: State reached after the action.
            tau: Interpolation weight of the soft update
                theta' <- tau * theta + (1 - tau) * theta'.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = self._batch_to_tensors(batch)

            # Targets are constants with respect to the loss, so compute them without a graph.
            with torch.no_grad():
                next_q_max = torch.max(self.q_function_out(next_states), dim=1).values
                targets = rewards + self.gamma * (1 - dones) * next_q_max
            q_values = self.q_function_in(states)[torch.arange(self.batch_size), actions]

            loss = torch.mean((q_values - targets) ** 2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Soft (Polyak) update of the target network, done outside autograd.
            with torch.no_grad():
                for out_param, in_param in zip(self.q_function_out.parameters(), self.q_function_in.parameters()):
                    out_param.copy_(tau * in_param + (1.0 - tau) * out_param)

            # Clamp so that epsilon never drops below its floor (or below zero).
            self.epsilon = max(self.epilon_min, self.epsilon - self.epsilon_decrease)

In [None]:
# Run settings: number of episodes and the cap on steps per episode.
episode_n, t_max = 600, 500

# Environment and agent; the network sizes are read from the environment spaces.
env = gym.make('LunarLander-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN_soft_target(state_dim, action_dim)

# Total (undiscounted) reward of every episode, in order.
rewards_DQN_soft = []

for episode in range(episode_n):
    state, total_reward = env.reset(), 0

    for t in range(t_max):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # Store the transition and let the agent learn before moving on.
        agent.fit(state, action, reward, done, next_state)
        total_reward += reward
        state = next_state

        if done:
            break

    rewards_DQN_soft.append(total_reward)
    print(f'episode: {episode}, total_reward: {total_reward}')

episode: 0, total_reward: -185.27894229663843
episode: 1, total_reward: -256.8126033866894
episode: 2, total_reward: -287.213454639603
episode: 3, total_reward: -199.9233745582582
episode: 4, total_reward: -268.03643448281275
episode: 5, total_reward: -169.80659658638982
episode: 6, total_reward: -189.43104404456096
episode: 7, total_reward: -49.73027180933818
episode: 8, total_reward: -85.45336508594731
episode: 9, total_reward: -159.36545782717778
episode: 10, total_reward: -235.87119152480741
episode: 11, total_reward: -112.17930763672048
episode: 12, total_reward: -60.6950136472659
episode: 13, total_reward: -58.508577873174666
episode: 14, total_reward: -185.91750753428448
episode: 15, total_reward: -126.97064336639386
episode: 16, total_reward: -209.56292865715267
episode: 17, total_reward: -5.428651472521728
episode: 18, total_reward: -50.073136373703086
episode: 19, total_reward: -80.45537212661333
episode: 20, total_reward: -31.238015110998813
episode: 21, total_reward: -2.645

# Double DQN

* $y = r + \gamma Q ^ {\theta} (s', \underset{a'}{argmax} Q^{\theta'} (s', a'))$

* $Loss(\theta) = (y - Q^{\theta} (s, a))^2$

* $\theta \leftarrow \theta - \alpha \nabla_{\theta} Loss(\theta)$

* $\theta ' = \tau \theta + (1-\tau) \theta'$

In [None]:
class double_DQN:
    """Double-DQN style agent with a softly updated second network.

    `q_function_in` is the online network that is trained and used for acting.
    `q_function_out` is the second network, which tracks the online one by
    Polyak averaging. When building targets, the next action is chosen by
    `q_function_out` and its value is read from `q_function_in`.

    NOTE(review): the commonly cited Double DQN formulation assigns the two
    roles the other way round (select with the online network, evaluate with
    the target network). The assignment here follows the formula given in this
    notebook; confirm that this is intended.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        batch_size: Number of transitions sampled from memory per update.
        epsilon_decrease: Amount subtracted from epsilon after every update.
        epilon_min: Lower bound for epsilon (the misspelled name is kept so
            existing keyword callers keep working).
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.01, epilon_min=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function_in = Qfunction(self.state_dim, self.action_dim)
        self.q_function_out = Qfunction(self.state_dim, self.action_dim)
        # Start with theta' = theta; otherwise the first action selections come
        # from an unrelated, randomly initialised network.
        self.q_function_out.load_state_dict(self.q_function_in.state_dict())
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epilon_min = epilon_min
        # Replay memory of [state, action, reward, done, next_state]; it is never trimmed.
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function_in.parameters(), lr=lr)

    @staticmethod
    def _batch_to_tensors(batch):
        """Stack a list of transitions into (states, actions, rewards, dones, next_states) tensors."""
        states, actions, rewards, dones, next_states = zip(*batch)
        # Going through np.array avoids the slow tensor-from-list-of-ndarrays path
        # and pins the dtypes regardless of what the environment returned.
        return (
            torch.as_tensor(np.array(states), dtype=torch.float32),
            torch.as_tensor(np.array(actions), dtype=torch.int64),
            torch.as_tensor(np.array(rewards), dtype=torch.float32),
            torch.as_tensor(np.array(dones), dtype=torch.float32),
            torch.as_tensor(np.array(next_states), dtype=torch.float32),
        )

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy of the online network.

        Args:
            state: State vector (array-like of length `state_dim`).

        Returns:
            Index of the chosen action, as returned by `np.random.choice`.
        """
        # Acting never needs gradients, so no autograd graph is built here.
        with torch.no_grad():
            q_values = self.q_function_in(torch.as_tensor(state, dtype=torch.float32))
        argmax_action = int(torch.argmax(q_values))
        # Every action gets epsilon / action_dim; the greedy one gets the remaining mass.
        probs = np.full(self.action_dim, self.epsilon / self.action_dim)
        probs[argmax_action] += 1 - self.epsilon
        return np.random.choice(np.arange(self.action_dim), p=probs)

    def fit(self, state, action, reward, done, next_state, tau=0.05):
        """Store one transition and, once memory exceeds a batch, do one gradient step.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            done: Whether `next_state` is terminal; terminal transitions do not bootstrap.
            next_state: State reached after the action.
            tau: Interpolation weight of the soft update
                theta' <- tau * theta + (1 - tau) * theta'.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = self._batch_to_tensors(batch)
            batch_index = torch.arange(self.batch_size)

            # Targets are constants with respect to the loss, so compute them without a graph.
            with torch.no_grad():
                # Decoupled selection / evaluation: pick a' with one network, score it with the other.
                aargmax_actions = torch.argmax(self.q_function_out(next_states), dim=1)
                q_values_in = self.q_function_in(next_states)[batch_index, aargmax_actions]
                targets = rewards + self.gamma * (1 - dones) * q_values_in
            q_values = self.q_function_in(states)[batch_index, actions]

            loss = torch.mean((q_values - targets) ** 2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Soft (Polyak) update of the second network, done outside autograd.
            with torch.no_grad():
                for out_param, in_param in zip(self.q_function_out.parameters(), self.q_function_in.parameters()):
                    out_param.copy_(tau * in_param + (1.0 - tau) * out_param)

            # Clamp so that epsilon never drops below its floor (or below zero).
            self.epsilon = max(self.epilon_min, self.epsilon - self.epsilon_decrease)

In [None]:
# Run settings: number of episodes and the cap on steps per episode.
episode_n, t_max = 600, 500

# Environment and agent; the network sizes are read from the environment spaces.
env = gym.make('LunarLander-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = double_DQN(state_dim, action_dim)

# Total (undiscounted) reward of every episode, in order.
rewards_double_dqn = []

for episode in range(episode_n):
    state, total_reward = env.reset(), 0

    for t in range(t_max):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # Store the transition and let the agent learn before moving on.
        agent.fit(state, action, reward, done, next_state)
        total_reward += reward
        state = next_state

        if done:
            break

    rewards_double_dqn.append(total_reward)
    print(f'episode: {episode}, total_reward: {total_reward}')

episode: 0, total_reward: -91.5398318655962
episode: 1, total_reward: -418.03299644317076
episode: 2, total_reward: -63.28762087458174
episode: 3, total_reward: -660.9995022584662
episode: 4, total_reward: -439.16877912049125
episode: 5, total_reward: -198.45757718453729
episode: 6, total_reward: -65.12536347492146
episode: 7, total_reward: -53.69894585426443
episode: 8, total_reward: -4.0394315886495775
episode: 9, total_reward: -98.42191702468394
episode: 10, total_reward: -79.28980375725256
episode: 11, total_reward: -68.8921915959505
episode: 12, total_reward: -113.962320299915
episode: 13, total_reward: -217.40302761772097
episode: 14, total_reward: -158.7682598970956
episode: 15, total_reward: 7.5135765388583735
episode: 16, total_reward: -7.488500752762239
episode: 17, total_reward: 31.109222224441798
episode: 18, total_reward: 29.907576417634512
episode: 19, total_reward: 14.984171245573261
episode: 20, total_reward: -1.7053526716004295
episode: 21, total_reward: 24.91361088726

In [None]:
# Line chart of the per-episode total reward collected by the plain DQN agent.
fig = px.line(rewards_DQN, title='Rewards graph DQN')
fig.update_layout(xaxis_title="Iteration", yaxis_title="Reward")
fig.show()

In [None]:
# Line chart of the per-episode total reward collected by the hard-target DQN agent.
fig = px.line(rewards_DQN_hard, title='Rewards graph DQN hard')
fig.update_layout(xaxis_title="Iteration", yaxis_title="Reward")
fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Line chart of the per-episode total reward collected by the soft-target DQN agent.
fig = px.line(rewards_DQN_soft, title='Rewards graph DQN soft')
fig.update_layout(xaxis_title="Iteration", yaxis_title="Reward")
fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Line chart of the per-episode total reward collected by the double DQN agent.
# The figure title previously read 'dobule'; the spelling is corrected here.
fig = px.line(rewards_double_dqn, title='Rewards graph double DQN')
fig.update_layout(xaxis_title="Iteration", yaxis_title="Reward")
fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

