In [None]:
!pip install ufal.pybox2d

Collecting ufal.pybox2d
  Downloading ufal.pybox2d-2.3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ufal.pybox2d
Successfully installed ufal.pybox2d-2.3.10.2


In [None]:
import plotly
import plotly.express as px
import numpy as np
import random
import torch
import torch.nn as nn

# DQN

We define the approximator $Q^\theta$, initialize its parameter vector $\theta$, and set the exploration probability $\varepsilon = 1$.

For each episode $k$ do:

While episode not done:

- In state $S_t$, take action $A_t \sim \pi(\cdot|S_t)$, where $\pi = \varepsilon\text{-greedy}(Q^\theta)$, receive reward $R_t$, and move to state $S_{t+1}$. Save $(S_t,A_t,R_t,S_{t+1}) \rightarrow Memory$


- Sample a mini-batch $\{(s_i,a_i,r_i,s'_i)\}_{i=1}^{n} \leftarrow Memory$ and compute the targets:

$$
y_i =
\left\{
\begin{array}{ll}
r_i, &\text{ if } s'_i \text{ is a terminal state},\\[0.0cm]
 r_i + \gamma \max\limits_{a'} Q^\theta(s'_i,a'), &\text{ otherwise}
\end{array}
\right.
$$

Compute the loss function $Loss(\theta) = \frac{1}{n}\sum\limits_{i=1}^n \big(y_i - Q^\theta(s_i,a_i)\big)^2$
and update the parameter vector

$$
\theta \leftarrow \theta - \alpha \nabla_\theta Loss(\theta)
$$

- Decrease $\varepsilon$

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn

class Qfunction(nn.Module):
    """Fully connected network that maps a state to one Q-value per action.

    Layout: ``state_dim -> 64 -> 64 -> action_dim``. A ReLU follows each of
    the two hidden layers; the output layer is linear.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_dim = 64
        self.linear_1 = nn.Linear(state_dim, hidden_dim)
        self.linear_2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear_3 = nn.Linear(hidden_dim, action_dim)
        self.activation = nn.ReLU()

    def forward(self, states):
        """Return the Q-values of every action for the given state(s)."""
        features = states
        # Both hidden layers share the same pattern: affine map, then ReLU.
        for hidden_layer in (self.linear_1, self.linear_2):
            features = self.activation(hidden_layer(features))
        # The output layer has no activation: Q-values are unbounded.
        return self.linear_3(features)

In [None]:
class DQN:
    """Deep Q-learning agent with an epsilon-greedy policy and replay memory.

    Parameters
    ----------
    state_dim : int
        Number of components in a state vector.
    action_dim : int
        Number of discrete actions.
    gamma : float
        Discount factor used in the bootstrapped targets.
    lr : float
        Learning rate of the Adam optimizer.
    batch_size : int
        Number of transitions sampled from memory per update.
    epsilon_decrease : float
        Amount subtracted from epsilon after every update.
    epilon_min : float
        Lower bound for epsilon. The spelling of the name is kept as-is so
        that existing keyword callers keep working.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.01, epilon_min=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function = Qfunction(self.state_dim, self.action_dim)
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1
        self.epsilon_decrease = epsilon_decrease
        self.epilon_min = epilon_min
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function.parameters(), lr=lr)

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy for ``state``.

        Every action gets probability ``epsilon / action_dim``; the greedy
        action additionally receives the remaining ``1 - epsilon``.
        """
        # Action selection never back-propagates, so skip building the graph.
        with torch.no_grad():
            q_values = self.q_function(torch.as_tensor(state, dtype=torch.float32))
        argmax_action = int(torch.argmax(q_values))
        probs = self.epsilon * np.ones(self.action_dim) / self.action_dim
        probs[argmax_action] += 1 - self.epsilon
        action = np.random.choice(np.arange(self.action_dim), p=probs)
        return action

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once memory is large enough, do one update.

        An update samples ``batch_size`` transitions, regresses
        ``Q(s, a)`` onto ``r + gamma * (1 - done) * max_a' Q(s', a')`` with a
        mean squared error loss, takes one optimizer step and then decays
        epsilon.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = zip(*batch)

            # Stack through numpy first: building a tensor from a sequence of
            # ndarrays is slow. Explicit dtypes keep everything float32 even
            # when the environment produces float64 observations or rewards.
            states = torch.as_tensor(np.array(states), dtype=torch.float32)
            actions = torch.as_tensor(np.array(actions), dtype=torch.int64)
            rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
            dones = torch.as_tensor(np.array(dones), dtype=torch.float32)
            next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)

            # Targets are treated as constants, so no gradient is tracked here.
            with torch.no_grad():
                next_q_max = torch.max(self.q_function(next_states), dim=1).values
                targets = rewards + self.gamma * (1 - dones) * next_q_max

            q_values = self.q_function(states)[torch.arange(self.batch_size), actions]

            loss = torch.mean((q_values - targets) ** 2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.epsilon > self.epilon_min:
                # Clamp so a large decrement cannot push epsilon below the
                # floor (a negative epsilon yields invalid probabilities).
                self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epilon_min)

In [None]:
import gym
# Train the DQN agent on LunarLander and record the total reward per episode.
rewards_DQN = []

env = gym.make('LunarLander-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

agent = DQN(state_dim, action_dim)

episode_n = 500  # number of training episodes
t_max = 500      # hard cap on steps per episode

try:
    for episode in range(episode_n):
        total_reward = 0

        # Newer gym versions return (observation, info) from reset();
        # older ones return only the observation.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        for t in range(t_max):
            action = agent.get_action(state)

            # Newer gym versions return a 5-tuple with separate
            # terminated/truncated flags; older ones return a single `done`.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            total_reward += reward

            agent.fit(state, action, reward, done, next_state)

            state = next_state

            if done:
                break
        rewards_DQN.append(total_reward)
        print(f'episode: {episode}, total_reward: {total_reward}')
finally:
    # Release the environment's resources even if training is interrupted.
    env.close()

  deprecation(
  deprecation(


episode: 0, total_reward: -84.1259642937659
episode: 1, total_reward: -491.2320308458567
episode: 2, total_reward: -501.3555471175147
episode: 3, total_reward: -379.44638760286585
episode: 4, total_reward: -284.94163137576726
episode: 5, total_reward: -246.88979556653837
episode: 6, total_reward: -154.19232670360572
episode: 7, total_reward: -33.35953973238743
episode: 8, total_reward: -161.34901282150375
episode: 9, total_reward: -42.261304297145784
episode: 10, total_reward: -186.57785072914623
episode: 11, total_reward: -120.78421633077963
episode: 12, total_reward: -271.96141654871525
episode: 13, total_reward: -393.1022819024783
episode: 14, total_reward: -251.07657729052903
episode: 15, total_reward: -84.8583695942867
episode: 16, total_reward: -168.15288348226807
episode: 17, total_reward: -138.1185697229489
episode: 18, total_reward: 28.50779037452744
episode: 19, total_reward: 23.93757505524393
episode: 20, total_reward: -444.9791639014196
episode: 21, total_reward: -146.15687

In [None]:
# Line chart of the total reward collected in each training episode.
fig = px.line(rewards_DQN, title='Rewards graph DQN')
fig.update_layout(xaxis_title="Iteration", yaxis_title="Reward")
fig.show()