In [1]:
!pip install gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mujoco==2.2.0 (from gym[mujoco])
  Downloading mujoco-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco==2.2.0->gym[mujoco])
  Downloading glfw-2.5.9-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.8/207.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, mujoco
Successfully installed glfw-2.5.9 mujoco-2.2.0


In [3]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

Action Space

In [4]:
x1 , x2 , x3 , x4 , x5 , x6 = np.meshgrid(np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5)
                  )
                  
action_grid = np.c_[x1.ravel() , x2.ravel() , x3.ravel() , x4.ravel() , x5.ravel() , x6.ravel()]

  and should_run_async(code)


Create Env

In [5]:
env = gym.make('Walker2d-v4' , new_step_api = True)

Q Network Class

In [6]:
class QNet(nn.Module):
    def __init__(self, obs_shape, num_actions):
        super(QNet, self).__init__()
        self.fc1 = nn.Linear(obs_shape[0], 64)
        self.fc2 = nn.Linear(64, num_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

Agent Class

In [42]:
class DQNAgent:
    def __init__(self, obs_shape, num_actions, replay_buffer_size=10000, batch_size=16, gamma=0.2, epsilon=1.0, epsilon_decay=0.99, min_epsilon=0.001 , lr = 0.001):
        self.obs_shape = obs_shape
        self.num_actions = num_actions
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

        self.q_network = QNet(obs_shape, num_actions)
        self.target_network = QNet(obs_shape, num_actions)
        self.target_network.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr , amsgrad = True)

    def act(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.randrange(self.num_actions)
        else:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            q_values = self.q_network(state_tensor)
            return q_values.argmax().item()

    def observe(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def replay(self , update):
        if len(self.replay_buffer) < self.batch_size:
            return 

        minibatch = random.sample(self.replay_buffer, self.batch_size)
        minibatch_state, minibatch_action, minibatch_reward, minibatch_next_state, minibatch_done = zip(*minibatch)

        minibatch_state_tensor = torch.tensor(minibatch_state, dtype=torch.float32)
        minibatch_action_tensor = torch.tensor(minibatch_action, dtype=torch.int64)
        minibatch_reward_tensor = torch.tensor(minibatch_reward, dtype=torch.float32)
        minibatch_next_state_tensor = torch.tensor(minibatch_next_state, dtype=torch.float32)
        minibatch_done_tensor = torch.tensor(minibatch_done, dtype=torch.float32)

        q_values = self.q_network(minibatch_state_tensor)
        q_values = q_values.gather(1, minibatch_action_tensor.unsqueeze(1)).squeeze(1)

        idx = torch.argmax(self.q_network(minibatch_next_state_tensor),dim = 1)
        idx = idx.reshape((1,len(idx)))
        next_q_values = self.target_network(minibatch_next_state_tensor)
        next_q_values= next_q_values.gather(1,idx)
        next_state_q_values = torch.multiply(next_q_values , torch.ones_like(minibatch_done_tensor) - minibatch_done_tensor)
        expected_q_values = minibatch_reward_tensor + self.gamma * next_state_q_values

        loss = F.mse_loss(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay
        if update :
          self.target_network.load_state_dict(self.q_network.state_dict())

In [43]:
def train(env, agent, num_episodes , freq = 25):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = agent.act(state)
            next_state, reward, terminated , trunced, _ = env.step(action_grid[action])
            done = terminated or trunced
            agent.observe(state, action, reward, next_state, done)
            agent.replay(episode+1 % freq == 0)
            state = next_state
            total_reward += reward

        print(f"Episode {episode+1}: Totalreward = {total_reward}")

In [44]:
obs_shape = env.observation_space.shape
num_actions = len(action_grid)
agent = DQNAgent(obs_shape, num_actions)


train(env, agent, num_episodes=1000)

  loss = F.mse_loss(q_values, expected_q_values.detach())


Episode 1: Totalreward = 5.996216500721633
Episode 2: Totalreward = -3.960157938557175
Episode 3: Totalreward = 2.2353219928754338
Episode 4: Totalreward = -6.260982268771451
Episode 5: Totalreward = 13.90275551790296
Episode 6: Totalreward = 9.076362007914573
Episode 7: Totalreward = -0.17992145438245
Episode 8: Totalreward = -9.23550462995133
Episode 9: Totalreward = -9.727426640306211
Episode 10: Totalreward = 9.729188325746149
Episode 11: Totalreward = -9.27841106345915
Episode 12: Totalreward = -4.937209353918289
Episode 13: Totalreward = -1.4341071543911932
Episode 14: Totalreward = -0.7526281957217367
Episode 15: Totalreward = 1.8394654869590816
Episode 16: Totalreward = -1.8803293405454131
Episode 17: Totalreward = 3.0932722424386108
Episode 18: Totalreward = 13.051363600378814
Episode 19: Totalreward = 4.879077446785
Episode 20: Totalreward = -6.065842283340664
Episode 21: Totalreward = -14.943708840655988
Episode 22: Totalreward = -3.343222087689447
Episode 23: Totalreward = 