In [1]:
!pip install gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install gym[mujoco]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mujoco==2.2.0 (from gym[mujoco])
  Downloading mujoco-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco==2.2.0->gym[mujoco])
  Downloading glfw-2.5.9-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.8/207.8 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, mujoco
Successfully installed glfw-2.5.9 mujoco-2.2.0


In [3]:
!pip install --upgrade ipykernel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipykernel
  Downloading ipykernel-6.23.1-py3-none-any.whl (152 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.2/152.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting comm>=0.1.1 (from ipykernel)
  Downloading comm-0.1.3-py3-none-any.whl (6.6 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel)
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, comm, ipykernel
  Attempting uninstall: ipykernel
    Found existing installation: ipykernel 5.5.6
    Uninstalling ipykernel-5.5.6:
      Successfully uninstalled ipykernel-5.5.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following

In [17]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

env = gym.make('Walker2d-v4' , new_step_api = True)

x1 , x2 , x3 , x4 , x5 , x6 = np.meshgrid(np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5),
                  np.arange(-1, 1.1, 0.5)
                  )
                  
action_grid = np.c_[x1.ravel() , x2.ravel() , x3.ravel() , x4.ravel() , x5.ravel() , x6.ravel()]


# Define a neural network module for the Q-function
class QNet(nn.Module):
    def __init__(self, obs_shape, num_actions):
        super(QNet, self).__init__()
        self.fc1 = nn.Linear(obs_shape[0], 64)
        self.fc2 = nn.Linear(64, num_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the DQN agent
class DQNAgent:
    def __init__(self, obs_shape, num_actions, replay_buffer_size=10000, batch_size=16, gamma=0.2, epsilon=1.0, epsilon_decay=0.99, min_epsilon=0.001 , lr = 0.001):
        self.obs_shape = obs_shape
        self.num_actions = num_actions
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

        # Create the Q-function network
        self.q_network = QNet(obs_shape, num_actions)

        # Create the optimizer
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr , amsgrad = True)

    def act(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.randrange(self.num_actions)
        else:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            q_values = self.q_network(state_tensor)
            return q_values.argmax().item()

    def observe(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        minibatch = random.sample(self.replay_buffer, self.batch_size)
        minibatch_state, minibatch_action, minibatch_reward, minibatch_next_state, minibatch_done = zip(*minibatch)

        minibatch_state_tensor = torch.tensor(minibatch_state, dtype=torch.float32)
        minibatch_action_tensor = torch.tensor(minibatch_action, dtype=torch.int64)
        minibatch_reward_tensor = torch.tensor(minibatch_reward, dtype=torch.float32)
        minibatch_next_state_tensor = torch.tensor(minibatch_next_state, dtype=torch.float32)
        minibatch_done_tensor = torch.tensor(minibatch_done, dtype=torch.float32)

        # Compute the Q-values for the current state and next state
        q_values = self.q_network(minibatch_state_tensor)
        q_values = q_values.gather(1, minibatch_action_tensor.unsqueeze(1)).squeeze(1)

        next_q_values = self.q_network(minibatch_next_state_tensor).max(1)[0]
        next_state_q_values = torch.multiply(next_q_values , torch.ones_like(minibatch_done_tensor) - minibatch_done_tensor)
        expected_q_values = minibatch_reward_tensor + self.gamma * next_q_values

        # Compute the loss and backpropagate
        loss = F.mse_loss(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay the exploration rate
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay

# Define the main training loop
def train(env, agent, num_episodes):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = agent.act(state)
            next_state, reward, terminated , trunced, _ = env.step(action_grid[action])
            done = terminated or trunced
            agent.observe(state, action, reward, next_state, done)
            agent.replay()
            state = next_state
            total_reward += reward

        print(f"Episode {episode+1}: Totalreward = {total_reward}")


# Create the DQN agent
obs_shape = env.observation_space.shape
num_actions = len(action_grid)
agent = DQNAgent(obs_shape, num_actions)

# Train the agent
train(env, agent, num_episodes=1000)



Episode 1: Totalreward = -1.7593328503532204
Episode 2: Totalreward = -3.7819994672449546
Episode 3: Totalreward = -4.057451744790956
Episode 4: Totalreward = -0.0881734541571551
Episode 5: Totalreward = -5.154013305657402
Episode 6: Totalreward = -5.60285227356775
Episode 7: Totalreward = -3.275051555627254
Episode 8: Totalreward = 0.029917722329277113
Episode 9: Totalreward = 0.4726935526880923
Episode 10: Totalreward = 4.189546517609354
Episode 11: Totalreward = -1.298410720694381
Episode 12: Totalreward = 4.448533342451931
Episode 13: Totalreward = -2.6239255327875353
Episode 14: Totalreward = 2.381249524913336
Episode 15: Totalreward = -6.466797415764881
Episode 16: Totalreward = -5.523751093508677
Episode 17: Totalreward = -8.531870857984554
Episode 18: Totalreward = 7.999835942899682
Episode 19: Totalreward = -6.480285436936045
Episode 20: Totalreward = -5.895113507303104
Episode 21: Totalreward = 2.432141864920285
Episode 22: Totalreward = 20.929242827226094
Episode 23: Totalre

In [16]:
state = env.reset()
done = False
total_reward = 0

while not done:
    action = agent.act(state)
    state, reward, terminated , trunced, _ = env.step(action_grid[action])
    done = terminated or trunced
    total_reward += reward

print(f"Test reward: {total_reward}")

Test reward: 323.59702177737694
