<a href="https://colab.research.google.com/github/prantoran/ai-prac/blob/master/rl/q_network_lunarlander_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install swig # required by gymnasium[box2d]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 2s (729 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 126374 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubunt

In [3]:
!pip install "gymnasium[box2d]" # required by gym.make("LunarLander-v3")

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/374.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp312-cp312-linux_x86_64.whl size=2381980 sha256=c41bd216947e5b5bc826b082cc30

In [4]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [5]:
GEMMA = 0.99

In [6]:
env = gym.make("LunarLander-v3")

In [7]:
class Network(nn.Module):
    def __init__(self, dim_inputs, dim_outputs):
        super(Network, self).__init__()
        self.linear = nn.Linear(dim_inputs, dim_outputs)
    def forward(self, x):
        return self.linear(x)

network = Network(8, 4)

optimizer = optim.Adam(network.parameters(), lr=0.0001)

print("Network initialized as:\n", network)

Network initialized as:
 Network(
  (linear): Linear(in_features=8, out_features=4, bias=True)
)


In [8]:
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        # Instantiate the first hidden layer
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        # Instantiate the output layer
        self.fc3 = nn.Linear(64, action_size)
    def forward(self, state):
        # Ensure the ReLU activation function is used
        x = torch.relu(self.fc1(torch.tensor(state)))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [9]:
state_size = 8
action_size = 4
# Instantiate the Q Network
q_network = QNetwork(state_size, action_size)
# Specify the optimizer learning rate
optimizer = optim.Adam(q_network.parameters(), lr=0.0001)

print("Q-Network initialized as:\n", q_network)

Q-Network initialized as:
 QNetwork(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=4, bias=True)
)


In [10]:
def select_action(q_network, state):
  # feed state to network to get Q-values
  q_values = q_network(state)
  print("Q-values:", [round(x, 2) for x in q_values.tolist()])
  # Obtain index of action with the highest Q-value
  action = torch.argmax(q_values).item()
  print(f"Action selected: {action}, with q-value {q_values[action]:.2f}")
  return action

In [11]:
def calculate_loss(
    q_network, state, action,
    next_state, reward, done):
  q_values = q_network(state)
  current_state_q_value = q_values[action]
  next_state_q_values = q_network(next_state)
  next_state_q_value = next_state_q_values.max()
  target_q_value = reward + GEMMA * (1 - done) * next_state_q_value
  loss = nn.MSELoss()(current_state_q_value, target_q_value)
  return loss

In [12]:
# Run ten episodes
for episode in range(0, 10):
    state, info = env.reset()
    done = False
    # Run through steps until done
    while done:
        action = select_action(network, state)
        # Take the action
        next_state, reward, terminated, truncated, _ = (env.step(action))
        done = terminated or truncated
        loss = calculate_loss(network, state, action, next_state, reward, done)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update the state
        state = next_state
    print(f"Episode {episode} complete.")

Episode 0 complete.
Episode 1 complete.
Episode 2 complete.
Episode 3 complete.
Episode 4 complete.
Episode 5 complete.
Episode 6 complete.
Episode 7 complete.
Episode 8 complete.
Episode 9 complete.
