In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

In [None]:
!apt-get install -y xvfb
!pip install pyvirtualdisplay

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common
The following NEW packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common xvfb
0 upgraded, 9 newly installed, 0 to remove and 29 not upgraded.
Need to get 7,814 kB of archives.
After this operation, 12.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libfontenc1 amd64 1:1.1.4-1build3 [14.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxfont2 amd64 1:2.0.5-1build1 [94.5 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxkbfile1 amd64 1:1.1.0-1build3 [71.8 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 x11-xkb-utils amd64 7.7+5build4 [172 kB]
Get:5 http://archiv

In [None]:
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

In [None]:
from pyvirtualdisplay import Display
# Start a 400x300 virtual display so rendering code has a display to attach to
# on a headless machine. The object is bound to `display`; the IPython display
# module is imported separately under the alias `ipythondisplay`.
# NOTE(review): nothing in this notebook calls display.stop(); the virtual
# display presumably lives until the kernel/runtime is shut down.
display = Display(visible=0, size=(400, 300))
display.start()

<pyvirtualdisplay.display.Display at 0x7d1b3b9622d0>

In [None]:
# Hyperparameters
GAMMA = 0.99  # discount factor applied to the bootstrapped next-state Q-value
LR = 1e-3  # learning rate handed to the Adam optimizer
BATCH_SIZE = 64  # transitions per replay sample; learning starts once the buffer holds this many
MEMORY_SIZE = 10_000  # replay buffer capacity (a bounded deque, oldest entries dropped first)
EPSILON_START = 1.0  # initial probability of taking a random action
EPSILON_END = 0.01  # lower bound for the exploration probability
EPSILON_DECAY = 0.995  # epsilon is multiplied by this once per episode
TARGET_UPDATE = 10  # the target network is re-synced every this many episodes
EPISODES = 100  # number of training episodes
# NOTE(review): 0.995 ** 100 is roughly 0.61, so with EPISODES = 100 epsilon
# never gets close to EPSILON_END and the agent still acts randomly most of
# the time at the end of training. Consider a faster decay or more episodes.

In [None]:
# Define the Neural Network for DQN
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Two hidden layers of 128 units with ReLU activations are followed by a
    linear output layer of size ``action_dim``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128
        # The attribute names fc1/fc2/fc3 become the state_dict keys, so they
        # must stay stable for saved checkpoints to load.
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the Q-value estimates for the input state(s) ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        # No activation on the output layer: Q-values are unbounded.
        return self.fc3(activations)

In [None]:
# Experience Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen silently evicts the oldest transition when full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append a single (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns a 5-tuple of numpy arrays (states, actions, rewards,
        next_states, dones); rewards and dones are cast to float32.
        """
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        state_col, action_col, reward_col, next_state_col, done_col = zip(*batch)
        states = np.array(state_col)
        actions = np.array(action_col)
        rewards = np.array(reward_col, dtype=np.float32)
        next_states = np.array(next_state_col)
        dones = np.array(done_col, dtype=np.float32)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [None]:
# Epsilon-greedy policy
def select_action(state, policy_net, epsilon, action_dim):
    """Pick an action with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index in
    ``[0, action_dim)`` is returned; otherwise the index of the largest
    Q-value predicted by ``policy_net`` for ``state`` is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, action_dim - 1)

    # Greedy branch: add a batch dimension and evaluate without tracking gradients.
    observation = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        q_estimates = policy_net(observation)
        best_action = torch.argmax(q_estimates)
    return best_action.item()

In [None]:
# Training function
def _optimize_model(policy_net, target_net, optimizer, memory):
    """Take one gradient step on a mini-batch sampled from ``memory``."""
    states, actions, rewards, next_states, dones = memory.sample(BATCH_SIZE)

    states = torch.tensor(states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    next_states = torch.tensor(next_states, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.float32)

    # Q(s, a) for the actions actually taken. squeeze(1) removes only the
    # gathered column, so the batch axis survives even for a batch of one.
    q_values = policy_net(states).gather(1, actions).squeeze(1)
    with torch.no_grad():
        next_q_values = target_net(next_states).max(1)[0]
        # (1 - dones) removes the bootstrap term for terminal transitions.
        target_q_values = rewards + GAMMA * next_q_values * (1 - dones)

    loss = nn.MSELoss()(q_values, target_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def train_dqn():
    """Train a DQN agent on CartPole-v1 and save its policy network.

    Runs ``EPISODES`` episodes with an epsilon-greedy policy, learning from a
    replay buffer and a periodically synced target network. The trained
    weights are written to ``dqn_cartpole.pth``.

    Returns:
        list: total reward collected in each episode, in order.
    """
    env = gym.make("CartPole-v1")
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        policy_net = DQN(state_dim, action_dim)
        target_net = DQN(state_dim, action_dim)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        optimizer = optim.Adam(policy_net.parameters(), lr=LR)
        memory = ReplayBuffer(MEMORY_SIZE)
        epsilon = EPSILON_START
        rewards_list = []

        for episode in range(EPISODES):
            state, _ = env.reset()
            total_reward = 0
            episode_over = False

            while not episode_over:
                action = select_action(state, policy_net, epsilon, action_dim)
                # step() returns (obs, reward, terminated, truncated, info).
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Only a true termination is stored as "done": a time-limit
                # truncation is not a terminal state, so its target should
                # still bootstrap from next_state.
                memory.push(state, action, reward, next_state, terminated)
                state = next_state
                total_reward += reward
                # The episode ends on either signal; ignoring `truncated`
                # would keep stepping the env past its time limit.
                episode_over = terminated or truncated

                # Learn on every step, including the final one of the episode.
                if len(memory) >= BATCH_SIZE:
                    _optimize_model(policy_net, target_net, optimizer, memory)

            rewards_list.append(total_reward)
            epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)

            # Update target network
            if episode % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())

            print(f"Episode {episode+1}, Reward: {total_reward}, Epsilon: {epsilon:.3f}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    torch.save(policy_net.state_dict(), "dqn_cartpole.pth")
    return rewards_list

In [None]:
def watch_trained_agent():
    """Play one greedy episode with the saved policy and animate it inline.

    Loads the weights from ``dqn_cartpole.pth``, runs a single CartPole-v1
    episode choosing the arg-max action at every step, redraws the rendered
    frame in the cell output, and finally prints the total reward.
    """
    env = gym.make("CartPole-v1", render_mode="rgb_array")
    fig = None
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        policy_net = DQN(state_dim, action_dim)
        policy_net.load_state_dict(torch.load("dqn_cartpole.pth"))
        policy_net.eval()

        state, _ = env.reset()
        total_reward = 0

        # Create the image artist once and update its pixels per frame;
        # calling imshow on every step stacks a new artist on the axes.
        fig, ax = plt.subplots()
        frame = ax.imshow(env.render())

        while True:
            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(fig)

            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                action = torch.argmax(policy_net(state_tensor)).item()

            # step() returns (obs, reward, terminated, truncated, info).
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            # Stop on failure or on the time limit; a good policy reaches the
            # limit, so ignoring `truncated` would run past the episode end.
            if terminated or truncated:
                break
            frame.set_data(env.render())
    finally:
        # Close the figure so it is neither leaked nor re-displayed by the
        # inline backend, and always release the environment.
        if fig is not None:
            plt.close(fig)
        env.close()

    ipythondisplay.clear_output(wait=True)
    print(f"Total reward: {total_reward}")

In [None]:
# Run training; prints one progress line per episode and writes dqn_cartpole.pth.
train_dqn()

Episode 1, Reward: 33.0, Epsilon: 0.995
Episode 2, Reward: 34.0, Epsilon: 0.990
Episode 3, Reward: 27.0, Epsilon: 0.985
Episode 4, Reward: 16.0, Epsilon: 0.980
Episode 5, Reward: 21.0, Epsilon: 0.975
Episode 6, Reward: 19.0, Epsilon: 0.970
Episode 7, Reward: 17.0, Epsilon: 0.966
Episode 8, Reward: 8.0, Epsilon: 0.961
Episode 9, Reward: 11.0, Epsilon: 0.956
Episode 10, Reward: 10.0, Epsilon: 0.951
Episode 11, Reward: 24.0, Epsilon: 0.946
Episode 12, Reward: 28.0, Epsilon: 0.942
Episode 13, Reward: 18.0, Epsilon: 0.937
Episode 14, Reward: 16.0, Epsilon: 0.932
Episode 15, Reward: 14.0, Epsilon: 0.928
Episode 16, Reward: 24.0, Epsilon: 0.923
Episode 17, Reward: 13.0, Epsilon: 0.918
Episode 18, Reward: 41.0, Epsilon: 0.914
Episode 19, Reward: 14.0, Epsilon: 0.909
Episode 20, Reward: 17.0, Epsilon: 0.905
Episode 21, Reward: 15.0, Epsilon: 0.900
Episode 22, Reward: 19.0, Epsilon: 0.896
Episode 23, Reward: 28.0, Epsilon: 0.891
Episode 24, Reward: 20.0, Epsilon: 0.887
Episode 25, Reward: 27.0, 

In [None]:
# Needs the cell that defines watch_trained_agent to have been executed in this
# kernel (the recorded NameError is what happens otherwise) and the
# dqn_cartpole.pth checkpoint to exist on disk.
watch_trained_agent()

NameError: name 'watch_trained_agent' is not defined