In [1]:
!pip install gymnasium ale_py



In [2]:
import gymnasium as gym
import ale_py

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

In [4]:
device = torch.device('cpu')

print(device) # Should return GPU, use the Runtime > Change runtime type if you haven't already (this will require you to rerun all the colab cells so far).

cpu


In [5]:
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 6)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [6]:
model = DQN()
model.to(device)

DQN(
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=6, bias=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

In [7]:
# Define DQN Agent with Experience Replay Buffer
class DQNAgent:
    def __init__(self, action_dim, lr, gamma, epsilon, epsilon_decay, buffer_size):
        self.action_dim = action_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=buffer_size)

        self.optimizer = optim.Adam(model.parameters(), lr=lr)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_dim)
        q_values = model(torch.tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            state_tensor = torch.tensor(state, dtype=torch.float32)
            state_tensor.to(device)
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
            next_state_tensor.to(device)
            if not done:
                target = reward + self.gamma * torch.max(model(next_state_tensor)).item()

            target_f = model(state_tensor)
            target_f[action] = target
            self.optimizer.zero_grad()
            loss = nn.MSELoss()(torch.tensor(target_f), model(state_tensor))
            loss.backward()
            self.optimizer.step()
        if self.epsilon > 0.01:
            self.epsilon *= self.epsilon_decay

In [36]:
def permute(a):
    return np.transpose(np.resize(a, (1, 210, 160, 3)), (0, 3, 1, 2))

In [None]:
# Initialize environment and agent with Experience Replay Buffer
gym.register_envs(ale_py)

env = gym.make('ALE/Pong-v5', obs_type='ram')
action_dim = 6
agent = DQNAgent(action_dim, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, buffer_size=10000)

# Train the DQN agent with Experience Replay Buffer
batch_size = 32
num_episodes = 1000
for episode in range(num_episodes):
    state, _ = env.reset()
    #state = permute(state)
    #state.to(device)
    total_reward = 0
    done = False
    frames = 0
    while not done and frames < 1000:
        action = agent.act(state)
        next_state, reward, done, _, _ = env.step(action)
        #next_state = permute(next_state)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        agent.replay(batch_size)
        frames += 1
    print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

  loss = nn.MSELoss()(torch.tensor(target_f), model(state_tensor))


In [None]:
np.resize(next_state, (1, 210, 160, 3)).shape