In [15]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from itertools import count
from collections import deque, namedtuple

SEED = 42

In [16]:
class DQN(nn.Module):

    def __init__(self, inputs, outputs):
        super().__init__()

        self.fc1 = nn.Linear(in_features=inputs, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=64)
        self.fc4 = nn.Linear(in_features=64, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=outputs)

    def forward(self, t):
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = F.relu(self.fc3(t))
        t = F.relu(self.fc4(t))
        t = self.out(t)
        return t

In [17]:
Experience = namedtuple(
    "Experience", field_names=["state", "action", "reward", "next_state", "done"]
)


class ReplayMemory(object):

    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.seed = random.seed(seed)

    def push(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, device):
        experiences = random.sample(self.memory, k=self.batch_size)

        states = (
            torch.from_numpy(np.vstack([e.state for e in experiences if e is not None]))
            .float()
            .to(device)
        )
        actions = (
            torch.from_numpy(
                np.vstack([e.action for e in experiences if e is not None])
            )
            .long()
            .to(device)
        )
        rewards = (
            torch.from_numpy(
                np.vstack([e.reward for e in experiences if e is not None])
            )
            .float()
            .to(device)
        )
        next_states = (
            torch.from_numpy(
                np.vstack([e.next_state for e in experiences if e is not None])
            )
            .float()
            .to(device)
        )
        dones = (
            torch.from_numpy(
                np.vstack([e.done for e in experiences if e is not None]).astype(
                    np.uint8
                )
            )
            .float()
            .to(device)
        )

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [18]:
class SimpleDQNAgent:
    def __init__(
        self,
        state_vector_length,
        num_actions,
        alpha=0.001,
        eps=1,
        eps_decay=0.995,
        eps_min=0.05,
        gamma=0.9,
        batch_size=64,
        seed=42,
    ):
        self.num_actions = num_actions
        self.eps = eps
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.step = 0
        self.policy_net = DQN(state_vector_length, num_actions).to(self.device)
        self.optimizer = torch.optim.Adam(params=self.policy_net.parameters(), lr=alpha)

        self.memory = ReplayMemory(100000, batch_size, seed)

        if seed != None:
            np.random.seed(seed)

    def select_action(self, s):
        self.step += 1
        if np.random.random() < self.eps:
            action = np.random.randint(0, self.num_actions)
        else:
            action = self._get_best_action(s)

        return action

    def _get_best_action(self, s):
        with torch.no_grad():
            action = (
                self.policy_net(torch.tensor([s]).to(self.device))
                .argmax(dim=1)
                .to(self.device)
                .item()
            )
        return action

    def update_q(self, s, a, s_prime, r, done):
        self.memory.push(s, a, r, s_prime, done)
        self.step += 1

        if done:
            self.eps = max(self.eps_min, self.eps * self.eps_decay)

        if len(self.memory) > self.memory.batch_size:
            experiences = self.memory.sample(self.device)
            self.learn(experiences)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        next_q_values = self.policy_net(next_states).detach().max(1)[0].unsqueeze(1)
        q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        current_q_values = self.policy_net(states).gather(1, actions)

        loss = F.mse_loss(current_q_values, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def save_network(self, outfile):
        torch.save(self.policy_net.state_dict(), outfile)

    def load_network(self, infile):
        self.policy_net.load_state_dict(torch.load(infile))
        self.policy_net.eval()

In [29]:
def lander_runner(
    num_episodes,
    target_update,
    alpha,
    eps,
    eps_decay,
    gamma,
    seed,
    convergence_threshold=200,
    render=False,
):
    env = gym.make("LunarLander-v2")
    # env.seed(SEED)
    agent = SimpleDQNAgent(
        env.observation_space.shape[0],
        env.action_space.n,
        alpha=alpha,
        eps=eps,
        eps_decay=eps_decay,
        gamma=gamma,
        seed=SEED,
    )

    rewards = []

    for e in range(num_episodes):
        cur_observation = env.reset()
        if render:
            env.render()
        episode_reward = 0
        for t in count():
            action = agent.select_action(cur_observation)
            next_observation, reward, done, truncated, info = env.step(action)
            agent.update_q(cur_observation, action, next_observation, reward, done)
            cur_observation = next_observation
            episode_reward += reward
            if render:
                env.render()
            if done:
                rewards.append(episode_reward)
                # plot_rewards(rewards)
                plt.pause(0.01)
                print(f"Episode {e}: {episode_reward}")
                # if is_ipython:
                #     display.clear_output(wait=True)
                break
        # if e % target_update == 0:
        #     agent.update_target()
        # if np.all(moving_average(rewards, 100)[-100:] >= convergence_threshold):
        #     print(f"Solved in {e} episodes.")
        #     agent.save_network(f"out\\agent.pt")
        #     break

    env.close()
    return rewards, agent

In [30]:
run_rewards, agent = lander_runner(
    num_episodes=1500,
    target_update=4,
    alpha=0.0005,
    eps=1,
    eps_decay=0.99,
    gamma=0.999,
    seed=57,
    convergence_threshold=210
)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.