In [1]:
# model config params
# Width of both hidden layers of the Q-network.
N_HIDDEN_NODES = 64
# Learning rate handed to the agent's optimiser.
LEARNING_RATE = 1e-3
# Multiplier applied to next-state Q-values when building the TD target.
DISCOUNT_FACTOR = 0.99
# Capacity of the replay buffer; the oldest transitions are dropped beyond it.
MAX_BUFFER_SIZE = 10_000
# Number of transitions sampled from the replay buffer per model update.
BATCH_SIZE = 64

In [2]:
from torch.nn import Linear, Module
from torch import Tensor


class DQN(Module):
    """Fully connected Q-network with two hidden ReLU layers.

    With ``duelling=False`` the network outputs ``l3(hidden)`` directly.
    With ``duelling=True`` an extra single-output head ``l4`` is created and
    the output is ``l4(hidden) + (l3(hidden) - max(l3(hidden)))``, so the
    largest entry of each output row equals the ``l4`` output for that row.

    Args:
        n_states: size of the input feature vector.
        n_actions: number of outputs (one per action).
        n_hidden_nodes: width of both hidden layers.
        duelling: whether to build and use the duelling head.
    """

    def __init__(self,
                 n_states: int,
                 n_actions: int,
                 n_hidden_nodes: int,
                 duelling: bool = False
                ):
        super().__init__()
        self.duelling = duelling
        # Layer creation order is kept (l1, l2, l3, l4) so parameter
        # initialisation and state_dict keys match previous checkpoints.
        self.l1 = Linear(n_states, n_hidden_nodes)
        self.l2 = Linear(n_hidden_nodes, n_hidden_nodes)
        self.l3 = Linear(n_hidden_nodes, n_actions)
        if duelling:
            self.l4 = Linear(n_hidden_nodes, 1)

    def forward(self, state: Tensor) -> Tensor:
        """Return one value per action for ``state``.

        ``state`` may be batched ``(..., n_states)`` or a single unbatched
        ``(n_states,)`` vector; the output keeps the same leading dimensions.
        """
        hidden = self.l1(state).relu()
        hidden = self.l2(hidden).relu()
        action_scores = self.l3(hidden)
        if not self.duelling:
            return action_scores
        # Evaluate the l3 head once and reduce over the last axis (rather
        # than a hard-coded dim=1) so unbatched inputs work as well.
        advantages = action_scores - action_scores.max(dim=-1, keepdim=True)[0]
        return self.l4(hidden) + advantages

In [3]:
import random
from collections import deque
from typing import List, Tuple

from numpy import float32
from numpy.typing import NDArray


class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next, terminal)`` tuples.

    Backed by a ``deque`` with ``maxlen=max_buffer_size``, so appending to a
    full buffer discards the oldest stored transition.
    """

    def __init__(self,
                 max_buffer_size: int
                ):
        self.buffer = deque(maxlen=max_buffer_size)

    def __len__(self) -> int:
        """Number of transitions currently stored."""
        return len(self.buffer)

    def add(self,
            state: NDArray[float32],
            action: int,
            reward: float,
            next: NDArray[float32],
            terminal: bool
           ):
        """Append one transition.

        NOTE: the parameter name ``next`` shadows the builtin inside this
        method; it is kept so existing keyword callers keep working.
        """
        self.buffer.append((state, action, reward, next, terminal))

    def sample(self, batches: int) -> Tuple[
            Tuple[NDArray[float32], ...],
            Tuple[int, ...],
            Tuple[float, ...],
            Tuple[NDArray[float32], ...],
            Tuple[bool, ...]]:
        """Draw ``batches`` distinct transitions uniformly at random.

        Despite its name, ``batches`` is the number of transitions to draw.

        Returns:
            A 5-tuple ``(states, actions, rewards, nexts, terminals)`` where
            each element is a tuple with one entry per sampled transition.

        Raises:
            ValueError: propagated from ``random.sample`` when ``batches`` is
                negative or larger than the number of stored transitions.
        """
        samples = random.sample(self.buffer, batches)
        if not samples:
            # zip(*[]) would yield nothing and break five-way unpacking.
            return (), (), (), (), ()
        # Materialise the transposed columns: a bare zip object is a one-shot
        # iterator, which does not match the annotated tuple return type.
        return tuple(zip(*samples))



In [4]:
import torch
from torch import cuda
from torch.backends import mps
from torch.optim import SGD, Adam
from torch.nn import MSELoss
import numpy as np
from typing import List

def _get_torch_device() -> str:
    if cuda.is_available():
        return "cuda"
    elif mps.is_available():
        return "mps"
    else:
        return "cpu"


class DQNAgent:
    """DQN agent with a replay buffer and a separately synced target network.

    ``modifications`` is a collection of option names. Two are recognised:
    ``'duelling'`` builds both networks with the duelling head, and
    ``'double'`` makes ``update_model`` pick the next action with the online
    network and evaluate it with the target network.
    """

    def __init__(self,
                 n_states: int,
                 n_actions: int,
                 n_hidden_nodes: int,
                 learning_rate: float,
                 discount_factor: float,
                 max_buffer_size: int,
                 batch_size: int,
                 modifications: List[str] = None
                ):
        """Build the online/target networks, the optimiser and the buffer.

        Args:
            n_states: size of the observation vector fed to the networks.
            n_actions: number of discrete actions.
            n_hidden_nodes: hidden-layer width of both networks.
            learning_rate: learning rate for the Adam optimiser.
            discount_factor: multiplier on next-state values in the TD target.
            max_buffer_size: capacity of the replay buffer.
            batch_size: number of transitions sampled per update.
            modifications: option names (see class docstring); ``None`` means
                no modifications.
        """
        # The default of None previously crashed on the membership tests
        # below; treat it as "no modifications".
        if modifications is None:
            modifications = []

        self.device = torch.device(_get_torch_device())

        self.n_actions = n_actions
        self.discount_factor = discount_factor
        self.batch_size = batch_size
        self.modifications = modifications

        use_duelling = 'duelling' in modifications

        self.qnet = DQN(n_states, n_actions, n_hidden_nodes, use_duelling).to(self.device)
        self.optimiser = Adam(self.qnet.parameters(), lr=learning_rate)
        self.replay_buffer = ReplayBuffer(max_buffer_size)

        # The target network starts as an exact copy of the online network
        # and is only ever changed through update_target_network().
        self.target_qnet = DQN(n_states, n_actions, n_hidden_nodes, use_duelling).to(self.device)
        self.target_qnet.load_state_dict(self.qnet.state_dict())
        self.target_qnet.eval()

    def step(self,
             state: NDArray[float32],
             action: int,
             reward: float,
             next: NDArray[float32],
             terminal: bool
            ):
        """Store one transition and, once enough are stored, run one update.

        An update is triggered only while the buffer holds strictly more than
        ``batch_size`` transitions.
        """
        self.replay_buffer.add(state, action, reward, next, terminal)
        if len(self.replay_buffer.buffer) > self.batch_size:
            self.update_model()

    def act(self,
            state: NDArray[float32],
            exploration_chance: float
           ) -> int:
        """Choose an action epsilon-greedily.

        With probability ``exploration_chance`` a uniformly random action is
        returned; otherwise the action with the highest online-network value.
        The result is always a plain Python ``int``.
        """
        if random.random() <= exploration_chance:
            return int(random.randrange(self.n_actions))

        state_batch = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnet.eval()
        with torch.no_grad():
            action_values = self.qnet(state_batch)
        self.qnet.train()
        # np.argmax yields a NumPy integer; convert so the annotated return
        # type holds.
        return int(np.argmax(action_values.cpu().numpy()))

    def update_model(self):
        """Sample a batch and take one optimiser step on the MSE TD loss."""
        states, actions, rewards, nexts, terminals = self.replay_buffer.sample(self.batch_size)

        states = torch.from_numpy(np.stack(states)).float().to(self.device)
        actions = torch.from_numpy(np.array(actions)).long().to(self.device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(self.device)
        nexts = torch.from_numpy(np.stack(nexts)).float().to(self.device)
        terminals = torch.from_numpy(np.array(terminals)).float().to(self.device)

        # Value of the action actually taken in each sampled state.
        q_values = self.qnet(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

        # The target is a constant with respect to the loss, so build it
        # without autograd bookkeeping (the online-network pass in the
        # 'double' branch previously recorded a graph that was thrown away).
        with torch.no_grad():
            target_next_values = self.target_qnet(nexts)
            if 'double' in self.modifications:
                greedy_next_actions = self.qnet(nexts).max(1)[1].unsqueeze(-1)
                next_q_values = target_next_values.gather(1, greedy_next_actions).squeeze(-1)
            else:
                next_q_values = target_next_values.max(1)[0]
            # (1 - terminals) removes the bootstrap term for terminal rows.
            expected_q_values = rewards + self.discount_factor * next_q_values * (1 - terminals)

        loss = MSELoss()(q_values, expected_q_values)

        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_qnet.load_state_dict(self.qnet.state_dict())

In [5]:
# training config params

# Upper bound on the number of training episodes.
N_EPISODES = 10_000
# Exploration probability used for the first episode.
EXPLORATION_CHANCE_START = 1.0
# Floor that the exploration probability is never decayed below.
EXPLORATION_CHANCE_END = 1e-4
# Factor the exploration probability is multiplied by after every episode.
EXPLORATION_CHANCE_DECAY = 0.995
# The target network is re-synced with the online network every this many episodes.
TARGET_UPDATE_FREQ = 10
# Interval (in episodes) between progress reports / stop checks; also the
# length of the rolling window of scores that is averaged.
FINISH_CHECK_FREQ = 100
# Training stops early once the rolling mean score reaches this value.
FINISH_SCORE = 250

In [23]:
import gym
import random

def train(modifications) -> DQNAgent:
    """Train a DQNAgent on ``LunarLander-v2`` and return it.

    Runs up to ``N_EPISODES`` episodes. After each episode the exploration
    chance is multiplied by ``EXPLORATION_CHANCE_DECAY`` (never dropping below
    ``EXPLORATION_CHANCE_END``). The target network is synced every
    ``TARGET_UPDATE_FREQ`` episodes. Every ``FINISH_CHECK_FREQ`` episodes the
    mean score of the last ``FINISH_CHECK_FREQ`` episodes is printed, and
    training stops early once that mean reaches ``FINISH_SCORE``.

    Args:
        modifications: option names forwarded to ``DQNAgent``.

    Returns:
        The trained agent.
    """
    env = gym.make('LunarLander-v2')
    # try/finally so the environment is released even if training raises
    # (including a KeyboardInterrupt from stopping the cell).
    try:
        n_states = env.observation_space.shape[0]
        n_actions = env.action_space.n

        agent = DQNAgent(n_states,
                         n_actions,
                         N_HIDDEN_NODES,
                         LEARNING_RATE,
                         DISCOUNT_FACTOR,
                         MAX_BUFFER_SIZE,
                         BATCH_SIZE,
                         modifications=modifications
                        )

        # Rolling window of the most recent episode scores.
        latest_scores = deque(maxlen=FINISH_CHECK_FREQ)

        exploration_chance = EXPLORATION_CHANCE_START

        for episode_n in range(1, N_EPISODES + 1):
            state, _ = env.reset()
            score = 0

            episode_over = False
            while not episode_over:
                action = agent.act(state, exploration_chance)
                next_state, reward, terminated, truncated, _ = env.step(action)

                # Only a true environment termination is stored as terminal.
                # A time-limit truncation still ends the episode here, but the
                # stored transition keeps its bootstrap term because the next
                # state is not actually a terminal state.
                agent.step(state, action, reward, next_state, terminated)

                state = next_state
                score += reward
                episode_over = terminated or truncated

            latest_scores.append(score)

            exploration_chance = max(EXPLORATION_CHANCE_END, EXPLORATION_CHANCE_DECAY * exploration_chance)

            if episode_n % TARGET_UPDATE_FREQ == 0:
                agent.update_target_network()

            if episode_n % FINISH_CHECK_FREQ == 0:
                mean_score = np.mean(latest_scores)
                print(f'Average score of {mean_score:0.3f} @ {episode_n}/{N_EPISODES}')
                if mean_score >= FINISH_SCORE:
                    print(f'Average score was above {FINISH_SCORE} over last {FINISH_CHECK_FREQ} episodes. Ending training...')
                    break
    finally:
        env.close()
    return agent

# Train with both recognised modifications enabled: the double-DQN target
# and the duelling network head.
agent = train(['double', 'duelling'])

Average score of -145.531 @ 100/10000
Average score of -68.686 @ 200/10000
Average score of 10.673 @ 300/10000
Average score of 30.761 @ 400/10000
Average score of 110.904 @ 500/10000
Average score of 73.490 @ 600/10000
Average score of 111.724 @ 700/10000
Average score of 145.137 @ 800/10000
Average score of 55.154 @ 900/10000
Average score of 172.880 @ 1000/10000
Average score of 199.781 @ 1100/10000
Average score of 188.393 @ 1200/10000
Average score of 187.736 @ 1300/10000
Average score of 108.024 @ 1400/10000
Average score of -22.186 @ 1500/10000
Average score of 185.594 @ 1600/10000
Average score of 156.228 @ 1700/10000
Average score of 11.531 @ 1800/10000
Average score of 92.326 @ 1900/10000
Average score of 204.403 @ 2000/10000
Average score of 209.802 @ 2100/10000
Average score of 137.123 @ 2200/10000
Average score of 168.348 @ 2300/10000
Average score of 196.342 @ 2400/10000
Average score of 224.495 @ 2500/10000
Average score of 253.712 @ 2600/10000
Average score was above 250 over last 100 episodes. Ending training...

In [None]:
def visual_run_agent(agent):
    """Run one rendered ``LunarLander-v2`` episode with ``agent`` acting greedily.

    The agent is queried with an exploration chance of 0. When the episode
    ends (terminated or truncated) the total score and the number of steps
    taken are printed.

    Args:
        agent: object exposing ``act(state, exploration_chance)``.
    """
    env = gym.make('LunarLander-v2', render_mode='human')
    # try/finally so the render window and environment are released even if
    # the episode is interrupted or the agent raises.
    try:
        state, _ = env.reset()

        score = 0
        steps = 0

        episode_over = False
        while not episode_over:
            action = agent.act(state, 0)
            state, reward, terminated, truncated, _ = env.step(action)
            steps += 1
            score += reward
            episode_over = terminated or truncated

        print(f'Score achieved on test: {score}')
        print(f'Steps taken until termination: {steps}')
    finally:
        env.close()

# Watch the trained agent play a single rendered episode.
visual_run_agent(agent)