In [1]:
from torch.nn import Linear, Module
from torch import Tensor


class DQN(Module):
    """Fully-connected Q-network with an optional duelling head.

    Two hidden ReLU layers feed an action head (``l3``). When ``duelling`` is
    true, an extra scalar head (``l4``) estimates the state value and the
    forward pass combines both as ``Q = V + (A - mean_a A)``.

    Args:
        n_states: Size of the observation vector fed to the first layer.
        n_actions: Number of outputs of the action head.
        n_hidden_neurons: Width of both hidden layers.
        duelling: Whether to build and use the state-value head.
    """

    def __init__(self,
                 n_states: int,
                 n_actions: int,
                 n_hidden_neurons: int,
                 duelling: bool = False
                ):
        super(DQN, self).__init__()
        self.duelling = duelling
        self.l1 = Linear(n_states, n_hidden_neurons)
        self.l2 = Linear(n_hidden_neurons, n_hidden_neurons)
        self.l3 = Linear(n_hidden_neurons, n_actions)
        if duelling:
            self.l4 = Linear(n_hidden_neurons, 1) # state value estimation

    def forward(self, state: Tensor) -> Tensor:
        """Return one Q-value per action for every state in ``state``.

        Args:
            state: Input whose last dimension has size ``n_states``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``n_actions``.
        """
        x = self.l1(state).relu()
        x = self.l2(x).relu()
        if not self.duelling:
            return self.l3(x)

        # Evaluate the action head once and reuse it.
        advantages = self.l3(x)
        # Centre each sample's advantages with that sample's OWN mean over the
        # action dimension. Indexing the mean with [0] would subtract the
        # first sample's mean from the whole batch.
        centred = advantages - advantages.mean(dim=-1, keepdim=True)
        return self.l4(x) + centred

In [2]:
from numpy import float32
from numpy.typing import NDArray
from typing import List, Tuple
from collections import deque


class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next, terminal)`` tuples.

    Backed by ``deque(maxlen=max_buffer_size)``: once the buffer is full, each
    ``add`` silently drops the oldest transition.
    """

    def __init__(self,
                 max_buffer_size: int
                ):
        self.buffer = deque(maxlen=max_buffer_size)

    def add(self,
            state: NDArray[float32],
            action: int,
            reward: float,
            next: NDArray[float32],
            terminal: bool
           ):
        """Append one transition to the right-hand end of the buffer."""
        self.buffer.append((state, action, reward, next, terminal))

    def sample(self,
               batches: int
              ) -> Tuple[Tuple[NDArray[float32], ...],
                         Tuple[int, ...],
                         Tuple[float, ...],
                         Tuple[NDArray[float32], ...],
                         Tuple[bool, ...]]:
        """Draw ``batches`` distinct transitions uniformly at random.

        Args:
            batches: Number of transitions to draw; must not exceed the number
                currently stored (``random.sample`` raises ``ValueError``
                otherwise).

        Returns:
            A real tuple ``(states, actions, rewards, nexts, terminals)``, each
            element itself a tuple of length ``batches``.
        """
        # NOTE: `random` is not imported in this cell; it has to be available
        # at module level by the time sample() is called.
        samples = random.sample(self.buffer, batches)
        # Transpose the list of transitions into per-field columns. Materialise
        # it so the result matches the annotation and can be indexed or
        # iterated more than once, unlike a bare zip iterator.
        return tuple(zip(*samples))



In [3]:
import numpy as np

class PrioritisedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to a stored priority.

    Each transition has a priority ``(min(|error|, td_error_clamp) +
    min_priority) ** priority_scale``, held in a second deque that evicts in
    lock-step with the transition deque.

    Args:
        max_buffer_size: Capacity shared by the transition and priority deques.
        bias_factor_start: Initial exponent applied to the importance weights.
        bias_factor_end: Upper bound for that exponent.
        bias_increment: Amount added to the exponent by ``update_bias_factor``.
        priority_scale: Exponent applied to every priority.
        td_error_clamp: Largest error magnitude that contributes to a priority.
    """

    def __init__(self,
                 max_buffer_size: int,
                 bias_factor_start: float,
                 bias_factor_end: float,
                 bias_increment: float,
                 priority_scale: float,
                 td_error_clamp: int
                ):
        super(PrioritisedReplayBuffer, self).__init__(max_buffer_size)
        self.priorities = deque(maxlen=max_buffer_size)
        self.min_priority = 0.01
        self.bias_factor = bias_factor_start
        self.bias_factor_end = bias_factor_end
        self.bias_increment = bias_increment
        self.priority_scale = priority_scale
        self.td_error_clamp = td_error_clamp

    def _priority(self, error: float) -> float:
        """Map an error to a priority; shared by ``add`` and ``update_priorities``."""
        # Clamping |error| from above equals clipping the signed error to
        # [-clamp, +clamp] and then taking the absolute value.
        magnitude = min(abs(float(error)), self.td_error_clamp)
        return (magnitude + self.min_priority) ** self.priority_scale

    def add(self,
            state: NDArray[float32],
            action: int,
            reward: float,
            next: NDArray[float32],
            terminal: bool,
            error: float
           ):
        """Store a transition together with the priority derived from ``error``."""
        super(PrioritisedReplayBuffer, self).add(state, action, reward, next, terminal)
        self.priorities.append(self._priority(error))

    def sample(self,
               batches: int
              ) -> Tuple[tuple, tuple, tuple, tuple, tuple, np.ndarray, np.ndarray]:
        """Draw ``batches`` transitions with probability proportional to priority.

        Indices come from ``np.random.choice`` with its default of sampling
        with replacement, so one transition can appear more than once.

        Returns:
            ``(states, actions, rewards, nexts, terminals, indices, weights)``.
            The first five are tuples of length ``batches``. ``indices`` is an
            integer array of buffer positions. ``weights`` holds the
            importance-sampling weights ``(N * p_i) ** -bias_factor``,
            normalised to sum to 1.
        """
        priorities = np.asarray(self.priorities, dtype=np.float64)
        probas = priorities / priorities.sum()
        indices = np.random.choice(len(self.buffer), batches, p=probas)
        states, actions, rewards, nexts, terminals = zip(*(self.buffer[i] for i in indices))
        weights = (len(self.buffer) * probas[indices]) ** -self.bias_factor
        weights /= weights.sum()
        return states, actions, rewards, nexts, terminals, indices, weights

    def update_priorities(self, idx, error):
        """Overwrite the priority at buffer position ``idx`` from a fresh ``error``.

        Uses the same clamp as ``add`` so that refreshed and newly inserted
        priorities stay on one scale.
        """
        self.priorities[idx] = self._priority(error)

    def update_bias_factor(self):
        """Move the importance-weight exponent one increment towards its cap."""
        self.bias_factor = min(self.bias_factor + self.bias_increment, self.bias_factor_end)

In [4]:
import torch
from torch import cuda
from torch.backends import mps
from torch.optim import SGD, Adam
from torch.nn import MSELoss
from typing import List, Dict, Union

def _get_torch_device() -> str:
    if cuda.is_available():
        return "cuda"
    elif mps.is_available():
        return "mps"
    else:
        return "cpu"


class DQNAgent:
    """DQN agent with optional 'double', 'duelling' and 'per' modifications.

    The agent owns an online network (``qnet``), a frozen copy used for
    bootstrapping (``target_qnet``), an Adam optimiser and a replay buffer.
    ``modifications`` is a list of feature flags:

    * ``'duelling'`` - build both networks with the duelling head;
    * ``'double'``   - pick the next action with ``qnet`` and score it with
      ``target_qnet``;
    * ``'per'``      - use ``PrioritisedReplayBuffer`` (needs ``per_params``).

    Args:
        n_states: Observation vector length.
        n_actions: Number of discrete actions.
        n_hidden_neurons: Hidden layer width of both networks.
        learning_rate: Adam learning rate.
        discount_factor: Discount applied to the bootstrapped next-state value.
        max_buffer_size: Replay buffer capacity.
        batch_size: Transitions per gradient step.
        modifications: Feature flags as listed above; ``None`` means none.
        per_params: Keyword arguments forwarded to ``PrioritisedReplayBuffer``.
    """

    def __init__(self,
                 n_states: int,
                 n_actions: int,
                 n_hidden_neurons: int,
                 learning_rate: float,
                 discount_factor: float,
                 max_buffer_size: int,
                 batch_size: int,
                 modifications: List[str] = None,
                 per_params: Dict[str, float] = None
                ):
        self.device = torch.device(_get_torch_device())

        self.n_actions = n_actions
        self.discount_factor = discount_factor
        self.batch_size = batch_size

        # The declared default is None; normalise it so the `in` checks work.
        self.modifications = modifications if modifications is not None else []
        duelling = 'duelling' in self.modifications

        self.qnet = DQN(n_states, n_actions, n_hidden_neurons, duelling).to(self.device)
        self.optimiser = Adam(self.qnet.parameters(), lr=learning_rate)

        if 'per' in self.modifications:
            if per_params is None:
                raise TypeError("per_params is required when 'per' is in modifications")
            self.replay_buffer = PrioritisedReplayBuffer(max_buffer_size, **per_params)
        else:
            self.replay_buffer = ReplayBuffer(max_buffer_size)

        # The target network starts as an exact copy and is only refreshed by
        # update_target_network().
        self.target_qnet = DQN(n_states, n_actions, n_hidden_neurons, duelling).to(self.device)
        self.target_qnet.load_state_dict(self.qnet.state_dict())
        self.target_qnet.eval()

    def _bootstrap_targets(self, rewards: Tensor, nexts: Tensor, terminals: Tensor) -> Tensor:
        """Return ``r + gamma * Q_target(s', a*) * (1 - terminal)`` for a batch, without gradients."""
        with torch.no_grad():
            target_next = self.target_qnet(nexts)
            if 'double' in self.modifications:
                # Double DQN: the online net selects, the target net evaluates.
                best_actions = self.qnet(nexts).max(1)[1].unsqueeze(-1)
                next_q_values = target_next.gather(1, best_actions).squeeze(-1)
            else:
                next_q_values = target_next.max(1)[0]
            return rewards + self.discount_factor * next_q_values * (1 - terminals)

    def _td_errors(self,
                   states: Tensor,
                   actions: Tensor,
                   rewards: Tensor,
                   nexts: Tensor,
                   terminals: Tensor
                  ) -> Tensor:
        """Return the signed TD error ``target - Q(s, a)`` per transition, without gradients."""
        with torch.no_grad():
            q_values = self.qnet(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
            return self._bootstrap_targets(rewards, nexts, terminals) - q_values

    def get_td_error(self,
                     state: NDArray[float32],
                     action: int,
                     reward: float,
                     next: NDArray[float32],
                     terminal: bool
                    ) -> float:
        """Return the signed TD error of one transition under the current networks.

        The value is ``target - Q(state, action)``, using the same target as
        the training loss, including the 'double' variant when enabled. No
        autograd graph is built.
        """
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        action_tensor = torch.tensor([action], dtype=torch.long, device=self.device)
        reward_tensor = torch.tensor([reward], dtype=torch.float32, device=self.device)
        next_tensor = torch.from_numpy(next).float().unsqueeze(0).to(self.device)
        terminal_tensor = torch.tensor([float(terminal)], dtype=torch.float32, device=self.device)
        errors = self._td_errors(state_tensor, action_tensor, reward_tensor, next_tensor, terminal_tensor)
        return errors.item()

    def _step_no_per(self,
             state: NDArray[float32],
             action: int,
             reward: float,
             next: NDArray[float32],
             terminal: bool
            ):
        """Store the transition and train once the buffer holds more than a batch."""
        self.replay_buffer.add(state, action, reward, next, terminal)
        if len(self.replay_buffer.buffer) > self.batch_size:
            self.update_model()

    def _step_per(self,
             state: NDArray[float32],
             action: int,
             reward: float,
             next: NDArray[float32],
             terminal: bool
            ):
        """Like ``_step_no_per`` but seeds the new transition's priority with its TD error."""
        error = self.get_td_error(state, action, reward, next, terminal)
        self.replay_buffer.add(state, action, reward, next, terminal, error)
        if len(self.replay_buffer.buffer) > self.batch_size:
            self.update_model()

    def step(self,
             state: NDArray[float32],
             action: int,
             reward: float,
             next: NDArray[float32],
             terminal: bool
            ):
        """Record one environment transition and possibly run a gradient step."""
        if 'per' in self.modifications:
            self._step_per(state, action, reward, next, terminal)
        else:
            self._step_no_per(state, action, reward, next, terminal)

    def act(self,
            state: NDArray[float32],
            exploration_chance: float
           ) -> int:
        """Pick an action epsilon-greedily.

        With probability ``exploration_chance`` a uniformly random action is
        returned; otherwise the action with the highest Q-value under
        ``qnet``. The result is always a plain Python ``int``.
        """
        if random.random() > exploration_chance:
            self.qnet.eval()
            state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            with torch.no_grad():
                action_values = self.qnet(state_tensor)
            self.qnet.train()
            return int(action_values.argmax().item())
        return random.randrange(self.n_actions)

    def update_model(self):
        """Sample a batch, take one optimiser step and, with PER, refresh priorities."""
        use_per = 'per' in self.modifications
        if use_per:
            states, actions, rewards, nexts, terminals, indices, weights = self.replay_buffer.sample(self.batch_size)
        else:
            states, actions, rewards, nexts, terminals = self.replay_buffer.sample(self.batch_size)

        # Cast to float32/int64 on the CPU before moving to the device.
        states = torch.from_numpy(np.stack(states)).float().to(self.device)
        actions = torch.from_numpy(np.array(actions)).long().to(self.device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(self.device)
        nexts = torch.from_numpy(np.stack(nexts)).float().to(self.device)
        terminals = torch.from_numpy(np.array(terminals)).float().to(self.device)

        q_values = self.qnet(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        expected_q_values = self._bootstrap_targets(rewards, nexts, terminals)

        if use_per:
            # Weight each sample's squared error individually. Multiplying an
            # already mean-reduced scalar loss by the weight vector would only
            # rescale the loss and apply no per-sample correction.
            is_weights = torch.from_numpy(np.asarray(weights)).float().to(self.device)
            per_sample_loss = MSELoss(reduction='none')(q_values, expected_q_values)
            loss = (per_sample_loss * is_weights).mean()
        else:
            loss = MSELoss()(q_values, expected_q_values)

        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

        if use_per:
            # Recompute the errors with the just-updated network in a single
            # batched pass, and write each one back to the buffer slot it was
            # sampled from, not to its position inside the batch.
            td_errors = self._td_errors(states, actions, rewards, nexts, terminals).cpu().numpy()
            for buffer_idx, error in zip(indices, td_errors):
                self.replay_buffer.update_priorities(int(buffer_idx), float(error))
            self.replay_buffer.update_bias_factor()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_qnet.load_state_dict(self.qnet.state_dict())

In [5]:
# Default hyperparameters for a single training run.
params = dict(
    # Network and optimiser
    n_hidden_neurons=128,
    learning_rate=5e-4,
    discount_factor=0.99,
    # Replay buffer
    max_buffer_size=10_000,
    batch_size=32,
    # Training schedule
    n_episodes=10_000,
    exploration_chance_start=1.0,
    exploration_chance_end=1e-4,
    exploration_chance_decay=0.99,  # multiplied in after every episode
    target_update_freq=20,          # episodes between target-network syncs
    finish_check_freq=100,          # episodes per progress report / score window
    finish_score=200,               # window average that ends training early
    # Forwarded to PrioritisedReplayBuffer when 'per' is enabled
    per_params=dict(
        bias_factor_start=0.5,
        bias_factor_end=1.0,
        bias_increment=1e-2,
        priority_scale=0.7,
        td_error_clamp=50,  # 1/4 of target end score
    ),
)

In [6]:
import gym
import random
import time
import json

def train(params: Dict[str, Union[int, float]], modifications: List[str] = None, save_data=False) -> DQNAgent:
    """Train a ``DQNAgent`` on LunarLander-v2 and return it.

    Args:
        params: Hyperparameters. Keys read here: ``n_hidden_neurons``,
            ``learning_rate``, ``discount_factor``, ``max_buffer_size``,
            ``batch_size``, ``n_episodes``, ``exploration_chance_start``,
            ``exploration_chance_end``, ``exploration_chance_decay``,
            ``target_update_freq``, ``finish_check_freq``, ``finish_score``
            and, when 'per' is enabled, ``per_params``.
        modifications: Feature flags forwarded to ``DQNAgent``; ``None`` or an
            empty list trains the plain agent.
        save_data: When true, write ``params`` as JSON followed by one
            ``score@episode:steps`` line per episode to a timestamped file in
            the working directory.

    Returns:
        The trained agent. Training stops early once the mean score over the
        last ``finish_check_freq`` episodes reaches ``finish_score``.
    """
    # Avoid a shared mutable default argument.
    modifications = [] if modifications is None else modifications

    env = gym.make('LunarLander-v2')
    try:
        n_states = env.observation_space.shape[0]
        n_actions = env.action_space.n

        if save_data:
            data_fname = f'DQN_{"_".join(modifications)}_{time.time()}.txt'

            with open(data_fname, 'w') as f:
                f.write(json.dumps(params))
                f.write('\n---\n')

        agent = DQNAgent(n_states,
                         n_actions,
                         params['n_hidden_neurons'],
                         params['learning_rate'],
                         params['discount_factor'],
                         params['max_buffer_size'],
                         params['batch_size'],
                         modifications=modifications,
                         per_params=params['per_params'] if 'per' in modifications else None
                        )

        scores = []
        latest_scores = deque(maxlen=params['finish_check_freq'])

        exploration_chance = params['exploration_chance_start']

        for episode_n in range(1, params['n_episodes'] + 1):
            state, _ = env.reset()
            score = 0
            steps = 0

            while True:
                action = agent.act(state, exploration_chance)
                next_state, reward, terminated, truncated, info = env.step(action)

                # Store only true termination as terminal. A time-limit
                # truncation ends the episode but the state is not absorbing,
                # so the target must still bootstrap from next_state.
                agent.step(state, action, reward, next_state, terminated)

                state = next_state
                score += reward
                steps += 1

                if terminated or truncated:
                    break

            scores.append(score)
            latest_scores.append(score)

            exploration_chance = max(params['exploration_chance_end'],
                                     params['exploration_chance_decay'] * exploration_chance)

            if episode_n % params['target_update_freq'] == 0:
                agent.update_target_network()
            if save_data:
                with open(data_fname, 'a') as f:
                    f.write(f'\n{score}@{episode_n}:{steps}')
            if episode_n % params['finish_check_freq'] == 0:
                window_mean = np.mean(latest_scores)
                print(f'Average score of {window_mean:0.3f} @ {episode_n}/{params["n_episodes"]}')
                if window_mean >= params['finish_score']:
                    print(f'Average score was above {params["finish_score"]} over last {params["finish_check_freq"]} episodes. Ending training...')
                    break
    finally:
        # Release the environment even if training is interrupted or raises.
        env.close()
    return agent

In [7]:
def visual_run_agent(agent):
    """Play one greedy episode of LunarLander-v2 in a rendered window.

    The agent acts with an exploration chance of 0. The total score and the
    number of steps are printed when the episode terminates or is truncated.
    The environment is closed even if the episode raises.

    Args:
        agent: Object exposing ``act(state, exploration_chance)``.
    """
    env = gym.make('LunarLander-v2', render_mode='human')
    try:
        state, _ = env.reset()

        score = 0
        steps = 0

        while True:
            action = agent.act(state, 0)
            state, reward, terminated, truncated, info = env.step(action)
            steps += 1
            score += reward

            if terminated or truncated:
                break

        print(f'Score achieved on test: {score}')
        print(f'Steps taken until termination: {steps}')
    finally:
        # Always release the render window, including on errors.
        env.close()

In [None]:
import itertools

def get_param_combos(params):
    """Expand every list-valued entry of ``params`` into a grid of assignments.

    Args:
        params: Mapping of parameter name to value. Only values that are
            ``list`` instances are treated as search dimensions.

    Returns:
        ``(count, combos)``. ``combos`` is a list with one entry per grid
        point, each a list of ``(name, value)`` pairs in the order the names
        appear in ``params``. ``count`` equals ``len(combos)``. With no
        list-valued entries the grid has one point with nothing to override:
        ``(1, [[]])``.
    """
    grid_axes = [(name, options) for name, options in params.items()
                 if isinstance(options, list)]
    if not grid_axes:
        # zip(*[]) yields nothing to unpack. An empty product still has
        # exactly one (empty) combination.
        return 1, [[]]

    names, option_lists = zip(*grid_axes)
    combos = [list(zip(names, choice)) for choice in itertools.product(*option_lists)]
    return len(combos), combos

def generate_powerset(lst):
    """Return every subset of ``lst`` as a list of lists, smallest subsets first.

    Subsets of equal size appear in the order ``itertools.combinations``
    produces them; the empty subset is always the first entry.
    """
    by_size = (itertools.combinations(lst, size) for size in range(len(lst) + 1))
    return [list(subset) for subset in itertools.chain.from_iterable(by_size)]

def hyperparam_optim():
    """Grid-search hyperparameters for every combination of agent modifications.

    For each subset of ``['double', 'duelling', 'per']`` and each point of the
    grid spanned by the list-valued entries below, the chosen values are
    written into the parameter dict and ``train`` is run three times with
    ``save_data=True``. Progress is printed as it goes.
    """
    search_space = {
        'n_hidden_neurons': [64, 128],
        'learning_rate': [1e-3, 5e-4, 1e-5],
        'discount_factor': 0.99,
        'max_buffer_size': 10_000,
        'batch_size': [32, 64],
        'n_episodes': 10_000,
        'exploration_chance_start': 1.0,
        'exploration_chance_end': 1e-5,
        'exploration_chance_decay': 0.995,
        'target_update_freq': [10, 20],
        'finish_check_freq': 100,
        'finish_score': 200,
        'per_params': {
            'bias_factor_start': 0.5,
            'bias_factor_end': 1.0,
            'bias_increment': 1e-2,
            'priority_scale': 0.7,
            'td_error_clamp': 50,  # 1/4 of target end score
        },
    }
    modification_sets = generate_powerset(['double', 'duelling', 'per'])
    # The grid is expanded once, before the dict's list values are overwritten
    # in place by the concrete choices below.
    n_combos, combos = get_param_combos(search_space)

    for modification_set in modification_sets:
        print(modification_set)
        for combo_idx in range(n_combos):
            print(f'{combo_idx}/{n_combos}')
            search_space.update(combos[combo_idx])
            print(search_space)
            for repeat in range(3):
                print(repeat)
                train(search_space, modification_set, save_data=True)

# Run the full grid search: 8 modification subsets x 24 parameter combinations
# x 3 repeats, each a training run of up to 10,000 episodes.
hyperparam_optim()

[]
0/24
{'n_hidden_neurons': 64, 'learning_rate': 0.001, 'discount_factor': 0.99, 'max_buffer_size': 10000, 'batch_size': 32, 'n_episodes': 10000, 'exploration_chance_start': 1.0, 'exploration_chance_end': 1e-05, 'exploration_chance_decay': 0.995, 'target_update_freq': 10, 'finish_check_freq': 100, 'finish_score': 200, 'per_params': {'bias_factor_start': 0.5, 'bias_factor_end': 1.0, 'bias_increment': 0.01, 'priority_scale': 0.7, 'td_error_clamp': 50}}
0


  if not isinstance(terminated, (bool, np.bool8)):


Average score of -147.601 @ 100/10000
Average score of -50.143 @ 200/10000
Average score of 4.726 @ 300/10000
Average score of 50.524 @ 400/10000
Average score of 85.000 @ 500/10000
Average score of 126.376 @ 600/10000
Average score of 127.217 @ 700/10000
Average score of 164.118 @ 800/10000
Average score of 157.203 @ 900/10000
Average score of 232.761 @ 1000/10000
Average score was above 200 over last 100 episodes. Ending training...
1
Average score of -125.274 @ 100/10000
Average score of -58.371 @ 200/10000
Average score of 16.200 @ 300/10000
Average score of 28.196 @ 400/10000
Average score of 62.706 @ 500/10000
Average score of 31.682 @ 600/10000
Average score of 109.152 @ 700/10000
Average score of 107.662 @ 800/10000
Average score of 77.923 @ 900/10000
Average score of 199.211 @ 1000/10000
Average score of 177.310 @ 1100/10000
Average score of 123.506 @ 1200/10000
Average score of 8.900 @ 1300/10000
Average score of 86.091 @ 1400/10000
Average score of 65.744 @ 1500/10000
Averag