## Create QNetwork Class

# DDQN Agent in Lunar Lander Version 2

To run this code safely, run it in Google Colab and comment out anything that saves plots or the agent's networks to Google Drive.

### Create QNetwork Class

Here we create a QNetwork class that will be used for both the agent's target network and its online network. It also supports optional extra hidden layers, which will be added during hyperparameter tuning later on.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Fully connected network that outputs one Q-value per discrete action."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64, fc3_units=0, fc4_units=0):
        """Seed torch and assemble the linear layers.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Number of outputs (one Q-value per action)
            seed (int): Random seed handed to ``torch.manual_seed``
            fc1_units (int): Width of the first hidden layer
            fc2_units (int): Width of the second hidden layer
            fc3_units (int): Width of the third hidden layer; 0 leaves it out
            fc4_units (int): Width of the fourth hidden layer; 0 leaves it out
        """
        super().__init__()

        self.seed = torch.manual_seed(seed)

        # Layers are created strictly in fc1 -> fc5 order; `width` tracks the
        # output size of the most recently created layer so the next one can
        # attach to it regardless of which optional layers are present.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        width = fc2_units

        if fc3_units != 0:
            self.fc3 = nn.Linear(width, fc3_units)
            width = fc3_units

        if fc4_units != 0:
            self.fc4 = nn.Linear(width, fc4_units)
            width = fc4_units

        self.fc5 = nn.Linear(width, action_size)

    def forward(self, state):
        """Map a batch of states to a batch of action values."""
        hidden = state
        # fc3 / fc4 only exist when they were requested in __init__.
        for layer_name in ("fc1", "fc2", "fc3", "fc4"):
            layer = getattr(self, layer_name, None)
            if layer is not None:
                hidden = F.relu(layer(hidden))
        # The output layer is linear: Q-values are unbounded.
        return self.fc5(hidden)


: 

### Create Replay Buffer

In [None]:
import numpy as np
import random
from collections import namedtuple, deque

import torch

# TPU torch.device(xm.xla_device() if xm.xla_available()
class ReplayBuffer:
    """Bounded FIFO store of experience tuples with uniform random sampling."""

    def __init__(
        self,
        action_size,
        buffer_size,
        batch_size,
        seed,
        device=torch.device(torch.device("cuda:0") if torch.cuda.is_available() else "cpu")
    ):
        """Set up an empty buffer and seed Python's ``random`` module.

        Params
        ======
            action_size (int): dimension of each action (stored, not otherwise used here)
            buffer_size (int): maximum number of experiences kept; oldest are dropped first
            batch_size (int): number of experiences returned by ``sample``
            seed (int): seed passed to ``random.seed``
            device (torch.device): device the sampled tensors are moved to
        """
        self.action_size = action_size
        self.batch_size = batch_size
        self.device = device
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed returns None, so this attribute only records that seeding happened.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences at random and return them as tensors.

        Returns a tuple ``(states, actions, rewards, next_states, dones)``;
        each field is stacked row-wise with ``np.vstack`` and placed on
        ``self.device``. Actions are int64, everything else is float32.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [exp for exp in drawn if exp is not None]

        def stacked(field):
            # One row per sampled experience for the requested field.
            return np.vstack([getattr(exp, field) for exp in batch])

        states = torch.from_numpy(stacked("state")).float().to(self.device)
        actions = torch.from_numpy(stacked("action")).long().to(self.device)
        rewards = torch.from_numpy(stacked("reward")).float().to(self.device)
        next_states = torch.from_numpy(stacked("next_state")).float().to(self.device)
        # Booleans are converted to 0/1 first so they can be used arithmetically.
        done_flags = stacked("done").astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)


### Download Necessary Packages

In [None]:
# Install the notebook's dependencies with pip, using the same interpreter that
# runs this kernel (sys.executable). The lines starting with `!` are IPython
# shell escapes and only work inside a notebook, not in a plain .py script.
# NOTE(review): the box2d extra is presumably what the LunarLander environment
# used later needs - confirm against the Gymnasium installation notes.
import sys
!{sys.executable} -m pip install gymnasium
!{sys.executable} -m pip install optuna
!{sys.executable} -m pip install gymnasium[box2d]

### Create DDQN Agent Class

In [None]:
class DDQNAgent:
    """Double DQN agent with an online network, a soft-updated target network and a replay buffer."""

    def __init__(
        self,
        state_size,
        action_size,
        seed,
        loss_fn,
        buffer_size=10000,
        batch_size=64,
        fc1_units=64,
        fc2_units=64,
        fc3_units=0,
        fc4_units=0,
        gamma=0.99,
        tau=1e-3,
        lr=5e-4,
        update_every=4,
        # eps_decay is consumed by the training loop rather than by the agent itself;
        # it is stored here so the hyperparameter search can vary it per agent.
        eps_decay=0.995,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    ):
        """Build the networks, optimizer and replay buffer.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of discrete actions
            seed (int): seed forwarded to both networks and the replay buffer
            loss_fn (str): "mse" selects MSE loss; any other value selects Huber loss
            buffer_size (int): maximum number of stored experiences
            batch_size (int): number of experiences per learning step
            fc1_units .. fc4_units (int): hidden layer widths (0 disables fc3 / fc4)
            gamma (float): discount factor
            tau (float): interpolation factor for the soft target update
            lr (float): Adam learning rate
            update_every (int): learn once every this many calls to ``step``
            eps_decay (float): multiplicative epsilon decay read by the training loop
            device (torch.device): device used for networks and sampled batches
        """
        self.num_exploitative_actions = 0
        self.loss_fn = loss_fn
        self.eps_decay = eps_decay
        self.num_exploratory_actions = 0
        self.loss = 0.0
        self.device = device
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.steps_done = 0
        self.online_network = QNetwork(
            state_size, action_size, self.seed, fc1_units, fc2_units, fc3_units=fc3_units, fc4_units=fc4_units
        ).to(self.device)
        self.target_network = QNetwork(
            state_size, action_size, self.seed, fc1_units, fc2_units, fc3_units=fc3_units, fc4_units=fc4_units
        ).to(self.device)
        self.optimizer = torch.optim.Adam(self.online_network.parameters(), lr=self.lr)
        # Forward the agent's device so sampled batches land on the same device
        # as the networks even when a non-default device is requested.
        self.memory = ReplayBuffer(
            action_size, buffer_size, batch_size, self.seed, device=self.device
        )
        self.batch_size = batch_size
        self.update_counter = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``update_every`` calls, run a learning step."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `update_every` time steps.
        self.update_counter = (self.update_counter + 1) % self.update_every
        if self.update_counter == 0:
            # Only learn once the buffer holds more than one batch worth of samples.
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.0):
        """Return an action for ``state`` using epsilon-greedy selection.

        Params
        ======
            state (np.ndarray): current state
            eps (float): probability-like threshold for taking a random action

        The per-agent counters ``num_exploitative_actions`` (greedy choice) and
        ``num_exploratory_actions`` (random choice) are updated as a side effect.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.online_network.eval()
        with torch.no_grad():
            action_values = self.online_network(state)
        self.online_network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            # Greedy w.r.t. the current Q estimates: this is exploitation.
            self.num_exploitative_actions += 1
            return np.argmax(action_values.cpu().data.numpy())
        else:
            # Uniformly random action: this is exploration.
            self.num_exploratory_actions += 1
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Run one Double DQN gradient step followed by a soft target update.

        Params
        ======
            experiences (Tuple[torch.Tensor]): (states, actions, rewards, next_states, dones)
            gamma (float): discount factor

        The scalar loss of this step is added to ``self.loss``.
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: the online network picks the next action, the target
        # network evaluates it. No gradients are needed for the target.
        with torch.no_grad():
            best_actions = self.online_network(next_states).argmax(dim=1, keepdim=True)
            # gather works for whatever batch size was actually sampled.
            Q_targets_next = self.target_network(next_states).gather(1, best_actions)

        # Terminal transitions (dones == 1) contribute the reward only.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Q-values the online network currently assigns to the actions taken.
        Q_expected = self.online_network(states).gather(1, actions)

        # Only two options: anything other than "mse" means Huber loss.
        loss = (
            F.mse_loss(Q_expected, Q_targets)
            if self.loss_fn == "mse"
            else F.huber_loss(Q_expected, Q_targets)
        )
        self.loss += loss.item()

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update_target_network()

    def update_target_network(self):
        """Soft-update the target network: target <- tau * online + (1 - tau) * target."""
        for target_param, online_param in zip(
            self.target_network.parameters(), self.online_network.parameters()
        ):
            target_param.data.copy_(
                self.tau * online_param.data + (1.0 - self.tau) * target_param.data
            )


### Create Utility Functions to Train Agent

In [None]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from google.colab import drive
import cv2
import matplotlib.pyplot as plt


# Mount Google Drive at /content/drive; the default save locations used by the
# functions below ('/content/drive/My Drive/RLCW') live under this mount point.
drive.mount('/content/drive')

def _save_agent_checkpoint(agent, agent_type, save_location, i_episode):
    """Write the agent's target and online network weights, creating the folder if needed."""
    import os

    model_dir = f'{save_location}/{agent_type}/models'
    os.makedirs(model_dir, exist_ok=True)
    torch.save(agent.target_network.state_dict(), f'{model_dir}/{agent_type}_target_network_{i_episode}.pth')
    torch.save(agent.online_network.state_dict(), f'{model_dir}/{agent_type}_online_network_{i_episode}.pth')


def train_agent(
    agent,
    env,
    agent_type,
    n_episodes=1000,
    max_t=1000,
    eps_start=1.0,
    eps_end=0.01,
    save_agent=False,
    save_location='/content/drive/My Drive/RLCW',
    trial=None
):
    """Train ``agent`` on ``env`` with epsilon-greedy exploration.

    Params
    ======
        agent: object exposing ``act``, ``step``, ``loss``, ``eps_decay``,
            ``num_exploitative_actions`` and ``num_exploratory_actions``
        env: environment whose ``reset`` returns ``(state, info)`` and whose
            ``step`` returns ``(next_state, reward, terminated, truncated, info)``
        agent_type (str): label used in checkpoint paths
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon
        eps_end (float): minimum value of epsilon
        save_agent (bool): save network weights every 100 episodes and when solved
        save_location (str): root folder for checkpoints
        trial: optional Optuna trial; when given, each episode score is
            reported and ``optuna.TrialPruned`` is raised if pruning is requested

    Epsilon is multiplied by ``agent.eps_decay`` after every episode.
    Training stops early once the mean score of the last 100 episodes is at
    least 200; the mean is only checked once 100 episodes have been played.

    Returns
    =======
        dict with per-episode lists: "scores", "episode_lengths", "losses",
        "exploitative_actions", "exploratory_actions", plus "eps_change"
        (the epsilon values, starting with ``eps_start``).
    """
    solved_score = 200.0

    scores = []
    episode_lengths = []
    losses = []
    exploitative_actions = []
    exploratory_actions = []

    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    eps_change = [eps]

    for i_episode in range(1, n_episodes + 1):
        state, _ = env.reset()
        score = 0
        episode_length = 0
        for _ in range(max_t):
            # Increment the episode length counter
            episode_length += 1

            action = agent.act(state, eps)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break

        if trial is not None:
            # Report the score to Optuna
            trial.report(score, step=i_episode)
            # Check if the trial should be pruned
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Collect the per-episode totals accumulated by the agent, then reset them.
        losses.append(agent.loss)
        agent.loss = 0

        exploitative_actions.append(agent.num_exploitative_actions)
        agent.num_exploitative_actions = 0

        exploratory_actions.append(agent.num_exploratory_actions)
        agent.num_exploratory_actions = 0

        episode_lengths.append(episode_length)

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        window_mean = np.mean(scores_window)

        eps = max(eps_end, agent.eps_decay * eps)  # decrease epsilon
        eps_change.append(eps)

        progress = "\rEpisode {}\tAverage Score: {:.2f}".format(i_episode, window_mean)
        # Overwrite the same console line on every episode ...
        print(progress, end="")
        if i_episode % 100 == 0:
            if save_agent:
                _save_agent_checkpoint(agent, agent_type, save_location, i_episode)
            # ... and keep a permanent line every 100 episodes.
            print(progress)

        # A partially filled window is not a 100-episode average, so a single
        # lucky early episode must not count as solving the environment.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and window_mean >= solved_score:
            print(
                "\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}".format(
                    i_episode, window_mean
                )
            )
            if save_agent:
                _save_agent_checkpoint(agent, agent_type, save_location, i_episode)
            break

    return {
        "scores": scores,
        "episode_lengths": episode_lengths,
        "losses": losses,
        "exploitative_actions": exploitative_actions,
        "exploratory_actions": exploratory_actions,
        "eps_change": eps_change,
    }
    
def get_optimal_hyperparamters(env, n_trials=30, n_episodes=1000, seed=42, save_location='/content/drive/My Drive/RLCW', agent_type='ddqn'):
    """Search DDQN hyperparameters with Optuna and save diagnostic plots.

    Params
    ======
        env: environment to train on; its observation and action spaces
            determine the network input and output sizes
        n_trials (int): number of Optuna trials
        n_episodes (int): maximum training episodes per trial
        seed (int): seed for the sampler and for every agent created
        save_location (str): root folder for the saved plots
        agent_type (str): label used in the study name and file paths

    Returns
    =======
        dict: the best trial's parameters (``study.best_params``)
    """
    import os

    # Read the problem dimensions from the environment instead of assuming
    # LunarLander's 8 observations / 4 actions.
    state_size = int(env.observation_space.shape[0])
    action_size = int(env.action_space.n)

    def objective(trial):
        """Train one agent with sampled hyperparameters; return its mean episode score."""
        # Sample hyperparameter values
        batch_size = trial.suggest_int("batch_size", 32, 256)
        fc1_units = trial.suggest_int("fc1_units", 16, 128)
        fc2_units = trial.suggest_int("fc2_units", 16, 128)

        optional_layer_1 = trial.suggest_categorical("include_optional_layer_1", [True, False])
        optional_layer_2 = trial.suggest_categorical("include_optional_layer_2", [True, False])

        # A width of 0 tells the network to skip the optional layer.
        fc3_units = 0
        if optional_layer_1:
            fc3_units = trial.suggest_int("fc3_units", 16, 128)

        fc4_units = 0
        if optional_layer_2:
            fc4_units = trial.suggest_int("fc4_units", 16, 128)

        gamma = trial.suggest_float("gamma", 0.9, 1.0)
        lr = trial.suggest_float("lr", 1e-4, 1e-2)
        loss_fn = trial.suggest_categorical("loss_fn", ["mse", "huber"])

        # Create and train DDQN agent
        agent = DDQNAgent(
            state_size=state_size,
            action_size=action_size,
            seed=seed,
            batch_size=batch_size,
            fc1_units=fc1_units,
            fc2_units=fc2_units,
            fc3_units=fc3_units,
            fc4_units=fc4_units,
            gamma=gamma,
            lr=lr,
            loss_fn=loss_fn,
        )
        metrics = train_agent(agent, env, n_episodes=n_episodes, agent_type=agent_type, trial=trial)
        # Return average reward over all episodes
        return np.mean(metrics["scores"])

    study = optuna.create_study(
        study_name=f"{agent_type}_study",
        direction="maximize",
        sampler=TPESampler(seed=seed),
        pruner=MedianPruner(n_warmup_steps=140, interval_steps=20, n_startup_trials=5),
        storage=None,
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    graph_dir = f"{save_location}/{agent_type}/graphs/hyperparameters"
    plots = (
        (optuna.visualization.matplotlib.plot_optimization_history, "optimization_history"),
        (optuna.visualization.matplotlib.plot_slice, "plot_slice"),
        (optuna.visualization.matplotlib.plot_param_importances, "paramter_importance"),
    )
    # Comment out the line below if you want to run and not save to drive
    os.makedirs(graph_dir, exist_ok=True)
    for draw_plot, file_suffix in plots:
        draw_plot(study)
        # Comment out the line below if you want to run and not save to drive
        plt.savefig(f"{graph_dir}/{agent_type}_{file_suffix}.png", bbox_inches="tight")
        plt.show()
        # Clear the figure so the next plot is not drawn on top of this one.
        plt.clf()

    return study.best_params

### Hyper Parameter Tuning

 Here we attempt to find the optimal hyperparameters with which to train the agent.

In [None]:
import gymnasium as gym

# Run the hyperparameter search: 35 Optuna trials of at most 500 training
# episodes each on LunarLander-v2, then print the best parameter set found.
n_trials = 35
n_episodes = 500
agent_type = "ddqn"
env = gym.make("LunarLander-v2")

opt_params = get_optimal_hyperparamters(
        env, n_trials=n_trials, n_episodes=n_episodes, agent_type=agent_type
)

print("Optimal Parameters: ", opt_params)

### Train Agent With Optimal Hyperparameters

In [None]:
import gymnasium as gym
# Agent with four 64-unit hidden layers and MSE loss.
# NOTE(review): opt_agent is constructed here but is not the agent passed to
# train_agent at the bottom of this cell (opt_agent2 is).
opt_agent = DDQNAgent(
                state_size=8,
                action_size=4,
                seed=42,
                fc1_units=64,
                fc2_units=64,
                fc3_units=64,
                fc4_units=64,
                # Alternative wiring that reads the values from the opt_params dict
                # produced by the tuning cell instead of hard-coding them:
                #fc3_units= 0 if not opt_params['include_optional_layer_1'] else opt_params['fc3_units'],
                #fc4_units= 0 if not opt_params['include_optional_layer_2'] else opt_params['fc4_units'],
                #gamma=opt_params['gamma'],
                #batch_size=opt_params["batch_size"],
                #lr=opt_params["lr"],
                #loss_fn=opt_params["loss_fn"],
                loss_fn="mse"
            )

# Agent configured with the hard-coded parameter set recorded in the dict
# comment below (two hidden layers, Huber loss).
opt_agent2 = DDQNAgent(
                state_size=8,
                action_size=4,
                seed=42,
                fc1_units=89,
                fc2_units=55,
                # Both optional layers are left at their default (disabled):
                #fc3_units=64,
                #fc4_units=64,
                #fc3_units= 0 if not opt_params['include_optional_layer_1'] else opt_params['fc3_units'],
                #fc4_units= 0 if not opt_params['include_optional_layer_2'] else opt_params['fc4_units'],
                gamma=0.9992470815704736,
                batch_size=98,
                lr=0.004124018930586374,
                loss_fn="huber",
            )
env = gym.make("LunarLander-v2")
# Parameter set the values above were copied from:
# {'batch_size': 98, 'fc1_units': 89, 'fc2_units': 55, 'include_optional_layer_1': False, 'include_optional_layer_2': False, 'gamma': 0.9992470815704736, 'lr': 0.004124018930586374, 'loss_fn': 'huber'}

# Set save_agent=True to write network checkpoints to your drive; with
# save_agent=False (as here) nothing is saved.
metrics = train_agent(env=env, agent=opt_agent2, agent_type="ddqn", save_agent=False, n_episodes=500)


### Create Metric Plots 

Here we create and save plots that track metrics during training of the optimal agent for analysis purposes

In [None]:
def save_metric_plots_to_drive(metrics, location='/content/drive/My Drive/RLCW', agent_type='ddqn'):
    """Save one line plot per training metric.

    Params
    ======
        metrics (dict): maps a metric name to its sequence of values, as
            returned by the training loop
        location (str): root folder for the saved images
        agent_type (str): label used in the folder and file names

    The "scores" metric is smoothed by averaging consecutive chunks of 100
    episodes; each average is plotted at the last episode of its chunk so the
    x-axis stays in episodes. Every other metric is plotted as-is.
    """
    import os

    chunk = 100
    out_dir = f"{location}/{agent_type}/graphs/metrics"
    # savefig does not create missing folders.
    os.makedirs(out_dir, exist_ok=True)

    for key, metric in metrics.items():
        key_title = key.replace("_", " ").title()

        # clear plot so others don't get saved in same img
        plt.clf()
        if key == "scores":
            starts = range(0, len(metric), chunk)
            chunk_means = [np.mean(metric[start : start + chunk]) for start in starts]
            # The final chunk may be shorter than `chunk` episodes.
            chunk_ends = [min(start + chunk, len(metric)) for start in starts]
            plt.plot(chunk_ends, chunk_means)
        else:
            plt.plot(metric)
        plt.xlabel("Episodes")
        plt.ylabel(key_title)
        plt.title(key_title + " over time")
        plt.savefig(f"{out_dir}/{agent_type}_{key}", bbox_inches="tight")
        # clear plot so others don't get saved in same img
        plt.clf()

# Only run this if you want to save the metric plots to your drive; it uses the
# function's default location and agent type.
save_metric_plots_to_drive(metrics)

### Google Colab cannot render the environment, so to reproduce the videos created from the saved optimal agent, run the file agent_videos.py