## **Preparation of Baseline and RL Models and Hyperparameter Tuning**

### **Table of Contents**

1. Libraries
   
2. Tuning of the Taxi Env
   1. Grid Search
   2. Multithreading
   
3. PPO tuning - Grid Search
   
4. PPO training
   
5. A2C tuning - Grid Search

6. A2C training

7. DQN tuning - Grid Search

8. DQN training

### **Libraries**

In [1]:
# Algorithms
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DQN
import pickle
from stable_baselines3.common.vec_env import DummyVecEnv

# Model evaluation
from stable_baselines3.common.evaluation import evaluate_policy 
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback


# Hyperparameter tuning
from sklearn.model_selection import ParameterGrid
from concurrent.futures import ThreadPoolExecutor, as_completed

# Wrappers 
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
from gymnasium import RewardWrapper, ActionWrapper


# Graphs
import matplotlib.pyplot as plt
from stable_baselines3.common.callbacks import BaseCallback
from torch.utils.tensorboard import SummaryWriter

### **Taxi-v3 environment**

In [None]:
# Single (non-vectorized) Taxi-v3 environment created with rendering disabled.
env_baseline= gym.make('Taxi-v3', render_mode= None)
# Show the class of the object returned by gym.make.
print(type(env_baseline))

# Reset once to obtain the initial observation and the info dict.
obs, info = env_baseline.reset()
# Print the per-episode step limit recorded in the environment's spec.
print(env_baseline.spec.max_episode_steps)

Taxi-v3 environment with DummyVecEnv

In [2]:
def make_env():
    """Return a new Taxi-v3 environment wrapped in a 200-step TimeLimit."""
    return TimeLimit(gym.make("Taxi-v3"), max_episode_steps=200)

# Vectorized environment built from four references to the make_env factory.
env = DummyVecEnv([make_env] * 4)

Hyperparameter tuning definition - Grid Search:

In [3]:
# def param_tuning(model_class, env, params, total_timesteps=1000000, n_eval_episodes=10, verbose=0):

#     best_reward = -float('inf')
#     best_params = None
#     param_grid = ParameterGrid(params)
#     for param_comb in param_grid:
#         print(f"Testing with parameter combination: {param_comb}")
#         model = model_class("MlpPolicy", env, **param_comb, verbose=verbose)
#         model.learn(total_timesteps=total_timesteps)
#         mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=True)
#         if mean_reward > best_reward:
#             best_reward = mean_reward
#             best_params = param_comb

#     return best_params, best_reward

Hyperparameter tuning using multithreading:

In [3]:
def env_fn():
    """Create a fresh Taxi-v3 environment limited to 200 steps per episode."""
    return TimeLimit(gym.make("Taxi-v3"), max_episode_steps=200)

In [9]:
def evaluate_params(model_class, param_comb, total_timesteps, n_eval_episodes, verbose):
    """Train and evaluate one hyperparameter combination.

    Args:
        model_class: Algorithm class to instantiate; it is called as
            ``model_class("MlpPolicy", env, **param_comb, verbose=verbose)``.
        param_comb: Dict of keyword arguments forwarded to ``model_class``.
        total_timesteps: Number of timesteps passed to ``model.learn``.
        n_eval_episodes: Number of episodes passed to ``evaluate_policy``.
        verbose: Verbosity level forwarded to ``model_class``.

    Returns:
        A ``(mean_reward, param_comb)`` tuple, so the caller can match the
        score to the combination that produced it.
    """
    print(f"Testing with parameters: {param_comb}")
    env = env_fn()  # Creates a new environment instance for each thread
    try:
        model = model_class("MlpPolicy", env, **param_comb, verbose=verbose)
        model.learn(total_timesteps=total_timesteps)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=True)
    finally:
        # Release the environment even when training or evaluation fails;
        # a grid search creates one environment per combination.
        env.close()
    print(f"Mean reward: {mean_reward} with parameters: {param_comb}")
    return mean_reward, param_comb

In [3]:
def param_tuning_multithread(model_class, params, total_timesteps, n_eval_episodes, verbose, max_threads):
    """Grid-search hyperparameters, evaluating combinations in a thread pool.

    Every combination produced by ``ParameterGrid(params)`` is submitted to
    ``evaluate_params``. Work is spread over threads (``ThreadPoolExecutor``),
    not processes.

    Args:
        model_class: Algorithm class forwarded to ``evaluate_params``.
        params: Dict mapping each hyperparameter name to its candidate values.
        total_timesteps: Training timesteps per combination.
        n_eval_episodes: Evaluation episodes per combination.
        verbose: Verbosity level forwarded to ``evaluate_params``.
        max_threads: Maximum number of worker threads.

    Returns:
        A ``(best_params, best_reward)`` tuple. If every combination fails,
        this is ``(None, -inf)``.
    """
    param_grid = list(ParameterGrid(params))
    best_reward = -float('inf')
    best_params = None

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Remember which combination each future belongs to, so a failure
        # can be reported together with the parameters that caused it.
        submitted = {
            executor.submit(
                evaluate_params, model_class, param_comb, total_timesteps, n_eval_episodes, verbose
            ): param_comb
            for param_comb in param_grid
        }

        for future in as_completed(submitted):
            try:
                mean_reward, param_comb = future.result()
            except Exception as e:
                # Best effort: one failing combination must not abort the search.
                print(f"Erro ao processar parâmetros {submitted[future]}: {e}")
                continue

            if mean_reward > best_reward:
                best_reward = mean_reward
                best_params = param_comb

    print(f"Best parameters: {best_params}")
    print(f"Best reward: {best_reward}")
    return best_params, best_reward



**Reward Callback**

In [None]:
class RewardLoggingCallback(BaseCallback):
    """
    Callback that only adds episode rewards to TensorBoard,
    without overwriting the default Stable Baselines3 graphs.

    Rewards are accumulated separately for every environment found in
    ``self.locals["rewards"]``, so episodes from all environments of a
    vectorized env are logged, not only those of the first one.
    """
    def __init__(self, log_dir: str, verbose=0):
        super().__init__(verbose)
        self.writer = SummaryWriter(log_dir)
        # Running reward of the episode in progress, one entry per environment.
        # Starts empty and is sized on the first step, once the number of
        # environments is known.
        self.episode_rewards = np.zeros(0, dtype=np.float64)
        # Number of finished episodes logged so far; used as the x-axis step.
        self.episode_count = 0

    def _on_step(self) -> bool:
        step_rewards = np.asarray(self.locals["rewards"], dtype=np.float64).reshape(-1)
        step_dones = np.asarray(self.locals["dones"], dtype=bool).reshape(-1)

        # (Re)allocate the accumulators when the number of environments is
        # first seen or changes.
        if self.episode_rewards.shape != step_rewards.shape:
            self.episode_rewards = np.zeros_like(step_rewards)

        # Accumulate this step's reward for every environment.
        self.episode_rewards += step_rewards

        # For each environment whose episode just ended, log the accumulated
        # reward and reset that environment's accumulator.
        for env_idx in np.flatnonzero(step_dones):
            self.episode_count += 1
            self.writer.add_scalar(
                "reward/episode_reward",
                float(self.episode_rewards[env_idx]),
                self.episode_count,
            )
            self.episode_rewards[env_idx] = 0.0
        return True

    def _on_training_end(self) -> None:
        # Close the writer at the end of training.
        self.writer.close()


### **PPO - Proximal Policy Optimization**

#### **Hyperparameter Tuning - Grid Search**

In [5]:
# PPO search space: one list of candidate values per hyperparameter
# (3 * 3 * 2 * 3 * 2 = 108 combinations in total).
params_ppo_grid = dict(
    learning_rate=[1e-4, 1e-3, 1e-2],
    n_steps=[128, 256, 512],
    batch_size=[32, 64],
    ent_coef=[0.0, 0.01, 0.1],
    gamma=[0.99, 0.95],
)

In [None]:
# PPO grid search: 100k training timesteps and 10 evaluation episodes per
# combination, with at most 4 worker threads.
best_params, best_reward = param_tuning_multithread(
    PPO,
    params_ppo_grid,
    total_timesteps=100_000,
    n_eval_episodes=10,
    verbose=0,
    max_threads=4,
)

In [7]:
# Hyperparameters used for the final PPO training run.
# NOTE(review): presumably copied by hand from the grid-search output — confirm.
best_parameters_ppo = dict(
    batch_size=32,
    ent_coef=0.1,
    gamma=0.95,
    learning_rate=0.001,
    n_steps=512,
)




### **PPO Model Training**

In [12]:
# Directory passed both to the model (tensorboard_log) and to the reward callback.
log_dir = "./ppo_tensorboard/"
# PPO is trained on the single `env_baseline` environment with the selected hyperparameters.
# NOTE(review): the 4-env `env` built with DummyVecEnv is not used here — confirm this is intended.
model = PPO("MlpPolicy", env_baseline, **best_parameters_ppo, verbose=0, tensorboard_log=log_dir)
# Callback that writes the accumulated reward of each finished episode to TensorBoard.
reward_callback = RewardLoggingCallback(log_dir=log_dir)
model.learn(total_timesteps=40_000_000, callback=reward_callback)
# Save the trained model under the name "ppo_taxi".
model.save("ppo_taxi")

### **A2C - Advantage Actor-Critic**

#### **Hyperparameter Tuning - Grid Search**

In [12]:
# A2C search space: one list of candidate values per hyperparameter
# (3 * 2 * 2 * 3 * 2 * 2 = 144 combinations in total).
params_a2c_grid = dict(
    learning_rate=[1e-4, 1e-3, 1e-2],
    n_steps=[20, 50],
    gamma=[0.99, 0.95],
    ent_coef=[0.0, 0.01, 0.1],
    vf_coef=[0.5, 0.75],
    max_grad_norm=[1.0, 2.0],
)

In [None]:
# A2C grid search: 100k training timesteps and 10 evaluation episodes per
# combination, with at most 4 worker threads.
best_params_a2c, best_reward_a2c = param_tuning_multithread(
    A2C,
    params_a2c_grid,
    total_timesteps=100_000,
    n_eval_episodes=10,
    verbose=0,
    max_threads=4,
)

In [23]:
# Hyperparameters used for the final A2C training run.
# NOTE(review): presumably copied by hand from the grid-search output — confirm.
best_parameters_a2c = dict(
    ent_coef=0.0,
    gamma=0.99,
    learning_rate=0.0001,
    max_grad_norm=1.0,
    n_steps=50,
    vf_coef=0.75,
)

### **A2C Training**

In [None]:
# Directory passed both to the model (tensorboard_log) and to the reward callback.
log_dir = "./a2c_tensorboard/"
# A2C is trained on the single `env_baseline` environment with the selected hyperparameters.
# This rebinds the module-level `model` name used by the other training cells.
model = A2C("MlpPolicy", env_baseline, **best_parameters_a2c, verbose=0, tensorboard_log=log_dir)
# Callback that writes the accumulated reward of each finished episode to TensorBoard.
reward_callback = RewardLoggingCallback(log_dir=log_dir)
model.learn(total_timesteps=40_000_000, callback=reward_callback)
# Save the trained model under the name "a2c_taxi".
model.save("a2c_taxi")

### **DQN - Deep Q-Network**

#### **Hyperparameter Tuning - Grid Search**

In [None]:
# DQN search space: one list of candidate values per hyperparameter
# (3 * 2 * 2 * 2 * 2 = 48 combinations in total).
params_dqn_grid = dict(
    learning_rate=[1e-4, 1e-3, 1e-2],   # optimizer learning rate
    buffer_size=[50000, 100000],        # replay buffer size
    batch_size=[32, 64],                # mini-batch size for training
    gamma=[0.95, 0.99],                 # discount factor for future rewards
    exploration_final_eps=[0.01, 0.1],  # final epsilon value for exploration
)

In [None]:
# DQN grid search: 100k training timesteps and 10 evaluation episodes per
# combination, with at most 4 worker threads.
best_params_dqn, best_reward_dqn = param_tuning_multithread(
    DQN,
    params_dqn_grid,
    total_timesteps=100_000,
    n_eval_episodes=10,
    verbose=0,
    max_threads=4,
)

In [47]:
# Hyperparameters used for the final DQN training run.
# NOTE(review): presumably copied by hand from the grid-search output — confirm.
best_parameters_dqn = dict(
    batch_size=32,
    buffer_size=100000,
    exploration_final_eps=0.1,
    gamma=0.95,
    learning_rate=0.001,
)

### **DQN Training**

In [49]:
# Dedicated Taxi-v3 environment for DQN, wrapped in a 200-step TimeLimit.
env_dqn = TimeLimit(gym.make("Taxi-v3"), max_episode_steps=200)

In [None]:
# Directory passed both to the model (tensorboard_log) and to the reward callback.
log_dir = "./dqn_tensorboard/"
# DQN is trained on its own `env_dqn` environment with the selected hyperparameters.
# This rebinds the module-level `model` name used by the other training cells.
model = DQN("MlpPolicy", env_dqn, **best_parameters_dqn, verbose=0, tensorboard_log=log_dir)
# Callback that writes the accumulated reward of each finished episode to TensorBoard.
reward_callback = RewardLoggingCallback(log_dir=log_dir)
model.learn(total_timesteps=40_000_000, callback=reward_callback)
# Save the trained model under the name "dqn_taxi".
model.save("dqn_taxi")