## **Changes in environment**



### **Table of Contents**

1. Libraries

2. Modify Actions
   1. Evaluate with PPO
   2. Evaluate with A2C
   3. Evaluate with DQN
   
3. Modify Rewards 
   1. Evaluate with PPO
   2. Evaluate with A2C
   3. Evaluate with DQN
   
4. Combining both changes 
   1. Evaluate with PPO
   2. Evaluate with A2C
   3. Evaluate with DQN

### **Libraries**

In [74]:
# Algorithms
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3 import DQN
import pickle
from stable_baselines3.common.vec_env import DummyVecEnv

# Model evaluation
from stable_baselines3.common.evaluation import evaluate_policy 
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback


# Hyperparameter tuning
from sklearn.model_selection import ParameterGrid
from concurrent.futures import ThreadPoolExecutor, as_completed

# Wrappers 
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
from gymnasium import RewardWrapper, ActionWrapper


# Graphs
import matplotlib.pyplot as plt


### **Modify Action Space**


The first modification to the environment adds four diagonal moves to the action space. This could allow the taxi to reach its targets faster, since each diagonal action replaces two normal actions.

In [75]:
class CustomActionWrapper(ActionWrapper):
    """Extend Taxi-v3 with four diagonal moves (actions 6-9).

    Actions 0-5 are forwarded unchanged to the wrapped environment.
    Actions 6-9 move the taxi one cell diagonally (clamped at the grid
    border) by rewriting the base environment's encoded state directly:

        6: South-East, 7: South-West, 8: North-East, 9: North-West

    A diagonal move stands for two orthogonal moves, so it is allowed only
    if at least one ordering of those two moves does not cross a wall.
    Every diagonal action costs the regular step reward of -1 and never
    ends the episode (Taxi-v3 only terminates on a successful drop-off).
    """

    # action id -> (row delta, column delta)
    _DIAGONAL_MOVES = {
        6: (1, 1),    # South-East
        7: (1, -1),   # South-West
        8: (-1, 1),   # North-East
        9: (-1, -1),  # North-West
    }

    def __init__(self, env):
        super().__init__(env)  # the base Wrapper already stores `env` as self.env
        # Add the 4 diagonal actions to the action space
        self.action_space = gym.spaces.Discrete(env.action_space.n + 4)

    def action(self, action):
        """Transform the custom action into a base action if necessary.

        Diagonal actions have no base equivalent and are handled entirely in
        `step`, so `None` is returned for them; base actions pass through.
        """
        if action in [6, 7, 8, 9]:
            # Diagonal actions are handled in the `step` method.
            return None  # Indicate custom handling
        return action  # Pass through for base actions

    def _crossing_is_open(self, row, col, new_col):
        """Return True if the taxi may go from `col` to `new_col` within `row`.

        In the Taxi map (`desc`) grid cell (r, c) is drawn at
        desc[1 + r, 2 * c + 1]; the character between two neighbouring
        columns is b":" when passable and b"|" when it is a wall.
        """
        if new_col == col:
            return True  # no horizontal component, nothing to cross
        boundary = 2 * max(col, new_col)
        return self.env.unwrapped.desc[1 + row, boundary] == b":"

    def step(self, action):
        """Apply `action`, handling diagonal moves locally.

        Returns the usual `(obs, reward, terminated, truncated, info)` tuple.
        For diagonal actions `info` is an empty dict.
        """
        move = self._DIAGONAL_MOVES.get(int(action))
        if move is None:
            # Pass through base actions to the original step method
            return super().step(action)

        grid_size = 5
        taxi_env = self.env.unwrapped

        # Get the taxi position from the encoded state
        taxi_row, taxi_col, pass_idx, dest_idx = taxi_env.decode(taxi_env.s)

        row_delta, col_delta = move
        new_row = min(max(taxi_row + row_delta, 0), grid_size - 1)
        new_col = min(max(taxi_col + col_delta, 0), grid_size - 1)

        # Walls only separate columns, so the vertical half of the move is
        # always possible. The horizontal half may be taken either before
        # (in the current row) or after (in the new row) the vertical half.
        move_allowed = (
            self._crossing_is_open(taxi_row, taxi_col, new_col)
            or self._crossing_is_open(new_row, taxi_col, new_col)
        )
        if move_allowed:
            # Update the state manually for diagonal actions
            taxi_env.s = taxi_env.encode(new_row, new_col, pass_idx, dest_idx)
        # Otherwise the move is invalid and the state is left unchanged.

        obs = taxi_env.s
        reward = -1          # same per-step cost as the base environment
        done = False         # moving can never complete the task
        truncated = False    # time limits are applied by an outer TimeLimit

        return obs, reward, done, truncated, {}


Now let's create the environment and modify the Action Space:


In [3]:
# Taxi-v3 with the diagonal-action wrapper applied, capped at 200 steps per episode
env_custom_action = TimeLimit(
    CustomActionWrapper(gym.make('Taxi-v3')),
    max_episode_steps=200,
)
# Vectorised (single-environment) wrapper used for training
env_custom_action = DummyVecEnv([lambda: env_custom_action])

Callback that gives access to the episode rewards in TensorBoard:

In [82]:

from torch.utils.tensorboard import SummaryWriter

class Rewards(BaseCallback):
    """
    Callback that only adds episode rewards to TensorBoard,
    without overwriting the default Stable Baselines3 plots.

    Only the first environment (index 0) of the vectorised env is tracked.
    """

    def __init__(self, log_dir: str, verbose=0):
        super().__init__(verbose)
        self.writer = SummaryWriter(log_dir)
        self.episode_rewards = 0  # running return of the current episode
        self.episode_count = 0    # number of finished episodes (x-axis of the plot)

    def _on_step(self) -> bool:
        # Accumulate the reward of the current step
        self.episode_rewards += self.locals["rewards"][0]

        episode_finished = self.locals["dones"][0]
        if not episode_finished:
            return True

        # Episode ended: log the accumulated reward and start over
        self.episode_count += 1
        self.writer.add_scalar(
            "reward/episode_reward",
            self.episode_rewards,
            self.episode_count,
        )
        self.episode_rewards = 0
        return True

    def _on_training_end(self) -> None:
        # Close the writer once training is over
        self.writer.close()

## PPO



Now let's train PPO on this environment, using the best hyperparameters found during tuning (see `algorithms_baseline_env.ipynb`).


In [5]:
# Best PPO hyperparameters (keyword arguments forwarded to the PPO constructor)
best_parameters_ppo = {
    'batch_size': 32,
    'ent_coef': 0.1,
    'gamma': 0.95,
    'learning_rate': 0.001,
    'n_steps': 512,
}

In [6]:
# Train PPO on the diagonal-action environment and save the resulting model
log_dir = "./ppo_custom_action_tensorboard/"
model = PPO(
    "MlpPolicy",
    env_custom_action,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_ppo,
)
reward_callback = Rewards(log_dir=log_dir)
model.learn(callback=reward_callback, total_timesteps=40_000_000)
model.save("ppo_taxi_custom_action")

## A2C

We do the same for this model.

In [7]:
# Best A2C hyperparameters (keyword arguments forwarded to the A2C constructor)
best_parameters_a2c = {
    'ent_coef': 0.0,
    'gamma': 0.99,
    'learning_rate': 0.0001,
    'max_grad_norm': 1.0,
    'n_steps': 50,
    'vf_coef': 0.75,
}

In [8]:
# Train A2C on the diagonal-action environment and save the resulting model
log_dir = "./a2c_custom_action_tensorboard/"
model = A2C(
    "MlpPolicy",
    env_custom_action,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_a2c,
)
reward_callback = Rewards(log_dir=log_dir)
model.learn(callback=reward_callback, total_timesteps=40_000_000)
model.save("a2c_taxi_custom_action")

## DQN

Again, testing with DQN.

In [9]:
# Fresh, non-vectorised Taxi-v3 with the diagonal-action wrapper,
# capped at 200 steps per episode
env_custom_action = TimeLimit(
    CustomActionWrapper(gym.make('Taxi-v3')),
    max_episode_steps=200,
)

In [10]:
# Best DQN hyperparameters (keyword arguments forwarded to the DQN constructor)
best_parameters_dqn = {
    'batch_size': 32,
    'buffer_size': 100000,
    'exploration_final_eps': 0.1,
    'gamma': 0.95,
    'learning_rate': 0.001,
}

In [11]:
# Train DQN on the diagonal-action environment and save the resulting model
log_dir = "./dqn_custom_action_tensorboard/"
model = DQN(
    "MlpPolicy",
    env_custom_action,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_dqn,
)
reward_callback = Rewards(log_dir=log_dir)
model.learn(callback=reward_callback, total_timesteps=40_000_000)
model.save("dqn_taxi_custom_action")

### **Modify Rewards**


This wrapper modifies the reward system to discourage the agent (taxi) from revisiting previously visited positions unless necessary. It promotes exploration and more efficient paths by applying penalties for redundant moves. Additionally, it clears the history of visited positions after completing tasks like pickups, allowing the agent to focus on subsequent goals without unnecessary penalties.


In [76]:
class CustomRewardWrapper(RewardWrapper):
    """Penalise the taxi for stepping onto cells it has already visited.

    The wrapper keeps a set of (row, col) positions seen so far. Whenever a
    step ends on a position already in the set, `penalty` is subtracted from
    the reward; otherwise the position is recorded. The history is cleared
    at every `reset()` and whenever the passenger is picked up or dropped
    off, so each leg of the trip starts with a clean slate.

    Args:
        env: Taxi-v3 environment (possibly wrapped) to modify.
        penalty: Amount subtracted from the reward on a revisit.
    """

    def __init__(self, env, penalty=1.0):
        super().__init__(env)  # the base Wrapper already stores `env` as self.env
        self.visited_positions = set()  # (row, col) cells visited in the current leg
        self.penalty = penalty

    def reset(self, *, seed=None, options=None):
        """Reset the environment and forget the positions of the last episode."""
        self.visited_positions.clear()
        return self.env.reset(seed=seed, options=options)

    def step(self, action):
        """Step the environment and apply the revisit penalty.

        Returns the usual `(obs, reward, terminated, truncated, info)` tuple
        with only `reward` possibly modified.
        """
        taxi_env = self.env.unwrapped

        # Passenger index before the step, used to detect pickup / drop-off
        _, _, pass_idx_before, _ = taxi_env.decode(taxi_env.s)

        obs, reward, done, truncated, info = self.env.step(action)

        # Taxi position and passenger index after the step
        taxi_row, taxi_col, pass_idx, dest_idx = taxi_env.decode(taxi_env.s)
        taxi_pos = (taxi_row, taxi_col)

        if taxi_pos in self.visited_positions:
            # Apply a penalty if the taxi revisits a position
            reward -= self.penalty
        else:
            # Mark the current position as visited
            self.visited_positions.add(taxi_pos)

        # The passenger index changes exactly when the passenger is picked up
        # or dropped off. Taxi-v3's `info` carries no 'pickup' flag, so the
        # state is compared instead. After such an event the next leg may
        # legitimately reuse earlier cells, so the history starts over.
        if pass_idx != pass_idx_before:
            self.visited_positions.clear()

        return obs, reward, done, truncated, info
    

In [41]:
# Taxi-v3 with the revisit-penalty wrapper applied, capped at 200 steps per episode
env_custom_reward = TimeLimit(
    CustomRewardWrapper(gym.make('Taxi-v3')),
    max_episode_steps=200,
)

# Vectorised (single-environment) wrapper used for training
env_custom_reward = DummyVecEnv([lambda: env_custom_reward])

### **PPO**

In [42]:
# Best PPO hyperparameters (keyword arguments forwarded to the PPO constructor)
best_parameters_ppo = {
    'batch_size': 32,
    'ent_coef': 0.1,
    'gamma': 0.95,
    'learning_rate': 0.001,
    'n_steps': 512,
}

In [43]:
# Train PPO on the revisit-penalty environment and save the resulting model
log_dir = "./ppo_custom_rewards_tensorboard/"

model = PPO(
    "MlpPolicy",
    env_custom_reward,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_ppo,
)

reward_callback = Rewards(log_dir=log_dir)

model.learn(callback=reward_callback, total_timesteps=40_000_000)

model.save("ppo_taxi_custom_rewards")

### **A2C**


In [48]:
# Best A2C hyperparameters (keyword arguments forwarded to the A2C constructor)
best_parameters_a2c = {
    'ent_coef': 0.0,
    'gamma': 0.99,
    'learning_rate': 0.0001,
    'max_grad_norm': 1.0,
    'n_steps': 50,
    'vf_coef': 0.75,
}

In [49]:
# Train A2C on the revisit-penalty environment and save the resulting model
log_dir = "./a2c_custom_rewards_tensorboard/"

model = A2C(
    "MlpPolicy",
    env_custom_reward,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_a2c,
)

reward_callback = Rewards(log_dir=log_dir)

model.learn(callback=reward_callback, total_timesteps=40_000_000)

model.save("a2c_taxi_custom_rewards")

### **DQN**

In [52]:
# Fresh, non-vectorised Taxi-v3 with the revisit-penalty wrapper,
# capped at 200 steps per episode
env_custom_reward = TimeLimit(
    CustomRewardWrapper(gym.make('Taxi-v3')),
    max_episode_steps=200,
)

In [53]:
# Best DQN hyperparameters (keyword arguments forwarded to the DQN constructor)
best_parameters_dqn = {
    'batch_size': 32,
    'buffer_size': 100000,
    'exploration_final_eps': 0.1,
    'gamma': 0.95,
    'learning_rate': 0.001,
}

In [54]:
# Train DQN on the revisit-penalty environment and save the resulting model
log_dir = "./dqn_custom_reward_tensorboard/"
model = DQN(
    "MlpPolicy",
    env_custom_reward,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_dqn,
)
reward_callback = Rewards(log_dir=log_dir)
model.learn(callback=reward_callback, total_timesteps=40_000_000)
model.save("dqn_taxi_custom_rewards")

### **Modify Reward plus Action**




Now we can try using the two modifications together.

### **PPO**

In [59]:
# Taxi-v3 with both wrappers (diagonal actions inside, revisit penalty outside),
# capped at 200 steps per episode
env_custom_combination = TimeLimit(
    CustomRewardWrapper(CustomActionWrapper(gym.make('Taxi-v3'))),
    max_episode_steps=200,
)
# Vectorised (single-environment) wrapper used for training
env_custom_combination = DummyVecEnv([lambda: env_custom_combination])


In [55]:
# Best PPO hyperparameters (keyword arguments forwarded to the PPO constructor)
best_parameters_ppo = {
    'batch_size': 32,
    'ent_coef': 0.1,
    'gamma': 0.95,
    'learning_rate': 0.001,
    'n_steps': 512,
}

In [60]:
# Train PPO on the combined (actions + rewards) environment and save the model
log_dir = "./ppo_custom_combination_tensorboard/"

model = PPO(
    "MlpPolicy",
    env_custom_combination,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_ppo,
)

reward_callback = Rewards(log_dir=log_dir)

model.learn(callback=reward_callback, total_timesteps=40_000_000)

model.save("ppo_taxi_custom_combination")

### **A2C**

In [61]:
# Fresh Taxi-v3 with both wrappers (diagonal actions inside, revisit penalty
# outside), capped at 200 steps per episode
env_custom_combination = TimeLimit(
    CustomRewardWrapper(CustomActionWrapper(gym.make('Taxi-v3'))),
    max_episode_steps=200,
)
# Vectorised (single-environment) wrapper used for training
env_custom_combination = DummyVecEnv([lambda: env_custom_combination])


In [62]:
# Best A2C hyperparameters (keyword arguments forwarded to the A2C constructor)
best_parameters_a2c = {
    'ent_coef': 0.0,
    'gamma': 0.99,
    'learning_rate': 0.0001,
    'max_grad_norm': 1.0,
    'n_steps': 50,
    'vf_coef': 0.75,
}

In [63]:
# Train A2C on the combined (actions + rewards) environment and save the model
log_dir = "./a2c_custom_combination_tensorboard/"

model = A2C(
    "MlpPolicy",
    env_custom_combination,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_a2c,
)

reward_callback = Rewards(log_dir=log_dir)

model.learn(callback=reward_callback, total_timesteps=40_000_000)

model.save("a2c_taxi_custom_combination")

### **DQN**

In [65]:
# Fresh, non-vectorised Taxi-v3 with both wrappers (diagonal actions inside,
# revisit penalty outside), capped at 200 steps per episode
env_custom_combination = TimeLimit(
    CustomRewardWrapper(CustomActionWrapper(gym.make('Taxi-v3'))),
    max_episode_steps=200,
)


In [66]:
# Best DQN hyperparameters (keyword arguments forwarded to the DQN constructor)
best_parameters_dqn = {
    'batch_size': 32,
    'buffer_size': 100000,
    'exploration_final_eps': 0.1,
    'gamma': 0.95,
    'learning_rate': 0.001,
}

In [67]:
# Train DQN on the combined (actions + rewards) environment and save the model
log_dir = "./dqn_custom_combination_tensorboard/"
model = DQN(
    "MlpPolicy",
    env_custom_combination,
    verbose=0,
    tensorboard_log=log_dir,
    **best_parameters_dqn,
)
reward_callback = Rewards(log_dir=log_dir)
model.learn(callback=reward_callback, total_timesteps=40_000_000)
model.save("dqn_taxi_custom_combination")