<a href="https://colab.research.google.com/github/rennyatwork/CegepSteFoy_ReinfLearn_Work/blob/main/TP01/taxi_policy_gradient_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [38]:
import gymnasium as gym
import numpy as np
import time
from collections import defaultdict

# Initialize the environment
env = gym.make('Taxi-v3', render_mode='rgb_array')

def decode_state(state):
    """Decode Taxi-v3 state into cell positions."""
    # Access the underlying TaxiEnv instance using 'unwrapped'
    taxi_row, taxi_col, pass_loc, dest_idx = env.unwrapped.decode(state)
    return f"Taxi at ({taxi_row}, {taxi_col}), Passenger at {pass_loc}, Destination at {env.unwrapped.locs[dest_idx]}"


def policy_gradient(env, num_episodes=2000, learning_rate=0.01, discount_factor=0.95, pTheta=np.random.randn(env.observation_space.n, env.action_space.n) / np.sqrt(env.observation_space.n), pPrint=False):
    """Implement Policy Gradient (REINFORCE) for Taxi-v3."""
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    theta = pTheta

    def softmax(x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def get_action(state, theta):
        return np.random.choice(num_actions, p=softmax(theta[state]))

    episode_times = []
    iter_episode = []

    initial_learning_rate = learning_rate
    decay_rate = 1000  # Adjust for decay speed
    beta = 0.01      # Entropy regularization strength (adjust as needed)

    for episode in range(num_episodes):
        start_time = time.time()
        state, _ = env.reset()
        episode_reward = 0
        states, actions, rewards = [], [], []
        penalties = 0
        iter_count = 0
        done = False

        # Learning rate decay
        learning_rate = initial_learning_rate / (1 + episode / decay_rate)

        while not done:
            action = get_action(state, theta)
            next_state, reward, terminated, truncated, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            episode_reward += reward
            iter_count += 1

            done = terminated or truncated

            if episode_reward == -10:
                penalties += 1
                if penalties > 2:
                    if pPrint:
                        print(f'Too many penalties: {penalties}. BYE!!!')
                    done = True

            if iter_count > 60:
                if pPrint:
                    print(f'Too many iterations: {iter_count}. BYE!!!')
                done = True

            state = next_state

        # Compute returns and apply baseline subtraction
        G = 0
        returns = []
        for r in rewards[::-1]:
            G = r + discount_factor * G
            returns.insert(0, G)
        baseline = np.mean(returns)
        returns = [G - baseline for G in returns]

        # Update theta with optional entropy regularization
        for s, a, G in zip(states, actions, returns):
            if G != 0:
                prob_dist = softmax(theta[s])
                theta[s] += learning_rate * (
                    (G - np.mean(returns)) * (np.eye(num_actions)[a] - prob_dist)
                    + beta * np.sum(prob_dist * np.log(prob_dist))  # Entropy term
                )

        elapsed_time = time.time() - start_time
        episode_times.append(elapsed_time)
        iter_episode.append(iter_count)

        # Print episode statistics (same as before)
        interval = int(num_episodes * 0.1)
        if (episode + 1) % interval == 0 and episode != 0 and episode != num_episodes - 1:
            avg_episode_time = np.mean(episode_times[-interval:])
            print(f"Episode {episode + 1}/{num_episodes} - "
                  f"State: {decode_state(state)} - " #Assuming you have defined `decode_state`
                  f"Final Action: {env.action_space.sample()} - "
                  f"Average Return: {np.mean(returns):.3f}"
                  f"Average Time per Episode: {avg_episode_time:.4f} seconds "
                  f'Avg iter/episode: {np.mean(iter_episode)}')

    overall_avg_time = np.mean(episode_times)

     # Print overall average time per episode
    overall_avg_time = np.mean(episode_times)
    print(f"\nOverall Average Time per Episode: {overall_avg_time:.3f} seconds")
    print(f"\nOverall Time per : {np.sum(episode_times):.3f} seconds")


    print("Training complete!")


def evaluate_policy(env, theta, num_episodes=100):
    """Evaluate the learned policy."""
    total_rewards = []
    for _ in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        while True:
            action = np.argmax(softmax(theta[state]))
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            if terminated or truncated:
                break
        total_rewards.append(episode_reward)

    return np.mean(total_rewards)

def softmax(x):
    e_x = np.exp(x - np.max(x))  # For numerical stability
    return e_x / e_x.sum()



In [39]:

learned_policy_01 = policy_gradient(env
                                 ,  num_episodes=2000
                                 , learning_rate=0.05
                                 , discount_factor=0.97
                                    , pPrint=False
                                 )


Episode 200/2000 - State: Taxi at (3, 0), Passenger at 1, Destination at (4, 0) - Final Action: 0 - Average Return: -0.000Average Time per Episode: 0.0082 seconds Avg iter/episode: 60.755
Episode 400/2000 - State: Taxi at (4, 0), Passenger at 1, Destination at (4, 3) - Final Action: 4 - Average Return: -0.000Average Time per Episode: 0.0097 seconds Avg iter/episode: 60.7625
Episode 600/2000 - State: Taxi at (4, 3), Passenger at 3, Destination at (0, 4) - Final Action: 0 - Average Return: -0.000Average Time per Episode: 0.0092 seconds Avg iter/episode: 60.84166666666667
Episode 800/2000 - State: Taxi at (2, 3), Passenger at 3, Destination at (0, 0) - Final Action: 3 - Average Return: -0.000Average Time per Episode: 0.0081 seconds Avg iter/episode: 60.88125
Episode 1000/2000 - State: Taxi at (1, 4), Passenger at 2, Destination at (0, 0) - Final Action: 0 - Average Return: 0.000Average Time per Episode: 0.0095 seconds Avg iter/episode: 60.868
Episode 1200/2000 - State: Taxi at (0, 2), Pas

In [24]:
learned_policy_02 = policy_gradient(env
                                 ,  num_episodes=10000
                                 , learning_rate=0.05
                                 , discount_factor=0.95
                                    , pTheta=learned_policy_02
                                    , pPrint=False
                                 )

  - State: Taxi at (2, 3), Passenger at 3, Destination at (0, 4)
  - Final Action: 2
  - Average Return: -18.10006660006586
  - State: Taxi at (4, 0), Passenger at 4, Destination at (0, 4)
  - Final Action: 3
  - Average Return: -18.10006660006586
  - State: Taxi at (4, 3), Passenger at 1, Destination at (0, 0)
  - Final Action: 0
  - Average Return: -18.10006660006586
  - State: Taxi at (4, 0), Passenger at 3, Destination at (0, 0)
  - Final Action: 2
  - Average Return: -18.10006660006586
  - State: Taxi at (4, 3), Passenger at 3, Destination at (0, 0)
  - Final Action: 5
  - Average Return: -18.10006660006586
  - State: Taxi at (1, 2), Passenger at 2, Destination at (4, 3)
  - Final Action: 3
  - Average Return: -18.14506660006586
  - State: Taxi at (3, 2), Passenger at 3, Destination at (0, 0)
  - Final Action: 4
  - Average Return: -18.10006660006586
  - State: Taxi at (3, 4), Passenger at 2, Destination at (4, 3)
  - Final Action: 4
  - Average Return: -18.10006660006586
  - Stat

KeyboardInterrupt: 

In [15]:
avg_reward = evaluate_policy(env, learned_policy_02)
print(f"Average reward over 10 episodes: {avg_reward}")

Average reward over 10 episodes: -229.54
