<a href="https://colab.research.google.com/github/rennyatwork/CegepSteFoy_ReinfLearn_Work/blob/main/TP01/taxi_qlearning_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
import time


def create_q_table(env):
  """Builds a zero-filled Q-table sized for the given environment.

  Args:
    env: An environment exposing `observation_space.n` (number of states)
      and `action_space.n` (number of actions).

  Returns:
    A NumPy array of zeros with one row per state and one column per action.
  """
  n_states = env.observation_space.n
  n_actions = env.action_space.n
  return np.zeros((n_states, n_actions))

def plot_reward_evolution(rewards
                          , pXLabel = 'Episode'
                          , pYLabel = 'Reward'
                          , pTitle='Reward Evolution during Training'):
  """Plots a sequence of rewards against its index and shows the figure.

  Args:
    rewards: A sequence of reward values; the x-axis is the position in
      this sequence (0 .. len(rewards) - 1).
    pXLabel: Label for the x-axis.
    pYLabel: Label for the y-axis.
    pTitle: Title of the figure.
  """
  plt.clf()  # Clear the current figure so successive calls do not overlay.
  plt.plot(rewards)
  plt.xlabel(pXLabel)
  plt.ylabel(pYLabel)
  plt.title(pTitle)

  # Use at most 10 evenly spaced integer ticks. Clamping to the number of
  # points avoids duplicated ticks for short series, and skipping the call
  # for an empty series avoids a bogus tick at -1.
  max_ticks = 10
  num_points = len(rewards)
  if num_points > 0:
    tick_positions = np.unique(
        np.linspace(0, num_points - 1, min(max_ticks, num_points), dtype=int))
    plt.xticks(tick_positions, tick_positions)  # Positions double as labels.

  plt.show()

def get_policy(q_table, state, epsilon):
  """Selects an action using the epsilon-greedy policy.

  With probability `epsilon` a uniformly random action is returned
  (exploration); otherwise the action with the highest Q-value for `state`
  is returned (exploitation).

  Args:
    q_table: The Q-table containing learned values, indexed as
      q_table[state][action].
    state: The current state of the environment (row index into q_table).
    epsilon: The exploration rate, in [0, 1].

  Returns:
    The index of the action selected by the policy.
  """
  if np.random.uniform(0, 1) < epsilon:
    # Explore: sample over the Q-table's own action dimension rather than a
    # module-level `env`, so the function only depends on its arguments.
    num_actions = len(q_table[state])
    action = np.random.randint(num_actions)
  else:
    action = np.argmax(q_table[state])  # Exploit learned values
  return action

def train_q_learning(env
                     , q_table
                     , alpha=0.1
                     , gamma=0.9
                     , epsilon=1.0
                     , episodes=100000
                     , pPrint=False
                     ):
  """Trains the Q-learning agent, updating `q_table` in place.

  Actions are chosen with `get_policy` (epsilon-greedy). Epsilon is
  multiplied by 0.9999 after every step and never drops below 0.05.
  A progress line is printed roughly every 10% of `episodes`.

  Args:
    env: The Gymnasium environment.
    q_table: The Q-table to update (modified in place).
    alpha: The learning rate.
    gamma: The discount factor.
    epsilon: The initial exploration rate.
    episodes: The number of training episodes.
    pPrint: If True, prints step and penalty counts after every episode.

  Returns:
    A list with the mean per-step reward of each training episode.
  """

  all_rewards_per_episode = []  # Mean per-step reward of each episode
  epsilon_decay = 0.9999  # Example decay rate
  min_epsilon = 0.05
  # Hoisted out of the loop; max(1, ...) avoids `i % 0` when episodes < 10.
  ten_pct_print = max(1, episodes // 10)
  start_time_train = time.time()

  for i in range(1, episodes + 1):
    state = env.reset()[0]
    epochs, penalties, episode_reward = 0, 0, 0
    done = False
    start_time_for = time.time()

    while not done:

      action = get_policy(q_table, state, epsilon)

      # Decay epsilon over time, bounded below by the minimum epsilon.
      epsilon = max(epsilon * epsilon_decay, min_epsilon)

      next_state, reward, terminated, truncated, info = env.step(action)
      # An episode ends either on a terminal state or on truncation
      # (e.g. a time limit); stepping past truncation is not meaningful.
      done = terminated or truncated

      old_value = q_table[state, action]
      next_max = np.max(q_table[next_state])

      new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
      q_table[state, action] = new_value

      if reward == -10:
        penalties += 1

      state = next_state
      epochs += 1
      episode_reward += reward

    # Wall-clock duration of this episode (end minus start, so non-negative).
    elapsed_time_for = time.time() - start_time_for

    if pPrint:
      print("****************************************************")
      print(f"Episode: {i}")
      print(f"Timesteps taken: {epochs}")
      print(f"Penalties incurred: {penalties}")

    # The inner loop always runs at least once, so epochs >= 1 here.
    all_rewards_per_episode.append(episode_reward / epochs)

    if i % ten_pct_print == 0:
      # Average over the most recent reporting window of episodes.
      avg_reward_window = np.mean(all_rewards_per_episode[-ten_pct_print:])
      print(f"Episode: {i}, "
      f"Avg reward (last {ten_pct_print} episodes): "
      f"{avg_reward_window}, "
      f"current epsilon: {epsilon}, "
      f"Elapsed time current for: {elapsed_time_for} seconds")

  end_time_train = time.time()
  elapsed_time_train = end_time_train - start_time_train
  print(f"Training time: {elapsed_time_train} seconds")
  print("Training finished.\n")

  return all_rewards_per_episode

def evaluate_agent(env, q_table, episodes=100, pPrint=False, max_steps=300):
  """Evaluates the trained agent by always taking the greedy action.

  Runs `episodes` episodes, then plots the per-episode reward-per-step and
  the per-episode total reward with `plot_reward_evolution`.

  Args:
    env: The Gymnasium environment.
    q_table: The trained Q-table.
    episodes: The number of evaluation episodes (must be >= 1).
    pPrint: If True, prints progress every 50 steps and each episode's
      total reward.
    max_steps: Safety cap on the number of steps per episode (must be >= 1),
      in case the greedy policy never finishes.

  Returns:
    The average total reward per episode over the evaluation episodes.

  Raises:
    ValueError: If `episodes` or `max_steps` is less than 1.
  """
  if episodes < 1:
    raise ValueError("episodes must be at least 1")
  if max_steps < 1:
    raise ValueError("max_steps must be at least 1")

  if pPrint:
      print('Evaluating agent...')

  arr_avg_rewards = []
  arr_total_rewards = []

  total_rewards = 0
  for ep in range(episodes):
    state = env.reset()[0]
    done = False
    episode_reward = 0
    steps_so_far = 0

    while not done and steps_so_far < max_steps:
      action = np.argmax(q_table[state])
      state, reward, terminated, truncated, info = env.step(action)
      # Stop on a terminal state or on truncation (e.g. a time limit).
      done = terminated or truncated
      steps_so_far += 1

      episode_reward += reward

      if (pPrint and (steps_so_far % 50) ==0):
        print(f'steps_so_far: {steps_so_far}')
        print(f'Reward/step in episode [{ep}]: {episode_reward / steps_so_far}')

    total_rewards += episode_reward

    if pPrint:
      print(f"Episode [{ep}] reward: {episode_reward}")

    # max_steps >= 1 guarantees at least one step, so no division by zero.
    arr_avg_rewards.append(episode_reward / steps_so_far)
    arr_total_rewards.append(episode_reward)

  print(f"Results after {episodes} episodes:")
  print(f'len(arr_avg_rewards): {len(arr_avg_rewards)} ')
  print(f'len(arr_total_rewards): {len(arr_total_rewards)} ')


  plot_reward_evolution(arr_avg_rewards
                        , pYLabel='Average Reward'
                        , pTitle='Average Reward evolution')
  plot_reward_evolution(arr_total_rewards
                        , pYLabel='Total Rewards'
                        , pTitle='Total Rewards evolution')

  return total_rewards / episodes

# Main execution: build the Taxi environment and an all-zero Q-table,
# run a short training pass, then evaluate the greedy policy.
env = gym.make("Taxi-v3")
q_table = create_q_table(env)

# Training updates q_table in place.
train_q_learning(env, q_table, epsilon=1.0, episodes=100)

# Evaluation prints per-episode details and plots the reward curves.
average_reward = evaluate_agent(env, q_table, episodes=200, pPrint=True)

print(f"Average reward: {average_reward}")

Episode: 10000, Average reward (last 10000 episodes): -0.9844249134782082, current epsilon: 0.15
Episode: 20000, Average reward (last 10000 episodes): -0.4936645309625605, current epsilon: 0.15
