<a href="https://colab.research.google.com/github/popolome/RL-Practice-with-Gym-Taxi-v3-/blob/main/RL_Practice_with_Gym(Taxi_v3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install gymnasium



In [21]:
import gymnasium as gym
import numpy as np

# 1 - Create the environment

In [22]:
env = gym.make('Taxi-v3', render_mode='ansi')   # This ANSI is for text rendering
num_states = env.observation_space.n    # This is 500 states
num_actions = env.action_space.n    # This is 6 actions

# 2 - Initialize the Q-table

In [23]:
Q = np.zeros((num_states, num_actions), dtype = np.float32) # To define the Q matrix of zeros

# 3 - Define the hyperparameters

In [24]:
alpha = 0.1   # This is the learning rate
gamma = 0.9  # This is the
epsilon = 1.0   # This is the exploration probability
min_epsilon = 0.01
epsilon_decay = 0.995
num_episodes = 5000
max_steps = 100   # This is the optional truncation per episode

# 4 - Training Loop

In [25]:
for episode in range(num_episodes):
  state, _ = env.reset()
  done = False
  step = 0

  while not done and step < max_steps:    # Epsilon-greedy action selection
    if np.random.rand() < epsilon:
      action = env.action_space.sample()    # Exploration
    else:
      action = np.argmax(Q[state])    # Exploitation

    next_state, reward, terminated, truncated, info = env.step(action)   # Take a step/action
    done = terminated or truncated

    # Q-learning update
    Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

    state = next_state    # Update current state to next_state and repeat
    step += 1     # Increment step by 1 ensuring less than 100(max_steps)

epsilon = max(min_epsilon, epsilon * epsilon_decay)     # Decay the epsilon every step

if (episode + 1) % 500 == 0:
  print(f"Episode {episode + 1}/{num_episodes} complete")   # Print the progress

Episode 5000/5000 complete


# 5 - Test the learned policy

In [26]:
state, _ = env.reset()
done = False
total_reward = 0

print("\n--- Testing the learned policy ---\n")

while not done:
  action = np.argmax(Q[state])      # Choose the best action at state
  next_state, reward, terminated, truncated, info = env.step(action)
  done = terminated or truncated
  state = next_state
  total_reward += reward      # Updates total_reward with the immediate reward
  print(env.render())     # Print the text map of Taxi-v3

print(f"\nTotal Reward with learned policy: {total_reward}")


--- Testing the learned policy ---

+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)

+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)

+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)

+---------+
|R: | : :G|
| : 