In [1]:
!pip install gymnasium

import gymnasium as gym
import numpy as np
import random
import time



In [2]:
# Create the Taxi-v3 environment; with render_mode="ansi", env.render() is
# printed as text further down in the notebook.
env = gym.make("Taxi-v3", render_mode="ansi")

# Sizes of the discrete observation and action spaces.
state_space = env.observation_space.n  # 500 possible states
action_space = env.action_space.n      # 6 possible actions

# Q-table: one row per state, one column per action, all values start at zero.
q_table = np.zeros(shape=(state_space, action_space))

In [3]:
# --- Training budget ---
num_episodes = 5_000      # number of training episodes
max_steps = 100           # cap on the number of steps taken inside one episode

# --- Q-learning update ---
learning_rate = 0.7       # step size applied to the temporal-difference error
discount_factor = 0.618   # weight given to the best next-state value

# --- Epsilon-greedy exploration schedule ---
epsilon = 1.0             # initial probability of taking a random action
epsilon_decay = 0.999     # epsilon is multiplied by this after every episode
epsilon_min = 0.01        # epsilon is never decayed below this value

In [4]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# NOTE: this cell updates `q_table` and `epsilon` in place, so running it again
# continues training from their current values. Re-run the cells that define
# them first to train from scratch.
for episode in range(num_episodes):
    state, _ = env.reset()

    for _step in range(max_steps):
        # Choose action: explore with probability `epsilon`, otherwise exploit.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            # Convert the NumPy scalar to a plain int before handing it to the env.
            action = int(np.argmax(q_table[state]))  # Exploit

        # Take action and observe result. `terminated` means the task itself
        # ended; `truncated` means the episode was cut short (e.g. a time limit).
        next_state, reward, terminated, truncated, _info = env.step(action)

        # A terminal transition has no future return, so bootstrap from the
        # next state only when the episode has not terminated. A truncated
        # episode still bootstraps, because the task did not actually end.
        future_value = 0.0 if terminated else np.max(q_table[next_state])

        # Update Q-table using the Q-learning formula:
        #   Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        q_table[state, action] += learning_rate * (
            reward + discount_factor * future_value - q_table[state, action]
        )

        state = next_state

        if terminated or truncated:
            break

    # Reduce exploration over time, but never below `epsilon_min`.
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

In [5]:
# Demo: replay the greedy policy read from the learned Q-table, printing every
# frame and the action taken.
env = gym.make("Taxi-v3", render_mode="ansi")

NUM_DEMO_EPISODES = 3   # how many episodes to play back
MAX_DEMO_STEPS = 100    # per-episode step budget for the playback

# Human-readable labels for the six action indices.
action_names = {
    0: "Move South",
    1: "Move North",
    2: "Move East",
    3: "Move West",
    4: "Pickup Passenger",
    5: "Dropoff Passenger"
}

try:
    for ep in range(NUM_DEMO_EPISODES):
        state, _ = env.reset()
        print(f"\n🚕 Episode {ep+1}")
        time.sleep(1)

        for step in range(MAX_DEMO_STEPS):
            # Always act greedily here: no exploration during the demo.
            action = int(np.argmax(q_table[state]))
            next_state, reward, terminated, truncated, _ = env.step(action)

            print(env.render())
            print(f"Action: {action_names[action]} | Reward: {reward}\n")
            time.sleep(0.8)  # slow the playback down so each frame can be read

            state = next_state

            if terminated:
                # Only a real termination counts as a delivery; a truncated
                # episode must not be reported as a success.
                print("✅ Passenger successfully delivered!\n")
                break
            if truncated:
                print("Episode was truncated before the passenger was delivered.\n")
                break
        else:
            # The step budget ran out without the episode ending.
            print(f"Stopped after {MAX_DEMO_STEPS} steps without delivering the passenger.\n")
finally:
    # Release the environment even if the playback is interrupted or fails.
    env.close()


🚕 Episode 1
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)

Action: Move South | Reward: -1

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)

Action: Move West | Reward: -1

+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)

Action: Move North | Reward: -1

+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)

Action: Move West | Reward: -1

+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)

Action: Move North | Reward: -1

+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)

Action: Pickup Passenger | Reward: -1

+---------+
|R:[42m_[0m| : :G|
| : | : : |
| : :