In [3]:
!pip install numpy --upgrade
!pip install gym==0.26.2



In [6]:
import gymnasium as gym
import numpy as np
import random
import time

# Training environment for Taxi-v3. render_mode=None keeps it headless so the
# training loop is not slowed down by drawing frames.
env = gym.make("Taxi-v3", render_mode=None)

# Tabular Q-function: one row per discrete observation, one column per discrete
# action, every entry starting at zero.
state_space, action_space = env.observation_space.n, env.action_space.n
q_table = np.zeros((state_space, action_space))

# --- Q-learning update ---
alpha = 0.7    # learning rate: weight given to each temporal-difference correction
gamma = 0.618  # discount factor applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
# After each episode epsilon is recomputed as
#   min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01
epsilon = max_epsilon  # exploration rate; starts fully random

# --- Training budget ---
episodes = 10_000  # reduced for faster training
max_steps = 100    # cap on steps taken within one training episode

# Training - no rendering for speed.
#
# Seed every source of randomness used below so that a fresh-kernel run
# produces the same Q-table each time.
seed = 42
random.seed(seed)            # drives the epsilon-greedy draw (random.uniform)
env.action_space.seed(seed)  # drives env.action_space.sample()

print("Starting training...")
start_time = time.time()

for episode in range(episodes):
    # Passing a seed on the first reset seeds the environment's own RNG;
    # later resets (seed=None) keep drawing from that same stream.
    state, info = env.reset(seed=seed if episode == 0 else None)
    done = False

    for step in range(max_steps):
        # Exploration-exploitation trade-off
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = int(np.argmax(q_table[state]))  # Exploit

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Q-learning update. A terminated transition has no future return, so
        # the target must not bootstrap from next_state. A truncated one (time
        # limit only) still bootstraps, because the task itself was not over.
        future_value = 0.0 if terminated else np.max(q_table[next_state])
        td_target = reward + gamma * future_value
        q_table[state, action] += alpha * (td_target - q_table[state, action])

        state = next_state

        if done:
            break

    # Decay epsilon exponentially from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # Print progress every 2000 episodes
    if episode % 2000 == 0:
        print(f"Episode {episode}/{episodes}")

# The training environment is not used past this point; release it.
env.close()

training_time = time.time() - start_time
print(f"Training finished in {training_time:.2f} seconds.\n")

# Evaluate the trained policy by always taking the greedy (argmax) action.
total_epochs, total_penalties = 0, 0
episodes_to_test = 5

# Set to True to print a text frame after every step, with a short pause so
# the episode can be followed. Off by default: evaluation then runs at full
# speed and prints only the per-episode summaries.
show_frames = False

# render_mode must be None or a mode the environment supports (e.g. "ansi").
# The string "None" is not a render mode, so with it nothing was ever drawn.
eval_env = gym.make("Taxi-v3", render_mode="ansi" if show_frames else None)

try:
    for ep in range(episodes_to_test):
        state, info = eval_env.reset()
        done = False
        steps = 0
        penalties = 0

        print(f"\nEpisode {ep + 1}")

        # Same per-episode step cap as training, so a policy that loops
        # forever cannot hang the evaluation.
        while not done and steps < max_steps:
            action = int(np.argmax(q_table[state]))
            state, reward, terminated, truncated, _ = eval_env.step(action)
            done = terminated or truncated

            if reward == -10:  # Count penalties (illegal moves)
                penalties += 1

            steps += 1

            if show_frames:
                print(eval_env.render())
                time.sleep(0.5)  # Slow down rendering to make it visible

        total_penalties += penalties
        total_epochs += steps
        print(f"Episode finished in {steps} steps with {penalties} penalties.")
finally:
    # Release the environment even if an episode raised.
    eval_env.close()

print(f"\nResults after {episodes_to_test} episodes:")
print(f"Average steps per episode: {total_epochs / episodes_to_test}")
print(f"Average penalties per episode: {total_penalties / episodes_to_test}")

Starting training...
Episode 0/10000
Episode 2000/10000
Episode 4000/10000
Episode 6000/10000
Episode 8000/10000
Training finished in 5.82 seconds.


Episode 1
Episode finished in 13 steps with 0 penalties.

Episode 2
Episode finished in 12 steps with 0 penalties.

Episode 3
Episode finished in 10 steps with 0 penalties.

Episode 4
Episode finished in 15 steps with 0 penalties.

Episode 5
Episode finished in 11 steps with 0 penalties.

Results after 5 episodes:
Average steps per episode: 12.2
Average penalties per episode: 0.0
