In [None]:
# install required libraries

!pip install gymnasium[toy_text] imageio
!pip install python3.6
import numpy as np
import gymnasium as gym
import imageio
from IPython.display import Image
from gymnasium.utils import seeding
from typing import Any, Dict

# Per-episode cap on the number of actions taken during training
max_actions = 100

# Create the Taxi-v3 environment in 'rgb_array' render mode so that
# env.render() results can be collected as frames later in the notebook
env = gym.make("Taxi-v3", render_mode="rgb_array")

# Reproducibility: seed the environment's generator, the action-space sampler,
# and NumPy's global generator with the same value
seeded_rng, _ = seeding.np_random(42)
env.np_random = seeded_rng
env.action_space.seed(42)
np.random.seed(42)

## 1. Training the Agent with Q-Learning

In [None]:
# Q-learning hyperparameters
alpha = 0.1  # learning rate: weight given to the new estimate in each update
gamma = 1  # discount factor applied to the best next-state value

# Exploration schedule for the epsilon-greedy strategy
epsilon = 1.0  # initial probability of taking a random action
min_epsilon = 0.01  # lower bound epsilon is clamped to after decay
epsilon_decay = 0.001  # factor applied to epsilon after every episode

Train the agent with Q-learning for 2,000 episodes, limiting each episode to 100 actions and recording the total reward of every episode in `episode_returns`.

In [None]:
# Training hyperparameters.
# NOTE(review): this cell re-declares the same five values as the earlier
# parameter cell; running it resets epsilon to its starting value.
alpha = 0.1  # learning rate
gamma = 1  # discount rate
epsilon = 1.0  # starting exploration probability
min_epsilon = 0.01  # floor for the exploration probability
epsilon_decay = 0.001  # per-episode decay factor for epsilon

In [None]:
# Sizes of the observation and action spaces as reported by the environment
num_states: Any = env.observation_space.n
num_actions: Any = env.action_space.n

# Q-table with one row per state and one column per action, all entries zero
q_table: Any = np.zeros(shape=(num_states, num_actions))

## 2. Define the Epsilon-Greedy Strategy Function

In [None]:
def epsilon_greedy(state) -> Any:
    """Choose an action for `state` with the epsilon-greedy rule.

    Reads the module-level `epsilon`, `q_table` and `env`.

    Args:
        state: Index of the current state (row of `q_table`).

    Returns:
        A random action sampled from `env.action_space` when a uniform draw
        falls below `epsilon` (exploration); otherwise the index of the
        largest Q-value in `q_table[state]` (exploitation).
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Exploitation: act greedily with respect to the current Q-values
        return np.argmax(q_table[state])
    # Exploration: try a random action to discover better rewards
    return env.action_space.sample()

## 3. Q-Learning Update Function

### Key Concepts in the Q-Learning Update:

- **Learning Rate (alpha):** Determines how much new information should overwrite the old value. A higher alpha means the agent will adjust more rapidly based on the most recent experiences. 
- **Discount Factor (gamma):** Controls the importance of future rewards. A gamma closer to 1 prioritizes future rewards, while a gamma closer to 0 focuses more on immediate rewards.
- **Max Future Q-value (next_max):** Helps the agent make long-term decisions by considering future potential rewards.

In [None]:
def q_learning_update(state, action, reward, next_state):
    """Apply one Q-learning update to the module-level `q_table` in place.

    The entry for (`state`, `action`) becomes a blend of its previous value
    and the target `reward + gamma * max(q_table[next_state])`, weighted by
    the module-level learning rate `alpha`.

    Args:
        state: Index of the state the action was taken in.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Index of the state reached after the action.
    """
    best_next_value = np.max(q_table[next_state])
    td_target = reward + gamma * best_next_value
    previous_estimate = q_table[state, action]
    q_table[state, action] = (1 - alpha) * previous_estimate + alpha * td_target


# Total reward collected in each training episode, in episode order
episode_returns: list = []

In [None]:
# Train the agent with tabular Q-learning: 2,000 episodes, each capped at
# `max_actions` steps. Reads and updates the module-level `q_table`,
# `epsilon` and `episode_returns`.
for episode in range(2000):
    state, info = env.reset()  # start a fresh episode
    total_reward = 0  # sum of the rewards collected in this episode

    for step in range(max_actions):
        action = epsilon_greedy(state)

        # env.step() returns (observation, reward, terminated, truncated, info)
        next_state, reward, terminated, truncated, info = env.step(action)

        # Update the Q-table from the observed transition
        q_learning_update(state, action, reward, next_state)

        # Move on to the next step
        state = next_state
        total_reward += reward

        # Stop stepping once the environment reports the episode is over;
        # stepping a finished episode would feed invalid transitions into
        # the Q-table.
        if terminated or truncated:
            break

    # Track the agent's performance over time
    episode_returns.append(total_reward)

    # Decay epsilon, never going below min_epsilon.
    # NOTE(review): with the values set in this notebook (epsilon = 1.0,
    # epsilon_decay = 0.001, min_epsilon = 0.01) the product is already below
    # min_epsilon after the first episode, so every later episode runs with
    # epsilon == min_epsilon. If a gradual schedule was intended, something
    # like `epsilon * (1 - epsilon_decay)` may be meant -- confirm before
    # changing, since it alters the training results.
    epsilon: float = max(min_epsilon, epsilon * epsilon_decay)

# Derive the greedy policy: map every state to its highest-valued action
policy: Dict[int, Any] = {state: np.argmax(q_table[state]) for state in range(num_states)}

In [None]:
# Roll out the learned policy from a seeded reset for at most 16 steps,
# collecting one rendered frame per environment state.
state, info = env.reset(seed=42)
frames: list = [env.render()]  # first frame shows the initial state
episode_total_reward = 0

for step in range(16):
    # Look up the policy's action for the current state and advance one step
    action = policy[state]
    state, reward, terminated, truncated, info = env.step(action)

    frames.append(env.render())
    episode_total_reward += reward

    # Stop once the environment reports the episode has terminated
    if terminated:
        break

## 4. Display agent's behavior

In [None]:
# Write the collected frames to an animated GIF at 5 frames per second
gif_path = "taxi_agent.gif"
imageio.mimsave(gif_path, frames, fps=5)

# Last expression of the cell: render the saved GIF inline
Image(gif_path)