# Dependencies

In [None]:
import numpy as np
import gymnasium as gym
import random 
import imageio
import os
import tqdm

import pickle
from tqdm.notebook import tqdm

# Q-table Class

In [None]:
class QTable:
    """Tabular Q-learning agent with an exponentially decaying epsilon-greedy policy.

    Action values live in ``self.Qtable``, a NumPy array with one row per
    state and one column per action, zero-initialised by :meth:`reset`.
    """

    def __init__(
        self,
        state_space_size,
        action_space_size,
        eps_start=1.0,
        eps_end=0.0,
        decay_rate=0.1,
        gamma=0.999,
        alpha=0.01,
    ):
        """
        Store the hyperparameters and allocate a zeroed Q-table.

        :param state_space_size: number of rows (states) in the table
        :param action_space_size: number of columns (actions) in the table
        :param eps_start: exploration probability at episode 0
        :param eps_end: value the exploration probability decays towards
        :param decay_rate: rate of the exponential epsilon decay, per episode
        :param gamma: discount factor applied to the next state's best value
        :param alpha: learning rate (step size) of the update
        """
        # Table dimensions
        self.state_space_sz, self.act_space_sz = state_space_size, action_space_size
        # Exploration schedule
        self.eps_start, self.eps_end = eps_start, eps_end
        self.decay_rate = decay_rate
        # Learning parameters
        self.gamma, self.alpha = gamma, alpha
        self.reset()

    def reset(self):
        """Replace the Q-table with an all-zero table of the configured shape."""
        table_shape = (self.state_space_sz, self.act_space_sz)
        self.Qtable = np.zeros(table_shape)

    def act_greedy_policy(self, curr_state):
        """Return the index of the highest-valued action for ``curr_state``."""
        return np.argmax(self.Qtable[curr_state])

    def act_eps_greedy_policy(self, curr_state, episode):
        """
        Pick an action with the epsilon-greedy rule.

        Epsilon is ``eps_end + (eps_start - eps_end) * exp(-decay_rate * episode)``.
        A uniform draw above epsilon selects the greedy action; otherwise a
        uniformly random action index is returned.
        """
        draw = random.uniform(0, 1)

        spread = self.eps_start - self.eps_end
        eps = self.eps_end + spread*np.exp(-self.decay_rate*episode)

        if draw > eps:
            # Exploitation: follow the current value estimates.
            return self.act_greedy_policy(curr_state)

        # Exploration: any action, each equally likely.
        return random.randint(0, self.act_space_sz-1)

    def update(self, obs, act, next_obs, reward):
        """
        Apply one Q-learning update for the transition (obs, act) -> next_obs.

        ``Q[obs, act] <- Q[obs, act] + alpha * (reward + gamma * max(Q[next_obs]) - Q[obs, act])``
        """
        current = self.Qtable[obs, act]
        td_target = reward + self.gamma*self.Qtable[next_obs].max()
        self.Qtable[obs, act] = current + self.alpha * (td_target - current)

# Frozen Lake

In [None]:
# FrozenLake settings; the gym.make calls in this notebook read these names
render_mode = "rgb_array"
env_id = "FrozenLake-v1"  # Gymnasium environment id
map_name = "4x4"
# Custom map layout, one string per grid row
map_descr = [
    "SFFF",
    "FHFH",
    "FFFH",
    "HFFG",
]
is_splippery = False  # forwarded to gym.make as is_slippery

In [None]:
# Build the FrozenLake environment and print its observation/action spaces
# together with one random sample from each.
env = gym.make(env_id, map_name=map_name, desc=map_descr, is_slippery=is_splippery, render_mode=render_mode)

print("---------- OBS SPACE ----------\n")
print("Obs space size: ", env.observation_space)
print("Sampling obs space: ", env.observation_space.sample())
print("---------- ACT SPACE ----------\n")
print("Act space size: ", env.action_space)
# This line samples the *action* space; the label previously said "obs space".
print("Sampling act space: ", env.action_space.sample())

In [None]:
def train_agent(q_table: QTable, training_env: gym.Env, n_training_episodes: int, max_steps: int):
    """
    Train ``q_table`` in place with epsilon-greedy Q-learning.

    Each episode resets the environment and runs at most ``max_steps`` steps,
    updating the table after every transition. The episode ends early when
    the environment reports termination or truncation.

    :param q_table: agent providing ``act_eps_greedy_policy`` and ``update``
    :param training_env: environment to interact with; it is closed before
        this function returns, even if training raises
    :param n_training_episodes: number of episodes to run
    :param max_steps: upper bound on the number of steps per episode
    """
    try:
        # A progress bar replaces the former one-print-per-episode logging,
        # which produced one output line per episode (a million for the
        # Taxi run in this notebook).
        for episode_num in tqdm(range(n_training_episodes), desc="Training"):
            obs, _ = training_env.reset()

            for _ in range(max_steps):
                # The episode index drives the epsilon decay schedule.
                act = q_table.act_eps_greedy_policy(obs, episode_num)
                next_obs, reward, terminated, truncated, _ = training_env.step(action=act)
                q_table.update(obs, act, next_obs, reward)

                if terminated or truncated:
                    break

                obs = next_obs
    finally:
        # Release the environment's resources whether or not training succeeded.
        training_env.close()

In [None]:
# Training environment and the table dimensions derived from it
env = gym.make(
    env_id,
    map_name=map_name,
    desc=map_descr,
    is_slippery=is_splippery,
    render_mode=render_mode,
)

state_space_size = env.observation_space.n
action_space_size = env.action_space.n
print("State space size is: ", state_space_size)
print("Action space size is: ", action_space_size)

# Training budget
n_training_episodes = 10000
max_steps = 99

# Learning parameters
learning_rate = 0.7
gamma = 0.95

# Exploration schedule
max_epsilon = 1.0  # Exploration probability at start
min_epsilon = 0.05  # Minimum exploration probability
decay_rate = 0.0005  # Exponential decay rate for exploration prob

# No fixed evaluation seeds for FrozenLake
eval_seed = []

q_table = QTable(
    state_space_size=state_space_size,
    action_space_size=action_space_size,
    eps_start=max_epsilon,
    eps_end=min_epsilon,
    decay_rate=decay_rate,
    gamma=gamma,
    alpha=learning_rate,
)

# Train
train_agent(
    q_table=q_table,
    training_env=env,
    n_training_episodes=n_training_episodes,
    max_steps=max_steps,
)

# Evaluation

In [None]:
# Evaluate the q_table
def evaluate_agent(env: gym.Env, q_table: QTable, max_steps = 99, n_eval_episodes = 100, seed = None):
    """
    Run the greedy policy of ``q_table`` and summarise the episode returns.

    :param env: environment to evaluate in
    :param q_table: agent providing ``act_greedy_policy``
    :param max_steps: upper bound on the number of steps per episode
    :param n_eval_episodes: number of evaluation episodes
    :param seed: optional sequence of per-episode reset seeds, indexed by
        episode number; when empty or ``None`` the environment is reset
        without a seed
    :return: ``(mean_reward, std_reward)`` over the episode returns
    """
    episode_rewards = []

    for episode in range(n_eval_episodes):

        if seed:
            obs, _ = env.reset(seed=seed[episode])
        else:
            obs, _ = env.reset()

        episode_reward = 0

        for _ in range(max_steps):
            act = q_table.act_greedy_policy(obs)
            obs, reward, terminated, truncated, _ = env.step(act)
            episode_reward += reward
            # No env.render() here: the frame was never used, so rendering
            # only slowed evaluation down.

            if terminated or truncated:
                break

        episode_rewards.append(episode_reward)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

# Evaluate the trained FrozenLake agent in a separate environment instance,
# using evaluate_agent's default step and episode limits.
env_eval = gym.make(
    env_id,
    map_name=map_name,
    desc=map_descr,
    is_slippery=is_splippery,
    render_mode=render_mode,
)
mean_reward, std_reward = evaluate_agent(env=env_eval, q_table=q_table)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

# Upload model to the Hub

In [None]:
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json

In [None]:
def record_video(env: gym.Env, Qtable: QTable, out_dir, fps=1):
    """
    Generate a replay video of the agent acting greedily for one episode.

    :param env: environment to roll out in; every ``env.render()`` result is
        stored as a video frame
    :param Qtable: Qtable of our agent (provides ``act_greedy_policy``)
    :param out_dir: output path handed to ``imageio.mimsave``
    :param fps: how many frames per second (with taxi-v3 and frozenlake-v1 we use 1)
    """
    state, _ = env.reset(seed=random.randint(0, 500))
    images = [env.render()]  # frame of the initial state

    terminated = False
    truncated = False

    # Stop on EITHER signal. The previous condition `not terminated or truncated`
    # parsed as `(not terminated) or truncated` and could never exit once the
    # episode had been truncated.
    while not (terminated or truncated):
        action = Qtable.act_greedy_policy(state)

        state, _, terminated, truncated, _ = env.step(action)
        images.append(env.render())

    imageio.mimsave(out_dir, [np.array(img) for img in images], fps=fps)

In [None]:
def push_to_hub(repo_id, model, video_fps=1, local_repo_path="hub", env=None):
    """
    Evaluate, generate a video and upload a model to the Hugging Face Hub.
    This method does the complete pipeline:
    - It evaluates the model
    - It generates the model card
    - It generates a replay video of the agent
    - It pushes everything to the Hub

    :param repo_id: id of the model repository on the Hugging Face Hub,
        in the form "<user>/<repo_name>"
    :param model: dict describing the agent. The keys read here are "env_id",
        "q_table", "max_steps", "n_eval_episodes" and "eval_seed". The dict is
        pickled as-is and may gain "map_name" / "slippery" entries.
    :param video_fps: how many frames per second to record our video replay
        (with taxi-v3 and frozenlake-v1 we use 1)
    :param local_repo_path: unused; kept so existing callers keep working
    :param env: optional environment to evaluate and record in. When omitted,
        one is created with ``gym.make(model["env_id"], render_mode="rgb_array")``.
        NOTE(review): that default carries none of the extra kwargs the agent
        may have been trained with (custom map, is_slippery=False, ...); pass
        ``env`` explicitly when those matter.
    """
    _, repo_name = repo_id.split("/")
    env_id = model["env_id"]

    # Only an environment created here is closed at the end; a caller-supplied
    # one stays under the caller's control.
    owns_env = env is None
    if owns_env:
        env = gym.make(env_id, render_mode="rgb_array")

    try:
        api = HfApi()

        # Step 1: Create the repo
        repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)

        # Step 2: Download the files
        repo_local_path = Path(snapshot_download(repo_id=repo_id))

        # Step 3: Save the model, recording the map settings found on the env spec
        spec_kwargs = env.spec.kwargs if env.spec is not None else {}
        spec_map_name = spec_kwargs.get("map_name")
        # The spec key is "is_slippery"; the former "is_splippery" lookup never matched.
        not_slippery = spec_kwargs.get("is_slippery", "") == False  # noqa: E712
        if spec_map_name:
            model["map_name"] = spec_map_name
            if not_slippery:
                model["slippery"] = False

        # Pickle the model
        with open(repo_local_path / "q-learning.pkl", "wb") as f:
            pickle.dump(model, f)

        # Step 4: Evaluate the model and build JSON with evaluation metrics
        mean_reward, std_reward = evaluate_agent(
            env,
            model["q_table"],
            model["max_steps"],
            model["n_eval_episodes"],
            model["eval_seed"],
        )

        evaluate_data = {
            "env_id": model["env_id"],
            "mean_reward": mean_reward,
            "n_eval_episodes": model["n_eval_episodes"],
            "eval_datetime": datetime.datetime.now().isoformat()
        }

        # Write a JSON file called "results.json" that will contain the
        # evaluation results
        with open(repo_local_path / "results.json", "w") as outfile:
            json.dump(evaluate_data, outfile)

        # Step 5: Create the model card
        env_name = model["env_id"]
        if spec_map_name:
            env_name += "-" + spec_map_name

        if not_slippery:
            env_name += "-" + "no_slippery"

        metadata = {}
        metadata["tags"] = [env_name, "q-learning", "reinforcement-learning", "custom-implementation"]

        # Named eval_metadata so the builtin `eval` is not shadowed.
        eval_metadata = metadata_eval_result(
            model_pretty_name=repo_name,
            task_pretty_name="reinforcement-learning",
            task_id="reinforcement-learning",
            metrics_pretty_name="mean_reward",
            metrics_id="mean_reward",
            metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
            dataset_pretty_name=env_name,
            dataset_id=env_name
        )

        # Merges both dictionaries
        metadata = {**metadata, **eval_metadata}

        # Built line by line so the text carries no leading indentation:
        # Markdown renders lines indented by four spaces as a code block.
        model_card = "\n".join([
            f"# **Q-Learning** Agent playing **{env_id}**",
            f"This is a trained model of a **Q-Learning** agent playing **{env_id}**.",
            "",
            "## Usage",
            "",
            "```python",
            f'model = load_from_hub(repo_id="{repo_id}", filename="q-learning.pkl")',
            "",
            "# Don't forget to check if you need to add additional attributes (is_slippery=False etc)",
            'env = gym.make(model["env_id"])',
            "```",
            "",
        ])

        # Keep an existing README untouched; only create one when it is missing.
        readme_path = repo_local_path / "README.md"
        if not readme_path.exists():
            with readme_path.open("w", encoding="utf-8") as f:
                f.write(model_card)

        # Save our metrics to Readme metadata
        metadata_save(readme_path, metadata)

        # Step 6: Record a video
        video_path = repo_local_path / "replay.mp4"
        record_video(env, model["q_table"], video_path, video_fps)

        # Step 7: Push everything to the Hub
        api.upload_folder(
            repo_id=repo_id,
            folder_path=repo_local_path,
            path_in_repo=".",
        )

        print("Your model is pushed to the Hub. You can view your model here: ", repo_url)
    finally:
        if owns_env:
            env.close()


In [None]:
from huggingface_hub import notebook_login

# NOTE: notebook_login is imported above but is not called in this cell.
n_eval_episodes = 100

# Everything push_to_hub needs, plus the hyperparameters used for training
model = dict(
    env_id=env_id,
    max_steps=max_steps,
    n_training_episodes=n_training_episodes,
    n_eval_episodes=n_eval_episodes,
    eval_seed=eval_seed,
    learning_rate=learning_rate,
    gamma=gamma,
    max_epsilon=max_epsilon,
    min_epsilon=min_epsilon,
    decay_rate=decay_rate,
    q_table=q_table,
    seed=random.randint(0, 500),
)

username = "Pucciland95"
repo_name = "q-FrozenLake-v1-4x4-noSlippery"
push_to_hub(model=model, repo_id=username + "/" + repo_name)

# Taxi Driver

## Environment Definition

In [None]:
# Taxi-v3 settings and the environment built from them
render_mode = "rgb_array"
env_id = "Taxi-v3"  # Gymnasium environment id
env_training = gym.make(env_id, render_mode=render_mode)

# Table dimensions come from the discrete action/observation spaces
state_space_size = env_training.observation_space.n
action_space_size = env_training.action_space.n
print(f"Action space size: {action_space_size}")
print(f"State space size: {state_space_size}")


## Training

In [None]:
# Learning parameters
learning_rate = 0.7
gamma = 0.999

# Per-episode step limit
max_steps = 99

# Exploration schedule
max_epsilon = 1.0  # Exploration probability at start
min_epsilon = 0.025  # Minimum exploration probability
decay_rate = 0.00005  # Exponential decay rate for exploration prob

eval_seed = []

# Environment and table dimensions
env_id = "Taxi-v3"  # Name of the environment
env_training = gym.make(env_id, render_mode=render_mode)
state_space_size = env_training.observation_space.n
action_space_size = env_training.action_space.n

q_table_taxy = QTable(
    state_space_size=state_space_size,
    action_space_size=action_space_size,
    eps_start=max_epsilon,
    eps_end=min_epsilon,
    decay_rate=decay_rate,
    gamma=gamma,
    alpha=learning_rate,
)

# Train
n_training_episodes = 1000000

train_agent(
    q_table=q_table_taxy,
    training_env=env_training,
    n_training_episodes=n_training_episodes,
    max_steps=max_steps,
)

## Evaluation

In [None]:
# Evaluation seeds: one reset seed per evaluation episode (100 in total), so
# that every agent is evaluated on the same sequence of starting states.
# Each seed has a specific starting state.
eval_seed = [
    16, 54, 165, 177, 191, 191, 120, 80, 149, 178,
    48, 38, 6, 125, 174, 73, 50, 172, 100, 148,
    146, 6, 25, 40, 68, 148, 49, 167, 9, 97,
    164, 176, 61, 7, 54, 55, 161, 131, 184, 51,
    170, 12, 120, 113, 95, 126, 51, 98, 36, 135,
    54, 82, 45, 95, 89, 59, 95, 124, 9, 113,
    58, 85, 51, 134, 121, 169, 105, 21, 30, 11,
    50, 65, 12, 43, 82, 145, 152, 97, 106, 55,
    31, 85, 38, 112, 102, 168, 123, 97, 21, 83,
    158, 26, 80, 63, 5, 81, 32, 11, 28, 148,
]

# Number of test episodes (recorded in the model dict that is pushed to the Hub)
n_eval_episodes = 100

# Evaluate the Taxi agent on the fixed seeds, in a separate environment instance
env_eval = gym.make(env_id, render_mode=render_mode)
mean_reward, std_reward = evaluate_agent(
    env=env_eval,
    q_table=q_table_taxy,
    seed=eval_seed,
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

# Push to Hub

In [None]:
# Everything push_to_hub needs, plus the hyperparameters used for training
model = dict(
    env_id=env_id,
    max_steps=max_steps,
    n_training_episodes=n_training_episodes,
    n_eval_episodes=n_eval_episodes,
    eval_seed=eval_seed,
    learning_rate=learning_rate,
    gamma=gamma,
    max_epsilon=max_epsilon,
    min_epsilon=min_epsilon,
    decay_rate=decay_rate,
    q_table=q_table_taxy,
)

username = "Pucciland95"
repo_name = "q-TaxiDriver-v3"
push_to_hub(model=model, repo_id=username + "/" + repo_name)

# Get Model from Hub

In [133]:
from urllib.error import HTTPError
from huggingface_hub import hf_hub_download

def load_from_hub(repo_id: str, filename: str):
    """
    Download a pickled model file from the Hugging Face Hub and unpickle it.

    SECURITY: unpickling runs arbitrary code embedded in the file, so only
    load repositories you trust.

    :param repo_id: id of the model repository on the Hub
    :param filename: name of the pickle file inside that repository
    :return: the unpickled object
    """
    local_file = hf_hub_download(repo_id=repo_id, filename=filename)

    with open(local_file, "rb") as handle:
        return pickle.load(handle)

In [None]:
model = load_from_hub(repo_id="ThomasSimonini/q-Taxi-v3", filename="q-learning.pkl")

# NOTE(review): this repository presumably stores its table as a bare array
# under "qtable", not a QTable under "q_table" as this notebook does; confirm
# against the downloaded file. Both layouts are accepted here.
hub_q_table = model["q_table"] if "q_table" in model else model["qtable"]
if not hasattr(hub_q_table, "act_greedy_policy"):
    # Wrap the raw (n_states, n_actions) values so evaluate_agent can use them.
    raw_values = np.asarray(hub_q_table)
    hub_q_table = QTable(raw_values.shape[0], raw_values.shape[1])
    hub_q_table.Qtable = raw_values

env = gym.make(model["env_id"])
mean_reward, std_reward = evaluate_agent(env, hub_q_table, model["max_steps"], model["n_eval_episodes"], seed=model["eval_seed"])
# Report the result like the other evaluation cells do.
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")