In [1]:
!pip install pyvirtualdisplay



In [2]:
!pip install -r requirements.txt



In [3]:
from pyvirtualdisplay import Display

# Build a pyvirtualdisplay Display object configured with visible=0 and a 1400x900 screen size.
# NOTE(review): only the object is constructed here; `.start()` is never called in the visible
# cells, so confirm whether a running virtual display is actually required by this notebook.
virtual_display = Display(visible=0, size=(1400, 900))
# Bare expression: the notebook echoes the object's repr as the cell output.
virtual_display

<pyvirtualdisplay.display.Display at 0x7f152f742320>

In [4]:
import numpy as np
import gymnasium as gym
import random
import imageio
import os
from tqdm import tqdm

In [5]:
# Three FrozenLake variants. All use render_mode='rgb_array' so that env.render()
# returns frames that can be collected when recording a replay.

# 4x4 map, is_slippery=False
env = gym.make(
    'FrozenLake-v1',
    map_name='4x4',
    is_slippery=False,
    render_mode='rgb_array',
)

# 4x4 map, is_slippery=True
env_slippery = gym.make(
    'FrozenLake-v1',
    map_name='4x4',
    is_slippery=True,
    render_mode='rgb_array',
)

# 8x8 map, is_slippery=False
env_8x8 = gym.make(
    'FrozenLake-v1',
    map_name='8x8',
    is_slippery=False,
    render_mode='rgb_array',
)

In [6]:
# Report the number of discrete states and actions of each environment,
# one banner + two lines + a blank line per environment.
for banner, lake_env in (
    ("------FrozenLake-v1-4x4------", env),
    ("------FrozenLake-v1-4x4-Slippery------", env_slippery),
    ("------FrozenLake-v1-8x8------", env_8x8),
):
    print(banner)
    print(f'Observation Space: {lake_env.observation_space.n}')
    print(f'Action Space: {lake_env.action_space.n}\n')

------FrozenLake-v1-4x4------
Observation Space: 16
Action Space: 4

------FrozenLake-v1-4x4-Slippery------
Observation Space: 16
Action Space: 4

------FrozenLake-v1-8x8------
Observation Space: 64
Action Space: 4



In [7]:
# Q-table dimensions for the base 4x4 environment: number of states and number of actions.
state_space, action_space = env.observation_space.n, env.action_space.n

def initialize_q_table(state_space, action_space):
    """Return a fresh Q-table filled with zeros.

    :param state_space: number of rows (one per state)
    :param action_space: number of columns (one per action)
    :return: ``numpy`` array of shape ``(state_space, action_space)`` with every entry 0
    """
    table_shape = (state_space, action_space)
    return np.zeros(shape=table_shape)

In [8]:
# One zero-initialised Q-table per environment, each sized from that environment's spaces.
Qtable_frozenlake = initialize_q_table(
    state_space,
    action_space,
)
Qtable_frozenlake_slippery = initialize_q_table(
    env_slippery.observation_space.n,
    env_slippery.action_space.n,
)
Qtable_frozenlake_8x8 = initialize_q_table(
    env_8x8.observation_space.n,
    env_8x8.action_space.n,
)

In [9]:
def greedy_policy(Qtable, state):
    """Return the index of the highest-valued action for ``state``.

    :param Qtable: table indexed as ``Qtable[state][action]``
    :param state: row of the table to inspect
    :return: index of the maximum entry in that row (``np.argmax`` picks the first on ties)
    """
    action_values = Qtable[state]
    best_action = np.argmax(action_values)
    return best_action

In [10]:
def epsilon_greedy_policy(Qtable, state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    A uniform number in [0, 1] is drawn. If it is greater than ``epsilon`` the
    greedy action (via ``greedy_policy``) is returned; otherwise an action index
    is drawn uniformly at random.

    :param Qtable: table indexed as ``Qtable[state][action]``
    :param state: current state (row of the table)
    :param epsilon: exploration probability
    :return: index of the selected action
    """
    if random.uniform(0, 1) > epsilon:
        # Exploit: follow the current value estimates.
        return greedy_policy(Qtable, state)
    # Explore: sample over the action columns of the table that was passed in,
    # rather than over the action space of the module-level `env`, so the
    # choice always matches the environment this Q-table belongs to.
    return random.randrange(len(Qtable[state]))

In [11]:
# --- Training ---
n_training_episodes = 10_000  # episodes used to train each agent
learning_rate = 0.7  # step size of the Q-value update

# --- Evaluation ---
n_eval_episodes = 100  # episodes used to score a trained agent
eval_seed = []  # per-episode reset seeds; an empty list means unseeded resets

# --- Environment ---
env_id = "FrozenLake-v1"  # name of the environment
max_steps = 99  # cap on the number of steps in a single episode
gamma = 0.95  # discount applied to the bootstrapped next-state value

# --- Exploration (epsilon schedule) ---
max_epsilon, min_epsilon = 1.0, 0.05  # exploration probability: starting value, floor
decay_rate = 0.0005  # exponential decay rate of epsilon per episode

In [12]:
def epsilon_exponential_decay(max_epsilon, min_epsilon, decay_rate, episode):
    """Exponentially anneal epsilon from ``max_epsilon`` towards ``min_epsilon``.

    Computes ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)``.

    :param max_epsilon: value returned at ``episode == 0``
    :param min_epsilon: value approached as ``episode`` grows
    :param decay_rate: rate of the exponential decay
    :param episode: index of the current episode
    :return: epsilon for this episode
    """
    epsilon_range = max_epsilon - min_epsilon
    decay_factor = np.exp(-decay_rate * episode)
    return min_epsilon + epsilon_range * decay_factor

In [13]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable,
          lr=None, discount=None):
    """Train ``Qtable`` with tabular Q-learning and an epsilon-greedy behaviour policy.

    :param n_training_episodes: number of episodes to run
    :param min_epsilon: floor of the exploration probability
    :param max_epsilon: exploration probability at episode 0
    :param decay_rate: exponential decay rate of epsilon per episode
    :param env: environment whose ``reset()`` returns ``(state, info)`` and whose
        ``step(action)`` returns ``(new_state, reward, terminated, truncated, info)``
    :param max_steps: maximum number of steps per episode
    :param Qtable: table indexed as ``Qtable[state][action]``; updated IN PLACE
    :param lr: learning rate; when ``None`` the module-level ``learning_rate`` is used
    :param discount: discount factor; when ``None`` the module-level ``gamma`` is used
    :return: the same ``Qtable`` object that was passed in
    """
    # Fall back to the notebook-level hyperparameters so existing calls behave as before.
    alpha = learning_rate if lr is None else lr
    discount_factor = gamma if discount is None else discount

    for episode in tqdm(range(n_training_episodes)):
        # Less exploration as training progresses.
        epsilon = epsilon_exponential_decay(max_epsilon, min_epsilon, decay_rate, episode)
        state, info = env.reset()

        for _ in range(max_steps):
            # Choose A_t with the epsilon-greedy policy, then observe R_{t+1} and S_{t+1}.
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            new_state, reward, terminated, truncated, info = env.step(action)

            # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
            td_target = reward + discount_factor * np.max(Qtable[new_state])
            Qtable[state][action] = Qtable[state][action] + alpha * (
                td_target - Qtable[state][action]
            )

            # The episode is over on either signal.
            if terminated or truncated:
                break

            state = new_state
    return Qtable

In [14]:
# Train the agent for the non-slippery 4x4 map (the table is updated in place and returned).
Qtable_frozenlake = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    Qtable=Qtable_frozenlake,
)

100%|██████████| 10000/10000 [00:01<00:00, 8899.02it/s]


In [15]:
# Train the agent for the slippery 4x4 map (the table is updated in place and returned).
Qtable_frozenlake_slippery = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env_slippery,
    max_steps=max_steps,
    Qtable=Qtable_frozenlake_slippery,
)

100%|██████████| 10000/10000 [00:03<00:00, 2791.92it/s]


In [16]:
# Train the agent for the 8x8 map. Uses the shared n_training_episodes setting instead of a
# hard-coded literal so all three training runs follow the same configuration.
Qtable_frozenlake_8x8 = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env_8x8, max_steps, Qtable_frozenlake_8x8)

100%|██████████| 10000/10000 [00:14<00:00, 708.38it/s]


In [17]:
# Display the Q-table trained on the non-slippery 4x4 map (one row per state, one column per action).
Qtable_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [18]:
# Display the Q-table trained on the slippery 4x4 map (one row per state, one column per action).
Qtable_frozenlake_slippery

array([[2.55303841e-01, 1.43102933e-01, 1.34426948e-01, 2.09698958e-01],
       [1.18225989e-02, 1.06367334e-02, 2.15665028e-02, 9.44486972e-02],
       [4.92636724e-02, 4.37971007e-02, 5.04530328e-02, 8.18844872e-02],
       [1.37436611e-02, 1.58647044e-02, 4.37546917e-02, 7.85338412e-02],
       [2.98465698e-01, 7.86663951e-02, 1.07643392e-01, 6.23062728e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.75246288e-05, 1.32374832e-04, 2.07353616e-02, 1.69732203e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.08129056e-02, 1.17439585e-01, 1.32810270e-01, 3.21132523e-01],
       [2.80344733e-01, 3.95757392e-01, 1.50062899e-01, 1.47803605e-01],
       [3.19444889e-01, 4.44284605e-01, 7.74727309e-03, 9.64604613e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.02385655e-01, 1.50462570e-01, 6.32752532e

In [19]:
# Sanity check: both 4x4 tables should have one row per state and one column per action.
print(
    f'Qtable-FrozenLake-4x4 (shape) : {Qtable_frozenlake.shape}',
    f'Qtable-FrozenLake-4x4-Slippery (shape) : {Qtable_frozenlake_slippery.shape}',
    sep='\n',
)

Qtable-FrozenLake-4x4 (shape) : (16, 4)
Qtable-FrozenLake-4x4-Slippery (shape) : (16, 4)


In [20]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """Run greedy rollouts and summarise the per-episode total reward.

    :param env: environment whose ``reset`` returns ``(state, info)`` and whose
        ``step(action)`` returns ``(new_state, reward, terminated, truncated, info)``
    :param max_steps: maximum number of steps per episode
    :param n_eval_episodes: number of episodes to run
    :param Q: Q-table passed to ``greedy_policy`` to pick every action
    :param seed: sequence of reset seeds indexed by episode number; when it is
        empty (falsy) the environment is reset without a seed
    :return: ``(mean_reward, std_reward)`` over the evaluated episodes
    """
    returns_per_episode = []
    for episode in tqdm(range(n_eval_episodes)):
        # Only forward a seed when one was supplied for this run.
        reset_kwargs = {"seed": seed[episode]} if seed else {}
        state, info = env.reset(**reset_kwargs)

        episode_return = 0
        for _ in range(max_steps):
            # Always act greedily with respect to the Q-table.
            action = greedy_policy(Q, state)
            state, reward, terminated, truncated, info = env.step(action)
            episode_return += reward
            if terminated or truncated:
                break
        returns_per_episode.append(episode_return)

    return np.mean(returns_per_episode), np.std(returns_per_episode)

In [21]:
# Score the agent trained on the non-slippery 4x4 map and report mean +/- std of the episode reward.
mean_reward, std_reward = evaluate_agent(
    env=env,
    max_steps=max_steps,
    n_eval_episodes=n_eval_episodes,
    Q=Qtable_frozenlake,
    seed=eval_seed,
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

100%|██████████| 100/100 [00:00<00:00, 7260.22it/s]

Mean_reward=1.00 +/- 0.00





In [22]:
def record_video(env, Qtable, out_directory, fps=1):
    """
    Roll out one greedy episode and save the rendered frames as a video.

    :param env: environment exposing ``reset(seed=...)``, ``step(action)`` and ``render()``;
        each rendered frame is converted with ``np.array`` before saving
    :param Qtable: Q-table of the agent, indexed as ``Qtable[state][action]``
    :param out_directory: output path handed to ``imageio.mimsave`` (despite the
        name, callers pass a file path such as ``'replay.mp4'``)
    :param fps: frames per second forwarded to ``imageio.mimsave``
    """
    frames = []
    terminated = False
    truncated = False
    # Each call resets with a random seed drawn from [0, 500].
    state, info = env.reset(seed=random.randint(0, 500))
    frames.append(env.render())
    # Stop on EITHER end-of-episode signal. The parentheses matter:
    # `not terminated or truncated` parses as `(not terminated) or truncated`,
    # which keeps looping forever once the episode has been truncated.
    while not (terminated or truncated):
        # Take the action with the highest estimated value in the current state.
        action = np.argmax(Qtable[state][:])
        # The next state directly becomes the current state for the next iteration.
        state, reward, terminated, truncated, info = env.step(action)
        frames.append(env.render())
    imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [23]:
# Save a one-frame-per-second replay of the agent on the non-slippery 4x4 map.
record_video(
    env,
    Qtable_frozenlake,
    'frozenLake-v1.mp4',
    fps=1,
)

In [24]:
# Save a one-frame-per-second replay of the agent on the slippery 4x4 map.
record_video(
    env_slippery,
    Qtable_frozenlake_slippery,
    'frozenLake_Slippery-v1.mp4',
    fps=1,
)