# PPO_Baseline：FrozenLake-v1 

In [78]:
import gymnasium as gym
import numpy as np
from tqdm import tqdm

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [79]:
import gymnasium as gym

# First, we create our environment called LunarLander-v2
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)

# Then we reset this environment
observation, info = env.reset()
print("observation: ", observation)
print("info: ", info)

for _ in range(10):
    # Take a random action
    action = env.action_space.sample()
    print("Action taken:", action)
    
    # Do this action in the environment and get
    # next_state, reward, terminated, truncated and info
    observation, reward, terminated, truncated, info = env.step(action)
    print(observation, reward, terminated, truncated, info)
    # If the game is terminated (in our case we land, crashed) or truncated (timeout)
    if terminated or truncated:
      # Reset the environment
      print("Environment is reset")
      observation, info = env.reset()

env.close()

observation:  0
info:  {'prob': 1}
Action taken: 3
0 0.0 False False {'prob': 0.3333333333333333}
Action taken: 2
0 0.0 False False {'prob': 0.3333333333333333}
Action taken: 2
0 0.0 False False {'prob': 0.3333333333333333}
Action taken: 2
4 0.0 False False {'prob': 0.3333333333333333}
Action taken: 3
0 0.0 False False {'prob': 0.3333333333333333}
Action taken: 3
1 0.0 False False {'prob': 0.3333333333333333}
Action taken: 1
2 0.0 False False {'prob': 0.3333333333333333}
Action taken: 1
3 0.0 False False {'prob': 0.3333333333333333}
Action taken: 1
2 0.0 False False {'prob': 0.3333333333333333}
Action taken: 2
6 0.0 False False {'prob': 0.3333333333333333}


In [80]:
# We create our environment with gym.make("<name_of_the_environment>")
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
env.reset()
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", env.observation_space.n)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

Observation Space Shape 16
Sample observation 9


In [81]:
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample()) # Take a random action


 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 1


In [83]:
# Create the environment
# env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
env_args = {
    "desc":None,
    "map_name":"4x4",
    "is_slippery":True
} 
env = make_vec_env('FrozenLake-v1', env_kwargs=env_args, n_envs=16)

In [84]:
# SOLUTION
# We added some parameters to accelerate the training
model = PPO(
    policy = 'MlpPolicy',
    env = env,
    n_steps = 1024,
    batch_size = 64,
    n_epochs = 4,
    gamma = 0.999,
    gae_lambda = 0.98,
    ent_coef = 0.01,
    verbose=1,
    device="cpu")

Using cpu device


In [23]:
# SOLUTION
# Train it for 1,000,000 timesteps
model.learn(
    total_timesteps=1000000, 
    progress_bar=True)
# Save the model
model_name = "ppo-FrozenLake-v1"
model.save(model_name)

Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 13.5     |
|    ep_rew_mean     | 0.15     |
| time/              |          |
|    fps             | 3061     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 14.6        |
|    ep_rew_mean          | 0.14        |
| time/                   |             |
|    fps                  | 1087        |
|    iterations           | 2           |
|    time_elapsed         | 30          |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.008247321 |
|    clip_fraction        | 0.0675      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1          |
|    explained_variance   | 0.129       |
|    learning_rate        | 0.

In [85]:
# Evaluate Model
model = PPO.load(
    path="./ppo-FrozenLake-v1",
    device="cpu")
eval_env = Monitor(gym.make("FrozenLake-v1", render_mode='rgb_array'))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=0.69 +/- 0.46249324319388707


# Q-Learning: FrozenLake-v1

In [86]:
# train
def train(args, env, Q_Table, action_masked=False):    
    for episode in tqdm(range(1, args["num_episode"]+1)):
        epsilon = args["min_epsilon"] + (
            args["max_epsilon"] - args["min_epsilon"]) * np.exp(- args["decay_rate"] * episode)
        
        t = 1
        s_t, info = env.reset()
        
        action_mask = None
        if action_masked:
            action_mask = info["action_mask"]
            
        while t <= args["max_steps"]:
            # epsilon-Greedy select one action a_t
            # print(np.where(info["action_mask"] == 1)[0])
            if action_masked:
                actions = np.where(action_mask == 1)[0]
            else:
                actions = np.arange(Q_Table.shape[1])
                
            if np.random.rand(1)[0] < epsilon:
                a_t_index = np.random.randint(0, actions.shape[0])
                a_t = actions[a_t_index]
                
            else: 
                a_t_index = np.argmax(Q_Table[s_t, actions])
                a_t = actions[a_t_index]
            # print("actions: ", actions, " action:", a_t)
            
            # execute action a_t, get reward r_t and state s_t+1
            s_t1, r_t, terminated, truncated, info = env.step(a_t)
            # if r_t > 0:
            #     print(r_t)
            
            if action_masked:
                action_mask = info["action_mask"]
                actions_t1 = np.where(action_mask == 1)[0]
            else:
                actions_t1 = np.arange(Q_Table.shape[1])

            # Greedy select optimal action, update Q-Table
            Q_Table[s_t][a_t] = Q_Table[s_t][a_t] - args["lr"] * (
                Q_Table[s_t][a_t] - (r_t + args["Gamma"] * np.max(Q_Table[s_t1, actions_t1]))) 
            
            if terminated or truncated:
                break
            
            t = t + 1
            s_t = s_t1
            
    return Q_Table



In [87]:
# evaluate
def evaluate(args, env, Q_Table, episode):
    returns = [] 
    
    for _ in range(episode):
        cumulReward = 0
        t = 1
        s_t, info = env.reset()
        
        while t <= args["max_steps"]:
            # Evaluation Stage: Greedy select one action a_t
            a_t = np.argmax(Q_Table[s_t])
            
            # execute action a_t, get reward r_t and state s_t+1
            s_t1, r_t, terminated, truncated, info = env.step(a_t)
            cumulReward += r_t
            if terminated or truncated:
                break
            
            t = t + 1
            s_t = s_t1
    
        returns.append(cumulReward)

    return np.mean(returns), np.std(returns)

In [88]:
# Edit args
args = {
    "min_epsilon": 0.05,
    "max_epsilon": 1.0,
    "decay_rate": 0.0005, 
    "lr": 0.1,
    "Gamma": 1,
    "num_episode": 1000,
    "max_steps": 100
}

# Initializ env and QTable
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
Q_Table_lake = np.zeros((4*4, 4))  # 对应 4x4 的 map，大小为 4 的动作空间
# Q_Table_lake = np.load("Q_Table_FrozenLake-v1.npy")

# Train and Evaluate
Q_Table_lake = train(args, env, Q_Table_lake, False)
mean, std = evaluate(args, env, Q_Table_lake, 200)

# Visualize results
print("return: ", mean, " +/- ", std)
print(Q_Table_lake)

# Save QTable
np.save('Q_Table_FrozenLake-v1.npy', Q_Table_lake)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 845.56it/s]


return:  0.7  +/-  0.458257569495584
[[0.15565764 0.14476262 0.12950701 0.13043506]
 [0.05144439 0.08370384 0.05384178 0.1162699 ]
 [0.09442068 0.05603651 0.06188814 0.04557425]
 [0.01310573 0.02438691 0.01241492 0.04033215]
 [0.16055545 0.07958989 0.11431602 0.09357792]
 [0.         0.         0.         0.        ]
 [0.0839196  0.06588115 0.10805837 0.01147486]
 [0.         0.         0.         0.        ]
 [0.10311557 0.09491515 0.10684695 0.17468506]
 [0.12979347 0.20368354 0.15911093 0.15775257]
 [0.37011264 0.08312436 0.13431494 0.04757376]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.09799039 0.15211731 0.2374934  0.18624435]
 [0.34687268 0.57080549 0.37214129 0.41626629]
 [0.         0.         0.         0.        ]]


# Q-Learning: Taxi-v3

In [93]:
# Edit args
args = {
    "max_epsilon": 1.0,
    "min_epsilon": 0.05,
    "decay_rate": 0.0005
    , 
    "lr": 0.2,
    "Gamma": 1,
    
    "num_episode": 25000,
    "max_steps": 25
}

# Initializ env and QTable
env = gym.make('Taxi-v3')
Q_Table_taxi = np.zeros((env.observation_space.n, env.action_space.n)) 
# Q_Table_lake = np.load("Q_Table_FrozenLake-v1.npy")

# Train and Evaluate
Q_Table_taxi = train(args, env, Q_Table_taxi, True)
mean, std = evaluate(args, env, Q_Table_taxi, 100)

# Visualize results
print("return: ", mean, " +/- ", std)
print(Q_Table_taxi)

# Save QTable
np.save('Q_Table_Taxi-v3.npy', Q_Table_taxi)

100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [01:05<00:00, 380.93it/s]

return:  7.9  +/-  2.487971060924946
[[ 0.          0.          0.          0.          0.          0.        ]
 [ 8.99938372  0.          8.99792349  0.         11.          0.        ]
 [12.99999774  0.         12.99999793  0.         15.          0.        ]
 ...
 [ 0.          6.79142462  0.         13.99999364  0.          0.        ]
 [ 0.         -0.20976383  0.         11.99999522  0.          0.        ]
 [ 0.         15.80828339  0.         19.          0.          0.        ]]



