In [3]:
#pip install gymnasium
#pip install gymnasium[box2d] gymnasium[mujoco] gymnasium[atari] gymnasium[accept-rom-license]
#pip install omegaconf hydra-core
#pip install moviepy

In [1]:
import time
import random
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical

from omegaconf import OmegaConf

from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [2]:
import gymnasium as gym
from gymnasium.experimental.wrappers import RecordVideoV0 as RecordVideo # warppers for making game video


# Requirements
- Selecting environments
- Implementing PPO

# 1. Select your environment

In [3]:
# select your environment from the list
"""
environments list  
classic control env_id list:  ["Acrobot-v1", "MountainCar-v0", "Pendulum-v1"]  
box2d env_id list:            ["LunarLander-v2", "BipedalWalker-v3"]  
mujoco env_id list:           ["Swimmer-v4" , "Reacher-v4", "Hopper-v4", "Walker2d-v4", "Ant-v4", "HalfCheetah-v4", "HumanoidStandup-v4"]  
atari env_id list:            ["BreakoutNoFrameskip-v4", "MsPacmanNoFrameskip-v4", "PongNoFrameskip-v4"] (optional. not recommended for no gpu device)
"""

# Currently selected id (taken from the Atari row of the list above).
env_id = "PongNoFrameskip-v4"

In [4]:
# MuJoCo ids; make_env checks membership in this list to decide whether to add a TimeLimit wrapper.
mujoco_env_id = ["Swimmer-v4" , "Reacher-v4", "Hopper-v4", "Walker2d-v4", "Ant-v4", "HalfCheetah-v4", "HumanoidStandup-v4"]

### 1.1 experiment config, path config

### Try to change the experiment configurations except `max_episode_steps`.

In [5]:
# Experiment-level settings.
exp_config = OmegaConf.create({
    "seed": 0, # environment seed
    "num_envs": 8, # number of environments used for parallel training
    "num_eval": 10, # number of evaluations
    "max_episode_steps": 2048, # maximum number of steps per episode; used for MuJoCo environments. Don't change this value!
    "num_rollout_steps": 128, # number of policy rollout steps
    "num_minibatches": 4, # number of minibatches per epoch (not the minibatch size)
    "total_timesteps": 10000000, # total number of frames
    "print_interval": 100, # print interval of the episodic return
    "early_stop_wating_steps": 5000, # early-stopping waiting steps (the key spelling is a runtime key; keep it as-is)
})

# Output locations for logs, recorded videos and model checkpoints.
path_config = OmegaConf.create({
  "logs": Path("./runs"),
  "videos":Path("./videos"),
  "checkpoints": Path("./checkpoints"),
})


### 1.2 make_env function
For vectorized environments, we need a callable make_env function.

In [6]:
def make_env(env_id, exp_config, path_config: OmegaConf, evaluation=False, idx=0):
    """Build a zero-argument factory that creates one wrapped environment.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        exp_config: Experiment config; ``max_episode_steps`` is read when
            ``env_id`` is listed in ``mujoco_env_id``.
        path_config: Path config; ``videos`` is the root folder for recordings.
        evaluation: If True, videos go to ``<videos>/<env_id>/test`` and every
            episode is recorded; otherwise they go to ``<videos>/<env_id>/train``
            and the recorder's default trigger is used.
        idx: Index of the environment; only index 0 gets a video recorder.

    Returns:
        A callable ``thunk`` that builds and returns the wrapped environment.
    """
    video_root = Path(path_config.videos)

    def thunk():
        wrapped = gym.make(env_id, render_mode="rgb_array")

        # Recordings are separated into train/test sub-folders per environment id.
        sub_dir = Path(f"{env_id}/test") if evaluation else Path(f"{env_id}/train")
        video_save_path = str(video_root / sub_dir)

        if idx == 0:
            recorder_options = {"disable_logger": True}
            if evaluation:
                # Record every evaluation episode.
                recorder_options["episode_trigger"] = lambda x : True
            wrapped = RecordVideo(wrapped, video_save_path, **recorder_options)

        wrapped = gym.wrappers.RecordEpisodeStatistics(wrapped)

        # NOTE(review): gym.make may already apply the registered episode limit;
        # confirm that this outer TimeLimit has the intended effect.
        if env_id in mujoco_env_id:
            wrapped = gym.wrappers.TimeLimit(wrapped, exp_config.max_episode_steps)
        return wrapped

    return thunk

In [7]:
make_env(env_id, exp_config, path_config) # make_env returns a callable (the environment factory); no environment is built yet

<function __main__.make_env.<locals>.thunk()>

In [8]:
env = make_env(env_id, exp_config, path_config)() # <- Note: the trailing () calls the returned factory and builds the environment

In [9]:
# Vectorized environments for fast training
# https://gymnasium.farama.org/api/vector/
# The factories are collected in a list rather than a one-shot generator, so the
# collection handed to SyncVectorEnv can still be iterated after construction.
envs = gym.vector.SyncVectorEnv([
    make_env(env_id, exp_config, path_config, evaluation=False, idx=idx)
    for idx in range(exp_config.num_envs)
])


  logger.warn(


### Note whether the environment's spaces are discrete or continuous
`gymnasium.spaces.Box`: continuous space   
`gymnasium.spaces.Discrete`: discrete space


In [10]:
# Show the observation space and the action space, one per line.
print(env.observation_space, env.action_space, sep="\n")


Box(0, 255, (210, 160, 3), uint8)
Discrete(6)


# 2. make configurations
- environment
- ppo hyperparameters


### 2.1 make environment config
This configuration stores the environment information needed to build the neural network.

In [11]:
from gymnasium.spaces import Discrete, Box

def make_env_config(envs):
    """Describe the observation/action spaces of a vectorized environment.

    The first sub-environment (``envs.envs[0]``) is inspected and both of its
    spaces are printed.

    Args:
        envs: Vector environment exposing its sub-environments as ``envs.envs``.

    Returns:
        An OmegaConf config with the keys ``state_dim``, ``action_dim``,
        ``num_discretes`` and ``is_continuous``.

    Raises:
        NotImplementedError: If the action space is neither ``Box`` nor
            ``Discrete``.
    """
    env = envs.envs[0]
    observation_space = env.observation_space
    action_space = env.action_space
    print(observation_space)
    print(action_space)

    # * observation information
    if isinstance(observation_space, Discrete):  # discrete observation space
        # Cast to a plain int, consistent with the action fields below:
        # ``Discrete.n`` may be a NumPy integer, which OmegaConf may not accept.
        state_dim = int(observation_space.n)
    elif len(observation_space.shape) > 1:  # Atari visual observation case
        state_dim = observation_space.shape
    else:  # 1D vector observation case (classic control, box2d, mujoco)
        state_dim = observation_space.shape[0]

    # * action_space information
    if isinstance(action_space, Box):
        action_dim = action_space.shape[0]
        num_discretes = 0
        is_continuous = True
    elif isinstance(action_space, Discrete):
        action_dim = 1
        num_discretes = action_space.n
        is_continuous = False
    else:
        # Previously this fell through and failed later on an unbound local name.
        raise NotImplementedError(f"Unsupported action space: {action_space!r}")

    env_config = OmegaConf.create({"state_dim": state_dim,
                    "action_dim": int(action_dim),
                    "num_discretes": int(num_discretes),
                    "is_continuous": is_continuous})
    return env_config


In [12]:
# Build the environment description from the vectorized environments and display it.
env_config = make_env_config(envs)
env_config

Box(0, 255, (210, 160, 3), uint8)
Discrete(6)


{'state_dim': [210, 160, 3], 'action_dim': 1, 'num_discretes': 6, 'is_continuous': False}

### 2.2 ppo config
This configuration stores the hyperparameters used for training.

### Try to change the ppo configurations depending on selected environment

In [13]:
# PPO hyperparameters.
ppo_config = OmegaConf.create({
    "anneal_lr": False,
    "update_epochs": 4, # The number of iterations of ppo training
    "minibatch_size": 256, # NOTE(review): presumably num_envs * num_rollout_steps / num_minibatches (8 * 128 / 4 = 256); confirm against the training loop
    "lr": 2.5e-4,
    "max_grad_norm": 0.5, 
    "norm_adv": True,
    "clip_coef": 0.1,
    "ent_coef": 0.01,
    "vf_coef": 0.5,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    })
print(ppo_config)
print(ppo_config.minibatch_size)


{'anneal_lr': False, 'update_epochs': 4, 'minibatch_size': 256, 'lr': 0.00025, 'max_grad_norm': 0.5, 'norm_adv': True, 'clip_coef': 0.1, 'ent_coef': 0.01, 'vf_coef': 0.5, 'gamma': 0.99, 'gae_lambda': 0.95}
256


# 3. Implementing PPO

You should do the following to implement PPO.
- Complete the ActorCritic class
- Implement the generalized advantage calculation part
- Implement the PPO loss and Value loss calculation part

### 3.1 ActorCritic Class

### You can create a neural network as you want to implement it.

In [14]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place: orthogonal weights and a constant bias.

    Args:
        layer: Module with ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        std: Gain passed to the orthogonal initialiser.
        bias_const: Value every bias element is set to.

    Returns:
        The same ``layer`` object, so the call can be nested in a constructor.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer
        

class ActorCritic(nn.Module):
    """Actor-critic network with a shared convolutional trunk.

    The trunk is a three-layer CNN followed by a linear layer; its first
    convolution takes one input channel and the linear layer expects a 7x7
    feature map (which is what the conv stack yields for 84x84 inputs).
    A value head and a policy head sit on top of the shared features:
    a Normal policy for continuous actions, a Categorical one otherwise.
    """

    def __init__(self, env_config):
        """Build the network from the environment description.

        Args:
            env_config: Mapping with the keys ``state_dim``, ``action_dim``,
                ``num_discretes`` and ``is_continuous``.
        """
        super().__init__()
        self.state_dim = env_config["state_dim"]
        self.action_dim = env_config["action_dim"]
        self.num_discretes = env_config["num_discretes"]
        self.is_continuous = env_config["is_continuous"]

        ###################### Implement here : 1. Neural Network ########################
        hidden = 64
        # Layers are created in a fixed order so that parameter initialisation
        # consumes the random number generator in the same sequence.
        trunk_layers = [
            layer_init(nn.Conv2d(1, hidden, kernel_size=8, stride=4, padding=0)),
            nn.ReLU(),
            layer_init(nn.Conv2d(hidden, hidden, kernel_size=4, stride=2, padding=0)),
            nn.ReLU(),
            layer_init(nn.Conv2d(hidden, hidden, kernel_size=3, stride=1, padding=0)),
            nn.ReLU(),
            nn.Flatten(),
            layer_init(nn.Linear(hidden * 7 * 7, hidden)),
            nn.ReLU(),
        ]
        self.sharedNet = nn.Sequential(*trunk_layers)

        self.critic = layer_init(nn.Linear(hidden, 1))

        if self.is_continuous:
            self.actor_mean = layer_init(nn.Linear(hidden, self.action_dim))
            # State-independent log standard deviation, one entry per action dimension.
            self.actor_logstd = nn.Parameter(torch.zeros(1, self.action_dim))
        else:
            # The head ends in a softmax, so it outputs probabilities (not raw logits).
            policy_head = layer_init(nn.Linear(hidden, self.action_dim * self.num_discretes))
            self.actor_logit = nn.Sequential(policy_head, nn.Softmax(dim=-1))

    def get_value(self, x):
        """Return the critic's value estimate for observations ``x``."""
        return self.critic(self.sharedNet(x))

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations ``x``.

        Args:
            x: Batch of observations fed to the shared trunk.
            action: Optional actions to evaluate; when None, actions are
                sampled from the policy distribution.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``. For continuous
            actions, log-probability and entropy are summed over the last
            dimension.
        """
        ###################### Implement here : policy distribution ########################
        features = self.sharedNet(x)

        if self.is_continuous:
            mean = self.actor_mean(features)
            std = self.actor_logstd.expand_as(mean).exp()
            dist = Normal(mean, std)
        else:
            # Softmax output is passed as probabilities (first positional
            # argument of Categorical is ``probs``).
            dist = Categorical(probs=self.actor_logit(features))

        if action is None:
            action = dist.sample()

        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        if self.is_continuous:
            # Independent action dimensions: sum per-dimension terms.
            log_prob = log_prob.sum(-1)
            entropy = entropy.sum(-1)

        return action, log_prob, entropy, self.critic(features)
        
        
def save_model(env_id, path_cfg, actor_critic, update):
    """Save the model's ``state_dict`` to ``<checkpoints>/<env_id>/PPO_<update>.pt``.

    Args:
        env_id: Environment id; used as the sub-folder name.
        path_cfg: Path config; ``checkpoints`` is the root checkpoint folder.
        actor_critic: Module whose ``state_dict()`` is saved.
        update: Value embedded in the checkpoint file name.
    """
    ckpt_path = Path(path_cfg.checkpoints) / Path(f"{env_id}")
    # parents=True: the root checkpoint folder may not exist yet on a fresh run.
    # exist_ok=True: replaces the separate exists() check.
    ckpt_path.mkdir(parents=True, exist_ok=True)
    model_path = ckpt_path / Path(f"PPO_{update}.pt")
    torch.save(actor_critic.state_dict(), str(model_path))
    print(f"model saved to {model_path}")

In [15]:
from dataclasses import dataclass
from typing import List

@dataclass
class GlobalLogger:
    """Plain record of run histories; every field is annotated as ``List``.

    All fields are required and positional in the order declared below.
    """

    # Step counters
    global_steps: List
    save_update_steps: List
    episodic_return_steps: List

    # Episodic returns
    train_episodic_return: List
    test_episodic_return: List

    # Losses
    policy_loss: List
    value_loss: List
    entropy_loss: List

## 3.2 train 

### Implement the GAE calculation and the PPO loss and value loss parts by referring to the pictures below.

### GAE: https://arxiv.org/abs/1506.02438


### PPO: https://arxiv.org/abs/1707.06347



### GAE calculation

![](GAE_calculation.png)

#### PPO Clipped loss

![](PPO_loss.png)

### You can refer to any open source code and complete it. 
GAE references:  
- https://towardsdatascience.com/generalized-advantage-estimate-maths-and-code-b5d5bd3ce737  
- https://danieltakeshi.github.io/2017/04/02/notes-on-the-generalized-advantage-estimation-paper/  

PPO references:  
- https://spinningup.openai.com/en/latest/algorithms/ppo.html  