In [None]:
# Environment setup cell: system packages first, then Python packages.
# NOTE(review): `fastprogress` is imported later in this notebook but is not installed here —
# presumably it is preinstalled in the target runtime; confirm.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# gym / atari-py / imageio are pinned to exact versions; colabgymrender is not.
!pip install colabgymrender imageio==2.4.1 atari-py==0.2.6 gym==0.17.3
!apt-get install x11-utils > /dev/null 2>&1 
!pip install pyglet > /dev/null 2>&1 
# NOTE(review): xvfb and python-opengl were already requested by the first apt-get line;
# the next two lines look redundant.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!apt install xvfb -y
!pip install pyvirtualdisplay
# NOTE(review): "piglet" is a different PyPI project from "pyglet" (installed above) —
# presumably a typo; confirm before removing.
!pip install piglet
!pip install ptan pytorch-ignite pybullet
# Uninstall whatever torch build is present, then reinstall with the CUDA 11.6 wheel index added.
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colabgymrender
  Downloading colabgymrender-1.1.0.tar.gz (3.5 kB)
Collecting imageio==2.4.1
  Downloading imageio-2.4.1.tar.gz (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 12.7 MB/s 
[?25hCollecting atari-py==0.2.6
  Downloading atari-py-0.2.6.tar.gz (790 kB)
[K     |████████████████████████████████| 790 kB 51.2 MB/s 
[?25hCollecting gym==0.17.3
  Downloading gym-0.17.3.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 81.8 MB/s 
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 51.3 MB/s 
Reason for being yanked: re-release with new wheels[0m
Building wheels for collected packages: imageio, atari-py, gym, colabgymrender
  Building wheel for imageio (setup.py) ... [?25l[?25hdone
  Created wheel for imageio: filename=imageio-2.4.1-py3-none-any.whl size=33

In [None]:
import collections
from typing import Union
import math
import random
from copy import deepcopy
from typing import Optional
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torch.nn.utils as nn_utils
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
import pybullet_envs
from colabgymrender.recorder import Recorder
from fastprogress import progress_bar as pb
import matplotlib.pyplot as plt

In [None]:
class ExpBuffer:
    """Fixed-capacity ring buffer of transitions with cached n-step returns.

    Each stored transition is a ``(state, action, reward, done, next_state)``
    tuple. Alongside the transitions the buffer keeps, per slot:

    * ``_total_discounted_rewards`` - the discounted sum of up to ``n_step``
      rewards starting at that slot (``NaN`` while fewer than ``n_step``
      following transitions are known and no episode end was seen), and
    * ``_last_states`` - the state associated with the end of that n-step
      window (``NaN`` under the same condition).

    Args:
        max_size: Maximum number of transitions kept; older ones are overwritten.
        prob_alpha: Stored as ``_prob_alpha``; not otherwise used by this class.
        beta_start: Initial value of the annealed ``beta`` coefficient.
        beta_frames: Number of steps over which ``beta`` grows linearly to 1.0.
        n_step: Length of the n-step return window.
        gamma: Per-step discount factor.
    """

    def __init__(
        self,
        max_size: int = 20000,
        prob_alpha: float = 0.6,
        beta_start: float = 0.4,
        beta_frames: float = 100000,
        n_step: int = 4,
        gamma: float = 0.99,
    ):
        self._prob_alpha = prob_alpha
        self._max_size = max_size
        self._buf = []
        self._pos = 0
        self._beta_start = beta_start
        self._beta = beta_start
        self._beta_frames = beta_frames
        self._n_step = n_step
        self._gamma = gamma
        self._total_discounted_rewards = np.array([np.nan]*max_size)
        self._last_states = [np.nan]*max_size
        # Backing store for update_priorities(); previously this attribute was
        # never created, so update_priorities() raised AttributeError.
        self._priorities = np.zeros(max_size, dtype=np.float32)

    def update_bata(self, idx) -> float:
        """Anneal ``beta`` linearly with step index ``idx`` and return it (capped at 1.0).

        The misspelled name is kept because existing callers use it.
        """
        beta = self._beta_start + idx * (1.0 - self._beta_start) / self._beta_frames
        self._beta = min(1.0, beta)
        return self._beta

    # Correctly spelled alias; `update_bata` remains for backward compatibility.
    update_beta = update_bata

    def __len__(self):
        """Number of transitions currently stored (at most ``max_size``)."""
        return len(self._buf)

    def append(
        self,
        state: np.ndarray,
        action: int,
        reward: Union[int, float],
        done: bool,
        next_state: np.ndarray,
    ) -> None:
        """Store one transition and refresh the n-step caches it affects.

        The newest ``n_step`` slots are revisited from newest to oldest. The
        running return is restarted at every transition flagged ``done`` so
        that rewards of the following episode never leak into the return of
        an episode that has already ended.
        """
        transition = (state, action, reward, done, next_state)
        if len(self._buf) < self._max_size:
            self._buf.append(transition)
        else:
            self._buf[self._pos] = transition

        if len(self._buf) >= self._n_step:
            dis_r = 0.0
            last_state = self._buf[self._pos][0]
            for i in range(self._n_step):
                # Negative indices wrap around, which is exactly the ring-buffer
                # predecessor once the buffer is full.
                slot = self._pos - i
                step_state, _, r, step_done, _ = self._buf[slot]
                if step_done:
                    # Episode boundary: everything accumulated so far belongs
                    # to a later episode, so the return restarts here.
                    dis_r = r
                    last_state = step_state
                else:
                    dis_r = r + self._gamma * dis_r
                self._total_discounted_rewards[slot] = dis_r
                self._last_states[slot] = last_state

            # Slots newer than the oldest one in the window do not have a full
            # n-step return yet; invalidate them unless an episode end makes
            # their (truncated) return final.
            for i in range(self._n_step-1):
                if self._buf[self._pos - i][3]:
                    break
                self._total_discounted_rewards[self._pos - i] = np.nan
                self._last_states[self._pos - i] = np.nan

        self._pos = (self._pos + 1) % self._max_size

    def get_latest_n(self, n: int):
        """Return the ``n`` most recent transitions, skipping the newest ``n_step`` slots.

        Returns:
            Tuple ``(states, actions, rewards, dones, total_discounted_rewards,
            last_states)``; each element has ``n`` entries in chronological order.

        Raises:
            RuntimeError: If the buffer is not yet full and holds fewer than
                ``n + n_step`` transitions.
            AssertionError: If the computed window does not contain ``n`` entries.
        """
        if len(self._buf) < self._max_size:
            if len(self._buf) < n+self._n_step:
                raise RuntimeError('get_latest_n : len(self._buf) < n+self._n_step')
            s_idx = self._pos - self._n_step - n
            e_idx = self._pos - self._n_step
            latest_exps = self._buf[s_idx:e_idx]
            latest_total_rewards = self._total_discounted_rewards[s_idx:e_idx]
            latest_last_states = self._last_states[s_idx:e_idx]
        else:
            # Duplicate the storage so a window that wraps past the end of the
            # ring can be taken with a single contiguous slice.
            s_idx = self._pos + self._max_size - self._n_step - n
            e_idx = self._pos + self._max_size - self._n_step
            latest_exps = (self._buf*2)[s_idx:e_idx]
            latest_total_rewards = np.concatenate([self._total_discounted_rewards]*2)[s_idx:e_idx]
            latest_last_states = (self._last_states*2)[s_idx:e_idx]
        assert len(latest_exps) == n, (
            f'get_latest_n : expected {n} transitions, got {len(latest_exps)} '
            f'(slice {s_idx}:{e_idx}, stored {len(self._buf)})'
        )
        assert len(latest_total_rewards) == n
        assert len(latest_last_states) == n
        states, actions, rewards, dones, _ = zip(*latest_exps)
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        dones = np.array(dones)
        last_states = np.stack(latest_last_states)
        return states, actions, rewards, dones, latest_total_rewards, last_states

    def update_priorities(self, sample_indices: np.ndarray, sample_priorities: np.ndarray) -> None:
        """Overwrite the stored priorities at ``sample_indices`` with ``sample_priorities``."""
        self._priorities[sample_indices] = sample_priorities

    @property
    def gamma(self) -> float:
        """Per-step discount factor."""
        return self._gamma

    @property
    def n_step(self) -> int:
        """Length of the n-step return window."""
        return self._n_step

In [None]:
class ModelActor(nn.Module):
    """Policy network producing the mean of the action distribution.

    Three linear layers, each followed by ``Tanh``, map an observation of
    width ``obs_size`` to ``act_size`` values in ``[-1, 1]``. ``logstd`` is a
    separate learnable vector (one entry per action dimension, initialised to
    zero) that ``forward`` does not use.
    """

    def __init__(self, obs_size, act_size, hidden_size = 64):
        super().__init__()

        widths = [obs_size, hidden_size, hidden_size, act_size]
        stack = []
        # Layers are created input-to-output so parameter initialisation
        # consumes the RNG in the same order as a literal Sequential(...).
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.Tanh())
        self.mu = nn.Sequential(*stack)
        self.logstd = nn.Parameter(torch.zeros(act_size))

    def forward(self, x):
        """Return the action means for the batch of observations ``x``."""
        return self.mu(x)

In [None]:
class ModelCritic(nn.Module):
    """State-value network: two ReLU hidden layers followed by a scalar output."""

    def __init__(self, obs_size, hidden_size = 64):
        super().__init__()

        hidden_layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        ]
        # No activation after the last layer, so the output is unbounded.
        output_layer = nn.Linear(hidden_size, 1)
        self.value = nn.Sequential(*hidden_layers, output_layer)

    def forward(self, x):
        """Return one value estimate per row of ``x`` (trailing dimension of size 1)."""
        return self.value(x)

In [None]:
def calc_adv_ref(rewards, dones, net_crt, states_v, gamma, gae_lambda, device="cpu"):
    """Compute GAE advantages and critic targets for a trajectory.

    The last trajectory entry is used only to supply the "next value" of its
    predecessor, so both results have ``len(trajectory) - 1`` elements.

    Args:
        rewards: Per-step rewards, indexable and sliceable (list or array).
        dones: Per-step episode-termination flags, same length as ``rewards``.
        net_crt: Callable mapping ``states_v`` to one value per state.
        states_v: Tensor of states, one row per trajectory step.
        gamma: Discount factor.
        gae_lambda: GAE smoothing coefficient.
        device: Device the returned tensors are moved to.

    Returns:
        ``(adv_v, ref_v)``: float32 tensors holding the advantages and the
        reference values (advantage + value estimate).
    """
    # The values are only used as constants, so skip building an autograd graph.
    with torch.no_grad():
        values_v = net_crt(states_v)
    # reshape(-1) always yields a 1-D array; a bare squeeze() would collapse a
    # single-state batch to 0-d and make the slicing below fail.
    values = values_v.detach().reshape(-1).cpu().numpy()

    last_gae = 0.0
    advantages = []
    references = []
    # Walk the trajectory backwards so each step can reuse the GAE of its successor.
    for val, next_val, reward, done in zip(reversed(values[:-1]),
                                           reversed(values[1:]),
                                           reversed(rewards[:-1]),
                                           reversed(dones[:-1])):
        if done:
            # Terminal step: no bootstrap and no carry-over from the next episode.
            delta = reward - val
            last_gae = delta
        else:
            delta = reward + gamma * next_val - val
            last_gae = delta + gamma * gae_lambda * last_gae
        advantages.append(last_gae)
        references.append(last_gae + val)

    # Restore chronological order.
    advantages.reverse()
    references.reverse()
    adv_v = torch.tensor(advantages, dtype=torch.float32)
    ref_v = torch.tensor(references, dtype=torch.float32)
    return adv_v.to(device), ref_v.to(device)

In [None]:
def calc_logprob(mu_v, logstd_v, actions_v):
    """Element-wise Gaussian log-density of ``actions_v`` around ``mu_v``.

    ``exp(logstd_v)`` is used as the spread term in both the quadratic and the
    normalisation part; only the quadratic part's denominator is clamped to
    at least ``1e-3``.
    """
    spread = torch.exp(logstd_v)
    squared_error = (mu_v - actions_v) ** 2
    quadratic_term = -squared_error / (2 * spread.clamp(min=1e-3))
    normalisation_term = -torch.log(torch.sqrt(2 * math.pi * spread))
    return quadratic_term + normalisation_term

In [None]:
class Agent:
    """Collects experience from an environment and trains an actor/critic pair with clipped PPO.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple) and ``action_space.sample()``.
        exp_buffer: Buffer receiving every transition; must provide
            ``append``, ``update_bata``, ``get_latest_n``, ``gamma`` and ``n_step``.
        act_net: Policy network; must return action means and expose ``logstd``.
        crt_net: Value network.
        epsilon_start: Initial probability of taking a uniformly sampled action.
        epsilon_final: Lower bound of that probability.
        epsilon_decay_last_step: Number of steps over which epsilon decays linearly.
        act_learning_rate: Adam learning rate of the actor.
        crt_learning_rate: Adam learning rate of the critic.
        adam_eps: Optional ``eps`` passed to both Adam optimizers.
        device: Device used for the tensors created by the agent.
        trajectory_size: Number of environment steps collected per ``train`` call.
        ppo_eps: PPO clipping range.
        gae_lambda: GAE smoothing coefficient.

    NOTE(review): actions are the deterministic network mean mixed with
    epsilon-uniform exploration rather than samples drawn using ``logstd`` -
    confirm this is intended, since the PPO ratio is computed from Gaussian
    log-probabilities.
    """

    def __init__(
        self,
        env,
        exp_buffer: "ExpBuffer",
        act_net: nn.Module,
        crt_net: nn.Module,
        epsilon_start: float = 1.0,
        epsilon_final: float = 0.01,
        epsilon_decay_last_step: int = 200000,
        act_learning_rate: float = 1e-5,
        crt_learning_rate: float = 1e-4,
        adam_eps: Optional[float] = None,
        device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
        trajectory_size: int = 2049,
        ppo_eps: float = 0.2,
        gae_lambda: float = 0.95,
    ):
        self._env = env
        self._exp_buffer = exp_buffer
        self._act_net = act_net
        self._crt_net = crt_net
        self._epsilon_start = epsilon_start
        self._epsilon_final = epsilon_final
        self._epsilon_decay_last_step = epsilon_decay_last_step
        self._epsilon = epsilon_start
        self._device = device
        self._total_step = 0
        self._total_trained_samples = 0
        # Only forward `eps` when the caller set it, so Adam's own default applies otherwise.
        adam_kwargs = {}
        if adam_eps is not None:
            adam_kwargs['eps'] = adam_eps
        self._act_optimizer = torch.optim.Adam(self._act_net.parameters(), lr=act_learning_rate, **adam_kwargs)
        self._crt_optimizer = torch.optim.Adam(self._crt_net.parameters(), lr=crt_learning_rate, **adam_kwargs)
        self._reset_episode()
        self._episode = 0
        self._trajectory_size = trajectory_size
        self._ppo_eps = ppo_eps
        self._gae_lambda = gae_lambda

    def _reset_episode(self):
        """Reset the environment and the running episode reward."""
        self._state = self._env.reset()
        self._total_reward = 0.0

    @torch.no_grad()
    def play_step(self, epsilon: Optional[float] = None, sync_target: bool = True):
        """Take one environment step and store the transition in the buffer.

        Args:
            epsilon: Probability of a uniformly sampled action; defaults to the
                agent's current epsilon.
            sync_target: Unused; accepted for compatibility with existing callers.

        Returns:
            The total reward of the episode if this step ended it, else ``None``.
        """
        if epsilon is None:
            epsilon = self._epsilon
        done_reward = None

        if np.random.random() < epsilon:
            action = self._env.action_space.sample()
        else:
            # np.asarray + expand_dims adds the batch axis without relying on
            # `np.array(..., copy=False)`, which raises under NumPy 2 whenever
            # a copy is unavoidable.
            state_a = np.expand_dims(np.asarray(self._state), axis=0)
            state_v = torch.tensor(state_a).float().to(self._device)
            mu_v = self._act_net(state_v)
            action = mu_v.squeeze(dim=0).cpu().numpy()
            action = np.clip(action, -1, 1)

        next_state, reward, is_done, _ = self._env.step(action)
        self._total_reward += reward

        self._exp_buffer.append(
            self._state, action, reward, is_done, next_state
        )
        self._state = next_state
        if is_done:
            done_reward = self._total_reward
            self._reset_episode()

        self._total_step += 1
        self._update_epsilon(self._total_step)
        self._exp_buffer.update_bata(self._total_step)

        return done_reward

    def train(self, ppo_epoch: int = 10, batch_size: int = 32) -> None:
        """Collect ``trajectory_size`` steps, then run ``ppo_epoch`` passes of PPO updates.

        The last collected transition only provides the bootstrap value for
        its predecessor, so ``trajectory_size - 1`` samples are trained on.
        ``_total_trained_samples`` is advanced by the number of samples
        processed across all epochs.

        NOTE(review): the buffer's cached n-step returns are passed to the GAE
        computation together with ``gamma ** n_step``; this matches plain
        one-step GAE only when ``n_step == 1`` - confirm before using larger
        values.
        """
        n_step_gamma = self._exp_buffer.gamma ** self._exp_buffer.n_step

        for _ in range(self._trajectory_size):
            done_reward = self.play_step()
            if done_reward is not None:
                print(f'episode : {self._episode}, done reward : {done_reward}, total_step : {self._total_step}, cur_epsilon : {self._epsilon}')
                self._episode += 1

        traj_states, traj_actions, _, traj_dones, traj_total_discounted_rewards, _ = \
            self._exp_buffer.get_latest_n(self._trajectory_size)
        traj_states_v = torch.tensor(traj_states).float().to(self._device)
        traj_actions_v = torch.tensor(traj_actions).float().to(self._device)

        # Everything computed here is a fixed target for the updates below.
        with torch.no_grad():
            traj_adv_v, traj_ref_v = calc_adv_ref(
                traj_total_discounted_rewards, traj_dones, self._crt_net, traj_states_v,
                gamma=n_step_gamma, gae_lambda=self._gae_lambda, device=self._device,
            )
            # Advantages exist for all but the last step; trim states and actions
            # to the same length so every minibatch slice lines up, whatever the
            # batch size.
            n_samples = traj_adv_v.shape[0]
            traj_states_v = traj_states_v[:n_samples]
            traj_actions_v = traj_actions_v[:n_samples]
            old_logprob_v = calc_logprob(
                self._act_net(traj_states_v), self._act_net.logstd, traj_actions_v
            )

        # Normalize advantages; the epsilon guards against a zero standard deviation.
        traj_adv_v = (traj_adv_v - torch.mean(traj_adv_v)) / (torch.std(traj_adv_v) + 1e-8)

        for _ in range(ppo_epoch):
            for batch_ofs in range(0, n_samples, batch_size):
                batch_l = batch_ofs + batch_size
                states_v = traj_states_v[batch_ofs:batch_l]
                actions_v = traj_actions_v[batch_ofs:batch_l]
                batch_adv_v = traj_adv_v[batch_ofs:batch_l].unsqueeze(-1)
                batch_ref_v = traj_ref_v[batch_ofs:batch_l]
                batch_old_logprob_v = old_logprob_v[batch_ofs:batch_l]

                # critic training
                self._crt_optimizer.zero_grad()
                value_v = self._crt_net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), batch_ref_v)
                loss_value_v.backward()
                self._crt_optimizer.step()

                # actor training: clipped surrogate objective
                self._act_optimizer.zero_grad()
                mu_v = self._act_net(states_v)
                logprob_pi_v = calc_logprob(mu_v, self._act_net.logstd, actions_v)
                ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v)
                surr_obj_v = batch_adv_v * ratio_v
                c_ratio_v = torch.clamp(ratio_v, 1.0 - self._ppo_eps, 1.0 + self._ppo_eps)
                clipped_surr_v = batch_adv_v * c_ratio_v
                loss_policy_v = -torch.min(surr_obj_v, clipped_surr_v).mean()
                loss_policy_v.backward()
                self._act_optimizer.step()

        self._total_trained_samples += n_samples * ppo_epoch

    def initial_exploration(self, n_steps: int = 10000, epsilon: float = 1.0) -> None:
        """Play ``n_steps`` steps with a fixed ``epsilon`` without advancing the step counter.

        The step counter and epsilon are restored afterwards; the transitions
        remain in the buffer.
        """
        eps_bak = self._epsilon
        total_step_bak = self._total_step
        for _ in pb(range(n_steps)):
            # play_step rewrites self._epsilon after each step, so the exploration
            # rate is passed explicitly instead of being read from the attribute.
            self.play_step(epsilon=epsilon, sync_target=False)
        self._total_step = total_step_bak
        self._epsilon = eps_bak

    def simulate_episode(self) -> float:
        """Play with ``epsilon = 0`` until an episode ends and return its total reward.

        The step counter and epsilon are restored afterwards; the transitions
        remain in the buffer.
        """
        eps_bak = self._epsilon
        total_step_bak = self._total_step
        done_reward = None
        while done_reward is None:
            done_reward = self.play_step(epsilon=0.0, sync_target=False)
        self._total_step = total_step_bak
        self._epsilon = eps_bak
        return done_reward

    def _update_epsilon(self, step_index: int) -> None:
        """Set epsilon by linear decay from ``epsilon_start``, floored at ``epsilon_final``."""
        self._epsilon = max(
            self._epsilon_final,
            self._epsilon_start - step_index / self._epsilon_decay_last_step
        )

In [None]:
class SkipFrame(gym.Wrapper):
    """Repeat each action for ``skip`` environment steps and sum the rewards.

    Args:
        env: Environment to wrap.
        skip: Number of inner steps per outer step; must be at least 1.

    Raises:
        ValueError: If ``skip`` is smaller than 1. Without this check the first
            call to ``step`` would fail with ``UnboundLocalError`` because the
            inner loop never runs.
    """

    def __init__(self, env, skip):
        super().__init__(env)
        if skip < 1:
            raise ValueError(f'skip must be >= 1, got {skip}')
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times, stopping early when the episode ends.

        Returns:
            The last observation and info dict, the summed reward, and the
            last ``done`` flag.
        """
        total_reward = 0.0
        done = False
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Observation wrapper that converts image observations to grayscale tensors.

    The declared observation space keeps only the first two dimensions of the
    wrapped space (uint8, 0..255). The observations actually returned are
    float ``torch`` tensors produced by ``torchvision``'s ``Grayscale``.
    NOTE(review): the declared space and the returned tensor differ in dtype
    and presumably in shape (a leading channel axis) - confirm downstream code
    does not rely on the declared space.
    """

    def __init__(self, env):
        super().__init__(env)
        spatial_dims = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=spatial_dims, dtype=np.uint8)

    def permute_orientation(self, observation):
        """Move axis 2 to the front and return the result as a float tensor."""
        channels_first = np.transpose(observation, (2, 0, 1))
        # .copy() materialises the transposed view before handing it to torch.
        return torch.tensor(channels_first.copy(), dtype=torch.float)

    def observation(self, observation):
        """Return the grayscale version of ``observation``."""
        to_gray = T.Grayscale()
        return to_gray(self.permute_orientation(observation))


class ResizeObservation(gym.ObservationWrapper):
    """Observation wrapper that resizes observations and scales them by 1/255.

    Args:
        env: Environment to wrap.
        shape: Target size; an int is expanded to ``(shape, shape)``, any other
            value is converted with ``tuple(shape)``.
    """

    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape) if isinstance(shape, int) else tuple(shape)

        # Keep whatever dimensions follow the first two of the wrapped space.
        trailing_dims = self.observation_space.shape[2:]
        self.observation_space = Box(
            low=0, high=255, shape=self.shape + trailing_dims, dtype=np.uint8
        )

    def observation(self, observation):
        """Resize ``observation``, normalise it with mean 0 / std 255, and drop a leading size-1 axis."""
        pipeline = T.Compose(
            [T.Resize(self.shape), T.Normalize(0, 255)]
        )
        resized = pipeline(observation)
        return resized.squeeze(0)


class LazyFramesToNumpy(gym.ObservationWrapper):
    """Observation wrapper that returns ``observation.__array__()`` for every observation.

    Presumably meant to follow ``FrameStack`` so that its lazy frame objects
    are materialised as arrays - confirm against the wrapper order used.
    """

    def __init__(self, env):
        super().__init__(env)

    def observation(self, observation):
        """Materialise ``observation`` through its ``__array__`` protocol method."""
        materialised = observation.__array__()
        return materialised

In [None]:
# Seed passed to the random-number generators in the setup cell that follows.
SEED = 77

In [None]:
random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!rm -rf /content/video

env = gym.make("MinitaurBulletEnv-v0")
# env = gym.make("HalfCheetahBulletEnv-v0")
# env = SkipFrame(env, skip=4)
# env = GrayScaleObservation(env)
# env = ResizeObservation(env, shape=224)
# env = FrameStack(env, num_stack=4)
# env = LazyFramesToNumpy(env)
directory = './video'
# env = Recorder(env, directory)




urdf_root=/usr/local/lib/python3.8/dist-packages/pybullet_data


In [None]:
# Show the observation shape and the action space of the environment.
env.observation_space.shape, env.action_space

((28,), Box(-1.0, 1.0, (8,), float32))

In [None]:
# Draw one random action to see what an action looks like.
env.action_space.sample()

array([-0.33841726, -0.80740476,  0.8276518 ,  0.1997258 , -0.6613464 ,
       -0.13442777, -0.7857756 , -0.99019015], dtype=float32)

In [None]:
# Reset the environment and keep the initial observation for the sanity checks further down.
state = env.reset()

In [None]:
# Display the initial observation.
state

array([ 1.49989569e+00,  1.49013200e+00,  1.49495412e+00,  1.48699552e+00,
        1.50238583e+00,  1.50638688e+00,  1.50203090e+00,  1.50311564e+00,
        7.23808034e-01,  7.25649176e-01,  7.43886318e-01,  7.46169736e-01,
        6.44598285e-01,  6.27585942e-01,  6.64795630e-01,  6.50136923e-01,
        4.57818224e-01,  5.36742116e-01,  4.95031151e-01,  5.59108963e-01,
        4.48232540e-01,  4.18515116e-01,  4.48174575e-01,  4.41645809e-01,
       -1.21318040e-03,  1.01598436e-03, -3.26394460e-04,  9.99998695e-01])

In [None]:
# Build the actor and critic, sized from the environment's observation and action spaces,
# and move them to the selected device.
act_net = ModelActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
crt_net = ModelCritic(env.observation_space.shape[0]).to(device)

In [None]:
# Experience buffer with n_step=1, so the cached n-step return of a transition is just its reward.
exp_buffer = ExpBuffer(
    max_size=20000,
    prob_alpha=0.6,
    beta_start=0.4,
    beta_frames=30000, #100000,
    n_step=1,
    gamma=0.99,
)
# Agent wiring: epsilon decays linearly from 0.6 down to 0.002 over 30000 steps, and each
# train() call collects 2049 steps.
agent = Agent(
    env=env,
    exp_buffer=exp_buffer,
    crt_net=crt_net,
    act_net=act_net,
    epsilon_start=0.6,
    epsilon_final=0.002,
    epsilon_decay_last_step=30000, #200000,
    # tgt_sync_steps=1000,
    act_learning_rate=1e-5,
    crt_learning_rate=1e-4,
    device=device,
    trajectory_size=2049,
    ppo_eps=0.2,
    gae_lambda=0.95,
)

In [None]:
# Sanity check: run the untrained actor on the initial observation (batch of one).
act_net(torch.tensor(state).unsqueeze(0).float().to(device))

tensor([[ 0.0821, -0.0059,  0.0365,  0.0652, -0.1149,  0.1589, -0.0624, -0.3860]],
       grad_fn=<TanhBackward0>)

In [None]:
# Sanity check: run the untrained critic on the initial observation (batch of one).
crt_net(torch.tensor(state).unsqueeze(0).float().to(device))

tensor([[-0.0532]], grad_fn=<AddmmBackward0>)

In [None]:
# Pre-fill the buffer with 100 exploration steps (initial_exploration defaults to epsilon=1.0).
agent.initial_exploration(n_steps=100)

In [None]:
# Delete the contents of the video directory before training starts.
!rm -rf /content/video/*

In [None]:
# Training loop: repeats PPO iterations until the kernel is interrupted.
# NOTE(review): `episode` is not read by this loop; the episode number printed during
# training is tracked inside the agent.
episode = 0

while True:
    agent.train(ppo_epoch=10, batch_size=32)

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
episode : 79449, done reward : 0.2398562642583173, total_step : 5710497, cur_epsilon : 0.002
episode : 79450, done reward : 0.21668861950189594, total_step : 5710555, cur_epsilon : 0.002
episode : 79451, done reward : 0.22277547611094867, total_step : 5710613, cur_epsilon : 0.002
episode : 79452, done reward : 0.2095580508228075, total_step : 5710666, cur_epsilon : 0.002
episode : 79453, done reward : 0.20486775864888768, total_step : 5710726, cur_epsilon : 0.002
episode : 79454, done reward : 0.2115148479606437, total_step : 5710791, cur_epsilon : 0.002
episode : 79455, done reward : 0.14410709510027805, total_step : 5710864, cur_epsilon : 0.002
episode : 79456, done reward : 0.20313423636122696, total_step : 5710926, cur_epsilon : 0.002
episode : 79457, done reward : 0.17414804155181615, total_step : 5710982, cur_epsilon : 0.002
episode : 79458, done reward : 0.19937229462002748, total_step : 5711036, cur_epsilon : 0.002
episode : 79459, 

In [None]:
# NOTE(review): this cell repeats the training loop of the previous cell verbatim. The saved
# output beneath it starts again at episode 0, so it presumably comes from a separate run —
# confirm which cell should be kept.
episode = 0

while True:
    agent.train(ppo_epoch=10, batch_size=32)

episode : 0, done reward : -1312.6234209558406, total_step : 900, cur_epsilon : 0.57
episode : 1, done reward : -1248.226950091122, total_step : 1900, cur_epsilon : 0.5366666666666666
episode : 2, done reward : -1105.2950415327832, total_step : 2900, cur_epsilon : 0.5033333333333333
episode : 3, done reward : -1313.9245419070935, total_step : 3900, cur_epsilon : 0.47
episode : 4, done reward : -1195.9975815302284, total_step : 4900, cur_epsilon : 0.43666666666666665
episode : 5, done reward : -1343.3318600393213, total_step : 5900, cur_epsilon : 0.4033333333333333
episode : 6, done reward : -1230.2320665811346, total_step : 6900, cur_epsilon : 0.37
episode : 7, done reward : -1142.8569294719257, total_step : 7900, cur_epsilon : 0.33666666666666667
episode : 8, done reward : -1127.1448600351264, total_step : 8900, cur_epsilon : 0.3033333333333333
episode : 9, done reward : -1144.9716722951334, total_step : 9900, cur_epsilon : 0.26999999999999996
episode : 10, done reward : -1177.5259266

KeyboardInterrupt: ignored