In [None]:
# Environment setup cell: installs system and Python packages via shell commands.
# System packages xvfb, python-opengl and ffmpeg (output silenced).
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# Python packages; imageio, atari-py and gym are pinned to exact versions.
!pip install colabgymrender imageio==2.4.1 atari-py==0.2.6 gym==0.17.3
!apt-get install x11-utils > /dev/null 2>&1 
!pip install pyglet > /dev/null 2>&1 
# NOTE(review): the next two lines install xvfb again (already requested above) — likely redundant.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!apt install xvfb -y
!pip install pyvirtualdisplay
# NOTE(review): "piglet" is not the "pyglet" package installed above — presumably a typo; confirm.
!pip install piglet
!pip install ptan pytorch-ignite pybullet
# Remove the preinstalled torch packages, then reinstall them with the cu116 wheel index added.
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colabgymrender
  Downloading colabgymrender-1.1.0.tar.gz (3.5 kB)
Collecting imageio==2.4.1
  Downloading imageio-2.4.1.tar.gz (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.8 MB/s 
[?25hCollecting atari-py==0.2.6
  Downloading atari-py-0.2.6.tar.gz (790 kB)
[K     |████████████████████████████████| 790 kB 66.4 MB/s 
[?25hCollecting gym==0.17.3
  Downloading gym-0.17.3.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 45.6 MB/s 
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 55.3 MB/s 
Reason for being yanked: re-release with new wheels[0m
Building wheels for collected packages: imageio, atari-py, gym, colabgymrender
  Building wheel for imageio (setup.py) ... [?25l[?25hdone
  Created wheel for imageio: filename=imageio-2.4.1-py3-none-any.whl size=330

In [None]:
import collections
from typing import Union
import math
import random
from copy import deepcopy
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
import pybullet_envs
from colabgymrender.recorder import Recorder
from fastprogress import progress_bar as pb
import matplotlib.pyplot as plt

In [None]:
class NoisyLinear(nn.Linear):
    """Linear layer whose parameters are perturbed by learnable Gaussian noise.

    Every forward pass draws fresh standard-normal noise and uses
    ``weight + sigma_weight * epsilon_weight`` (and, when a bias exists,
    ``bias + sigma_bias * epsilon_bias``) as the effective parameters.
    The ``sigma`` tensors are trainable parameters; the ``epsilon`` tensors
    are buffers that are overwritten in place on each call.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma_init: initial value of every ``sigma`` entry.
        bias: if False, the layer has neither a bias nor bias noise.
    """

    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        self._sigma_weight = nn.Parameter(
            torch.full((out_features, in_features), sigma_init)
        )
        self.register_buffer(
            "epsilon_weight", torch.zeros(out_features, in_features)
        )
        if bias:
            self._sigma_bias = nn.Parameter(
                torch.full((out_features,), sigma_init)
            )
            self.register_buffer("epsilon_bias", torch.zeros(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise weight (and bias, if present) uniformly in +-sqrt(3 / in_features)."""
        std = math.sqrt(3 / self.in_features)
        self.weight.data.uniform_(-std, std)
        # nn.Linear.__init__ calls this method too; with bias=False the bias
        # is None and must be skipped instead of dereferenced.
        if self.bias is not None:
            self.bias.data.uniform_(-std, std)

    def forward(self, input):
        """Apply the layer with freshly sampled weight (and bias) noise."""
        self.epsilon_weight.normal_()
        noisy_weight = self.weight + self._sigma_weight * self.epsilon_weight
        noisy_bias = self.bias
        if noisy_bias is not None:
            self.epsilon_bias.normal_()
            noisy_bias = noisy_bias + self._sigma_bias * self.epsilon_bias
        return F.linear(input, noisy_weight, noisy_bias)

In [None]:
class DDPGCritic(nn.Module):
    """Q-value network: scores an (observation, action) pair with a single scalar.

    The observation is first embedded on its own (``obs_net``); the action is
    concatenated to that embedding before the remaining layers (``out_net``).

    Args:
        obs_size: number of features in one observation vector.
        act_size: number of components in one action vector.
    """

    def __init__(self, obs_size, act_size):
        super().__init__()
        obs_hidden, joint_hidden = 400, 300

        self.obs_net = nn.Sequential(nn.Linear(obs_size, obs_hidden), nn.ReLU())
        self.out_net = nn.Sequential(
            nn.Linear(obs_hidden + act_size, joint_hidden),
            nn.ReLU(),
            nn.Linear(joint_hidden, 1),
        )

    def forward(self, x, a):
        """Return one value per row for observations ``x`` and actions ``a`` (joined along dim 1)."""
        obs_features = self.obs_net(x)
        joint_input = torch.cat((obs_features, a), dim=1)
        return self.out_net(joint_input)

In [None]:
class DDPGActor(nn.Module):
    """Deterministic policy network mapping an observation to an action.

    The final ``Tanh`` keeps every action component inside [-1, 1].

    Args:
        obs_size: number of features in one observation vector.
        act_size: number of components in one action vector.
    """

    def __init__(self, obs_size, act_size):
        super().__init__()
        layers = [
            nn.Linear(obs_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, act_size),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the action(s) produced by the network for observation batch ``x``."""
        actions = self.net(x)
        return actions

In [None]:
class NStepPriorityReplayBuffer:
    """Fixed-size ring buffer with proportional prioritised sampling and cached n-step returns.

    Every stored transition ``t`` gets, once enough later transitions exist:

    * the discounted return ``r_t + gamma * r_{t+1} + ...`` over at most
      ``n_step`` transitions of the same episode,
    * the state to bootstrap from (the ``next_state`` of the last transition
      in that window), and
    * a flag telling whether the window ended on a terminal transition
      (in which case no bootstrapping must happen).

    A transition is only sampleable once its window is complete, i.e. it
    spans ``n_step`` transitions or it was cut short by the end of the
    episode.  Incomplete entries hold ``nan`` in the return cache.

    Args:
        max_size: capacity of the ring buffer.
        prob_alpha: exponent applied to priorities when turning them into
            sampling probabilities (0 = uniform sampling).
        beta_start: initial importance-sampling exponent.
        beta_frames: number of steps over which beta is annealed to 1.0.
        n_step: maximum number of transitions in one return window.
        gamma: discount factor.

    Raises:
        ValueError: if ``max_size`` or ``n_step`` is smaller than 1.
    """

    def __init__(
        self,
        max_size: int,
        prob_alpha: float = 0.6,
        beta_start: float = 0.4,
        beta_frames: float = 100000,
        n_step: int = 4,
        gamma: float = 0.99,
    ):
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        if n_step < 1:
            raise ValueError("n_step must be at least 1")
        self._prob_alpha = prob_alpha
        self._max_size = max_size
        self._pos = 0
        self._buf = []
        self._priorities = np.zeros((max_size,), dtype=np.float32)
        self._beta_start = beta_start
        self._beta = beta_start
        self._beta_frames = beta_frames
        self._n_step = n_step
        self._gamma = gamma
        # nan marks "n-step window not complete yet, do not sample".
        self._total_discounted_rewards = np.full(max_size, np.nan)
        # State to bootstrap from for each slot (None while incomplete).
        self._last_states = [None] * max_size
        # True when the slot's n-step window ended on a terminal transition.
        self._n_step_dones = np.zeros(max_size, dtype=bool)

    def update_bata(self, idx) -> float:
        """Anneal beta linearly for step ``idx`` (capped at 1.0) and return the new value.

        The misspelled name is kept because existing callers use it.
        """
        beta = self._beta_start + idx * (1.0 - self._beta_start) / self._beta_frames
        self._beta = min(1.0, beta)
        return self._beta

    # Correctly spelled alias of ``update_bata``.
    update_beta = update_bata

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self._buf)

    def append(
        self,
        state: np.ndarray,
        action: Union[int, np.ndarray],
        reward: Union[int, float],
        done: bool,
        next_state: np.ndarray,
    ) -> None:
        """Store one transition and finalise every n-step window it completes.

        New transitions receive the current maximum priority (1.0 for the
        very first one).
        """
        max_prio = self._priorities.max() if self._buf else 1.0
        transition = (state, action, reward, done, next_state)
        if len(self._buf) < self._max_size:
            self._buf.append(transition)
        else:
            self._buf[self._pos] = transition
        self._priorities[self._pos] = max_prio

        # The slot may hold cached values of an overwritten transition;
        # the new one is not sampleable until its own window is complete.
        self._total_discounted_rewards[self._pos] = np.nan
        self._last_states[self._pos] = None
        self._n_step_dones[self._pos] = False

        self._finalize_windows(self._pos)
        self._pos = (self._pos + 1) % self._max_size

    def _finalize_windows(self, newest: int) -> None:
        """Fill the n-step caches of every window that ends at slot ``newest``."""
        newest_done = bool(self._buf[newest][3])
        bootstrap_state = self._buf[newest][4]
        discounted = 0.0
        # Walk backwards in time, starting with the newest transition.
        for back in range(min(self._n_step, len(self._buf))):
            idx = (newest - back) % self._max_size
            _, _, reward, done, _ = self._buf[idx]
            if back > 0 and done:
                # Older terminal transition: it belongs to a previous episode
                # and its window was finalised when it was appended.
                break
            discounted = reward + self._gamma * discounted
            # A window is complete when the episode ended or it is n_step long.
            if newest_done or back == self._n_step - 1:
                self._total_discounted_rewards[idx] = discounted
                self._last_states[idx] = bootstrap_state
                self._n_step_dones[idx] = newest_done

    def sample(self, size: int):
        """Draw ``size`` sampleable transitions with probability proportional to priority ** alpha.

        Returns:
            Tuple ``(states, actions, rewards, dones, total_discounted_rewards,
            last_states, sampled_indices, weights)``.  ``rewards`` are the
            single-step rewards; ``dones`` tell whether the n-step window of
            the transition ended on a terminal transition (no bootstrapping
            from ``last_states`` in that case); ``weights`` are float32
            importance-sampling weights normalised by the batch maximum.

        Raises:
            ValueError: if no transition has a complete n-step window yet.
        """
        stored = len(self._buf)
        valid = np.flatnonzero(~np.isnan(self._total_discounted_rewards[:stored]))
        if valid.size == 0:
            raise ValueError("no transition with a complete n-step return is available")

        # Proportional prioritisation: P(i) = p_i ** alpha / sum_k p_k ** alpha.
        scaled = self._priorities[valid].astype(np.float64) ** self._prob_alpha
        probs = scaled / scaled.sum()
        picks = np.random.choice(valid.size, size, p=probs)
        sampled_indices = valid[picks]

        states, actions, rewards, _, _ = zip(*[self._buf[idx] for idx in sampled_indices])
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        dones = self._n_step_dones[sampled_indices]
        total_discounted_rewards = self._total_discounted_rewards[sampled_indices]
        last_states = np.stack([self._last_states[idx] for idx in sampled_indices])

        weights = np.array((stored * probs[picks]) ** (-self._beta), dtype=np.float32)
        weights /= weights.max()
        return states, actions, rewards, dones, total_discounted_rewards, last_states, sampled_indices, weights

    def update_priorities(self, sample_indices: np.ndarray, sample_priorities: np.ndarray) -> None:
        """Overwrite the priorities of the given slots."""
        self._priorities[sample_indices] = sample_priorities

    @property
    def gamma(self) -> float:
        """Discount factor used for the cached returns."""
        return self._gamma

    @property
    def n_step(self) -> int:
        """Maximum number of transitions in one return window."""
        return self._n_step

In [None]:
class Agent:
    """DDPG agent: epsilon-random exploration, n-step targets and prioritised replay.

    The agent owns the online actor/critic, frozen deep copies used as target
    networks, and one Adam optimiser per online network.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple) and ``action_space.sample()``.
        exp_buffer: replay buffer receiving every transition.
        act_net: online actor (observation -> action).
        crt_net: online critic ((observation, action) -> value).
        epsilon_start: initial probability of taking a random action.
        epsilon_final: lower bound for epsilon.
        epsilon_decay_last_step: divisor of the linear epsilon decay.
        tgt_sync_steps: stored but not used; target networks are soft-updated
            after every training iteration instead.
        learning_rate: learning rate of both optimisers.
        adam_eps: optional ``eps`` passed to both Adam optimisers.
        device: device on which batches are placed.
    """

    # Added to |TD error| so that no sampled transition ends up with priority 0.
    _PRIORITY_EPS = 1e-5

    def __init__(
        self,
        env,
        exp_buffer: "NStepPriorityReplayBuffer",
        act_net: nn.Module,
        crt_net: nn.Module,
        epsilon_start: float = 1.0,
        epsilon_final: float = 0.01,
        epsilon_decay_last_step: int = 200000,
        tgt_sync_steps: int = 10000,
        learning_rate: float = 1e-4,
        adam_eps: Optional[float] = None,
        device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    ):
        self._env = env
        self._exp_buffer = exp_buffer
        self._act_net = act_net
        self._crt_net = crt_net
        self._tgt_act_net = deepcopy(act_net)
        self._tgt_crt_net = deepcopy(crt_net)
        # Target networks are never trained directly.
        for p in self._tgt_act_net.parameters():
            p.requires_grad = False
        for p in self._tgt_crt_net.parameters():
            p.requires_grad = False
        self._epsilon_start = epsilon_start
        self._epsilon_final = epsilon_final
        self._epsilon_decay_last_step = epsilon_decay_last_step
        self._epsilon = epsilon_start
        self._device = device
        self._total_step = 0
        self._total_trained_samples = 0
        self._tgt_sync_steps = tgt_sync_steps
        adam_kwargs = {}
        if adam_eps is not None:
            adam_kwargs['eps'] = adam_eps
        self._crt_optimizer = torch.optim.Adam(self._crt_net.parameters(), lr=learning_rate, **adam_kwargs)
        self._act_optimizer = torch.optim.Adam(self._act_net.parameters(), lr=learning_rate, **adam_kwargs)
        self._reset_episode()

    def _reset_episode(self):
        """Reset the environment and the running episode reward."""
        self._state = self._env.reset()
        self._total_reward = 0.0

    def _to_float_tensor(self, array):
        """Convert array-like data to a float32 tensor on the agent's device."""
        return torch.as_tensor(np.asarray(array), dtype=torch.float32).to(self._device)

    @torch.no_grad()
    def play_step(self, epsilon: Optional[float] = None, sync_target: bool = True):
        """Take one environment step and store the transition in the buffer.

        Args:
            epsilon: probability of a random action; defaults to the agent's
                current epsilon.
            sync_target: accepted for compatibility; currently unused.

        Returns:
            The total reward of the episode if this step ended it, else None.
        """
        if epsilon is None:
            epsilon = self._epsilon
        done_reward = None

        if np.random.random() < epsilon:
            action = self._env.action_space.sample()
        else:
            # Add a leading batch dimension; np.asarray avoids copy=False,
            # which raises on NumPy 2 when a copy is unavoidable.
            state_v = self._to_float_tensor(np.asarray(self._state)[np.newaxis])
            mu_v = self._act_net(state_v)
            action = mu_v.squeeze(dim=0).cpu().numpy()
            action = np.clip(action, -1, 1)

        next_state, reward, is_done, _ = self._env.step(action)
        self._total_reward += reward

        self._exp_buffer.append(
            self._state, action, reward, is_done, next_state
        )
        self._state = next_state
        if is_done:
            done_reward = self._total_reward
            self._reset_episode()

        self._total_step += 1
        self._update_epsilon(self._total_step)
        self._exp_buffer.update_bata(self._total_step)

        return done_reward

    def train(self, n_iter: int = 1, batch_size: int = 32) -> None:
        """Run ``n_iter`` critic/actor updates on batches sampled from the buffer.

        Each iteration: importance-weighted critic update against the n-step
        target, actor update through the critic, soft update of both target
        networks, and refresh of the sampled transitions' priorities with
        their absolute TD error.
        """
        n_step_gamma = self._exp_buffer.gamma ** self._exp_buffer.n_step
        for _ in range(n_iter):
            states, actions, _, dones, total_discounted_rewards, \
                last_states, sampled_indices, weights = self._exp_buffer.sample(batch_size)
            states_v = self._to_float_tensor(states)
            actions_v = self._to_float_tensor(actions)
            discounted_rewards_v = self._to_float_tensor(total_discounted_rewards)
            weights_v = self._to_float_tensor(weights)
            last_states_v = self._to_float_tensor(last_states)
            done_mask = torch.as_tensor(np.asarray(dones), dtype=torch.bool).to(self._device)

            # train critic
            self._crt_optimizer.zero_grad()
            q_v = self._crt_net(states_v, actions_v)
            with torch.no_grad():
                last_act_v = self._tgt_act_net(last_states_v)
                q_last_v = self._tgt_crt_net(last_states_v, last_act_v)
                # No bootstrapping past the end of an episode.
                q_last_v[done_mask] = 0.0
                q_ref_v = discounted_rewards_v.unsqueeze(dim=-1) + \
                    q_last_v * n_step_gamma
            td_error_v = q_v - q_ref_v
            # Importance weights correct the bias of non-uniform sampling;
            # with all weights equal to 1 this is the plain MSE loss.
            critic_loss_v = (weights_v.unsqueeze(dim=-1) * td_error_v.pow(2)).mean()
            critic_loss_v.backward()
            self._crt_optimizer.step()

            # train actor
            self._act_optimizer.zero_grad()
            cur_actions_v = self._act_net(states_v)
            actor_loss_v = -self._crt_net(states_v, cur_actions_v).mean()
            actor_loss_v.backward()
            self._act_optimizer.step()

            self._alpha_sync(self._act_net, self._tgt_act_net, alpha=1 - 1e-3)
            self._alpha_sync(self._crt_net, self._tgt_crt_net, alpha=1 - 1e-3)

            new_prios = td_error_v.detach().abs().squeeze(dim=-1).cpu().numpy() + self._PRIORITY_EPS
            self._exp_buffer.update_priorities(sampled_indices, new_prios)
            self._total_trained_samples += batch_size

    def initial_exploration(self, n_steps: int = 10000, epsilon: float = 1.0) -> None:
        """Play ``n_steps`` steps with a fixed epsilon to pre-fill the buffer.

        The step counter is reset to 0 and the previous epsilon restored
        afterwards, even if a step raises.
        """
        eps_bak = self._epsilon
        try:
            for _ in pb(range(n_steps)):
                # play_step overwrites self._epsilon on every call, so the
                # exploration epsilon is passed explicitly.
                self.play_step(epsilon=epsilon, sync_target=False)
        finally:
            self._total_step = 0
            self._epsilon = eps_bak

    def _update_epsilon(self, step_index: int) -> None:
        """Linearly decay epsilon with the step index, bounded below by epsilon_final."""
        self._epsilon = max(
            self._epsilon_final,
            self._epsilon_start - step_index / self._epsilon_decay_last_step
        )

    def _sync(self, net, tgt_net):
        """Hard update: copy all of ``net``'s state into ``tgt_net``."""
        tgt_net.load_state_dict(net.state_dict())

    @torch.no_grad()
    def _alpha_sync(self, net, tgt_net, alpha):
        """Soft update: ``tgt <- alpha * tgt + (1 - alpha) * net`` for every state entry."""
        assert isinstance(alpha, float)
        assert 0.0 < alpha <= 1.0
        src_state = net.state_dict()
        # state_dict() tensors share storage with the module, so in-place
        # updates change the target network without rebuilding a state dict.
        for key, tgt_tensor in tgt_net.state_dict().items():
            src_tensor = src_state[key]
            if tgt_tensor.is_floating_point():
                tgt_tensor.mul_(alpha).add_(src_tensor, alpha=1 - alpha)
            else:
                # Integer buffers (e.g. counters) cannot be blended; copy them.
                tgt_tensor.copy_(src_tensor)


In [None]:
class SkipFrame(gym.Wrapper):
    """Repeat each action for up to ``skip`` environment steps and sum the rewards.

    Args:
        env: environment to wrap.
        skip: number of inner steps per outer step; must be at least 1.

    Raises:
        ValueError: if ``skip`` is smaller than 1.
    """

    def __init__(self, env, skip):
        super().__init__(env)
        # With skip < 1 the loop in step() would never run and there would be
        # no observation to return.
        if skip < 1:
            raise ValueError(f"skip must be at least 1, got {skip}")
        self._skip = skip

    def step(self, action):
        """Apply ``action`` repeatedly; stop early when the episode ends.

        Returns:
            The last observation, the summed reward, the last done flag and
            the last info dict.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Convert image observations to grayscale float tensors.

    The declared observation space keeps only the first two dimensions of the
    wrapped space's shape.
    NOTE(review): ``observation`` returns a torch float tensor, while the
    declared space is a uint8 Box of that 2-D shape — confirm that downstream
    wrappers do not rely on the declared dtype/shape.
    """

    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # Built once instead of on every observation.
        self._to_grayscale = T.Grayscale()

    def permute_orientation(self, observation):
        """Move the last axis of a 3-D array to the front and return it as a float tensor."""
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        """Return the grayscale version of ``observation``."""
        observation = self.permute_orientation(observation)
        return self._to_grayscale(observation)


class ResizeObservation(gym.ObservationWrapper):
    """Resize observations to a fixed size and normalise them with mean 0 and std 255.

    Args:
        env: environment to wrap.
        shape: target size; an int ``n`` means ``(n, n)``, any other value is
            converted with ``tuple(shape)``.
    """

    def __init__(self, env, shape):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        # Keep any dimensions of the wrapped space beyond the first two.
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # The pipeline depends only on the target shape, so build it once
        # rather than on every observation.
        self._transforms = T.Compose(
            [T.Resize(self.shape), T.Normalize(0, 255)]
        )

    def observation(self, observation):
        """Return the resized and normalised observation with dimension 0 squeezed."""
        return self._transforms(observation).squeeze(0)


class LazyFramesToNumpy(gym.ObservationWrapper):
    """Return each observation through its ``__array__`` conversion.

    Presumably meant to sit after ``FrameStack`` so that consumers receive a
    plain array — confirm against the wrapper order in use.
    """

    def __init__(self, env):
        super().__init__(env)

    def observation(self, observation):
        """Return ``observation.__array__()``."""
        as_array = observation.__array__()
        return as_array

In [None]:
# Seed passed to the RNG seeding calls in the environment setup cell.
SEED = 77

In [None]:
random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!rm -rf /content/video

env = gym.make("MinitaurBulletEnv-v0")
# env = SkipFrame(env, skip=4)
# env = GrayScaleObservation(env)
# env = ResizeObservation(env, shape=224)
# env = FrameStack(env, num_stack=4)
# env = LazyFramesToNumpy(env)
directory = './video'
# env = Recorder(env, directory)

urdf_root=/usr/local/lib/python3.8/dist-packages/pybullet_data


In [None]:
# Display the observation shape and the action space of the environment.
env.observation_space.shape, env.action_space

((28,), Box(-1.0, 1.0, (8,), float32))

In [None]:
# Draw one random action from the action space to see what an action looks like.
env.action_space.sample()

array([-0.8341588 , -0.7444319 , -0.8518454 , -0.36063766, -0.59968734,
        0.36393377, -0.19871397,  0.57056695], dtype=float32)

In [None]:
# Reset the environment and keep the initial observation for the sanity checks in later cells.
state = env.reset()

In [None]:
# Display the initial observation.
state

array([ 1.50347167e+00,  1.49345370e+00,  1.49832819e+00,  1.49064648e+00,
        1.50536190e+00,  1.50923600e+00,  1.50500672e+00,  1.50608901e+00,
        6.40519842e-01,  6.47282536e-01,  6.59688347e-01,  6.64291650e-01,
        5.56672518e-01,  5.42466522e-01,  5.74675669e-01,  5.59422781e-01,
        4.41218854e-01,  5.21378915e-01,  4.80078117e-01,  5.41632601e-01,
        4.37156256e-01,  4.08011548e-01,  4.37387937e-01,  4.30968607e-01,
       -1.04792965e-03,  1.02313633e-03, -2.65745175e-04,  9.99998892e-01])

In [None]:
# Build the online actor and critic from the environment's observation and
# action dimensions, and place them on the selected device.
act_net = DDPGActor(
    obs_size=env.observation_space.shape[0],
    act_size=env.action_space.shape[0],
).to(device)
crt_net = DDPGCritic(
    obs_size=env.observation_space.shape[0],
    act_size=env.action_space.shape[0],
).to(device)

In [None]:
# Display the actor's layer structure.
act_net

DDPGActor(
  (net): Sequential(
    (0): Linear(in_features=28, out_features=400, bias=True)
    (1): ReLU()
    (2): Linear(in_features=400, out_features=300, bias=True)
    (3): ReLU()
    (4): Linear(in_features=300, out_features=8, bias=True)
    (5): Tanh()
  )
)

In [None]:
# Replay buffer: 30000 slots, 2-step returns, beta annealed over 30000 steps
# (the class default is 100000).
exp_buffer = NStepPriorityReplayBuffer(
    max_size=30000,
    prob_alpha=0.6,
    beta_start=0.4,
    beta_frames=30000,
    n_step=2,
    gamma=0.99,
)

# Agent: arguments listed in the order of Agent.__init__; the epsilon decay
# divisor is 30000 (the class default is 200000).
agent = Agent(
    env=env,
    exp_buffer=exp_buffer,
    act_net=act_net,
    crt_net=crt_net,
    epsilon_start=0.6,
    epsilon_final=0.002,
    epsilon_decay_last_step=30000,
    tgt_sync_steps=1000,
    learning_rate=1e-4,
    device=device,
)

In [None]:
# Sanity check: the observation as a float tensor with a leading batch dimension.
torch.tensor(state).unsqueeze(0).to(device).float()

tensor([[ 1.5035e+00,  1.4935e+00,  1.4983e+00,  1.4906e+00,  1.5054e+00,
          1.5092e+00,  1.5050e+00,  1.5061e+00,  6.4052e-01,  6.4728e-01,
          6.5969e-01,  6.6429e-01,  5.5667e-01,  5.4247e-01,  5.7468e-01,
          5.5942e-01,  4.4122e-01,  5.2138e-01,  4.8008e-01,  5.4163e-01,
          4.3716e-01,  4.0801e-01,  4.3739e-01,  4.3097e-01, -1.0479e-03,
          1.0231e-03, -2.6575e-04,  1.0000e+00]])

In [None]:
# Sanity check: run the untrained actor on the initial observation.
act_net(torch.tensor(state).unsqueeze(0).float().to(device))

tensor([[-0.0153,  0.1402,  0.1065,  0.1975,  0.0655, -0.1678, -0.0566,  0.1452]],
       grad_fn=<TanhBackward0>)

In [None]:
# Sanity check: run the untrained critic on the initial observation and the actor's action for it.
crt_net(
    torch.tensor(state).unsqueeze(0).float().to(device),
    act_net(torch.tensor(state).unsqueeze(0).float().to(device))
)

tensor([[-0.0607]], grad_fn=<AddmmBackward0>)

In [None]:
# Pre-fill the replay buffer with 10000 steps at the method's default epsilon of 1.0 before training.
agent.initial_exploration(n_steps=10000)

In [None]:
# Delete everything inside /content/video.
!rm -rf /content/video/*

In [None]:
episode = 0

# Endless training loop (stop it by interrupting the kernel): alternate
# 20 environment steps with 20 gradient updates on batches of 32.
while True:

    for stp in range(20):
        done_reward = agent.play_step()
        if done_reward is None:
            # Episode still running; nothing to report yet.
            continue
        print(f'episode : {episode}, done reward : {done_reward}, total_step : {agent._total_step}, cur_epsilon : {agent._epsilon}')
        episode += 1

    agent.train(n_iter=20, batch_size=32)

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
episode : 15491, done reward : 7.19029525020631, total_step : 1664109, cur_epsilon : 0.002
episode : 15492, done reward : 1.1205969503731883, total_step : 1664257, cur_epsilon : 0.002
episode : 15493, done reward : 1.5770994272552301, total_step : 1664444, cur_epsilon : 0.002
episode : 15494, done reward : 2.483479184636427, total_step : 1664716, cur_epsilon : 0.002
episode : 15495, done reward : 1.7819490854713296, total_step : 1664933, cur_epsilon : 0.002
episode : 15496, done reward : 8.835673940938314, total_step : 1665671, cur_epsilon : 0.002
episode : 15497, done reward : 1.102849894935035, total_step : 1665816, cur_epsilon : 0.002
episode : 15498, done reward : 0.21336969414905183, total_step : 1665864, cur_epsilon : 0.002
episode : 15499, done reward : 0.2055622566395502, total_step : 1665909, cur_epsilon : 0.002
episode : 15500, done reward : 0.4768498071069902, total_step : 1665978, cur_epsilon : 0.002
episode : 15501, done reward