In [1]:
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install colabgymrender imageio==2.4.1 atari-py==0.2.6 gym==0.17.3
!apt-get install x11-utils > /dev/null 2>&1 
!pip install pyglet > /dev/null 2>&1 
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!pip install ptan pytorch-ignite pybullet
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colabgymrender
  Downloading colabgymrender-1.1.0.tar.gz (3.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imageio==2.4.1
  Downloading imageio-2.4.1.tar.gz (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting atari-py==0.2.6
  Downloading atari-py-0.2.6.tar.gz (790 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m790.2/790.2 KB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gym==0.17.3
  Downloading gym-0.17.3.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyglet<=1.5.0,>=1.

In [2]:
import collections
from typing import Union
import math
import random
from copy import deepcopy
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torch.distributions as distr
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
import pybullet_envs
from colabgymrender.recorder import Recorder
from fastprogress import progress_bar as pb
import matplotlib.pyplot as plt

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)2293760/45929032 bytes (5.0%)4988928/45929032 bytes (10.9%)7610368/45929032 bytes (16.6%)9617408/45929032 bytes (20.9%)12222464/45929032 bytes (26.6%)14098432/45929032 bytes (30.7%)16842752/45929032 bytes (36.7%)19447808/45929032 bytes (42.3%)21667840/45929032 bytes (47.2%)24256512/45929032 bytes (52.8%)27230208/45929032 bytes (59.3%)30236672/45929032 bytes (65.8%)3

In [3]:
class NoisyLinear(nn.Linear):

    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
        w = torch.full((out_features, in_features), sigma_init)
        self._sigma_weight = nn.Parameter(w)
        z = torch.zeros(out_features, in_features)
        self.register_buffer("epsilon_weight", z)
        if bias:
            w = torch.full((out_features,), sigma_init)
            self._sigma_bias = nn.Parameter(w)
            z = torch.zeros(out_features)
            self.register_buffer("epsilon_bias", z)
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(3 / self.in_features)
        self.weight.data.uniform_(-std, std)
        self.bias.data.uniform_(-std, std)

    def forward(self, input):
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self._sigma_bias * \
                   self.epsilon_bias.data
        v = self._sigma_weight * self.epsilon_weight.data + \
            self.weight
        return F.linear(input, v, bias)

In [31]:
class SACActor(nn.Module):

    def __init__(self, obs_size, act_size, hidden_size: int = 64):
        super(SACActor, self).__init__()

        self.mu = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, act_size),
            nn.Tanh(),
        )
        self.logstd = nn.Parameter(torch.zeros(act_size))

    def forward(self, x):
        return self.mu(x)

In [5]:
class SACCritic(nn.Module):

    def __init__(self, obs_size, hidden_size: int = 64):
        super(SACCritic, self).__init__()

        self.value = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        return self.value(x)

In [6]:
class SACTwinQ(nn.Module):

    def __init__(self, obs_size, act_size, hidden_size: int = 64):
        super(SACTwinQ, self).__init__()

        self.q1 = nn.Sequential(
            nn.Linear(obs_size + act_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

        self.q2 = nn.Sequential(
            nn.Linear(obs_size + act_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, obs, act):
        x = torch.cat([obs, act], dim=1)
        return self.q1(x), self.q2(x)

In [7]:
class NStepPriorityReplayBuffer:

    def __init__(
        self,
        max_size: int,
        prob_alpha: float = 0.6,
        beta_start: float = 0.4,
        beta_frames: float = 100000,
        n_step: int = 4,
        gamma: float = 0.99,
    ):
        self._prob_alpha = prob_alpha
        self._max_size = max_size
        self._pos = 0
        self._buf = []
        self._priorities = np.zeros((max_size,), dtype=np.float32)
        self._beta_start = beta_start
        self._beta = beta_start
        self._beta_frames = beta_frames
        self._n_step = n_step
        self._gamma = gamma
        self._total_discounted_rewards = np.array([np.nan]*max_size)
        self._last_states = [np.nan]*max_size

    def update_bata(self, idx) -> None:
        beta = self._beta_start + idx * (1.0 - self._beta_start) / self._beta_frames
        self._beta = min(1.0, beta)
        return self._beta

    def __len__(self):
        return len(self._buf)

    def append(
        self,
        state: np.ndarray,
        action: int,
        reward: Union[int, float],
        done: bool,
        next_state: np.ndarray,
    ) -> None:
        max_prio = self._priorities.max() if self._buf else 1.0
        if len(self._buf) < self._max_size:
            self._buf.append(
                (state, action, reward, done, next_state)
            )
        else:
            self._buf[self._pos] = (state, action, reward, done, next_state)
        self._priorities[self._pos] = max_prio

        if len(self._buf) >= self._n_step:
            dis_r = 0.0
            last_state = self._buf[self._pos][0]
            for i in range(self._n_step):
                state, _, r, done, _ = self._buf[self._pos - i]
                dis_r = r + self._gamma * dis_r
                if done:
                    last_state = state
                self._total_discounted_rewards[self._pos - i] = dis_r
                self._last_states[self._pos - i] = last_state
            
            for i in range(self._n_step-1):
                done = self._buf[self._pos - i][3]
                if done:
                    break
                self._total_discounted_rewards[self._pos - i] = np.nan
                self._last_states[self._pos - i] = np.nan

        self._pos = (self._pos + 1) % self._max_size

    def sample(self, size: int):
        sample_target_indices = np.where(~np.isnan(self._total_discounted_rewards[:len(self._buf)]))[0]
        # prios = self._priorities[sample_target_indices]  #self._priorities if len(self._buf) == self._max_size else self._priorities[:self._pos]
        prios = self._priorities
        probs = prios * self._prob_alpha
        # probs /= np.nan_to_num(probs, 0.0).sum()
        probs /= probs[sample_target_indices].sum()
        sampled_indices = np.random.choice(
            sample_target_indices,
            # np.where(~np.isnan(self._total_discounted_rewards[:len(self._buf)]))[0],
            size, p=probs[sample_target_indices]
        )
        states, actions, rewards, dones, next_states = zip(*[self._buf[idx] for idx in sampled_indices])
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        dones = np.array(dones)
        next_states = np.array(next_states)
        total_discounted_rewards = self._total_discounted_rewards[sampled_indices]
        last_states = np.stack([self._last_states[idx] for idx in sampled_indices])
        total = len(self._buf)
        weights = np.array((total * probs[sampled_indices]) ** (-self._beta), dtype=np.float32)
        # weights = np.array((total * probs) ** (-self._beta), dtype=np.float32)
        weights /= weights.max()
        return states, actions, rewards, dones, total_discounted_rewards, last_states, sampled_indices, weights

    def update_priorities(self, sample_indices: np.ndarray, sample_priorities: np.ndarray) -> None:
        self._priorities[sample_indices] = sample_priorities

    @property
    def gamma(self) -> float:
        return self._gamma

    @property
    def n_step(self) -> float:
        return self._n_step

In [32]:
class Agent:

    def __init__(
        self,
        env,
        exp_buffer: NStepPriorityReplayBuffer,
        act_net: nn.Module,
        crt_net: nn.Module,
        twinq_net:  nn.Module,
        epsilon_start: float = 1.0,
        epsilon_final: float = 0.01,
        epsilon_decay_last_step: int = 200000,
        tgt_sync_steps: int = 10000,
        learning_rate_acts: float = 1e-4,
        learning_rate_vals: float = 1e-4,
        adam_eps: float = None,
        sac_entropy_alpha: float = 0.1,
        device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    ):
        self._env = env
        self._exp_buffer = exp_buffer
        self._act_net = act_net
        self._crt_net = crt_net
        self._twinq_net = twinq_net
        self._tgt_act_net = deepcopy(act_net)
        self._tgt_crt_net = deepcopy(crt_net)
        for p in self._tgt_act_net.parameters():
            p.requires_grad = False
        for p in self._tgt_crt_net.parameters():
            p.requires_grad = False
        self._epsilon_start = epsilon_start
        self._epsilon_final = epsilon_final
        self._epsilon_decay_last_step = epsilon_decay_last_step
        self._epsilon = epsilon_start
        self._device = device
        self._total_step = 0
        self._total_trained_samples = 0
        self._tgt_sync_steps = tgt_sync_steps
        adam_kwargs = {}
        if adam_eps is not None:
            adam_kwargs['eps'] = adam_eps
        self._crt_optimizer = torch.optim.Adam(self._crt_net.parameters(), lr=learning_rate_vals, **adam_kwargs)
        self._act_optimizer = torch.optim.Adam(self._act_net.parameters(), lr=learning_rate_acts, **adam_kwargs)
        self._twinq_optimizer = torch.optim.Adam(self._twinq_net.parameters(), lr=learning_rate_vals, **adam_kwargs)
        self._sac_entropy_alpha = sac_entropy_alpha
        self._reset_episode()

    def _reset_episode(self):
        self._state = self._env.reset()
        self._total_reward = 0.0

    @torch.no_grad()
    def play_step(self, epsilon: Optional[float] = None, sync_target: bool = True):
        if epsilon is None:
            epsilon = self._epsilon
        done_reward = None

        if np.random.random() < epsilon:
            action = self._env.action_space.sample()
        else:
            state_a = np.array([self._state], copy=False)
            state_v = torch.tensor(state_a).float().to(self._device)
            mu_v = self._act_net(state_v)
            action = 2.0*mu_v.squeeze(dim=0).data.cpu().numpy()
            action = np.clip(action, -2, 2)

        next_state, reward, is_done, _ = self._env.step(action)
        self._total_reward += reward

        self._exp_buffer.append(
            self._state, action, reward, is_done, next_state
        )
        self._state = next_state
        if is_done:
            done_reward = self._total_reward
            self._reset_episode()

        self._total_step += 1
        self._update_epsilon(self._total_step)
        self._exp_buffer.update_bata(self._total_step)

        # if self._total_step % self._tgt_sync_steps == 0 and sync_target:
        #     self._alpha_sync(self._act_net, self._tgt_act_net, alpha=1 - 1e-3)
        #     self._alpha_sync(self._crt_net, self._tgt_crt_net, alpha=1 - 1e-3)
        #     print(f'synced target net')

        return done_reward

    def train(self, n_iter: int = 1, batch_size: int = 32) -> None:
        n_step_gamma = self._exp_buffer.gamma ** self._exp_buffer.n_step
        for i in range(n_iter):
            states, actions, rewards, dones, total_discounted_rewards, \
                last_states, sampled_indices, weights = self._exp_buffer.sample(batch_size)
            states_v = torch.tensor(states).float().to(self._device)
            actions_v = torch.tensor(actions).to(self._device)
            discounted_rewards_v = torch.tensor(total_discounted_rewards).to(self._device)
            done_mask = torch.BoolTensor(dones).to(self._device)
            weights_v = torch.tensor(weights).to(self._device)
            last_states_v = torch.tensor(last_states).float().to(self._device)

            # last_act_v = self._tgt_act_net(last_states_v)
            q_last_v = self._tgt_crt_net(last_states_v)
            q_last_v[done_mask] = 0.0
            q_ref_v = discounted_rewards_v.unsqueeze(dim=-1) + \
                        q_last_v * n_step_gamma

            mu_v = self._act_net(states_v)
            act_dist = distr.Normal(mu_v, torch.exp(self._act_net.logstd))
            acts_v = act_dist.sample()
            q1_v, q2_v = self._twinq_net(states_v, acts_v)
            # element-wise minimum
            vals_ref_v = torch.min(q1_v, q2_v).squeeze() - \
                        self._sac_entropy_alpha * act_dist.log_prob(acts_v).sum(dim=1)

            # train TwinQ
            self._twinq_optimizer.zero_grad()
            q1_v, q2_v = self._twinq_net(states_v, actions_v)
            q1_loss_v = F.mse_loss(q1_v.squeeze().float(),
                                    q_ref_v.squeeze().detach().float())
            q2_loss_v = F.mse_loss(q2_v.squeeze().float(),
                                    q_ref_v.squeeze().detach().float())
            q_loss_v = q1_loss_v + q2_loss_v
            q_loss_v.backward()
            self._twinq_optimizer.step()

            # Critic
            self._crt_optimizer.zero_grad()
            val_v = self._crt_net(states_v)
            v_loss_v = F.mse_loss(val_v.squeeze().float(),
                                    vals_ref_v.squeeze().detach().float())
            v_loss_v.backward()
            self._crt_optimizer.step()

            # Actor
            self._act_optimizer.zero_grad()
            acts_v = self._act_net(states_v)
            q_out_v, _ = self._twinq_net(states_v, acts_v)
            act_loss = -q_out_v.mean()
            act_loss.backward()
            self._act_optimizer.step()

            # self._alpha_sync(self._act_net, self._tgt_act_net, alpha=1 - 1e-3)
            self._alpha_sync(self._crt_net, self._tgt_crt_net, alpha=1 - 1e-3)

            # self._exp_buffer.update_priorities(sampled_indices, prios)
            self._total_trained_samples += batch_size

    def initial_exploration(self, n_steps: int = 10000, epsilon: float = 1.0) -> None:
        eps_bak = self._epsilon
        try:
            for i in pb(range(n_steps)):
                self._epsilon = epsilon
                self.play_step(sync_target=False)
        finally:
            self._total_step = 0
            self._epsilon = eps_bak

    def _update_epsilon(self, step_index: int) -> None:
        self._epsilon = max(
            self._epsilon_final,
            self._epsilon_start - step_index / self._epsilon_decay_last_step
        )

    def _sync(self, net, tgt_net):
        tgt_net.load_state_dict(net.state_dict())

    def _alpha_sync(self, net, tgt_net, alpha):
        assert isinstance(alpha, float)
        assert 0.0 < alpha <= 1.0
        state = net.state_dict()
        tgt_state = tgt_net.state_dict()
        for k, v in state.items():
            tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v
        tgt_net.load_state_dict(tgt_state)


In [33]:
class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def permute_orientation(self, observation):
        print(observation.shape)
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        observation = self.permute_orientation(observation)
        transform = T.Grayscale()
        observation = transform(observation)
        return observation


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transforms = T.Compose(
            [T.Resize(self.shape), T.Normalize(0, 255)]
        )
        observation = transforms(observation).squeeze(0)
        return observation


class LazyFramesToNumpy(gym.ObservationWrapper):
    def __init__(self, env):
        super(LazyFramesToNumpy, self).__init__(env)

    def observation(self, observation):
        return observation.__array__()

In [34]:
SEED = 77

In [35]:
random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!rm -rf /content/video

env = gym.make("Pendulum-v0")
# env = SkipFrame(env, skip=4)
# env = GrayScaleObservation(env)
# env = ResizeObservation(env, shape=224)
# env = FrameStack(env, num_stack=4)
# env = LazyFramesToNumpy(env)
directory = './video'
# env = Recorder(env, directory)

In [36]:
env.observation_space.shape, env.action_space

((3,), Box(-2.0, 2.0, (1,), float32))

In [37]:
env.action_space.shape[0]

1

In [38]:
env.action_space.sample()

array([1.2922716], dtype=float32)

In [39]:
state = env.reset()

In [40]:
state

array([-0.14746339, -0.98906751,  0.77418339])

In [112]:
act_net = SACActor(
    env.observation_space.shape[0],
    env.action_space.shape[0]
).to(device)
crt_net = SACCritic(
    env.observation_space.shape[0],
    env.action_space.shape[0]
).to(device)
twinq_net = SACTwinQ(
    env.observation_space.shape[0],
    env.action_space.shape[0]
).to(device)

In [113]:
act_net

SACActor(
  (mu): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Tanh()
  )
)

In [114]:
exp_buffer = NStepPriorityReplayBuffer(
    max_size=30000,
    prob_alpha=0.6,
    beta_start=0.4,
    beta_frames=30000, #100000,
    n_step=8,
    gamma=0.99,
)

In [115]:
agent = Agent(
    env=env,
    exp_buffer=exp_buffer,
    crt_net=crt_net,
    act_net=act_net,
    twinq_net=twinq_net,
    epsilon_start=0.1,
    epsilon_final=0.002,
    epsilon_decay_last_step=50000, #200000,
    # tgt_sync_steps=1000,
    learning_rate_acts=1e-4,
    learning_rate_vals=1e-4,
    device=device
)

In [116]:
act_net(torch.tensor(state).unsqueeze(0).float().to(device))

tensor([[-0.0463]], grad_fn=<TanhBackward0>)

In [117]:
crt_net(
    torch.tensor(state).unsqueeze(0).float().to(device),
)

tensor([[-0.0571]], grad_fn=<AddmmBackward0>)

In [118]:
agent.initial_exploration(n_steps=10000)

In [119]:
!rm -rf /content/video/*

In [120]:
# exp_buffer = NStepPriorityReplayBuffer(
#     max_size=30000,
#     prob_alpha=0.6,
#     beta_start=0.4,
#     beta_frames=30000, #100000,
#     n_step=8,
#     gamma=0.99,
# )
# agent = Agent(
#     env=env,
#     exp_buffer=exp_buffer,
#     crt_net=crt_net,
#     act_net=act_net,
#     twinq_net=twinq_net,
#     epsilon_start=0.1,
#     epsilon_final=0.002,
#     epsilon_decay_last_step=50000, #200000,
#     # tgt_sync_steps=1000,
#     learning_rate_acts=1e-4,
#     learning_rate_vals=1e-4,
#     device=device
# )

episode = 0

while True:

    for stp in range(1):
        done_reward = agent.play_step()
        if done_reward is not None:
            print(f'episode : {episode}, done reward : {done_reward}, total_step : {agent._total_step}, cur_epsilon : {agent._epsilon}')
            episode += 1

    agent.train(n_iter=1, batch_size=64)

episode : 0, done reward : -1758.5510323260667, total_step : 200, cur_epsilon : 0.096
episode : 1, done reward : -1854.8954235927026, total_step : 400, cur_epsilon : 0.092
episode : 2, done reward : -1709.3123307951425, total_step : 600, cur_epsilon : 0.08800000000000001
episode : 3, done reward : -1822.8614335163486, total_step : 800, cur_epsilon : 0.084
episode : 4, done reward : -1657.1650496023972, total_step : 1000, cur_epsilon : 0.08
episode : 5, done reward : -1598.0071886688934, total_step : 1200, cur_epsilon : 0.07600000000000001
episode : 6, done reward : -1579.2583430600257, total_step : 1400, cur_epsilon : 0.07200000000000001
episode : 7, done reward : -1559.2730771077092, total_step : 1600, cur_epsilon : 0.068
episode : 8, done reward : -1588.7631826822753, total_step : 1800, cur_epsilon : 0.064
episode : 9, done reward : -1612.0594843419426, total_step : 2000, cur_epsilon : 0.060000000000000005
episode : 10, done reward : -1595.2397542739136, total_step : 2200, cur_epsilo

KeyboardInterrupt: ignored

In [102]:
# exp_buffer = NStepPriorityReplayBuffer(
#     max_size=30000,
#     prob_alpha=0.6,
#     beta_start=0.4,
#     beta_frames=30000, #100000,
#     n_step=10,
#     gamma=0.99,
# )
# agent = Agent(
#     env=env,
#     exp_buffer=exp_buffer,
#     crt_net=crt_net,
#     act_net=act_net,
#     twinq_net=twinq_net,
#     epsilon_start=0.1,
#     epsilon_final=0.002,
#     epsilon_decay_last_step=50000, #200000,
#     # tgt_sync_steps=1000,
#     learning_rate_acts=5e-3,
#     learning_rate_vals=5e-3,
#     device=device
# )

episode = 0

while True:

    for stp in range(1):
        done_reward = agent.play_step()
        if done_reward is not None:
            print(f'episode : {episode}, done reward : {done_reward}, total_step : {agent._total_step}, cur_epsilon : {agent._epsilon}')
            episode += 1

    agent.train(n_iter=1, batch_size=64)

episode : 0, done reward : -1791.596726363171, total_step : 200, cur_epsilon : 0.096
episode : 1, done reward : -820.7458328369454, total_step : 400, cur_epsilon : 0.092
episode : 2, done reward : -419.2005364950901, total_step : 600, cur_epsilon : 0.08800000000000001
episode : 3, done reward : -1.4888059746269202, total_step : 800, cur_epsilon : 0.084
episode : 4, done reward : -829.1961982065752, total_step : 1000, cur_epsilon : 0.08
episode : 5, done reward : -267.61810227681985, total_step : 1200, cur_epsilon : 0.07600000000000001
episode : 6, done reward : -269.37881113718737, total_step : 1400, cur_epsilon : 0.07200000000000001
episode : 7, done reward : -331.449687215132, total_step : 1600, cur_epsilon : 0.068
episode : 8, done reward : -265.31014021348545, total_step : 1800, cur_epsilon : 0.064
episode : 9, done reward : -268.2124962824307, total_step : 2000, cur_epsilon : 0.060000000000000005
episode : 10, done reward : -556.403191665792, total_step : 2200, cur_epsilon : 0.056

KeyboardInterrupt: ignored