In [34]:
import tensorflow as tf
import numpy as np

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from db import DB
from datetime import datetime

In [None]:
env = gym.make('CartPole-v1')

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      obs = env.reset()

env.close()

In [17]:
mydb = DB(20201114, 4)
EXC = ['bitflyer', 'liquid', 'zaif', 'coincheck']
baseTime = datetime(2020, 11, 14, 0, 0, 0).timestamp()

def ml(func, array):
    return list(map(func, array))

In [41]:
import gym
from gym import spaces

NONE = 0
BUY = 1
SELL = 2

BUYAMOUNT = 0.01
DIVP = 1000000
EXLEN = len(EXC)

class CustomEnv(gym.Env):
  """Custom Environment that follows gym interface"""
  metadata = {'render.modes': ['console']}

  def __init__(self):
    super(CustomEnv, self).__init__()
    n_actions = 9
    self.action_space = spaces.Discrete(n_actions)
    self.observation_space = spaces.Box(low=0, high=100,
                                        shape=(968,),dtype=np.float32)
    self.asks = {}
    self.bids = {}
    self.maxCnt = 0
    for ex in EXC:
      prices = mydb.get(ex, baseTime, baseTime + 60*60*24)
      # example for self.asks[ex]
      # [[[1701256.0, 0.0001], [1701501.0, 0.005], [1701505.0, 0.01015336]],
      # [[1701316.0, 0.01], [1701494.0, 0.01275137], [1701500.0, 0.02]],
      # [[1702132.0, 0.02440992], [1702180.0, 0.1], [1702290.0, 0.08]]]
      self.asks[ex] = ml(lambda p:p['asks'], prices)
      self.bids[ex] = ml(lambda p:p['bids'], prices)
      self.maxCnt = max([self.maxCnt, len(self.asks[ex])])
    self.initVal()

  def initVal(self):
    self.episode_ended = False
    # ランダム時間から開始
    self.stepCnt = np.random.randint(0, self.maxCnt-129)
    self.startCnt = self.stepCnt
    # ランダムな所持金で開始
    jpy = (np.random.randint(10000, 1000000, EXLEN) / DIVP).tolist()
    btc = (abs(np.random.randn(EXLEN))+0.01).tolist()
    self.state = jpy + btc + [0]*960
    self.step(0)
    self.initJPY = self.valuationJPY(self.state)
    self.initBTC = self.valuationBTC(self.state)

  def reset(self):
    self.initVal()
    return np.array(self.state).astype(np.float32)

  # 円換算評価額
  def valuationJPY(self, state):
    total = 0
    for i in range(EXLEN):
      jpy = self.state[i]
      btc = self.state[EXLEN + i]
      ask = self.state[int(EXLEN*2)]
      #total += jpy + btc * ask
      total += jpy
    return total

  # BTC換算評価額
  def valuationBTC(self, state):
    total = 0
    for i in range(EXLEN):
      jpy = self.state[i]
      btc = self.state[EXLEN + i]
      ask = self.state[int(EXLEN*2)]
      #total += btc
      if ask > 0:
        total += jpy / ask
    return total

  def step(self, action):
    cnt = self.stepCnt
    # stateから情報を抜出
    jpyBal = self.state[0:EXLEN]
    btcBal = self.state[EXLEN:EXLEN*2]
    board = self.state[EXLEN*2:EXLEN*2+960]

    # 1フレーム前の板情報を反映
    frameSize = int(len(board) / 20)
    board[frameSize:len(board)] = board[0:(len(board)-frameSize)]

    # 最新板情報を更新
    cur = 0
    for ex in EXC:
      for ab in [self.asks, self.bids]:
        for dep in range(3):
          # 金額
          board[cur] = ab[ex][cnt:cnt+1][0][dep][0]
          cur += 1
          # 量
          board[cur] = ab[ex][cnt:cnt+1][0][dep][1]
          cur += 1

    print(action)
    # Make sure episodes don't go on forever.
    if action != 0:
      # 売買 次のフレームの価格で購入できる
      exidx = (action - 1) % EXLEN
      ex = EXC[exidx]
      buy = action <= EXLEN
      if buy:
        target = self.asks[ex][cnt+1:cnt+2][0]
        # 買える分だけ買う
        remain = BUYAMOUNT
        for dep in range(3):
          price = target[dep][0]
          amount = target[dep][1]
          if remain <= amount:
            amount = remain
          jpy = price * amount
          if jpyBal[exidx] < jpy:
            break
          jpyBal[exidx] -= jpy
          btcBal[exidx] += amount
          remain -= amount
          if remain <= 0:
            break
      else:
        target = self.bids[ex][cnt+1:cnt+2][0]
        # 売れる分だけ売る
        remain = BUYAMOUNT
        for dep in range(3):
          price = target[dep][0]
          amount = target[dep][1]
          if remain <= amount:
            amount = remain
          if btcBal[exidx] < amount:
            break
          jpyBal[exidx] += price * amount
          btcBal[exidx] -= amount
          remain -= amount
          if remain <= 0:
            break

    # state更新
    self.state = jpyBal + btcBal + board

    # 3秒時間を進める
    self.stepCnt += 1
    if self.stepCnt >= self.maxCnt:
      self.episode_ended = True

    # 最大128フレームで終了
    frameCnt = self.stepCnt - self.startCnt
    if frameCnt > 128:
      self.episode_ended = True
    
    reward = 1
    if frameCnt > 1:
      jpy = self.valuationJPY(self.state)
      btc = self.valuationBTC(self.state)
      # 初期所持金から何倍増えたかが報酬
      reward = jpy / self.initJPY + btc / self.initBTC - 2

    # 5%所持金が減ったら強制終了
    #if reward < -0.1:
    #  self._episode_ended = True

    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array(self.state).astype(np.float32), reward*128, self.episode_ended, info

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot
    print("render")

  def close(self):
    pass

In [35]:
def evaluate(model, num_episodes=100):
    """
    RLエージェントを評価
    :param model: (BaseRLModel object) RLエージェント
    :param num_episodes: (int) エピソード数
    :return: (float) 平均報酬
    """
    # この関数は単一の環境でのみ機能します
    env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # _statesは、LSTMポリシーを使用する場合にのみ有用です
            action, _states = model.predict(obs)

            # ベクトル化環境を使用しているため、行動、報酬、エピソード完了は配列です
            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)

        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

In [42]:
env = CustomEnv()
check_env(env, warn=True)

0
0
2
7
7
1
8
8
7
4
8
1
7


In [40]:
env = CustomEnv()
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)
mean_reward = evaluate(model, num_episodes=100)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
      obs = env.reset()

env.close()

Using cuda device
Wrapping the env in a DummyVecEnv.
-----------------------------------------
| time/                   |             |
|    fps                  | 866         |
|    iterations           | 1           |
|    time_elapsed         | 2           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.002040766 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.1        |
|    explained_variance   | -1.83e+08   |
|    learning_rate        | 0.0003      |
|    loss                 | 5.05e+05    |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.00163    |
|    value_loss           | 9.49e+05    |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 595         |
|    iterations           | 2           |
|    time_elapsed      

In [38]:
mean_reward = evaluate(model, num_episodes=100)

Mean reward: 6621.2793 Num episodes: 100
