In [None]:
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import gym
import gym.spaces
import numpy as np
import pandas as pd

In [None]:
class BuySellHold(gym.core.Env):
    """Single-asset trading environment with buy / hold / sell actions.

    Actions (``Discrete(9)``):

    * 0-3: buy with all, 1/2, 1/3 or 1/4 of the current buying power
    * 4:   hold
    * 5-8: sell all, 1/2, 1/3 or 1/4 of the shares currently held

    An observation is the last ``WINDOW`` rows of ``df`` flattened into one
    vector, newest row first.

    ``share`` starts at 1 rather than 0: it is a placeholder that keeps the
    average-cost division well defined, so ``share == 1`` means "no real
    position".
    NOTE(review): the placeholder share is included in sell quantities and in
    the average-cost calculation, exactly as in the original implementation;
    confirm whether that is intended.
    """

    # Price history, loaded once when the class body is executed.
    df = pd.read_csv('train.csv', index_col=0)

    WINDOW = 20                  # number of rows stacked into one observation
    CLOSE_COLUMN = 4             # positional index of the price column used for trading
    INITIAL_BUY_POWER = 1000000  # cash available at the start of every episode

    SELL_ALL = 5
    # action id -> divisor applied to buying power (buys) or held shares (sells)
    _BUY_DIVISORS = {0: 1, 1: 2, 2: 3, 3: 4}
    _SELL_DIVISORS = {5: 1, 6: 2, 7: 3, 8: 4}

    def __init__(self):
        """Build the action/observation spaces from the loaded price data."""
        # buy x4, hold, sell x4
        self.action_space = gym.spaces.Discrete(9)

        # Upper bound: per-column maximum repeated once per stacked row.
        # Lower bound: zeros of the same length (derived from `high` so the
        # bounds always match regardless of the number of columns).
        high = np.tile(self.df.max().values, self.WINDOW)
        low = np.zeros(high.shape)
        self.observation_space = gym.spaces.Box(low=low, high=high)

        # Mean of the 'adjclose' column; used to scale the penalties in step().
        self.sp_mean = self.df['adjclose'].mean()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.buy_power = self.INITIAL_BUY_POWER
        self.share = 1
        self.average_cost = 1
        # The first observation needs WINDOW rows of history, so start at the
        # last row of the first full window.
        self.steps = self.WINDOW - 1
        self.done = False
        return self._observe()

    def step(self, action):
        """Apply ``action`` at the current row's price and advance one row.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty
            dict. Any action value outside 0-8 is treated like hold.
        """
        action = int(action)
        price = self.df.iloc[self.steps].iloc[self.CLOSE_COLUMN]

        # The penalties depend on the state *before* the trade is executed.
        cash_before = self.buy_power
        shares_before = self.share

        if action in self._BUY_DIVISORS:
            self._buy(price, self._BUY_DIVISORS[action])
        elif action in self._SELL_DIVISORS:
            self._sell(price, self._SELL_DIVISORS[action])

        reward = self._reward(action, price, cash_before, shares_before)

        # After selling everything, go back to the placeholder position.
        # Done after the reward so the reward still sees the real average cost.
        if action == self.SELL_ALL:
            self.share = 1
            self.average_cost = 1

        self.steps += 1
        observation = self._observe()
        self.done = self._is_done()
        return observation, reward, self.done, {}

    def _buy(self, price, divisor):
        """Buy as many shares as 1/``divisor`` of the buying power affords."""
        # divisor == 1 uses the buying power as-is (no extra floor step).
        budget = self.buy_power if divisor == 1 else self.buy_power // divisor
        bought = budget // price
        self.buy_power -= price * bought
        # Floor division is kept from the original average-cost formula.
        self.average_cost = (
            (self.average_cost * self.share + price * bought)
            // (self.share + bought)
        )
        self.share += bought

    def _sell(self, price, divisor):
        """Sell 1/``divisor`` of the held shares (all of them for 1)."""
        sold = self.share if divisor == 1 else self.share // divisor
        self.buy_power += price * sold
        self.share -= sold

    def _reward(self, action, price, cash_before, shares_before):
        """Return the reward for ``action`` given the pre-trade state.

        * sell with no real position (only the placeholder share):
          ``-sp_mean / 2``
        * sell otherwise: ``price - average_cost``
        * buy when the pre-trade buying power cannot pay for one share:
          ``-sp_mean``
        * any other buy, or hold: ``0``
        """
        if action in self._SELL_DIVISORS:
            if shares_before == 1:
                return self.sp_mean * -1 / 2
            return price - self.average_cost
        # Checked before the neutral case; otherwise this branch can never run.
        if action in self._BUY_DIVISORS and cash_before < price:
            return self.sp_mean * -1
        return 0

    def _is_done(self):
        """Return True when cash is negative or the last row was reached."""
        return bool(self.buy_power < 0 or self.steps >= len(self.df) - 1)

    def _observe(self):
        """Return the last ``WINDOW`` rows as one flat array, newest first."""
        # Explicit positions (steps, steps-1, ...) keep the newest-first order
        # and still raise IndexError if `steps` runs past the end of the data.
        rows = np.arange(self.steps, self.steps - self.WINDOW, -1)
        return self.df.iloc[rows].values.ravel()

In [None]:
# Smoke test: build the environment and take one randomly sampled action.
# `env` is reused by the later cells.
env = BuySellHold()
obs = env.reset()

# Sample one action uniformly from the discrete action space and apply it.
action = env.action_space.sample()
obs, r, done, info = env.step(action)

In [None]:
class QFunction(chainer.Chain):
    """Three-layer fully connected network that outputs one Q-value per action.

    Args:
        obs_size: Length of the flat observation vector fed to the first layer.
        n_actions: Number of discrete actions (size of the output layer).
        n_hidden_channels: Width of both hidden layers.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super().__init__()
        # Links must be created inside init_scope() so that the chain
        # registers their parameters.
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        """Run the network on ``x`` and wrap the output as discrete Q-values.

        ``test`` is accepted but not used by this implementation.
        """
        hidden = x
        # Both hidden layers are followed by a ReLU; the output layer is linear.
        for layer in (self.l0, self.l1):
            hidden = F.relu(layer(hidden))
        q_values = self.l2(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

# Size the network from the environment: one input per observation element
# and one output per discrete action.
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
q_func = QFunction(obs_size, n_actions)

In [None]:
# Use Adam to optimize q_func. eps=1e-2 is for stability.
optimizer = chainer.optimizers.Adam(eps=1e-2)
# Attach the optimizer to the Q-function so that it updates its parameters.
optimizer.setup(q_func)

In [None]:
# Set the discount factor that discounts future rewards.
gamma = 0.95

# Use epsilon-greedy for exploration: with probability epsilon a random
# action is drawn from the environment's action space.
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.1, random_action_func=env.action_space.sample)

# DQN uses Experience Replay.
# Specify a replay buffer and its capacity.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

# Observations from BuySellHold are built from a pandas DataFrame and are
# presumably numpy.float64 (depends on the CSV column dtypes), while
# Chainer only accepts numpy.float32 by default, so specify
# a converter as a feature extractor function phi.
phi = lambda x: x.astype(np.float32, copy=False)

# Now create an agent that will interact with the environment.
agent = chainerrl.agents.DoubleDQN(
    q_func, optimizer, replay_buffer, gamma, explorer,
    replay_start_size=500, update_interval=1,
    target_update_interval=100, phi=phi)

In [None]:
# Train the agent for a fixed number of episodes, each capped at
# max_episode_len steps.
n_episodes = 250
max_episode_len = 2000
for i in range(1, n_episodes + 1):
    obs = env.reset()
    reward = 0
    done = False
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while not done and t < max_episode_len:
        # Uncomment to watch the behaviour
        # (NOTE: BuySellHold does not define render() itself.)
        # env.render()
        # Pass the previous step's reward together with the new observation.
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
    # Report progress every 5 episodes.
    if i % 5 == 0:
        print('episode:', i,
              'R:', R,
              'statistics:', agent.get_statistics())
    # Hand the final transition of the episode to the agent.
    agent.stop_episode_and_train(obs, reward, done)
print('Finished.')