In [None]:
import gym
import gym.spaces
import numpy as np
import pandas as pd

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import rl.callbacks

In [None]:
class BuySellHold(gym.core.Env):
    """Single-asset trading environment with nine discrete actions.

    Actions:
        0-3: buy using all / one half / one third / one quarter of the cash.
        4:   hold.
        5:   sell every share that is held.
        6-8: sell one half / one third / one quarter of ``share``.

    An observation is the current row of ``df`` followed by the previous
    ``window - 1`` rows (newest first), flattened into one vector.

    ``share`` and ``average_cost`` start at the placeholder value 1 (this keeps
    the average-cost division away from zero), so ``share == 1`` means that
    nothing is held.
    """

    # Price history; read once, when the class body is executed.
    df = pd.read_csv('train.csv', index_col=0)

    # Number of consecutive rows stacked into one observation.
    window = 20

    # Positional index of the column used as the trade price.
    # NOTE(review): presumably the close price (the original local was named
    # `obs_close`) -- confirm against the CSV layout.
    price_col = 4

    def __init__(self):
        # 2N+1 actions: N buy sizes, N sell sizes and hold (N = 4).
        self.action_space = gym.spaces.Discrete(9)

        # Upper bound: the per-column maximum repeated once per stacked row.
        high = np.tile(self.df.max().values, self.window)
        # Lower bound is 0 everywhere; sized from `high` so the space follows
        # the number of columns instead of a hard-coded 280 (= 20 x 14).
        low = np.zeros(high.shape)
        self.observation_space = gym.spaces.Box(low=low, high=high)

        # Mean of the 'adjclose' column; used to scale the penalty rewards.
        self.sp_mean = self.df['adjclose'].mean()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.buy_power = 1000000
        self.share = 1          # placeholder: no real holdings yet
        self.average_cost = 1   # placeholder matching the placeholder share
        # The first `window - 1` rows only serve as history for the first state.
        self.steps = self.window - 1
        self.done = False
        return self._observe()

    def step(self, action):
        """Apply ``action`` at the current row and advance one row.

        Returns:
            tuple: ``(observation, reward, done, info)`` where ``info`` is an
            empty dict.
        """
        price = self.df.iloc[self.steps, self.price_col]

        # Snapshot the pre-trade state: the penalties below are about what the
        # agent *tried* to do, so they must not look at post-trade values.
        cash_before = self.buy_power
        held_nothing = self.share == 1

        if 0 <= action <= 3:
            # Action 0 spends all cash; actions 1..3 spend 1/2, 1/3, 1/4 of it.
            budget = self.buy_power if action == 0 else self.buy_power // (action + 1)
            self._buy(budget // price, price)
        elif action == 5:
            # Sell all real holdings; the placeholder share is not sellable.
            self._sell(self.share - 1, price)
        elif 6 <= action <= 8:
            # Actions 6..8 sell 1/2, 1/3, 1/4 of `share`.
            self._sell(self.share // (action - 4), price)
        # action == 4 (hold) trades nothing.

        if 5 <= action <= 8 and held_nothing:
            # Penalty: tried to sell without holding anything.
            reward = -self.sp_mean / 2
        elif 0 <= action <= 3 and cash_before < price:
            # Penalty: tried to buy without enough cash for a single share.
            reward = -self.sp_mean
        elif 0 <= action <= 4:
            # Ordinary buy, or hold.
            reward = 0
        else:
            # Sale: per-share gain over the average acquisition cost.
            reward = price - self.average_cost

        # After a sell-all, go back to the placeholder position.
        if action == 5:
            self.share = 1
            self.average_cost = 1

        self.steps += 1
        observation = self._observe()
        self.done = self._is_done()
        return observation, reward, self.done, {}

    def _buy(self, quantity, price):
        """Pay for ``quantity`` shares at ``price`` and update the position."""
        self.buy_power -= price * quantity
        # NOTE(review): floor division truncates the average cost -- kept as in
        # the original; confirm whether true division was intended.
        self.average_cost = (
            (self.average_cost * self.share + price * quantity)
            // (self.share + quantity)
        )
        self.share += quantity

    def _sell(self, quantity, price):
        """Credit the proceeds of ``quantity`` shares sold at ``price``."""
        self.buy_power += price * quantity
        self.share -= quantity

    def _is_done(self):
        """Return True when cash is negative or the last row has been reached."""
        return bool(self.buy_power < 0 or self.steps >= len(self.df) - 1)

    def _observe(self):
        """Return the current row and the ``window - 1`` rows before it, newest first, flattened."""
        oldest = self.steps - (self.window - 1)
        rows = self.df.iloc[oldest:self.steps + 1].values
        # Reverse so the current row comes first, then flatten row by row.
        return rows[::-1].ravel()

In [None]:
class EpisodeLogger(rl.callbacks.Callback):
    """Callback that records each step's observation, reward and action.

    The three dicts map an episode number to the list of values seen during
    that episode, in step order.
    """

    def __init__(self):
        # Let the base callback set up its own state before adding ours.
        super(EpisodeLogger, self).__init__()
        self.observations = {}
        self.rewards = {}
        self.actions = {}

    def on_episode_begin(self, episode, logs):
        """Create (or reset) the empty per-episode lists."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []

    def on_step_end(self, step, logs):
        """Append this step's observation, reward and action to the episode named in ``logs``."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])

# Logger instance handed to the training run.
cb_ep = EpisodeLogger()

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

# Tunable settings for the network, the agent and the training run.
HIDDEN_UNITS = 50
MEMORY_LIMIT = 50000
WINDOW_LENGTH = 1
EPSILON = 0.1
WARMUP_STEPS = 50
TARGET_MODEL_UPDATE = 1e-2
LEARNING_RATE = 1e-3
TRAIN_STEPS = 50000
MAX_EPISODE_STEPS = 2000

env = BuySellHold()
nb_actions = env.action_space.n

# Q-network: flattened observation -> two hidden ReLU layers -> one linear
# output per action.
model = Sequential()
model.add(Flatten(input_shape=(WINDOW_LENGTH,) + env.observation_space.shape))
model.add(Dense(HIDDEN_UNITS))
model.add(Activation('relu'))
model.add(Dense(HIDDEN_UNITS))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
model.summary()

# Memory used for experience replay.
memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)
# Plain epsilon-greedy action selection.
policy = EpsGreedyQPolicy(eps=EPSILON)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=WARMUP_STEPS,
               target_model_update=TARGET_MODEL_UPDATE, policy=policy)
dqn.compile(Adam(lr=LEARNING_RATE), metrics=['mae'])

# Train; cb_ep collects the per-step observations, rewards and actions.
history = dqn.fit(env, nb_steps=TRAIN_STEPS, visualize=False, verbose=2,
                  nb_max_episode_steps=MAX_EPISODE_STEPS, callbacks=[cb_ep])