In [None]:
import sqlite3
import datetime
import collections

from matplotlib import pyplot as plt
import numpy as np
import random
import tensorflow as tf
import random
import csv
import os

class QLearningDecisionPolicy():
    """Q-learning decision policy backed by a fully connected network.

    The network (three hidden ReLU layers of 300 units plus an output layer)
    maps a state row vector to one Q-value per action. It is built with the
    TensorFlow 1.x graph API (placeholders and an explicit ``tf.Session``).
    """

    def __init__(self, actions, input_dim):
        """Build the graph, start a session and prepare checkpointing.

        Args:
            actions: Sequence of action labels; the network has one output
                per entry, in the same order.
            input_dim: Number of features in one state vector.
        """
        # Discount factor applied to the best next-state Q-value in update_q
        # (this is not the optimizer learning rate, which is set below).
        self.gamma = 0.001
        # Available actions
        self.actions = actions
        # Number of outputs = number of actions
        output_dim = len(actions)
        # Width of every hidden layer
        h1_dim = 300

        self.x = tf.placeholder(tf.float32, [None, input_dim])
        self.y = tf.placeholder(tf.float32, [output_dim])

        W1 = tf.Variable(tf.random_normal([input_dim, h1_dim]))
        b1 = tf.Variable(tf.constant(0.1, shape=[h1_dim]))
        h1 = tf.nn.relu(tf.matmul(self.x, W1) + b1)

        W2 = tf.Variable(tf.random_normal([h1_dim, h1_dim]))
        b2 = tf.Variable(tf.constant(0.1, shape=[h1_dim]))
        h2 = tf.nn.relu(tf.matmul(h1, W2) + b2)

        W3 = tf.Variable(tf.random_normal([h1_dim, h1_dim]))
        b3 = tf.Variable(tf.constant(0.1, shape=[h1_dim]))
        h3 = tf.nn.relu(tf.matmul(h2, W3) + b3)

        W4 = tf.Variable(tf.random_normal([h1_dim, output_dim]))
        b4 = tf.Variable(tf.constant(0.1, shape=[output_dim]))
        # NOTE(review): the output layer also goes through ReLU, so predicted
        # Q-values can never be negative even though rewards can be. Left
        # unchanged to keep existing checkpoints meaningful -- confirm intent.
        h4 = tf.nn.relu(tf.matmul(h3, W4) + b4)

        self.q = h4
        self.loss = tf.square(self.y - self.q)
        self.train_op = tf.train.AdagradOptimizer(0.01).minimize(self.loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())

        # Upper bound on the probability of acting greedily (see select_action)
        self.eps = 0.9

        # Portfolio value at the last update; baseline for the reward
        self.budget = 0

        # Checkpointing
        self.saver = tf.train.Saver()
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            # __file__ is not defined in interactive / notebook sessions
            base_dir = os.getcwd()
        self.model_dir = os.path.join(base_dir, "models")
        self.model_name = "{}.ckpt".format(self.__class__.__name__)

    def select_action(self, current_state, step):
        """Pick an action for ``current_state``.

        The greedy action (highest Q-value) is chosen with probability
        ``min(self.eps, step / 1000)``; otherwise an action is drawn
        uniformly at random. At ``step == 0`` the choice is always random.

        Args:
            current_state: State fed to the network's input placeholder.
            step: Current step index; larger values favour the greedy choice.

        Returns:
            One element of ``self.actions``.
        """
        # Probability of exploiting the learned Q-values
        threshold = min(self.eps, step / 1000.)

        if random.random() < threshold:
            # Exploit: take the action with the largest Q-value
            action_q_vals = self.sess.run(self.q, feed_dict={self.x: current_state})
            action_idx = np.argmax(action_q_vals)  # TODO: replace w/ tensorflow's argmax
            action = self.actions[action_idx]
        else:
            # Explore: take a uniformly random action
            action = self.actions[random.randint(0, len(self.actions) - 1)]

        return action

    def update_q(self, transitions, current_portfolio):
        """Train the network on a batch of recorded transitions.

        The portfolio change since the previous update is split evenly over
        the transitions and used as the reward of each one.

        Args:
            transitions: Sequence of ``[state, action, next_state]`` items,
                where ``action`` is an element of ``self.actions``.
            current_portfolio: Portfolio value after the latest action; it
                becomes the new reward baseline (``self.budget``).
        """
        if not transitions:
            # Nothing to learn from; only move the reward baseline.
            self.budget = current_portfolio
            return

        # Gain realised since the last update, shared by all transitions
        reward = (current_portfolio - self.budget) / len(transitions)

        for state, action, next_state in transitions:

            # Q-values predicted for the current state
            action_q_vals = self.sess.run(self.q, feed_dict={self.x: state})
            # Q-values predicted for the next state
            next_action_q_vals = self.sess.run(self.q, feed_dict={self.x: next_state})
            # Best action available from the next state
            next_action_idx = np.argmax(next_action_q_vals)
            # The target must be written to the action that was actually
            # taken, not to the next state's best action.
            action_idx = self.actions.index(action)
            action_q_vals[0, action_idx] = reward + self.gamma * next_action_q_vals[0, next_action_idx]
            # Drop the batch dimension to match the shape of self.y
            action_q_vals = np.squeeze(np.asarray(action_q_vals))
            # One training step towards the updated target
            self.sess.run(self.train_op, feed_dict={self.x: state, self.y: action_q_vals})

        self.budget = current_portfolio

    def load_model(self, model_path=None):
        """Restore network weights.

        Args:
            model_path: Explicit checkpoint path. When omitted, the latest
                checkpoint recorded in ``self.model_dir`` is restored, and
                nothing happens if there is none.
        """
        if model_path:
            # Load from the explicit path
            self.saver.restore(self.sess, model_path)
        else:
            # Load from the latest checkpoint, if any
            checkpoint = tf.train.get_checkpoint_state(self.model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint.model_checkpoint_path)

    def save_model(self):
        """Write the current weights to ``self.model_dir``."""
        # Create the target directory first so saving cannot fail on a missing folder.
        os.makedirs(self.model_dir, exist_ok=True)
        self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))



def _build_state(stock_prices, start, hist, budget, num_stocks):
    """Return the state row for the ``hist`` rows starting at index ``start``."""
    features = []
    for rows in stock_prices.values():
        for j in range(start, start + hist):
            row = rows[j]
            # Columns 2..6 are read as Open, High, Low, Close, Volume
            features.extend([row[2], row[3], row[4], row[5], row[6]])
    return np.asmatrix(np.hstack((features, budget, num_stocks)))


def run_simulation(policy, initial_budget, initial_num_stocks, stock_prices, hist, debug=False,
                   target_symbol='t6326'):
    """Run one pass over the price history, trading a single symbol.

    At every step the policy sees ``hist`` rows of every symbol plus the
    current cash and share count, chooses Buy / Sell / Hold, and the trade is
    executed in lots of 100 shares. Whenever a sale is executed the policy is
    trained on the transitions recorded so far.

    Args:
        policy: Object providing ``select_action(state, step)`` and
            ``update_q(transitions, portfolio)``.
        initial_budget: Starting cash.
        initial_num_stocks: Starting number of shares of the traded symbol.
        stock_prices: Mapping of symbol to a sequence of price rows.
        hist: Number of rows per symbol included in a state.
        debug: When true, print the final cash and share count.
        target_symbol: Key of the traded symbol in ``stock_prices``
            (defaults to 't6326', Kubota per the original comment).

    Returns:
        Final portfolio value: cash plus shares valued at the last close
        that was read.
    """
    lot_size = 100  # shares are bought and sold in lots of 100

    budget = initial_budget
    num_stocks = initial_num_stocks
    share_value = 0
    transitions = []

    target = stock_prices[target_symbol]
    num_steps = len(target) - hist - 1

    next_state = None

    for i in range(num_steps):

        # The state built after the previous action is reused; it is only
        # computed from scratch on the first step.
        if next_state is not None:
            current_state = next_state
        else:
            current_state = _build_state(stock_prices, i, hist, budget, num_stocks)

        # Portfolio value before acting
        current_portfolio = budget + num_stocks * share_value

        # Progress log
        if i % 100 == 0:
            print('progress {:.2f}%, portfolio {}'.format(float(100*i) / num_steps, current_portfolio))

        # Choose an action
        action = policy.select_action(current_state, i)

        # Prices of the trading day
        today = target[i + hist + 1]
        buy_value = float(today[3])    # pessimistic fill: buy at the day's high
        sell_value = float(today[4])   # pessimistic fill: sell at the day's low
        share_value = float(today[5])  # holdings are valued at the close

        # Execute the action; anything that cannot be executed becomes Hold
        if action == 'Buy' and budget >= buy_value * lot_size:
            budget -= buy_value * lot_size
            num_stocks += lot_size

        elif action == 'Sell' and num_stocks >= lot_size:
            # Require a full lot so the share count can never go negative
            budget += sell_value * lot_size
            num_stocks -= lot_size

        else:
            action = 'Hold'

        # Portfolio value after acting
        new_portfolio = budget + num_stocks * share_value

        # State for the next step (window shifted by one row)
        next_state = _build_state(stock_prices, i + 1, hist, budget, num_stocks)

        # Keep transitions while a position is open (or was just sold);
        # otherwise start collecting afresh.
        if action == 'Sell' or num_stocks > 0:
            transitions.append([current_state, action, next_state])
        else:
            transitions = []

        # Train whenever a sale was executed
        if action == 'Sell':
            policy.update_q(transitions, new_portfolio)

    # Final portfolio value
    portfolio = budget + num_stocks * share_value

    if debug:
        print('${}\t{} shares'.format(budget, num_stocks))

    return portfolio


def run_simulations(policy, budget, num_stocks, stock_prices, hist, num_tries=10):
    """Run several simulations and summarise the final portfolio values.

    Args:
        policy: Decision policy passed through to ``run_simulation``.
        budget: Starting cash for every run.
        num_stocks: Starting share count for every run.
        stock_prices: Mapping of symbol to price rows.
        hist: Number of rows per symbol included in a state.
        num_tries: Number of simulation runs (defaults to 10).

    Returns:
        Tuple ``(avg, std)``: mean and standard deviation of the final
        portfolio values over all runs.
    """
    # Final portfolio value of each run
    final_portfolios = [
        run_simulation(policy, budget, num_stocks, stock_prices, hist)
        for _ in range(num_tries)
    ]

    avg = np.mean(final_portfolios)
    std = np.std(final_portfolios)

    return avg, std


def get_prices(start_date, end_date, dbpath='tpix100db.sqlite'):
    """Load price rows for every symbol that covers the requested period.

    Every table in the SQLite database is treated as one symbol. Row ids are
    day offsets from 1983-01-01. A symbol is kept only if it has a row on
    both the start and the end day and has at least as many rows in the
    period as the first qualifying symbol.

    Args:
        start_date: First day of the period, formatted ``YYYY-MM-DD``.
        end_date: Last day of the period, formatted ``YYYY-MM-DD``.
        dbpath: Path of the SQLite database file.

    Returns:
        ``collections.OrderedDict`` mapping table name to the list of rows
        with ``id`` between the two days, ordered by ``id``.
    """
    stock_prices = collections.OrderedDict()

    base_date = datetime.datetime.strptime('1983/1/1', "%Y/%m/%d")
    start_id = (datetime.datetime.strptime(start_date, "%Y-%m-%d") - base_date).days
    end_id = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - base_date).days

    connection = sqlite3.connect(dbpath)
    try:
        cursor = connection.cursor()

        # Every table stored in the database is one symbol
        cursor.execute("select name from sqlite_master where type='table'")
        symbols = [row[0] for row in cursor.fetchall()]

        # Row count of the first qualifying symbol; 0 until one is found
        reference_count = 0
        for symbol in symbols:

            # Table names cannot be bound as parameters, so quote them as
            # identifiers; the id values are bound normally.
            table = '"{}"'.format(symbol.replace('"', '""'))

            # Skip symbols that have no row on the start day or the end day
            count_sql = 'select COUNT(*) from {} where id = ?'.format(table)
            has_start = cursor.execute(count_sql, (start_id,)).fetchone()[0] > 0
            has_end = cursor.execute(count_sql, (end_id,)).fetchone()[0] > 0
            if not (has_start and has_end):
                continue

            # Rows of the period, in id order because callers index by position
            cursor.execute(
                'SELECT * FROM {} WHERE id BETWEEN ? AND ? ORDER BY id'.format(table),
                (start_id, end_id))
            rows = cursor.fetchall()

            # The first qualifying symbol defines the reference row count and
            # is itself part of the result.
            if reference_count == 0:
                reference_count = len(rows)

            # NOTE(review): the original comment says symbols whose count does
            # not match the first one are excluded, but the comparison only
            # drops symbols with fewer rows -- confirm which is intended.
            if reference_count <= len(rows):
                stock_prices[symbol] = rows
    finally:
        # Always release the connection, even if a query fails
        connection.close()

    return stock_prices


def plot_prices(prices: list):
    """Draw ``prices`` as a line chart and save it as ``prices.png``.

    Args:
        prices: Sequence of values plotted against their index (day).
    """
    # Draw the series first, then decorate the axes
    plt.plot(prices)
    plt.ylabel('price ($)')
    plt.xlabel('day')
    plt.title('Opening stock prices')

    # Write the figure to the working directory
    plt.savefig('prices.png')


if __name__ == '__main__':

    # Look-back window: number of rows per symbol in one state
    hist = 200

    # Available actions
    actions = ['Buy', 'Sell', 'Hold']

    # Load the price rows of every symbol for the period
    # stock_prices = get_prices('1992-07-22', '2016-07-22')
    stock_prices = get_prices('1988-10-01', '2001-07-30')
    num_prices = len(stock_prices)

    # Plot the closing prices (column 5) of the traded symbol
    target_prices = stock_prices['t6326']
    plot_prices([row[5] for row in target_prices])

    # Inputs = window * symbols * len(Open, High, Low, Close, Volume) + cash + shares held
    input_dim = (hist * num_prices * 5) + 2

    # Build the neural network and restore a saved model if one exists
    policy = QLearningDecisionPolicy(actions, input_dim)
    policy.load_model()

    # Starting cash and starting share count of every round
    budget = 10000000.0
    num_stocks = 0

    # Repeat until the average final portfolio is at least double the starting cash
    while True:

        # Reset the policy's reward baseline for the new round
        policy.budget = budget

        # Run the simulations
        avg, std = run_simulations(policy, budget, num_stocks, stock_prices, hist)

        # Save the model after every round
        policy.save_model()

        if not avg < 20000000.0:
            break

    print("学習後の資産の平均{}, 学習後の資産の標準偏差{}".format(avg, std))

