<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/simple_rl_random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from google.colab import drive

from datetime import datetime

# Mount Google Drive so files stored there are readable from this Colab runtime.
drive.mount('/content/drive/')

# Build the path of the CSV file inside the mounted drive.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + 'data_csv_sp500.csv'

# Load the CSV and parse the 'Date' column (expected as YYYY-MM-DD) in one chain.
df = pd.read_csv(nov_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d')
)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
class Environment:
    """Single-asset trading environment with a Gym-style ``reset``/``step`` API.

    The environment walks through the ``'SP500'`` column of the supplied frame,
    one row per step. Observations are length-3 float arrays laid out as
    ``[stock_owned, current_price, cash_in_hand]``.

    Actions (see ``action_space``):
        0 = sell every share currently held
        1 = hold
        2 = buy whole shares, one at a time, while cash exceeds the price
    """

    def __init__(self, df, initial_money=1000):
        """Store the price data and put the environment in its start state.

        Args:
            df: DataFrame with an ``'SP500'`` price column. Rows containing
                any NaN are dropped before use.
            initial_money: Cash available at the start of every episode.
        """
        # reset_index() gives a 0..n-1 integer index so that
        # ``self.df.loc[step, 'SP500']`` can be addressed by step number.
        self.df = df.dropna().reset_index()
        self.df_total_steps = len(self.df) - 1
        self.initial_money = initial_money
        self.action_space = np.array([0, 1, 2])
        self.stock_owned = None
        self.current_price = None
        self.cash_in_hand = None

        self.reset()

    def reset(self):
        """Rewind to the first row with no shares and ``initial_money`` cash.

        Returns:
            The initial observation ``[stock_owned, current_price, cash_in_hand]``.
        """
        self.end_step = self.df_total_steps
        self.current_step = 0
        self.stock_owned = 0.0
        self.current_price = self.df.loc[self.current_step, 'SP500']
        self.cash_in_hand = self.initial_money

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` at the new price, and score it.

        Args:
            action: 0 (sell), 1 (hold) or 2 (buy); any other value is a no-op.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is the change in
            portfolio value since the previous step, ``done`` is True once the
            last row has been reached, and ``info`` is ``{'cur_val': value}``.

        Note:
            Calling ``step`` again after ``done`` is not guarded; the price
            lookup for the non-existent row fails.
        """
        # Value the portfolio at the price of the step we are leaving. This has
        # to happen BEFORE the price moves: a trade converts cash <-> shares at
        # a single price and is value-neutral, so measuring both values at the
        # new price would make the reward identically zero.
        prev_val = self._get_val()

        # move to the next row and pick up its price
        self.current_step += 1
        self.current_price = self.df.loc[self.current_step, 'SP500']

        # perform the trade at the new price
        self._trade(action)

        # get the new value after taking the action
        cur_val = self._get_val()

        # reward is the increase in portfolio value
        reward = cur_val - prev_val

        # done once we have consumed the last row of data
        done = bool(self.current_step >= self.end_step)

        # store the current value of the portfolio here
        info = {'cur_val': cur_val}

        # conform to the Gym API
        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return the observation ``[stock_owned, current_price, cash_in_hand]``."""
        state = np.empty(3)
        state[0] = self.stock_owned
        state[1] = self.current_price
        state[2] = self.cash_in_hand
        return state

    def _get_val(self):
        """Return the portfolio value: shares at the current price plus cash."""
        return self.stock_owned * self.current_price + self.cash_in_hand

    def _trade(self, action):
        """Apply ``action`` to the holdings at the current price.

        0 = sell
        1 = hold
        2 = buy
        """
        if action == 0:  # sell
            # Note: to keep the problem simple, selling liquidates every share.
            self.cash_in_hand += self.current_price * self.stock_owned
            self.stock_owned = 0
        elif action == 2:  # buy
            price = self.current_price
            # A zero or negative price would never exhaust the cash, so the
            # loop below would not terminate; treat such a price as untradeable.
            if price <= 0:
                return
            # Note: buy one share at a time until the remaining cash no longer
            # exceeds the price of a share.
            while self.cash_in_hand > price:
                self.stock_owned += 1  # buy one share
                self.cash_in_hand -= price

In [3]:
def play_one_episode(env, train_episodes=50):
    """Run ``train_episodes`` episodes on ``env`` with a uniformly random policy.

    Each episode resets the environment and then repeatedly draws a random
    action from {0, 1, 2} and passes it to ``env.step`` until the environment
    reports ``done`` or ``env.current_step`` reaches ``env.end_step``. One
    summary line (final portfolio value and wall-clock duration) is printed
    per episode. Nothing is returned.

    Args:
        env: Object exposing ``reset()``, ``step(action)``, ``current_step``
            and ``end_step``. ``step`` must return ``(state, reward, done,
            info)`` with ``info['cur_val']`` holding the portfolio value.
        train_episodes: Number of episodes to play.
    """
    for episode in range(train_episodes):
        env.reset()
        started_at = datetime.now()

        while True:
            # Random policy: draw one of the actions 0, 1, 2 with equal chance.
            (action,) = np.random.randint(3, size=1)

            # step() yields (next_state, reward, done, info), where next_state
            # is [shares owned, current price, cash in hand] and info carries
            # the portfolio value under 'cur_val'. Only done/info are used here.
            _, _, done, info = env.step(action)

            # Stop when the environment says so or the last data row is reached.
            if done or env.current_step == env.end_step:
                break

        dt = datetime.now() - started_at
        print(f"episode: {episode + 1}/{train_episodes}, episode end value: {info['cur_val']:.2f}, duration: {dt}")

In [4]:
# Seed NumPy's global RNG (the source of the random actions) so that a fresh
# Restart & Run All reproduces the same sequence of episodes.
SEED = 42
np.random.seed(SEED)

# Build the environment from the loaded price data and play 10 random episodes.
env = Environment(df)
play_one_episode(env, train_episodes=10)

episode: 1/10, episode end value: 1912.98, duration: 0:00:00.061750
episode: 2/10, episode end value: 25303.36, duration: 0:00:00.091314
episode: 3/10, episode end value: 18786.04, duration: 0:00:00.078099
episode: 4/10, episode end value: 19545.16, duration: 0:00:00.109647
episode: 5/10, episode end value: 46137.36, duration: 0:00:00.115914
episode: 6/10, episode end value: 85230.93, duration: 0:00:00.086801
episode: 7/10, episode end value: 40469.03, duration: 0:00:00.110917
episode: 8/10, episode end value: 8003.60, duration: 0:00:00.105462
episode: 9/10, episode end value: 95164.75, duration: 0:00:00.151437
episode: 10/10, episode end value: 87931.01, duration: 0:00:00.097773
