<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/simple_rl2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from collections import deque

from google.colab import drive

# Mount Google Drive so files stored there are readable from this Colab runtime.
drive.mount('/content/drive/')

# Folder (relative to "My Drive") holding the dataset, and the full CSV path
# assembled from its three pieces.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = ''.join(['/content/drive/My Drive/', nov_dir, 'data_csv_sp500.csv'])

# Load the price table, then parse the 'Date' column (YYYY-MM-DD text) into datetimes.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
class Environment:
    """Minimal all-in/all-out trading environment over a price DataFrame.

    The price at each time step is read from the ``'SP500'`` column of ``df``.
    Three actions are supported (see ``action_space``):

    * ``0`` - hold (do nothing)
    * ``1`` - buy with all available money
    * ``2`` - sell everything currently held

    Observations are lists of the form
    ``[money, net_worth, bought, sold, held]``.

    Call ``reset()`` before the first ``step()``; the per-episode attributes
    are only created there.
    """

    def __init__(self, df, initial_money=1000):
        """Store the price data and the starting capital.

        Args:
            df: DataFrame containing an ``'SP500'`` price column. Rows with
                missing values are dropped.
            initial_money: Cash available at the start of every episode.
        """
        # reset_index() gives the cleaned frame a fresh 0..n-1 index (the old
        # index is kept as a column); step() looks rows up by that label.
        self.df = df.dropna().reset_index()
        self.df_total_steps = len(self.df)-1
        self.initial_money = initial_money
        self.action_space = np.array([0, 1, 2])

    def reset(self):
        """Start a new episode and return the initial observation.

        Returns:
            ``[money, net_worth, bought, sold, held]`` for the starting state.
        """
        self.money = self.initial_money
        self.net_worth = self.initial_money
        self.prev_net_worth = self.initial_money
        self.held = 0
        self.sold = 0
        self.bought = 0
        self.start_step = 0
        self.end_step = self.df_total_steps

        self.current_step = self.start_step

        state = [self.money, self.net_worth, self.bought, self.sold, self.held]
        return state

    def step(self, action):
        """Advance the environment by one time step.

        Args:
            action: One of ``0`` (hold), ``1`` (buy all) or ``2`` (sell all).
                Buying with no money or selling with nothing held is a no-op.

        Returns:
            A tuple ``(obs, reward, done)`` where ``obs`` is
            ``[money, net_worth, bought, sold, held]``, ``reward`` is the
            change in net worth caused by this step, and ``done`` is ``True``
            when either the margin call triggers (net worth at or below half
            of the initial money) or the last row of price data was reached.

        Raises:
            KeyError: If called again after the last row has been consumed,
                because there is no further price to look up.
        """
        # Traded amounts are reported per step, so clear them first.
        self.bought = 0
        self.sold = 0
        self.current_step += 1

        current_price = self.df.loc[self.current_step, 'SP500']

        if action == 0: # Hold
            pass

        elif action == 1 and self.money > 0:
            # Buy with 100% of current money
            self.bought = self.money / current_price
            self.money -= self.bought * current_price
            self.held += self.bought

        elif action == 2 and self.held > 0:
            # Sell 100% of the units currently held
            self.sold = self.held
            self.money += self.sold * current_price
            self.held -= self.sold

        self.prev_net_worth = self.net_worth
        self.net_worth = self.money + self.held * current_price

        # Reward is the change in net worth over this step.
        reward = self.net_worth - self.prev_net_worth

        # Margin call: stop once half of the starting capital is lost.
        margin_called = bool(self.net_worth <= self.initial_money/2)
        # Also stop when the price data is exhausted; otherwise a loop driven
        # only by `done` would step past the last row and fail on the lookup.
        out_of_data = self.current_step >= self.end_step
        done = margin_called or out_of_data

        obs = [self.money, self.net_worth, self.bought, self.sold, self.held]
        return obs, reward, done

    def render(self):
        """Print the current step index and net worth."""
        print(f'Step: {self.current_step}, Net Worth: {self.net_worth}')

In [5]:
def play_one_episode(env, train_episodes = 50):
    average_net_worth = 0
    for episode in range(train_episodes):
        state = env.reset()
        done = False
        
        while not done:
            # env.render()

            #乱数で1,2,3を出力
            # action = agent.act(state)
            action = np.random.randint(3, size=1)[0]
            state, reward, done = env.step(action)
            print(state)

            if env.current_step == env.end_step:
                average_net_worth += env.net_worth
                print("net_worth:", env.net_worth)
                break

    print("average_net_worth:", average_net_worth/train_episodes)

In [6]:
# Build the trading environment from the price DataFrame loaded in the first cell.
env = Environment(df)
'''
train_episodes:ループの回転数
'''
# The bare string above is a no-op note meaning "train_episodes: number of loop iterations".
# Run 10 random-policy episodes; each observation and the average final net worth are printed.
play_one_episode(env, train_episodes = 10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[880.8520211967876, 880.8520211967876, 0, 0, 0.0]
[0.0, 880.8520211967876, 210.2272126961307, 0, 210.2272126961307]
[0.0, 853.5224835462906, 0, 0, 210.2272126961307]
[857.7270278002132, 857.7270278002132, 0, 210.2272126961307, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[857.7270278002132, 857.7270278002132, 0, 0, 0.0]
[0.