In [16]:
import sys
!{sys.executable} -m pip install tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/d5/1c/3ac472009a5c54ae7ec5a3294520ca36d1908cd1e5cf3e3fd923f9b7b31f/tensorflow-1.13.1-cp37-cp37m-macosx_10_11_x86_64.whl (73.6MB)
[K     |████████████████████████████████| 73.6MB 1.6MB/s eta 0:00:01    |▍                               | 860kB 1.8MB/s eta 0:00:40     |▉                               | 1.9MB 1.8MB/s eta 0:00:40     |████▋                           | 10.5MB 4.2MB/s eta 0:00:16     |███████████████████████████▎    | 62.8MB 299kB/s eta 0:00:36
[?25hCollecting tensorboard<1.14.0,>=1.13.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/0f/39/bdd75b08a6fba41f098b6cb091b9e8c7a80e1b4d679a581a0ccd17b10373/tensorboard-1.13.1-py3-none-any.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 14.1MB/s eta 0:00:01
[?25hCollecting protobuf>=3.6.1 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/fd/ed/e53989e7b08274334ddb41dac51294d270b3d

In [5]:
import random
import json
import gym
from gym import spaces
import pandas as pd
import numpy as np

In [None]:
'''
Each environment must implement the following gym interface

class CustomEnv(gym.Env):
    metadata = {'render.modes':['human']}
    
    def __init__(self, arg1, arg2, ...):
        super(CustomEnv, self).__init__()
        
        self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        
        self.observation_space = spaces.Box(low=0,high=255,shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)
        
    def step(self,action):
        pass
        
    def reset(self):
        pass
        
    def render(self,mode='human',close=False):
        pass
            
'''

In [7]:
# Stock trading environment for OpenAI gym

# Divisors used by StockTradingEnvironment to scale raw values.
# The first two are both 2**31 - 1 (2147483647).
MAX_ACCOUNT_BALANCE = 2 ** 31 - 1   # scales balance / max net worth; also the reward_range upper bound
MAX_NUM_SHARES = 2 ** 31 - 1        # scales share counts and the Volume column
MAX_SHARE_PRICE = 5_000             # scales Open/High/Low/Close prices and the cost basis
MAX_STEPS = 20_000                  # divisor for the step-based reward modifier in step()

# Cash balance, net worth and max net worth assigned by reset().
INITIAL_ACCOUNT_BALANCE = 10_000

class StockTradingEnvironment(gym.Env):
    """Single-stock trading environment implementing the gym.Env interface.

    The environment walks through the rows of a price DataFrame. At each step
    the agent submits a two-element action ``[action_type, amount]``:

    * ``action_type < 1``       -> buy, spending ``amount`` (a fraction) of the
      current balance;
    * ``1 <= action_type < 2``  -> sell ``amount`` (a fraction) of shares held;
    * ``action_type >= 2``      -> hold (no trade).

    Observations are a ``(6, 6)`` array: five rows holding a 6-row window of
    Open/High/Low/Close/Volume, plus one row of scaled account state.

    ``reset()`` must be called before ``step()`` or ``render()``; the account
    attributes and ``current_step`` are only created there.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        """Store the price data and declare the action/observation spaces.

        Args:
            df: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume'
                columns. Rows are addressed with ``.loc`` and integer step
                numbers, so this assumes a default 0..n-1 integer index with
                at least 6 rows -- TODO confirm against the data loader.
        """
        # Fixed: the original referenced the undefined name `StockTradingEnv`.
        super(StockTradingEnvironment, self).__init__()
        self.df = df
        # Fixed: the original assigned to the undefined name `sef`.
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)
        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([3, 1]),
                                       dtype=np.float16)
        # Five rows of windowed price/volume data plus one row of account state.
        self.observation_space = spaces.Box(low=0, high=1, shape=(6, 6), dtype=np.float16)

    def _next_observation(self):
        """Build the observation for the current step.

        Returns:
            numpy array made of the Open/High/Low/Close/Volume values for rows
            ``current_step .. current_step + 5`` (``.loc`` label slices include
            both endpoints), each divided by its MAX_* constant, followed by
            one row of scaled account figures.
        """
        window = self.df.loc[self.current_step: self.current_step + 5]
        frame = np.array([
            window['Open'].values / MAX_SHARE_PRICE,
            window['High'].values / MAX_SHARE_PRICE,
            window['Low'].values / MAX_SHARE_PRICE,
            window['Close'].values / MAX_SHARE_PRICE,
            window['Volume'].values / MAX_NUM_SHARES,
        ])

        # Append the account state, each value divided by its scaling constant.
        obs = np.append(frame, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            # Fixed: the original used the undefined name `MAX_NUM_SHARE`.
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
        ]], axis=0)

        return obs

    def reset(self):
        """Reset the account and jump to a random starting row.

        Called any time a new environment is created or to reset an existing
        environment's state.

        Returns:
            The observation for the newly chosen starting step.
        """
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Start at a random row to give the agent more unique experiences from
        # the same data set. The last valid start is len - 6 so that the
        # 6-row observation window stays inside the data.
        # Fixed: the original computed len(values - 6), which equals
        # len(values) and allowed starts past the end of the data.
        self.current_step = random.randint(0, len(self.df.loc[:, 'Open'].values) - 6)

        return self._next_observation()

    def _take_action(self, action):
        """Apply a buy / sell / hold action to the account.

        Args:
            action: indexable pair ``(action_type, amount)``; see the class
                docstring for how ``action_type`` is interpreted.
        """
        # Trade at a random price between this step's Open and Close.
        current_price = random.uniform(
            self.df.loc[self.current_step, 'Open'],
            self.df.loc[self.current_step, 'Close'])

        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy: spend `amount` (fraction) of the balance on shares.
            total_possible = self.balance / current_price
            shares_bought = total_possible * amount
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price

            self.balance -= additional_cost
            total_shares = self.shares_held + shares_bought
            # Fixed: buying zero shares while holding none made the original
            # divide by zero; in that case the cost basis is left untouched.
            if total_shares > 0:
                self.cost_basis = (prev_cost + additional_cost) / total_shares
            self.shares_held += shares_bought
        elif action_type < 2:
            # Sell: dispose of `amount` (fraction) of the shares held.
            shares_sold = self.shares_held * amount
            proceeds = shares_sold * current_price
            self.balance += proceeds
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += proceeds
        # Any other action_type is a hold: no trade.

        self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            # Fixed: the original assigned the undefined bare name `net_worth`.
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        """Execute one time step within the environment.

        Args:
            action: ``(action_type, amount)`` pair passed to ``_take_action``.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``reward`` is the balance
            multiplied by ``current_step / MAX_STEPS``, ``done`` is True once
            net worth is no longer positive, and ``info`` is an empty dict.
        """
        self._take_action(action)

        self.current_step += 1

        # Wrap around to the first row once the observation window would run
        # past the end of the data.
        if self.current_step > len(self.df.loc[:, 'Open'].values) - 6:
            self.current_step = 0

        delay_modifier = (self.current_step / MAX_STEPS)

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0

        obs = self._next_observation()

        return obs, reward, done, {}

    def render(self, mode='human', close=False):
        """Print the current account state to stdout.

        Args:
            mode: render mode; unused by the body. The default was corrected
                from the typo 'humnan' to match ``metadata``.
            close: unused; kept for interface compatibility.
        """
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE

        print('Step: {}'.format(self.current_step))
        print('Balance: {}'.format(self.balance))
        print('Shares held: {} Total sold: {}'.format(self.shares_held, self.total_shares_sold))
        print('Avg cost for held shares: {} Total sales value: {}'.format(self.cost_basis, self.total_sales_value))
        # Fixed: the original printed total_sales_value as the max net worth.
        print('Net worth: {} Max net worth: {}'.format(self.net_worth, self.max_net_worth))
        # Fixed: `profit` is a local; the original read the nonexistent self.profit.
        print('Profit: {}'.format(profit))

In [17]:
# Example using the above environment
# Notes:
#     In order to get stable_baselines to install, I had to run:
#     brew install mpich
import gym
import json
import datetime as dt

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

import pandas as pd


In [18]:
# Load the AAPL price history from the local data directory.
# StockTradingEnvironment reads the 'Open', 'High', 'Low', 'Close' and
# 'Volume' columns -- presumably all present in this CSV; verify.
df = pd.read_csv(filepath_or_buffer='./data/AAPL.csv')