In [1]:
import random
import gym
from gym import spaces
import pandas as pd
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

2024-07-10 17:19:40.472341: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 17:19:40.491390: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 17:19:40.491430: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 17:19:40.506367: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_OPEN_POSITIONS = 5
MAX_STEPS = 20000

INITIAL_ACCOUNT_BALANCE = 10000

In [3]:
class StockTradingEnv(gym.Env):
    """A stock trading environment for Gymnasium"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)

        # Actions of the format Buy, Sell, Hold
        self.action_space = spaces.Discrete(3)

        # Prices contains the OHCL values for the last five prices
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(5, 5), dtype=np.float16)

    def _next_observation(self):
        # Get the stock data points for the last 5 days and scale to between 0-1
        frame = np.array([
            self.df.loc[self.df.index[self.current_step: self.current_step + 5], 'Open'].values / MAX_SHARE_PRICE,
            self.df.loc[self.df.index[self.current_step: self.current_step + 5], 'High'].values / MAX_SHARE_PRICE,
            self.df.loc[self.df.index[self.current_step: self.current_step + 5], 'Low'].values / MAX_SHARE_PRICE,
            self.df.loc[self.df.index[self.current_step: self.current_step + 5], 'Close'].values / MAX_SHARE_PRICE,
            self.df.loc[self.df.index[self.current_step: self.current_step + 5], 'Volume'].values / MAX_NUM_SHARES,
        ])

        # Transpose frame to have shape (5, 5)
        frame = frame.T

        return frame

    def _take_action(self, action):
        # Set the current price to a random price within the time step
        current_price = random.uniform(
            self.df.loc[self.df.index[self.current_step], "Open"], self.df.loc[self.df.index[self.current_step], "Close"])

        action_type = action

        if action_type == 0:
            # Buy amount % of balance in shares
            total_possible = int(self.balance / current_price)
            shares_bought = int(total_possible * random.uniform(0, 1))
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price

            self.balance -= additional_cost
            self.cost_basis = (
                prev_cost + additional_cost) / (self.shares_held + shares_bought)
            self.shares_held += shares_bought

        elif action_type == 1:
            # Sell amount % of shares held
            shares_sold = int(self.shares_held * random.uniform(0, 1))
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price

        self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        # Execute one time step within the environment
        self._take_action(action)

        self.current_step += 1

        if self.current_step > len(self.df.loc[:, 'Open'].values) - 6:
            self.current_step = 0

        delay_modifier = (self.current_step / MAX_STEPS)

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self, **kwargs):
        if 'seed' in kwargs:
            kwargs.pop('seed')  # Remove the 'seed' argument if present but not used

        # Reset the state of the environment to an initial state
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Set the current step to a random point within the data frame
        self.current_step = random.randint(
            0, len(self.df.loc[:, 'Open'].values) - 6)

        return self._next_observation()

    def render(self, mode='human', close=False):
        # Render the environment to the screen
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE

        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(
            f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
        print(
            f'Avg cost for held shares: {self.cost_basis} (Total sales value: {self.total_sales_value})')
        print(
            f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
        print(f'Profit: {profit}')

In [4]:
# Load your historical stock data into a pandas dataframe
df = pd.read_csv(r'/home/aakashgpt/Desktop/SOC_RLtrader/data/A.csv', index_col='Date', parse_dates=True)

# Create the stock trading environment
env = StockTradingEnv(df)

# Instantiate the agent
model = PPO('MlpPolicy', env, verbose=1)

# Train the agent
model.learn(total_timesteps=10000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------
| time/              |      |
|    fps             | 432  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 433          |
|    iterations           | 2            |
|    time_elapsed         | 9            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 6.988499e-06 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | 2.38e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 2.46e+07     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000185    |
|    value_loss           | 4.73e+07     |
------------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x77f0527666e0>

In [7]:
# Reset the environment for a new episode
obs = env.reset()

# Set the number of times to render
num_renders = 5
total_steps = 0

# Loop through steps within the episode
while True:
    # Predict action
    action, states = model.predict(obs)

    # Perform a step in the environment
    obs, rewards, done, info = env.step(action)

    # Increment step counter
    total_steps += 1

    # Render at specific intervals
    if total_steps % (total_steps // num_renders + 1) == 0:
        env.render()

    # Check if the episode is finished
    if done:
        print('--------------------------------------------')
        break

env.close()


Step: 4003
Balance: 10000.0
Shares held: 0 (Total sold: 0)
Avg cost for held shares: 0 (Total sales value: 0.0)
Net worth: 10000.0 (Max net worth: 10000)
Profit: 0.0
Step: 4004
Balance: 9966.195870263315
Shares held: 1 (Total sold: 0)
Avg cost for held shares: 33.804129736685276 (Total sales value: 0.0)
Net worth: 10000.0 (Max net worth: 10000)
Profit: 0.0
Step: 4005
Balance: 9966.195870263315
Shares held: 1 (Total sold: 0)
Avg cost for held shares: 33.804129736685276 (Total sales value: 0.0)
Net worth: 10000.153261735035 (Max net worth: 10000.153261735035)
Profit: 0.1532617350349028
Step: 4006
Balance: 9966.195870263315
Shares held: 1 (Total sold: 0)
Avg cost for held shares: 33.804129736685276 (Total sales value: 0.0)
Net worth: 10000.140703404886 (Max net worth: 10000.153261735035)
Profit: 0.1407034048861533
Step: 4008
Balance: 4542.882864515944
Shares held: 160 (Total sold: 0)
Avg cost for held shares: 34.106982096775354 (Total sales value: 0.0)
Net worth: 10061.099626936628 (Max n

KeyboardInterrupt: 