In [1]:
import pandas as pd
import yaml
import pickle as pkl
import os
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from utils.create_dataset import data_split
from stable_baselines3.common.monitor import Monitor
from agent.PPO import PPO, FeedForwardNN
from utils.run_episode import run_trials, episode
from env.stockEnv import StockEnv
from stable_baselines3 import PPO as BPPO
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_csv("ssi.csv")
dataset['ticker'] = 'SSI'

with open("configs/ppo_configs.yaml", 'r') as f:
    ppo_configs = yaml.safe_load(f)

with open("configs/env_configs.yaml", 'r') as f:
    env_configs = yaml.safe_load(f)

run_save_path = "runs/stockEnv/"
model_save_path = "models/"

In [3]:
train_dataset = dataset.loc[:749]
test_dataset = dataset.loc[750:].reset_index(drop=True)

In [4]:
train_env = StockEnv(dataset, **env_configs)
test_env = StockEnv(test_dataset, **env_configs)

In [5]:
agent = PPO(
    FeedForwardNN, train_env,
    lr=ppo_configs['learning_rates'],
    gamma=ppo_configs['gamma'], clip=ppo_configs['clip'], ent_coef=ppo_configs['ent_coef'],
    critic_factor=ppo_configs['critic_factor'],
    max_grad_norm=ppo_configs['max_grad_norm'], gae_lambda=ppo_configs['gae_lambda'],
    n_updates=ppo_configs['n_updates'])

In [6]:
import torch
state, done = train_env.reset()
state = torch.tensor(state, dtype=torch.float)
action, _ = agent.select_action(state)
print(action)
train_env.step(action)

[-1.7321712]


([1000000.0, 8.47, 0], 0.0, False, False, {})

In [7]:
n_episodes = 100
best_model = None
best_returns = -np.inf
r_eps = []
for ep in range(n_episodes):
    batch_r, batch_s, batch_a, batch_terminal = [], [], [], []
    s, _ = train_env.reset()
    termination, truncation = False, False

    a, _ = agent.select_action(torch.tensor(s, dtype=torch.float))
    r_ep = 0
    t = 0
    ep_actions = []

    while not (termination or truncation):
        s_prime, r, termination, _, _ = train_env.step(a)
        a_prime, _ = agent.select_action(torch.tensor(s_prime, dtype=torch.float))
        ep_actions.append(a_prime[0])

        batch_r.append(r)
        batch_s.append(s)
        batch_a.append(a)
        batch_terminal.append(termination)

        s, a = s_prime, a_prime
        r_ep += r
        t += 1

    batch_r, batch_s, batch_a, batch_terminal = torch.tensor(np.array(batch_r), dtype=torch.float), torch.tensor(np.array(batch_s), dtype=torch.float), torch.tensor(np.array(batch_a), dtype=torch.float), torch.tensor(np.array(batch_terminal), dtype=torch.float)
    agent.update(batch_r, batch_s, batch_a, batch_terminal)

    print(ep)
    print(r_ep)
    print(train_env.asset_memory[-1])

    if r_ep > best_returns:
        best_returns = r_ep
        print('saved the best model')
        agent.save('best_agent.pth')
        
    print()

0
6.711721394002225
1074071.7259400226
saved the best model

1
47.03233358400309
1480912.6198400306
saved the best model

2
37.96200721800274
1385434.0241800286

3
80.11437082100358
1808891.5922100358
saved the best model

4
-1.0686959879969589
997114.6841200297

5
15.298533840002978
1163046.1104000295

6
134.71286294400363
2368946.9374400396
saved the best model

7
-2.969654939996852
978677.0746000308

8
14.321209762003225
1148771.7696200325

9
69.35731063400321
1707060.426340033

10
56.61182301700311
1574558.4901700313

11
49.66953907400265
1507420.9107400272

12
117.96315615000377
2187605.86950004

13
-9.792709803997703
902760.4419600233

14
85.55205252400282
1872706.7812400276

15
-13.643764732997257
868529.4926700271

16
93.99164787700366
1958227.8587700354

17
49.092250913003014
1502428.2411300293

18
-5.3342951589976035
949892.7204100231

19
7.848069787002257
1080202.7818700222

20
115.24218589500296
2162518.61095003

21
109.19235437800307
2111856.383780031

22
188.8648783560035

In [8]:
ep_rewards = []
ep_actions = []
ep_confidences = []
ep_balances = []
ep_shares_held = []
ep_net = []
total_reward = 0

state, _ = test_env.reset()
termination, truncation = False, False
agent.load('best_agent.pth')

while not (termination or truncation):
    action, _ = agent.select_action(torch.tensor(state, dtype=torch.float))
    _, log_prob = agent.evaluate(torch.tensor(state, dtype=torch.float), torch.tensor(action, dtype=torch.float))
    prob = torch.exp(log_prob)

    next_state, reward, termination, _, _ = test_env.step(action)
    state = next_state

    total_reward += reward
    ep_rewards.append(reward)
    ep_actions.append(action.item())
    ep_confidences.append(prob.item())
    ep_balances.append(test_env.state[0])
    ep_shares_held.append(test_env.state[2])
    ep_net.append(test_env.asset_memory[-1])

total_reward, test_env.asset_memory[-1]

(-12.54194855899926, 874620.0224100075)

In [9]:
len(ep_rewards), len(ep_actions), len(ep_confidences), len(ep_shares_held), len(ep_net)

(1152, 1152, 1152, 1152, 1152)

In [10]:
buy_thresholds = [0.5, 0.3, 0.1]
sell_thresholds = [-0.5, -0.3, -0.1]

def convert_to_category(x):
    if x >= buy_thresholds[0]:
        return 'BUY_50'
    elif x >= buy_thresholds[1]:
        return 'BUY_30'
    elif x >= buy_thresholds[2]:
        return 'BUY_20'
    elif x <= sell_thresholds[0]:
        return 'SELL_50'
    elif x <= sell_thresholds[1]:
        return 'SELL_30'
    elif x <= sell_thresholds[2]:
        return 'SELL_20'
    else:
        return 'HOLD'
    
test_dataset['action'] = ep_actions
test_dataset['action_converted'] = test_dataset.action.map(convert_to_category)
test_dataset['confidence'] = ep_confidences
test_dataset['shares_held'] = ep_shares_held
test_dataset['ep_net'] = ep_net

In [11]:
test_dataset.to_csv('final_actions.csv')