In [1]:
import torch
#import mujoco_py
import gym
import numpy as np

#from colabgymrender.recorder import Recorder
from transformers import DecisionTransformerModel


In [2]:
import os
import pandas as pd

# from finrl import config_tickers
from finrl.main import check_and_make_directories                               
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR

# "./" will be added in front of each directory
check_and_make_directories([TRAINED_MODEL_DIR])


In [3]:
#device = 'cuda'
device = torch.device("cpu")
torch.cuda.current_device()


0

### Step 1: Initialize the environment

In [4]:
train = pd.read_csv('data/train.csv')
train = train.set_index(train.columns[0])
train.index.names = ['']

trade = pd.read_csv('data/trade.csv')
trade = trade.set_index(trade.columns[0])
trade.index.names = ['']


In [5]:
stock_dimension = len(trade.tic.unique())
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")


Stock Dimension: 29, State Space: 291


In [6]:
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

e_trade_gym = StockTradingEnv(df = trade, **env_kwargs)
env, obs = e_trade_gym.get_sb_env()


In [7]:
scale = 1000.0  # normalization for rewards/returns
TARGET_RETURN = 3600 / scale  # evaluation is conditioned on a return of 3600, scaled accordingly
state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]



In [8]:
state_dim, act_dim


(291, 29)

Retrieve state_mean, state_std from the training data

In [9]:
from datasets import load_from_disk

dataset = load_from_disk("data/dataset/")

state_mean = dataset['state_mean']
state_std = dataset['state_std']

state_mean = state_mean[:state_dim]
state_std = state_std[:state_dim]

state_mean = torch.Tensor(state_mean).to(device=device)
state_std = torch.Tensor(state_std).to(device=device)



### Step 2: Load the trained model

In [10]:
# Create the decision transformer model
model = DecisionTransformerModel.from_pretrained('trained_models')
model = model.to(device)

print(list(model.encoder.wpe.parameters()))



[Parameter containing:
tensor([[-0.0448, -0.0363, -0.0243,  ..., -0.0324, -0.0058,  0.0206],
        [-0.0143,  0.0055,  0.0176,  ...,  0.0162,  0.0007, -0.0109],
        [-0.0008, -0.0108, -0.0049,  ..., -0.0011,  0.0283,  0.0270],
        ...,
        [-0.0434, -0.0100,  0.0175,  ...,  0.0511, -0.0297, -0.0194],
        [ 0.0006,  0.0073,  0.0001,  ...,  0.0524,  0.0297, -0.0313],
        [-0.0020,  0.0278,  0.0001,  ...,  0.0089, -0.0072, -0.0241]],
       requires_grad=True)]


### Step 3: Define a function that performs masked autoregressive predictions of actions.

The model's prediction is conditioned on sequences of states, actions, time-steps and returns. The action for the current time-step is included as zeros and masked in to not skew the model's attention distribution.

In [11]:
# Function that gets an action from the model using autoregressive prediction with a window of the previous 20 timesteps.
def get_action(model, states, actions, rewards, returns_to_go, timesteps):
    # This implementation does not condition on past rewards

    states = states.reshape(1, -1, model.config.state_dim)
    actions = actions.reshape(1, -1, model.config.act_dim)
    returns_to_go = returns_to_go.reshape(1, -1, 1)
    timesteps = timesteps.reshape(1, -1)

    states = states[:, -model.config.max_length :]
    actions = actions[:, -model.config.max_length :]
    returns_to_go = returns_to_go[:, -model.config.max_length :]
    timesteps = timesteps[:, -model.config.max_length :]
    padding = model.config.max_length - states.shape[1]
    
    # pad all tokens to sequence length
    attention_mask = torch.cat([torch.zeros(padding), torch.ones(states.shape[1])]).to(device=device)
    attention_mask = attention_mask.to(dtype=torch.long).reshape(1, -1)
    states = torch.cat([torch.zeros((1, padding, model.config.state_dim)), states], dim=1).float().to(device=device)
    actions = torch.cat([torch.zeros((1, padding, model.config.act_dim)), actions], dim=1).float().to(device=device)
    returns_to_go = torch.cat([torch.zeros((1, padding, 1)), returns_to_go], dim=1).float().to(device=device)
    timesteps = torch.cat([torch.zeros((1, padding), dtype=torch.long), timesteps], dim=1).to(device=device)

    state_preds, action_preds, return_preds = model(
        states=states,
        actions=actions,
        rewards=rewards,
        returns_to_go=returns_to_go,
        timesteps=timesteps,
        attention_mask=attention_mask,
        return_dict=False,
    )

    return action_preds[0, -1]


In [12]:
max_steps = e_trade_gym.df.tic.count() - 1
max_steps


13310

### Step 4: Evaluate the performance of the agent

In [13]:
# Interact with the environment
#max_ep_len = 1000

episode_return, episode_length = 0, 0
state = env.reset()
target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
rewards = torch.zeros(0, device=device, dtype=torch.float32)

timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)

#max_steps = e_trade_gym.df.tic.count() - 1
max_steps = len(e_trade_gym.df.index.unique()) - 1

account_memory = None  # This help avoid unnecessary list creation
actions_memory = None  # optimize memory consumption

for t in range(max_steps+1):
    actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
    rewards = torch.cat([rewards, torch.zeros(1, device=device)])

    action = get_action(
        model,
        (states - state_mean) / state_std,
        actions,
        rewards,
        target_return,
        timesteps,
    )

    actions[-1] = action
    actions_numpy = actions.detach().cpu().numpy()

    #state, reward, done, truncated, info = env.step(actions_numpy)
    state, reward, done, _ = env.step(actions_numpy)

    cur_state = torch.from_numpy(state).to(device=device).reshape(1, state_dim)
    states = torch.cat([states, cur_state], dim=0)
    rewards[-1] = torch.from_numpy(reward).to(device=device)

    pred_return = target_return[0, -1] - (reward / scale)
    target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
    timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

    episode_return += reward
    episode_length += 1

    if (t == max_steps - 1):  # more descriptive condition for early termination to clarify the logic
        # Call instance methods of vectorized environments
        # https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html
        account_memory = env.env_method(method_name="save_asset_memory")
        actions_memory = env.env_method(method_name="save_action_memory")

    if done[0]:# or truncated:
        break



In [14]:
df_account_value_dt, df_actions_dt = account_memory[0], actions_memory[0]

# Save DF as CSV and then reload to visualize the performance of the agent


In [15]:
df_account_value_dt.shape, df_actions_dt.shape


((459, 2), (458, 29))

In [14]:
df_account_value_dt.head()


Unnamed: 0,date,account_value
0,2020-07-01,1000000.0
1,2020-07-02,1000016.0
2,2020-07-06,1000116.0
3,2020-07-07,999941.2
4,2020-07-08,1000134.0


In [15]:
df_account_value_dt.tail()


Unnamed: 0,date,account_value
706,2023-04-21,1266686.0
707,2023-04-24,1272051.0
708,2023-04-25,1256723.0
709,2023-04-26,1240969.0
710,2023-04-27,1261361.0


In [16]:
df_actions_dt.head()


Unnamed: 0_level_0,aapl,amzn,cat,hd,ibm,intc,msft,t
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-07-01,13,0,5,10,11,0,0,0
2020-07-02,13,0,5,10,11,0,0,0
2020-07-06,13,0,5,10,11,0,0,0
2020-07-07,13,0,5,10,11,0,0,0
2020-07-08,13,0,5,10,11,0,0,0


In [17]:
df_actions_dt.tail()


Unnamed: 0_level_0,aapl,amzn,cat,hd,ibm,intc,msft,t
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-04-20,0,0,0,0,0,0,0,0
2023-04-21,0,0,0,0,0,0,0,0
2023-04-24,0,0,0,0,0,0,0,0
2023-04-25,0,0,0,0,0,0,0,0
2023-04-26,0,0,0,0,0,0,0,0


### Step 5: Save the evaluation results

In [16]:
df_account_value_dt.to_pickle('data/df_account_value_dt.pkl')
df_actions_dt.to_pickle('data/df_actions_dt.pkl')


### Step 6: PPO Agent's performance for comparison

In [7]:
from stable_baselines3 import PPO

# Load the saved model
trained_ppo = PPO.load("trained_models/agent_ppo.zip")


In [8]:
from finrl.agents.stablebaselines3.models import DRLAgent

df_account_value_ppo, df_actions_ppo = DRLAgent.DRL_prediction(
    model=trained_ppo,
    environment = e_trade_gym)


hit end!


In [9]:
df_account_value_ppo.to_pickle('data/df_account_value_ppo.pkl')
df_actions_ppo.to_pickle('data/df_actions_ppo.pkl')
