In [1]:
import os
import pandas as pd

from finrl.main import check_and_make_directories                               
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR

# "./" will be added in front of each directory
# Make sure the directory used later for saving trained models is available
# (presumably created if missing — behaviour of the FinRL helper, not shown here).
check_and_make_directories([TRAINED_MODEL_DIR])


### Step 1: Load the data

In [2]:
# Read the training CSV, promote its first column to the index,
# and give that index an empty name.
train = (
    pd.read_csv('data/train.csv')
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
    .rename_axis('')
)


In [3]:
# Preview the first rows of the training frame to sanity-check the loaded columns.
train.head()


Unnamed: 0,date,open,high,low,close,volume,tic,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
,,,,,,,,,,,,,,,,,,
0.0,2009-01-02,3.067143,3.251429,3.041429,2.737005,746015200.0,AAPL,4.0,0.0,2.958118,2.631403,100.0,66.666667,100.0,2.737005,2.737005,39.189999,0.0
0.0,2009-01-02,58.59,59.080002,57.75,42.107323,6547900.0,AMGN,4.0,0.0,2.958118,2.631403,100.0,66.666667,100.0,42.107323,42.107323,39.189999,0.0
0.0,2009-01-02,18.57,19.52,18.4,15.053307,10955700.0,AXP,4.0,0.0,2.958118,2.631403,100.0,66.666667,100.0,15.053307,15.053307,39.189999,0.0
0.0,2009-01-02,42.799999,45.560001,42.779999,33.941109,7010200.0,BA,4.0,0.0,2.958118,2.631403,100.0,66.666667,100.0,33.941109,33.941109,39.189999,0.0
0.0,2009-01-02,44.91,46.98,44.709999,30.712517,7117200.0,CAT,4.0,0.0,2.958118,2.631403,100.0,66.666667,100.0,30.712517,30.712517,39.189999,0.0


In [4]:
# (rows, columns) of the training frame.
train.shape


(98513, 18)

In [5]:
# Show the distinct tickers present in the training data next to the
# configured technical-indicator names.
train.tic.unique(), INDICATORS


(array(['AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CRM', 'CSCO', 'CVX', 'DIS',
        'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM',
        'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT'],
       dtype=object),
 ['macd',
  'boll_ub',
  'boll_lb',
  'rsi_30',
  'cci_30',
  'dx_30',
  'close_30_sma',
  'close_60_sma'])

### Step 2: Setup the environment

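`state_space = 1 + 2 * stock_dimension + len(INDICATORS) * stock_dimension`, where 1 is the remaining balance in the account, `2 * stock_dimension` covers the prices and the share holdings of every stock (29 stocks in this dataset), and `len(INDICATORS) * stock_dimension` covers the technical indicators of every stock.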

In [6]:
# One slot for the cash balance, then per stock: price, holding,
# and one value for each technical indicator.
stock_dimension = len(train["tic"].unique())
per_stock_features = 2 + len(INDICATORS)
state_space = 1 + stock_dimension * per_stock_features
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")


Stock Dimension: 29, State Space: 291


Action: The action space describes the allowed actions through which the agent interacts with the environment. Normally, a ∈ A includes three actions: a ∈ {−1, 0, 1}, where −1, 0, 1 represent selling, holding, and buying one share. An action can also be carried out on multiple shares. We use an action space {−k, ..., −1, 0, 1, ..., k}, where k denotes the number of shares. For example, "Buy 10 shares of AAPL" or "Sell 10 shares of AAPL" are 10 or −10, respectively.

Here, the action for each stock is a scalar in a continuous space rather than a value from a discrete set like {-1, 0, 1}. Hence, we need just one action per stock, and the full action is always a vector of length stock_dimension.

In [11]:
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

# Per-stock transaction cost rates. Built as two independent lists: a chained
# assignment (a = b = [...]) would bind both names to the SAME list object, so
# mutating one would silently change the other.
buy_cost_list = [0.001] * stock_dimension
sell_cost_list = [0.001] * stock_dimension
# Start with zero shares held for every stock.
num_stock_shares = [0] * stock_dimension

# Keyword arguments forwarded unchanged to StockTradingEnv.
env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

# Build the gym environment on the training frame and wrap it for
# Stable-Baselines3; the second value returned by get_sb_env() is not needed here.
e_train_gym = StockTradingEnv(df = train, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()


In [8]:
# Number of distinct index values minus one — presumably the number of
# environment steps in a single pass over the data (TODO confirm against StockTradingEnv).
len(e_train_gym.df.index.unique()) - 1


3396

In [9]:
# Number of non-null `tic` entries, i.e. the row count of the environment's frame.
e_train_gym.df.tic.count()


98513

### Step 3: Initially train using a PPO Model

In [12]:
from stable_baselines3 import PPO
from finrl.agents.stablebaselines3.models import DRLAgent
from stable_baselines3.common.logger import configure

# Create the agent on the training environment and ask it for a PPO model.
agent = DRLAgent(env=env_train)
model_ppo = agent.get_model('ppo')

# Send training logs to <RESULTS_DIR>/ppo in stdout, CSV and TensorBoard formats.
tmp_path = f"{RESULTS_DIR}/ppo"
new_logger_ppo = configure(tmp_path, ["stdout", "csv", "tensorboard"])
model_ppo.set_logger(new_logger_ppo)

# Train for a fixed budget of environment steps.
trained_ppo = agent.train_model(
    model=model_ppo,
    tb_log_name='ppo',
    total_timesteps=50000,
)


{'n_steps': 2048, 'ent_coef': 0.01, 'learning_rate': 0.00025, 'batch_size': 64}
Using cpu device
Logging to results/ppo
------------------------------------
| time/              |             |
|    fps             | 205         |
|    iterations      | 1           |
|    time_elapsed    | 9           |
|    total_timesteps | 2048        |
| train/             |             |
|    reward          | -0.09338944 |
------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 2           |
|    time_elapsed         | 64          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014445987 |
|    clip_fraction        | 0.224       |
|    clip_range           | 0.2         |
|    entropy_loss         | -41.2       |
|    explained_variance   | 0.00235     |
|    learning_rate        | 0.00025     |
|

In [13]:
# Persist the trained PPO agent under the trained-models directory.
trained_ppo.save(TRAINED_MODEL_DIR + "/agent_ppo")


In [14]:
# Load the saved model from the same configured directory it was saved to,
# rather than a hard-coded "trained_models/..." path that would go stale if
# TRAINED_MODEL_DIR is changed in the FinRL config.
model = PPO.load(TRAINED_MODEL_DIR + "/agent_ppo.zip")


### Step 4: Use PPO Model to generate trajectories

In [15]:
import numpy as np

# Roll out the loaded PPO policy on the training environment and record the
# transitions in fixed-size chunks of 100 steps.
env_train, obs = e_train_gym.get_sb_env()

ds = []        # list of chunks; each chunk is a dict of observations/actions/rewards/dones
states = []    # every post-step observation, used below for normalisation statistics
feature = {}   # chunk currently being assembled

s, a, r, d = [], [], [], []

# Start from the observation produced by THIS reset instead of discarding it
# and relying on the one returned earlier by get_sb_env().
obs = env_train.reset()

# NOTE(review): max_steps is derived from the row count of the frame, not from
# the number of distinct index values (len(e_train_gym.df.index.unique()) - 1),
# so the loop keeps stepping across many episodes. The logs printed by this cell
# report identical results for several episodes, so with deterministic=True the
# same trajectory appears to be recorded repeatedly, and 100-step chunks can
# straddle an episode boundary. Left unchanged because the later padding cell
# relies on len(ds) being at least state_space — confirm this is intended.
max_steps = e_train_gym.df.tic.count() - 1

for i in range(1, max_steps, 1):

    # Deterministic action for the current observation.
    action, _states = model.predict(obs, deterministic=True)
    s.extend(obs)
    a.extend(action)

    obs, rewards, dones, info = env_train.step(action)
    r.extend(rewards)
    d.append(dones[0])

    states.extend(obs)

    # Every 100th step, close the current chunk and start a fresh one.
    # Steps after the last multiple of 100 are never appended to ds.
    if i % 100 == 0:
        feature['observations'] = s
        feature['actions'] = a
        feature['rewards'] = r
        feature['dones'] = d

        ds.append(feature)
        feature = {}
        s, a, r, d = [], [], [], []


# Per-feature mean and standard deviation over all recorded observations;
# the small epsilon keeps the std strictly positive.
states = np.vstack(states)
state_mean, state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6


day: 3396, episode: 20
begin_total_asset: 1000000.00
end_total_asset: 5230959.83
total_reward: 4230959.83
total_cost: 3335.23
total_trades: 40903
Sharpe: 0.741
day: 3396, episode: 30
begin_total_asset: 1000000.00
end_total_asset: 5230959.83
total_reward: 4230959.83
total_cost: 3335.23
total_trades: 40903
Sharpe: 0.741
day: 3396, episode: 40
begin_total_asset: 1000000.00
end_total_asset: 5230959.83
total_reward: 4230959.83
total_cost: 3335.23
total_trades: 40903
Sharpe: 0.741


In [16]:
# Inspect the first five entries of the per-feature mean and std, plus the vector shape.
state_mean[:5], state_std[:5], state_mean.shape


(array([5.2268852e+04, 4.1985973e+01, 1.1738811e+02, 7.5283417e+01,
        1.5208690e+02], dtype=float32),
 array([1.8320502e+05, 4.3432262e+01, 6.1657406e+01, 3.8436382e+01,
        1.0245751e+02], dtype=float32),
 (291,))

In [18]:
len_ds = len(ds)


def _pad_to_length(vector, target_len):
    """Return `vector` right-padded with zeros so that it has `target_len` entries.

    The pad width is computed from the vector's current length, so applying the
    function to an already padded vector is a no-op; this keeps the cell safe to
    re-run. Raises ValueError if the vector is already longer than `target_len`.
    """
    missing = target_len - len(vector)
    if missing < 0:
        raise ValueError(
            f"cannot pad a vector of length {len(vector)} to shorter length {target_len}"
        )
    return np.pad(vector, (0, missing))


# Stretch both statistics vectors to len(ds) — presumably so that they can sit
# next to the `train` column when the dictionary is turned into a Dataset
# (TODO confirm). Entries past the original state dimension are zero padding only.
state_mean = _pad_to_length(state_mean, len_ds)
state_std = _pad_to_length(state_std, len_ds)


In [19]:
# Show the mean vector (after zero padding) together with its length.
state_mean, len(state_mean)


(array([5.22688516e+04, 4.19859734e+01, 1.17388107e+02, 7.52834167e+01,
        1.52086899e+02, 9.21457214e+01, 9.42755127e+01, 2.57341862e+01,
        7.48008575e+01, 8.51193771e+01, 1.67372482e+02, 1.17778442e+02,
        9.47928238e+01, 1.00966560e+02, 2.87876167e+01, 8.75014572e+01,
        6.35456352e+01, 3.27508621e+01, 1.11384789e+02, 9.10166245e+01,
        4.33551979e+01, 8.26768723e+01, 5.52649345e+01, 7.18913956e+01,
        8.55227356e+01, 1.49126968e+02, 8.91049500e+01, 3.11658859e+01,
        4.08236771e+01, 2.44139862e+01, 1.52551978e+03, 3.12602466e+03,
        3.24999243e-01, 5.69125098e+03, 1.07546783e+02, 4.24570557e+03,
        1.76630018e-03, 9.71465092e-03, 1.21795642e+03, 0.00000000e+00,
        1.06861159e-01, 0.00000000e+00, 3.66832123e+02, 7.93789429e+02,
        5.02173309e+02, 3.53554428e-01, 0.00000000e+00, 1.79031543e+03,
        0.00000000e+00, 4.74454468e+02, 5.03509094e+02, 0.00000000e+00,
        5.00036182e+03, 4.45463330e+03, 6.20528015e+02, 1.917896

In [17]:
# Number of collected chunks, and number of keys in the first chunk.
len(ds), len(ds[0])


(985, 4)

In [24]:
# Take the first chunk and check how many rewards (one per step) it holds.
feature = ds[0]
len(feature["rewards"])


100

https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html#from-local-files

csv
- dataset = load_dataset('csv', data_files=['my_file_1.csv', 'my_file_2.csv'])

json
- dataset = load_dataset('json', data_files='my_file.json')

text
- dataset = load_dataset('text', data_files={'train': ['my_text_1.txt', 'my_text_2.txt'], 'test': 'my_test_file.txt'})

python dict
- my_dict = {'id': [0, 1, 2], 'name': ['mary', 'bob', 'eve'], 'age': [24, 53, 19]}
- dataset = Dataset.from_dict(my_dict)

pandas dataframe
- df = pd.DataFrame({"a": [1, 2, 3]})
- dataset = Dataset.from_pandas(df)


### Step 5: Wrap the trajectory data using a dictionary

In [20]:
# Bundle the trajectory chunks and the normalisation statistics in one mapping.
input_data = {
    'train': ds,
    'state_mean': state_mean,
    'state_std': state_std,
}


In [21]:
# Confirm which top-level keys the bundled dictionary contains.
input_data.keys()


dict_keys(['train', 'state_mean', 'state_std'])

In [22]:
from datasets import Dataset

# Convert the dictionary into a Hugging Face `Dataset` built from its keys.
dataset = Dataset.from_dict(input_data)


### Step 6: Save the trajectory data to the local disk

https://huggingface.co/docs/datasets/v1.10.2/processing.html

In [23]:
# Write the dataset to the local "data/dataset/" directory.
dataset.save_to_disk("data/dataset/")


Saving the dataset (0/1 shards):   0%|          | 0/985 [00:00<?, ? examples/s]

In [12]:
from datasets import load_from_disk

# Read the saved dataset back from the same local directory.
dataset = load_from_disk("data/dataset/")
