In [1]:
# Imports

import pandas as pd
import numpy as np
import datetime
import yfinance as yf

from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl import config_tickers
from finrl.config import INDICATORS

from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

# Prepare the trained-model output directory up front
# (presumably created if missing -- confirm in finrl.main.check_and_make_directories).
check_and_make_directories([TRAINED_MODEL_DIR])

import itertools

## Fetching data
Using Yahoo Finance, through FinRL's `YahooDownloader`.

In [2]:
# Built-in ticker list that ships with FinRL (the Dow 30 constituents);
# displayed here so the reader can see which symbols get downloaded.
config_tickers.DOW_30_TICKER

['AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CSCO',
 'CVX',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'KO',
 'JPM',
 'MCD',
 'MMM',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'TRV',
 'UNH',
 'CRM',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS',
 'DOW']

In [20]:
# Date bounds used to build the training split (`df_train`).
TRAIN_START_DATE = "2009-01-01"
TRAIN_END_DATE = "2020-07-01"

# Date bounds used to build the trading/test split (`df_test`).
# NOTE(review): the trade window starts on the same date the training window
# ends; whether that day lands in one split or both depends on how
# `data_split` treats its end bound -- TODO confirm it is exclusive.
TRADE_START_DATE = "2020-07-01"
TRADE_END_DATE = "2021-10-29"

In [28]:
# Fetch price data for every Dow 30 ticker across the whole span in one go:
# from the start of the training window to the end of the trading window.
# The train/test split happens later, after feature engineering.
df_raw = YahooDownloader(
    ticker_list=config_tickers.DOW_30_TICKER,
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
).fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [29]:
# Preview the raw download: one row per (date, ticker) pair.
df_raw.head()

Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2009-01-02,3.067143,3.251429,3.041429,2.754724,746015200,AAPL,4
1,2009-01-02,58.59,59.080002,57.75,43.422935,6547900,AMGN,4
2,2009-01-02,18.57,19.52,18.4,15.308595,10955700,AXP,4
3,2009-01-02,42.799999,45.560001,42.779999,33.941109,7010200,BA,4
4,2009-01-02,44.91,46.98,44.709999,31.40884,7117200,CAT,4


## Data Preparation
Using FinRL's built-in `FeatureEngineer` to:
 - add technical indicators
 - add the VIX
 - add a turbulence index

Technically all of these are optional and can be ignored; for now we add them.

In [30]:
# Default technical-indicator names from finrl.config; this list is handed to
# FeatureEngineer as `tech_indicator_list`.
INDICATORS

['macd',
 'boll_ub',
 'boll_lb',
 'rsi_30',
 'cci_30',
 'dx_30',
 'close_30_sma',
 'close_60_sma']

In [31]:
# Configure FinRL's feature-engineering step. Each enabled flag adds columns
# to the frame (see the column list in the output of this cell).
fe = FeatureEngineer(
    use_technical_indicator=True,  # one column per name in `tech_indicator_list`
    tech_indicator_list=INDICATORS,
    use_vix=True,  # adds a `vix` column
    use_turbulence=True,  # adds a `turbulence` column
    user_defined_feature=False # Presumably this would be the hook for your own features; it does not work that way though
)

# Run the configured steps on the raw download and preview the result.
df_processed = fe.preprocess_data(df_raw)
df_processed.head()

Successfully added technical indicators
[*********************100%***********************]  1 of 1 completed
Shape of DataFrame:  (3228, 8)
Successfully added vix
Successfully added turbulence index


Unnamed: 0,date,open,high,low,close,volume,tic,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,3.067143,3.251429,3.041429,2.754724,746015200,AAPL,4,0.0,2.977272,2.648437,100.0,66.666667,100.0,2.754724,2.754724,39.189999,0.0
1,2009-01-02,58.59,59.080002,57.75,43.422935,6547900,AMGN,4,0.0,2.977272,2.648437,100.0,66.666667,100.0,43.422935,43.422935,39.189999,0.0
2,2009-01-02,18.57,19.52,18.4,15.308595,10955700,AXP,4,0.0,2.977272,2.648437,100.0,66.666667,100.0,15.308595,15.308595,39.189999,0.0
3,2009-01-02,42.799999,45.560001,42.779999,33.941109,7010200,BA,4,0.0,2.977272,2.648437,100.0,66.666667,100.0,33.941109,33.941109,39.189999,0.0
4,2009-01-02,44.91,46.98,44.709999,31.40884,7117200,CAT,4,0.0,2.977272,2.648437,100.0,66.666667,100.0,31.40884,31.40884,39.189999,0.0


These next few steps are *very specific preprocessing* because of how FinRL expects the index of the dataframe to function.

Long story short, the environment iterates through each "day" using the index of the DataFrame, so it needs to fetch *all* of the day's data using that index look-up linked above. It's very annoying, but as long as you know the boilerplate, it is okay.

In [32]:
# Distinct tickers present in the processed data.
list_ticker = df_processed['tic'].unique().tolist()

# Every calendar day (weekends and holidays included) between the first and
# last processed date, rendered as strings so they can be matched against the
# string-valued `date` column.
list_date = pd.date_range(
    df_processed['date'].min(),
    df_processed['date'].max(),
).astype(str).tolist()

# Full (date, ticker) grid, date-major: every ticker paired with every day.
combination = [(day, ticker) for day in list_date for ticker in list_ticker]

In [33]:
# Build the dense frame the FinRL environment expects, in one pipeline:
#   1. start from the full (date, ticker) grid,
#   2. left-join the processed features onto it,
#   3. keep only dates that actually occur in the processed data
#      (drops the non-trading calendar days introduced by the grid),
#   4. order rows by date, then ticker,
#   5. replace any remaining gaps with 0.
# NOTE(review): step 5 would also turn a missing price into 0 for any ticker
# lacking data on a trading day -- confirm no such gaps exist.
processed_full = (
    pd.DataFrame(combination, columns=["date", "tic"])
    .merge(df_processed, on=["date", "tic"], how="left")
    .loc[lambda grid: grid["date"].isin(df_processed["date"])]
    .sort_values(["date", "tic"])
    .fillna(0)
)
processed_full

Unnamed: 0,date,tic,open,high,low,close,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,AAPL,3.067143,3.251429,3.041429,2.754724,746015200.0,4.0,0.000000,2.977272,2.648437,100.000000,66.666667,100.000000,2.754724,2.754724,39.189999,0.000000
1,2009-01-02,AMGN,58.590000,59.080002,57.750000,43.422935,6547900.0,4.0,0.000000,2.977272,2.648437,100.000000,66.666667,100.000000,43.422935,43.422935,39.189999,0.000000
2,2009-01-02,AXP,18.570000,19.520000,18.400000,15.308595,10955700.0,4.0,0.000000,2.977272,2.648437,100.000000,66.666667,100.000000,15.308595,15.308595,39.189999,0.000000
3,2009-01-02,BA,42.799999,45.560001,42.779999,33.941109,7010200.0,4.0,0.000000,2.977272,2.648437,100.000000,66.666667,100.000000,33.941109,33.941109,39.189999,0.000000
4,2009-01-02,CAT,44.910000,46.980000,44.709999,31.408840,7117200.0,4.0,0.000000,2.977272,2.648437,100.000000,66.666667,100.000000,31.408840,31.408840,39.189999,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135773,2021-10-27,UNH,454.640015,460.440002,453.480011,444.706329,3520400.0,2.0,11.436084,455.178450,365.075621,65.085936,174.317918,51.571330,407.118556,407.405376,16.980000,145.284971
135774,2021-10-27,V,224.750000,224.750000,215.660004,212.950592,22958100.0,2.0,0.013853,233.929286,215.684809,41.737221,-153.736887,30.337021,223.913916,226.131639,16.980000,145.284971
135775,2021-10-27,VZ,53.169998,53.200001,52.470001,48.326458,15007400.0,2.0,-0.224468,50.104368,46.982813,44.604898,-50.031237,8.504850,48.783063,49.410774,16.980000,145.284971
135776,2021-10-27,WBA,48.450001,48.459999,47.090000,43.363171,5652000.0,2.0,-0.015119,46.076060,42.191736,46.167777,-62.463781,5.045608,44.342032,44.670441,16.980000,145.284971


In [34]:
# Carve the processed frame into the training window and the trading (test)
# window using the date constants defined earlier.
df_train = data_split(processed_full, TRAIN_START_DATE, TRAIN_END_DATE)
df_test = data_split(processed_full, TRADE_START_DATE, TRADE_END_DATE)

# Row count of each split, one per line (train first, then test).
print(len(df_train), len(df_test), sep="\n")

83897
9715


In [35]:
# Save both splits to CSV (relative paths). The DataFrame index is written out
# as the first, unnamed column, which the reload cell relies on.
df_train.to_csv("train_data.csv")
df_test.to_csv("test_data.csv")

In [37]:
# Just a backup in case I want to reload the training data from disk.
# The first CSV column holds the saved index: promote it back to the index
# and blank out its name so the frame displays like the in-memory one.
train = pd.read_csv('train_data.csv')
train = train.set_index(train.columns[0]).rename_axis('')
train.head()

Unnamed: 0,date,tic,open,high,low,close,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
,,,,,,,,,,,,,,,,,,
0.0,2009-01-02,AAPL,3.067143,3.251429,3.041429,2.754724,746015200.0,4.0,0.0,2.977272,2.648437,100.0,66.666667,100.0,2.754724,2.754724,39.189999,0.0
0.0,2009-01-02,AMGN,58.59,59.080002,57.75,43.422935,6547900.0,4.0,0.0,2.977272,2.648437,100.0,66.666667,100.0,43.422935,43.422935,39.189999,0.0
0.0,2009-01-02,AXP,18.57,19.52,18.4,15.308595,10955700.0,4.0,0.0,2.977272,2.648437,100.0,66.666667,100.0,15.308595,15.308595,39.189999,0.0
0.0,2009-01-02,BA,42.799999,45.560001,42.779999,33.941109,7010200.0,4.0,0.0,2.977272,2.648437,100.0,66.666667,100.0,33.941109,33.941109,39.189999,0.0
0.0,2009-01-02,CAT,44.91,46.98,44.709999,31.40884,7117200.0,4.0,0.0,2.977272,2.648437,100.0,66.666667,100.0,31.40884,31.40884,39.189999,0.0


## Define the Environment
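It may be worth reading into why our state requires 291 elements to describe completely. Not sure why we need an extra 1, or why we need 2x our stock dimension (a likely explanation, going by FinRL's `StockTradingEnv`: the state is the cash balance — the extra 1 — followed by one price and one share count per stock — the 2x — and then one value per technical indicator per stock, so 1 + 2×29 + 8×29 = 291 for 29 stocks and the 8 indicators). But let's trust it for now.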