In [None]:
# Optional one-time environment setup. Every line below is intentionally commented out;
# uncomment them only when the packages are not already installed in the kernel's environment.
# ## install required packages
# !pip install swig
# !pip install wrds
# !pip install pyportfolioopt
# ## install finrl library
# !pip install git+https://github.com/AI4Finance-Foundation/FinRL.git


In [1]:
import datetime
import itertools
import os

import numpy as np
import pandas as pd
import yfinance as yf

from finrl import config_tickers
from finrl.config import INDICATORS
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader


In [2]:
# Date windows (ISO "YYYY-MM-DD" strings) used throughout the notebook.
# The download spans TRAIN_START_DATE..TRADE_END_DATE; the split cell cuts it
# into a training window and a trading window that share one boundary date.
TRAIN_START_DATE, TRAIN_END_DATE = "2009-01-01", "2022-07-01"
TRADE_START_DATE, TRADE_END_DATE = "2022-07-01", "2024-05-01"


In [3]:
# DOW_30_TICKER is the ticker list shipped in finrl.config_tickers; it is the universe
# passed to the downloader in the next cell.
# NOTE(review): `config_tickers` is already imported in the imports cell, so this import
# could live there with the others.
from finrl.config_tickers import DOW_30_TICKER

# Print the list so the exact symbols used for this run are recorded in the notebook.
print(DOW_30_TICKER)


['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO', 'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'KO', 'JPM', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'CRM', 'VZ', 'V', 'WBA', 'WMT', 'DIS', 'DOW']


In [4]:
# Fetch the raw price/volume rows for every symbol in DOW_30_TICKER, covering the
# start of the training window through the end of the trading window.
df_raw = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=DOW_30_TICKER,
).fetch_data()



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Shape of DataFrame:  (113141, 8)


In [5]:
# Peek at the first rows of the raw download to check the column layout.
df_raw.head()


Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2009-01-02,3.067143,3.251429,3.041429,2.737005,746015200,AAPL,4
1,2009-01-02,58.59,59.080002,57.75,42.107323,6547900,AMGN,4
2,2009-01-02,18.57,19.52,18.4,15.053307,10955700,AXP,4
3,2009-01-02,42.799999,45.560001,42.779999,33.941109,7010200,BA,4
4,2009-01-02,44.91,46.98,44.709999,30.712517,7117200,CAT,4


In [6]:
# Row count per ticker. In the recorded output DOW has 1288 rows versus 3857 for every
# other ticker, so its history covers only part of the download window.
df_raw.tic.value_counts()


tic
AAPL    3857
AMGN    3857
AXP     3857
BA      3857
CAT     3857
CRM     3857
CSCO    3857
CVX     3857
DIS     3857
GS      3857
HD      3857
HON     3857
IBM     3857
INTC    3857
JNJ     3857
JPM     3857
KO      3857
MCD     3857
MMM     3857
MRK     3857
MSFT    3857
NKE     3857
PG      3857
TRV     3857
UNH     3857
V       3857
VZ      3857
WBA     3857
WMT     3857
DOW     1288
Name: count, dtype: int64

## Pre-Process Data

In [7]:
# Build the model features on top of the raw download: the stockstats indicators
# listed in INDICATORS, plus the VIX and turbulence columns. No user-defined
# features are added.
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)

# Sanitise the result in one chain: missing values and BOTH signed infinities
# become 0. Replacing only np.inf would leave -np.inf in the frame. fillna()
# already returns a new frame, so no explicit .copy() is needed.
processed = (
    fe.preprocess_data(df_raw)
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)


Successfully added technical indicators


[*********************100%%**********************]  1 of 1 completed


Shape of DataFrame:  (3856, 8)
Successfully added vix
Successfully added turbulence index


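Reference: `INDICATORS` is defined in FinRL's `config.py` (imported above via `from finrl.config import INDICATORS`).

INDICATORS

These are stockstats technical-indicator column names;
see https://pypi.org/project/stockstats/ for the other names that are available.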

    INDICATORS = [
        "macd",
        "boll_ub",
        "boll_lb",
        "rsi_30",
        "cci_30",
        "dx_30",
        "close_30_sma",
        "close_60_sma",
    ]

In [8]:
# Spot-check five random rows of the processed frame. No random_state is passed,
# so the rows shown differ from run to run.
processed.sample(5)


Unnamed: 0,date,open,high,low,close,volume,tic,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
62027,2017-06-30,94.379997,94.620003,93.779999,89.220764,9036600,V,4,0.254287,92.016668,88.66158,52.363288,-69.620189,9.548064,90.21592,88.404808,11.18,124.124969
18743,2011-07-27,136.789993,137.25,134.440002,106.387062,5250000,GS,2,0.026746,109.466705,100.816658,46.449051,63.336648,1.689565,105.289091,107.896268,22.98,35.890914
11898,2010-08-19,33.720001,33.779999,33.060001,28.683229,12070700,DIS,3,-0.019374,30.577433,28.58746,46.71879,-94.163121,6.872289,29.412341,29.220052,26.440001,21.7311
88948,2021-03-11,215.0,218.169998,214.350006,215.333084,6431000,CRM,3,-6.005204,258.651326,193.685766,44.4294,-76.48437,23.625257,228.51499,225.02449,21.91,20.478945
48333,2015-08-17,56.106869,57.013359,55.896946,43.290554,6407158,MRK,0,0.154406,43.330342,41.377335,55.104031,125.621726,15.269093,42.291245,42.356922,13.02,21.872891


In [9]:
# Row/column count after feature engineering. The recorded (111824, 18) has fewer rows
# than the 113141 that were downloaded.
# NOTE(review): 111824 = 29 x 3856, which suggests one ticker (presumably DOW, the only
# one with a short history) was dropped during preprocessing — confirm.
processed.shape


(111824, 18)

## Save The Data

In [10]:
# Cut the processed frame into the training window and the trading window.
# TRAIN_END_DATE and TRADE_START_DATE are the same day; in the recorded run the
# two lengths add up to the full processed row count (98513 + 13311 = 111824),
# so that boundary day is not counted in both sets.
train = data_split(processed, TRAIN_START_DATE, TRAIN_END_DATE)
trade = data_split(processed, TRADE_START_DATE, TRADE_END_DATE)

# Report the size of each split, one number per line.
print(len(train), len(trade), sep="\n")


98513
13311


In [11]:
# Persist both splits as CSV (UTF-8 with BOM, index column included).
train_path = 'data/train.csv'
trade_path = 'data/trade.csv'

for split_df, csv_path in ((train, train_path), (trade, trade_path)):
    # Create the target directory first so a fresh checkout without `data/`
    # does not fail; `or "."` covers a bare filename with no directory part.
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
    # Hand pandas the path rather than an already-open text handle: pandas then
    # opens the file with the correct newline handling itself. A handle opened
    # without newline='' can produce doubled line endings on Windows.
    split_df.to_csv(csv_path, encoding='utf-8-sig')
