In [None]:
# df_OHLCV has data only for NYSE trading days, no weekend data
# df_OHLCV_all_dates includes data for weekends when BTC trades
# read symbols in file to list symbols_in_file
# download OHLCV data for symbols in symbols_in_file
# drop symbols with all NaN in OHLCV columns from df
# rename column names from ['Open', ..., 'Volume'] to ['open', ..., 'volume']
# drop weekend data by reindexing to the date index of index_symbol
# pickle df_OHLCV_all_dates
# pickle df_OHLCV
# pickle symbols_df_OHLCV
# create df_symbols_close, sort df by symbols
# pickle df_symbols_close

In [None]:
import yfinance as yf
from myUtils import yf_download_AdjOHLCV, pickle_dump, pickle_load, read_symbols_file, drop_symbols_all_NaN, yf_symbols_close # NOQA
# Verbosity switch handed to the myUtils helpers; set to True for more output.
verbose = False

# Directory layout: everything this notebook reads or writes lives under path_data_dump.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Text file listing the symbols to download.
filename_symbols = f"{path_data_dump}symbols_mktCap_2b.csv"

# Base names of the pickled outputs.
filename_pickled_df_OHLCV_all_dates = "df_OHLCV_all_dates"  # every downloaded date
filename_pickled_df_OHLCV = "df_OHLCV"  # reindexed to NYSE dates
filename_pickled_df_symbols_close = "df_symbols_close"  # close prices per symbol
filename_pickled_symbols_df_OHLCV = "symbols_df_OHLCV"  # symbols kept in df_OHLCV

In [None]:
# Stop if Yahoo has not updated OHLCV data
from datetime import date, timedelta
index_symbol = "XOM"
# XOM's trading dates are the proxy for NYSE trading dates, and df_XOM.index is
# reused further down to reindex df_OHLCV_all_dates, so the full history is
# requested explicitly instead of relying on yf.download's default period.
# NOTE(review): newer yfinance releases appear to default to a shorter period
# than "max" -- confirm against the installed version.
df_XOM = yf.download(index_symbol, period="max")
if df_XOM.empty:
  # Without this guard a download with no rows fails below with an opaque
  # IndexError on index[-1].
  raise SystemExit(f'Yahoo download returned no data for {index_symbol}, today: {str(date.today())}')
df_last_date = df_XOM.index[-1].strftime('%Y-%m-%d')
yesterday = str(date.today() - timedelta(days = 1))
# A last date equal to yesterday is taken to mean today's bar is not published yet.
# NOTE(review): this compares calendar days, so weekends and holidays are not
# treated specially -- confirm that is the intended behaviour.
if yesterday == df_last_date:
  msg_stop = f'Yahoo has not updated OHLCV data, today: {str(date.today())}, Yahoo download last date: {df_last_date}'
  raise SystemExit(msg_stop)

In [None]:
# Skip the refresh when the stored df_OHLCV already ends on today's date
print(f"Full path to pickled df_OHLCV:  {path_data_dump}{filename_pickled_df_OHLCV}")
df = pickle_load(path_data_dump, filename_pickled_df_OHLCV, verbose=verbose)
# last row label of the stored frame, formatted as YYYY-MM-DD
df_OHLCV_last_date = df.index[-1].strftime('%Y-%m-%d')
# ISO form of today's date, i.e. the same YYYY-MM-DD text
today = date.today().isoformat()
if df_OHLCV_last_date == today:
  msg_stop = f'df_OHLCV is up to date, today: {today}, df_OHLCV last date: {df_OHLCV_last_date}'
  raise SystemExit(msg_stop)

In [None]:
# Read the symbols listed in the file at filename_symbols.
# NOTE(review): presumably returns a list of ticker strings -- confirm in myUtils.
symbols_in_file = read_symbols_file(filename_symbols)
# Download OHLCV data for the symbols in symbols_in_file.
# NOTE(review): later cells sort and relabel column levels 0 and 1, so the result
# is presumably a DataFrame with (symbol, field) MultiIndex columns -- confirm.
df_OHLCV_all_dates = yf_download_AdjOHLCV(symbols_in_file, verbose=verbose)

In [None]:
# Order the columns by symbol (outer column level) only;
# sort_remaining=False keeps the inner level out of the sort key.
df_OHLCV_all_dates = df_OHLCV_all_dates.sort_index(
    axis="columns",
    level=0,
    sort_remaining=False,
)

In [None]:
# Drop symbols whose OHLCV columns are all NaN from df_OHLCV_all_dates.
# The helper hands back three values: the filtered frame, symbols_OHLCV
# (pickled further down as symbols_df_OHLCV) and symbols_dropped (reported below).
# NOTE(review): verbose is passed positionally here but by keyword in the other
# helper calls -- confirm it lands on the helper's verbose parameter.
df_OHLCV_all_dates, symbols_OHLCV, symbols_dropped = drop_symbols_all_NaN(df_OHLCV_all_dates, verbose)
# print(f'symbols_OHLCV: {symbols_OHLCV}')
print(f'symbols with all NaN dropped from df_OHLCV_all_dates: {symbols_dropped}')

In [None]:
# Rename the OHLCV field labels *ONLY AFTER* dropping symbols with all NaN from df:
#   symbols with all NaN carry an added AdjClose column and would cause errors.
# Goal: rename ['Open', ..., 'Volume'] to ['open', ..., 'volume'] on column level 1.
# .remove_unused_levels() discards level entries that no column uses any more
#   (e.g. the one left behind by the dropped symbols); without it set_levels fails with
#   ValueError: On level 1, code max (5) >= length of level (5).
# The labels are lowercased from the existing level values rather than replaced by a
# hard-coded positional list, so the mapping does not depend on the order in which
# the MultiIndex happens to store its level values.
expected_fields = ['open', 'high', 'low', 'close', 'volume']
df_OHLCV_all_dates.columns = df_OHLCV_all_dates.columns.remove_unused_levels()
lowercased_fields = df_OHLCV_all_dates.columns.levels[1].str.lower()
if sorted(lowercased_fields) != sorted(expected_fields):
  # Fail loudly (as the positional rename did) when an unexpected field is still present.
  raise ValueError(
    f'Unexpected OHLCV column labels on level 1: {list(lowercased_fields)}, expected: {expected_fields}'
  )
df_OHLCV_all_dates.columns = df_OHLCV_all_dates.columns.set_levels(lowercased_fields, level=1)

In [None]:
# Keep only the rows whose dates appear in index_symbol's (Exxon's) date index,
# which stands in for the NYSE trading calendar; this drops the weekend rows.
# Dates present in df_XOM but missing from df_OHLCV_all_dates are filled with NaN.
myNaN = float('nan')
df_OHLCV = df_OHLCV_all_dates.reindex(
    index=df_XOM.index,
    fill_value=myNaN,
)

In [None]:
# Pickle the two OHLCV frames and the surviving symbols, announcing each target path first.
for _label, _obj, _filename in (
    ("df_OHLCV_all_dates", df_OHLCV_all_dates, filename_pickled_df_OHLCV_all_dates),
    ("df_OHLCV", df_OHLCV, filename_pickled_df_OHLCV),
    ("symbols_df_OHLCV", symbols_OHLCV, filename_pickled_symbols_df_OHLCV),
):
  print(f"Full path to pickled {_label}:  {path_data_dump}{_filename}")
  pickle_dump(_obj, path_data_dump, _filename, verbose=verbose)

In [None]:
# Build the frame of per-symbol closing prices, order its columns by symbol, pickle it.
# NOTE(review): yf_symbols_close presumably reads the pickled df_OHLCV named by
# filename_pickled_df_OHLCV -- confirm in myUtils.
(
    df_symbols_close,
    dates_dropped,
    symbols_OHLCV,
    symbols_dropped,
) = yf_symbols_close(path_dir, path_data_dump, filename_pickled_df_OHLCV, verbose=verbose)
# sort columns on the outer level (symbol) only
df_symbols_close = df_symbols_close.sort_index(
    axis="columns",
    level=0,
    sort_remaining=False,
)
print(f"Full path to pickled df_symbols_close:  {path_data_dump}{filename_pickled_df_symbols_close}")
pickle_dump(df_symbols_close, path_data_dump, filename_pickled_df_symbols_close, verbose=verbose)

In [None]:
# Retrieve the pickled files written above as a round-trip check.
# Each print announces the path of the file loaded on the line right after it.
print(f"Full path to pickled df_OHLCV_all_dates:  {path_data_dump}{filename_pickled_df_OHLCV_all_dates}")
df_all_dates = pickle_load(path_data_dump, filename_pickled_df_OHLCV_all_dates, verbose=verbose)
print(f"Full path to pickled df_OHLCV:  {path_data_dump}{filename_pickled_df_OHLCV}")
df = pickle_load(path_data_dump, filename_pickled_df_OHLCV, verbose=verbose)
print(f"Full path to pickled df_symbols_close:  {path_data_dump}{filename_pickled_df_symbols_close}")
df_close = pickle_load(path_data_dump, filename_pickled_df_symbols_close, verbose=verbose)
print(f"Full path to pickled symbols_df_OHLCV:  {path_data_dump}{filename_pickled_symbols_df_OHLCV}")
symbols_df = pickle_load(path_data_dump, filename_pickled_symbols_df_OHLCV, verbose=verbose)

In [None]:
# Spot check: display the columns stored under symbol 'ZS' in df.
# NOTE(review): assumes 'ZS' is among the downloaded symbols; a missing key raises KeyError.
df['ZS']

In [None]:
# Spot check: print the last rows of df for a couple of symbols
mySym = ['AAPL', 'META']
for sym in mySym:
  print(sym)
  print(df[sym].tail())
  print()  # blank separator line between symbols