In [1]:
# df_OHLCV has data only for NYSE trading days (no weekend data)
# df_OHLCV_all_dates also includes weekend dates, when BTC trades
# read symbols in file into list symbols_in_file
# download OHLCV data for the symbols in symbols_in_file
# drop symbols with all NaN in OHLCV columns from df
# rename column names from ['Open', ..., 'Volume'] to ['open', ..., 'volume']
# drop weekend data by reindexing to the date index of index_symbol
# pickle the downloaded data as df_OHLCV_downloaded
# pickle df_OHLCV
# pickle symbols_df_OHLCV
# create df_symbols_close, sort df by symbols
# pickle df_symbols_close

In [2]:
import yfinance as yf
import time
import pandas as pd
from random import randint
from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, yf_symbols_close, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV, yf_download_AdjOHLCV_noAutoAdj
# Set verbose = True to print more output
verbose = False
# verbose = True

# Project root folder and the sub-folder holding the symbols file and pickles
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Text file listing the symbols to download
# filename_symbols = f"{path_data_dump}symbols_mktCap_2b.csv"
filename_symbols = f"{path_data_dump}vg_symbols_4chars_max.csv"

# Pickle filenames, all stored under path_data_dump
filename_pickled_df_OHLCV_downloaded = "df_OHLCV_downloaded"  # frame as downloaded
filename_pickled_df_OHLCV = "df_OHLCV"  # reindexed to NYSE dates
filename_pickled_df_symbols_close = "df_symbols_close"
filename_pickled_symbols_df_OHLCV = "symbols_df_OHLCV"

In [3]:
# Stop if Yahoo has not updated OHLCV data.
# The index symbol's date index is reused further down as the proxy for NYSE
# trading dates when df_OHLCV is re-indexed, so its full history is needed.
index_symbol = "XOM"
# Ask for the complete history explicitly instead of relying on the library
# default for `period` (NOTE(review): newer yfinance releases appear to default
# to a shorter window when `period` is omitted -- confirm for the installed version).
df_XOM = yf.download(index_symbol, period="max")
if df_XOM.empty:
    # An empty frame would otherwise fail below with an opaque IndexError on index[-1].
    raise SystemExit(f'Yahoo returned no data for index symbol {index_symbol}')
df_last_date = df_XOM.index[-1].strftime('%Y-%m-%d')
# NOTE(review): the comparison uses calendar "yesterday", so it does not account
# for weekends or market holidays.
yesterday = str(date.today() - timedelta(days=1))
if yesterday == df_last_date:
    msg_stop = f'Yahoo has not updated OHLCV data, today: {date.today()}, Yahoo download last date: {df_last_date}'
    raise SystemExit(msg_stop)

[*********************100%***********************]  1 of 1 completed


In [4]:
# Nothing to refresh when the pickled df_OHLCV already ends on today's date
print(f"Full path to pickled df_OHLCV:  {path_data_dump}{filename_pickled_df_OHLCV}")
df = pickle_load(path_data_dump, filename_pickled_df_OHLCV, verbose=verbose)
today = date.today().isoformat()  # same 'YYYY-MM-DD' text as str(date.today())
df_OHLCV_last_date = df.index[-1].strftime('%Y-%m-%d')
if df_OHLCV_last_date == today:
    msg_stop = f'df_OHLCV is up to date, today: {today}, df_OHLCV last date: {df_OHLCV_last_date}'
    raise SystemExit(msg_stop)

Full path to pickled df_OHLCV:  C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_OHLCV


In [5]:
# Read the symbols listed in filename_symbols into a list, then split that list
# into chunks so the download below can proceed one chunk at a time.
# The second argument (400) is presumably the chunk size -- confirm against chunked_list.
symbols_in_file = read_symbols_file(filename_symbols)
symbols_chunks = chunked_list(symbols_in_file, 400)  # e.g. [['A', 'BB', ...], ['CC', 'DD', ...], ..., ['Z', 'ZWS', ...]]

In [6]:
# Download OHLCV data chunk by chunk, sleeping between chunks.
# (took 24 minutes to download 1917 symbols without error caused by Yahoo)
df_list = []
last_chunk_no = len(symbols_chunks) - 1
for chunk_no, symbols in enumerate(symbols_chunks):
    df = yf_download_AdjOHLCV(symbols, verbose=False)
    df_list.append(df)
    if chunk_no == last_chunk_no:
        # final chunk: no pause needed after the last download
        print('downloaded symbols from all chucks')
        continue
    print(f'downloaded symbols from chuck {chunk_no}, sleep start')
    # sleep 78(18m 25s), 155, 305 sec to avoid download error from Yahoo
    time.sleep(78)
    print(f'downloaded symbols from chuck {chunk_no}, sleep ends')

[*********************100%***********************]  400 of 400 completed
downloaded symbols from chuck 0, sleep start
downloaded symbols from chuck 0, sleep ends
[*********************100%***********************]  400 of 400 completed

3 Failed downloads:
- BH.A: No data found, symbol may be delisted
- BF.B: No data found for this date range, symbol may be delisted
- BF.A: No data found, symbol may be delisted
downloaded symbols from chuck 1, sleep start
downloaded symbols from chuck 1, sleep ends
[*********************100%***********************]  400 of 400 completed
downloaded symbols from chuck 2, sleep start
downloaded symbols from chuck 2, sleep ends
[*********************100%***********************]  400 of 400 completed
downloaded symbols from chuck 3, sleep start
downloaded symbols from chuck 3, sleep ends
[*********************100%***********************]  400 of 400 completed
downloaded symbols from chuck 4, sleep start
downloaded symbols from chuck 4, sleep ends
[**********

In [7]:
# Combine the per-chunk downloads column-wise into a single frame
df = pd.concat(df_list, axis="columns")

In [8]:
# Spot-check: compare Close values in df with a fresh Yahoo download for a few symbols
test_symbols = ['A', 'SHEL', 'YUM']
now = datetime.now()  # current local date and time
if now.hour < 20:
    # before 8 pm Yahoo data may not be updated yet, so the check is skipped
    print("Did not verify df test_symbols' close against Yahoo.  It's not 8 PM yet. Yahoo may not have updated their data.")
else:
    # the reference row is the same for every symbol: 222nd row from the end of df
    s = df.iloc[-222]
    sDate = s.name.strftime('%Y-%m-%d')
    for symbol in test_symbols:
        sClose = s[symbol].Close
        df_sym = yf.Ticker(symbol).history(period='2y')
        yhClose = df_sym.loc[sDate]['Close']
        abs_pct_diff = abs(1 - sClose/yhClose)*100  # absolute difference, in percent
        print(
            f'symbol:  {symbol:>4}   Date: {sDate:13}df_Close: {sClose:>10.5f} '
            f'    Yahoo_Close: {yhClose:>10.5f}   %_dif_Close: {abs_pct_diff:>7.5f}'
        )
        if abs_pct_diff > .0001:
            msg_stop = f'{symbol}  %_dif_Close > .0001'
            raise SystemExit(msg_stop)
    else:
        # reached only when every symbol passed (a mismatch raises SystemExit above)
        msg_done = "No errors found.  df test_symbols' Close matched Yahoo symbols' Close "
        print(msg_done)

Did not verify df test_symbols' close against Yahoo.  It's not 8 PM yet. Yahoo may not have updated their data.


In [9]:
# Pickle the concatenated download (df) under filename_pickled_df_OHLCV_downloaded
# before it is sorted, filtered or re-indexed by the cells that follow.
print(f"Full path to pickled df_OHLCV_downloaded:  {path_data_dump}{filename_pickled_df_OHLCV_downloaded}")
pickle_dump(df, path_data_dump, filename_pickled_df_OHLCV_downloaded, verbose=verbose)

Full path to pickled df_OHLCV_downloaded:  C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_OHLCV__downloaded


In [10]:
# Order the columns by symbol (column level 0); the order within each symbol is left as is
df_OHLCV_all_dates = df.sort_index(axis="columns", level=0, sort_remaining=False)

In [11]:
# Drop symbols with all NaN in OHLCV columns from df.
# drop_symbols_all_NaN returns three values, unpacked here as: the filtered frame,
# the symbols kept (symbols_OHLCV) and the symbols dropped (symbols_dropped).
# NOTE: df_OHLCV_all_dates is rebound to the filtered frame, so re-running this
# cell operates on the already-filtered data.
df_OHLCV_all_dates, symbols_OHLCV, symbols_dropped = drop_symbols_all_NaN(df_OHLCV_all_dates, verbose)
# print(f'symbols_OHLCV: {symbols_OHLCV}')
print(f'symbols with all NaN dropped from df_OHLCV_all_dates: {symbols_dropped}')

Dataframe for BF.A is all NaN
Dataframe for BF.B is all NaN
Dataframe for BH.A is all NaN
Symbol's OHLCV are all NaN: ['BF.A', 'BF.B', 'BH.A']
symbols with all NaN dropped from df_OHLCV_all_dates: ['BF.B', 'BF.A', 'BH.A']


In [12]:
# Rename column level 1 from ['Open', ..., 'Volume'] to ['open', ..., 'volume'].
# Do this *ONLY AFTER* the all-NaN symbols have been dropped from df:
#   symbols with all NaN have an added AdjClose column and will cause errors.
# Unused levels are removed first; .remove_unused_levels() prevents ValueError,
#   e.g. ValueError: On level 1, code max (5) >= length of level (5). NOTE: this index is in an inconsistent state
# That error may be caused by removing symbols with all NaN in OHLCV columns from the dataframe.
df_OHLCV_all_dates.columns = df_OHLCV_all_dates.columns.remove_unused_levels()
# set_levels reorders df columns in alphabetical order, so the new names are listed alphabetically as well
ohlcv_lowercase = ['close', 'high', 'low', 'open', 'volume']
df_OHLCV_all_dates.columns = df_OHLCV_all_dates.columns.set_levels(ohlcv_lowercase, level=1)

In [13]:
# Drop weekend rows by aligning to the date index of index_symbol:
# Exxon's dates serve as the proxy for NYSE trading dates.
# Dates in that index that are missing from df_OHLCV_all_dates are filled with NaN.
myNaN = float('nan')
df_OHLCV = df_OHLCV_all_dates.reindex(index=df_XOM.index, fill_value=myNaN)

In [14]:
# Pickle df_OHLCV and the list of its symbols.
# (The downloaded frame itself is pickled right after the download, so it is not repeated here.)
for label, obj, fname in (
    ("df_OHLCV", df_OHLCV, filename_pickled_df_OHLCV),
    ("symbols_df_OHLCV", symbols_OHLCV, filename_pickled_symbols_df_OHLCV),
):
    print(f"Full path to pickled {label}:  {path_data_dump}{fname}")
    pickle_dump(obj, path_data_dump, fname, verbose=verbose)

Full path to pickled df_OHLCV:  C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_OHLCV
Full path to pickled symbols_df_OHLCV:  C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/symbols_df_OHLCV


In [15]:
# Build the frame of symbols' Close, order its columns by symbol, then pickle it.
# yf_symbols_close is handed the df_OHLCV pickle filename; its four return values
# also rebind symbols_OHLCV and symbols_dropped from the earlier cell.
(
    df_symbols_close,
    dates_dropped,
    symbols_OHLCV,
    symbols_dropped,
) = yf_symbols_close(path_dir, path_data_dump, filename_pickled_df_OHLCV, verbose=verbose)
# sort columns by symbol (level 0)
df_symbols_close = df_symbols_close.sort_index(axis="columns", level=0, sort_remaining=False)
print(f"Full path to pickled df_symbols_close:  {path_data_dump}{filename_pickled_df_symbols_close}")
pickle_dump(df_symbols_close, path_data_dump, filename_pickled_df_symbols_close, verbose=verbose)

In [None]:
# Retrieve the pickled files written by this notebook.
# Each "Full path" message is printed immediately before the load it describes
# (the messages for df_symbols_close and symbols_df_OHLCV were previously swapped).
# No pickle of df_OHLCV_all_dates is written by the cells shown, so none is loaded here.
print(f"Full path to pickled df_OHLCV_downloaded:  {path_data_dump}{filename_pickled_df_OHLCV_downloaded}")
df_OHLCV_downloaded = pickle_load(path_data_dump, filename_pickled_df_OHLCV_downloaded, verbose=verbose)
print(f"Full path to pickled df_OHLCV:  {path_data_dump}{filename_pickled_df_OHLCV}")
df = pickle_load(path_data_dump, filename_pickled_df_OHLCV, verbose=verbose)
print(f"Full path to pickled df_symbols_close:  {path_data_dump}{filename_pickled_df_symbols_close}")
df_close = pickle_load(path_data_dump, filename_pickled_df_symbols_close, verbose=verbose)
print(f"Full path to pickled symbols_df_OHLCV:  {path_data_dump}{filename_pickled_symbols_df_OHLCV}")
symbols_df = pickle_load(path_data_dump, filename_pickled_symbols_df_OHLCV, verbose=verbose)

In [None]:
# Display the columns of df (the df_OHLCV frame reloaded from its pickle in the previous cell);
# presumably a (symbol, field) MultiIndex -- confirm from the output.
df.columns