In [13]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/
# https://stackoverflow.com/questions/25929319/how-to-iterate-over-pandas-multiindex-dataframe-using-index
# https://stackoverflow.com/questions/24495695/pandas-get-unique-multiindex-level-values-by-label
# https://stackoverflow.com/questions/55706391/pandas-crosstab-on-multiple-columns-then-groupby

# https://matplotlib.org/stable/gallery/pyplots/pyplot_text.html#sphx-glr-gallery-pyplots-pyplot-text-py

In [14]:
import numpy as np
import pandas as pd

from myUtils import list_dump, pickle_dump, pickle_load

# --- Locations ---------------------------------------------------------------
# Project root and the sub-folder that the pickled frames are read from / written to.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# --- Pickle names read by this notebook --------------------------------------
filename_pickled_df_adjOHLCV = "df_adjOHLCV"  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close

# --- Pickle names written by this notebook -----------------------------------
filename_pickled_df_a = "df_OHLCV_clean"  # df adjusted OHLCV, dropped symbols with no vol and close
filename_pickled_df_c = "df_close_clean"  # df close, dropped symbols with no vol and close

# --- Pickle names kept for reference (not used by the cells in this notebook) -
filename_pickled_symbols_df_adjOHLCV = "symbols_df_adjOHLCV"  # symbols in df_adjOHLCV
filename_pickled_perf_rank_dict = "perf_rank_dict"  # store symbols from performance rank results
filename_pickled_r_all_ranks = "r_all_ranks"  # list of top 100 most common symbols from performance rank results

# --- Run options -------------------------------------------------------------
verbose = False  # True prints more output

# Negative row offset used to keep only the trailing rows of each frame:
# 6 years x ~250 trading days. Use -250 * 60 to keep (up to) 60 years instead.
look_back_days = -6 * 250
#################

In [15]:
# Load the two input frames. Each path is echoed before the load so the cell
# output records which file was read.
# NOTE(review): pickle_load comes from myUtils; if it wraps pickle.load, only
# point it at trusted files — confirm.

# Per-symbol close prices.
print(f"Full path to pickled df_symbols_close:  {path_data_dump}{filename_pickled_df_symbols_close}")
df_close = pickle_load(path_data_dump, filename_pickled_df_symbols_close, verbose=verbose)
# Adjusted OHLCV for every symbol.
print(f"Full path to pickled df_adjOHLCV:  {path_data_dump}{filename_pickled_df_adjOHLCV}")
df_adjOHLCV = pickle_load(path_data_dump, filename_pickled_df_adjOHLCV, verbose=verbose)

Full path to pickled df_symbols_close:  C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_symbols_close
Full path to pickled df_adjOHLCV:  C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_adjOHLCV


In [16]:
# https://stackoverflow.com/questions/63826291/pandas-series-find-column-by-value
# Keep only the trailing look-back window of the adjusted OHLCV frame.
df = df_adjOHLCV.iloc[look_back_days:]
# One 'Volume' column per symbol (level 1 of the column MultiIndex is the field name).
df_v = df.xs('Volume', level=1, axis=1)
# np.where yields (row positions, column positions) of every zero-volume entry;
# only the columns matter here, de-duplicated and in ascending order.
idx_no_volume = sorted(set(np.where(df_v == 0)[1]))
symbols_no_volume = df_v.columns[idx_no_volume]
print(f'symbols with no volume:\n{symbols_no_volume}')

symbols with no volume:
Index(['AAWW', 'AMCR', 'AVT', 'BCI', 'BHF', 'BKR', 'CCI', 'CHD', 'CNC', 'CNXC',
       'COUP', 'ESAB', 'FCFS', 'GEN', 'HUBB', 'IBKR', 'JJC', 'PEAK', 'PFG',
       'SBNY', 'SIVB', 'UA', 'UMPQ', 'VC', 'VNT', 'WH', 'XEL'],
      dtype='object')


In [17]:
# Day-over-day change in volume per symbol. The first row has no predecessor,
# so it is NaN and never compares equal to 0.
df_dif = df_v.diff()
# A zero difference means the volume repeats the previous day's value exactly
# (this includes two consecutive zero-volume days). Collect the affected
# column positions, de-duplicated and in ascending order.
idx_same_volume = sorted(set(np.where(df_dif == 0)[1]))
symbols_same_volume = df_v.columns[idx_same_volume]
print(f'symbols with same volume:\n{symbols_same_volume}')

symbols with same volume:
Index(['AAWW', 'ACM', 'ACN', 'ADP', 'AGCO', 'AMCR', 'AMED', 'AMG', 'AMGN',
       'ANSS',
       ...
       'VAC', 'VICR', 'VNDA', 'VRSK', 'VSAT', 'WERN', 'WEX', 'WTS', 'WTW',
       'XEL'],
      dtype='object', length=214)


In [18]:
# One 'Close' column per symbol, with NaNs replaced by 0 so that a single
# `== 0` test catches both missing and zero close prices.
# NOTE: this `df_c` is only a scratch frame for the check below; the name is
# rebound from `df_close` later in the notebook.
df_c = df.xs('Close', level=1, axis=1).fillna(0)
# Column positions with at least one missing/zero close, de-duplicated and ascending.
idx_no_close = sorted(set(np.where(df_c == 0)[1]))
symbols_no_close = df_c.columns[idx_no_close]
print(f'symbols with NaN close:\n{symbols_no_close}')

symbols with NaN close:
Index(['AAWW', 'ABNB', 'AIR', 'AIRC', 'ALGM', 'APOG', 'ATCO', 'BHF', 'BJ',
       'BOH', 'BRBR', 'CAJ', 'CARR', 'CDAY', 'CEG', 'CHK', 'CHX', 'CNXC',
       'COUP', 'CR', 'CRWD', 'CTVA', 'DDOG', 'DOW', 'DT', 'DTM', 'ENSG',
       'ESAB', 'ETH-USD', 'ETRN', 'FOX', 'FOXA', 'FYBR', 'GEHC', 'GFS', 'GO',
       'GXO', 'HTH', 'IAA', 'JBGS', 'JBL', 'JHG', 'JJC', 'KD', 'LCID', 'MP',
       'MRNA', 'MRVL', 'NARI', 'NVST', 'NVT', 'OGN', 'OTIS', 'PDD', 'PEB',
       'PGNY', 'RCII', 'RFP', 'RIVN', 'SGOV', 'SHC', 'SITM', 'SIVB', 'SJR',
       'SPR', 'SWAV', 'SWBI', 'TROX', 'TY', 'UAA', 'UMPQ', 'UNFI', 'VICI',
       'VNT', 'VSCO', 'WH', 'WTW', 'YETI', 'ZM', 'ZS'],
      dtype='object')


In [19]:
# Concatenate the three exclusion lists; a symbol can appear in more than one of them.
symbols_drop = [*symbols_no_close, *symbols_no_volume, *symbols_same_volume]
print(f'combined symbols with no volume, same volume and no close, inculdes duplicate symbols: {len(symbols_drop)}')

# Collapse to unique symbols, in sorted order.
symbols_drop = sorted(set(symbols_drop))

# Remove the flagged symbols from the windowed OHLCV frame
# (level 0 of its column MultiIndex is the symbol) ...
df_a = df.drop(columns=symbols_drop, level=0)
# ... and from the close frame restricted to the same look-back window.
df_c = df_close.iloc[look_back_days:].drop(columns=symbols_drop)
print(f'unique symbols dropped from df_a (adjOLHLV) and df_c (Close): {len(symbols_drop)}')

combined symbols with no volume, same volume and no close, inculdes duplicate symbols: 321
unique symbols dropped from df_a (adjOLHLV) and df_c (Close): 292


In [20]:
# Summary of what was flagged and what remains after the drop.

# Count the symbols in df_a from the column index itself (unique level-0
# labels) instead of dividing the column count by an assumed five fields per
# symbol, which silently miscounts if any symbol carries more or fewer fields.
n_symbols_df_a = df_a.columns.get_level_values(0).nunique()

print(f'symbols with no volume:   {len(symbols_no_volume):>5,}')
print(f'symbols with same volume: {len(symbols_same_volume):>5,}')
print(f'symbols with no close:    {len(symbols_no_close):>5,}\n')
print(f'symbols total before drop:                                        {len(df_close.columns):>5,}')
print(f'unique symbols dropped from df OHLCVA (df_a) and df Close (df_c): {len(symbols_drop):>5,}\n')
print('                                          symbols     rows')
print(f'df adjOHLCV (df_a) after dropped symbols:   {n_symbols_df_a:>5,}    {len(df_a):>5,}')
print(f'df Close (df_c) after dropped symbols:      {len(df_c.columns):>5,}    {len(df_c):>5,}')


symbols with no volume:      27
symbols with same volume:   214
symbols with no close:       80

symbols total before drop:                                        1,593
unique symbols dropped from df OHLCVA (df_a) and df Close (df_c):   292

                                          symbols     rows
df adjOHLCV (df_a) after dropped symbols:   1,301    1,500
df Close (df_c) after dropped symbols:      1,301    1,500


In [21]:
# Persist the two cleaned frames into the data-dump folder and echo where each
# one went. pickle_dump is the myUtils helper; it is passed the object, the
# folder and the file name.

# Cleaned adjusted-OHLCV frame.
pickle_dump(df_a, path_data_dump, filename_pickled_df_a)
print(f'pickled df adjOHLCV after dropping symbols with no volume, same volume, and no close:\n{path_data_dump}{filename_pickled_df_a}')
# Cleaned close frame.
pickle_dump(df_c, path_data_dump, filename_pickled_df_c)
print(f'pickled df Close after dropping symbols with no volume, same volume, and no close:\n{path_data_dump}{filename_pickled_df_c}')

pickled df adjOHLCV after dropping symbols with no volume, same volume, and no close:
C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_OHLCV_clean
pickled df Close after dropping symbols with no volume, same volume, and no close:
C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/df_close_clean


In [22]:
# Save the symbols that survived the clean-up alongside the pickled frames.
# `list_dump` is imported with the other myUtils helpers in the import cell at
# the top of the notebook; it is passed the list, the folder and the file name.
# NOTE(review): presumably writes the symbols as text — confirm against myUtils.
f_symbols_df_close_clean = 'symbols_df_close_clean.csv'  # symbols text file
symbols_df_c = df_c.columns.tolist()  # column names in df_c
list_dump(symbols_df_c, path_data_dump, f_symbols_df_close_clean)

In [23]:
# Preview the last rows of the cleaned adjusted-OHLCV frame (symbol x field columns).
df_a.tail()

Unnamed: 0_level_0,A,A,A,A,A,AA,AA,AA,AA,AA,...,ZUMZ,ZUMZ,ZUMZ,ZUMZ,ZUMZ,ZWS,ZWS,ZWS,ZWS,ZWS
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-04-24,138.800003,139.520004,138.089996,138.479996,856700.0,37.98,38.540001,37.560001,38.040001,5027800.0,...,17.9,18.030001,17.27,17.6,307700.0,20.9,21.48,20.9,21.41,1680900.0
2023-04-25,136.539993,136.779999,129.720001,130.369995,2154900.0,36.900002,36.900002,35.720001,35.75,6363500.0,...,17.360001,17.610001,16.43,16.690001,443900.0,21.18,21.57,20.959999,20.969999,2199400.0
2023-04-26,128.149994,133.839996,128.110001,133.119995,3003800.0,36.080002,36.200001,35.02,35.450001,4799500.0,...,16.719999,17.27,16.719999,17.120001,253900.0,21.299999,22.18,20.23,20.440001,2308400.0
2023-04-27,132.960007,133.860001,131.330002,133.25,1125300.0,35.555,36.48,35.402,36.27,3850900.0,...,17.18,17.43,16.889999,17.32,189400.0,20.459999,20.98,20.059999,20.700001,1660200.0
2023-04-28,133.449997,136.929993,133.449997,135.429993,1895031.0,36.080002,37.236198,35.775002,37.139999,3434925.0,...,17.32,17.690001,17.32,17.485001,256367.0,20.75,21.75,20.889999,21.549999,1760406.0


In [24]:
# Preview the last rows of the cleaned close frame (one column per symbol).
df_c.tail()

Unnamed: 0_level_0,A,AA,AAL,AAP,AAPL,AB,ABB,ABBV,ABC,ABM,...,YY,ZBH,ZBRA,ZD,ZG,ZION,ZTO,ZTS,ZUMZ,ZWS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-04-24,138.479996,38.040001,13.32,126.830002,165.330002,35.549999,35.639999,164.080002,167.710007,42.139999,...,27.790001,138.139999,288.190002,74.349998,43.400002,28.27,27.940001,176.350006,17.6,21.41
2023-04-25,130.369995,35.75,12.84,123.779999,163.770004,34.450001,36.639999,164.899994,168.089996,41.509998,...,27.219999,139.059998,284.100006,73.339996,42.450001,26.73,27.459999,173.020004,16.690001,20.969999
2023-04-26,133.119995,35.450001,12.74,122.330002,163.759995,34.099998,35.59,161.800003,165.490005,40.880001,...,28.209999,139.080002,280.420013,71.93,40.759998,26.389999,27.459999,172.940002,17.120001,20.440001
2023-04-27,133.25,36.27,12.88,124.900002,168.410004,34.91,36.439999,148.869995,167.410004,41.709999,...,30.23,138.350006,284.119995,73.059998,41.720001,26.58,27.389999,173.949997,17.32,20.700001
2023-04-28,135.429993,37.139999,13.64,125.529999,169.679993,34.959999,36.09,151.119995,166.850006,42.580002,...,30.43,138.440002,288.029999,73.139999,42.779999,27.860001,27.68,175.779999,17.485001,21.549999
