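#### This notebook cleans the stock prices downloaded from Yahoo Finance: it drops symbols with zero volume, repeated volume, or missing/zero close (except symbols that appear in past model picks) and saves the cleaned data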

In [29]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/
# https://stackoverflow.com/questions/25929319/how-to-iterate-over-pandas-multiindex-dataframe-using-index
# https://stackoverflow.com/questions/24495695/pandas-get-unique-multiindex-level-values-by-label
# https://stackoverflow.com/questions/55706391/pandas-crosstab-on-multiple-columns-then-groupby

# https://matplotlib.org/stable/gallery/pyplots/pyplot_text.html#sphx-glr-gallery-pyplots-pyplot-text-py

In [30]:
import os
import sys

# Make the parent of the current working directory importable so that
# `myUtils` can be imported by the cells that follow.
current_dir = os.getcwd()

# Parent directory where myUtils is located
path_utils = os.path.dirname(current_dir)

# Guard the append: re-running this cell must not pile up duplicate entries
# in sys.path.
if path_utils not in sys.path:
    sys.path.append(path_utils)

In [31]:
import pandas as pd
import numpy as np
from myUtils import pickle_load, pickle_dump

# --- Locations -------------------------------------------------------------
# Root folder of the yfinance data.
# Alternative location used previously: "C:/Users/ping/MyDrive/stocks/yfinance/"
path_dir = 'G:/My Drive/stocks/yfinance/'
path_data_dump = f'{path_dir}VSCode_dump/'

# Disabled: symbols text files
# filename_symbols = path_data_dump + 'vg_symbols_4chars_max.csv'
# filename_symbols = path_data_dump + 'my_symbols.csv'

# --- Names of pickled objects stored in path_data_dump -----------------------
# Disabled: _filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  (OHLCVA downloaded from Yahoo)
filename_pickled_df_adjOHLCV = "df_adjOHLCV"  # adjusted OHLCV
filename_pickled_df_symbols_close = 'df_symbols_close'  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV = "symbols_df_adjOHLCV"  # symbols in df_adjOHLCV
filename_pickled_perf_rank_dict = "perf_rank_dict"  # symbols from performance rank results
filename_pickled_r_all_ranks = "r_all_ranks"  # top 100 most common symbols from performance rank results
filename_pickled_df_a = "df_OHLCV_clean"  # adjusted OHLCV after dropping symbols with no vol and close
filename_pickled_df_c = "df_close_clean"  # close after dropping symbols with no vol and close

# --- Behaviour ---------------------------------------------------------------
verbose = False  # set True for more output

# Negative row offset used to slice the frames: keeps the last 250 * 6 rows
# (6 years of data). A longer window of -250 * 60 was used previously.
look_back_days = -(250 * 6)
#################

In [32]:
# Load the two input frames from path_data_dump, printing each full path first:
#   df_close    <- pickled 'df_symbols_close' (symbols' adjusted close)
#   df_adjOHLCV <- pickled 'df_adjOHLCV' (adjusted OHLCV)
# NOTE(review): pickle_load comes from myUtils and is not visible here; if it wraps
# pickle.load, only point it at files you trust — confirm.
print(f"Full path to pickled df_symbols_close:  {path_data_dump}{filename_pickled_df_symbols_close}")
df_close = pickle_load(path_data_dump, filename_pickled_df_symbols_close, verbose=verbose)
print(f"Full path to pickled df_adjOHLCV:  {path_data_dump}{filename_pickled_df_adjOHLCV}")
df_adjOHLCV = pickle_load(path_data_dump, filename_pickled_df_adjOHLCV, verbose=verbose)

Full path to pickled df_symbols_close:  G:/My Drive/stocks/yfinance/VSCode_dump/df_symbols_close
Full path to pickled df_adjOHLCV:  G:/My Drive/stocks/yfinance/VSCode_dump/df_adjOHLCV


In [33]:
# https://stackoverflow.com/questions/63826291/pandas-series-find-column-by-value
# Restrict the analysis to the trailing rows (look_back_days is a negative offset).
df = df_adjOHLCV[look_back_days:]

# 'Volume' field of every symbol, taken from level 1 of the column MultiIndex.
df_v = df.xs(key='Volume', axis=1, level=1)

# Positions of every cell whose volume is exactly zero; only the column
# positions are used below.
rows, cols = np.nonzero((df_v == 0).to_numpy())

# Distinct column positions in ascending order, mapped back to symbol labels.
idx_no_volume = sorted(set(cols))
symbols_no_volume = df_v.columns[idx_no_volume]
print(f'symbols with no volume:\n{symbols_no_volume}')

symbols with no volume:
Index(['AMCR', 'BIIB', 'BKR', 'CCI', 'CHD', 'CNC', 'CNXC', 'DOC', 'ESAB',
       'EVBG', 'FCFS', 'GEN', 'HOLI', 'IBKR', 'JJC', 'NSTG', 'SRPT', 'SSB',
       'UCBI', 'VNT'],
      dtype='object')


In [34]:
# Row-over-row change in volume. A change of exactly 0 means the volume repeated
# from the previous row; such symbols are later pooled into the bad-data list.
df_dif = df_v.diff()

# Row/column positions where the volume did not change; only columns are used.
rows, cols = np.where(df_dif == 0)

# Distinct column positions in ascending order, mapped back to symbol labels.
idx_same_volume = sorted(set(cols))
symbols_same_volume = df_v.columns[idx_same_volume]
print(f'symbols with same volume:\n{symbols_same_volume}')

symbols with same volume:
Index(['ABM', 'ACIW', 'ACM', 'ACN', 'ALE', 'ALRM', 'AMCR', 'AMED', 'AMG',
       'ANSS',
       ...
       'VSAT', 'WDAY', 'WELL', 'WERN', 'WEX', 'WSFS', 'WTS', 'WTW', 'XEL',
       'XENE'],
      dtype='object', length=269)


In [35]:
# 'Close' field of every symbol (level 1 of the column MultiIndex). NaNs are
# mapped to 0 so one comparison catches both missing and zero closes.
df_c = df.xs(key='Close', axis=1, level=1).fillna(0)

# Positions of every cell whose close is NaN or zero; only columns are used.
rows, cols = np.nonzero((df_c == 0).to_numpy())

# Distinct column positions in ascending order, mapped back to symbol labels.
idx_no_close = sorted(set(cols))
symbols_no_close = df_c.columns[idx_no_close]
print(f'symbols with NaN close:\n{symbols_no_close}')

symbols with NaN close:
Index(['ABNB', 'ALGM', 'BRBR', 'CARR', 'CEG', 'CHK', 'CNXC', 'CR', 'CRWD',
       'CTVA', 'DDOG', 'DOW', 'DT', 'DTM', 'ESAB', 'ETRN', 'EVBG', 'FISV',
       'FOX', 'FOXA', 'FSD', 'FYBR', 'GEHC', 'GFS', 'GO', 'GXO', 'HIBB',
       'HOLI', 'JJC', 'KD', 'LCID', 'MP', 'MRNA', 'NARI', 'NSTG', 'NVST',
       'OGN', 'OTIS', 'PGNY', 'RIVN', 'SBNY', 'SGOV', 'SHC', 'SITM', 'SIX',
       'VNT', 'VSCO', 'WIRE', 'WRK', 'YETI', 'ZM'],
      dtype='object')


In [36]:
# Pool the three screens (no close, no volume, same volume); a symbol flagged by
# more than one screen appears more than once in this list.
symbols_bad_data = [*symbols_no_close, *symbols_no_volume, *symbols_same_volume]

# Sorted list of the distinct flagged symbols.
unique_symbols_bad_data = sorted(set(symbols_bad_data))

# NOTE(review): the label says "includes duplicate symbols", but the number
# printed is the length of the de-duplicated list.
print(f'unique symbols with bad data, e.g. no volume, same volume and $0 close, includes duplicate symbols: {len(unique_symbols_bad_data)}')

unique symbols with bad data, e.g. no volume, same volume and $0 close, includes duplicate symbols: 324


In [37]:
# get symbols of past model picks
# Loads the object pickled as 'df_picks' from path_data_dump and previews its first row.
# NOTE(review): pickle_load comes from myUtils and is not visible here; if it wraps
# pickle.load, only point it at files you trust — confirm.
df_picks = pickle_load(path_data_dump, 'df_picks', verbose=verbose)
df_picks.head(1)

Unnamed: 0,date_end_df_train,max_days_lookbacks,days_lookbacks,sym_freq_15,sym_freq_14,sym_freq_13,sym_freq_12,sym_freq_11,sym_freq_10,sym_freq_9,sym_freq_8,sym_freq_7,sym_freq_6,sym_freq_5,sym_freq_4,sym_freq_3,sym_freq_2
0,2024-08-30,120,"[30, 60, 120]",[],[],[],[],[],[],['SHV'],[],['FTSM'],"['EVRI', 'LMT', 'MMM', 'VTR']","['ACIW', 'ALNY', 'GDS', 'LUMN']","['CNK', 'COMM', 'MD', 'RVNC']","['ADC', 'AFL', 'AXGN', 'CRS', 'CVLT', 'EIX']",[]


In [38]:
# keep only columns with symbols of past picks
# Rebind instead of dropping in place, and ignore labels that are already gone,
# so this cell can be re-run without raising KeyError.
df_picks = df_picks.drop(
    columns=["date_end_df_train", "max_days_lookbacks", "days_lookbacks"],
    errors="ignore",
)
df_picks.head(1)

Unnamed: 0,sym_freq_15,sym_freq_14,sym_freq_13,sym_freq_12,sym_freq_11,sym_freq_10,sym_freq_9,sym_freq_8,sym_freq_7,sym_freq_6,sym_freq_5,sym_freq_4,sym_freq_3,sym_freq_2
0,[],[],[],[],[],[],['SHV'],[],['FTSM'],"['EVRI', 'LMT', 'MMM', 'VTR']","['ACIW', 'ALNY', 'GDS', 'LUMN']","['CNK', 'COMM', 'MD', 'RVNC']","['ADC', 'AFL', 'AXGN', 'CRS', 'CVLT', 'EIX']",[]


In [39]:
# Show the column labels that remain in df_picks.
df_picks.columns

Index(['sym_freq_15', 'sym_freq_14', 'sym_freq_13', 'sym_freq_12',
       'sym_freq_11', 'sym_freq_10', 'sym_freq_9', 'sym_freq_8', 'sym_freq_7',
       'sym_freq_6', 'sym_freq_5', 'sym_freq_4', 'sym_freq_3', 'sym_freq_2'],
      dtype='object')

In [40]:
import ast

In [41]:
# Each cell of df_picks holds the text form of a list of symbols,
# e.g. '[]' or '["A", "B", ..]'. Parse every cell back into a list and keep
# the non-empty ones, column by column.
list_of_lists = []
for col in df_picks.columns:
    l_series = df_picks[col].apply(ast.literal_eval)
    list_of_lists.extend(l_item for l_item in l_series if l_item)

# Flatten into a single list of symbols; repeats are kept.
symbols_picks = []
for sublist in list_of_lists:
    symbols_picks.extend(sublist)
print(f'symbol count from model picks: {len(symbols_picks)}')

# list sorted unique symbols
unique_symbols_picks = sorted(set(symbols_picks))
print(f'unique symbol count from model picks: {len(unique_symbols_picks)}')

symbol count from model picks: 62080
unique symbol count from model picks: 1038


In [42]:
def find_common_items(list1, list2):
  """Finds the common items between two lists.

  Items are returned in the order they occur in list1, and an item that
  occurs several times in list1 is returned that many times.

  Args:
    list1: The first list; its order and repeats are preserved in the result.
    list2: The second list; only used for membership checks.

  Returns:
    A list of the items of list1 that are also in list2.
  """
  try:
    # Hash-based membership keeps this linear instead of scanning list2 once
    # for every item of list1.
    lookup = frozenset(list2)
    return [item for item in list1 if item in lookup]
  except TypeError:
    # Unhashable items (e.g. lists) cannot be stored in or looked up in a set;
    # fall back to plain list membership.
    return [item for item in list1 if item in list2]


# Small demo of find_common_items. The guard is true when this code is executed
# directly (including when the cell is run in the notebook, as the recorded cell
# output shows); the demo is skipped if the code is imported as a module.
# It rebinds the module-level names list1, list2 and common_items.
if __name__ == "__main__": 
  list1 = ["a", "b", "c", "d", "e"]
  list2 = ["b", "c", "d", "f", "g"]
  common_items = find_common_items(list1, list2)
  print(common_items)

['b', 'c', 'd']


In [43]:
# Symbols that appear in past picks and are also flagged as having bad data.
# Order follows unique_symbols_picks, the first argument.
common_symbols = find_common_items(unique_symbols_picks, unique_symbols_bad_data)
print(common_symbols)

['ACIW', 'ALE', 'ARCH', 'ASML', 'ATKR', 'AZPN', 'AZTA', 'BBW', 'BGR', 'CATO', 'CHT', 'CMG', 'CVBF', 'CVLT', 'DK', 'DVA', 'EVBG', 'GDEN', 'GPC', 'HIBB', 'HMN', 'HOLI', 'HUBS', 'HY', 'LOB', 'MDT', 'MLKN', 'MLR', 'NSA', 'NSTG', 'OEC', 'OSIS', 'OXY', 'PKX', 'PTMN', 'RDWR', 'RRC', 'SFBS', 'SHYF', 'SITE', 'SNN', 'SRPT', 'STBA', 'TRGP', 'TSEM', 'TX', 'UFPI', 'UNFI', 'USAC', 'VNO', 'WDAY', 'WIRE', 'XENE']


In [44]:
def subtract_items(list1, list2):
  """Subtracts items in list2 from items in list1.

  Items are returned in the order they occur in list1, and an item that
  occurs several times in list1 is returned that many times.

  Args:
    list1: The first list; its order and repeats are preserved in the result.
    list2: The second list; only used for membership checks.

  Returns:
    A list of the items in list1 that are not in list2.
  """
  try:
    # Hash-based membership keeps this linear instead of scanning list2 once
    # for every item of list1.
    excluded = frozenset(list2)
    return [item for item in list1 if item not in excluded]
  except TypeError:
    # Unhashable items (e.g. lists) cannot be stored in or looked up in a set;
    # fall back to plain list membership.
    return [item for item in list1 if item not in list2]

In [45]:
# Symbols to drop = flagged bad-data symbols minus those that also appear in
# past picks (common_symbols), then sorted in place. The three prints report
# the sizes of the inputs and of the result.
symbols_drop = subtract_items(unique_symbols_bad_data, common_symbols)  # don't drop symbols in past picks
symbols_drop .sort()
print(f'len(unique_symbols_bad_data): {len(unique_symbols_bad_data)}')
print(f'len(common_symbols): {len(common_symbols)}')
print(f'len(symbols_drop): {len(symbols_drop)}')

len(unique_symbols_bad_data): 324
len(common_symbols): 53
len(symbols_drop): 271


In [46]:
# An earlier version of this cell rebuilt symbols_drop here from all three
# screens (no close, no volume, same volume); symbols_drop is now computed
# before this cell and excludes symbols found in past picks.

# OHLCV frame: symbols are level 0 of the column MultiIndex.
df_a = df.drop(columns=symbols_drop, level=0)

# Close frame: same look-back window taken from df_close, same symbols removed.
df_c = df_close.iloc[look_back_days:].drop(columns=symbols_drop)

print(f'unique symbols dropped from df_a (adjOLHLV) and df_c (Close): {len(symbols_drop)}')

unique symbols dropped from df_a (adjOLHLV) and df_c (Close): 271


In [47]:
# Summary: size of each screen, symbols before/after the drop, and the shape of
# the two cleaned frames.
print(f'symbols with no volume:   {len(symbols_no_volume):>5,}')
print(f'symbols with same volume: {len(symbols_same_volume):>5,}')
print(f'symbols with no close:    {len(symbols_no_close):>5,}\n')
print(f'symbols total before drop:                                        {len(df_close.columns):>5,}')
print(f'unique symbols dropped from df OHLCVA (df_a) and df Close (df_c): {len(symbols_drop):>5,}\n')

# Count distinct symbols from level 0 of the column MultiIndex instead of
# dividing the column count by a hard-coded 5 fields per symbol.
symbol_count_df_a = df_a.columns.get_level_values(0).nunique()

print('                                          symbols     rows')
print(f'df adjOHLCV (df_a) after dropped symbols:   {symbol_count_df_a:>5,}    {len(df_a):>5,}')
print(f'df Close (df_c) after dropped symbols:      {len(df_c.columns):>5,}    {len(df_c):>5,}')


symbols with no volume:      20
symbols with same volume:   269
symbols with no close:       51

symbols total before drop:                                        1,529
unique symbols dropped from df OHLCVA (df_a) and df Close (df_c):   271

                                          symbols     rows
df adjOHLCV (df_a) after dropped symbols:   1,258    1,500
df Close (df_c) after dropped symbols:      1,258    1,500


In [48]:
# Persist the two cleaned frames via myUtils.pickle_dump under the names
# filename_pickled_df_a and filename_pickled_df_c, printing each target path.
# NOTE(review): the messages say symbols with no volume, same volume, and no
# close were dropped, but flagged symbols that also appear in past picks were
# excluded from symbols_drop and are therefore still in these frames.
pickle_dump(df_a, path_data_dump, filename_pickled_df_a)
print(f'pickled df adjOHLCV after dropping symbols with no volume, same volume, and no close:\n{path_data_dump}{filename_pickled_df_a}')
pickle_dump(df_c, path_data_dump, filename_pickled_df_c)
print(f'pickled df Close after dropping symbols with no volume, same volume, and no close:\n{path_data_dump}{filename_pickled_df_c}')

pickled df adjOHLCV after dropping symbols with no volume, same volume, and no close:
G:/My Drive/stocks/yfinance/VSCode_dump/df_OHLCV_clean
pickled df Close after dropping symbols with no volume, same volume, and no close:
G:/My Drive/stocks/yfinance/VSCode_dump/df_close_clean


In [49]:
from myUtils import list_dump

# Hand the column labels of df_c (the symbols that survived the drop) to
# myUtils.list_dump together with the dump folder and the target file name.
f_symbols_df_close_clean = "symbols_df_close_clean.csv"  # symbols text file
symbols_df_c = df_c.columns.tolist()  # column names in df_c
list_dump(symbols_df_c, path_data_dump, f_symbols_df_close_clean)

In [50]:
# Preview the last rows of the cleaned OHLCV frame.
df_a.tail()

Unnamed: 0_level_0,A,A,A,A,A,AA,AA,AA,AA,AA,...,ZUMZ,ZUMZ,ZUMZ,ZUMZ,ZUMZ,ZWS,ZWS,ZWS,ZWS,ZWS
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-08-26,141.110001,141.179993,139.960007,140.490005,1013200.0,35.0,35.18,34.470001,34.560001,5397100.0,...,29.1,29.4,28.389999,28.49,304700.0,32.540001,32.619999,32.060001,32.09,552500.0
2024-08-27,140.479996,141.210007,139.710007,141.100006,949700.0,34.349998,35.105,34.09,34.299999,3538700.0,...,28.15,28.52,27.530001,28.299999,743700.0,32.040001,32.060001,31.719999,32.029999,664100.0
2024-08-28,141.529999,142.309998,140.289993,141.059998,1089900.0,33.189999,33.305,32.0,32.09,7350400.0,...,28.200001,28.200001,27.58,28.040001,214700.0,32.029999,32.25,31.860001,31.889999,386400.0
2024-08-29,142.070007,143.440002,141.110001,142.529999,1577400.0,32.209999,32.775002,31.700001,32.470001,4672900.0,...,28.08,28.719999,27.83,27.879999,227300.0,32.139999,32.529999,31.85,32.23,523500.0
2024-08-30,142.160004,143.270004,141.020004,142.919998,1570700.0,32.5,32.595001,31.815001,32.099998,3337600.0,...,28.120001,28.120001,27.209999,27.74,173000.0,32.369999,32.529999,31.940001,32.43,531400.0


In [51]:
# Preview the last rows of the cleaned close frame.
df_c.tail()

Unnamed: 0_level_0,A,AA,AAL,AAP,AAPL,AB,ABBV,ABR,ABT,ACGL,...,ZBH,ZBRA,ZD,ZG,ZION,ZS,ZTO,ZTS,ZUMZ,ZWS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-08-26,140.490005,34.560001,10.21,49.110001,227.179993,34.93,197.440002,13.51,112.699997,109.489998,...,114.629997,347.690002,47.099998,55.779999,49.07,197.820007,20.440001,181.559998,28.49,32.09
2024-08-27,141.100006,34.299999,10.17,47.849998,228.029999,34.77,195.919998,13.61,113.099998,110.68,...,114.669998,346.040009,47.07,55.459999,48.75,197.220001,20.9,182.100006,28.299999,32.029999
2024-08-28,141.059998,32.09,10.15,46.639999,226.490005,34.34,195.399994,13.55,112.900002,111.010002,...,114.190002,343.829987,46.849998,54.52,49.279999,193.970001,20.790001,182.910004,28.040001,31.889999
2024-08-29,142.529999,32.470001,10.39,46.150002,229.789993,34.240002,195.179993,13.68,112.760002,112.809998,...,114.669998,342.269989,48.25,53.98,49.23,197.25,21.299999,182.889999,27.879999,32.23
2024-08-30,142.919998,32.099998,10.62,45.310001,229.0,34.439999,196.309998,13.6,113.269997,113.089996,...,115.459999,345.380005,48.869999,53.43,49.560001,199.979996,21.440001,183.490005,27.74,32.43
