### Create a list of trading dates in df_close_clean that are missing from df_picks<br>The missing dates are pickled to dates_missing_in_df_picks

In [31]:
# Record the wall-clock time at the start of the run; the last cell subtracts
# this value from its own time.time() reading to report the total execution time.
import time
start_time = time.time()

In [32]:
def compare_lists(list_a, list_b):
  """Return the values of list_a that do not occur in list_b.

  The order of list_a is preserved and repeated values in list_a are kept
  (each occurrence is tested on its own).

  Args:
    list_a: An iterable of objects (list, tuple, pandas Index, ...).
    list_b: An iterable of objects to exclude.

  Returns:
    A new list with the values that are in list_a but not in list_b.
  """
  # Materialise both inputs once so one-shot iterables are not exhausted
  # part-way through and the fallback below can iterate them again.
  items_a = list(list_a)
  items_b = list(list_b)

  # Fast path: hash-based membership makes the whole comparison linear
  # instead of scanning list_b once per element of list_a.
  try:
    lookup = set(items_b)
    return [item for item in items_a if item not in lookup]
  except TypeError:
    # Some element (in either input) is unhashable, e.g. a list or dict.
    # Fall back to the plain equality scan.
    return [item for item in items_a if item not in items_b]

In [33]:
import pandas as pd
from myUtils import pickle_load, pickle_dump
# pandas display limits applied when frames are printed in this notebook
for _option, _value in (
    ('display.max_rows', 50),
    ('display.max_columns', 30),
    ('display.max_colwidth', 30),
    ('display.width', 900),
):
    pd.set_option(_option, _value)

# project root folder and the sub-folder that holds the pickle files
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# pickle file names; each is passed together with path_data_dump
# to pickle_load / pickle_dump
fp_df_picks = 'df_picks'  # past picks
fp_dates_missing_in_df_picks = 'dates_missing_in_df_picks'  # dates missing from df_picks
fp_df_close_clean = "df_close_clean"  # symbols' close after cleaning the data

In [34]:
# Load the cleaned close-price data from the pickle named by fp_df_close_clean.
# Later cells compare its index against dates and format it with strftime,
# so the index is used as the reference set of trading dates.
df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

In [35]:
# columns that together identify one pick record
cols_key = ['date_end_df_train', 'max_days_lookbacks', 'days_lookbacks']

# load the stored picks, keep only the last row of any duplicated key,
# order with the most recent date first, then renumber the rows
df_picks = (
    pickle_load(path_data_dump, fp_df_picks)
    .drop_duplicates(subset=cols_key, keep='last')
    .sort_values(by=cols_key, ascending=False)
    .reset_index(drop=True)
)

# save the tidied frame under the same name it was loaded from
pickle_dump(df_picks, path_data_dump, fp_df_picks)
print(f'df_picks, len({len(df_picks)}):\n{df_picks}')

df_picks, len(282):
    date_end_df_train  max_days_lookbacks     days_lookbacks sym_freq_15 sym_freq_14 sym_freq_13 sym_freq_12      sym_freq_11      sym_freq_10              sym_freq_9        sym_freq_8                     sym_freq_7                     sym_freq_6                     sym_freq_5                     sym_freq_4                     sym_freq_3 sym_freq_2
0          2023-11-09                 120      [30, 60, 120]          []          []          []          []               []               []         ['FTSM', 'SHV']          ['CBOE']                ['GBTC', 'NRG']          ['ANF', 'CAH', 'LRN']  ['BTC-USD', 'EDU', 'GPS', ...           ['PGR', 'TGH', 'YY']  ['AGYS', 'ALL', 'CLS', 'HO...         []
1          2023-11-09                 120  [15, 30, 60, 120]          []          []          []          []               []  ['FTSM', 'SHV']                 ['LRN']  ['CBOE', 'GBTC']      ['BTC-USD', 'NRG', 'TGH']                 ['ANF', 'CAH']  ['AGYS', 'DECK', 'EDU', 'G... 

In [36]:
# earliest and latest date_end_df_train values present in df_picks
col_dates = df_picks['date_end_df_train']
start_date, end_date = col_dates.min(), col_dates.max()
print(f'df_picks start date: {start_date}')
print(f'df_picks end date: {end_date}')

df_picks start date: 2023-03-15
df_picks end date: 2023-11-09


In [37]:
# distinct date_end_df_train values in df_picks, as a plain Python list
unique_dates = df_picks['date_end_df_train'].unique()
l_dates_df_picks = unique_dates.tolist()
print(f'l_dates_df_picks, len({len(l_dates_df_picks)}):\n{l_dates_df_picks}')

l_dates_df_picks, len(141):
['2023-11-09', '2023-11-08', '2023-11-07', '2023-11-06', '2023-11-03', '2023-11-02', '2023-11-01', '2023-10-31', '2023-10-30', '2023-10-27', '2023-10-26', '2023-10-25', '2023-10-24', '2023-10-23', '2023-10-20', '2023-10-19', '2023-10-17', '2023-10-12', '2023-10-06', '2023-10-05', '2023-10-04', '2023-10-02', '2023-09-29', '2023-09-28', '2023-09-25', '2023-09-22', '2023-09-21', '2023-09-20', '2023-09-18', '2023-09-15', '2023-09-14', '2023-09-13', '2023-09-12', '2023-09-11', '2023-09-08', '2023-09-05', '2023-09-01', '2023-08-25', '2023-08-18', '2023-08-17', '2023-08-15', '2023-08-14', '2023-08-11', '2023-08-10', '2023-08-09', '2023-08-08', '2023-08-07', '2023-08-04', '2023-08-03', '2023-08-02', '2023-08-01', '2023-07-27', '2023-07-26', '2023-07-25', '2023-07-21', '2023-07-20', '2023-07-18', '2023-07-14', '2023-07-13', '2023-07-12', '2023-07-11', '2023-07-10', '2023-07-07', '2023-07-06', '2023-07-05', '2023-07-03', '2023-06-30', '2023-06-29', '2023-06-28', '2023

In [38]:
# Keep only the index entries that fall inside [start_date, end_date] (both ends inclusive)
idx_close = df_close_clean.index
mask = (idx_close >= start_date) & (idx_close <= end_date)
# format the selected index entries as 'yyyy-mm-dd' strings
l_dates_df_close = idx_close[mask].strftime('%Y-%m-%d')
l_dates_df_close

Index(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-20', '2023-03-21', '2023-03-22', '2023-03-23', '2023-03-24', '2023-03-27', '2023-03-28',
       ...
       '2023-10-27', '2023-10-30', '2023-10-31', '2023-11-01', '2023-11-02', '2023-11-03', '2023-11-06', '2023-11-07', '2023-11-08', '2023-11-09'], dtype='object', name='Date', length=167)

In [39]:
# dates in the df_close_clean window that have no entry in df_picks
dates_missing_in_df_picks = compare_lists(l_dates_df_close, l_dates_df_picks)

# persist the result, then report where it went and what it contains
pickle_dump(dates_missing_in_df_picks, path_data_dump, fp_dates_missing_in_df_picks)
print(f'dates_missing_in_df_picks is saved in {path_data_dump}{fp_dates_missing_in_df_picks}')
n_missing = len(dates_missing_in_df_picks)
print(f'dates_missing_in_df_picks, (len={n_missing}):\n{dates_missing_in_df_picks}')

dates_missing_in_df_picks is saved in C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/dates_missing_in_df_picks
dates_missing_in_df_picks, (len=26):
['2023-07-17', '2023-07-19', '2023-07-24', '2023-07-28', '2023-07-31', '2023-08-16', '2023-08-21', '2023-08-22', '2023-08-23', '2023-08-24', '2023-08-28', '2023-08-29', '2023-08-30', '2023-08-31', '2023-09-06', '2023-09-07', '2023-09-19', '2023-09-26', '2023-09-27', '2023-10-03', '2023-10-09', '2023-10-10', '2023-10-11', '2023-10-13', '2023-10-16', '2023-10-18']


In [40]:
# Report the wall-clock seconds elapsed since start_time was recorded in the first cell.
end_time = time.time()
exec_time = end_time - start_time
print(f'exec. time for all cells: {exec_time} sec')

exec. time for all cells: 0.18800044059753418 sec
