### Get the best performing symbols based on parameter settings for dates that are missing in df_picks<br>- Load past picks<br>- Find the largest number in l_days_lookbacks<br>- Create df_close_clean_dropped by dropping the max_l_days_lookbacks leading rows of df_close_clean<br>- Get the missing dates in df_picks by comparing the dates in df_close_clean_dropped to the dates in df_picks<br>- For each missing date, get the number of days_to_drop from df_close_clean such that the last remaining date in df_close_clean is that missing date<br>- Get picks for the missing dates

### Set parameters below

In [39]:
### Set parameters ######################################################################
# Base file name under which the picks DataFrame is stored.
# fp_df_eval_results = f'df_eval_results_{run_type}'
fp_df_picks = "df_picks"

# Switch to True to print more output.
verbose = False

# When True, the picks computed further down are appended to df_picks and saved.
store_results = True

# n_samples is the number of random samples of df chunks s.t.
#   len(df) = max_days_lookbacks. For getting the current picks
#   n_samples is always 1 since more samples will always yield
#   the same df chunk.
n_samples = 1

# For training: the numbers of days to look back from iloc_end_train.
l_days_lookbacks = [[30, 60, 120], [15, 30, 60, 120]]

# Every inner list sorted ascending,
# e.g. [[60, 30, 120], [15, 60, 30, 120]] -> [[30, 60, 120], [15, 30, 60, 120]]
l_sorted_days_lookbacks = [sorted(lookbacks) for lookbacks in l_days_lookbacks]

# Number of days after iloc_end_train used to evaluate the training.
days_eval = 0

# Number of the most-common symbols from days_lookbacks' performance rankings to keep.
n_top_syms = 20

# Slice start and end for selecting the best performing symbols.
syms_start, syms_end = 0, 10
#########################################################################

# Echo the settings, one per line (same text as separate print calls).
print(
    f"verbose : {verbose}\n"
    f"store_results: {store_results}\n"
    f"n_samples: {n_samples}\n"
    f"l_sorted_days_lookbacks: {l_sorted_days_lookbacks}\n"
    f"days_eval: {days_eval}\n"
    f"n_top_syms: {n_top_syms}\n"
    f"syms_start: {syms_start}\n"
    f"syms_end: {syms_end}\n"
    f"fp_df_picks: {fp_df_picks}\n\n"
)

verbose : False
store_results: True
n_samples: 1
l_sorted_days_lookbacks: [[30, 60, 120], [15, 30, 60, 120]]
days_eval: 0
n_top_syms: 20
syms_start: 0
syms_end: 10
fp_df_picks: df_picks




In [40]:
def compare_lists(list_a, list_b):
  """Return the items of list_a that do not occur in list_b.

  The order of list_a is preserved and duplicates in list_a are kept.
  Membership is tested against a set built once from list_b, so the
  comparison is linear instead of len(list_a) * len(list_b). If list_b
  contains unhashable items (e.g. lists), or an item of list_a is
  unhashable, the plain `in list_b` test is used instead, so any kind of
  element is still supported.

  Args:
    list_a: An iterable of objects to filter.
    list_b: A container of objects to exclude.

  Returns:
    A list of values that are in list_a but not in list_b.
  """
  try:
    lookup = set(list_b)
  except TypeError:
    # list_b holds unhashable items: only a linear scan is possible.
    return [item for item in list_a if item not in list_b]

  list_difference = []
  for item in list_a:
    try:
      is_missing = item not in lookup
    except TypeError:
      # item itself is unhashable; compare it against list_b directly.
      is_missing = item not in list_b
    if is_missing:
      list_difference.append(item)

  return list_difference

In [41]:
import sys
import pandas as pd
import itertools
from yf_utils import random_slices, lookback_slices, top_set_sym_freq_cnt
# from yf_utils import rank_perf, grp_tuples_sort_sum, top_set_sym_freq_cnt
from yf_utils import best_perf_syms_sets_lookback_slices
from myUtils import pickle_load, pickle_dump

# Pandas display settings applied for the rest of the notebook.
# (Alternatives tried earlier: display.max_columns = 11, display.width = 280.)
for _option_name, _option_value in (
    ("display.max_rows", 20),
    ("display.max_columns", 30),
    ("display.max_colwidth", 26),
    ("display.width", 500),
):
    pd.set_option(_option_name, _option_value)

# Project root directory and the sub-directory the pickle helpers read/write.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Base file name of the stored DataFrame with the symbols' close.
fp_df_close_clean = "df_close_clean"

### Load past picks

In [42]:
# Columns that together identify one row of picks.
key_cols_picks = ['date_end_df_train', 'max_days_lookbacks', 'days_lookbacks']

# Load the past picks, keep only the last of any duplicated rows, order them
# with the most recent date first and renumber the index from 0.
df_picks = (
    pickle_load(path_data_dump, fp_df_picks)
    .drop_duplicates(subset=key_cols_picks, keep='last')
    .sort_values(by=key_cols_picks, ascending=False)
    .reset_index(drop=True)
)

# Write the cleaned-up picks back to storage.
pickle_dump(df_picks, path_data_dump, fp_df_picks)
print(f'df_picks, len({len(df_picks)}):\n{df_picks}')

df_picks, len(2768):
     date_end_df_train  max_days_lookbacks     days_lookbacks sym_freq_15 sym_freq_14 sym_freq_13 sym_freq_12 sym_freq_11 sym_freq_10        sym_freq_9                 sym_freq_8                sym_freq_7                 sym_freq_6                 sym_freq_5                 sym_freq_4                 sym_freq_3 sym_freq_2
0           2023-12-29                 120      [30, 60, 120]          []          []          []          []          []          []                []            ['CYTK', 'SHV']                        []  ['FTSM', 'GPS', 'HA', ...  ['ANF', 'BPMC', 'MBI',...  ['BURL', 'LRN', 'MARA'...  ['BIG', 'BKE', 'FATE',...         []
1           2023-12-29                 120  [15, 30, 60, 120]          []          []          []          []    ['CYTK']     ['SHV']                []                   ['FTSM']                     ['X']       ['GPS', 'HA', 'WOR']  ['ANF', 'BPMC', 'FATE'...  ['BKE', 'BURL', 'LRN',...                    ['BIG']         []
2      

In [43]:
# Unique values of the date_end_df_train column of df_picks, as a plain list.
l_dates_df_picks = df_picks["date_end_df_train"].unique().tolist()
print(
    f'l_dates_df_picks, len({len(l_dates_df_picks)}):\n'
    f'{l_dates_df_picks}'
)

l_dates_df_picks, len(1384):
['2023-12-29', '2023-12-28', '2023-12-27', '2023-12-26', '2023-12-22', '2023-12-21', '2023-12-20', '2023-12-19', '2023-12-18', '2023-12-15', '2023-12-14', '2023-12-13', '2023-12-12', '2023-12-11', '2023-12-08', '2023-12-07', '2023-12-06', '2023-12-05', '2023-12-04', '2023-12-01', '2023-11-30', '2023-11-29', '2023-11-28', '2023-11-27', '2023-11-24', '2023-11-22', '2023-11-21', '2023-11-20', '2023-11-17', '2023-11-16', '2023-11-15', '2023-11-14', '2023-11-13', '2023-11-10', '2023-11-09', '2023-11-08', '2023-11-07', '2023-11-06', '2023-11-03', '2023-11-02', '2023-11-01', '2023-10-31', '2023-10-30', '2023-10-27', '2023-10-26', '2023-10-25', '2023-10-24', '2023-10-23', '2023-10-20', '2023-10-19', '2023-10-18', '2023-10-17', '2023-10-16', '2023-10-13', '2023-10-12', '2023-10-11', '2023-10-10', '2023-10-09', '2023-10-06', '2023-10-05', '2023-10-04', '2023-10-03', '2023-10-02', '2023-09-29', '2023-09-28', '2023-09-27', '2023-09-26', '2023-09-25', '2023-09-22', '202

### Find the largest number in l_days_lookbacks

In [44]:
# Largest lookback (in days) found in any of the inner lists of l_days_lookbacks.
max_l_days_lookbacks = max(itertools.chain.from_iterable(l_days_lookbacks))


#### Create df_close_clean_dropped by dropping max_l_days_lookbacks leading rows in df_close_clean

In [45]:
# Load the DataFrame with the symbols' close, ordered by ascending date.
df_close_clean = pickle_load(path_data_dump, fp_df_close_clean).sort_index(ascending=True)

# Skip the first max_l_days_lookbacks rows.
df_close_clean_dropped = df_close_clean.iloc[max_l_days_lookbacks:]

# Report the sizes before and after the drop, and the remaining dates.
print(
    f'max_l_days_lookbacks: {max_l_days_lookbacks}\n'
    f'len(df_close_clean) before row drop: {len(df_close_clean)}\n'
    f'len(df_close_clean_dropped) after row drop:  {len(df_close_clean_dropped)}\n'
    f'df_close_clean_dropped.index:\n{df_close_clean_dropped.index}'
)

max_l_days_lookbacks: 120
len(df_close_clean) before row drop: 1500
len(df_close_clean_dropped) after row drop:  1380
df_close_clean_dropped.index:
DatetimeIndex(['2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-16', '2018-07-17', '2018-07-18', '2018-07-19', '2018-07-20',
               ...
               '2023-12-15', '2023-12-18', '2023-12-19', '2023-12-20', '2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27', '2023-12-28', '2023-12-29'], dtype='datetime64[ns]', name='Date', length=1380, freq=None)


In [46]:
# Dates of df_close_clean_dropped formatted as 'YYYY-MM-DD' strings, so they
# can be compared with the date strings taken from df_picks.
l_dates_df_close_dropped = df_close_clean_dropped.index.strftime('%Y-%m-%d')
print(
    f'l_dates_df_close_dropped, (len={len(l_dates_df_close_dropped)}):\n'
    f'{l_dates_df_close_dropped}'
)

l_dates_df_close_dropped, (len=1380):
Index(['2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-16', '2018-07-17', '2018-07-18', '2018-07-19', '2018-07-20',
       ...
       '2023-12-15', '2023-12-18', '2023-12-19', '2023-12-20', '2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27', '2023-12-28', '2023-12-29'], dtype='object', name='Date', length=1380)


In [47]:
# Dates present in df_close_clean_dropped that have no entry in df_picks.
dates_missing_in_df_picks = compare_lists(
    list_a=l_dates_df_close_dropped,
    list_b=l_dates_df_picks,
)
print(
    f'dates_missing_in_df_picks, (len={len(dates_missing_in_df_picks)}):\n'
    f'{dates_missing_in_df_picks}'
)

dates_missing_in_df_picks, (len=0):
[]


### Given a missing date in df_picks, calculate the number of days_to_drop from df_close_clean s.t. the last date in df_close_clean is the missing date in df_picks

In [48]:
# Number of rows (dates) in df_close_clean.
n_rows_df_close_clean = len(df_close_clean.index)

# For every missing date: how many trailing rows must be dropped from
# df_close_clean so that the missing date becomes its last row.
l_days_to_drop = [
    n_rows_df_close_clean - df_close_clean.index.get_loc(missing_date) - 1
    for missing_date in dates_missing_in_df_picks
]

print(f'dates_missing_in_df_picks, len={len(dates_missing_in_df_picks)}: {dates_missing_in_df_picks}')
print(f'l_days_to_drop, len={len(l_days_to_drop)}: {l_days_to_drop}')

dates_missing_in_df_picks, len=0: []
l_days_to_drop, len=0: []


### Get picks for the missing dates: Bard comments and some modification of Original Code 

In [49]:
import itertools  # Import for using itertools.product

# Purpose of this cell:
#   For every combination of (missing date, list of lookback days), slice
#   df_close_clean so that its last row is the missing date, ask
#   best_perf_syms_sets_lookback_slices for the top symbols of that window and,
#   when store_results is True, append the result as a new row of df_picks and
#   save df_picks.

# Work on a copy so df_close_clean itself is never modified
df_current = df_close_clean.copy()

# Initialize lists for storing dates
l_date_last_index = []  # To store last df date in each iteration
l_date_missing_in_df_picks = []  # To store dates from dates_missing_in_df_picks

# Calculate total iterations: missing dates x lookback lists
n_lookback_lists = len(l_sorted_days_lookbacks)
i_total = len(l_days_to_drop) * n_lookback_lists

# Iterate through combinations of (days_to_drop, dates_missing_in_df_picks) and lookbacks.
# itertools.product varies its last argument fastest, so all lookback lists are
# processed for one missing date before moving on to the next date.
for i, ((drop_last_n_rows, date_missing_in_df_picks), days_lookbacks) in enumerate(
    itertools.product(
        zip(l_days_to_drop, dates_missing_in_df_picks), l_sorted_days_lookbacks
    )
):

    # Append missing date to the list
    l_date_missing_in_df_picks.append(date_missing_in_df_picks)

    # Print informative messages for debugging
    print(f'Start of For_Loop {i+1} of {i_total} loops{"="*40:>42}')
    print(f"\n{'For_Loop parameters':~^60}\n")
    print(f'i: {i+1}')
    print(f'drop_last_n_rows: {drop_last_n_rows}')
    print(f'date_missing_in_df_picks: {date_missing_in_df_picks}')
    print(f'days_lookbacks: {days_lookbacks}')
    print(f"{'-'*60}\n\n")  # Visual separator for better readability

    # Position of the current lookback list within l_sorted_days_lookbacks
    # (1-based). The overall counter i keeps growing across dates, so it cannot
    # be used here directly (it would print e.g. "3 of 2").
    i_lookbacks = i % n_lookback_lists + 1
    print(f'{i_lookbacks} of {n_lookback_lists} days_lookbacks: {days_lookbacks} in l_sorted_days_lookbacks: {l_sorted_days_lookbacks}')

    # Find the maximum value among the given days_lookbacks
    max_days_lookbacks = max(days_lookbacks)
    print(f"Maximum days lookbacks: {max_days_lookbacks}\n")

    # Calculate slice indices for creating a new DataFrame with appropriate rows
    slice_start = -(max_days_lookbacks + drop_last_n_rows)  # Start index for slicing
    slice_end = -drop_last_n_rows  # End index for slicing (exclusive)

    # Create a new DataFrame based on whether rows need to be dropped
    # Limit df length to max_days_lookbacks
    # e.g. days_lookbacks: [15, 30, 60, 120] => max_days_lookbacks: 120
    if drop_last_n_rows == 0:
        # A slice end of -0 would select nothing, so leave the end open
        df = df_current.iloc[slice_start:].copy()
    else:
        df = df_current.iloc[slice_start:slice_end].copy()  # Drop specified rows

    # Print information about the resulting DataFrame
    print(f"Dropped last {drop_last_n_rows} row(s) from df since drop_last_n_rows = {drop_last_n_rows}")
    print(f"First row of df:\n{df.head(1)}\n")
    print(f"Last row of df:\n{df.tail(1)}\n")

    # Extract the date of the last index
    date_last_index = df.index[-1].strftime('%Y-%m-%d')
    print(f'Date of last index: {date_last_index}')

    # Error check to ensure consistency between dates
    if date_missing_in_df_picks == date_last_index:
        print(f"Passed Error Check: date_missing_in_df_picks ({date_missing_in_df_picks}) matches date_last_index ({date_last_index})")  # Indicate success
    else:
        # Report error and provide details for debugging
        print("ERROR: date_missing_in_df_picks does not match date_last_index")
        print(f"date_missing_in_df_picks: {date_missing_in_df_picks}")
        print(f"date_last_index: {date_last_index}")
        sys.exit(1)  # Exit with an error code to signal a problem

    # Store the last index date for later reference
    l_date_last_index.append(date_last_index)

    # Calculate and print information about the DataFrame's length
    len_df = len(df)
    len_slice = slice_end - slice_start
    print(f"Length of DataFrame: {len_df}, Length of slice: {len_slice}\n")

    # Error check to ensure the DataFrame is large enough for intended operations
    days_total = max_days_lookbacks + days_eval  # Calculate total days needed
    if days_total > len_df:
        # Raise an informative error if the DataFrame is too short
        msg_err = f"Insufficient DataFrame length: max_days_lookbacks ({max_days_lookbacks}) + days_eval ({days_eval}) = {days_total} must be less or equal to len_df: {len_df}"
        raise SystemExit(msg_err)

    # Generate random slices for training and evaluation, considering the limited DataFrame length

    # Explanation:
    # - Since the DataFrame has been sliced to the maximum lookback length, any additional samples
    #   will be duplicates of the existing slice.
    # - The `random_slices` function creates multiple (possibly overlapping) slices within the DataFrame.
    # - It returns a list of tuples, each containing (start_train, end_train, end_eval) indices for slicing.
    max_lookback_slices = random_slices(
        len_df=len_df,       # Length of the DataFrame
        n_samples=n_samples,  # Number of desired samples (will be duplicates in this case)
        days_lookback=max_days_lookbacks,  # Maximum lookback length for training data
        days_eval=days_eval,  # Length of evaluation data
        verbose=False,       # Suppress logging output
    )

    print(f"\n{'Output from function random_slices':~^60}\n")
    print(f"Max. lookback slices: {max_lookback_slices}")
    print(f"\n{'Inputs from function random_slices':~^60}\n")
    print(f"len_df: {len_df}")
    print(f"n_samples: {n_samples}")
    print(f"days_lookback: {max_days_lookbacks}")
    print(f"days_eval: {days_eval}")
    print(f"{'-'*60}\n\n")  # Visual separator for better readability

    # Generate sets of sub-slices for varying lookback lengths within larger slices

    # Explanation:
    # - The `lookback_slices` function creates multiple sub-slices within each of the previously generated slices.
    #   e.g. Given 2 max_slices of:
    #          [(104, 224, 234), (626, 746, 756)],
    #        It returns 2 sets:
    #          [[(194, 224, 234), (164, 224, 234), (104, 224, 234)],
    #           [(716, 746, 756), (686, 746, 756), (626, 746, 756)]].
    # - It maintains a constant `end_train` for each set, ensuring consistent evaluation periods.
    # - Each sub-slice has a different `start_train` value, resulting in varying lookback lengths.
    sets_lookback_slices = lookback_slices(
        max_slices=max_lookback_slices,  # List of maximum slices (tuples of start_train, end_train, end_eval)
        days_lookbacks=days_lookbacks,  # List of different lookback lengths to create sub-slices for
        verbose=False  # Suppress logging output
    )

    # Print informative messages
    print(f"\n{'Output from function lookback_slices':~^60}\n")
    print(f"Sets lookback slices: {sets_lookback_slices}")
    print(f"\n{'Inputs from function lookback_slices':~^60}\n")
    print(f"max_slices: {max_lookback_slices}")
    print(f"days_lookbacks: {days_lookbacks}")
    print(f"{'-'*60}\n\n")  # Visual separator for better readability

    print(f"Number of random samples of max_lookback_slices taken: {n_samples}")  # Indicate the number of samples created
    print(f"Max lookback slices (start_train:end_train:end_eval): {max_lookback_slices}")  # Show the maximum slices
    print(f"Days lookbacks: {days_lookbacks}")  # Display the different lookback lengths used
    print(f"Sets of lookback slices (start_train:end_train:end_eval) for days_lookbacks [{days_lookbacks}]:\n{sets_lookback_slices}\n")  # Print the generated sets of sub-slices

    # Highlight structural aspects of the generated sets
    print(f"Number of sets in sets_lookback_slices: {n_samples}")  # Confirm the number of sets matches the number of samples
    print(f"Number of tuples in each 'set of lookback slices': {len(days_lookbacks)}")  # Indicate the number of sub-slices per set

    # **Function call to identify top-performing symbols based on criteria: CAGR/UI, CAGR/retnStd, retnStd/UI**

    # Explanation:
    # - The `best_perf_syms_sets_lookback_slices` function analyzes performance across different symbol combinations and lookback periods.
    # - It takes several inputs:
    #     - `df_close`: DataFrame containing closing prices for symbols
    #     - `sets_lookback_slices`: Sets of slices defining training and evaluation periods
    #     - `n_top_syms`: Number of top-performing symbols to return
    #     - `syms_start` and `syms_end`: Indices for selecting a subset of symbols
    #     - `verbose`: Flag for additional output
    #   n_top_syms, syms_start and syms_end are the values chosen in the
    #   "Set parameters" cell, so changing them there takes effect here.

    # Function output:
    # - `grp_top_set_syms_n_freq`: List of lists, where each inner list contains tuples of (symbol, frequency) for top-performing symbols within a given lookback period.
    #      e.g. [[('GPS', 8), ('SHV', 8), ('FTSM', 7), ('GBTC', 7), ('BTC-USD', 6), ('CBOE', 6), ('ANF', 5), ('NRG', 5), ('WING', 5), ('DELL', 4), ('EDU', 4), ('HIBB', 4), ('LRN', 4), ('ALL', 3), ('CAH', 3), ('CMG', 3), ('GDDY', 3), ('HRB', 3), ('MDLZ', 3), ('PGR', 3)]]
    # - `grp_top_set_syms`: List of lists containing only the top-performing symbols, without their frequencies.
    #      e.g. [['GPS', 'SHV', 'FTSM', 'GBTC', 'BTC-USD', 'CBOE', 'ANF', 'NRG', 'WING', 'DELL']]
    # - `date_end_df_train`: Date marking the end of the training period,  e.g. '2023-11-22'
    (
        grp_top_set_syms_n_freq,
        grp_top_set_syms,
        date_end_df_train,
    ) = best_perf_syms_sets_lookback_slices(
        df_close=df,
        sets_lookback_slices=sets_lookback_slices,
        n_top_syms=n_top_syms,
        syms_start=syms_start,
        syms_end=syms_end,
        verbose=verbose
    )

    # Print results with clear headings and formatting
    print(f"\n{'Output from function best_perf_syms_sets_lookback_slices':~^60}\n")
    print(f"**Sets of lookback slices (start_train:end_train:end_eval) for days_lookbacks {days_lookbacks}:**\n{sets_lookback_slices}\n")  # Print the generated sets of sub-slices
    print(f"**Top-performing symbols with frequencies for days_lookbacks {days_lookbacks}:**\n{grp_top_set_syms_n_freq}\n")
    print(f"**Top-performing symbols for days_lookbacks {days_lookbacks}:**\n{grp_top_set_syms}\n")
    print(f"**Date marking the end of the training period:**\n{date_end_df_train}")
    print(f"\n{'Input from function best_perf_syms_sets_lookback_slices':~^60}\n")
    print("   df_close: df")
    print(f"{'-'*20}")
    print(f"   df_close.head(1): {df.head(1)}\n")
    print(f"   df_close.tail(1): {df.tail(1)}")
    print(f"{'-'*20}")
    print(f"\nsets_lookback_slices: {sets_lookback_slices}")
    print(f"n_top_syms: {n_top_syms}")
    print(f"syms_start: {syms_start}")
    print(f"syms_end: {syms_end}")
    print(f"{'-'*60}\n\n")  # Visual separator for better readability

    # Reset for this iteration so that a result left over from a previous
    # (date, lookbacks) combination can never be stored under the current date.
    l_sym_freq_cnt = None

    # Iterate through sets of top-performing symbols and their frequencies
    for j, top_set_syms_n_freq in enumerate(grp_top_set_syms_n_freq):

        # Explanation:
        # - `top_set_syms_n_freq` contains list of tuples of (symbol, frequency) for each lookback period.
        # - The `top_set_sym_freq_cnt` function analyzes the frequencies and categorizes symbols.

        # Categorize symbols based on frequency counts
        l_sym_freq_cnt = top_set_sym_freq_cnt(top_set_syms_n_freq)

        # Print results with descriptive labels
        print(f"\n**Iteration j={j}:**\n")
        print(f"**Groups of Top-performing symbols and frequencies for lookback period j={j}:**\n{grp_top_set_syms_n_freq}\n")
        print(f"**Top-performing symbols and frequencies for lookback period j={j}:**\n{top_set_syms_n_freq}\n")
        print(f"**Categorized symbols based on frequency counts (from 15 to 2):**\n{l_sym_freq_cnt}\n")

        # Print informative messages for debugging and understanding
        print(f"\n**Contextual Information for Lookback Slice j={j}:**\n")

        print(f"- Specific lookback slice: {sets_lookback_slices[j]}")
        print(f"- Maximum possible lookback slices: {max_lookback_slices}\n")

        print(f"**Data to be Added to {fp_df_picks}:**")
        print(f"- Date marking the end of the training period: {date_end_df_train}")
        print(f"- Maximum possible lookback lengths (days): {max_days_lookbacks}")
        print(f"- Used lookback lengths (days): {days_lookbacks}\n")

        # l_sym_freq_cnt[0] holds the symbols with frequency 15, [1] those with
        # frequency 14, ... and [13] those with frequency 2.
        print(f"**Symbol Frequency Distribution (from 15 to 2):**")
        for freq_cnt, syms_with_freq in zip(range(15, 1, -1), l_sym_freq_cnt):
            print(f"- Frequency count {freq_cnt}: {syms_with_freq}")

    # Store results if store_results flag is True.
    # NOTE: only the l_sym_freq_cnt of the last set in grp_top_set_syms_n_freq
    # is stored; with n_samples = 1 there is exactly one set.
    if store_results:
        if l_sym_freq_cnt is None:
            # The inner loop did not run, so there is nothing valid to store.
            print(f"WARNING: no symbol frequency counts for date {date_missing_in_df_picks}, days_lookbacks {days_lookbacks}; nothing appended to df_picks\n")
        else:
            # Construct a row of data to be stored
            row_picks = [
                date_end_df_train,  # Date marking the end of the training period
                max_days_lookbacks,  # Maximum possible lookback lengths (days)
                str(days_lookbacks),  # Used lookback lengths (days) (converted to string)
                *[str(syms_with_freq) for syms_with_freq in l_sym_freq_cnt],  # Symbol frequency counts (as strings)
            ]

            # Print the constructed row for debugging
            print(f"row_picks: {row_picks}")

            # Append the row to the DataFrame and store it using pickle
            df_picks.loc[len(df_picks)] = row_picks  # Append row to DataFrame
            pickle_dump(df_picks, path_data_dump, fp_df_picks)  # Store DataFrame using pickle

            print(f"Appended row_picks to df_picks:\n{row_picks}\n")

    print(f'End of For_Loop  {i+1} of {i_total} loops{"="*40:>44}\n\n')
    

### Get picks for the missing dates: Original Code 

In [50]:
# df_current = df_close_clean.copy()

# # list to store the last df date
# l_date_last_index = []

# # list to store date in zipped dates_missing_in_df_picks
# l_date_missing_in_df_picks = []

# # total number of iteration
# i_total = len(l_days_to_drop) * len(l_sorted_days_lookbacks)

# for i, values in enumerate(itertools.product(zip(l_days_to_drop, dates_missing_in_df_picks), l_sorted_days_lookbacks)):
#     print(f'Start of For_Loop  {i+1} of {i_total} loops{"="*40:>42}')
#     drop_last_n_rows = values[0][0]
#     date_missing_in_df_picks = values[0][1] 
#     days_lookbacks = values[1]
#     l_date_missing_in_df_picks.append(date_missing_in_df_picks)

#     print(f'i: {i+1}')    
#     print(f'drop_last_n_rows: {drop_last_n_rows}') 
#     print(f'date_missing_in_df_picks: {date_missing_in_df_picks}')
#     print(f'days_lookbacks: {days_lookbacks}\n')
#     print(f'{i+1} of {len(l_sorted_days_lookbacks)} days_lookbacks: {days_lookbacks} in l_sorted_days_lookbacks: {l_sorted_days_lookbacks}')    

#     # drops df rows by drop_last_n_rows, limits df length to max_days_lookbacks 
#     # e.g. days_lookbacks: [15, 30, 60, 120] => max_days_lookbacks: 120
#     max_days_lookbacks = max(days_lookbacks)
#     print(f"max_days_lookbacks: {max_days_lookbacks}\n")
#     slice_start = -(max_days_lookbacks + drop_last_n_rows)
#     slice_end = -drop_last_n_rows
#     if drop_last_n_rows == 0:  # return df with all rows
#         df = df_current[slice_start:].copy()
#     else:  # return df with dropped drop_last_n_rows rows
#         df = df_current[slice_start:slice_end].copy()
#     print(f"dropped last {drop_last_n_rows} row(s) from df since drop_last_n_rows = {drop_last_n_rows}")
#     print(f"df.head(1):\n{df.head(1)}\n")
#     print(f"df.tail(1):\n{df.tail(1)}\n")

#     date_last_index = df.index[-1].strftime('%Y-%m-%d')
#     print(f'date_last_index: {date_last_index}')

#     # Error check
#     if date_missing_in_df_picks == date_last_index:
#         print(f"Passed Error Check: date_missing_in_df_picks {date_missing_in_df_picks} == date_last_index {date_last_index}")  
#     else:
#         print("ERROR: date_missing_in_df_picks != date_last_index")
#         print(f'date_missing_in_df_picks: {date_missing_in_df_picks}')
#         print(f'date_last_index: {date_last_index}')
#         sys.exit(1)  

#     l_date_last_index.append(date_last_index)
#     len_df = len(df)
#     len_slice = slice_end - slice_start
#     # print(f"len(df): {len(df)}\n")
#     print(f"len_df: {len_df}, len_slice: {len_slice}\n")    

#     # Error check, len(df) >= max(days_lookbacks) + days_eval 
#     days_total = max_days_lookbacks + days_eval
#     if days_total > len_df:    
#         msg_err = f"max_days_lookbacks: {max_days_lookbacks} + days_eval: {days_eval} = {days_total} must be less or equal to len_df: {len_df}"        
#         raise SystemExit(msg_err)


#     # Since df rows has been sliced to max_days_lookbacks,
#     #   n_samples > 1, will always return more copies of the same slice. 
#     # Returns a list of random tuples of start_train, end_train, end_eval,
#     # where iloc[start_train:end_train] is used for training,
#     # and iloc[end_train:end_eval] is used for evaluation.  The length of the
#     # list is equal to n_samples.
#     max_lookback_slices = random_slices(
#         len_df = len_df,
#         n_samples=n_samples,
#         days_lookback=max(days_lookbacks),
#         days_eval=days_eval,
#         verbose=False,
#     )    

#     # Create sets of sub-slices from max_slices and days_lookbacks. A slice is
#     # a tuple of iloc values for start_train:end_train=start_eval:end_eval.
#     # Given 2 max_slices of [(104, 224, 234), (626, 746, 756)], it returns 2 sets
#     # [[(194, 224, 234), (164, 224, 234), (104, 224, 234)],
#     # [(716, 746, 756), (686, 746, 756), (626, 746, 756)]]. End_train is constant
#     # for each set. End_train - start_train is the value of the maximum slice.     
#     sets_lookback_slices = lookback_slices(
#         max_slices=max_lookback_slices, days_lookbacks=days_lookbacks, verbose=False
#     )

#     if verbose:
#         print(f"number of random samples of max_lookback_slices taken is n_samples = {n_samples}")
#         print(f"max_lookback_slices: {max_lookback_slices}\n")
#         print(f"days_lookbacks: {days_lookbacks}")
#         print(f"sets_lookback_slices, e.g. (start_train:end_train:end_eval): {sets_lookback_slices}\n")

#         print(f"number of sets in sets_lookback_slices is equal to n_samples = {n_samples}")
#         print(
#             f'number of tuples in each "set of lookback slices" is equal to len(days_lookbacks): {len(days_lookbacks)}'
#         )

#     # If given:
#     #  performance metric: r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
#     #  l_sorted_days_lookbacks: [[30, 60, 120], [15, 30, 60, 120]]
#     #   => days_lookbacks: [30, 60, 120]
#     #    => sets_lookback_slices: [[(90, 120, 120), (60, 120, 120), (0, 120, 120)]]
#     # Then, grp_top_set_syms_n_freq is a list of lists of the top n_top_syms of the
#     # best performing symbols and their number of occurrence for sets_lookback_slices.
#     # The list of lists corresponds to days_lookbacks in l_sorted_days_lookbacks.  
#     #  e.g. grp_top_set_syms_n_freq:
#     #   [[('GPS', 8), ('SHV', 8), ('FTSM', 7), ('GBTC', 7), ('BTC-USD', 6), ('CBOE', 6), ('ANF', 5), ('NRG', 5), ('WING', 5), ('DELL', 4), ('EDU', 4), ('HIBB', 4), ('LRN', 4), ('ALL', 3), ('CAH', 3), ('CMG', 3), ('GDDY', 3), ('HRB', 3), ('MDLZ', 3), ('PGR', 3)]]
#     # grp_top_set_syms is grp_top_set_syms_n_freq with number of occurrence dropped
#     #   e.g. [['GPS', 'SHV', 'FTSM', 'GBTC', 'BTC-USD', 'CBOE', 'ANF', 'NRG', 'WING', 'DELL']]
#     # date_end_df_train, e.g. 2023-11-22
#     (
#         grp_top_set_syms_n_freq,
#         grp_top_set_syms,
#         date_end_df_train,
#     ) = best_perf_syms_sets_lookback_slices(
#         df_close=df,
#         sets_lookback_slices=sets_lookback_slices,
#         n_top_syms=20,
#         syms_start=0,
#         syms_end=10,
#         verbose=verbose
#         )

#     print(f'\nOutput from function best_perf_syms_sets_lookback_slices')
#     print(f'{"`"*60}')
#     print(f'sets_lookback_slices: {sets_lookback_slices}\n')
#     print(f'grp_top_set_syms_n_freq:\n{grp_top_set_syms_n_freq}\n')
#     print(f'grp_top_set_syms:\n{grp_top_set_syms}\n')
#     print(f'date_end_df_train:\n{date_end_df_train}')    
#     print(f'{"`"*60}\n\n')

#     for j, top_set_syms_n_freq in enumerate(grp_top_set_syms_n_freq):
#         # If given top_set_syms_n_freq:
#         #  [('GPS', 10), ('SHV', 9), ('FTSM', 7), ('GBTC', 7), ('WING', 7), ('CBOE', 6),
#         #  ('ANF', 5), ('BTC-USD', 5), ('NRG', 5), ('TSEM', 5), ('BURL', 4), ('CRSP', 4),
#         #  ('EDU', 4), ('GDDY', 4), ('LRN', 4), ('NFLX', 4), ('PI', 4), ('WRB', 4),
#         #  ('AXGN', 3), ('CAH', 3)]
#         # Then, l_sym_freq_cnt, where symbol frequency count is from 15, 14, ..., 2:
#         #  [[], [], [], [], [], ['GPS'], ['SHV'], [], ['FTSM', 'GBTC', 'WING'], ['CBOE'],
#         #  ['ANF', 'BTC-USD', 'NRG', 'TSEM'],
#         #  ['BURL', 'CRSP', 'EDU', 'GDDY', 'LRN', 'NFLX', 'PI', 'WRB'],
#         #  ['AXGN', 'CAH'], []]
#         l_sym_freq_cnt = top_set_sym_freq_cnt(top_set_syms_n_freq)
#         print(f'{j}, grp_top_set_syms_n_freq:\n{grp_top_set_syms_n_freq}')
#         print(f'{j}, top_set_syms_n_freq:\n{top_set_syms_n_freq}')
#         print(f'{j}, l_sym_freq_cnt:\n{l_sym_freq_cnt}\n')                
        
#         if verbose:
#             print(f"set_lookback_slices: {sets_lookback_slices[j]}")
#             print(f"max_lookback_slices: {max_lookback_slices}\n")
#             print(f"data below will be added to {fp_df_picks}")
#             print(f"date_end_df_train:   {date_end_df_train}")
#             print(f"max_days_lookbacks:  {max_days_lookbacks}")
#             print(f"days_lookbacks:      {days_lookbacks}")
#             print(f"sym_freq_15:         {l_sym_freq_cnt[0]}")
#             print(f"sym_freq_14:         {l_sym_freq_cnt[1]}")
#             print(f"sym_freq_13:         {l_sym_freq_cnt[2]}")
#             print(f"sym_freq_12:         {l_sym_freq_cnt[3]}")
#             print(f"sym_freq_11:         {l_sym_freq_cnt[4]}")
#             print(f"sym_freq_10:         {l_sym_freq_cnt[5]}")
#             print(f"sym_freq_9:          {l_sym_freq_cnt[6]}")
#             print(f"sym_freq_8:          {l_sym_freq_cnt[7]}")
#             print(f"sym_freq_7:          {l_sym_freq_cnt[8]}")
#             print(f"sym_freq_6:          {l_sym_freq_cnt[9]}")
#             print(f"sym_freq_5:          {l_sym_freq_cnt[10]}")
#             print(f"sym_freq_4:          {l_sym_freq_cnt[11]}")
#             print(f"sym_freq_3:          {l_sym_freq_cnt[12]}")
#             print(f"sym_freq_2:          {l_sym_freq_cnt[13]}\n")

#     if store_results:
#         row_picks0 = [date_end_df_train, max_days_lookbacks, str(days_lookbacks)]
#         row_picks1 = [
#             str(l_sym_freq_cnt[0]),
#             str(l_sym_freq_cnt[1]),
#             str(l_sym_freq_cnt[2]),
#             str(l_sym_freq_cnt[3]),
#         ]
#         row_picks2 = [
#             str(l_sym_freq_cnt[4]),
#             str(l_sym_freq_cnt[5]),
#             str(l_sym_freq_cnt[6]),
#             str(l_sym_freq_cnt[7]),
#         ]
#         row_picks3 = [
#             str(l_sym_freq_cnt[8]),
#             str(l_sym_freq_cnt[9]),
#             str(l_sym_freq_cnt[10]),
#             str(l_sym_freq_cnt[11]),
#         ]
#         row_picks4 = [str(l_sym_freq_cnt[12]), str(l_sym_freq_cnt[13])]
#         row_picks_total = row_picks0 + row_picks1 + row_picks2 + row_picks3 + row_picks4
#         print(f"row_picks_total: {row_picks_total}")

#         df_picks.loc[len(df_picks)] = row_picks_total
#         pickle_dump(df_picks, path_data_dump, fp_df_picks)
#         print(f"appended row_picks_total to df_picks:\n{row_picks_total}\n")

#     print(f'End of For_Loop  {i+1} of {i_total} loops{"="*40:>44}\n\n')
    