In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices
from yf_utils import _5_perf_ranks, _6_grp_tuples_sort_sum
from myUtils import pickle_load, pickle_dump

# Notebook-wide pandas display limits: rows, columns, per-cell text width and total line width.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', 16)
pd.set_option('display.width', 790)

# NOTE(review): absolute, user-specific Windows path; the notebook cannot run on another
# machine without editing this line.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
# Directory handed to pickle_load / pickle_dump for every file this notebook reads or writes.
path_data_dump = path_dir + "VSCode_dump/"

# Pickle name of the price dataframe loaded at the end of this cell.
fp_df_close_clean = 'df_close_clean'

###########################################################################################################
# Pickle names of the results dataframes, one per run_type ('train', 'validate', 'test').
# Only the one matching run_type is loaded, appended to and re-saved further down.
fp_df_eval_sym_freq_results_train    = '_df_eval_sym_freq_results_train'
fp_df_eval_sym_freq_results_validate = '_df_eval_sym_freq_results_validate'
fp_df_eval_sym_freq_results_test     = '_df_eval_sym_freq_results_test'
###########################################################################################################

# presumably cleaned closing prices indexed by date with one column per symbol - TODO confirm
df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

#### Check run results

In [2]:
# df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_train)
# df
# df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_validate)
# df
# df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_test)
# df

#### Create column names

In [3]:
# Run-parameter columns, stored once per results row.
col_add0 = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'sym_freq_cnt']

# One (mean, std, mean/std) column triple per group metric, in the order
# CAGR, CAGR/UI, CAGR/retnStd, retnStd/UI.
col_add1, col_add2, col_add3, col_add4 = (
    [f'grp({metric})_{stat}' for stat in ('mean', 'std', 'mean/std')]
    for metric in ('CAGR', 'CAGR/UI', 'CAGR/retnStd', 'retnStd/UI')
)

# Benchmark (SPY) columns for the same four metrics.
col_add5 = [f'SPY_{metric}' for metric in ('CAGR', 'CAGR/UI', 'CAGR/retnStd', 'retnStd/UI')]

# Full column layout of a results row; the row built in the evaluation loop must follow this order.
col_add_total = [*col_add0, *col_add1, *col_add2, *col_add3, *col_add4, *col_add5]
print(f'col_add_total:\n{col_add_total}, total columns: {len(col_add_total)}')

col_add_total:
['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'sym_freq_cnt', 'grp(CAGR)_mean', 'grp(CAGR)_std', 'grp(CAGR)_mean/std', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std', 'grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std', 'SPY_CAGR', 'SPY_CAGR/UI', 'SPY_CAGR/retnStd', 'SPY_retnStd/UI'], total columns: 23


#### DANGER: Create a new Dataframe to store run results. Previous data will be destroyed. To create a new dataframe, uncomment one choice from below.

In [4]:
# # # Creating Empty DataFrame and save it to file
# df = pd.DataFrame(columns=col_add_total)
# pickle_dump(df, path_data_dump, fp_df_eval_sym_freq_results_train)
# df

# # # Creating Empty DataFrame and save it to file
# df = pd.DataFrame(columns=col_add_total)
# pickle_dump(df, path_data_dump, fp_df_eval_sym_freq_results_validate)
# df

# # # Creating Empty DataFrame and save it to file
# df = pd.DataFrame(columns=col_add_total)
# pickle_dump(df, path_data_dump, fp_df_eval_sym_freq_results_test)
# df

#### Select run parameters. Parameters can also be passed using papermill by running yf_7_freq_cnt_pm_.ipynb

In [5]:
# SELECT RUN PARAMETERS. Parameters can also be passed using papermill by running yf_7_freq_cnt_pm_.ipynb
# NOTE(review): keep this cell to plain assignments; presumably it is the cell papermill
# treats as the defaults - confirm the cell tag in the .ipynb.
verbose = True  # True prints more output
# verbose = False  # True prints more output

# write run results to the df_eval_sym_freq_results_<run_type> dataframe and save it to file
# store_results = False
store_results = True

# select run type: picks the data split to work on and the results file to append to
run_type = 'train'
# run_type = 'validate'
# run_type = 'test'

# number of max-lookback tuples (iloc_start_train, iloc_end_train, iloc_end_eval) to create,
# i.e. the number of entries in grp_top_set_syms_n_freq and grp_top_set_syms
# n_samples = 400  
n_samples = 2

# training lookback lengths, in days (rows), counted back from the pivot iloc_end_train
days_lookbacks = [10, 20, 30]
# kept in ascending order; the largest value is used as the max lookback
days_lookbacks.sort()

# number of days (rows) after iloc_end_train used to evaluate the symbols picked in training
days_eval = 5

# number of the most-common symbols from days_lookbacks' performance rankings to keep
n_top_syms = 20  

syms_start = 0  #  start index of n_top_syms for evaluation
syms_end = 10  #  end index of n_top_syms for evaluation

In [6]:
# Parameters
# NOTE(review): these assignments run after the defaults cell and replace its values
# (presumably injected by papermill - see yf_7_freq_cnt_pm_.ipynb).
# days_lookbacks is NOT re-sorted here, unlike in the defaults cell; pass it in ascending
# order if downstream helpers rely on that - TODO confirm.
verbose = True
store_results = True
run_type = "test"
n_samples = 3
n_top_syms = 20
syms_start = 0
syms_end = 10
days_eval = 6
days_lookbacks = [15, 30]


#### Split dataframe into Train, Validate and Test

In [7]:
# Split df_close_clean into training (df_train), validation (df_val) and test (df_test) sets.
# No split arguments are passed, so the helper's defaults apply; per the original note the
# default split is 0.7, 0.2, 0.1 respectively.
df_train, df_val, df_test = _2_split_train_val_test(df_close_clean)

#### Load previous data according to run_type

In [8]:
# Fail fast on an unknown run_type, before any file is read.
if run_type not in ('train', 'validate', 'test'):
  msg_stop = f"ERROR run_type must be 'train', 'validate', or 'test', run_type is: {run_type}"
  raise SystemExit(msg_stop)

# Load the results accumulated so far for the active run_type, then take a private copy
# of the matching data split as the working dataframe `df`.
if run_type == 'train':
  df_eval_sym_freq_results_train = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_train)
  df = df_train.copy()
elif run_type == 'validate':
  df_eval_sym_freq_results_validate = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_validate)
  df = df_val.copy()
else:  # run_type == 'test'
  df_eval_sym_freq_results_test = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_test)
  df = df_test.copy()

#### Print lengths of dataframes

In [9]:
# Row counts of the working dataframe and of each split; len_df bounds the random slices
# generated in the next step.
len_df, len_df_train, len_df_val, len_df_test = map(len, (df, df_train, df_val, df_test))

print(f'run_type: {run_type}, len(df): {len(df)}')
print(f'len_df_train: {len_df_train}, len_df_val: {len_df_val}, len_df_test: {len_df_test} ')

run_type: test, len(df): 150
len_df_train: 1050, len_df_val: 300, len_df_test: 150 


#### Create sets of iloc lookback slices (iloc_start_train:iloc_end_train:iloc_end_eval), where:  
* iloc_end_train - iloc_start_train = days_lookback
* iloc_end_eval - iloc_end_train = days_eval  
#### for example, if given:  
* n_samples = 2  
* days_lookbacks = [10, 20, 30]  
* days_eval = 5  
#### a possible result is:  
  - max_lookback_slices: [(417, 447, 452), (265, 295, 300)], where:
    - len(max_lookback_slices) = n_samples = 2
    - middle number in the tuples, 447 and 295, is the iloc of the "pivot day" for the days in "days_lookbacks" to lookback
    - 447 - 417 = middle number - first number = max(days_lookbacks) = 30
    - 295 - 265 = middle number - first number = max(days_lookbacks) = 30      
    - 452 - 447 = last number - middle number = days_eval = 5
    - 300 - 295 = last number - middle number = days_eval = 5    
  - sets_lookback_slices: [[(437, 447, 452), (427, 447, 452), (417, 447, 452)], [(285, 295, 300), (275, 295, 300), (265, 295, 300)]], where:
    - len(sets_lookback_slices) = n_samples = 2
    - last tuple in each list, i.e. (417, 447, 452) and (265, 295, 300), is a tuple from max_lookback_slices
    - where a set, e.g. [(437, 447, 452), (427, 447, 452), (417, 447, 452)]:
      - middle number, 447, iloc of the "pivot day" is constant for that set
      - middle number - first number, is the training period specified in days_lookbacks
        - 447 - 437 = middle number - first number = days_lookbacks[0] = 10
        - 447 - 427 = middle number - first number = days_lookbacks[1] = 20
        - 447 - 417 = middle number - first number = days_lookbacks[2] = 30
      - last number, 452, iloc of the end of the evaluation period is constant
        - 452 - 447 = last number - middle number = days_eval = 5

In [10]:
# One (iloc_start_train, iloc_end_train, iloc_end_eval) tuple per sample, sized for the
# longest lookback period.
max_lookback_slices = _3_random_slices(
  len_df,
  n_samples=n_samples,
  days_lookback=max(days_lookbacks),
  days_eval=days_eval,
)

# For every tuple above, one tuple per entry in days_lookbacks, i.e.
# n_samples * len(days_lookbacks) slices grouped into n_samples sets.
sets_lookback_slices = _4_lookback_slices(
  max_slices=max_lookback_slices,
  days_lookbacks=days_lookbacks,
  verbose=False,
)

if verbose:
  print(f'number of max_lookback_slices is equal to n_samples = {n_samples}')
  print(f'max_lookback_slices:\n{max_lookback_slices}\n')
  print(f'number of sets in sets_lookback_slices is equal to n_samples = {n_samples}')
  print(f'sets_lookback_slices:\n{sets_lookback_slices}\n')
  print(f'number of tuples in each "set of lookback slices" is equal to len(days_lookbacks): len({days_lookbacks})')

number of max_lookback_slices is equal to n_samples = 3
max_lookback_slices:
[(39, 69, 75), (4, 34, 40), (36, 66, 72)]

number of sets in sets_lookback_slices is equal to n_samples = 3
sets_lookback_slices:
[[(54, 69, 75), (39, 69, 75)], [(19, 34, 40), (4, 34, 40)], [(51, 66, 72), (36, 66, 72)]]

number of tuples in each "set of lookback slices" is equal to len(days_lookbacks): len([15, 30])


#### grp_top_set_syms_n_freq is a list of sub-lists, e.g.:
 - [[('AGY', 7), ('PCG', 7), ('KDN', 6), ..., ('CYT', 3)], ..., [('FCN', 9), ('HIG', 9), ('SJR', 8), ..., ('BFH', 2)]]
#### There are n_samples sub-lists. Each sub-list has n_top_syms tuples of (symbol, frequency) pairs, and is sorted in descending order of frequency. The frequency is the number of times the symbol appears in the top n_top_syms performance rankings of CAGR/UI, CAGR/retnStd and retnStd/UI. Therefore, the symbols in the sub-list are the best performing symbols for the periods in days_lookbacks.

In [11]:
# Rank symbols over every lookback period of every sample and keep, per sample, the
# symbols that appear most often in the rankings.
#
# grp_top_set_syms_n_freq is a list of lists of (symbol, frequency) tuples, e.g.
#   [[('AGY', 7), ('PCG', 7), ('KDN', 6), ..., ('CYT', 3)],
#    [('FCN', 9), ('HIG', 9), ('SJR', 8), ..., ('BFH', 2)]]
#   where each inner list holds the best performing symbols of one lb_slices, e.g.
#     [(483, 513, 523), (453, 513, 523), (393, 513, 523)]
grp_top_set_syms_n_freq = []  # one top_set_syms_n_freq list per entry of sets_lookback_slices
grp_top_set_syms = []  # same symbols without the frequency count, cut to [syms_start:syms_end]

# lb_slices, e.g. [(483, 513, 523), (453, 513, 523), (393, 513, 523)],
#  is one max_lookback_slice, e.g. (393, 513, 523), together with
#  the slices of the shorter days_lookbacks, e.g. (483, 513, 523), (453, 513, 523)
for i, lb_slices in enumerate(sets_lookback_slices):
  print(f'\n########## {i + 1} of {len(sets_lookback_slices)} lb_slices in sets_lookcak_slices ##########')
  # most_freq_syms of every lookback period in this lb_slices, in lb_slices order
  grp_most_freq_syms = []
  for j, lb_slice in enumerate(lb_slices):  # lb_slice, e.g. (246, 276, 286)
    iloc_start_train = lb_slice[0]     # iloc of start of training period
    iloc_end_train   = lb_slice[1]     # iloc of end of training period (exclusive slice end)
    iloc_start_eval  = iloc_end_train  # iloc of start of evaluation period
    iloc_end_eval    = lb_slice[2]     # iloc of end of evaluation period (exclusive slice end)
    lookback         = iloc_end_train - iloc_start_train  # length of the training window in rows
    d_eval           = iloc_end_eval - iloc_start_eval    # length of the evaluation window in rows

    # training window only; the evaluation rows are not part of _df
    _df = df.iloc[iloc_start_train:iloc_end_train]
    date_iloc_start_train = _df.index[0].strftime('%Y-%m-%d')
    # last training row, i.e. df.index[iloc_end_train - 1] because the slice end is exclusive
    date_iloc_end_train = _df.index[-1].strftime('%Y-%m-%d')

    if verbose:
      print(f'days lookback:         {lookback},  {j + 1} of {len(days_lookbacks)} days_lookbacks: {days_lookbacks}')
      print(f'lb_slices:             {lb_slices}')
      print(f'lb_slice:              {lb_slice}')
      print(f'days eval:             {d_eval}')    
      print(f'iloc_iloc_start_train: {iloc_start_train}')
      print(f'iloc_iloc_end_train:   {iloc_end_train}')
      print(f'date_iloc_start_train: {date_iloc_start_train}')
      print(f'date_iloc_end_train:   {date_iloc_end_train}')


    # perf_ranks: rankings of the training window (r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI in
    # the printed output); most_freq_syms: (symbol, count) tuples over those rankings
    perf_ranks, most_freq_syms = _5_perf_ranks(_df, n_top_syms=n_top_syms)
    grp_most_freq_syms.append(most_freq_syms)  
    if verbose:    
      # rankings for this single lookback period
      print(f'perf_ranks: {perf_ranks}')  
      # most common symbols of perf_ranks 
      print(f'most_freq_syms: {most_freq_syms}')     
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    # grp_most_freq_syms is a list of lists of (symbol, frequency) tuples,
    #  one inner list per lookback period of this lb_slices
    print(f'grp_most_freq_syms: {grp_most_freq_syms}')
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # flatten the list of lists of (symbol, frequency) tuples
  flat_grp_most_freq_syms = [val for sublist in grp_most_freq_syms for val in sublist]
  # "symbol, frequency" pairs of the most frequent, i.e. best performing, symbols across all
  #  lookback periods; called with reverse=True so the pairs come back in descending frequency
  #  (presumably summing the counts per symbol - TODO confirm in yf_utils)
  set_most_freq_syms = _6_grp_tuples_sort_sum(flat_grp_most_freq_syms, reverse=True)
  # keep the first n_top_syms "symbol, frequency" pairs
  top_set_syms_n_freq = set_most_freq_syms[0:n_top_syms]
  # symbols only, restricted to positions [syms_start:syms_end]; i[0] = symbol, i[1] = frequency count
  top_set_syms = [i[0] for i in top_set_syms_n_freq[syms_start:syms_end]]  

  # collect this sample's result; the lists stay index-aligned with sets_lookback_slices
  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:  
    print(f'top {n_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {n_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')  
    print(f'===== finish top {n_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')


########## 1 of 3 lb_slices in sets_lookcak_slices ##########
days lookback:         15,  1 of 2 days_lookbacks: [15, 30]
lb_slices:             [(54, 69, 75), (39, 69, 75)]
lb_slice:              (54, 69, 75)
days eval:             6
iloc_iloc_start_train: 54
iloc_iloc_end_train:   69
date_iloc_start_train: 2022-10-18
date_iloc_end_train:   2022-11-07


perf_ranks: {'period-15': {'r_CAGR/UI': array(['RMBS', 'GBX', 'SRI', 'STRL', 'PTEN', 'ACLS', 'HAL', 'LRCX', 'CVE',
       'AMX', 'NOV', 'E', 'APO', 'PODD', 'CAT', 'HEES', 'KLAC', 'PI',
       'CNK', 'AMAT'], dtype=object), 'r_CAGR/retnStd': array(['GBX', 'RMBS', 'SRI', 'LRCX', 'ACLS', 'PTEN', 'STRL', 'NOV', 'HAL',
       'E', 'HEES', 'CVE', 'APO', 'KLAC', 'AMX', 'PODD', 'XPO', 'AMAT',
       'FTI', 'UCTT'], dtype=object), 'r_retnStd/UI': array(['RMBS', 'MAIN', 'STRL', 'GILD', 'CNQ', 'AMX', 'CVE', 'SRI', 'GS',
       'GE', 'HAL', 'COP', 'CVLT', 'EIG', 'GBX', 'AJRD', 'GVA', 'TCPC',
       'T', 'PTEN'], dtype=object)}}
most_freq_syms: [('RMBS', 3), ('GBX', 3), ('SRI', 3), ('STRL', 3), ('PTEN', 3), ('HAL', 3), ('CVE', 3), ('AMX', 3), ('ACLS', 2), ('LRCX', 2), ('NOV', 2), ('E', 2), ('APO', 2), ('PODD', 2), ('HEES', 2), ('KLAC', 2), ('AMAT', 2), ('CAT', 1), ('PI', 1), ('CNK', 1), ('XPO', 1), ('FTI', 1), ('UCTT', 1), ('MAIN', 1), ('GILD', 1), ('CNQ', 1), ('GS', 1), ('GE', 1), ('COP', 1), ('CV

perf_ranks: {'period-30': {'r_CAGR/UI': array(['HAL', 'XPRO', 'SLB', 'STRL', 'PTEN', 'MRO', 'NOV', 'PDCE', 'CAT',
       'CAR', 'FANG', 'CLMT', 'ATEN', 'APA', 'HES', 'ROCC', 'PBF', 'ENLC',
       'AER', 'VNOM'], dtype=object), 'r_CAGR/retnStd': array(['HAL', 'PTEN', 'SLB', 'NOV', 'XPRO', 'STRL', 'CAR', 'PDCE', 'MRO',
       'APA', 'ROCC', 'CLMT', 'HES', 'FANG', 'CPE', 'ENLC', 'RMBS', 'NEX',
       'MUR', 'ERF'], dtype=object), 'r_retnStd/UI': array(['GILD', 'CAT', 'STRL', 'AER', 'XPRO', 'VNOM', 'ATEN', 'QNST',
       'OSBC', 'SLB', 'LOCO', 'EME', 'BIIB', 'ATCO', 'HAL', 'HFWA',
       'IBOC', 'OLN', 'ELF', 'COP'], dtype=object)}}
most_freq_syms: [('HAL', 3), ('XPRO', 3), ('SLB', 3), ('STRL', 3), ('PTEN', 2), ('MRO', 2), ('NOV', 2), ('PDCE', 2), ('CAT', 2), ('CAR', 2), ('FANG', 2), ('CLMT', 2), ('ATEN', 2), ('APA', 2), ('HES', 2), ('ROCC', 2), ('ENLC', 2), ('AER', 2), ('VNOM', 2), ('PBF', 1), ('CPE', 1), ('RMBS', 1), ('NEX', 1), ('MUR', 1), ('ERF', 1), ('GILD', 1), ('QNST', 1), ('OSBC', 

perf_ranks: {'period-15': {'r_CAGR/UI': array(['ISEE', 'WIX', 'RVNC', 'NTNX', 'AXGN', 'RNR', 'RCL', 'CCRN',
       'REGN', 'TNK', 'GKOS', 'PEN', 'TNP', 'WING', 'BMA', 'LRN', 'FSLR',
       'TNDM', 'NCLH', 'SBUX'], dtype=object), 'r_CAGR/retnStd': array(['ISEE', 'RVNC', 'WIX', 'AXGN', 'NTNX', 'RCL', 'RNR', 'TNK', 'WING',
       'PEN', 'CCRN', 'REGN', 'TNP', 'GKOS', 'BMA', 'FSLR', 'NCLH',
       'TNDM', 'COMM', 'DXC'], dtype=object), 'r_retnStd/UI': array(['ISEE', 'NTNX', 'LRN', 'RNR', 'GKOS', 'REGN', 'WIX', 'CCRN',
       'AXGN', 'SBUX', 'ELF', 'FSLR', 'LAUR', 'PEN', 'BMA', 'BK', 'PCG',
       'ENPH', 'HOLI', 'RVNC'], dtype=object)}}
most_freq_syms: [('ISEE', 3), ('WIX', 3), ('RVNC', 3), ('NTNX', 3), ('AXGN', 3), ('RNR', 3), ('CCRN', 3), ('REGN', 3), ('GKOS', 3), ('PEN', 3), ('BMA', 3), ('FSLR', 3), ('RCL', 2), ('TNK', 2), ('TNP', 2), ('WING', 2), ('LRN', 2), ('TNDM', 2), ('NCLH', 2), ('SBUX', 2), ('COMM', 1), ('DXC', 1), ('ELF', 1), ('LAUR', 1), ('BK', 1), ('PCG', 1), ('ENPH', 1), ('HO

perf_ranks: {'period-15': {'r_CAGR/UI': array(['GBX', 'ASC', 'PI', 'NTNX', 'RMBS', 'T', 'NCLH', 'EXTR', 'VRE',
       'SHOP', 'STRL', 'EIG', 'SRI', 'APO', 'ATEN', 'HEES', 'CVLT', 'HUM',
       'ARES', 'ISRG'], dtype=object), 'r_CAGR/retnStd': array(['GBX', 'ASC', 'PI', 'EXTR', 'RMBS', 'NCLH', 'HEES', 'NTNX', 'APO',
       'SRI', 'T', 'SHOP', 'ARES', 'ISRG', 'STRL', 'ATEN', 'EIG', 'ENLC',
       'HY', 'SAIC'], dtype=object), 'r_retnStd/UI': array(['NTNX', 'VRE', 'GILD', 'MAIN', 'T', 'HUM', 'TMUS', 'PI', 'CVLT',
       'EIG', 'STRL', 'GE', 'SHOP', 'GBX', 'CLS', 'RMBS', 'BFAM', 'IBM',
       'AIG', 'MGM'], dtype=object)}}
most_freq_syms: [('GBX', 3), ('PI', 3), ('NTNX', 3), ('RMBS', 3), ('T', 3), ('SHOP', 3), ('STRL', 3), ('EIG', 3), ('ASC', 2), ('NCLH', 2), ('EXTR', 2), ('VRE', 2), ('SRI', 2), ('APO', 2), ('ATEN', 2), ('HEES', 2), ('CVLT', 2), ('HUM', 2), ('ARES', 2), ('ISRG', 2), ('ENLC', 1), ('HY', 1), ('SAIC', 1), ('GILD', 1), ('MAIN', 1), ('TMUS', 1), ('GE', 1), ('CLS', 1), ('BFAM', 

perf_ranks: {'period-30': {'r_CAGR/UI': array(['ATEN', 'CAR', 'ISEE', 'DXCM', 'PBF', 'XPRO', 'RCKT', 'AER',
       'AXON', 'EXTR', 'GDEN', 'BIIB', 'GILD', 'SKT', 'BBW', 'LOCO',
       'CVLT', 'CAT', 'AZO', 'RMBS'], dtype=object), 'r_CAGR/retnStd': array(['CAR', 'ATEN', 'ISEE', 'XPRO', 'PBF', 'RMBS', 'DXCM', 'EXTR',
       'RCKT', 'AXON', 'CVLT', 'STRL', 'SLB', 'HAL', 'SKT', 'MUR', 'ASC',
       'CAT', 'NOV', 'BBW'], dtype=object), 'r_retnStd/UI': array(['ATEN', 'GILD', 'AER', 'BIIB', 'LOCO', 'ORLY', 'WWE', 'AZO',
       'SATS', 'CHH', 'ATCO', 'PETS', 'GDEN', 'EME', 'DXCM', 'SIGI',
       'HCKT', 'PBF', 'ISEE', 'STLD'], dtype=object)}}
most_freq_syms: [('ATEN', 3), ('ISEE', 3), ('DXCM', 3), ('PBF', 3), ('CAR', 2), ('XPRO', 2), ('RCKT', 2), ('AER', 2), ('AXON', 2), ('EXTR', 2), ('GDEN', 2), ('BIIB', 2), ('GILD', 2), ('SKT', 2), ('BBW', 2), ('LOCO', 2), ('CVLT', 2), ('CAT', 2), ('AZO', 2), ('RMBS', 2), ('STRL', 1), ('SLB', 1), ('HAL', 1), ('MUR', 1), ('ASC', 1), ('NOV', 1), ('ORLY', 1), (

In [12]:
def top_set_sym_freq_cnt(top_set_syms_n_freq, max_freq=15, min_freq=2):
    """Group symbols into buckets by their frequency count.

    Args:
        top_set_syms_n_freq: iterable of (symbol, frequency) pairs, e.g.
            [('HAL', 6), ('STRL', 6), ('PTEN', 5)]. Only items [0] and [1] of
            each pair are read.
        max_freq: highest frequency that gets its own bucket. The default of 15
            accommodates up to 5 periods of days_lookbacks
            (e.g. days_lookbacks = [5, 10, 15, 20, 25]).
        min_freq: frequency of the last, catch-all bucket. Every frequency that
            has no bucket of its own and does not exceed max_freq lands here,
            i.e. with the defaults the last bucket means "2 or less".

    Returns:
        A list of (max_freq - min_freq + 1) lists of symbols, ordered from the
        max_freq bucket down to the min_freq bucket (14 lists with the
        defaults). Within a bucket, symbols keep their input order.

    Raises:
        ValueError: if max_freq < min_freq, or if a frequency exceeds max_freq.
            Previously such a symbol was silently filed in the last
            ("2 or less") bucket, which hid the most frequent symbols.
    """
    if max_freq < min_freq:
        raise ValueError(f'max_freq ({max_freq}) must be >= min_freq ({min_freq})')

    # dicts preserve insertion order, so the values come out from max_freq down to min_freq
    buckets = {freq: [] for freq in range(max_freq, min_freq - 1, -1)}
    catch_all = buckets[min_freq]

    for sym_n_freq in top_set_syms_n_freq:
        _sym = sym_n_freq[0]
        _freq = sym_n_freq[1]
        if _freq > max_freq:
            raise ValueError(
                f'frequency {_freq} of symbol {_sym!r} exceeds max_freq={max_freq}; '
                f'call with a larger max_freq'
            )
        # exact match gets its own bucket, anything else falls into the catch-all
        buckets.get(_freq, catch_all).append(_sym)

    return list(buckets.values())

In [13]:
# Show, for every sample, which symbols fall into which frequency bucket.
for top_set_syms_n_freq in grp_top_set_syms_n_freq:
  l_sym_freq_cnt = top_set_sym_freq_cnt(top_set_syms_n_freq)
  if verbose:
    # Buckets are ordered from the highest frequency down to 2, so the labels are derived
    # from the number of buckets (14 buckets -> 15, 14, ..., 2) instead of 14 hand-written prints.
    for _freq_label, _bucket_syms in zip(range(len(l_sym_freq_cnt) + 1, 1, -1), l_sym_freq_cnt):
      print(f'sym_freq_{_freq_label}: {_bucket_syms}')
    print()  # blank line after each sample, as before

sym_freq_15: []
sym_freq_14: []
sym_freq_13: []
sym_freq_12: []
sym_freq_11: []
sym_freq_10: []
sym_freq_9: []
sym_freq_8: []
sym_freq_7: []
sym_freq_6: ['HAL', 'STRL']
sym_freq_5: ['PTEN']
sym_freq_4: ['NOV', 'RMBS']
sym_freq_3: ['AMX', 'CAT', 'CVE', 'GBX', 'SLB', 'SRI', 'XPRO']
sym_freq_2: ['ACLS', 'AER', 'AMAT', 'APA', 'APO', 'ATEN', 'CAR', 'CLMT']

sym_freq_15: []
sym_freq_14: []
sym_freq_13: []
sym_freq_12: []
sym_freq_11: []
sym_freq_10: []
sym_freq_9: []
sym_freq_8: []
sym_freq_7: []
sym_freq_6: ['BMA', 'FSLR', 'NTNX', 'RVNC']
sym_freq_5: ['CCRN', 'RNR', 'TNK', 'TNP']
sym_freq_4: ['HOLI', 'PCG']
sym_freq_3: ['AMBC', 'AXGN', 'EURN', 'FL', 'GKOS', 'ISEE', 'PEN', 'RCL', 'REGN', 'TWNK']
sym_freq_2: []

sym_freq_15: []
sym_freq_14: []
sym_freq_13: []
sym_freq_12: []
sym_freq_11: []
sym_freq_10: []
sym_freq_9: []
sym_freq_8: []
sym_freq_7: []
sym_freq_6: []
sym_freq_5: ['ATEN', 'RMBS']
sym_freq_4: ['CVLT', 'EXTR', 'STRL']
sym_freq_3: ['ASC', 'DXCM', 'EIG', 'GBX', 'GILD', 'ISEE', 'NTNX

In [14]:
# NOTE(review): consider moving this import to the import cell at the top of the notebook.
from myUtils import symb_perf_stats_vectorized_v8

# For every sample: take the symbols picked during training, group them by frequency count,
# measure each group's performance over the evaluation window, compare with SPY over the
# same window and, when store_results is True, append one row per group to the results
# dataframe of the active run_type.
print('z_grp_top_set_syms_n_freq:')
# max_lookback_slices and grp_top_set_syms_n_freq are index-aligned (one entry per sample)
z_grp_top_set_syms_n_freq = zip(max_lookback_slices, grp_top_set_syms_n_freq)

for i, (_lookback_slice, _top_set_syms_n_freq) in enumerate(z_grp_top_set_syms_n_freq):
  iloc_start_train = _lookback_slice[0]
  iloc_end_train = _lookback_slice[1]
  iloc_start_eval = iloc_end_train
  iloc_end_eval = _lookback_slice[2]  # exclusive end of the evaluation window

  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  # dates correspond to max_lookback_slice
  d_iloc_start_train = df.index[iloc_start_train].strftime('%Y-%m-%d')
  d_iloc_end_train = df.index[iloc_end_train].strftime('%Y-%m-%d')
  d_iloc_start_eval = d_iloc_end_train
  # iloc_end_eval is an exclusive slice end, so it may equal len(df); indexing the row
  # at that position would then raise IndexError although the slice itself is valid.
  if iloc_end_eval < len(df):
    d_iloc_end_eval = df.index[iloc_end_eval].strftime('%Y-%m-%d')
  else:
    d_iloc_end_eval = 'past end of data'
  d_df_eval_start = d_iloc_end_train
  d_df_eval_end = df.index[iloc_end_eval - 1].strftime('%Y-%m-%d')  # last row inside the window
  print(f'max lookback dates: {d_iloc_start_train}, {d_iloc_end_train}, {d_iloc_end_eval}')
  print(f'df_eval dates (inclusive): {d_df_eval_start} - {d_df_eval_end}')
  print(f'top_set_syms_n_freq: {_top_set_syms_n_freq}\n')

  l_sym_freq_cnt = top_set_sym_freq_cnt(_top_set_syms_n_freq)
  if verbose:
    # buckets run from the highest frequency down to 2; labels follow the bucket count
    for _freq_label, _bucket_syms in zip(range(len(l_sym_freq_cnt) + 1, 1, -1), l_sym_freq_cnt):
      print(f'sym_freq_{_freq_label}: {_bucket_syms}')
    print()

  # rows of the evaluation window, selected once by position and reused below
  df_eval_window = df.iloc[iloc_start_eval:iloc_end_eval]

  # benchmark: SPY over the same evaluation window
  _sym_idx = ['SPY']
  df_SPY = df_eval_window[_sym_idx]

  # Only the last four return values are used; for SPY just element [0] of each is read.
  (
    _symbols,
    _period_yr,
    _retn,
    _DD,
    _UI,
    _MDD,
    _retnMean,
    _retnStd,
    _retnStd_div_UI,
    _CAGR,
    _CAGR_div_retnStd,
    _CAGR_div_UI,
    SPY_retnStd_d_UI,
    SPY_CAGR,
    SPY_CAGR_d_retnStd,
    SPY_CAGR_d_UI,
  ) = symb_perf_stats_vectorized_v8(df_SPY)

  print(f'\nSPY: retnStd/UI, CAGR/retnStd, CAGR/UI, CAGR: {SPY_retnStd_d_UI[0]:>17,.1f}, {SPY_CAGR_d_retnStd[0]:>17,.1f}, {SPY_CAGR_d_UI[0]:>17.1f}, {SPY_CAGR[0]:>17.1f}')

  # Pair every bucket with its frequency count, highest first. The last bucket
  # (frequency count 2 or less) is dropped and therefore never evaluated.
  zip_cnt_n_syms = zip(range(len(l_sym_freq_cnt) + 1, 2, -1), l_sym_freq_cnt[:-1])
  for item in zip_cnt_n_syms:
    sym_freq_cnt = item[0]
    syms = item[1]
    if syms:  # iterate ONLY if there are symbols in syms
      df_eval = df_eval_window[syms]

      if verbose:
        print(f'iloc_start_eval: {iloc_start_eval},  date: {d_iloc_end_train}')
        print(f'iloc_end_eval:   {iloc_end_eval},  date: {d_iloc_end_eval},  df_eval last date: {d_df_eval_end}')
        print(f'frequency count of symbol(s): {sym_freq_cnt}')
        # print(f'\ndf_eval:\n{df_eval}\n')

        # group symbols next to the benchmark, for display only
        syms_n_SPY = syms + ['SPY']
        df_eval_n_SPY = df_eval_window[syms_n_SPY]
        print(f'\ndf_eval_n_SPY:\n{df_eval_n_SPY}\n')

      # For a group, elements [0], [1], [2] of each of the last four return values are
      # read as mean, std and mean/std (see the prints below).
      (
        _symbols,
        _period_yr,
        _retn,
        _DD,
        _UI,
        _MDD,
        _retnMean,
        _retnStd,
        _retnStd_div_UI,
        _CAGR,
        _CAGR_div_retnStd,
        _CAGR_div_UI,
        grp_retnStd_d_UI,
        grp_CAGR,
        grp_CAGR_d_retnStd,
        grp_CAGR_d_UI,
      ) = symb_perf_stats_vectorized_v8(df_eval)
      print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]  :>17,.1f}, {grp_retnStd_d_UI[1]  :>17,.1f}, {grp_retnStd_d_UI[2]  :>17,.1f}')
      print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>17,.1f}, {grp_CAGR_d_retnStd[1]:>17,.1f}, {grp_CAGR_d_retnStd[2]:>17,.1f}')
      print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]     :>17,.1f}, {grp_CAGR_d_UI[1]     :>17,.1f}, {grp_CAGR_d_UI[2]     :>17,.1f}')
      print(f'grp(CAGR):         mean, std, mean/std: {grp_CAGR[0]          :>17,.1f}, {grp_CAGR[1]          :>17,.1f}, {grp_CAGR[2]          :>17,.1f}')

      if store_results:  # record results to df
        # Row layout must match col_add_total: run parameters, then CAGR, CAGR/UI,
        # CAGR/retnStd, retnStd/UI triples, then the four SPY values.
        row_add0      = [n_samples, str(days_lookbacks), days_eval, n_top_syms, syms_start, syms_end, sym_freq_cnt]
        row_add1      = [grp_CAGR[0],           grp_CAGR[1],           grp_CAGR[2]]
        row_add2      = [grp_CAGR_d_UI[0],      grp_CAGR_d_UI[1],      grp_CAGR_d_UI[2]]
        row_add3      = [grp_CAGR_d_retnStd[0], grp_CAGR_d_retnStd[1], grp_CAGR_d_retnStd[2]]
        row_add4      = [grp_retnStd_d_UI[0],   grp_retnStd_d_UI[1],   grp_retnStd_d_UI[2]]
        row_add5      = [SPY_CAGR[0], SPY_CAGR_d_UI[0], SPY_CAGR_d_retnStd[0], SPY_retnStd_d_UI[0]]
        row_add_total = row_add0 + row_add1 + row_add2 + row_add3 + row_add4 + row_add5
        print(f'row_add_total: {row_add_total}')

        # NOTE(review): .loc[len(frame)] appends only while the frame's index is 0..n-1;
        # with any other index it could overwrite an existing row.
        if run_type == 'train':
          df_eval_sym_freq_results_train.loc[len(df_eval_sym_freq_results_train)] = row_add_total
          print(f'appended row_add to df_eval_sym_freq_results_train:\n{row_add_total}\n')
        elif run_type == 'validate':
          df_eval_sym_freq_results_validate.loc[len(df_eval_sym_freq_results_validate)] = row_add_total
          print(f'appended row_add to df_eval_sym_freq_results_validate:\n{row_add_total}\n')
        elif run_type == 'test':
          df_eval_sym_freq_results_test.loc[len(df_eval_sym_freq_results_test)] = row_add_total
          print(f'appended row_add to df_eval_sym_freq_results_test:\n{row_add_total}\n')
        else:
          msg_stop = f"ERROR run_type must be 'train', 'validate', or 'test', run_type is: {run_type}"
          raise SystemExit(msg_stop )

  print('='*50, '\n')

z_grp_top_set_syms_n_freq:
1 of 3 max_lookback_slice
max_lookback_slice: (39, 69, 75)
max lookback dates: 2022-09-27, 2022-11-08, 2022-11-16
df_eval dates (inclusive): 2022-11-08 - 2022-11-15
top_set_syms_n_freq: [('HAL', 6), ('STRL', 6), ('PTEN', 5), ('NOV', 4), ('RMBS', 4), ('AMX', 3), ('CAT', 3), ('CVE', 3), ('GBX', 3), ('SLB', 3), ('SRI', 3), ('XPRO', 3), ('ACLS', 2), ('AER', 2), ('AMAT', 2), ('APA', 2), ('APO', 2), ('ATEN', 2), ('CAR', 2), ('CLMT', 2)]

sym_freq_15: []
sym_freq_14: []
sym_freq_13: []
sym_freq_12: []
sym_freq_11: []
sym_freq_10: []
sym_freq_9: []
sym_freq_8: []
sym_freq_7: []
sym_freq_6: ['HAL', 'STRL']
sym_freq_5: ['PTEN']
sym_freq_4: ['NOV', 'RMBS']
sym_freq_3: ['AMX', 'CAT', 'CVE', 'GBX', 'SLB', 'SRI', 'XPRO']
sym_freq_2: ['ACLS', 'AER', 'AMAT', 'APA', 'APO', 'ATEN', 'CAR', 'CLMT']


SPY: retnStd/UI, CAGR/retnStd, CAGR/UI, CAGR:               2.8,             288.8,             814.9,               7.4
iloc_start_eval: 69,  date: 2022-11-08
iloc_end_eval:   75, 

appended row_add to df_eval_sym_freq_results_test:
[3, '[15, 30]', 6, 20, 0, 10, 3, -0.837241652937214, 0.23292029226314714, -3.5945414837077427, -17.093695266813356, 9.019600412092244, -1.8951721235783874, -41.69607877771942, 20.4016032563507, -2.0437648087652174, 0.4511028556798117, 0.20113915633938073, 2.242740120271107, -0.9387969964064807, -25.25656438747317, -169.16791794881468, 0.14929878368021932]


3 of 3 max_lookback_slice
max_lookback_slice: (36, 66, 72)
max lookback dates: 2022-09-22, 2022-11-03, 2022-11-11
df_eval dates (inclusive): 2022-11-03 - 2022-11-10
top_set_syms_n_freq: [('ATEN', 5), ('RMBS', 5), ('CVLT', 4), ('EXTR', 4), ('STRL', 4), ('ASC', 3), ('DXCM', 3), ('EIG', 3), ('GBX', 3), ('GILD', 3), ('ISEE', 3), ('NTNX', 3), ('PBF', 3), ('PI', 3), ('SHOP', 3), ('T', 3), ('AER', 2), ('APO', 2), ('ARES', 2), ('AXON', 2)]

sym_freq_15: []
sym_freq_14: []
sym_freq_13: []
sym_freq_12: []
sym_freq_11: []
sym_freq_10: []
sym_freq_9: []
sym_freq_8: []
sym_freq_7: []
sym_freq_6:

In [15]:
# Persist the results dataframe of the active run_type, overwriting the pickle it was
# loaded from. Nothing is written when store_results is False.
if store_results:  # record results to df
  if run_type == 'train':
    pickle_dump(df_eval_sym_freq_results_train, path_data_dump, fp_df_eval_sym_freq_results_train)
    print(f'Save results to: {fp_df_eval_sym_freq_results_train}')
  elif run_type == 'validate':
    pickle_dump(df_eval_sym_freq_results_validate, path_data_dump, fp_df_eval_sym_freq_results_validate)   
    print(f'Save results to: {fp_df_eval_sym_freq_results_validate}')
  elif run_type == 'test':
    pickle_dump(df_eval_sym_freq_results_test, path_data_dump, fp_df_eval_sym_freq_results_test)   
    print(f'Save results to: {fp_df_eval_sym_freq_results_test}')
  else:
    # an unknown run_type stops the run instead of silently dropping the results
    msg_stop = f"ERROR: Result was not saved. run_type must be 'train', 'validate', or 'test', run_type is: {run_type}"
    raise SystemExit(msg_stop )  

Save results to: _df_eval_sym_freq_results_test
