In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices
from yf_utils import _5_perf_ranks, _6_grp_tuples_sort_sum
from myUtils import pickle_load, pickle_dump

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', 16)
pd.set_option('display.width', 790)

path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"
fp_df_close_clean = 'df_close_clean'

# fp_df_eval_results = 'df_eval_results'
# fp_df_eval_sym_freq_results = 'df_eval_sym_freq_results'
fp_df_eval_sym_freq_results_validate = 'df_eval_sym_freq_results_validate'

df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

In [2]:
# Split df_close_clean into training (df_train), validation (df_val) and test (df_test) set.
# The default split is 0.7, 0.2, 0.1 respectively.

###################################  USE df_val as df_train
df_train, df_val, df_test = _2_split_train_val_test(df_close_clean)
df_train = df_val.copy()
###################################  USE df_val as df_train

len_df_train = len(df_train)
len_df_val = len(df_val)
len_df_test = len(df_test)
print(f'len_df_train: {len_df_train}, len_df_val: {len_df_val}, len_df_test: {len_df_test} ')

len_df_train: 300, len_df_val: 300, len_df_test: 150 


In [3]:
print(f'df_train.index: {df_train.index[0]}, {df_train.index[-1]}')
print(f'df_val.index:   {df_val.index[0]}, {df_val.index[-1]}')

df_train.index: 2021-05-13 00:00:00, 2022-07-21 00:00:00
df_val.index:   2021-05-13 00:00:00, 2022-07-21 00:00:00


In [4]:
col_add0 = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'sym_freq_cnt']
col_add1 = ['grp(CAGR)_mean',         'grp(CAGR)_std',         'grp(CAGR)_mean/std']
col_add2 = ['grp(CAGR/UI)_mean',      'grp(CAGR/UI)_std',      'grp(CAGR/UI)_mean/std']
col_add3 = ['grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std']
col_add4 = ['grp(retnStd/UI)_mean',   'grp(retnStd/UI)_std',   'grp(retnStd/UI)_mean/std']
col_add5 = ['SPY_CAGR', 'SPY_CAGR/UI', 'SPY_CAGR/retnStd', 'SPY_retnStd/UI']

col_add_total = col_add0 + col_add1 + col_add2 + col_add3 + col_add4 + col_add5
print(f'col_add_total:\n{col_add_total}, total columns: {len(col_add_total)}')

col_add_total:
['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'sym_freq_cnt', 'grp(CAGR)_mean', 'grp(CAGR)_std', 'grp(CAGR)_mean/std', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std', 'grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std', 'SPY_CAGR', 'SPY_CAGR/UI', 'SPY_CAGR/retnStd', 'SPY_retnStd/UI'], total columns: 23


In [5]:
# # Creating Empty DataFrame and save it to file
# df = pd.DataFrame(columns=col_add_total)
# pickle_dump(df, path_data_dump, fp_df_eval_sym_freq_results_validate)
# df

In [None]:
# if store_results:  # record results to df
# my_cols = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_CAGR/UI']
# df_eval_results = pickle_load(path_data_dump, fp_df_eval_results)
df_eval_sym_freq_results_validate = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_validate)
df_eval_sym_freq_results_validate

In [294]:
# verbose = True  # True prints more output
verbose = False  # True prints more output

# write run results to df_eval_results
# store_results = False
store_results = True

# number of max lookback tuples to create for iloc start_train:end_train:end_eval
# i.e. number of grp_top_set_syms_n_freq and grp_top_set_syms 
n_samples = 114  
# n_samples = 1

# for training, the number of days to lookback from iloc max-lookback end_train
days_lookbacks = [15, 30, 60, 120]
days_lookbacks = [30, 60, 120]
days_lookbacks = [60, 120]
days_lookbacks = [120]
days_lookbacks = [60]
days_lookbacks = [30]
days_lookbacks = [15]
days_lookbacks = [15, 30]
days_lookbacks = [15, 30, 60]


days_lookbacks.sort()

# number of days from end_train are used to evaluate effectiveness of the training
# days_eval = 10
days_eval = 6
days_eval = 5
days_eval = 4
# days_eval = 3
# days_eval = 2  


# number of the most-common symbols from days_lookbacks' performance rankings to keep
n_top_syms = 20  

syms_start = 0  #  start index of n_top_syms for evaluation
# syms_start = 1  #  start index of n_top_syms for evaluation

# syms_end = n_top_syms  #  end index of n_top_syms for evaluation
# syms_end = 1  #  end index of n_top_syms for evaluation
# syms_end = 2  #  end index of n_top_syms for evaluation
# syms_end = 3  #  end index of n_top_syms for evaluation
# syms_end = 4  #  end index of n_top_syms for evaluation
# syms_end = 5  #  end index of n_top_syms for evaluation
# syms_end = 6  #  end index of n_top_syms for evaluation
syms_end = 10  #  end index of n_top_syms for evaluation

create a sets of iloc lookback slices (start_train:end_train:end_eval), where  
end_train - start_train = days_lookback  
end_eval - end_train = days_eval  
for example,  
if given:  
 n_samples = 2  
 days_lookbacks = [30, 60, 120]  
 days_eval = 10  
a possible result is:  
 max_lookback_slices:  
 [(150, 270, 280), (5, 125, 135)]  
 where 270-150=125-5=max(days_lookbacks), 280-270=135-125=days_eval  
 sets_lookback_slices:  
 [[(240, 270, 280), (210, 270, 280), (150, 270, 280)], [(95, 125, 135), (65, 125, 135), (5, 125, 135)]]  
  where in a set, 270-240=days_lookbacks[0], 270-210=days_lookbacks[1], 270-150=days_lookbacks[2]  
  and 270, i.e. end_train, is constant for the set  

In [295]:
# return n_samples slices
max_lookback_slices = _3_random_slices(len_df_train, n_samples=n_samples, days_lookback=max(days_lookbacks), days_eval=days_eval)
# return n_samples * len(days_lookbacks) slices
sets_lookback_slices = _4_lookback_slices(max_slices=max_lookback_slices, days_lookbacks=days_lookbacks, verbose=False)

if verbose:
  print(f'max_lookback_slices:\n{max_lookback_slices}')
  print(f'sets_lookback_slices:\n{sets_lookback_slices}')

Generate lists, n_samples long, of the highest performance ranked symbols. The performance metrics are: CAGR/UI, CAGR/retnStd, retnStd/UI. n_top_syms of the best performing symbols from each metric are combined. The symbols are sorted by their number of appearances in the combined pool, and are placed in a list. A slice of the best performing symbols is selected by syms_start:syms_end, i.e. top_set_syms_n_freq[syms_start:syms_end].     

The performance metrics are calculated based on slices in sets_lookback_slices.  The first two numbers are ilocs for training. The last two numbers are ilocs for evaluation.

In [296]:
grp_top_set_syms_n_freq = []  # list of lists of top_set_symbols_n_freq, there are n_samples lists in list
grp_top_set_syms = []  # list of lists of top_set_symbols, there are n_samples lists in list
# loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []  
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train = lb_slice[0]
    end_train = lb_slice[1]
    start_eval = end_train
    end_eval = lb_slice[2]
    lookback = end_train - start_train
    d_eval = end_eval - start_eval

    _df = df_train.iloc[start_train:end_train]
    date_start_train = _df.index[0].strftime('%Y-%m-%d')
    date_end_train = _df.index[-1].strftime('%Y-%m-%d')

    if verbose:
      print(f'days lookback:    {lookback}')
      print(f'lb_slices:        {lb_slices}')
      print(f'lb_slice:         {lb_slice}')
      print(f'days eval:        {d_eval}')    
      print(f'iloc_start_train: {start_train}')
      print(f'iloc_end_train:   {end_train}')
      print(f'date_start_train: {date_start_train}')
      print(f'date_end_train:   {date_end_train}')


    perf_ranks, most_common_syms = _5_perf_ranks(_df, n_top_syms=n_top_syms)
    grp_most_common_syms.append(most_common_syms)
    
    if verbose:    
      # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
      print(f'perf_ranks: {perf_ranks}')  
      # most common symbols of perf_ranks 
      print(f'most_common_syms: {most_common_syms}')     
      # grp_perf_ranks[lookback] = perf_ranks
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    print(f'grp_most_common_syms: {grp_most_common_syms}')
    # grp_most_common_syms a is list of lists of tuples of 
    #  the most-common-symbols symbol:frequency cumulated from
    #  each days_lookback  
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # flatten list of lists of (symbol:frequency)
  flat_grp_most_common_syms = [val for sublist in grp_most_common_syms for val in sublist]
  # group symbols from set of days_lookbacks (i.e. lb_slices) and sum frequency of the symbols
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # get the top few most-frequent symbol:frequency pairs
  top_set_syms_n_freq = set_most_common_syms[0:n_top_syms]
  # get symbols from top_set_syms_n_freq

###################################  
  # top_set_syms = [i[0] for i in top_set_syms_n_freq]
  top_set_syms = [i[0] for i in top_set_syms_n_freq[syms_start:syms_end]]  
###################################  
  
  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:  
    print(f'top {n_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {n_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')  
    print(f'===== finish top {n_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')

In [297]:
def top_set_sym_freq_cnt(top_set_syms_n_freq):
    sym_freq_cnt_6 = []
    sym_freq_cnt_5 = []
    sym_freq_cnt_4 = []
    sym_freq_cnt_3 = []
    sym_freq_cnt_2 = []

    for sym_n_freq in top_set_syms_n_freq:
        _sym = sym_n_freq[0]
        _freq = sym_n_freq[1]
        # print(_sym, _freq)
        if _freq == 6:
            sym_freq_cnt_6.append(_sym)
        elif _freq == 5:
            sym_freq_cnt_5.append(_sym)
        elif _freq == 4:
            sym_freq_cnt_4.append(_sym)
        elif _freq == 3:
            sym_freq_cnt_3.append(_sym)          
        else:
            sym_freq_cnt_2.append(_sym)

    l_sym_freq_cnt = []
    l_sym_freq_cnt.append(sym_freq_cnt_6)
    l_sym_freq_cnt.append(sym_freq_cnt_5)
    l_sym_freq_cnt.append(sym_freq_cnt_4)
    l_sym_freq_cnt.append(sym_freq_cnt_3)    
    l_sym_freq_cnt.append(sym_freq_cnt_2)    

    # if verbose:
    #     print(f'sym_freq_cnt_6: {sym_freq_cnt_6}')
    #     print(f'sym_freq_cnt_5: {sym_freq_cnt_5}')
    #     print(f'sym_freq_cnt_4: {sym_freq_cnt_4}')
    #     print(f'sym_freq_cnt_3: {sym_freq_cnt_3}')
    #     print(f'sym_freq_cnt_2: {sym_freq_cnt_2}')

    # return sym_freq_cnt_6, sym_freq_cnt_5, sym_freq_cnt_4, sym_freq_cnt_3, sym_freq_cnt_2
    return l_sym_freq_cnt    

In [298]:
for top_set_syms_n_freq in grp_top_set_syms_n_freq:
  l_sym_freq_cnt = top_set_sym_freq_cnt(top_set_syms_n_freq)
  print(f'sym_freq_6: {l_sym_freq_cnt[0]}')
  print(f'sym_freq_5: {l_sym_freq_cnt[1]}')
  print(f'sym_freq_4: {l_sym_freq_cnt[2]}')
  print(f'sym_freq_3: {l_sym_freq_cnt[3]}')
  print(f'sym_freq_2: {l_sym_freq_cnt[4]}\n')  

sym_freq_6: ['AMPH', 'EQT', 'GLNG', 'LNTH']
sym_freq_5: ['BPT', 'SBS']
sym_freq_4: ['CALM', 'IPI', 'OXY', 'RGLD']
sym_freq_3: ['AR', 'CIB', 'GOGO', 'IRTC', 'MOS', 'PTEN', 'RGNX', 'RRC', 'SPLK']
sym_freq_2: ['CUTR']

sym_freq_6: ['DDS', 'ENPH', 'PDFS', 'WIRE', 'WOLF']
sym_freq_5: ['ANET', 'WD']
sym_freq_4: ['CLMT', 'F', 'LOB', 'TER']
sym_freq_3: ['AOS', 'BVH', 'CHRD', 'DNOW', 'ERF', 'FLR', 'IGT']
sym_freq_2: ['CAR', 'IRTC']

sym_freq_6: ['ASC', 'CVE', 'MRK', 'VLO']
sym_freq_5: ['ARLP', 'AXGN', 'CC', 'HRB', 'OEC', 'SQM']
sym_freq_4: ['AZPN', 'DK', 'NRG', 'STKL', 'SU', 'THS']
sym_freq_3: ['AMBC']
sym_freq_2: ['DINO', 'STNG', 'SWX']

sym_freq_6: ['DQ', 'FOLD', 'QURE']
sym_freq_5: ['AXSM', 'CELH', 'CYRX', 'EDU', 'FDP', 'RARE']
sym_freq_4: ['RCKT', 'RGNX', 'STKL']
sym_freq_3: ['AGYS', 'AMN', 'CLW', 'CORT', 'CVBF', 'DG']
sym_freq_2: ['ELF', 'RFP']

sym_freq_6: ['CLMT', 'WIRE']
sym_freq_5: ['ANET', 'ERF', 'LC', 'MARA', 'SAIA', 'WOLF']
sym_freq_4: ['AZTA', 'ENPH', 'IGT', 'SM', 'XENE']
sym_freq_

In [299]:
from myUtils import symb_perf_stats_vectorized_v8

print('z_grp_top_set_syms_n_freq:')
z_grp_top_set_syms_n_freq = zip(max_lookback_slices, grp_top_set_syms_n_freq)

for i, (_lookback_slice, _top_set_syms_n_freq) in enumerate(z_grp_top_set_syms_n_freq):
  start_train = _lookback_slice[0]
  end_train = _lookback_slice[1]
  start_eval = end_train
  end_eval = _lookback_slice[2]

  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  # dates correspond to max_lookback_slice
  d_start_train = df_train.index[start_train].strftime('%Y-%m-%d')
  d_end_train = df_train.index[end_train].strftime('%Y-%m-%d')
  d_start_eval = d_end_train
  d_end_eval = df_train.index[end_eval].strftime('%Y-%m-%d')
  d_df_eval_start = d_end_train
  d_df_eval_end = df_train.index[end_eval - 1].strftime('%Y-%m-%d')  
  print(f'max lookback dates: {d_start_train}, {d_end_train}, {d_end_eval}')
  print(f'df_eval dates (inclusive): {d_df_eval_start} - {d_df_eval_end}')    
  print(f'top_set_syms_n_freq: {_top_set_syms_n_freq}\n')

  l_sym_freq_cnt = top_set_sym_freq_cnt(_top_set_syms_n_freq)
  print(f'sym_freq_6: {l_sym_freq_cnt[0]}')
  print(f'sym_freq_5: {l_sym_freq_cnt[1]}')
  print(f'sym_freq_4: {l_sym_freq_cnt[2]}')
  print(f'sym_freq_3: {l_sym_freq_cnt[3]}\n')

  _sym_idx = ['SPY']
  df_SPY = df_train[start_eval:end_eval][_sym_idx]

  (
    _symbols,
    _period_yr,
    _retn,
    _DD,
    _UI,
    _MDD,
    _retnMean,
    _retnStd,
    _retnStd_div_UI,
    _CAGR,
    _CAGR_div_retnStd,
    _CAGR_div_UI,
    SPY_retnStd_d_UI,     
    SPY_CAGR,
    SPY_CAGR_d_retnStd,
    SPY_CAGR_d_UI,
  ) = symb_perf_stats_vectorized_v8(df_SPY)  

  print(f'\nSPY: retnStd/UI, CAGR/retnStd, CAGR/UI, CAGR: {SPY_retnStd_d_UI[0]:>17,.1f}, {SPY_CAGR_d_retnStd[0]:>17,.1f}, {SPY_CAGR_d_UI[0]:>17.1f}, {SPY_CAGR[0]:>17.1f}')

  # drop last list, with frequency count 2 or less, in l_sym_freq_cnt from zip
  zip_cnt_n_syms = zip([6,5,4,3], l_sym_freq_cnt[:-1])  
  for item in zip_cnt_n_syms:
    sym_freq_cnt = item[0]
    syms = item[1]
    if syms:  # iterate ONLY if there are symbols in syms
      df_eval = df_train[start_eval:end_eval][syms]      

      if verbose:
        print(f'start_eval: {start_eval},  date: {d_end_train}')
        print(f'end_eval:   {end_eval},  date: {d_end_eval},  df_eval last date: {d_df_eval_end}')
        print(f'frequency count of symbol(s): {sym_freq_cnt}')      
        # print(f'\ndf_eval:\n{df_eval}\n')

        syms_n_SPY = syms + ['SPY']
        df_eval_n_SPY = df_train[start_eval:end_eval][syms_n_SPY]   
        print(f'\ndf_eval_n_SPY:\n{df_eval_n_SPY}\n')

      (
        _symbols,
        _period_yr,
        _retn,
        _DD,
        _UI,
        _MDD,
        _retnMean,
        _retnStd,
        _retnStd_div_UI,
        _CAGR,
        _CAGR_div_retnStd,
        _CAGR_div_UI,
        grp_retnStd_d_UI,     
        grp_CAGR,
        grp_CAGR_d_retnStd,
        grp_CAGR_d_UI,
      ) = symb_perf_stats_vectorized_v8(df_eval)  
      print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]  :>17,.1f}, {grp_retnStd_d_UI[1]  :>17,.1f}, {grp_retnStd_d_UI[2]  :>17,.1f}')
      print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>17,.1f}, {grp_CAGR_d_retnStd[1]:>17,.1f}, {grp_CAGR_d_retnStd[2]:>17,.1f}')
      print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]     :>17,.1f}, {grp_CAGR_d_UI[1]     :>17,.1f}, {grp_CAGR_d_UI[2]     :>17,.1f}')
      print(f'grp(CAGR):         mean, std, mean/std: {grp_CAGR[0]          :>17,.1f}, {grp_CAGR[1]          :>17,.1f}, {grp_CAGR[2]          :>17,.1f}')

      if store_results:  # record results to df
        row_add0      = [n_samples, str(days_lookbacks), days_eval, n_top_syms, syms_start, syms_end, sym_freq_cnt]
        row_add1      = [grp_CAGR[0],           grp_CAGR[1],           grp_CAGR[2]]
        row_add2      = [grp_CAGR_d_UI[0],      grp_CAGR_d_UI[1],      grp_CAGR_d_UI[2]]
        row_add3      = [grp_CAGR_d_retnStd[0], grp_CAGR_d_retnStd[1], grp_CAGR_d_retnStd[2]]
        row_add4      = [grp_retnStd_d_UI[0],   grp_retnStd_d_UI[1],   grp_retnStd_d_UI[2]]
        row_add5      = [SPY_CAGR[0], SPY_CAGR_d_UI[0], SPY_CAGR_d_retnStd[0], SPY_retnStd_d_UI[0]]
        row_add_total = row_add0 + row_add1 + row_add2 + row_add3 + row_add4 + row_add5
        print(f'row_add_total: {row_add_total}')


        # df_eval_sym_freq_results.loc[len(df_eval_sym_freq_results)] = row_add_total
        # print(f'appended row_add to df_eval_sym_freq_results:\n{row_add_total}\n')
        df_eval_sym_freq_results_validate.loc[len(df_eval_sym_freq_results_validate)] = row_add_total       
        print(f'appended row_add to df_eval_sym_freq_results_validate:\n{row_add_total}\n')        
  
  
  print('='*50, '\n')  

z_grp_top_set_syms_n_freq:
1 of 114 max_lookback_slice
max_lookback_slice: (165, 225, 229)
max lookback dates: 2022-01-06, 2022-04-04, 2022-04-08
df_eval dates (inclusive): 2022-04-04 - 2022-04-07
top_set_syms_n_freq: [('CUTR', 9), ('AMPH', 6), ('EQT', 6), ('GLNG', 6), ('LNTH', 6), ('BPT', 5), ('SBS', 5), ('CALM', 4), ('IPI', 4), ('OXY', 4), ('RGLD', 4), ('AR', 3), ('CIB', 3), ('GOGO', 3), ('IRTC', 3), ('MOS', 3), ('PTEN', 3), ('RGNX', 3), ('RRC', 3), ('SPLK', 3)]

sym_freq_6: ['AMPH', 'EQT', 'GLNG', 'LNTH']
sym_freq_5: ['BPT', 'SBS']
sym_freq_4: ['CALM', 'IPI', 'OXY', 'RGLD']
sym_freq_3: ['AR', 'CIB', 'GOGO', 'IRTC', 'MOS', 'PTEN', 'RGNX', 'RRC', 'SPLK']


SPY: retnStd/UI, CAGR/retnStd, CAGR/UI, CAGR:               0.5,             -99.5,             -49.6,              -0.8
grp(retnStd/UI):   mean, std, mean/std:           2,109.4,           3,650.4,               0.6
grp(CAGR/retnStd): mean, std, mean/std:           4,687.9,           6,240.7,               0.8
grp(CAGR/UI):      me

In [300]:
# pickle_dump(df_eval_sym_freq_results, path_data_dump, fp_df_eval_sym_freq_results)
pickle_dump(df_eval_sym_freq_results_validate, path_data_dump, fp_df_eval_sym_freq_results_validate)

In [301]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/

In [305]:
df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_validate)
df

Unnamed: 0,n_samples,days_lookbacks,days_eval,n_top_syms,syms_start,syms_end,sym_freq_cnt,grp(CAGR)_mean,grp(CAGR)_std,grp(CAGR)_mean/std,grp(CAGR/UI)_mean,grp(CAGR/UI)_std,grp(CAGR/UI)_mean/std,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_std,grp(CAGR/retnStd)_mean/std,grp(retnStd/UI)_mean,grp(retnStd/UI)_std,grp(retnStd/UI)_mean/std,SPY_CAGR,SPY_CAGR/UI,SPY_CAGR/retnStd,SPY_retnStd/UI
0,114,"[15, 30, 60,...",6,20,0,10,6,-9.060648e-01,2.047110e-01,-4.426068,-1.417674e+01,4.385355e+00,-3.232745,-2.938283e+01,1.395072e+01,-2.106187,0.585949,0.367768,1.593256,-0.989629,-16.940888,-49.329849,0.343421
1,114,"[15, 30, 60,...",6,20,0,10,5,-9.986618e-01,1.071932e-03,-931.646279,-1.360339e+01,2.038776e+00,-6.672329,-3.645348e+01,3.577501e+00,-10.189647,0.378527,0.075756,4.996672,-0.989629,-16.940888,-49.329849,0.343421
2,114,"[15, 30, 60,...",6,20,0,10,4,2.179304e+08,3.082001e+08,0.707107,8.990426e+09,1.271438e+10,0.707107,2.237311e+09,3.164036e+09,0.707107,1.566096,1.734474,0.902923,-0.989629,-16.940888,-49.329849,0.343421
3,114,"[15, 30, 60,...",6,20,0,10,6,8.287973e-01,1.739211e+00,0.476536,7.084092e+01,1.093169e+02,0.648032,3.137418e+01,5.404370e+01,0.580533,1.653390,0.369722,4.471976,-0.253862,-20.007051,-17.895907,1.117968
4,114,"[15, 30, 60,...",6,20,0,10,5,-6.475457e-01,5.205846e-01,-1.243882,2.019110e+01,1.111662e+02,0.181630,-4.432001e+00,6.368368e+01,-0.069594,1.026004,0.471329,2.176834,-0.253862,-20.007051,-17.895907,1.117968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7736,114,"[15, 30, 60]",4,20,0,10,3,2.515795e-01,9.088074e-01,0.276824,7.959812e+01,1.354325e+02,0.587733,2.593509e+01,6.982297e+01,0.371441,1.456737,0.636621,2.288232,0.582292,215.803013,103.313562,2.088816
7737,114,"[15, 30, 60]",4,20,0,10,6,1.253527e+00,2.253527e+00,0.556251,1.403186e+02,1.474331e+02,0.951744,7.617936e+01,8.735223e+01,0.872094,1.198187,0.561423,2.134196,-0.790839,-43.873867,-58.900365,0.744883
7738,114,"[15, 30, 60]",4,20,0,10,5,-9.621618e-01,5.531982e-02,-17.392715,-2.968757e+01,2.375205e+01,-1.249895,-4.040100e+01,3.192099e+01,-1.265656,0.752041,0.107377,7.003733,-0.790839,-43.873867,-58.900365,0.744883
7739,114,"[15, 30, 60]",4,20,0,10,4,1.861955e+00,2.841425e+00,0.655289,1.584498e+02,2.303186e+02,0.687959,7.335951e+01,1.359073e+02,0.539776,1.349574,0.444532,3.035941,-0.790839,-43.873867,-58.900365,0.744883


In [306]:
# pd.crosstab(df['days_lookbacks'], df['grp(CAGR/UI)_mean'])
tbl = df.groupby(['days_lookbacks', 'days_eval', 'sym_freq_cnt'])\
        .agg({'grp(CAGR)_mean':          ['count', 'mean', 'std',],
              'grp(CAGR/UI)_mean':       ['mean', 'std',],
              'grp(CAGR/retnStd)_mean':  ['mean', 'std',],
              'SPY_CAGR':                ['mean', 'std',],
              'SPY_CAGR/UI':             ['mean', 'std',],
              'SPY_CAGR/retnStd':        ['mean', 'std',],                           
              })

In [307]:
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR/UI)_mean,grp(CAGR/UI)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,SPY_CAGR,SPY_CAGR,SPY_CAGR/UI,SPY_CAGR/UI,SPY_CAGR/retnStd,SPY_CAGR/retnStd
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
days_lookbacks,days_eval,sym_freq_cnt,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
[120],4,3,114,2510.347,12811.71,1155115000.0,6465253000.0,45636.43,175402.7,3.407992,11.314772,3930179.0,20258260.0,408.207572,1523.880722
[120],5,3,114,885.7833,4714.046,763466900.0,6460086000.0,20439.8,105086.0,0.980229,4.751738,1062275.0,8452350.0,131.35622,611.275272
[120],6,3,114,116.9878,473.903,10240520.0,39365720.0,3021.095,11534.12,0.991003,4.517483,2407.404,18709.29,97.512973,420.642091
"[15, 30, 60, 120]",4,3,75,9093.982,78039.03,39473040.0,237782300.0,171391.9,1464595.0,1.201747,4.947571,1482453.0,7511899.0,146.223841,612.961166
"[15, 30, 60, 120]",4,4,114,78378290000.0,836834300000.0,1.567566e+17,1.673669e+18,974724400000.0,10406840000000.0,3.215575,10.068122,3096685.0,17248880.0,360.154669,1362.842834
"[15, 30, 60, 120]",4,5,114,3346490.0,35142000.0,6687828000000.0,70284480000000.0,191693300.0,2029915000.0,3.215575,10.068122,3096685.0,17248880.0,360.154669,1362.842834
"[15, 30, 60, 120]",4,6,110,14352240.0,106435800.0,28702800000000.0,212871900000000.0,361094700.0,2904372000.0,3.35754,10.222752,3209294.0,17552140.0,378.956349,1383.872086
"[15, 30, 60, 120]",5,3,65,261.8358,1199.472,50780960.0,220959400.0,5881.227,21470.5,0.422828,3.098555,372779.3,3003652.0,52.006021,400.725874
"[15, 30, 60, 120]",5,4,114,11271.42,101805.6,23501590000.0,227374100000.0,266184.2,2024071.0,2.6968,9.535759,3454008.0,19885720.0,359.680068,1476.427335
"[15, 30, 60, 120]",5,5,114,41046950.0,308521800.0,91783250000000.0,689875200000000.0,765726000.0,5755433000.0,2.6968,9.535759,3454008.0,19885720.0,359.680068,1476.427335


In [308]:
tbl['dif_CAGR_mean'] = tbl['grp(CAGR)_mean', 'mean'] - tbl['SPY_CAGR', 'mean']
tbl['dif_CAGR/UI_mean'] = tbl['grp(CAGR/UI)_mean', 'mean'] - tbl['SPY_CAGR/UI', 'mean'] 
tbl['dif_CAGR/retnStd_mean'] = tbl['grp(CAGR/retnStd)_mean', 'mean'] - tbl['SPY_CAGR/retnStd', 'mean'] 
tbl['grp_CAGR/UI_mean/std'] = tbl['grp(CAGR/UI)_mean', 'mean'] / tbl['grp(CAGR/UI)_mean', 'std']  
tbl['SPY_CAGR/UI_mean/std'] = tbl['SPY_CAGR/UI', 'mean'] / tbl['SPY_CAGR/UI', 'std'] 
tbl['dif_CAGR/UI_mean/std'] = tbl['grp_CAGR/UI_mean/std']  - tbl['SPY_CAGR/UI_mean/std'] 
# tbl.sort_values(by='dif_CAGR/UI_mean', ascending=False, inplace=True)
# tbl.sort_values(by='dif_CAGR/UI_mean/std', ascending=False, inplace=True)
# tbl

In [309]:
tbl.sort_values(by='dif_CAGR/UI_mean', ascending=False, inplace=True)
tbl.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR/UI)_mean,grp(CAGR/UI)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,SPY_CAGR,SPY_CAGR,SPY_CAGR/UI,SPY_CAGR/UI,SPY_CAGR/retnStd,SPY_CAGR/retnStd,dif_CAGR_mean,dif_CAGR/UI_mean,dif_CAGR/retnStd_mean,grp_CAGR/UI_mean/std,SPY_CAGR/UI_mean/std,dif_CAGR/UI_mean/std
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
days_lookbacks,days_eval,sym_freq_cnt,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
"[15, 30]",4,5,104,2.454632e+24,2.503243e+25,4.599074e+26,4.690154e+27,4.803255e+24,4.898378e+25,2.574747,6.302925,2418441.0,8783808.0,448.241677,1019.834724,2.454632e+24,4.599074e+26,4.803255e+24,0.098058,0.275329,-0.177271
[60],4,3,114,6.398038e+23,6.831235e+24,1.198756e+26,1.279921e+27,1.251976e+24,1.336745e+25,7.536041,22.871523,13228090.0,45296670.0,1307.275598,4387.109244,6.398038e+23,1.198756e+26,1.251976e+24,0.093659,0.292032,-0.198374
[30],4,3,114,4.976252e+23,5.313183e+24,9.323659e+25,9.954943e+26,9.737593e+23,1.03969e+25,4.760345,15.112558,7773749.0,29858330.0,727.4112,2181.618106,4.976252e+23,9.323659e+25,9.737593e+23,0.093659,0.260354,-0.166696
"[15, 30]",5,5,105,2.935294e+18,3.007782e+19,6.148805e+20,6.30065e+21,6.319247e+18,6.475301e+19,1.813687,6.763613,1672576.0,12280350.0,243.601503,968.427072,2.935294e+18,6.148805e+20,6.319247e+18,0.09759,0.136199,-0.038609
"[15, 30, 60]",4,6,106,1.912365e+17,1.968901e+18,2.307473e+18,2.375668e+19,3.441431e+17,3.54317e+18,4.216902,16.063055,7041599.0,31575380.0,798.395861,2965.385879,1.912365e+17,2.307473e+18,3.441431e+17,0.097129,0.223009,-0.12588
"[15, 30]",4,3,114,8.890821e+16,6.682656e+17,1.072793e+18,8.063263e+18,1.599964e+17,1.202589e+18,2.68435,6.544412,2778538.0,9916124.0,450.63101,999.468745,8.890821e+16,1.072793e+18,1.599964e+17,0.133047,0.280204,-0.147157
"[30, 60, 120]",4,5,111,121790600000.0,903185700000.0,2.435811e+17,1.806371e+18,1708456000000.0,12669720000000.0,4.522059,13.442306,4012496.0,24125770.0,609.44819,2510.590092,121790600000.0,2.435811e+17,1708456000000.0,0.134846,0.166316,-0.03147
"[15, 30, 60, 120]",4,4,114,78378290000.0,836834300000.0,1.567566e+17,1.673669e+18,974724400000.0,10406840000000.0,3.215575,10.068122,3096685.0,17248880.0,360.154669,1362.842834,78378290000.0,1.567566e+17,974724400000.0,0.09366,0.17953,-0.085869
"[15, 30]",4,4,107,50126220000.0,518262100000.0,1.002524e+17,1.036524e+18,624441600000.0,6444988000000.0,2.846915,6.717126,2960309.0,10211750.0,480.848517,1024.204382,50126220000.0,1.002524e+17,624441600000.0,0.09672,0.289892,-0.193173
"[15, 30]",5,4,108,18312040000.0,133933900000.0,4.094658e+16,2.994853e+17,257679000000.0,1884660000000.0,2.210721,7.094151,2385210.0,13353000.0,322.290375,1112.208677,18312040000.0,4.094658e+16,257679000000.0,0.136723,0.178627,-0.041904


In [311]:
tbl.sort_values(by='dif_CAGR/UI_mean/std', ascending=False, inplace=True)
tbl.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR/UI)_mean,grp(CAGR/UI)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,SPY_CAGR,SPY_CAGR,SPY_CAGR/UI,SPY_CAGR/UI,SPY_CAGR/retnStd,SPY_CAGR/retnStd,dif_CAGR_mean,dif_CAGR/UI_mean,dif_CAGR/retnStd_mean,grp_CAGR/UI_mean/std,SPY_CAGR/UI_mean/std,dif_CAGR/UI_mean/std
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
days_lookbacks,days_eval,sym_freq_cnt,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
[120],6,3,114,116.9878,473.903,10240520.0,39365720.0,3021.095,11534.12,0.991003,4.517483,2407.404,18709.29,97.512973,420.642091,115.9968,10238110.0,2923.582,0.260138,0.128674,0.131464
"[15, 30, 60, 120]",5,3,65,261.8358,1199.472,50780960.0,220959400.0,5881.227,21470.5,0.422828,3.098555,372779.3,3003652.0,52.006021,400.725874,261.413,50408180.0,5829.221,0.22982,0.124109,0.105712
[60],6,3,114,17608.99,184828.2,39247700.0,192226900.0,269725.8,2805724.0,0.451441,2.169756,96946.07,730542.6,64.831364,236.251096,17608.54,39150750.0,269661.0,0.204174,0.132704,0.07147
"[60, 120]",6,5,105,52.8426,331.5004,13754120.0,71498300.0,1537.399,7549.144,1.264082,3.611477,2810.695,19699.65,145.130803,406.3612,51.57851,13751310.0,1392.268,0.19237,0.142677,0.049693
"[15, 30, 60]",4,3,113,8370.809,63776.31,2465608000.0,9418088000.0,180710.8,1201675.0,4.013736,15.577586,6656980.0,30611880.0,773.190655,2876.787777,8366.795,2458951000.0,179937.6,0.261795,0.217464,0.044331
"[30, 60, 120]",5,6,97,7475.722,38576.68,1101221000.0,7109085000.0,126913.0,596341.6,1.951284,6.565162,1584444.0,12712770.0,263.664071,994.442857,7473.771,1099637000.0,126649.3,0.154903,0.124634,0.030269
"[15, 30, 60, 120]",6,6,111,11584.32,112222.5,160241200.0,955277300.0,198248.6,1933373.0,1.448681,4.248258,3354.951,21757.13,138.979687,408.595075,11582.87,160237900.0,198109.6,0.167743,0.1542,0.013543
"[15, 30, 60, 120]",5,6,109,30556.71,288778.4,690915300.0,3630255000.0,596877.0,5484165.0,2.843123,9.728369,3612450.0,20326620.0,378.759427,1507.423253,30553.87,687302900.0,596498.2,0.190321,0.17772,0.012601
"[60, 120]",6,6,90,25.93389,86.94045,32454980.0,195809500.0,1620.109,6077.012,1.345787,3.778391,3323.745,21277.81,156.464182,431.444075,24.5881,32451660.0,1463.644,0.165748,0.156207,0.009541
"[15, 30, 60]",6,6,106,409.5303,2199.62,299534500.0,2805462000.0,11789.84,68286.81,0.484897,2.597304,141738.7,1445963.0,61.1671,297.514299,409.0454,299392700.0,11728.67,0.106768,0.098024,0.008745


In [None]:
if verbose:
  # from IPython.display import display, HTML
  # from myUtils import symb_perf_stats_vectorized_v8
  df = df_eval.copy()
  df_SPY = df_SPY.copy()  
  _my_df = [df_eval, df_SPY]
  for _df in _my_df:
    # print(f'_df:\n{_df}')
    (
      symbols,
      period_yr,
      retn,
      DD,
      UI,
      MDD,
      retnMean,
      retnStd,
      retnStd_div_UI,
      CAGR,
      CAGR_div_retnStd,
      CAGR_div_UI,
      grp_retnStd_div_UI,     
      grp_CAGR,
      grp_CAGR_div_retnStd,
      grp_CAGR_div_UI,
    ) = symb_perf_stats_vectorized_v8(_df)

    # display(HTML(_df.to_html()))
    print(f'_df:\n{_df}\n')
    print(f'symbols:\n{symbols}\n')
    print(f'period_yr:\n{period_yr}\n')
    print(f'retn:\n{retn}\n')
    print(f'DD:\n{DD}\n')
    print(f'UI:\n{UI}\n')
    print(f'MDD:\n{MDD}\n')
    print(f'retnMean:\n{retnMean}\n')
    print(f'retnStd:\n{retnStd}\n')
    print(f'retnStd_div_UI:\n{retnStd_div_UI}\n')
    print(f'CAGR:\n{CAGR}\n')
    print(f'CAGR_div_retnStd:\n{CAGR_div_retnStd}\n')
    print(f'CAGR_div_UI:\n{CAGR_div_UI}\n')
    print(f'grp_retnStd_div_UI:\n{grp_retnStd_div_UI}\n')
    print(f'grp_CAGR:\n{grp_CAGR}\n')
    print(f'grp_CAGR_div_retnStd:\n{grp_CAGR_div_retnStd}\n')
    print(f'grp_CAGR_div_UI:\n{grp_CAGR_div_UI}\n')            

In [None]:
# # https://stackoverflow.com/questions/31674195/plot-normal-distribution-given-mean-and-sigma-python
# # loc is mean, scale is standard deviation
# import pylab
# import numpy as np
# from scipy.stats import norm
# # x = np.linspace(-10000,100000,1000)
# x = np.linspace(-40e+10,50e+10,1000)
# y = norm.pdf(x, loc=2.562777e+10, scale=1.036925e+11)    # loc = mean, scale = standard deviation
# # z = norm.pdf(x, loc=3.540615e+10, scale=1.194430e+11)    # for example
# # z1 = norm.pdf(x, loc=298.805901, scale=826.875749)    # for example
# # z1 = norm.pdf(x, loc=1.021825, scale=1.505096)    # for example
# pylab.plot(x,y, 'b')
# # pylab.plot(x,z, 'g')
# # pylab.plot(x,z1, 'r')
# pylab.show()

In [None]:
# # get values of _cols, where grp(CAGR/retnStd)_mean is max after filtering out inf
# _cols = ['grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std']
# # _df_no_inf = df.loc[df['grp(CAGR/retnStd)_mean'] != np.inf]  # df with filter out inf in column grp(CAGR/UI)_mean 
# # _idx = _df_no_inf['grp(CAGR/retnStd)_mean'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# _idx = df['grp(CAGR/retnStd)_mean'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# grp_inf_replacement = df.loc[[_idx], _cols].squeeze()  # convert df (only has 1 row) to series
# print(f'_idx: {_idx}')
# grp_inf_replacement


In [None]:
# # get values of _cols, where SPY_CAGR/retnStd is max after filtering out inf
# _cols = ['SPY_CAGR/retnStd']
# # _df_no_inf = df.loc[df['SPY_CAGR/retnStd'] != np.inf]  # df with filter out inf in column grp(CAGR/UI)_mean 
# _idx = df['SPY_CAGR/retnStd'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# SPY_inf_replacement = df.loc[[_idx], _cols].squeeze()  # convert df (only has 1 row) to series
# print(f'_idx: {_idx}')
# SPY_inf_replacement