In [122]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices
from yf_utils import _5_perf_ranks, _6_grp_tuples_sort_sum
from myUtils import pickle_load, pickle_dump

# pandas display options for the tables shown in this notebook
for _option, _value in (
  ('display.max_rows', 500),
  ('display.max_columns', 14),
  ('display.max_colwidth', 16),
  ('display.width', 800),
):
  pd.set_option(_option, _value)

# set to True to print intermediate results in the cells below
verbose = False

# directory holding the pickled DataFrames (absolute, machine-specific path)
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"
# pickle names, passed to pickle_load / pickle_dump together with path_data_dump
fp_df_close_clean, fp_df_eval_results = 'df_close_clean', 'df_eval_results'

# Load df_close_clean from the pickle named fp_df_close_clean in path_data_dump.
# NOTE(review): presumably pickle_load unpickles the file; only load pickles you created yourself - confirm.
df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

In [123]:
# Split df_close_clean into a training (df_train), validation (df_val) and test (df_test) set.
# No split sizes are passed, so the helper's defaults apply (0.7, 0.2, 0.1 respectively).
# Alternative that puts every row into the training set:
# df_train, df_val, df_test = _2_split_train_val_test(df_close_clean, s_train=1, s_val=0, s_test=0)
df_train, df_val, df_test = _2_split_train_val_test(df_close_clean)

# row counts of the three sets; len_df_train bounds the random slices drawn later
len_df_train, len_df_val, len_df_test = map(len, (df_train, df_val, df_test))
print(f'len_df_train: {len_df_train}, len_df_val: {len_df_val}, len_df_test: {len_df_test} ')

len_df_train: 1050, len_df_val: 300, len_df_test: 150 


In [124]:
# Load the evaluation results accumulated by earlier runs. The evaluation cell further down
# adds its result rows to this DataFrame when store_results is True, and the cell after it
# pickles the DataFrame again.
# NOTE(review): presumably this fails when the pickle does not exist yet; the commented-out
# pd.DataFrame(columns=col_add_total) line in the save cell creates an empty frame for a first run - confirm.
# if store_results:  # record results to df
# my_cols = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_CAGR/UI']
df_eval_results = pickle_load(path_data_dump, fp_df_eval_results)

In [125]:
# Run parameters. Every value set here is recorded in each row written to df_eval_results.

# write run results to df_eval_results
# store_results = False
store_results = True

# number of max lookback tuples to create for iloc start_train:end_train:end_eval
# i.e. number of grp_top_set_syms_n_freq and grp_top_set_syms
n_samples = 400
# n_samples = 2

# for training, the number of days to lookback from iloc max-lookback end_train.
# Keep exactly ONE active assignment: stacking several assignments silently uses only the
# last one. Swap in one of the alternatives below for another run.
days_lookbacks = [15, 30]
# days_lookbacks = [15, 30, 60, 120]
# days_lookbacks = [30, 60, 120]
# days_lookbacks = [60, 120]
# days_lookbacks = [120]
# days_lookbacks = [60]
# days_lookbacks = [30]
# days_lookbacks = [15]
# days_lookbacks = [15, 29]
# days_lookbacks = [15, 31]
# days_lookbacks = [14, 30]
# days_lookbacks = [16, 32]
# days_lookbacks = [14, 28]

days_lookbacks.sort()  # ascending order

# number of days from end_train are used to evaluate effectiveness of the training
# alternatives: 10, 4, 3, 2
days_eval = 5

# number of the most-common symbols from days_lookbacks' performance rankings to keep
n_top_syms = 20

# start index of n_top_syms for evaluation (alternative: 1)
syms_start = 0

# end index of n_top_syms for evaluation (alternatives: n_top_syms, 1, 2, 3, 4, 5, 6)
syms_end = 10

# Fail early on inconsistent parameters instead of recording misleading rows:
# the evaluated symbols are top_set_syms_n_freq[syms_start:syms_end], and that list holds at
# most n_top_syms entries, so a larger syms_end would be truncated without notice.
assert n_samples > 0, 'n_samples must be positive'
assert days_lookbacks and all(d > 0 for d in days_lookbacks), 'days_lookbacks must hold positive day counts'
assert days_eval > 0, 'days_eval must be positive'
assert 0 <= syms_start < syms_end <= n_top_syms, 'need 0 <= syms_start < syms_end <= n_top_syms'

Create sets of iloc lookback slices (start_train:end_train:end_eval), where  
end_train - start_train = days_lookback  
end_eval - end_train = days_eval  
for example,  
if given:  
 n_samples = 2  
 days_lookbacks = [30, 60, 120]  
 days_eval = 10  
a possible result is:  
 max_lookback_slices:  
 [(150, 270, 280), (5, 125, 135)]  
 where 270-150=125-5=max(days_lookbacks), 280-270=135-125=days_eval  
 sets_lookback_slices:  
 [[(240, 270, 280), (210, 270, 280), (150, 270, 280)], [(95, 125, 135), (65, 125, 135), (5, 125, 135)]]  
  where in a set, 270-240=days_lookbacks[0], 270-210=days_lookbacks[1], 270-150=days_lookbacks[2]  
  and 270, i.e. end_train, is constant for the set  

In [126]:
# Draw the slices for the longest lookback first (the helper returns n_samples slices) ...
# NOTE(review): no random seed is set in this notebook, so presumably every run draws different slices - confirm.
max_lookback_slices = _3_random_slices(
  len_df_train,
  n_samples=n_samples,
  days_lookback=max(days_lookbacks),
  days_eval=days_eval,
)
# ... then derive one slice per entry of days_lookbacks from each of them
# (the helper returns n_samples * len(days_lookbacks) slices).
sets_lookback_slices = _4_lookback_slices(
  max_slices=max_lookback_slices,
  days_lookbacks=days_lookbacks,
  verbose=False,
)

if verbose:
  print(
    f'max_lookback_slices:\n{max_lookback_slices}',
    f'sets_lookback_slices:\n{sets_lookback_slices}',
    sep='\n',
  )

Generate a list, n_samples long, of lists of the highest performance-ranked symbols. The performance metrics are CAGR/UI, CAGR/retnStd and retnStd/UI. The n_top_syms best-performing symbols from each metric are combined into one pool. The symbols are then sorted by their number of appearances in the combined pool and placed in a list. A slice of the best-performing symbols is selected with syms_start:syms_end, i.e. top_set_syms_n_freq[syms_start:syms_end].

The performance metrics are calculated on the slices in sets_lookback_slices. In each slice (start_train, end_train, end_eval), the first two numbers are the ilocs of the training window and the last two numbers are the ilocs of the evaluation window.

In [127]:
# Accumulators, one entry per set of lookback slices (i.e. n_samples entries each).
grp_top_set_syms_n_freq = []  # per set: the top symbol:frequency pairs
grp_top_set_syms = []  # per set: symbols only, restricted to [syms_start:syms_end]

# Every element of sets_lookback_slices is a list of start_train:end_train:end_eval tuples, e.g.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#   [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []  # one most_common_syms result per lookback slice of this set

  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train, end_train, end_eval = lb_slice[0], lb_slice[1], lb_slice[2]
    start_eval = end_train  # the evaluation window starts where the training window ends
    lookback = end_train - start_train
    d_eval = end_eval - start_eval

    if verbose:
      print(
        f'days lookback: {lookback}',
        f'lb_slices:     {lb_slices}',
        f'lb_slice:      {lb_slice}',
        f'days eval:     {d_eval}',
        f'start_train:   {start_train}',
        f'end_train:     {end_train}',
        sep='\n',
      )

    # rank the symbols on the training window only
    df_lookback = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _5_perf_ranks(df_lookback, n_top_syms=n_top_syms)
    grp_most_common_syms.append(most_common_syms)

    if verbose:
      print(
        # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
        f'perf_ranks: {perf_ranks}',
        # most common symbols of perf_ranks
        f'most_common_syms: {most_common_syms}',
        f'+++ finish lookback slice {lookback} +++\n',
        sep='\n',
      )

  if verbose:
    # grp_most_common_syms is a list of lists of tuples of the most-common-symbols
    #  symbol:frequency cumulated from each days_lookback
    print(
      f'grp_most_common_syms: {grp_most_common_syms}',
      f'**** finish lookback slices {lb_slices} ****\n',
      sep='\n',
    )

  # flatten the list of lists of (symbol:frequency) into one list
  flat_grp_most_common_syms = []
  for sym_freq_pairs in grp_most_common_syms:
    flat_grp_most_common_syms.extend(sym_freq_pairs)

  # group symbols from the set of days_lookbacks (i.e. lb_slices) and sum the frequency of the symbols
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # keep the top few most-frequent symbol:frequency pairs
  top_set_syms_n_freq = set_most_common_syms[:n_top_syms]
  # symbols only, narrowed to the evaluation slice syms_start:syms_end
  top_set_syms = [sym_freq[0] for sym_freq in top_set_syms_n_freq[syms_start:syms_end]]

  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:
    print(
      f'top {n_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}',
      f'top {n_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}',
      f'===== finish top {n_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n',
      sep='\n',
    )

In [128]:
# Show each max-lookback slice next to the symbols selected for it, one pair per line.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for lookback_slice, top_syms in z_grp_top_set_syms:
  print((lookback_slice, top_syms))

z_grp_top_set_syms:
((292, 322, 327), ['SIG', 'CATO', 'TNDM', 'AVXL', 'CYRX', 'W', 'ALGN', 'DDS', 'MED', 'PLAY'])
((474, 504, 509), ['JYNT', 'LSCC', 'NXST', 'OLED', 'W', 'CAR', 'GRMN', 'QURE', 'AGYS', 'LNTH'])
((136, 166, 171), ['MERC', 'SHYF', 'OSTK', 'QURE', 'ACLS', 'AMP', 'DHI', 'ETR', 'MKSI', 'NNI'])
((294, 324, 329), ['AVXL', 'SIG', 'TNDM', 'CYRX', 'RARE', 'ALGN', 'ETSY', 'MED', 'PI', 'PLAY'])
((365, 395, 400), ['DCI', 'MCFT', 'SBH', 'CLF', 'EW', 'TFX', 'NVAX', 'AMD', 'AU', 'CARA'])
((314, 344, 349), ['CARA', 'RAMP', 'KDP', 'NTRA', 'RDWR', 'RPM', 'ARWR', 'BLFS', 'MKC', 'XENE'])
((793, 823, 828), ['CHRD', 'CPE', 'ENLC', 'MGI', 'ROCC', 'CELH', 'CWH', 'STKL', 'TA', 'TRGP'])
((248, 278, 283), ['CATO', 'TNDM', 'GES', 'MOV', 'ANF', 'NSTG', 'RCII', 'AMBC', 'ANGO', 'AXON'])
((923, 953, 958), ['FATE', 'BCRX', 'NTLA', 'PERI', 'BPT', 'CPE', 'JWN', 'NOG', 'NSTG', 'OXY'])
((576, 606, 611), ['AMKR', 'PERI', 'CSTM', 'FCN', 'NVCR', 'OSTK', 'SSRM', 'ENTG', 'MAT', 'MU'])
((194, 224, 229), ['STX', '

In [129]:
# Column names of df_eval_results, in the order the evaluation loop assembles its result rows.
_metrics = ('retnStd/UI', 'CAGR/retnStd', 'CAGR/UI')
_stats = ('mean', 'std', 'mean/std')

# run parameters
col_add0 = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end']
# group statistics: one mean / std / mean/std triple per metric
col_add1, col_add2, col_add3 = (
  [f'grp({_metric})_{_stat}' for _stat in _stats] for _metric in _metrics
)
# SPY benchmark: one value per metric
col_add4 = [f'SPY_{_metric}' for _metric in _metrics]
col_add_total = [*col_add0, *col_add1, *col_add2, *col_add3, *col_add4]
print(f'col_add_total: {col_add_total}')

col_add_total: ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std', 'grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_retnStd/UI', 'SPY_CAGR/retnStd', 'SPY_CAGR/UI']


In [130]:
# Evaluate every selected symbol group, and SPY as the benchmark, over the evaluation window
# that follows its training window; when store_results is True the result rows are added to
# df_eval_results.
from yf_utils import _7_perf_eval  # NOTE(review): consider moving next to the other yf_utils imports in the first cell

print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
# To evaluate only a sub-range, slice both inputs before zipping (a zip object cannot be sliced):
# z_grp_top_set_syms = zip(max_lookback_slices[26:29], grp_top_set_syms[26:29])

# Result rows are collected here and added to df_eval_results in ONE step after the loop.
# Enlarging a DataFrame with .loc[len(df)] once per iteration re-allocates it every time and
# overwrites an existing row whenever the index is not exactly 0..n-1.
rows_eval_results = []
n_rows_df_train = len(df_train)

for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):

  start_train = _lookback_slice[0]
  end_train = _lookback_slice[1]
  start_eval = end_train
  end_eval = _lookback_slice[2]

  print(f'{i + 1} of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  # dates correspond to max_lookback_slice
  d_start_train = df_train.index[start_train].strftime('%Y-%m-%d')
  d_end_train = df_train.index[end_train].strftime('%Y-%m-%d')
  d_start_eval = d_end_train
  # end_eval is an exclusive slice end, so it can legitimately equal len(df_train);
  # looking up its date unguarded would then raise IndexError
  if end_eval < n_rows_df_train:
    d_end_eval = df_train.index[end_eval].strftime('%Y-%m-%d')
  else:
    d_end_eval = 'n/a (past last row of df_train)'
  d_df_eval_start = d_end_train
  d_df_eval_end = df_train.index[end_eval - 1].strftime('%Y-%m-%d')
  print(f'max lookback dates: {d_start_train}, {d_end_train}, {d_end_eval}')
  print(f'df_eval dates (inclusive): {d_df_eval_start} - {d_df_eval_end}')
  print(f'top_set_syms: {_top_set_syms}\n')

  # rows of the evaluation window, selected by position with .iloc like the training windows
  df_eval_window = df_train.iloc[start_eval:end_eval]

  # df_eval to show _top_set_syms along with SPY
  symbols_eval = _top_set_syms.copy()  # copy so the list stored in grp_top_set_syms is not modified
  symbols_eval.append('SPY')
  print(f'symbols_eval: {symbols_eval}, _top_set_syms: {_top_set_syms}')
  df_eval = df_eval_window[symbols_eval]

  if verbose:
    print(f'start_eval: {start_eval},  date: {d_end_train}')
    print(f'end_eval:   {end_eval},  date: {d_end_eval},  df_eval last date: {d_df_eval_end}')
    print(f'\ndf_eval:\n{df_eval}\n')

  # each grp_* result is indexed as [0]=mean, [1]=std, [2]=mean/std (see the labels printed below)
  _, grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI = _7_perf_eval(df_eval)
  print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]  :>13,.3f}, {grp_retnStd_d_UI[1]  :>13,.3f}, {grp_retnStd_d_UI[2]  :>13,.3f}')
  print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>13,.3f}, {grp_CAGR_d_retnStd[1]:>13,.3f}, {grp_CAGR_d_retnStd[2]:>13,.3f}')
  print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]     :>13,.3f}, {grp_CAGR_d_UI[1]     :>13,.3f}, {grp_CAGR_d_UI[2]     :>13,.3f}')

  # benchmark: SPY alone over the same evaluation window; only element [0] of each result is kept
  _sym_idx = ['SPY']
  df_SPY = df_eval_window[_sym_idx]
  _, SPY_retnStd_d_UI, SPY_CAGR_d_retnStd, SPY_CAGR_d_UI = _7_perf_eval(df_SPY)
  print(f'\nSPY: retnStd/UI, CAGR/retnStd, CAGR/UI: {SPY_retnStd_d_UI[0]:>13,.3f}, {SPY_CAGR_d_retnStd[0]:>13,.3f}, {SPY_CAGR_d_UI[0]:>13,.3f}')

  if store_results:  # queue the result row; values follow the column order of df_eval_results
    row_add0      = [n_samples, str(days_lookbacks), days_eval, n_top_syms, syms_start, syms_end]
    row_add1      = [grp_retnStd_d_UI[0],   grp_retnStd_d_UI[1],   grp_retnStd_d_UI[2]]
    row_add2      = [grp_CAGR_d_retnStd[0], grp_CAGR_d_retnStd[1], grp_CAGR_d_retnStd[2]]
    row_add3      = [grp_CAGR_d_UI[0],      grp_CAGR_d_UI[1],      grp_CAGR_d_UI[2]]
    row_add4      = [SPY_retnStd_d_UI[0],   SPY_CAGR_d_retnStd[0], SPY_CAGR_d_UI[0]]
    row_add_total = row_add0 + row_add1 + row_add2 + row_add3 + row_add4
    print(f'row_add_total: {row_add_total}')
    rows_eval_results.append(row_add_total)
  print('='*50, '\n')

# Add all queued rows at once. Nothing is added when store_results is False or when the loop
# raised, so df_eval_results never holds the rows of a partially finished run.
if rows_eval_results:
  df_new_results = pd.DataFrame(rows_eval_results, columns=df_eval_results.columns)
  if df_eval_results.empty:
    # avoid concatenating with an empty frame, which would decide the dtypes of the result
    df_eval_results = df_new_results
  else:
    df_eval_results = pd.concat([df_eval_results, df_new_results], ignore_index=True)
  print(f'appended {len(rows_eval_results)} rows to df_eval_results, new length: {len(df_eval_results)}')

z_grp_top_set_syms:
1 of 400 max_lookback_slice
max_lookback_slice: (292, 322, 327)
max lookback dates: 2018-05-01, 2018-06-13, 2018-06-20
df_eval dates (inclusive): 2018-06-13 - 2018-06-19
top_set_syms: ['SIG', 'CATO', 'TNDM', 'AVXL', 'CYRX', 'W', 'ALGN', 'DDS', 'MED', 'PLAY']

symbols_eval: ['SIG', 'CATO', 'TNDM', 'AVXL', 'CYRX', 'W', 'ALGN', 'DDS', 'MED', 'PLAY', 'SPY'], _top_set_syms: ['SIG', 'CATO', 'TNDM', 'AVXL', 'CYRX', 'W', 'ALGN', 'DDS', 'MED', 'PLAY']
grp(retnStd/UI):   mean, std, mean/std:           inf,           nan,           nan
grp(CAGR/retnStd): mean, std, mean/std:     1,018.394,     1,934.363,         0.526
grp(CAGR/UI):      mean, std, mean/std:           inf,           nan,           nan

SPY: retnStd/UI, CAGR/retnStd, CAGR/UI:         0.749,       -78.225,       -58.593
row_add_total: [400, '[15, 30]', 5, 20, 0, 10, inf, nan, nan, 1018.394078045316, 1934.3626014629738, 0.5264752726686798, inf, nan, nan, 0.749028221164424, -78.2250634591159, -58.592780133255765]
a

In [131]:
# One-time initialisation for a first run (kept commented out): start from an empty
# DataFrame that has the result columns.
# # Creating Empty DataFrame and Storing it in variable df
# df_eval_results = pd.DataFrame(columns=col_add_total)
# Persist the results accumulated so far, including the rows added by the evaluation cell above ...
pickle_dump(df_eval_results, path_data_dump, fp_df_eval_results)
# ... and read the pickle back as df, the frame analysed in the cells below.
df = pickle_load(path_data_dump, fp_df_eval_results)
# display the reloaded results
df

Unnamed: 0,n_samples,days_lookbacks,days_eval,n_top_syms,syms_start,syms_end,grp(retnStd/UI)_mean,...,grp(CAGR/retnStd)_mean/std,grp(CAGR/UI)_mean,grp(CAGR/UI)_std,grp(CAGR/UI)_mean/std,SPY_retnStd/UI,SPY_CAGR/retnStd,SPY_CAGR/UI
0,400,"[15, 30, 60,...",10,20,0,20,1.108222,...,0.278516,1.233041e+03,4819.519128,0.255843,1.237703,52.961179,65.550204
1,400,"[15, 30, 60,...",10,20,0,20,2.548395,...,0.256072,5.087090e+04,155754.011507,0.326611,1.008170,19.357624,19.515773
2,400,"[15, 30, 60,...",10,20,0,20,0.635840,...,-0.570477,-3.047098e+00,52.327578,-0.058231,0.766355,-57.849946,-44.333608
3,400,"[15, 30, 60,...",10,20,0,20,1.278825,...,0.364307,1.155709e+03,3027.321602,0.381760,2.549837,158.783768,404.872663
4,400,"[15, 30, 60,...",10,20,0,20,0.614480,...,-0.259152,6.407780e+00,71.436772,0.089699,0.515219,-39.893368,-20.553813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20395,400,"[15, 30]",5,20,0,10,1.839872,...,0.301995,9.694942e+03,31868.875119,0.304213,0.668525,-73.401901,-49.071040
20396,400,"[15, 30]",5,20,0,10,inf,...,0.433374,inf,,,5.055090,125.413425,633.976131
20397,400,"[15, 30]",5,20,0,10,1.663279,...,0.634661,3.949548e+02,591.880072,0.667289,2.344939,63.165100,148.118328
20398,400,"[15, 30]",5,20,0,10,inf,...,0.293500,inf,,,2.556020,33.249721,84.986956


In [132]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/

In [133]:
# number of columns in the reloaded results, followed by the column names
print(
  f'len(df.columns): {len(df.columns)}',
  f'df.columns: {df.columns}',
  sep='\n',
)

len(df.columns): 18
df.columns: Index(['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std', 'grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_retnStd/UI', 'SPY_CAGR/retnStd', 'SPY_CAGR/UI'], dtype='object')


In [134]:
# pd.crosstab(df['days_lookbacks'], df['grp(CAGR/UI)_mean'])
# Summarise the stored runs: one row per parameter combination, with mean and std of the
# CAGR/retnStd columns of the symbol groups and of SPY.
tbl = df.groupby(['days_lookbacks', 'days_eval', 'syms_start', 'syms_end']).agg(
  {
    col: ['mean', 'std']
    for col in (
      'grp(CAGR/retnStd)_mean',
      'grp(CAGR/retnStd)_mean/std',
      'SPY_CAGR/retnStd',
    )
  }
)

In [135]:
# display the grouped summary built in the previous cell
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean/std,grp(CAGR/retnStd)_mean/std,SPY_CAGR/retnStd,SPY_CAGR/retnStd
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std
days_lookbacks,days_eval,syms_start,syms_end,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
[120],5,0,5,500186.9,4229700.0,0.329713,0.785886,123.411555,281.441718
[120],5,0,10,2147785000000.0,41042170000000.0,0.289564,0.604811,178.953439,758.264793
[120],5,0,20,2.97484e+16,5.710088e+17,0.264614,0.299683,187.351137,838.811051
[120],10,0,20,5.094196e+22,1.011342e+24,0.206123,0.46592,70.452072,126.538973
"[14, 28]",5,0,5,2.06746e+28,4.134919999999999e+29,0.293027,0.646296,122.474102,415.265147
"[14, 30]",5,0,5,4.13492e+28,5.8403279999999994e+29,0.298647,0.726377,158.319994,548.438491
"[15, 29]",5,0,5,1.928587e+20,3.85458e+21,0.293753,0.625667,140.524341,418.463158
"[15, 30, 60, 120]",5,0,5,683277400000000.0,1.335791e+16,0.3611,0.488997,152.503422,457.155518
"[15, 30, 60, 120]",5,0,10,2.3690700000000002e+42,4.738141e+43,0.332299,0.319749,159.414856,452.971314
"[15, 30, 60, 120]",5,0,20,1.240942e+42,2.4818829999999997e+43,0.253898,0.256363,271.346679,1335.441749


In [142]:
# SPY's mean/std of CAGR/retnStd per parameter combination
spy_mean_over_std = tbl[('SPY_CAGR/retnStd', 'mean')] / tbl[('SPY_CAGR/retnStd', 'std')]
tbl['SPY_m/s'] = spy_mean_over_std
# difference between the groups' average mean/std and SPY's mean/std
tbl['grp-SPY_m/s'] = tbl[('grp(CAGR/retnStd)_mean/std', 'mean')] - spy_mean_over_std
# largest difference first
tbl = tbl.sort_values(by='grp-SPY_m/s', ascending=False)
# tbl = tbl.sort_values(by='days_lookbacks', ascending=False)
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean/std,grp(CAGR/retnStd)_mean/std,SPY_CAGR/retnStd,SPY_CAGR/retnStd,SPY_m/s,grp-SPY_m/s
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,Unnamed: 10_level_1,Unnamed: 11_level_1
days_lookbacks,days_eval,syms_start,syms_end,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
"[15, 30]",4,0,5,8.893989e+21,1.778798e+23,0.300491,0.558977,249.857544,1265.922671,0.197372,0.10312
"[15, 30]",3,0,5,3.32982e+52,4.703172e+53,0.246861,0.510462,495.058526,3441.429066,0.143853,0.103008
"[15, 30]",5,0,5,2.171648e+42,6.142348e+43,0.302272,0.62442,200.934012,961.705891,0.208935,0.093337
"[15, 30, 60]",5,0,10,95961450000.0,1888727000000.0,0.258161,0.435701,182.68506,1106.975253,0.165031,0.09313
"[30, 60, 120]",5,0,5,3.104123e+18,6.20621e+19,0.322799,0.574396,150.334122,652.335303,0.230455,0.092344
[60],5,0,20,3.702018e+16,7.393521e+17,0.280251,0.231756,170.03274,886.58758,0.191783,0.088468
"[15, 30]",5,0,10,2.3690700000000002e+42,4.735175e+43,0.274047,0.462036,151.102979,809.621633,0.186634,0.087412
"[60, 120]",5,0,10,2.199847e+20,4.399692e+21,0.309345,0.406279,146.126775,656.037742,0.222741,0.086604
[30],5,0,10,1.127706e+28,2.255411e+29,0.268133,0.476813,182.727889,976.932044,0.187043,0.08109
"[15, 30, 60]",5,0,5,1.210186e+21,1.393582e+22,0.331214,0.603319,153.87963,606.821152,0.253583,0.07763


In [137]:
# # https://stackoverflow.com/questions/31674195/plot-normal-distribution-given-mean-and-sigma-python
# # loc is mean, scale is standard deviation
# import pylab
# import numpy as np
# from scipy.stats import norm
# # x = np.linspace(-10000,100000,1000)
# x = np.linspace(-40e+10,50e+10,1000)
# y = norm.pdf(x, loc=2.562777e+10, scale=1.036925e+11)    # loc = mean, scale = standard deviation
# # z = norm.pdf(x, loc=3.540615e+10, scale=1.194430e+11)    # for example
# # z1 = norm.pdf(x, loc=298.805901, scale=826.875749)    # for example
# # z1 = norm.pdf(x, loc=1.021825, scale=1.505096)    # for example
# pylab.plot(x,y, 'b')
# # pylab.plot(x,z, 'g')
# # pylab.plot(x,z1, 'r')
# pylab.show()

In [138]:
# # get values of _cols, where grp(CAGR/retnStd)_mean is max after filtering out inf
# _cols = ['grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std']
# # _df_no_inf = df.loc[df['grp(CAGR/retnStd)_mean'] != np.inf]  # df with filter out inf in column grp(CAGR/UI)_mean 
# # _idx = _df_no_inf['grp(CAGR/retnStd)_mean'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# _idx = df['grp(CAGR/retnStd)_mean'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# grp_inf_replacement = df.loc[[_idx], _cols].squeeze()  # convert df (only has 1 row) to series
# print(f'_idx: {_idx}')
# grp_inf_replacement


In [139]:
# # get values of _cols, where SPY_CAGR/retnStd is max after filtering out inf
# _cols = ['SPY_CAGR/retnStd']
# # _df_no_inf = df.loc[df['SPY_CAGR/retnStd'] != np.inf]  # df with filter out inf in column grp(CAGR/UI)_mean 
# _idx = df['SPY_CAGR/retnStd'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# SPY_inf_replacement = df.loc[[_idx], _cols].squeeze()  # convert df (only has 1 row) to series
# print(f'_idx: {_idx}')
# SPY_inf_replacement

In [140]:
# # # https://stackoverflow.com/questions/50773107/how-to-replace-infinite-value-with-maximum-value-of-a-pandas-column
# # replace inf in column grp(CAGR/UI)_mean
# df['grp(CAGR/UI)_mean'].replace(np.inf, grp_inf_replacement['grp(CAGR/UI)_mean'], inplace=True)
# # replace NaN in column grp(CAGR/UI)_std
# df['grp(CAGR/UI)_std'].replace(np.nan, grp_inf_replacement['grp(CAGR/UI)_std'], inplace=True)
# # replace NaN in column grp(CAGR/UI)_mean/std
# df['grp(CAGR/UI)_mean/std'].replace(np.nan, grp_inf_replacement['grp(CAGR/UI)_mean/std'], inplace=True)
# # replace inf in column SPY_CAGR/UI
# df['SPY_CAGR/UI'].replace(np.inf, SPY_inf_replacement, inplace=True)
# df
# df

In [141]:
# print(f'n_samples:             {n_samples:>13}')
# print(f'days_lookbacks:        {days_lookbacks}')
# print(f'days_eval:             {days_eval:>13}')
# print(f'n_top_syms:            {n_top_syms:>13}')
# print(f'syms_start:            {syms_start:>13}')
# print(f'syms_end:              {syms_end:>13}')
# print(f'grp(CAGR/UI)_mean:     {grp_CAGR_d_UI[0]:>13,.3f}')
# print(f'grp(CAGR/UI)_std:      {grp_CAGR_d_UI[1]:>13,.3f}')
# print(f'grp(CAGR/UI)_mean/std: {grp_CAGR_d_UI[2]:>13,.3f}')
# print(f'SPY_CAGR/UI:           {SPY_CAGR_d_UI[0]:>13,.3f}')