In [41]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices
from yf_utils import _5_perf_ranks, _6_grp_tuples_sort_sum
from myUtils import pickle_load, pickle_dump

# pandas display limits applied whenever a frame is rendered in this notebook
for _option, _value in (
  ('display.max_rows', 500),
  ('display.max_columns', 14),
  ('display.max_colwidth', 16),
  ('display.width', 800),
):
  pd.set_option(_option, _value)

# flip to True to get the step-by-step diagnostic prints in the cells below
verbose = False

# data locations: the pickles live in the "VSCode_dump/" folder under path_dir
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"
# pickle names passed, together with path_data_dump, to pickle_load / pickle_dump
fp_df_close_clean, fp_df_eval_results = 'df_close_clean', 'df_eval_results'

# Load the cleaned close-price frame that every later cell works from.
# NOTE(review): pickle_load presumably unpickles the file -- only load pickles you created yourself.
df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

In [42]:
# Partition df_close_clean into training (df_train), validation (df_val) and
# test (df_test) frames. With no ratios passed, the helper's default split of
# 0.7 / 0.2 / 0.1 applies. To train on everything instead, call:
#   _2_split_train_val_test(df_close_clean, s_train=1, s_val=0, s_test=0)
df_train, df_val, df_test = _2_split_train_val_test(df_close_clean)

# row counts of each partition; len_df_train later bounds the random slices
len_df_train, len_df_val, len_df_test = map(len, (df_train, df_val, df_test))
print(f'len_df_train: {len_df_train}, len_df_val: {len_df_val}, len_df_test: {len_df_test} ')

len_df_train: 1050, len_df_val: 300, len_df_test: 150 


In [43]:
# Load the accumulated evaluation results; the evaluation loop further down
# appends one row per sample to this frame when store_results is True.
# The list below appears to be an earlier, shorter column layout -- kept for reference:
# my_cols = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_CAGR/UI']
df_eval_results = pickle_load(path_data_dump, fp_df_eval_results)

In [287]:
# ---- parameters for one evaluation run ----

# True: append this run's rows to df_eval_results; False: compute and print only
store_results = True

# number of max-lookback tuples (iloc start_train:end_train:end_eval) to create,
# i.e. the number of entries in grp_top_set_syms_n_freq and grp_top_set_syms
n_samples = 400  # use 2 for a quick trial run

# training windows, in days, counted back from end_train; kept in ascending order.
# other combinations tried: [15], [30], [60], [120], [15, 30], [60, 120],
# [30, 60, 120], [15, 30, 60, 120]
days_lookbacks = sorted([15, 30, 60])

# number of days after end_train used to evaluate the picks (alternative: 10)
days_eval = 5

# number of the most-common symbols from the lookback rankings to keep
n_top_syms = 20

# the kept symbols are evaluated on the slice [syms_start:syms_end];
# syms_end = n_top_syms would evaluate all of them
syms_start, syms_end = 0, 5

Create sets of iloc lookback slices (start_train:end_train:end_eval), where  
end_train - start_train = days_lookback  
end_eval - end_train = days_eval  
for example,  
if given:  
 n_samples = 2  
 days_lookbacks = [30, 60, 120]  
 days_eval = 10  
a possible result is:  
 max_lookback_slices:  
 [(150, 270, 280), (5, 125, 135)]  
 where 270-150=125-5=max(days_lookbacks), 280-270=135-125=days_eval  
 sets_lookback_slices:  
 [[(240, 270, 280), (210, 270, 280), (150, 270, 280)], [(95, 125, 135), (65, 125, 135), (5, 125, 135)]]  
  where in a set, 270-240=days_lookbacks[0], 270-210=days_lookbacks[1], 270-150=days_lookbacks[2]  
  and 270, i.e. end_train, is constant for the set  

In [288]:
# n_samples tuples of (start_train, end_train, end_eval), each sized for the
# longest lookback in days_lookbacks
max_lookback_slices = _3_random_slices(
  len_df_train,
  n_samples=n_samples,
  days_lookback=max(days_lookbacks),
  days_eval=days_eval,
)
# for every tuple above, one slice per entry in days_lookbacks
# (n_samples * len(days_lookbacks) slices in total)
sets_lookback_slices = _4_lookback_slices(
  max_slices=max_lookback_slices,
  days_lookbacks=days_lookbacks,
  verbose=False,
)

if verbose:
  for _label, _slices in (('max_lookback_slices', max_lookback_slices),
                          ('sets_lookback_slices', sets_lookback_slices)):
    print(f'{_label}:\n{_slices}')

Generate a list, n_samples long, of lists of the best-performing symbols. The performance metrics are: CAGR/UI, CAGR/retnStd, retnStd/UI. The n_top_syms best-performing symbols from each metric are pooled, sorted by how often each symbol appears in the pool, and placed in a list. A slice of that list is then selected with syms_start:syms_end, i.e. top_set_syms_n_freq[syms_start:syms_end].     

The performance metrics are calculated based on slices in sets_lookback_slices.  The first two numbers are ilocs for training. The last two numbers are ilocs for evaluation.

In [289]:
# Results, one entry per set in sets_lookback_slices (n_samples entries in all):
grp_top_set_syms_n_freq = []  # each entry: the set's top (symbol, frequency) pairs
grp_top_set_syms = []         # each entry: symbols taken from those pairs via syms_start:syms_end

# sets_lookback_slices holds lists of (start_train, end_train, end_eval) tuples, e.g.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#   [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []  # one most_common_syms result per lookback of this set

  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train, end_train, end_eval = lb_slice[0], lb_slice[1], lb_slice[2]
    start_eval = end_train  # evaluation begins where training stops
    lookback, d_eval = end_train - start_train, end_eval - start_eval

    if verbose:
      print(f'days lookback: {lookback}')
      print(f'lb_slices:     {lb_slices}')
      print(f'lb_slice:      {lb_slice}')
      print(f'days eval:     {d_eval}')
      print(f'start_train:   {start_train}')
      print(f'end_train:     {end_train}')

    # rank symbols using the training window only
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _5_perf_ranks(_df, n_top_syms=n_top_syms)
    grp_most_common_syms.append(most_common_syms)

    if verbose:
      # rankings for this one lookback: r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
      print(f'perf_ranks: {perf_ranks}')
      # symbols that show up most often across those rankings
      print(f'most_common_syms: {most_common_syms}')
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    # list (one item per days_lookback) of lists of symbol:frequency tuples
    print(f'grp_most_common_syms: {grp_most_common_syms}')
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # pool the per-lookback (symbol, frequency) tuples into one flat list
  flat_grp_most_common_syms = []
  for _syms_one_lookback in grp_most_common_syms:
    flat_grp_most_common_syms.extend(_syms_one_lookback)

  # group by symbol across the set's lookbacks and sum the frequencies
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # keep the leading n_top_syms symbol:frequency pairs ...
  top_set_syms_n_freq = set_most_common_syms[:n_top_syms]
  # ... and, from those, the symbols in positions syms_start:syms_end
  top_set_syms = [sym_n_freq[0] for sym_n_freq in top_set_syms_n_freq[syms_start:syms_end]]

  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:
    print(f'top {n_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {n_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')
    print(f'===== finish top {n_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')

In [290]:
# Show each max-lookback slice next to the symbols selected for it.
print('z_grp_top_set_syms:')
# zip() returns a one-shot iterator: the loop below exhausts it, so it has to be
# rebuilt before it can be iterated again.
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for item in z_grp_top_set_syms:
  print(item)

z_grp_top_set_syms:
((345, 405, 410), ['CR', 'DCI', 'FTSM', 'PBR', 'AMD', 'PKI', 'CI', 'FATE', 'NOG', 'NVAX', 'PLAY', 'AVGO', 'CCJ', 'CLF', 'ENS', 'FLS', 'FTNT', 'HCA', 'HP', 'IMO'])
((695, 755, 760), ['ENPH', 'FATE', 'KDNY', 'AY', 'NEA', 'SBAC', 'DPZ', 'EVBG', 'FTSM', 'BFZ', 'FNV', 'NEM', 'NVAX', 'CELH', 'DQ', 'NMZ', 'PCG', 'RNG', 'RUN', 'TSLA'])
((34, 94, 99), ['BGNE', 'NVCR', 'FOLD', 'CPRX', 'CYRX', 'PTCT', 'BLFS', 'BPMC', 'KDNY', 'ARCH', 'ATHM', 'AXSM', 'EXEL', 'IART', 'LGIH', 'MT', 'NNI', 'NVGS', 'PETS', 'RIO'])
((730, 790, 795), ['CNX', 'EQT', 'NVAX', 'NG', 'RRC', 'TDOC', 'UNFI', 'WING', 'AMZN', 'CTRA', 'FIVN', 'INCY', 'OSTK', 'PETS', 'REGN', 'ABM', 'BIG', 'CCJ', 'CLX', 'DG'])
((20, 80, 85), ['PTCT', 'NVCR', 'ALNY', 'AON', 'CYRX', 'MAA', 'PI', 'COLL', 'ENSG', 'GDEN', 'HALO', 'IRBT', 'IVR', 'JNJ', 'LAUR', 'LGIH', 'LII', 'MO', 'MOH', 'MYGN'])
((966, 1026, 1031), ['SJR', 'CLMT', 'FCN', 'HE', 'HIG', 'CAR', 'EIG', 'GME', 'LC', 'MBI', 'WBD', 'CPE', 'LOB', 'WSM', 'ACRS', 'AEE', 'AZO', '

In [291]:
# Column layout of one df_eval_results row, built up group by group.
# run parameters
col_add0 = [
  'n_samples', 'days_lookbacks', 'days_eval',
  'n_top_syms', 'syms_start', 'syms_end',
]
# per ratio: mean, std and mean/std over the evaluated group of symbols
col_add1 = ['grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std']
col_add2 = ['grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std']
col_add3 = ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std']
# the same three ratios for SPY alone
col_add4 = ['SPY_retnStd/UI', 'SPY_CAGR/retnStd', 'SPY_CAGR/UI']

# concatenate the groups in order; row values are appended in this same order
col_add_total = [
  name
  for part in (col_add0, col_add1, col_add2, col_add3, col_add4)
  for name in part
]
print(f'col_add_total: {col_add_total}')

col_add_total: ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std', 'grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_retnStd/UI', 'SPY_CAGR/retnStd', 'SPY_CAGR/UI']


In [292]:
# NOTE(review): notebook convention is to keep imports in the first cell; the import
# stays here so this cell still runs when executed on its own.
from yf_utils import _7_perf_eval

# For every sample: evaluate the selected symbols, and SPY as the benchmark, over the
# sample's evaluation window; print the ratios; and, when store_results is True,
# append one row (ordered like col_add_total) to df_eval_results.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
# A zip object cannot be subscripted. To spot-check a few samples, slice BOTH inputs
# so that slices and symbols stay aligned, e.g.
# z_grp_top_set_syms = zip(max_lookback_slices[26:29], grp_top_set_syms[26:29])

for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):

  # iloc positions: training window is [start_train, end_train),
  # evaluation window is [start_eval, end_eval)
  start_train = _lookback_slice[0]
  end_train = _lookback_slice[1]
  start_eval = end_train
  end_eval = _lookback_slice[2]

  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  # dates correspond to max_lookback_slice
  d_start_train = df_train.index[start_train].strftime('%Y-%m-%d')
  d_end_train = df_train.index[end_train].strftime('%Y-%m-%d')
  d_start_eval = d_end_train
  # NOTE(review): end_eval is an exclusive bound, so index[end_eval] raises IndexError
  # if a slice ends exactly at len(df_train) -- confirm _3_random_slices never returns that.
  d_end_eval = df_train.index[end_eval].strftime('%Y-%m-%d')
  d_df_eval_start = d_end_train
  # last row actually contained in the evaluation window
  d_df_eval_end = df_train.index[end_eval - 1].strftime('%Y-%m-%d')  
  print(f'max lookback dates: {d_start_train}, {d_end_train}, {d_end_eval}')
  print(f'df_eval dates (inclusive): {d_df_eval_start} - {d_df_eval_end}')    
  print(f'top_set_syms: {_top_set_syms}\n')

  # df_eval holds _top_set_syms along with SPY; work on a copy so the list stored in
  # grp_top_set_syms is not modified
  symbols_eval = _top_set_syms.copy()
  # skip the append when SPY is already one of the picks: a repeated label would
  # select the SPY column twice
  if 'SPY' not in symbols_eval:
    symbols_eval.append('SPY')
  print(f'symbols_eval: {symbols_eval}, _top_set_syms: {_top_set_syms}')

  # explicit positional slicing, the same way the training window is taken with .iloc
  _df_eval_window = df_train.iloc[start_eval:end_eval]
  df_eval = _df_eval_window[symbols_eval]

  if verbose:
    print(f'start_eval: {start_eval},  date: {d_end_train}')
    print(f'end_eval:   {end_eval},  date: {d_end_eval},  df_eval last date: {d_df_eval_end}')      
    print(f'\ndf_eval:\n{df_eval}\n')

  # NOTE(review): df_eval includes the SPY column, so unless _7_perf_eval excludes it
  # the "grp" statistics below also cover SPY -- confirm this is intended.
  # Each returned grp_* value is indexed as [0]=mean, [1]=std, [2]=mean/std.
  _, grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI = _7_perf_eval(df_eval)
  print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]  :>13,.3f}, {grp_retnStd_d_UI[1]  :>13,.3f}, {grp_retnStd_d_UI[2]  :>13,.3f}')
  print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>13,.3f}, {grp_CAGR_d_retnStd[1]:>13,.3f}, {grp_CAGR_d_retnStd[2]:>13,.3f}')
  print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]     :>13,.3f}, {grp_CAGR_d_UI[1]     :>13,.3f}, {grp_CAGR_d_UI[2]     :>13,.3f}')

  # benchmark: the same evaluation on SPY alone; only element [0] of each result is used
  _sym_idx = ['SPY']
  df_SPY = _df_eval_window[_sym_idx]
  _, SPY_retnStd_d_UI, SPY_CAGR_d_retnStd, SPY_CAGR_d_UI = _7_perf_eval(df_SPY)
  print(f'\nSPY: retnStd/UI, CAGR/retnStd, CAGR/UI: {SPY_retnStd_d_UI[0]:>13,.3f}, {SPY_CAGR_d_retnStd[0]:>13,.3f}, {SPY_CAGR_d_UI[0]:>13,.3f}')

  if store_results:  # record results to df
    # value order must match the column order of df_eval_results (see col_add_total)
    row_add0      = [n_samples, str(days_lookbacks), days_eval, n_top_syms, syms_start, syms_end]
    row_add1      = [grp_retnStd_d_UI[0],   grp_retnStd_d_UI[1],   grp_retnStd_d_UI[2]]
    row_add2      = [grp_CAGR_d_retnStd[0], grp_CAGR_d_retnStd[1], grp_CAGR_d_retnStd[2]]
    row_add3      = [grp_CAGR_d_UI[0],      grp_CAGR_d_UI[1],      grp_CAGR_d_UI[2]]
    row_add4      = [SPY_retnStd_d_UI[0],   SPY_CAGR_d_retnStd[0], SPY_CAGR_d_UI[0]]
    row_add_total = row_add0 + row_add1 + row_add2 + row_add3 + row_add4
    print(f'row_add_total: {row_add_total}')
    # NOTE(review): .loc[len(df)] only appends while the index is the default 0..n-1;
    # with any other index it may overwrite an existing row -- confirm the stored frame.
    df_eval_results.loc[len(df_eval_results)] = row_add_total
    print(f'appended row_add to df_eval_results:\n{row_add_total}\n')
  print('='*50, '\n')  

z_grp_top_set_syms:
1 of 400 max_lookback_slice
max_lookback_slice: (345, 405, 410)
max lookback dates: 2018-07-11, 2018-10-04, 2018-10-11
df_eval dates (inclusive): 2018-10-04 - 2018-10-10
top_set_syms: ['CR', 'DCI', 'FTSM', 'PBR', 'AMD', 'PKI', 'CI', 'FATE', 'NOG', 'NVAX', 'PLAY', 'AVGO', 'CCJ', 'CLF', 'ENS', 'FLS', 'FTNT', 'HCA', 'HP', 'IMO']

symbols_eval: ['CR', 'DCI', 'FTSM', 'PBR', 'AMD', 'PKI', 'CI', 'FATE', 'NOG', 'NVAX', 'PLAY', 'AVGO', 'CCJ', 'CLF', 'ENS', 'FLS', 'FTNT', 'HCA', 'HP', 'IMO', 'SPY'], _top_set_syms: ['CR', 'DCI', 'FTSM', 'PBR', 'AMD', 'PKI', 'CI', 'FATE', 'NOG', 'NVAX', 'PLAY', 'AVGO', 'CCJ', 'CLF', 'ENS', 'FLS', 'FTNT', 'HCA', 'HP', 'IMO']
grp(retnStd/UI):   mean, std, mean/std:         1.025,         0.806,         1.271
grp(CAGR/retnStd): mean, std, mean/std:        59.839,       447.000,         0.134
grp(CAGR/UI):      mean, std, mean/std:       230.403,     1,117.619,         0.206

SPY: retnStd/UI, CAGR/retnStd, CAGR/UI:         0.832,       -58.041,    

  arr_returns_std_div_arr_UI = arr_returns_std / arr_UI
  arr_CAGR_div_arr_Std = arr_CAGR / arr_returns_std
  arr_CAGR_div_arr_UI = arr_CAGR / arr_UI



SPY: retnStd/UI, CAGR/retnStd, CAGR/UI:         1.540,        35.521,        54.718
row_add_total: [400, '[15, 30, 60]', 5, 20, 0, 20, 1.2331328081058306, 0.8901393712428322, 1.3853255433293592, 19.876897363283664, 187.93832253336564, 0.10576287526326526, 153.00913280076603, 439.8768684028536, 0.3478453717203317, 1.5404179451116806, 35.52149909640318, 54.717954645367804]
appended row_add to df_eval_results:
[400, '[15, 30, 60]', 5, 20, 0, 20, 1.2331328081058306, 0.8901393712428322, 1.3853255433293592, 19.876897363283664, 187.93832253336564, 0.10576287526326526, 153.00913280076603, 439.8768684028536, 0.3478453717203317, 1.5404179451116806, 35.52149909640318, 54.717954645367804]


273 of 400 max_lookback_slice
max_lookback_slice: (161, 221, 226)
max lookback dates: 2017-10-16, 2018-01-11, 2018-01-19
df_eval dates (inclusive): 2018-01-11 - 2018-01-18
top_set_syms: ['CMC', 'NOG', 'SAGE', 'CEM', 'DVA', 'FATE', 'HAL', 'ATHM', 'TECK', 'VCEL', 'WLK', 'BOOT', 'CRI', 'DINO', 'FDX', 'NVAX', 'PTC

In [293]:
# Persist df_eval_results (including any rows appended by the evaluation loop),
# then read the pickle back as df for the analysis cells below.
# To start over with an empty results frame instead, run:
# df_eval_results = pd.DataFrame(columns=col_add_total)
pickle_dump(df_eval_results, path_data_dump, fp_df_eval_results)
df = pickle_load(path_data_dump, fp_df_eval_results)
df

Unnamed: 0,n_samples,days_lookbacks,days_eval,n_top_syms,syms_start,syms_end,grp(retnStd/UI)_mean,...,grp(CAGR/retnStd)_mean/std,grp(CAGR/UI)_mean,grp(CAGR/UI)_std,grp(CAGR/UI)_mean/std,SPY_retnStd/UI,SPY_CAGR/retnStd,SPY_CAGR/UI
0,400,"[15, 30, 60,...",10,20,0,20,1.108222,...,0.278516,1.233041e+03,4819.519128,0.255843,1.237703,52.961179,65.550204
1,400,"[15, 30, 60,...",10,20,0,20,2.548395,...,0.256072,5.087090e+04,155754.011507,0.326611,1.008170,19.357624,19.515773
2,400,"[15, 30, 60,...",10,20,0,20,0.635840,...,-0.570477,-3.047098e+00,52.327578,-0.058231,0.766355,-57.849946,-44.333608
3,400,"[15, 30, 60,...",10,20,0,20,1.278825,...,0.364307,1.155709e+03,3027.321602,0.381760,2.549837,158.783768,404.872663
4,400,"[15, 30, 60,...",10,20,0,20,0.614480,...,-0.259152,6.407780e+00,71.436772,0.089699,0.515219,-39.893368,-20.553813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,400,"[15, 30, 60]",5,20,0,20,inf,...,0.426706,inf,,,inf,524.269716,inf
6396,400,"[15, 30, 60]",5,20,0,20,inf,...,0.219874,inf,,,2.246044,54.602126,122.638766
6397,400,"[15, 30, 60]",5,20,0,20,6.651440,...,0.303381,1.285453e+04,57734.208766,0.222650,2.345959,80.895109,189.776636
6398,400,"[15, 30, 60]",5,20,0,20,inf,...,0.219264,inf,,,2.214027,65.503261,145.025986


In [294]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/

In [295]:
# Sanity check: list the columns of the reloaded results frame and how many there are.
print(f'len(df.columns): {len(df.columns)}')
print(f'df.columns: {df.columns}')

len(df.columns): 18
df.columns: Index(['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(retnStd/UI)_mean', 'grp(retnStd/UI)_std', 'grp(retnStd/UI)_mean/std', 'grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_retnStd/UI', 'SPY_CAGR/retnStd', 'SPY_CAGR/UI'], dtype='object')


In [296]:
# Summarise the stored runs per parameter combination: the mean and std, across
# samples, of the group's CAGR/retnStd mean, of its mean/std, and of SPY's CAGR/retnStd.
# (earlier attempt: pd.crosstab(df['days_lookbacks'], df['grp(CAGR/UI)_mean']))
_group_keys = ['days_lookbacks', 'days_eval', 'syms_start', 'syms_end']
_agg_spec = {
  _col: ['mean', 'std']
  for _col in ('grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_mean/std', 'SPY_CAGR/retnStd')
}
tbl = df.groupby(_group_keys).agg(_agg_spec)

In [297]:
# render the aggregated summary table
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean/std,grp(CAGR/retnStd)_mean/std,SPY_CAGR/retnStd,SPY_CAGR/retnStd
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std
days_lookbacks,days_eval,syms_start,syms_end,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
[120],5,0,20,2.97484e+16,5.710088e+17,0.264614,0.299683,187.351137,838.811051
[120],10,0,20,5.094196e+22,1.011342e+24,0.206123,0.46592,70.452072,126.538973
"[15, 30, 60, 120]",5,0,20,1.240942e+42,2.4818829999999997e+43,0.253898,0.256363,271.346679,1335.441749
"[15, 30, 60, 120]",10,0,20,2508091000.0,38445580000.0,0.269642,0.3051,87.429111,145.005302
"[15, 30, 60]",5,0,20,9.380732e+19,1.857795e+21,0.271626,0.229689,203.076578,784.449669
"[15, 30]",5,0,20,5.94888e+27,1.181415e+29,0.249452,0.283829,211.491198,1074.058772
[15],5,0,20,2.4818830000000002e+42,3.505512e+43,0.252584,0.319888,138.83881,653.274146
[15],10,0,20,2526167000000000.0,4.887995e+16,0.251577,0.392747,59.841324,97.860429
"[30, 60, 120]",5,0,20,1.153026e+20,2.304597e+21,0.271093,0.246074,170.224752,454.826023
"[30, 60, 120]",10,0,20,3.753346e+20,7.506643e+21,0.274576,0.344481,79.760402,138.78803


In [298]:
# # https://stackoverflow.com/questions/31674195/plot-normal-distribution-given-mean-and-sigma-python
# # loc is mean, scale is standard deviation
# import pylab
# import numpy as np
# from scipy.stats import norm
# # x = np.linspace(-10000,100000,1000)
# x = np.linspace(-40e+10,50e+10,1000)
# y = norm.pdf(x, loc=2.562777e+10, scale=1.036925e+11)    # loc = mean, scale = standard deviation
# # z = norm.pdf(x, loc=3.540615e+10, scale=1.194430e+11)    # for example
# # z1 = norm.pdf(x, loc=298.805901, scale=826.875749)    # for example
# # z1 = norm.pdf(x, loc=1.021825, scale=1.505096)    # for example
# pylab.plot(x,y, 'b')
# # pylab.plot(x,z, 'g')
# # pylab.plot(x,z1, 'r')
# pylab.show()

In [299]:
# # get values of _cols, where grp(CAGR/retnStd)_mean is max after filtering out inf
# _cols = ['grp(CAGR/retnStd)_mean', 'grp(CAGR/retnStd)_std', 'grp(CAGR/retnStd)_mean/std']
# # _df_no_inf = df.loc[df['grp(CAGR/retnStd)_mean'] != np.inf]  # df with filter out inf in column grp(CAGR/UI)_mean 
# # _idx = _df_no_inf['grp(CAGR/retnStd)_mean'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# _idx = df['grp(CAGR/retnStd)_mean'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# grp_inf_replacement = df.loc[[_idx], _cols].squeeze()  # convert df (only has 1 row) to series
# print(f'_idx: {_idx}')
# grp_inf_replacement


In [300]:
# # get values of _cols, where SPY_CAGR/retnStd is max after filtering out inf
# _cols = ['SPY_CAGR/retnStd']
# # _df_no_inf = df.loc[df['SPY_CAGR/retnStd'] != np.inf]  # df with filter out inf in column grp(CAGR/UI)_mean 
# _idx = df['SPY_CAGR/retnStd'].idxmax()  # index value of max in grp(CAGR/UI)_mean 
# SPY_inf_replacement = df.loc[[_idx], _cols].squeeze()  # convert df (only has 1 row) to series
# print(f'_idx: {_idx}')
# SPY_inf_replacement

In [301]:
# # # https://stackoverflow.com/questions/50773107/how-to-replace-infinite-value-with-maximum-value-of-a-pandas-column
# # replace inf in column grp(CAGR/UI)_mean
# df['grp(CAGR/UI)_mean'].replace(np.inf, grp_inf_replacement['grp(CAGR/UI)_mean'], inplace=True)
# # replace NaN in column grp(CAGR/UI)_std
# df['grp(CAGR/UI)_std'].replace(np.nan, grp_inf_replacement['grp(CAGR/UI)_std'], inplace=True)
# # replace NaN in column grp(CAGR/UI)_mean/std
# df['grp(CAGR/UI)_mean/std'].replace(np.nan, grp_inf_replacement['grp(CAGR/UI)_mean/std'], inplace=True)
# # replace inf in column SPY_CAGR/UI
# df['SPY_CAGR/UI'].replace(np.inf, SPY_inf_replacement, inplace=True)
# df
# df

In [302]:
# print(f'n_samples:             {n_samples:>13}')
# print(f'days_lookbacks:        {days_lookbacks}')
# print(f'days_eval:             {days_eval:>13}')
# print(f'n_top_syms:            {n_top_syms:>13}')
# print(f'syms_start:            {syms_start:>13}')
# print(f'syms_end:              {syms_end:>13}')
# print(f'grp(CAGR/UI)_mean:     {grp_CAGR_d_UI[0]:>13,.3f}')
# print(f'grp(CAGR/UI)_std:      {grp_CAGR_d_UI[1]:>13,.3f}')
# print(f'grp(CAGR/UI)_mean/std: {grp_CAGR_d_UI[2]:>13,.3f}')
# print(f'SPY_CAGR/UI:           {SPY_CAGR_d_UI[0]:>13,.3f}')