In [2]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/

In [3]:
import pandas as pd
import datetime
from myUtils import pickle_load, pickle_dump

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', 18)
pd.set_option('display.width', 790)

path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"

fp_df_eval_sym_freq_results_train = 'df_eval_sym_freq_results_train_2023-03-14'
fp_df_eval_sym_freq_results_validate = 'df_eval_sym_freq_results_validate_2023-03-14'
fp_df_eval_sym_freq_results_test = 'df_eval_sym_freq_results_test_2023-03-14'

str_today = datetime.date.today().strftime('%Y-%m-%d')
fp_results = f'yf_8_results_{str_today}'

In [4]:
run_types = ['train', 'validate', 'test']

In [5]:
for run_type in run_types:
  if run_type == 'train':
    df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_train)
    name_table = 'tbl_train'
  elif run_type == 'validate':
    df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_validate)
    name_table = 'tbl_val'
  elif run_type == 'test':
    df = pickle_load(path_data_dump, fp_df_eval_sym_freq_results_test)
    name_table = 'tbl_test'
  else:
    msg_stop = f"ERROR run_type must be 'train', 'validate', or 'test', run_type is: {run_type}"
    raise SystemExit(msg_stop )

  tbl = df.groupby(['days_lookbacks', 'days_eval', 'sym_freq_cnt'])\
          .agg({'grp(CAGR)_mean':          ['count', 'mean', 'std',],
                'grp(CAGR/UI)_mean':       ['mean', 'std',],
                'grp(CAGR/retnStd)_mean':  ['mean', 'std',],
                'SPY_CAGR':                ['mean', 'std',],
                'SPY_CAGR/UI':             ['mean', 'std',],
                'SPY_CAGR/retnStd':        ['mean', 'std',],                           
                })
  
  tbl['dif_CAGR_mean'] = tbl['grp(CAGR)_mean', 'mean'] - tbl['SPY_CAGR', 'mean']
  tbl['dif_CAGR/UI_mean'] = tbl['grp(CAGR/UI)_mean', 'mean'] - tbl['SPY_CAGR/UI', 'mean'] 
  tbl['dif_CAGR/retnStd_mean'] = tbl['grp(CAGR/retnStd)_mean', 'mean'] - tbl['SPY_CAGR/retnStd', 'mean'] 
  tbl['grp_CAGR/UI_mean/std'] = tbl['grp(CAGR/UI)_mean', 'mean'] / tbl['grp(CAGR/UI)_mean', 'std']  
  tbl['SPY_CAGR/UI_mean/std'] = tbl['SPY_CAGR/UI', 'mean'] / tbl['SPY_CAGR/UI', 'std'] 
  tbl['dif_CAGR/UI_mean/std'] = tbl['grp_CAGR/UI_mean/std']  - tbl['SPY_CAGR/UI_mean/std']
  tbl['pct_rank_d_CAGR/UI_m/s'] = tbl['dif_CAGR/UI_mean/std'].rank(pct=True)
  tbl['run_type'] = run_type
  
  pickle_dump(tbl, path_data_dump, name_table)
  print(f'saved table for run_type: {run_type} to {path_data_dump}{name_table}') 


saved table for run_type: train to C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/tbl_train
saved table for run_type: validate to C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/tbl_val
saved table for run_type: test to C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/tbl_test


In [6]:
tbl_train = pickle_load(path_data_dump, 'tbl.train')
tbl_val = pickle_load(path_data_dump, 'tbl.validate')
tbl_test = pickle_load(path_data_dump, 'tbl.test')
print(f'len(tbl_train): {len(tbl_train)}')
print(f'len(tbl_val): {len(tbl_val)}')
print(f'len(tbl_test): {len(tbl_test)}')

len(tbl_train): 120
len(tbl_val): 117
len(tbl_test): 120


In [15]:
tbl_concat = pd.concat([tbl_train, tbl_val, tbl_test])
tbl_concat.sort_values(by=["days_lookbacks", "days_eval", "sym_freq_cnt"])
tbl_concat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR/UI)_mean,grp(CAGR/UI)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,SPY_CAGR,SPY_CAGR,SPY_CAGR/UI,SPY_CAGR/UI,SPY_CAGR/retnStd,SPY_CAGR/retnStd,dif_CAGR_mean,dif_CAGR/UI_mean,dif_CAGR/retnStd_mean,grp_CAGR/UI_mean/std,SPY_CAGR/UI_mean/std,dif_CAGR/UI_mean/std,pct_rank_d_CAGR/UI_m/s,run_type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
days_lookbacks,days_eval,sym_freq_cnt,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
"[15, 30, 60, 120]",4,9,231,30190270.0,218506200.0,39210390000000.0,298893200000000.0,418121500.0,2607273000.0,2607.00571,39310.500243,5210077000.0,78621260000.0,84429.356152,1268486.0,30187660.0,39205180000000.0,418037100.0,0.131185,0.066268,0.064917,1.0,train
"[15, 30, 60]",4,9,139,39000.34,206457.6,74772540000.0,412059000000.0,6465760.0,49888260.0,8598.48994,71407.917197,17194770000.0,142816100000.0,277647.576742,2304214.0,30401.85,57577770000.0,6188112.0,0.181461,0.120398,0.061063,0.991667,train
"[30, 60, 120]",4,9,128,140571.4,979721.0,267421700000.0,1958404000000.0,8855914.0,66613970.0,4701.93781,52807.975285,9401742000.0,105616100000.0,152256.166211,1704007.0,135869.4,258019900000.0,8703658.0,0.136551,0.089018,0.047533,0.983333,train
"[30, 60, 120]",4,6,365,2752167000.0,42298830000.0,228043600000000.0,2417940000000000.0,26521220000.0,402327300000.0,1650.419329,31273.269372,3299372000.0,62546620000.0,53943.101151,1009131.0,2752165000.0,228040300000000.0,26521170000.0,0.094313,0.052751,0.041563,0.975,train
"[15, 30, 60, 120]",4,5,397,6.854585e+26,9.64522e+27,6.192848e+28,8.615225e+29,1.46417e+27,2.060262e+28,1517.577527,29986.454471,3032301000.0,59973050000.0,49326.115525,967611.7,6.854585e+26,6.192848e+28,1.46417e+27,0.071883,0.050561,0.021322,0.966667,train


#### These are the best performing run parameters for days_lookbacks, days_eval, sym_freq_cnt. The performance metric is "pct_rank_d_CAGR/UI_m/s". The metric first calculate the difference between the group's "CAGR/UI_mean/CAGR/UI_std" and the SPY's "CAGR/UI_mean/CAGR/UI_std" for run_type "train", "validate" and "test". It does a percentile rank of the difference for each run_type. The hightest ranking is 1. For a set of run parameters, i.e. days_lookbacks, days_eval, sym_freq_cnt, the sum of "pct_rank_d_CAGR/UI_m/s" for the three run_types is 3 maximum.              

In [16]:
yf_8_results = tbl_concat.groupby(level=[0,1,2]).sum(numeric_only=True).sort_values('pct_rank_d_CAGR/UI_m/s', ascending=False)
yf_8_results.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR)_mean,grp(CAGR/UI)_mean,grp(CAGR/UI)_mean,grp(CAGR/retnStd)_mean,grp(CAGR/retnStd)_mean,SPY_CAGR,SPY_CAGR,SPY_CAGR/UI,SPY_CAGR/UI,SPY_CAGR/retnStd,SPY_CAGR/retnStd,dif_CAGR_mean,dif_CAGR/UI_mean,dif_CAGR/retnStd_mean,grp_CAGR/UI_mean/std,SPY_CAGR/UI_mean/std,dif_CAGR/UI_mean/std,pct_rank_d_CAGR/UI_m/s
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
days_lookbacks,days_eval,sym_freq_cnt,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
"[30, 60, 120]",4,8,287,2.2513410000000004e+31,3.495021e+32,1.016651e+32,1.578266e+33,2.9812000000000003e+31,4.5991710000000005e+32,2506.100201,38499.17889,5003596000.0,76993650000.0,82666.934367,1244125.0,2.2513410000000004e+31,1.016651e+32,2.9812000000000003e+31,1.46021,0.712894,0.747316,2.674145
"[15, 30, 60, 120]",5,7,454,2.210351e+54,3.966332e+55,4.9424940000000004e+60,8.868989e+61,5.0382950000000005e+54,9.040896999999999e+55,19.609688,205.238892,13129770.0,187218700.0,1086.509738,7030.503,2.210351e+54,4.9424940000000004e+60,5.0382950000000005e+54,1.109533,0.546658,0.562875,2.598932
"[30, 60, 120]",4,9,159,140614.0,979834.1,267472300000.0,1958628000000.0,8857921.0,66619590.0,4711.536608,52823.851676,9405926000.0,105635300000.0,153574.866921,1706507.0,135902.5,258066300000.0,8704346.0,1.524426,1.352416,0.172009,2.545726
"[30, 60, 120]",4,4,561,3.010245e+28,4.251791e+29,2.688766e+30,3.7977210000000003e+31,6.552447e+28,9.254939999999999e+29,1511.058721,29887.662537,3016252000.0,59771740000.0,50506.064853,968365.8,3.010245e+28,2.688766e+30,6.552447e+28,0.799411,0.49501,0.304401,2.447436
"[30, 60, 120]",6,9,168,15736.23,166571.6,552485200.0,6108887000.0,204187.7,1964075.0,7.046354,16.209956,247727.2,2547085.0,893.586793,1053.718,15729.18,552237500.0,203294.1,2.647089,2.278452,0.368637,2.41453
"[15, 30, 60, 120]",4,5,568,6.854585e+26,9.64522e+27,6.192848e+28,8.615225e+29,1.46417e+27,2.060262e+28,1525.013913,30009.444253,3042862000.0,60017760000.0,50928.917784,973981.8,6.854585e+26,6.192848e+28,1.46417e+27,0.560918,0.503351,0.057567,2.368803
"[30, 60, 120]",5,7,345,55276980000.0,608812600000.0,7209697000000.0,71382980000000.0,249402800000.0,2741740000000.0,5.955643,16.701344,1092793.0,11758400.0,669.995134,2126.812,55276980000.0,7209696000000.0,249402800000.0,1.372698,0.696621,0.676077,2.299359
"[30, 60, 120]",4,7,333,5028244000.0,77492310000.0,1.005648e+16,1.549846e+17,39492520000.0,605274300000.0,2534.822299,38744.46798,5064882000.0,77486480000.0,83737.799857,1255225.0,5028241000.0,1.005648e+16,39492440000.0,1.295681,0.587684,0.707997,2.266239


In [17]:
pickle_dump(yf_8_results, path_data_dump, fp_results)
print(f'yf_8_results save to: {path_data_dump}{fp_results}')

yf_8_results save to: C:/Users/ping/MyDrive/stocks/yfinance/VSCode_dump/yf_8_results_2023-03-13


In [18]:
# # select by days_lookbacks, days_eval, sym_freq_cnt
# tbl_concat.loc['[15, 30, 60, 120]', 4, 9]