In [84]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices
from yf_utils import _5_perf_ranks, _6_grp_tuples_sort_sum
from myUtils import pickle_load, pickle_dump

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 14)
pd.set_option('display.max_colwidth', 12)
pd.set_option('display.width', 800)

# verbose = False  # True prints more output
verbose = True  # True prints more output

path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"
fp_df_close_clean = 'df_close_clean'
fp_df_eval_results = 'df_eval_results'

df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

In [85]:
# Split df_close_clean into training (df_train), validation (df_val) and test (df_test) set.
# The default split is 0.7, 0.2, 0.1 respectively.

###################################
# df_train, df_val, df_test = _2_split_train_val_test(df_close_clean, s_train=1, s_val=0, s_test=0)
df_train, df_val, df_test = _2_split_train_val_test(df_close_clean)
###################################

len_df_train = len(df_train)
len_df_val = len(df_val)
len_df_test = len(df_test)
print(f'len_df_train: {len_df_train}, len_df_val: {len_df_val}, len_df_test: {len_df_test} ')

len_df_train: 1050, len_df_val: 300, len_df_test: 150 


In [86]:
# if store_results:  # record results to df
# my_cols = ['n_samples', 'days_lookbacks', 'days_eval', 'n_top_syms', 'syms_start', 'syms_end', 'grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std', 'SPY_CAGR/UI']
df_eval_results = pickle_load(path_data_dump, fp_df_eval_results)

In [87]:
# write run results to df_eval_results
store_results = False
# store_results = True

# number of max lookback tuples to create for iloc start_train:end_train:end_eval
# i.e. number of grp_top_set_syms_n_freq and grp_top_set_syms 
# n_samples = 400  
n_samples = 2

# for training, the number of days to lookback from iloc max-lookback end_train
# days_lookbacks = [15, 30, 60, 120]
# days_lookbacks = [30, 60, 120]
days_lookbacks = [60, 120]
# days_lookbacks = [120]
days_lookbacks.sort()

# number of days from end_train are used to evaluate effectiveness of the training
days_eval = 10  

# number of the most-common symbols from days_lookbacks' performance rankings to keep
# n_top_syms = 5
n_top_syms = 10  

syms_start = 0  #  start index of n_top_syms for evaluation
syms_end = 5  #  end index of n_top_syms for evaluation

create a sets of iloc lookback slices (start_train:end_train:end_eval), where  
end_train - start_train = days_lookback  
end_eval - end_train = days_eval  
for example,  
if given:  
 n_samples = 2  
 days_lookbacks = [30, 60, 120]  
 days_eval = 10  
a possible result is:  
 max_lookback_slices:  
 [(150, 270, 280), (5, 125, 135)]  
 where 270-150=125-5=max(days_lookbacks), 280-270=135-125=days_eval  
 sets_lookback_slices:  
 [[(240, 270, 280), (210, 270, 280), (150, 270, 280)], [(95, 125, 135), (65, 125, 135), (5, 125, 135)]]  
  where in a set, 270-240=days_lookbacks[0], 270-210=days_lookbacks[1], 270-150=days_lookbacks[2]  
  and 270, i.e. end_train, is constant for the set  

In [88]:
# return n_samples slices
max_lookback_slices = _3_random_slices(len_df_train, n_samples=n_samples, days_lookback=max(days_lookbacks), days_eval=days_eval)
# return n_samples * len(days_lookbacks) slices
sets_lookback_slices = _4_lookback_slices(max_slices=max_lookback_slices, days_lookbacks=days_lookbacks, verbose=False)

if verbose:
  print(f'max_lookback_slices:\n{max_lookback_slices}')
  print(f'sets_lookback_slices:\n{sets_lookback_slices}')

max_lookback_slices:
[(330, 450, 460), (67, 187, 197)]
sets_lookback_slices:
[[(390, 450, 460), (330, 450, 460)], [(127, 187, 197), (67, 187, 197)]]


Generate lists, n_samples long, of the highest performance ranked symbols. The performance metrics are: CAGR/UI, CAGR/retnStd, retnStd/UI. n_top_syms of the best performing symbols from each metric are combined. The symbols are sorted by their number of appearances in the combined pool, and are placed in a list. A slice of the best performing symbols is selected by syms_start:syms_end, i.e. top_set_syms_n_freq[syms_start:syms_end].     

The performance metrics are calculated based on slices in sets_lookback_slices.  The first two numbers are ilocs for training. The last two numbers are ilocs for evaluation.

In [89]:
grp_top_set_syms_n_freq = []  # list of lists of top_set_symbols_n_freq, there are n_samples lists in list
grp_top_set_syms = []  # list of lists of top_set_symbols, there are n_samples lists in list
# loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []  
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train = lb_slice[0]
    end_train = lb_slice[1]
    start_eval = end_train
    end_eval = lb_slice[2]
    lookback = end_train - start_train
    d_eval = end_eval - start_eval

    if verbose:
      print(f'days lookback: {lookback}')
      print(f'lb_slices:     {lb_slices}')
      print(f'lb_slice:      {lb_slice}')
      print(f'days eval:     {d_eval}')    
      print(f'start_train:   {start_train}')
      print(f'end_train:     {end_train}')
      # print(f'start_eval:    {start_eval}')
      # print(f'end_eval:      {end_eval}')`

    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _5_perf_ranks(_df, n_top_syms=n_top_syms)
    grp_most_common_syms.append(most_common_syms)
    
    if verbose:    
      # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
      print(f'perf_ranks: {perf_ranks}')  
      # most common symbols of perf_ranks 
      print(f'most_common_syms: {most_common_syms}')     
      # grp_perf_ranks[lookback] = perf_ranks
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    print(f'grp_most_common_syms: {grp_most_common_syms}')
    # grp_most_common_syms a is list of lists of tuples of 
    #  the most-common-symbols symbol:frequency cumulated from
    #  each days_lookback  
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # flatten list of lists of (symbol:frequency)
  flat_grp_most_common_syms = [val for sublist in grp_most_common_syms for val in sublist]
  # group symbols from set of days_lookbacks (i.e. lb_slices) and sum frequency of the symbols
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # get the top few most-frequent symbol:frequency pairs
  top_set_syms_n_freq = set_most_common_syms[0:n_top_syms]
  # get symbols from top_set_syms_n_freq

###################################  
  # top_set_syms = [i[0] for i in top_set_syms_n_freq]
  top_set_syms = [i[0] for i in top_set_syms_n_freq[syms_start:syms_end]]  
###################################  
  
  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:  
    print(f'top {n_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {n_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')  
    print(f'===== finish top {n_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')

days lookback: 60
lb_slices:     [(390, 450, 460), (330, 450, 460)]
lb_slice:      (390, 450, 460)
days eval:     10
start_train:   390
end_train:     450
perf_ranks: {'period-60': {'r_CAGR/UI': array(['FTSM', 'SBUX', 'LRN', 'LW', 'WBA', 'MKC', 'GOLD', 'PZZA', 'ULTA',
       'AU'], dtype=object), 'r_CAGR/retnStd': array(['LRN', 'SBUX', 'FTSM', 'HRL', 'MKC', 'CCEP', 'ULTA', 'AU', 'LW',
       'GOLD'], dtype=object), 'r_retnStd/UI': array(['FTSM', 'SBUX', 'WBA', 'LW', 'LRN', 'PZZA', 'MCD', 'MKC', 'TDS',
       'AAP'], dtype=object)}}
most_common_syms: [('FTSM', 3), ('SBUX', 3), ('LRN', 3), ('LW', 3), ('MKC', 3), ('WBA', 2), ('GOLD', 2), ('PZZA', 2), ('ULTA', 2), ('AU', 2), ('HRL', 1), ('CCEP', 1), ('MCD', 1), ('TDS', 1), ('AAP', 1)]
+++ finish lookback slice 60 +++

days lookback: 120
lb_slices:     [(390, 450, 460), (330, 450, 460)]
lb_slice:      (330, 450, 460)
days eval:     10
start_train:   330
end_train:     450
perf_ranks: {'period-120': {'r_CAGR/UI': array(['FTSM', 'MKC', 'CLX',

In [90]:
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for item in z_grp_top_set_syms:
  print(item)
print('')

z_grp_top_set_syms:
((330, 450, 460), ['FTSM', 'MKC', 'AAP', 'CCEP', 'CLX'])
((67, 187, 197), ['OSTK', 'PRGS', 'CAT', 'GDS', 'QNST'])



In [91]:
from yf_utils import _7_perf_eval

print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
# z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms[26:29])

for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):
# for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms[26:29]):

  start_train = _lookback_slice[0]
  end_train = _lookback_slice[1]
  start_eval = end_train
  end_eval = _lookback_slice[2]

  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  # dates correspond to max_lookback_slice
  d_start_train = df_train.index[start_train].strftime('%Y-%m-%d')
  d_end_train = df_train.index[end_train].strftime('%Y-%m-%d')
  d_start_eval = d_end_train
  d_end_eval = df_train.index[end_eval].strftime('%Y-%m-%d')
  d_df_eval_start = d_end_train
  d_df_eval_end = df_train.index[end_eval - 1].strftime('%Y-%m-%d')  
  print(f'max lookback dates: {d_start_train}, {d_end_train}, {d_end_eval}')
  print(f'df_eval dates (inclusive): {d_df_eval_start} - {d_df_eval_end}')    
  print(f'top_set_syms: {_top_set_syms}\n')

  df_eval = df_train[start_eval:end_eval][_top_set_syms]

  if verbose:
    # print(f'start_eval: {start_eval}')
    # print(f'end_eval:   {end_eval}')  
    print(f'start_eval: {start_eval},  date: {d_end_train}')
    print(f'end_eval:   {end_eval},  date: {d_end_eval},  df_eval last date: {d_df_eval_end}')      
    print(f'\ndf_eval:\n{df_eval}\n')


  _, grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI = _7_perf_eval(df_eval)
  print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]:>13,.3f}, {grp_retnStd_d_UI[1]:>13,.3f}, {grp_retnStd_d_UI[2]:>13,.3f}')
  print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>13,.3f}, {grp_CAGR_d_retnStd[1]:>13,.3f}, {grp_CAGR_d_retnStd[2]:>13,.3f}')
  print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]:>13,.3f}, {grp_CAGR_d_UI[1]:>13,.3f}, {grp_CAGR_d_UI[2]:>13,.3f}')

  _sym_idx = ['SPY']
  df_SPY = df_train[start_eval:end_eval][_sym_idx]
  _, SPY_retnStd_d_UI, SPY_CAGR_d_retnStd, SPY_CAGR_d_UI = _7_perf_eval(df_SPY)
  print(f'\nSPY: retnStd/UI, CAGR/retnStd, CAGR/UI: {SPY_retnStd_d_UI[0]:>13,.3f}, {SPY_CAGR_d_retnStd[0]:>13,.3f}, {SPY_CAGR_d_UI[0]:>13,.3f}')

  if store_results:  # record results to df
    row_add = [n_samples, str(days_lookbacks), days_eval, n_top_syms, syms_start, syms_end, grp_CAGR_d_UI[0], grp_CAGR_d_UI[1], grp_CAGR_d_UI[2], SPY_CAGR_d_UI[0]]
    df_eval_results.loc[len(df_eval_results)] = row_add
    print(f'appended row_add to df_eval_results:\n{row_add}\n')
  print('='*50, '\n')  

z_grp_top_set_syms:
1 of 2 max_lookback_slice
max_lookback_slice: (330, 450, 460)
max lookback dates: 2018-06-05, 2018-11-23, 2018-12-10
df_eval dates (inclusive): 2018-11-23 - 2018-12-07
top_set_syms: ['FTSM', 'MKC', 'AAP', 'CCEP', 'CLX']

start_eval: 450,  date: 2018-11-23
end_eval:   460,  date: 2018-12-10,  df_eval last date: 2018-12-07

df_eval:
                 FTSM        MKC         AAP       CCEP         CLX
Date                                                               
2018-11-23  57.552986  69.143875  166.839661  45.267357  145.605545
2018-11-26  57.533775  68.934311  167.291489  45.063618  146.052719
2018-11-27  57.543388  70.067825  168.750519  44.859875  148.762131
2018-11-28  57.533775  70.467888  168.449310  45.461838  147.921585
2018-11-29  57.533775  70.987015  167.244431  45.128445  148.484940
2018-11-30  57.533775  71.439476  167.282089  44.952488  148.100433
2018-12-03  57.533775  72.153854  167.451508  44.100479  147.805328
2018-12-04  57.543388  72.182419  1

In [None]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/

In [92]:
# df = pickle_load(path_data_dump, 'df_eval_results')
df = df_eval_results.copy()
# display(HTML(df.to_html()))

In [110]:
my_cols = ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'SPY_CAGR/UI']
# my_cols = ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std']
# my_cols = ['SPY_CAGR/UI']
_df = df[my_cols]
row_inf = _df.index[np.isinf(_df).any(axis=1)]
print(f'row iloc with inf:\n{row_inf}')
print(f'len(row iloc with inf):\n{len(row_inf)}')
print(f'len(_df): {len(_df)}')
# df_inf = df.iloc[row_inf].copy()  # df with inf in my_cols
# # display(HTML(df_inf.to_html()))
# df.drop(axis=0, index=row_inf, inplace=True)
# # display(HTML(df.to_html()))
# df.dropna(how='any', inplace=True)
# df

row iloc with inf:
Int64Index([25, 26, 38, 52, 65, 73, 76, 84, 100, 110, 131, 139, 149, 223, 225, 244, 266, 346, 349, 358, 384, 393, 405, 411, 412, 418, 419, 440, 473, 478, 496, 511, 525, 533, 535, 540, 554, 581, 586, 587, 590, 619, 623, 627, 629, 633, 681, 683, 684, 706, 727, 750, 763, 784], dtype='int64')
len(row iloc with inf):
54
len(_df): 800


In [111]:
# # https://stackoverflow.com/questions/50773107/how-to-replace-infinite-value-with-maximum-value-of-a-pandas-column
# m = df.loc[df['Crime_Rate'] != np.inf, 'Crime_Rate'].max()
# df['Crime_Rate'].replace(np.inf,m,inplace=True)

# filter out inf value from column grp(CAGR/UI)_mean,
#  than find max values in columns grp(CAGR/UI)_mean and grp(CAGR/UI)_std
m = _df.loc[_df['grp(CAGR/UI)_mean'] != np.inf, ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std']].max()
print(f'max grp(CAGR/UI)_mean:  {m["grp(CAGR/UI)_mean"]}')
print(f'max grp(CAGR/UI)_std]:  {m["grp(CAGR/UI)_std"]}')

# replace inf in column grp(CAGR/UI)_mean with max value excluding inf
_df['grp(CAGR/UI)_mean'].replace(np.inf, m['grp(CAGR/UI)_mean']).copy()
# replace inf in column grp(CAGR/UI)_stdn with max value excluding NaN
_df['grp(CAGR/UI)_std'].replace(np.nan, m['grp(CAGR/UI)_std']).copy()
_df

max grp(CAGR/UI)_mean:  444907257358.61584
max grp(CAGR/UI)_std]:  994841359375.5496


Unnamed: 0,grp(CAGR/UI)_mean,grp(CAGR/UI)_std,SPY_CAGR/UI
0,248.489978,584.843540,-30.375669
1,31.927844,60.229337,-10.581483
2,4008.399882,5423.419446,190.881313
3,3735.447231,8000.658763,289.043187
4,-6.004549,16.475508,-12.457626
...,...,...,...
795,33229.34...,71736.37...,53.836188
796,22.390279,82.528456,-18.271479
797,569.693718,1270.879342,657.333386
798,2093.482297,4283.497196,-7.457301


In [None]:
my_cols = ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'SPY_CAGR/UI']
_df = df[my_cols]
row_inf = _df.index[np.isinf(_df).any(axis=1)]
print(f'row iloc with inf:\n{row_inf}\n')
df_inf = df.iloc[row_inf].copy()  # df with inf in my_cols
# display(HTML(df_inf.to_html()))
df.drop(axis=0, index=row_inf, inplace=True)
# display(HTML(df.to_html()))
df.dropna(how='any', inplace=True)
df

In [112]:
df = _df.copy()

In [113]:
# pd.crosstab(df['days_lookbacks'], df['grp(CAGR/UI)_mean'])
tbl = df.groupby(['days_lookbacks'])\
        .agg({'grp(CAGR/UI)_mean': ['mean', 'std'],
              'grp(CAGR/UI)_mean/std': ['mean', 'std'],
              'SPY_CAGR/UI': ['mean', 'std']})

KeyError: 'days_lookbacks'

In [None]:
tbl

In [None]:
import pylab
import numpy as np
from scipy.stats import norm
x = np.linspace(-10,20,1000)
y = norm.pdf(x, loc=3.579270, scale=3.953039)    # for example
z = norm.pdf(x, loc=2.215328, scale=2.629466)    # for example
z1 = norm.pdf(x, loc=1.021825, scale=1.505096)    # for example
pylab.plot(x,y, 'b')
pylab.plot(x,z, 'r')
pylab.plot(x,z1, 'g')
pylab.show()

In [None]:
print(f'n_samples:             {n_samples:>13}')
print(f'days_lookbacks:        {days_lookbacks}')
print(f'days_eval:             {days_eval:>13}')
print(f'n_top_syms:            {n_top_syms:>13}')
print(f'syms_start:            {syms_start:>13}')
print(f'syms_end:              {syms_end:>13}')
print(f'grp(CAGR/UI)_mean:     {grp_CAGR_d_UI[0]:>13,.3f}')
print(f'grp(CAGR/UI)_std:      {grp_CAGR_d_UI[1]:>13,.3f}')
print(f'grp(CAGR/UI)_mean/std: {grp_CAGR_d_UI[2]:>13,.3f}')
print(f'SPY_CAGR/UI:           {SPY_CAGR_d_UI[0]:>13,.3f}')

In [None]:
pd.set_option('display.max_colwidth', 20)
_df
# _df1 = _df.sort_values(by=['grp(CAGR/UI)_mean']).head(100).copy()
_df1 = _df.sort_values(by=['grp(CAGR/UI)_mean']).tail(100).copy()
# display(HTML(_df1.to_html()))
_df1