In [12]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices, _5_perf_ranks
from yf_utils import _6_grp_tuples_sort_sum

# Notebook configuration: verbosity flag, data directories and pickle names.
# verbose gates the diagnostic print() calls in the cells that follow.
verbose = False  # True prints more output
# verbose = True  # True prints more output

# NOTE(review): absolute, machine-specific path -- the notebook only runs as-is
# where this directory layout exists.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"

filename_symbols = path_data_dump + 'vg_symbols_4chars_max.csv'  # symbols text file
# NOTE(review): the literal below ends with a trailing space -- confirm that the
# pickle on disk really carries that name before relying on it.
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV =  'symbols_df_adjOHLCV'  # symbols in df_adjOHLCV
filename_pickled_df_c = 'df_close_clean'  # pickle read into df_c via pickle_load

In [13]:
# Read the pickled df_c frame from the dump directory and split it three ways.
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)
# Row counts of each split; len_df_train is later passed to _3_random_slices.
len_df_train, len_df_val, len_df_test = len(df_train), len(df_val), len(df_test)

In [14]:
# Sampling parameters used by the slice-generation and ranking cells below.
n_samples = 4  # number of tuples to create for iloc start_train:end_train:end_eval
# n_samples = 200  # number of tuples to create for iloc start_train:end_train:end_eval
days_lookbacks = [30, 120, 60]  # lookback lengths (days) for training; max() of this list sizes the sampled slices
days_eval = 15  # number of days to evaluate effectiveness of the training
set_top_syms = 5  # how many of the most-common symbols to keep from days_lookbacks' performance rankings

In [15]:
# Create n_samples tuples of iloc start_train:end_train:end_eval, sized by the
# largest value in days_lookbacks, then build the per-lookback slice sets from them.
max_lookback_slices = _3_random_slices(
  len_df_train,
  n_samples=n_samples,
  days_lookback=max(days_lookbacks),
  days_eval=days_eval,
)
sets_lookback_slices = _4_lookback_slices(
  max_slices=max_lookback_slices,
  days_lookbacks=days_lookbacks,
  verbose=False,
)

if verbose:
  # Both diagnostics go out in one call; sep='\n' keeps them on separate lines.
  print(
    f'max_lookback_slices:\n{max_lookback_slices}',
    f'sets_lookback_slices:\n{sets_lookback_slices}',
    sep='\n',
  )

In [16]:
# For every set of lookback slices (one set per sample), rank symbols over each
# lookback window of df_train, pool the most-common symbols of all windows in
# the set, and keep the set_top_syms symbols with the highest summed frequency.
n_top_syms_per_lookback = 10  # n_top_syms handed to _5_perf_ranks for every lookback slice

grp_top_set_syms_n_freq = []  # list of lists of top_set_symbols_n_freq, there are n_samples lists in list
grp_top_set_syms = []  # list of lists of top_set_symbols, there are n_samples lists in list
# loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train = lb_slice[0]
    end_train = lb_slice[1]
    start_eval = end_train  # evaluation window starts where training ends
    end_eval = lb_slice[2]
    lookback = end_train - start_train
    # Named days_eval_slice (not `eval`) so the builtin eval() is not shadowed
    # in the notebook's global namespace.
    days_eval_slice = end_eval - start_eval

    if verbose:
      print(f'lb_slices:     {lb_slices}')
      print(f'lb_slice:      {lb_slice}')
      print(f'days lookback: {lookback}')
      print(f'days eval:     {days_eval_slice}')
      print(f'start_train:   {start_train}')
      print(f'end_train:     {end_train}')
      # print(f'start_eval:    {start_eval}')
      # print(f'end_eval:      {end_eval}')

    # Only the training window is ranked here; the eval window is used in a later cell.
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _5_perf_ranks(_df, n_top_syms=n_top_syms_per_lookback)
    grp_most_common_syms.append(most_common_syms)

    if verbose:
      # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
      print(f'perf_ranks: {perf_ranks}')
      # most common symbols of perf_ranks
      print(f'most_common_syms: {most_common_syms}')
      # grp_perf_ranks[lookback] = perf_ranks
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    print(f'grp_most_common_syms: {grp_most_common_syms}')
    # grp_most_common_syms is a list of lists of tuples of
    #  the most-common-symbols symbol:frequency accumulated from
    #  each days_lookback
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # flatten list of lists of (symbol:frequency)
  flat_grp_most_common_syms = [val for sublist in grp_most_common_syms for val in sublist]
  # group symbols from set of days_lookbacks (i.e. lb_slices) and sum frequency of the symbols
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # get the top few most-frequent symbol:frequency pairs
  top_set_syms_n_freq = set_most_common_syms[0:set_top_syms]
  # get symbols from top_set_syms_n_freq (first element of each pair)
  top_set_syms = [i[0] for i in top_set_syms_n_freq]
  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:
    print(f'top {set_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {set_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')
    print(f'===== finish top {set_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')

In [17]:
# Show each max-lookback slice next to the top symbols selected for it.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for _slice, _syms in z_grp_top_set_syms:
  # Re-pack into a tuple so the printed text matches printing the zipped item itself.
  print((_slice, _syms))
print('')

z_grp_top_set_syms:
((473, 593, 608), ['AXSM', 'ENPH', 'FTSM', 'LAD', 'DMRC'])
((690, 810, 825), ['QDEL', 'NVAX', 'TDOC', 'NEM', 'REGN'])
((722, 842, 857), ['NVAX', 'QDEL', 'RENN', 'AR', 'DKL'])
((305, 425, 440), ['AMD', 'CDNA', 'IRMD', 'RETA', 'FTSM'])



In [18]:
# Walk the (max_lookback_slice, top symbols) pairs and print the evaluation
# window bounds and the symbols of each pair.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):
  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  print(f'top_set_syms: {_top_set_syms}')
  # Element 1 is end_train (= start of evaluation), element 2 is end_eval.
  print(f'start_eval: {_lookback_slice[1]}')
  print(f'end_eval:   {_lookback_slice[2]}')
  l_syms = list(_top_set_syms)  # copy of the top set symbols, in order
  for sym in l_syms:
    print(f'symbol: {sym:>4}')
  print(f'top symbols from max_lookback_slice: {l_syms}')
  print('')

z_grp_top_set_syms:
1 of 4 max_lookback_slice
max_lookback_slice: (473, 593, 608)
top_set_syms: ['AXSM', 'ENPH', 'FTSM', 'LAD', 'DMRC']
start_eval: 593
end_eval:   608
symbol: AXSM
symbol: ENPH
symbol: FTSM
symbol:  LAD
symbol: DMRC
top symbols from max_lookback_slice: ['AXSM', 'ENPH', 'FTSM', 'LAD', 'DMRC']

2 of 4 max_lookback_slice
max_lookback_slice: (690, 810, 825)
top_set_syms: ['QDEL', 'NVAX', 'TDOC', 'NEM', 'REGN']
start_eval: 810
end_eval:   825
symbol: QDEL
symbol: NVAX
symbol: TDOC
symbol:  NEM
symbol: REGN
top symbols from max_lookback_slice: ['QDEL', 'NVAX', 'TDOC', 'NEM', 'REGN']

3 of 4 max_lookback_slice
max_lookback_slice: (722, 842, 857)
top_set_syms: ['NVAX', 'QDEL', 'RENN', 'AR', 'DKL']
start_eval: 842
end_eval:   857
symbol: NVAX
symbol: QDEL
symbol: RENN
symbol:   AR
symbol:  DKL
top symbols from max_lookback_slice: ['NVAX', 'QDEL', 'RENN', 'AR', 'DKL']

4 of 4 max_lookback_slice
max_lookback_slice: (305, 425, 440)
top_set_syms: ['AMD', 'CDNA', 'IRMD', 'RETA', 'FT

In [35]:
from yf_utils import _7_perf_eval


def _print_grp_ratios(grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI):
  """Print mean, std and mean/std of the three group performance ratios.

  Each argument is a sequence read at positions [0], [1] and [2], printed in
  that order under the labels mean, std and mean/std.
  """
  print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]:>12,.3f}, {grp_retnStd_d_UI[1]:>12,.3f}, {grp_retnStd_d_UI[2]:>12,.3f}')
  print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>12,.3f}, {grp_CAGR_d_retnStd[1]:>12,.3f}, {grp_CAGR_d_retnStd[2]:>12,.3f}')
  print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]:>12,.3f}, {grp_CAGR_d_UI[1]:>12,.3f}, {grp_CAGR_d_UI[2]:>12,.3f}')


# Evaluate each sample's top symbols over its evaluation window of df_train and
# print the same three ratios for SPY over the identical window for comparison.
_sym_idx = ['SPY']  # comparison symbol; loop-invariant, so defined once

print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):
  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  print(f'top_set_syms: {_top_set_syms}')

  start_eval = _lookback_slice[1]  # end_train doubles as start of evaluation
  end_eval = _lookback_slice[2]
  # Explicit positional row slice (.iloc), matching how the training window is
  # cut in the ranking cell, followed by column selection of the top symbols.
  df_eval = df_train.iloc[start_eval:end_eval][_top_set_syms]

  if verbose:
    print(f'start_eval: {start_eval}')
    print(f'end_eval:   {end_eval}')
    print(f'\ndf_eval:\n{df_eval}\n')

  _, grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI = _7_perf_eval(df_eval)
  _print_grp_ratios(grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI)

  # NOTE(review): with a single symbol the recorded output shows std and
  # mean/std as nan -- presumably a std over one value; confirm in _7_perf_eval.
  df_SPY = df_train.iloc[start_eval:end_eval][_sym_idx]
  _, grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI = _7_perf_eval(df_SPY)
  print(f'\n_sym_idx: {_sym_idx}')
  _print_grp_ratios(grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI)

  print('')

z_grp_top_set_syms:
1 of 4 max_lookback_slice
max_lookback_slice: (473, 593, 608)
top_set_syms: ['AXSM', 'ENPH', 'FTSM', 'LAD', 'DMRC']
grp(retnStd/UI):   mean, std, mean/std:        0.950,        0.388,        2.449
grp(CAGR/retnStd): mean, std, mean/std:       37.376,       67.126,        0.557
grp(CAGR/UI):      mean, std, mean/std:       53.425,       82.395,        0.648

_sym_idx: ['SPY']
grp(retnStd/UI):   mean, std, mean/std:        4.671,          nan,          nan
grp(CAGR/retnStd): mean, std, mean/std:      285.190,          nan,          nan
grp(CAGR/UI):      mean, std, mean/std:    1,332.083,          nan,          nan

2 of 4 max_lookback_slice
max_lookback_slice: (690, 810, 825)
top_set_syms: ['QDEL', 'NVAX', 'TDOC', 'NEM', 'REGN']
grp(retnStd/UI):   mean, std, mean/std:        1.460,        1.523,        0.959
grp(CAGR/retnStd): mean, std, mean/std:    1,702.563,    3,711.678,        0.459
grp(CAGR/UI):      mean, std, mean/std:    6,952.605,   15,466.499,        0.450

In [21]:
# Display the per-sample lists of top symbols built by the ranking cell.
grp_top_set_syms

[['AXSM', 'ENPH', 'FTSM', 'LAD', 'DMRC'],
 ['QDEL', 'NVAX', 'TDOC', 'NEM', 'REGN'],
 ['NVAX', 'QDEL', 'RENN', 'AR', 'DKL'],
 ['AMD', 'CDNA', 'IRMD', 'RETA', 'FTSM']]

In [22]:
# Display the sampled slices; earlier cells zip these positionally with grp_top_set_syms.
max_lookback_slices

[(473, 593, 608), (690, 810, 825), (722, 842, 857), (305, 425, 440)]