In [1]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices, _5_perf_ranks
from yf_utils import _6_grp_tuples_sort_sum

verbose = False  # True prints more output
# verbose = True  # True prints more output

path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"

filename_symbols = path_data_dump + 'vg_symbols_4chars_max.csv'  # symbols text file
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV =  'symbols_df_adjOHLCV'  # symbols in df_adjOHLCV
filename_pickled_df_c = 'df_close_clean' 

In [2]:
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)
len_df_train = len(df_train)
len_df_val = len(df_val)
len_df_test = len(df_test)

In [3]:
n_samples = 4  # number of tuples to create for iloc start_train:end_train:end_eval
# n_samples = 200  # number of tuples to create for iloc start_train:end_train:end_eval
days_lookbacks = [30, 120, 60]  # number of lookback days for training
days_eval = 10  # number of days to evaluate effectiveness of the training
set_top_syms = 5  # number of the most-common symbols from days_lookbacks' performance rankings

In [4]:
# create n_samples of iloc start_train:end_train:end_eval using max value in days_loobacks
max_lookback_slices = _3_random_slices(len_df_train, n_samples=n_samples, days_lookback=max(days_lookbacks), days_eval=days_eval)
sets_lookback_slices = _4_lookback_slices(max_slices=max_lookback_slices, days_lookbacks=days_lookbacks, verbose=False)

if verbose:
  print(f'max_lookback_slices:\n{max_lookback_slices}')
  print(f'sets_lookback_slices:\n{sets_lookback_slices}')

In [5]:
grp_top_set_syms_n_freq = []  # list of lists of top_set_symbols_n_freq, there are n_samples lists in list
grp_top_set_syms = []  # list of lists of top_set_symbols, there are n_samples lists in list
# loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []  
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train = lb_slice[0]
    end_train = lb_slice[1]
    start_eval = end_train
    end_eval = lb_slice[2]
    lookback = end_train - start_train
    eval = end_eval - start_eval

    if verbose:
      print(f'lb_slices:     {lb_slices}')
      print(f'lb_slice:      {lb_slice}')
      print(f'days lookback: {lookback}')
      print(f'days eval:     {eval}')    
      print(f'start_train:   {start_train}')
      print(f'end_train:     {end_train}')
      # print(f'start_eval:    {start_eval}')
      # print(f'end_eval:      {end_eval}')`

    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _5_perf_ranks(_df, n_top_syms=10)
    grp_most_common_syms.append(most_common_syms)
    
    if verbose:    
      # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
      print(f'perf_ranks: {perf_ranks}')  
      # most common symbols of perf_ranks 
      print(f'most_common_syms: {most_common_syms}')     
      # grp_perf_ranks[lookback] = perf_ranks
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    print(f'grp_most_common_syms: {grp_most_common_syms}')
    # grp_most_common_syms a is list of lists of tuples of 
    #  the most-common-symbols symbol:frequency cumulated from
    #  each days_lookback  
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # flatten list of lists of (symbol:frequency)
  flat_grp_most_common_syms = [val for sublist in grp_most_common_syms for val in sublist]
  # group symbols from set of days_lookbacks (i.e. lb_slices) and sum frequency of the symbols
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # get the top few most-frequent symbol:frequency pairs
  top_set_syms_n_freq = set_most_common_syms[0:set_top_syms]
  # get symbols from top_set_syms_n_freq
  top_set_syms = [i[0] for i in top_set_syms_n_freq]
  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:  
    print(f'top {set_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {set_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')  
    print(f'===== finish top {set_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')

In [40]:
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for item in z_grp_top_set_syms:
  print(item)
print('')

z_grp_top_set_syms:
((920, 1040, 1050), ['ACRS', 'CPE', 'RICK', 'DAC', 'GME'])
((338, 458, 468), ['FTSM', 'IDT', 'MKC', 'SBUX', 'ATNI'])
((125, 245, 255), ['SKY', 'MDGL', 'SAGE', 'BOOT', 'GDS'])
((84, 204, 214), ['GRVY', 'PTSI', 'XOMA', 'GDS', 'OSTK'])

z_grp_top_set_syms_n_freq:


In [55]:
print('z_grp_top_set_syms_n_freq:')
z_grp_top_set_syms_n_freq = zip(max_lookback_slices, grp_top_set_syms_n_freq)
# for item in z_grp_top_set_syms_n_freq:
#   print(item)


for i, (_lookback_slice, _top_set_syms_n_freq) in enumerate(z_grp_top_set_syms_n_freq):
  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  print(f'top_set_syms_n_freq: {top_set_syms_n_freq}')
  print(f'start_eval: {_lookback_slice[1]}')
  print(f'end_eval:   {_lookback_slice[2]}')
  l_syms = []  # list to accumlate top set symbbols
  for sym_n_freq in _top_set_syms_n_freq:
    l_syms.append(sym_n_freq[0])
    print(f'symbol: {sym_n_freq[0]:>4},  freq: {sym_n_freq[1]:>2}')
  print(f'top symbols from max_lookback_slice: {l_syms}')  
  print('')  

z_grp_top_set_syms_n_freq:
1 of 4 max_lookback_slice
max_lookback_slice: (920, 1040, 1050)
top_set_syms_n_freq: [('GRVY', 6), ('PTSI', 6), ('XOMA', 6), ('GDS', 5), ('OSTK', 5)]
start_eval: 1040
end_eval:   1050
symbol: ACRS,  freq:  6
symbol:  CPE,  freq:  6
symbol: RICK,  freq:  5
symbol:  DAC,  freq:  4
symbol:  GME,  freq:  4
top symbols from max_lookback_slice: ['ACRS', 'CPE', 'RICK', 'DAC', 'GME']

2 of 4 max_lookback_slice
max_lookback_slice: (338, 458, 468)
top_set_syms_n_freq: [('GRVY', 6), ('PTSI', 6), ('XOMA', 6), ('GDS', 5), ('OSTK', 5)]
start_eval: 458
end_eval:   468
symbol: FTSM,  freq:  7
symbol:  IDT,  freq:  4
symbol:  MKC,  freq:  4
symbol: SBUX,  freq:  4
symbol: ATNI,  freq:  3
top symbols from max_lookback_slice: ['FTSM', 'IDT', 'MKC', 'SBUX', 'ATNI']

3 of 4 max_lookback_slice
max_lookback_slice: (125, 245, 255)
top_set_syms_n_freq: [('GRVY', 6), ('PTSI', 6), ('XOMA', 6), ('GDS', 5), ('OSTK', 5)]
start_eval: 245
end_eval:   255
symbol:  SKY,  freq:  8
symbol: MDGL