In [1]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_perf_ranks
from yf_utils import _5_lookback_slices

verbose = False  # True prints more output

# Root directory of the yfinance data on the local machine.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere without editing it.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
# Sub-directory that holds the symbols file and the pickled objects loaded by this notebook.
path_data_dump = path_dir + "VSCode_dump/"

filename_symbols = path_data_dump + 'vg_symbols_4chars_max.csv'  # symbols text file
# NOTE(review): the string below ends with a trailing space inside the quotes —
#  confirm it matches the pickle's actual name on disk before relying on it.
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV =  'symbols_df_adjOHLCV'  # symbols in df_adjOHLCV
filename_pickled_df_c = 'df_close_clean'  # pickle that is loaded into df_c in the next cell

In [2]:
# Load the pickled DataFrame named by filename_pickled_df_c, then split it
#  into train / validation / test parts with _2_split_train_val_test.
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)
# Lengths of the three splits, in the same order as the splits themselves.
len_df_train, len_df_val, len_df_test = (
    len(_split) for _split in (df_train, df_val, df_test)
)

In [3]:
# Tunable parameters for the lookback / evaluation experiment below.
n_samples = 3  # number of tuples to create for iloc start_train:end_train:end_eval
# Lookback lengths in days; the largest value sizes the sampled slices (see max(days_lookbacks) below).
days_lookbacks = [30, 120, 60]  # number of lookback days for training
days_eval = 10  # number of days to evaluate effectiveness of the training
# Used as the slice length [0:set_top_syms] on the grouped symbol ranking of each set.
set_top_syms = 5  # number of the most-common symbols from days_lookbacks' performance rankings

In [4]:
# Create n_samples tuples of iloc start_train:end_train:end_eval, sized with the
#  largest value in days_lookbacks, then expand each tuple into one tuple per
#  entry of days_lookbacks.
max_lookback_slices = _3_random_slices(
    len_df_train,
    n_samples=n_samples,
    days_lookback=max(days_lookbacks),
    days_eval=days_eval,
)
sets_lookback_slices = _5_lookback_slices(
    max_slices=max_lookback_slices,
    days_lookbacks=days_lookbacks,
    verbose=False,
)
sets_lookback_slices  # display the resulting list of lists of tuples

[[(236, 266, 276), (206, 266, 276), (146, 266, 276)],
 [(403, 433, 443), (373, 433, 443), (313, 433, 443)],
 [(528, 558, 568), (498, 558, 568), (438, 558, 568)]]

In [5]:
def grp_tuples_sort_sum(L, reverse=True):
  """
  Group (key, value) tuples by key, sum the values of each group, and
  return the (key, total) tuples ordered by total.

  Example:
      [('grape', 100), ('apple', 15), ('grape', 3), ('apple', 10),
       ('apple', 4), ('banana', 3)]
      -> [('grape', 103), ('apple', 29), ('banana', 3)]

  Args:
      L(list of tuples): (key, value) pairs, e.g. key(str), value(int)
      reverse(bool): True (default) orders totals from largest to
       smallest, False from smallest to largest.

  Return:
      list of tuples: one (key, total) tuple per distinct key. Keys with
       equal totals stay in ascending key order because the sort is stable.

  References:
      https://stackoverflow.com/questions/2249036/grouping-python-tuple-list
      https://stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value
  """

  import itertools
  from operator import itemgetter

  # groupby only merges adjacent items, so equal keys must be made contiguous first
  ordered_pairs = sorted(L)

  totals = [
    (name, sum(pair[1] for pair in members))
    for name, members in itertools.groupby(ordered_pairs, key=itemgetter(0))
  ]

  # order the (key, total) tuples by their total
  totals.sort(key=itemgetter(1), reverse=reverse)

  return totals


In [12]:
# For each set of lookback slices, run _4_perf_ranks on every lookback window of
#  df_train, collect the (symbol, frequency) tuples it returns, then group them
#  with grp_tuples_sort_sum and keep the set_top_syms symbols with the largest
#  summed frequency.
# Loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  # list of lists of (symbol, frequency) tuples, one inner list per lookback slice
  grp_most_common_syms = []
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train, end_train, end_eval = lb_slice
    start_eval = end_train  # the evaluation window begins where training ends
    lookback = end_train - start_train
    # deliberately not named `eval`: that would shadow the builtin eval()
    #  for every cell that runs afterwards in this kernel
    n_days_eval = end_eval - start_eval
    print(f'lb_slices:     {lb_slices}')
    print(f'lb_slice:      {lb_slice}')
    print(f'days lookback: {lookback}')
    print(f'days eval:     {n_days_eval}')
    print(f'start_train:   {start_train}')
    print(f'end_train:     {end_train}')

    # only the training window is ranked here; the eval window is not used yet
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _4_perf_ranks(_df, n_top_syms=10)
    # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
    print(f'perf_ranks: {perf_ranks}')
    # most common symbols of perf_ranks
    print(f'most_common_syms: {most_common_syms}')
    print(f'+++ finish lookback slice {lookback} +++\n')
    grp_most_common_syms.append(most_common_syms)

  # grp_most_common_syms is a list of lists of tuples of the
  #  most-common-symbols symbol:frequency accumulated from each days_lookback
  print(f'grp_most_common_syms: {grp_most_common_syms}')
  print(f'**** finish lookback slices {lb_slices} ****\n')
  # flatten list of lists of (symbol:frequency)
  flat_grp_most_common_syms = [val for sublist in grp_most_common_syms for val in sublist]
  # group symbols from set of days_lookbacks (i.e. lb_slices) and sum frequency of the symbols
  set_most_common_syms = grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # get the top few most-frequent symbol:frequency pairs
  top_set_syms_n_freq = set_most_common_syms[0:set_top_syms]
  # get symbols from top_set_syms_n_freq
  top_set_syms = [i[0] for i in top_set_syms_n_freq]
  print(f'top {set_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
  print(f'top {set_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')
  print(f'===== finish top {set_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')





lb_slices:     [(236, 266, 276), (206, 266, 276), (146, 266, 276)]
lb_slice:      (236, 266, 276)
days lookback: 30
days eval:     10
start_train:   236
end_train:     266
perf_ranks: {'period-30': {'r_CAGR/UI': array(['ENTA', 'BXC', 'MRTX', 'NVAX', 'CRSP', 'MDGL', 'SKY', 'RCKT',
       'ARWR', 'PTCT'], dtype=object), 'r_CAGR/retnStd': array(['ENTA', 'CRSP', 'BXC', 'MRTX', 'NVAX', 'MDGL', 'PTCT', 'SKY',
       'RCKT', 'CVEO'], dtype=object), 'r_retnStd/UI': array(['KDP', 'ENTA', 'SII', 'RELL', 'WING', 'CNMD', 'BXC', 'CHDN',
       'DECK', 'QNST'], dtype=object)}}
most_common_syms: [('ENTA', 3), ('BXC', 3), ('MRTX', 2), ('NVAX', 2), ('CRSP', 2), ('MDGL', 2), ('SKY', 2), ('RCKT', 2), ('PTCT', 2), ('ARWR', 1), ('CVEO', 1), ('KDP', 1), ('SII', 1), ('RELL', 1), ('WING', 1), ('CNMD', 1), ('CHDN', 1), ('DECK', 1), ('QNST', 1)]
+++ finish lookback slice 30 +++

lb_slices:     [(236, 266, 276), (206, 266, 276), (146, 266, 276)]
lb_slice:      (206, 266, 276)
days lookback: 60
days eval:     10
