In [1]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_perf_ranks

verbose = False  # set to True for more detailed output

# Root working directory and the sub-folder holding the dumped data files.
# NOTE(review): absolute, machine-specific path - confirm before running elsewhere.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Text file listing the symbols.
filename_symbols = f"{path_data_dump}vg_symbols_4chars_max.csv"

# Base names of the pickled objects.
# NOTE(review): the first name below ends with a trailing space - presumably
# unintended; check the files on disk before changing it.
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV = 'symbols_df_adjOHLCV'  # symbols in df_adjOHLCV
filename_pickled_df_c = 'df_close_clean'  # loaded below as df_c

In [2]:
# Load the frame pickled as filename_pickled_df_c, split it into
# train/validation/test parts, and record the length of each part.
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)
len_df_train, len_df_val, len_df_test = (
  len(split) for split in (df_train, df_val, df_test)
)

In [3]:
# Parameters for the slices created by _3_random_slices in the next cell.
_n_samples = 2  # number of tuples to create for iloc start_train:end_train:end_eval
_days_lookbacks = [120, 60]  # look-back lengths in days; the maximum is passed as days_lookback
_days_eval = 10  # passed as days_eval to _3_random_slices

In [4]:
# create n_samples of iloc start_train:end_train:end_eval using the max value
# in _days_lookbacks, so that shorter look-backs can later be cut from the
# same end_train position
max_train_eval_days = _3_random_slices(len_df_train, n_samples=_n_samples, days_lookback=max(_days_lookbacks), days_eval=_days_eval)
max_train_eval_days

[(289, 409, 419), (466, 586, 596)]

In [5]:
def lookback_slices(max_slices, days_lookbacks, verbose=False):
  """Create a flat list of look-back slices from maximum-length slices.

  For every tuple in max_slices and every value in days_lookbacks (taken
  in ascending order) a tuple (end_train - days, end_train, end_eval) is
  produced: the start of the training window is moved so the window is
  `days` long, while end_train and end_eval are kept.

  Args:
      max_slices(list of tuples): tuples of iloc positions
        (start_train, end_train, end_eval), e.g.
        [(289, 409, 419), (466, 586, 596)].  Only items [1] and [2] of
        each tuple are read.
      days_lookbacks(list of positive integers): numbers of days to
        look back, e.g. [120, 60].  The caller's list is not modified.
      verbose(bool): if True, print every slice as it is created and a
        blank line after each max slice.

  Return:
      lb_slices(list of tuples): flat list of
        (start_train, end_train, end_eval) tuples, e.g.:
        [(349, 409, 419), (289, 409, 419),
         (526, 586, 596), (466, 586, 596)]
  """
  lb_slices = []
  # sorted() builds a new ascending list; calling .sort() on the argument
  # would silently reorder the caller's list
  sorted_days = sorted(days_lookbacks)
  for max_slice in max_slices:
    end_train, end_eval = max_slice[1], max_slice[2]
    for days in sorted_days:
      new_slice = (end_train - days, end_train, end_eval)
      lb_slices.append(new_slice)
      if verbose:
        print(f'days: {days}, {new_slice}')
    if verbose:
      print('')
  return lb_slices

In [6]:
def lookback_slices_2(max_slices, days_lookbacks, verbose=False):
  """Create look-back slices grouped by the max slice they come from.

  For every tuple in max_slices a list is built holding one tuple
  (end_train - days, end_train, end_eval) per value in days_lookbacks
  (taken in ascending order).  The per-max-slice lists are returned in
  the order of max_slices.

  Args:
      max_slices(list of tuples): tuples of iloc positions
        (start_train, end_train, end_eval), e.g.
        [(289, 409, 419), (466, 586, 596)].  Only items [1] and [2] of
        each tuple are read.
      days_lookbacks(list of positive integers): numbers of days to
        look back, e.g. [120, 60].  The caller's list is not modified.
      verbose(bool): if True, print every slice as it is created and a
        blank line after each max slice.

  Return:
      lb_slices(list of lists of tuples): one inner list per max slice,
        e.g.:
        [[(349, 409, 419), (289, 409, 419)],
         [(526, 586, 596), (466, 586, 596)]]
  """
  lb_slices = []
  # sorted() builds a new ascending list; calling .sort() on the argument
  # would silently reorder the caller's list
  sorted_days = sorted(days_lookbacks)
  for max_slice in max_slices:
    end_train, end_eval = max_slice[1], max_slice[2]
    l_max_slice = []
    for days in sorted_days:
      new_slice = (end_train - days, end_train, end_eval)
      l_max_slice.append(new_slice)
      if verbose:
        print(f'days: {days}, {new_slice}')
    lb_slices.append(l_max_slice)
    if verbose:
      print('')
  return lb_slices

In [7]:
# For each max slice, build one slice per look-back period in _days_lookbacks;
# the result is a list with one inner list of slices per max slice.
all_lb_slices = lookback_slices_2(max_slices=max_train_eval_days, days_lookbacks=_days_lookbacks, verbose=False)
all_lb_slices 

[[(349, 409, 419), (289, 409, 419)], [(526, 586, 596), (466, 586, 596)]]

In [8]:
# Sample (key, value) tuples used below to demonstrate grp_tuples_sort_sum.
L = [('grape', 100), ('apple', 15), ('grape', 3), ('apple', 10),
    ('apple', 4), ('banana', 3)]

In [14]:
def grp_tuples_sort_sum(L, reverse=True):
  """Group (key, value) tuples by key, sum the values, sort by the sums.

  Example input:
    [('grape', 100), ('apple', 15), ('grape', 3), ('apple', 10),
     ('apple', 4), ('banana', 3)]
  Example output (reverse=True):
    [('grape', 103), ('apple', 29), ('banana', 3)]

  Based on:
    https://stackoverflow.com/questions/2249036/grouping-python-tuple-list
    https://stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value

  Args:
      L(list of tuples): list of tuples of key(str):value(int) pairs
      reverse(bool): sort order of the summed values; True (default)
       sorts in descending order, False in ascending order.

  Return:
      grp_sorted_list(list of tuples): one (key, summed-value) tuple per
       distinct key, ordered by summed value.  Keys with equal sums keep
       ascending key order because the sort is stable.
  """
  from itertools import groupby
  from operator import itemgetter

  key_of = itemgetter(0)
  value_of = itemgetter(1)

  # groupby only merges adjacent items, so equal keys must be made adjacent first
  ordered_pairs = sorted(L)
  grp_sorted_list = [
    (key, sum(map(value_of, members)))
    for key, members in groupby(ordered_pairs, key_of)
  ]
  grp_sorted_list.sort(key=value_of, reverse=reverse)

  return grp_sorted_list


In [15]:
# Demo on the sample list L: sum the values per key, largest sum first.
grp_tuples_sort_sum(L, reverse=True)

[('grape', 103), ('apple', 29), ('banana', 3)]

In [None]:
# Run _4_perf_ranks on the training window of every look-back slice and
# collect the most_common_syms result of each run in one list.
grp_most_common_syms = []
# loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in all_lb_slices:
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train = lb_slice[0]
    end_train = lb_slice[1]
    start_eval = end_train  # evaluation starts where training ends
    end_eval = lb_slice[2]
    lookback = end_train - start_train
    # named days_eval rather than eval, which would shadow the builtin
    # eval() for the rest of the notebook session
    days_eval = end_eval - start_eval
    print(f'lb_slices:     {lb_slices}')
    print(f'lb_slice:      {lb_slice}')
    print(f'days lookback: {lookback}')
    print(f'days eval:     {days_eval}')
    print(f'start_train:   {start_train}')
    print(f'end_train:     {end_train}')

    # only the training window is ranked; the eval window is not used here
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _4_perf_ranks(_df, n_top_syms=10)
    # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
    print(f'perf_ranks: {perf_ranks}')
    # most common symbols of perf_ranks
    print(f'most_common_syms: {most_common_syms}')
    print(f'+++ finish lookback slice {lookback} +++\n')
    grp_most_common_syms.append(most_common_syms)

  print(f'grp_most_common_syms: {grp_most_common_syms}')
  print(f'===== finish lookback slices {lb_slices} =====\n')

In [None]:
# Flatten the list of lists of (symbol, count) tuples into one list.
flat_grp_most_common_syms = [
  sym_count
  for slice_syms in grp_most_common_syms
  for sym_count in slice_syms
]

# Sum the counts per symbol and order the symbols by descending total.
sorted_grp_most_common_syms = grp_tuples_sort_sum(
  flat_grp_most_common_syms, reverse=True
)
print(f'len(sorted_grp_most_common_syms): {len(sorted_grp_most_common_syms)}')
sorted_grp_most_common_syms

In [19]:
# Keep only the keys (symbols) of the first five (symbol, summed-count) pairs.
top_common_syms = [entry[0] for entry in sorted_grp_most_common_syms[:5]]
top_common_syms

['FTSM', 'JYNT', 'AMED', 'AXSM', 'TNDM']

In [None]:
# Run _4_perf_ranks on the training window of every look-back slice and
# collect the most_common_syms result of each run in one list.
grp_most_common_syms = []
# loop thru lists of tuples of start_train:end_train:end_eval, i.e.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#  [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
# Iterate all_lb_slices (the nested list).  At notebook level `lb_slices`
# exists only as a loop variable left over from an earlier cell and holds a
# single group of tuples; iterating it makes the inner loop yield ints, so
# l_max_slice[0] raises TypeError.
for l_max_slices in all_lb_slices:
  for l_max_slice in l_max_slices:  # l_max_slice, e.g. (246, 276, 286)
    start_train = l_max_slice[0]
    end_train = l_max_slice[1]
    start_eval = end_train  # evaluation starts where training ends
    end_eval = l_max_slice[2]
    days_lookback = end_train - start_train
    days_eval = end_eval - start_eval
    print(f'l_max_slices:  {l_max_slices}')
    print(f'l_max_slice:   {l_max_slice}')
    print(f'days_lookback: {days_lookback}')
    print(f'days_eval:     {days_eval}')
    print(f'start_train:   {start_train}')
    print(f'end_train:     {end_train}')

    # only the training window is ranked; the eval window is not used here
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _4_perf_ranks(_df, n_top_syms=10)
    # 1 days_lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
    print(f'perf_ranks: {perf_ranks}')
    # most common symbols of perf_ranks
    print(f'most_common_syms: {most_common_syms}')
    print('+++ finish 1 days_lookback +++', '\n')
    grp_most_common_syms.append(most_common_syms)

  print(f'grp_most_common_syms: {grp_most_common_syms}')
  print('===== finish 1 set of days_lookbacks =====', '\n')