In [1]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj
from yf_utils import _2_split_train_val_test, _3_random_slices, _4_lookback_slices, _5_perf_ranks
from yf_utils import _6_grp_tuples_sort_sum

# Set to True to print more output in the cells below.
verbose = False

# Base data directory and the sub-directory used for dumped files.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Symbols text file.
filename_symbols = f"{path_data_dump}vg_symbols_4chars_max.csv"

# Names of the pickled objects.
# OHLCVA downloaded from Yahoo.
# NOTE(review): this value ends with a space -- confirm that is intentional.
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '
# Adjusted OHLCV.
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'
# Symbols' adjusted close.
filename_pickled_df_symbols_close = "df_symbols_close"
# Symbols in df_adjOHLCV.
filename_pickled_symbols_df_adjOHLCV = 'symbols_df_adjOHLCV'
# Loaded further down as df_c.
filename_pickled_df_c = 'df_close_clean'

In [2]:
# Load the object pickled under filename_pickled_df_c, then split it into
# the train / val / test frames used by the rest of the notebook.
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)

# Row counts of the three parts.
len_df_train, len_df_val, len_df_test = (
  len(_part) for _part in (df_train, df_val, df_test)
)

In [3]:
# Number of tuples to create for iloc start_train:end_train:end_eval
# (a larger value such as 200 can be used instead of the small default).
n_samples = 4
# Numbers of lookback days used for training.
days_lookbacks = [30, 120, 60]
# Number of days used to evaluate the effectiveness of the training.
days_eval = 10
# Number of the most-common symbols kept from days_lookbacks' performance rankings.
set_top_syms = 5

In [4]:
# Create n_samples iloc tuples start_train:end_train:end_eval, sized with the
# largest value in days_lookbacks, then derive the per-lookback slice sets from them.
max_lookback_slices = _3_random_slices(
  len_df_train,
  n_samples=n_samples,
  days_lookback=max(days_lookbacks),
  days_eval=days_eval,
)
sets_lookback_slices = _4_lookback_slices(
  max_slices=max_lookback_slices,
  days_lookbacks=days_lookbacks,
  verbose=False,
)

if verbose:
  print(
    f'max_lookback_slices:\n{max_lookback_slices}',
    f'sets_lookback_slices:\n{sets_lookback_slices}',
    sep='\n',
  )

In [5]:
# Rank symbols over every lookback window of every sample and keep, per sample,
# the set_top_syms symbols with the highest summed frequency.
grp_top_set_syms_n_freq = []  # one list of top (symbol, frequency) tuples per entry of sets_lookback_slices
grp_top_set_syms = []  # one list of top symbols per entry of sets_lookback_slices
# sets_lookback_slices holds lists of start_train:end_train:end_eval tuples, e.g.
#  [[(887, 917, 927), (857, 917, 927), (797, 917, 927)],
#   [(483, 513, 523), (453, 513, 523), (393, 513, 523)]]
for lb_slices in sets_lookback_slices:
  grp_most_common_syms = []  # most_common_syms results, one per lookback slice in this set
  for lb_slice in lb_slices:  # lb_slice, e.g. (246, 276, 286)
    start_train, end_train, end_eval = lb_slice
    start_eval = end_train  # evaluation starts where training ends
    lookback = end_train - start_train
    # Named eval_days (not `eval`) so the builtin eval() is not shadowed
    # in the notebook namespace.
    eval_days = end_eval - start_eval

    if verbose:
      print(f'lb_slices:     {lb_slices}')
      print(f'lb_slice:      {lb_slice}')
      print(f'days lookback: {lookback}')
      print(f'days eval:     {eval_days}')
      print(f'start_train:   {start_train}')
      print(f'end_train:     {end_train}')

    # Rank on the training window only (positional row slice).
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, most_common_syms = _5_perf_ranks(_df, n_top_syms=10)
    grp_most_common_syms.append(most_common_syms)

    if verbose:
      # 1 lookback of r_CAGR/UI, r_CAGR/retnStd, r_retnStd/UI
      print(f'perf_ranks: {perf_ranks}')
      # most common symbols of perf_ranks
      print(f'most_common_syms: {most_common_syms}')
      print(f'+++ finish lookback slice {lookback} +++\n')

  if verbose:
    # grp_most_common_syms is a list of lists of (symbol, frequency) tuples,
    #  one inner list per days_lookback
    print(f'grp_most_common_syms: {grp_most_common_syms}')
    print(f'**** finish lookback slices {lb_slices} ****\n')

  # flatten list of lists of (symbol, frequency)
  flat_grp_most_common_syms = [val for sublist in grp_most_common_syms for val in sublist]
  # group symbols from the set of days_lookbacks (i.e. lb_slices) and sum the frequency of each symbol
  set_most_common_syms = _6_grp_tuples_sort_sum(flat_grp_most_common_syms, reverse=True)
  # keep the top few most-frequent (symbol, frequency) pairs
  top_set_syms_n_freq = set_most_common_syms[0:set_top_syms]
  # symbols only, in the same order
  top_set_syms = [i[0] for i in top_set_syms_n_freq]
  grp_top_set_syms_n_freq.append(top_set_syms_n_freq)
  grp_top_set_syms.append(top_set_syms)

  if verbose:
    print(f'top {set_top_syms} ranked symbols and frequency from set {lb_slices}:\n{top_set_syms_n_freq}')
    print(f'top {set_top_syms} ranked symbols from set {lb_slices}:\n{top_set_syms}')
    print(f'===== finish top {set_top_syms} ranked symbols from days_lookback set {lb_slices} =====\n\n')

In [6]:
# Show each max-lookback slice next to the top symbols selected for it.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for slice_n_syms in z_grp_top_set_syms:
  print(slice_n_syms)
print('')

z_grp_top_set_syms:
((42, 162, 172), ['XOMA', 'CUTR', 'DVAX', 'RDNT', 'AVEO'])
((95, 215, 225), ['ENPH', 'GDS', 'MRTX', 'OSTK', 'ALGN'])
((552, 672, 682), ['ETN', 'FTSM', 'SHAK', 'CAL', 'HOV'])
((877, 997, 1007), ['DAC', 'MARA', 'MSTR', 'BEEM', 'CHRD'])



In [7]:
# Per sample: print the slice, its evaluation bounds and each selected symbol.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):
  print(
    f'{i + 1} of {n_samples} max_lookback_slice\n'
    f'max_lookback_slice: {_lookback_slice}\n'
    f'top_set_syms: {_top_set_syms}\n'
    f'start_eval: {_lookback_slice[1]}\n'
    f'end_eval:   {_lookback_slice[2]}'
  )
  l_syms = list(_top_set_syms)  # copy of the top-set symbols for this sample
  for sym in l_syms:
    print(f'symbol: {sym:>4}')
  print(f'top symbols from max_lookback_slice: {l_syms}')
  print('')

z_grp_top_set_syms:
1 of 4 max_lookback_slice
max_lookback_slice: (42, 162, 172)
top_set_syms: ['XOMA', 'CUTR', 'DVAX', 'RDNT', 'AVEO']
start_eval: 162
end_eval:   172
symbol: XOMA
symbol: CUTR
symbol: DVAX
symbol: RDNT
symbol: AVEO
top symbols from max_lookback_slice: ['XOMA', 'CUTR', 'DVAX', 'RDNT', 'AVEO']

2 of 4 max_lookback_slice
max_lookback_slice: (95, 215, 225)
top_set_syms: ['ENPH', 'GDS', 'MRTX', 'OSTK', 'ALGN']
start_eval: 215
end_eval:   225
symbol: ENPH
symbol:  GDS
symbol: MRTX
symbol: OSTK
symbol: ALGN
top symbols from max_lookback_slice: ['ENPH', 'GDS', 'MRTX', 'OSTK', 'ALGN']

3 of 4 max_lookback_slice
max_lookback_slice: (552, 672, 682)
top_set_syms: ['ETN', 'FTSM', 'SHAK', 'CAL', 'HOV']
start_eval: 672
end_eval:   682
symbol:  ETN
symbol: FTSM
symbol: SHAK
symbol:  CAL
symbol:  HOV
top symbols from max_lookback_slice: ['ETN', 'FTSM', 'SHAK', 'CAL', 'HOV']

4 of 4 max_lookback_slice
max_lookback_slice: (877, 997, 1007)
top_set_syms: ['DAC', 'MARA', 'MSTR', 'BEEM', 'C

In [8]:
# Per sample: print the slice, its evaluation bounds and each (symbol, frequency) pair.
print('z_grp_top_set_syms_n_freq:')
z_grp_top_set_syms_n_freq = zip(max_lookback_slices, grp_top_set_syms_n_freq)
for i, (_lookback_slice, _top_set_syms_n_freq) in enumerate(z_grp_top_set_syms_n_freq):
  print(
    f'{i + 1} of {n_samples} max_lookback_slice\n'
    f'max_lookback_slice: {_lookback_slice}\n'
    f'top_set_syms_n_freq: {_top_set_syms_n_freq}\n'
    f'start_eval: {_lookback_slice[1]}\n'
    f'end_eval:   {_lookback_slice[2]}'
  )
  l_syms = []  # symbols of this sample, in ranked order
  for sym_n_freq in _top_set_syms_n_freq:
    _sym, _freq = sym_n_freq[0], sym_n_freq[1]
    l_syms.append(_sym)
    print(f'symbol: {_sym:>4},  freq: {_freq:>2}')
  print(f'top symbols from max_lookback_slice: {l_syms}')
  print('')

z_grp_top_set_syms_n_freq:
1 of 4 max_lookback_slice
max_lookback_slice: (42, 162, 172)
top_set_syms_n_freq: [('XOMA', 6), ('CUTR', 5), ('DVAX', 5), ('RDNT', 5), ('AVEO', 4)]
start_eval: 162
end_eval:   172
symbol: XOMA,  freq:  6
symbol: CUTR,  freq:  5
symbol: DVAX,  freq:  5
symbol: RDNT,  freq:  5
symbol: AVEO,  freq:  4
top symbols from max_lookback_slice: ['XOMA', 'CUTR', 'DVAX', 'RDNT', 'AVEO']

2 of 4 max_lookback_slice
max_lookback_slice: (95, 215, 225)
top_set_syms_n_freq: [('ENPH', 6), ('GDS', 6), ('MRTX', 6), ('OSTK', 6), ('ALGN', 5)]
start_eval: 215
end_eval:   225
symbol: ENPH,  freq:  6
symbol:  GDS,  freq:  6
symbol: MRTX,  freq:  6
symbol: OSTK,  freq:  6
symbol: ALGN,  freq:  5
top symbols from max_lookback_slice: ['ENPH', 'GDS', 'MRTX', 'OSTK', 'ALGN']

3 of 4 max_lookback_slice
max_lookback_slice: (552, 672, 682)
top_set_syms_n_freq: [('ETN', 6), ('FTSM', 6), ('SHAK', 5), ('CAL', 3), ('HOV', 3)]
start_eval: 672
end_eval:   682
symbol:  ETN,  freq:  6
symbol: FTSM,  

In [25]:
# Per sample: slice the evaluation window for the selected symbols and report
# their grouped performance statistics.
print('z_grp_top_set_syms:')
z_grp_top_set_syms = zip(max_lookback_slices, grp_top_set_syms)
for i, (_lookback_slice, _top_set_syms) in enumerate(z_grp_top_set_syms):
  print(f'{i + 1 } of {n_samples} max_lookback_slice')
  print(f'max_lookback_slice: {_lookback_slice}')
  print(f'top_set_syms: {_top_set_syms}')

  # The slice tuple is (start_train, end_train, end_eval); evaluation starts
  # where training ends.
  start_eval = _lookback_slice[1]
  end_eval = _lookback_slice[2]
  # Explicit positional row slice (.iloc), then keep only the selected symbols' columns.
  df_eval = df_train.iloc[start_eval:end_eval][_top_set_syms]
  if verbose:
    print(f'start_eval: {start_eval}')
    print(f'end_eval:   {end_eval}')
    print(f'\ndf_eval:\n{df_eval}\n')

  # NOTE(review): perf_eval is not imported in the visible cells -- confirm
  # which module provides it and add it to the import cell.
  # Evaluate the frame built above (the original passed an undefined df_temp).
  df_perf1, grp_retnStd_d_UI, grp_CAGR_d_retnStd, grp_CAGR_d_UI = perf_eval(df_eval)
  print(f'grp(retnStd/UI):   mean, std, mean/std: {grp_retnStd_d_UI[0]:>10.6f}, {grp_retnStd_d_UI[1]:>10.6f}, {grp_retnStd_d_UI[2]:>10.6f}')
  print(f'grp(CAGR/retnStd): mean, std, mean/std: {grp_CAGR_d_retnStd[0]:>10.6f}, {grp_CAGR_d_retnStd[1]:>10.6f}, {grp_CAGR_d_retnStd[2]:>10.6f}')
  print(f'grp(CAGR/UI):      mean, std, mean/std: {grp_CAGR_d_UI[0]:>10.6f}, {grp_CAGR_d_UI[1]:>10.6f}, {grp_CAGR_d_UI[2]:>10.6f}')
  # A bare expression inside a loop displays nothing, so print the frame explicitly.
  print(f'df_perf1:\n{df_perf1}')

  print('')

z_grp_top_set_syms:
1 of 4 max_lookback_slice
max_lookback_slice: (42, 162, 172)
top_set_syms: ['XOMA', 'CUTR', 'DVAX', 'RDNT', 'AVEO']
start_eval: 162
end_eval:   172

df_eval:
                 XOMA       CUTR       DVAX   RDNT       AVEO
Date                                                         
2017-09-14  18.719999  37.450001  20.650000  11.20  35.000000
2017-09-15  19.400000  38.549999  20.799999  11.15  35.500000
2017-09-18  19.090000  40.250000  20.750000  11.10  37.700001
2017-09-19  18.990000  40.349998  20.850000  11.00  38.400002
2017-09-20  20.150000  40.500000  21.049999  10.85  37.700001
2017-09-21  20.799999  39.900002  20.799999  11.00  37.200001
2017-09-22  19.570000  39.849998  20.500000  11.10  36.000000
2017-09-25  19.400000  39.650002  21.049999  11.05  35.599998
2017-09-26  19.290001  39.650002  20.600000  11.60  35.099998
2017-09-27  19.299999  40.450001  20.650000  11.85  36.299999


2 of 4 max_lookback_slice
max_lookback_slice: (95, 215, 225)
top_set_syms: [

In [11]:
# Display the lists of top symbols, one inner list per entry of sets_lookback_slices.
grp_top_set_syms

[['XOMA', 'CUTR', 'DVAX', 'RDNT', 'AVEO'],
 ['ENPH', 'GDS', 'MRTX', 'OSTK', 'ALGN'],
 ['ETN', 'FTSM', 'SHAK', 'CAL', 'HOV'],
 ['DAC', 'MARA', 'MSTR', 'BEEM', 'CHRD']]

In [12]:
# Display the sampled start_train:end_train:end_eval iloc tuples.
max_lookback_slices

[(42, 162, 172), (95, 215, 225), (552, 672, 682), (877, 997, 1007)]