In [None]:
def _4_perf_ranks(df_close, days_lookbacks, n_top_syms=200, verbose=False):
    """Returns perf_ranks_dict(dic. of dic. of symbols ranked in descending
     performance) and ranked_perf_ranks_dict(dic. of symbols ranked in
     descending frequency in a combined pool of symbols in perf_ranks_dict).

    Args:
        df_close(dataframe): dataframe of symbols' close with
         DatetimeIndex e.g. (['2016-12-19', ... '2016-12-22']), symbols as
         column names, and symbols' close as column values.
        days_lookbacks(list of positive integers): list of number of days to
        look-back, e.g. [15, 30], for performance calculation.
        n_top_syms(int): number of top symbols to keep in perf_ranks_dict 

    Return:
        perf_ranks_dict({dic): dic. of dic. of symbols ranked in descending
         performance.
         First dic keys are:
          'period' + str(days_lookbacks[0]), ... ,
          'period' + str(days_lookbacks[-1])
         Second dic keys are:
          'r_CAGR/UI', 'r_CAGR/retnStd' and 'r_retnStd/UI'
         e.g.:
          {
            period-15': {
                         'r_CAGR/UI':  ['HZNP', ... , 'CB'],
                         'r_CAGR/retnStd': ['BBW', ... , 'CPRX'],
                         'r_retnStd/UI':   ['ENR', ... , 'HSY']
                        },
            ... ,
            'period-60': {
                          'r_CAGR/UI':  ['WNC', ... , 'FSLR'],
                          'r_CAGR/retnStd': ['VCYT', ... , 'BERY'],
                          'r_retnStd/UI':   ['MYOV', ... , 'NSC']
                         }
          }
        ranked_perf_ranks_dict(dic): dic. of symbols ranked in descending
         frequency in a combined pool of symbols in perf_ranks_dict.  Key is
         'ranked_perf_ranks_period' + str(days_lookbacks), e.g.:
         {'ranked_perf_ranks_period[-15, -30]': ['HZNP', ... , 'NSC']}
    """

    import pandas as pd
    from myUtils import symb_perf_stats_vectorized_v2

    perf_ranks_dict = {}  # dic of performance ranks
    syms_perf_rank = []  # list of lists to store top 100 ranked symbols

    for days_lookback in days_lookbacks:
        days_lookback = -1 * days_lookback
        f_name = "period" + str(days_lookback)
        _df_c = df_close[days_lookback::]
        (
            symbols,
            period_yr,
            drawdown,
            UI,
            max_drawdown,
            retnStd,
            retnStd_d_UI,
            CAGR,
            CAGR_d_retnStd,
            CAGR_d_UI,        
        ) = symb_perf_stats_vectorized_v2(_df_c)            


        caches_perf_stats_vect = []
        for symbol in symbols:
            date_first = drawdown.index[0].strftime("%Y-%m-%d")
            date_last = drawdown.index[-1].strftime("%Y-%m-%d")
            cache = (
                symbol,
                date_first,
                date_last,
                period_yr,
                CAGR[symbol],
                UI[symbol],
                retnStd_d_UI[symbol],
                CAGR_d_retnStd[symbol],
                CAGR_d_UI[symbol],
            )
            # append performance data (tuple) to caches_perf_stats (list)
            caches_perf_stats_vect.append(cache)
        column_names = [
            "symbol",
            "first date",
            "last date",
            "Year",
            "CAGR",
            "UI",
            "retnStd/UI",
            "CAGR/retnStd",
            "CAGR/UI",
        ]

        # write symbols' performance stats to dataframe
        df_ps = pd.DataFrame(caches_perf_stats_vect, columns=column_names)
        df_ps["r_CAGR/UI"] = df_ps["CAGR/UI"].rank(ascending=False)
        df_ps["r_CAGR/retnStd"] = df_ps["CAGR/retnStd"].rank(ascending=False)
        df_ps["r_retnStd/UI"] = df_ps["retnStd/UI"].rank(ascending=False)

        _dict = {}
        cols_sort = ["r_CAGR/UI", "r_CAGR/retnStd", "r_retnStd/UI"]

        # print(f'{f_name} top 100 symbols')
        for col in cols_sort:
            symbols_top_n = (

                df_ps.sort_values(by=[col]).head(n_top_syms).symbol.values
                # df_ps.sort_values(by=[col]).symbol.values

            )
            syms_perf_rank.append(list(symbols_top_n))
            # print(f'{col}: {symbols_top_n}')
            _dict[col] = symbols_top_n
            perf_ranks_dict[f"{f_name}"] = _dict

    syms_perf_rank  # list of lists of top n_top_syms symbols
    l_syms_perf_rank = [
        val for sublist in syms_perf_rank for val in sublist
    ]  # flatten list of lists

    from collections import Counter

    cnt_symbol_freq = Counter(l_syms_perf_rank)  # count symbols and frequency
    # print(cnt_symbol_freq)
    
    
    ####################### all symbols in l_tuples have the same count of 3 #############
    l_tuples = (
        cnt_symbol_freq.most_common()
    )  # convert to e.g [('AKRO', 6), ('IMVT', 4), ... ('ADEA', 3)]
    print(f'days_lookback: {days_lookback}')
    print(f'len(l_tuples): {len(l_tuples)}')
    print(l_tuples)
    ####################### all symbols in l_tuples have the same count of 3 #############



    symbols_ranked_perf_ranks = [



        symbol for symbol, count in l_tuples
        # symbol, count for symbol, count in l_tuples


    ]  # select just the symbols without the frequency counts

    ranked_perf_ranks_dict = {}
    f_name = f"ranked_perf_ranks_period" + str(
        days_lookbacks
    )  # key name, ranked_perf_ranks_dict
    ranked_perf_ranks_dict[
        f"{f_name}"
    # values: list of most common symbols in all performance ranks in
    #  descending order
    ] = symbols_ranked_perf_ranks

    # return perf_ranks_dict, ranked_perf_ranks_dict
    return perf_ranks_dict, ranked_perf_ranks_dict, l_tuples    


In [13]:
def _4_perf_ranks_1(df_close, n_top_syms=200, verbose=False):
    """Returns perf_ranks_dict(dic. of dic. of symbols ranked in descending
     performance) and ranked_perf_ranks_dict(dic. of symbols ranked in
     descending frequency in a combined pool of symbols in perf_ranks_dict).

    Args:
        df_close(dataframe): dataframe of symbols' close with
         DatetimeIndex e.g. (['2016-12-19', ... '2016-12-22']), symbols as
         column names, and symbols' close as column values.
        days_lookbacks(list of positive integers): list of number of days to
        look-back, e.g. [15, 30], for performance calculation.
        n_top_syms(int): number of top symbols to keep in perf_ranks_dict 

    Return:
        perf_ranks_dict({dic): dic. of dic. of symbols ranked in descending
         performance.
         First dic keys are:
          'period' + str(days_lookbacks[0]), ... ,
          'period' + str(days_lookbacks[-1])
         Second dic keys are:
          'r_CAGR/UI', 'r_CAGR/retnStd' and 'r_retnStd/UI'
         e.g.:
          {
            period-15': {
                         'r_CAGR/UI':  ['HZNP', ... , 'CB'],
                         'r_CAGR/retnStd': ['BBW', ... , 'CPRX'],
                         'r_retnStd/UI':   ['ENR', ... , 'HSY']
                        },
            ... ,
            'period-60': {
                          'r_CAGR/UI':  ['WNC', ... , 'FSLR'],
                          'r_CAGR/retnStd': ['VCYT', ... , 'BERY'],
                          'r_retnStd/UI':   ['MYOV', ... , 'NSC']
                         }
          }
        ranked_perf_ranks_dict(dic): dic. of symbols ranked in descending
         frequency in a combined pool of symbols in perf_ranks_dict.  Key is
         'ranked_perf_ranks_period' + str(days_lookbacks), e.g.:
         {'ranked_perf_ranks_period[-15, -30]': ['HZNP', ... , 'NSC']}
    """

    import pandas as pd
    from myUtils import symb_perf_stats_vectorized_v2

    perf_ranks_dict = {}  # dic of performance ranks
    syms_perf_rank = []  # list of lists to store top 100 ranked symbols

    # for days_lookback in days_lookbacks:
    days_lookback = -1 * len(df_close)
    f_name = "period" + str(days_lookback)
    _df_c = df_close[days_lookback::]
    (
        symbols,
        period_yr,
        drawdown,
        UI,
        max_drawdown,
        retnStd,
        retnStd_d_UI,
        CAGR,
        CAGR_d_retnStd,
        CAGR_d_UI,        
    ) = symb_perf_stats_vectorized_v2(_df_c)            


    caches_perf_stats_vect = []
    for symbol in symbols:
        date_first = drawdown.index[0].strftime("%Y-%m-%d")
        date_last = drawdown.index[-1].strftime("%Y-%m-%d")
        cache = (
            symbol,
            date_first,
            date_last,
            period_yr,
            CAGR[symbol],
            UI[symbol],
            retnStd_d_UI[symbol],
            CAGR_d_retnStd[symbol],
            CAGR_d_UI[symbol],
        )
        # append performance data (tuple) to caches_perf_stats (list)
        caches_perf_stats_vect.append(cache)
    column_names = [
        "symbol",
        "first date",
        "last date",
        "Year",
        "CAGR",
        "UI",
        "retnStd/UI",
        "CAGR/retnStd",
        "CAGR/UI",
    ]

    # write symbols' performance stats to dataframe
    df_ps = pd.DataFrame(caches_perf_stats_vect, columns=column_names)
    df_ps["r_CAGR/UI"] = df_ps["CAGR/UI"].rank(ascending=False)
    df_ps["r_CAGR/retnStd"] = df_ps["CAGR/retnStd"].rank(ascending=False)
    df_ps["r_retnStd/UI"] = df_ps["retnStd/UI"].rank(ascending=False)

    _dict = {}
    cols_sort = ["r_CAGR/UI", "r_CAGR/retnStd", "r_retnStd/UI"]

    # print(f'{f_name} top 100 symbols')
    for col in cols_sort:
        symbols_top_n = (

            df_ps.sort_values(by=[col]).head(n_top_syms).symbol.values
            # df_ps.sort_values(by=[col]).symbol.values

        )
        syms_perf_rank.append(list(symbols_top_n))
        # print(f'{col}: {symbols_top_n}')
        _dict[col] = symbols_top_n
        perf_ranks_dict[f"{f_name}"] = _dict

    syms_perf_rank  # list of lists of top n_top_syms symbols
    l_syms_perf_rank = [
        val for sublist in syms_perf_rank for val in sublist
    ]  # flatten list of lists

    from collections import Counter

    cnt_symbol_freq = Counter(l_syms_perf_rank)  # count symbols and frequency
    # print(cnt_symbol_freq)
    
    
    ####################### all symbols in l_tuples have the same count of 3 #############
    l_tuples = (
        cnt_symbol_freq.most_common()
    )  # convert to e.g [('AKRO', 6), ('IMVT', 4), ... ('ADEA', 3)]
    print(f'days_lookback: {days_lookback}')
    print(f'len(l_tuples): {len(l_tuples)}')
    print(l_tuples)
    ####################### all symbols in l_tuples have the same count of 3 #############



    symbols_ranked_perf_ranks = [



        symbol for symbol, count in l_tuples
        # symbol, count for symbol, count in l_tuples


    ]  # select just the symbols without the frequency counts

    ranked_perf_ranks_dict = {}
    f_name = f"ranked_perf_ranks_period" + str(
        days_lookback
    )  # key name, ranked_perf_ranks_dict
    ranked_perf_ranks_dict[
        f"{f_name}"
    # values: list of most common symbols in all performance ranks in
    #  descending order
    ] = symbols_ranked_perf_ranks

    # return perf_ranks_dict, ranked_perf_ranks_dict
    return perf_ranks_dict, ranked_perf_ranks_dict, l_tuples    


In [4]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj

# from yf_utils import _2_split_train_val_test, _3_random_slices, _4_perf_ranks
from yf_utils import _2_split_train_val_test, _3_random_slices
# from trash import _4_perf_ranks_1

verbose = False  # True prints more output

path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"

filename_symbols = path_data_dump + 'vg_symbols_4chars_max.csv'  # symbols text file
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV =  'symbols_df_adjOHLCV'  # symbols in df_adjOHLCV
filename_pickled_df_c = 'df_close_clean' 

In [5]:
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)
len_df_train = len(df_train)
len_df_val = len(df_val)
len_df_test = len(df_test)

In [6]:
_n_samples = 2  # number of tuples to create for iloc start_train:end_train:end_eval
_days_lookbacks = [60, 120, 30]
_days_eval = 10

In [7]:
# create n_samples of iloc start_train:end_train:end_eval using max value in days_loobacks
max_train_eval_days = _3_random_slices(len_df_train, n_samples=_n_samples, days_lookback=max(_days_lookbacks), days_eval=_days_eval)
max_train_eval_days

[(224, 344, 354), (582, 702, 712)]

In [8]:
def lookback_slices(max_slices, days_lookbacks, verbose=False):
  """Create 
  
  
  Returns perf_ranks_dict(dic. of dic. of symbols ranked in descending
    performance) and ranked_perf_ranks_dict(dic. of symbols ranked in
    descending frequency in a combined pool of symbols in perf_ranks_dict).

  Args:
      df_close(dataframe): dataframe of symbols' close with
        DatetimeIndex e.g. (['2016-12-19', ... '2016-12-22']), symbols as
        column names, and symbols' close as column values.
      days_lookbacks(list of positive integers): list of number of days to
      look-back, e.g. [15, 30], for performance calculation.
      n_top_syms(int): number of top symbols to keep in perf_ranks_dict 

  Return:
      perf_ranks_dict({dic): dic. of dic. of symbols ranked in descending
        performance.
        First dic keys are:
        'period' + str(days_lookbacks[0]), ... ,
        'period' + str(days_lookbacks[-1])
        Second dic keys are:
        'r_CAGR/UI', 'r_CAGR/retnStd' and 'r_retnStd/UI'
        e.g.:
        {
          period-15': {
                        'r_CAGR/UI':  ['HZNP', ... , 'CB'],
                        'r_CAGR/retnStd': ['BBW', ... , 'CPRX'],
                        'r_retnStd/UI':   ['ENR', ... , 'HSY']
                      },
          ... ,
          'period-60': {
                        'r_CAGR/UI':  ['WNC', ... , 'FSLR'],
                        'r_CAGR/retnStd': ['VCYT', ... , 'BERY'],
                        'r_retnStd/UI':   ['MYOV', ... , 'NSC']
                        }
        }
      ranked_perf_ranks_dict(dic): dic. of symbols ranked in descending
        frequency in a combined pool of symbols in perf_ranks_dict.  Key is
        'ranked_perf_ranks_period' + str(days_lookbacks), e.g.:
        {'ranked_perf_ranks_period[-15, -30]': ['HZNP', ... , 'NSC']}
  """  
  lb_slices = []
  days_lookbacks.sort()  # sort list of integers in ascending order
  for my_slice in max_slices:
    # lb_slices.append(my_slice)
    # if verbose:
    #   print(f'days: {days_lookbacks[-1]}, {my_slice}')
    for days in days_lookbacks:
      new_slice = (my_slice[1]-days, my_slice[1], my_slice[2])
      lb_slices.append(new_slice)
      if verbose:
        print(f'days: {days}, {new_slice}')      
    if verbose:    
     print('')
  return lb_slices

In [9]:
lb_slices = lookback_slices(max_slices=max_train_eval_days, days_lookbacks=_days_lookbacks, verbose=False)
lb_slices 

[(314, 344, 354),
 (284, 344, 354),
 (224, 344, 354),
 (672, 702, 712),
 (642, 702, 712),
 (582, 702, 712)]

In [10]:
perf_ranks_dic = {}
ranked_perf_ranks_dic = {}

In [14]:
for lb_slice in lb_slices:
  start_train = lb_slice[0]
  end_train = lb_slice[1]
  start_eval = end_train
  end_eval = lb_slice[2]
  print(f'start_train: {start_train}')
  print(f'end_train:   {end_train}')
  print(f'start_eval:  {start_eval}')
  print(f'end_eval:    {end_eval}')
  print('')
  _df = df_train.iloc[start_train:end_train]
  perf_ranks, ranked_perf_ranks, l_tuples = _4_perf_ranks_1(_df)    
  perf_ranks[days_lookback] = perf_ranks
  ranked_perf_ranks[days_lookback] = ranked_perf_ranks   

start_train: 314
end_train:   344
start_eval:  344
end_eval:    354

days_lookback: -30
len(l_tuples): 277
[('STAA', 3), ('TNDM', 3), ('CRK', 3), ('WWE', 3), ('RETA', 3), ('CHRD', 3), ('OSPN', 3), ('CHDN', 3), ('BOOM', 3), ('KMPR', 3), ('GDS', 3), ('THC', 3), ('EXPO', 3), ('ENVA', 3), ('BCOR', 3), ('EVTC', 3), ('VTOL', 3), ('DINO', 3), ('TTD', 3), ('XHR', 3), ('TTGT', 3), ('LPLA', 3), ('HSKA', 3), ('SIVB', 3), ('SM', 3), ('OKE', 3), ('VLO', 3), ('EFC', 3), ('HSII', 3), ('PBF', 3), ('EFSC', 3), ('CMG', 3), ('AXON', 3), ('SHAK', 3), ('NVCR', 3), ('ENSG', 3), ('GDEN', 3), ('PERI', 3), ('CRMT', 3), ('PENN', 3), ('CDNA', 3), ('FFIV', 3), ('OXY', 3), ('HAE', 3), ('MED', 3), ('AXSM', 3), ('LNW', 3), ('NSTG', 3), ('DVN', 3), ('BOOT', 3), ('EVH', 3), ('SMCI', 3), ('TWIN', 3), ('MMSI', 3), ('ESI', 3), ('VCYT', 3), ('NBHC', 3), ('TWLO', 3), ('NSIT', 3), ('DMLP', 3), ('FCN', 3), ('FTSM', 3), ('HQY', 3), ('BPT', 3), ('MRO', 3), ('TPL', 3), ('EWBC', 3), ('CHE', 3), ('WMK', 3), ('MORN', 3), ('NSP', 3

NameError: name 'days_lookback' is not defined

In [None]:
for days_lookback in slices_dic:
  slices = slices_dic[days_lookback]
  print(days_lookback, slices)
  for my_slice in slices:
    start_train = my_slice[0]
    end_train = my_slice[1]
    start_eval = end_train
    end_eval = my_slice[2]
    print(f'start_train: {start_train}')
    print(f'end_train:   {end_train}')
    print(f'start_eval:   {start_eval}')
    print(f'end_eval:     {end_eval}')
    _df = df_train.iloc[start_train:end_train]


    # l_days_lookback = []  # create a list for days_lookbacks
    # l_days_lookback.append(days_lookback)
    # perf_ranks, ranked_perf_ranks, l_tuples = _4_perf_ranks(_df, days_lookbacks=l_days_lookbacks)
    
    
    perf_ranks, ranked_perf_ranks, l_tuples = _4_perf_ranks(_df, days_lookbacks=days_lookbacks)    
    perf_ranks[days_lookback] = perf_ranks
    ranked_perf_ranks[days_lookback] = ranked_perf_ranks 
  print('')   

In [None]:
perf_ranks, ranked_perf_ranks, l_tuples = _4_perf_ranks(df_train, days_lookbacks=days_lookbacks, n_top_syms=20)

In [None]:
ranked_perf_ranks

In [None]:
l_tuples
# len(l_tuples)

In [None]:
perf_ranks

In [None]:
for days_lookback in slices_dic:
  slices = slices_dic[days_lookback]
  print(days_lookback, slices)
  for my_slice in slices:
    start_train = my_slice[0]
    end_train = my_slice[1]
    start_val = end_train
    end_val = my_slice[2]
    print(f'start_train: {start_train}')
    print(f'end_train:   {end_train}')
    # print(f'start_val:   {start_val}')
    # print(f'end_val:     {end_val}')
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, ranked_perf_ranks = _4_perf_ranks_1(_df, days_lookback=days_lookback)
    perf_ranks_dic[days_lookback] = perf_ranks
    ranked_perf_ranks_dic[days_lookback] = ranked_perf_ranks 
  print('')    

In [None]:
perf_ranks_dic

In [None]:
ranked_perf_ranks_dic

In [None]:
# for my_slice in my_slices[0:2]:
for my_slice in my_slices:
  start_train = my_slice[0]
  end_train = my_slice[1]
  start_val = end_train
  end_val = my_slice[2]
  print(f'start_train: {start_train}')
  print(f'end_train: {end_train}')
  print(f'start_val: {start_val}')
  print(f'end_val: {end_val}')

  _df = df_train.iloc[start_train:end_train]
  # print(_df) 
  perf_ranks_dict, ranked_perf_ranks_dict = _4_perf_ranks(_df, days_lookbacks=[60])
  # print(perf_ranks_dict)
  # print(ranked_perf_ranks_dict)  

In [None]:
# print(perf_ranks_dict)
trash = perf_ranks_dict
trash

In [None]:
ranked_perf_ranks_dict