In [38]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj

# from yf_utils import _2_split_train_val_test, _3_random_slices, _4_perf_ranks
from yf_utils import _2_split_train_val_test, _3_random_slices
# from trash import _4_perf_ranks_1

verbose = False  # True prints more output

path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = path_dir + "VSCode_dump/"

filename_symbols = path_data_dump + 'vg_symbols_4chars_max.csv'  # symbols text file
filename_pickled_df_OHLCVA_downloaded = 'df_OHLCVA_downloaded '  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = 'df_adjOHLCV'  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV =  'symbols_df_adjOHLCV'  # symbols in df_adjOHLCV
filename_pickled_df_c = 'df_close_clean' 

In [39]:
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_train, df_val, df_test = _2_split_train_val_test(df_c)

In [40]:
len_df_train = len(df_train)
len_df_val = len(df_val)
len_df_test = len(df_test)

In [41]:
n_samples = 2
# days_lookback = 60
days_lookbacks = [60, 120]
days_eval = 10

In [44]:
slices_dic = {}
for days_lookback in days_lookbacks:
  my_slices = _3_random_slices(len_df_train, n_samples=n_samples, days_lookback=days_lookback, days_eval=days_eval)
  slices_dic[days_lookback] = my_slices 

In [45]:
slices_dic

{60: [(332, 392, 402), (638, 698, 708)],
 120: [(524, 644, 654), (667, 787, 797)]}

In [46]:
perf_ranks_dic = {}
ranked_perf_ranks_dic = {}

In [None]:
# from trash import _4_perf_ranks_1

In [47]:
def _4_perf_ranks(df_close, days_lookbacks, n_top_syms=200, verbose=False):
    """Returns perf_ranks_dict(dic. of dic. of symbols ranked in descending
     performance) and ranked_perf_ranks_dict(dic. of symbols ranked in
     descending frequency in a combined pool of symbols in perf_ranks_dict).

    Args:
        df_close(dataframe): dataframe of symbols' close with
         DatetimeIndex e.g. (['2016-12-19', ... '2016-12-22']), symbols as
         column names, and symbols' close as column values.
        days_lookbacks(list of positive integers): list of number of days to
        look-back, e.g. [15, 30], for performance calculation.
        n_top_syms(int): number of top symbols to keep in perf_ranks_dict 

    Return:
        perf_ranks_dict({dic): dic. of dic. of symbols ranked in descending
         performance.
         First dic keys are:
          'period' + str(days_lookbacks[0]), ... ,
          'period' + str(days_lookbacks[-1])
         Second dic keys are:
          'r_CAGR/UI', 'r_CAGR/retnStd' and 'r_retnStd/UI'
         e.g.:
          {
            period-15': {
                         'r_CAGR/UI':  ['HZNP', ... , 'CB'],
                         'r_CAGR/retnStd': ['BBW', ... , 'CPRX'],
                         'r_retnStd/UI':   ['ENR', ... , 'HSY']
                        },
            ... ,
            'period-60': {
                          'r_CAGR/UI':  ['WNC', ... , 'FSLR'],
                          'r_CAGR/retnStd': ['VCYT', ... , 'BERY'],
                          'r_retnStd/UI':   ['MYOV', ... , 'NSC']
                         }
          }
        ranked_perf_ranks_dict(dic): dic. of symbols ranked in descending
         frequency in a combined pool of symbols in perf_ranks_dict.  Key is
         'ranked_perf_ranks_period' + str(days_lookbacks), e.g.:
         {'ranked_perf_ranks_period[-15, -30]': ['HZNP', ... , 'NSC']}
    """

    import pandas as pd
    from myUtils import symb_perf_stats_vectorized_v2

    perf_ranks_dict = {}  # dic of performance ranks
    syms_perf_rank = []  # list of lists to store top 100 ranked symbols

    for days_lookback in days_lookbacks:
        days_lookback = -1 * days_lookback
        f_name = "period" + str(days_lookback)
        _df_c = df_close[days_lookback::]
        (
            symbols,
            period_yr,
            drawdown,
            UI,
            max_drawdown,
            retnStd,
            retnStd_d_UI,
            CAGR,
            CAGR_d_retnStd,
            CAGR_d_UI,        
        ) = symb_perf_stats_vectorized_v2(_df_c)            


        caches_perf_stats_vect = []
        for symbol in symbols:
            date_first = drawdown.index[0].strftime("%Y-%m-%d")
            date_last = drawdown.index[-1].strftime("%Y-%m-%d")
            cache = (
                symbol,
                date_first,
                date_last,
                period_yr,
                CAGR[symbol],
                UI[symbol],
                retnStd_d_UI[symbol],
                CAGR_d_retnStd[symbol],
                CAGR_d_UI[symbol],
            )
            # append performance data (tuple) to caches_perf_stats (list)
            caches_perf_stats_vect.append(cache)
        column_names = [
            "symbol",
            "first date",
            "last date",
            "Year",
            "CAGR",
            "UI",
            "retnStd/UI",
            "CAGR/retnStd",
            "CAGR/UI",
        ]

        # write symbols' performance stats to dataframe
        df_ps = pd.DataFrame(caches_perf_stats_vect, columns=column_names)
        df_ps["r_CAGR/UI"] = df_ps["CAGR/UI"].rank(ascending=False)
        df_ps["r_CAGR/retnStd"] = df_ps["CAGR/retnStd"].rank(ascending=False)
        df_ps["r_retnStd/UI"] = df_ps["retnStd/UI"].rank(ascending=False)

        _dict = {}
        cols_sort = ["r_CAGR/UI", "r_CAGR/retnStd", "r_retnStd/UI"]

        # print(f'{f_name} top 100 symbols')
        for col in cols_sort:
            symbols_top_n = (

                df_ps.sort_values(by=[col]).head(n_top_syms).symbol.values
                # df_ps.sort_values(by=[col]).symbol.values

            )
            syms_perf_rank.append(list(symbols_top_n))
            # print(f'{col}: {symbols_top_n}')
            _dict[col] = symbols_top_n
            perf_ranks_dict[f"{f_name}"] = _dict

    syms_perf_rank  # list of lists of top n_top_syms symbols
    l_syms_perf_rank = [
        val for sublist in syms_perf_rank for val in sublist
    ]  # flatten list of lists

    from collections import Counter

    cnt_symbol_freq = Counter(l_syms_perf_rank)  # count symbols and frequency
    # print(cnt_symbol_freq)
    
    
    ####################### all symbols in l_tuples have the same count of 3 #############
    l_tuples = (
        cnt_symbol_freq.most_common()
    )  # convert to e.g [('AKRO', 6), ('IMVT', 4), ... ('ADEA', 3)]
    print(f'days_lookback: {days_lookback}')
    print(f'len(l_tuples): {len(l_tuples)}')
    print(l_tuples)
    ####################### all symbols in l_tuples have the same count of 3 #############



    symbols_ranked_perf_ranks = [



        symbol for symbol, count in l_tuples
        # symbol, count for symbol, count in l_tuples


    ]  # select just the symbols without the frequency counts

    ranked_perf_ranks_dict = {}
    f_name = f"ranked_perf_ranks_period" + str(
        days_lookbacks
    )  # key name, ranked_perf_ranks_dict
    ranked_perf_ranks_dict[
        f"{f_name}"
    # values: list of most common symbols in all performance ranks in
    #  descending order
    ] = symbols_ranked_perf_ranks

    # return perf_ranks_dict, ranked_perf_ranks_dict
    return perf_ranks_dict, ranked_perf_ranks_dict, l_tuples    


In [48]:
slices_dic

{60: [(332, 392, 402), (638, 698, 708)],
 120: [(524, 644, 654), (667, 787, 797)]}

In [49]:
perf_ranks, ranked_perf_ranks, l_tuples = _4_perf_ranks(df_train, days_lookbacks=days_lookbacks, n_top_syms=20)

days_lookback: -120
len(l_tuples): 60
[('ACRS', 6), ('RICK', 5), ('GME', 4), ('MARA', 4), ('PARA', 4), ('SM', 4), ('CPE', 4), ('SBNY', 4), ('NMM', 4), ('DAC', 4), ('TPL', 4), ('WBD', 3), ('HAFC', 3), ('MTZ', 3), ('CHRD', 3), ('WAL', 3), ('HOV', 2), ('TMST', 2), ('PBF', 2), ('UNFI', 2), ('PDCE', 2), ('MSTR', 2), ('FBK', 2), ('EWBC', 2), ('BANC', 2), ('PFC', 2), ('CTRN', 2), ('NBR', 2), ('PNFP', 2), ('TBK', 2), ('MTDR', 1), ('MCS', 1), ('SEED', 1), ('SNDA', 1), ('AMCX', 1), ('VRTV', 1), ('AJRD', 1), ('SFST', 1), ('AVEO', 1), ('DBI', 1), ('JSD', 1), ('MATW', 1), ('HMST', 1), ('KIO', 1), ('OPY', 1), ('LKFN', 1), ('HCI', 1), ('DBRG', 1), ('TCBI', 1), ('VERU', 1), ('SLCA', 1), ('ATKR', 1), ('CASH', 1), ('SLM', 1), ('INBK', 1), ('OZK', 1), ('KE', 1), ('CUBI', 1), ('TBBK', 1), ('BOH', 1)]


In [None]:
ranked_perf_ranks

In [54]:
l_tuples
# len(l_tuples)

[('ACRS', 6),
 ('RICK', 5),
 ('GME', 4),
 ('MARA', 4),
 ('PARA', 4),
 ('SM', 4),
 ('CPE', 4),
 ('SBNY', 4),
 ('NMM', 4),
 ('DAC', 4),
 ('TPL', 4),
 ('WBD', 3),
 ('HAFC', 3),
 ('MTZ', 3),
 ('CHRD', 3),
 ('WAL', 3),
 ('HOV', 2),
 ('TMST', 2),
 ('PBF', 2),
 ('UNFI', 2),
 ('PDCE', 2),
 ('MSTR', 2),
 ('FBK', 2),
 ('EWBC', 2),
 ('BANC', 2),
 ('PFC', 2),
 ('CTRN', 2),
 ('NBR', 2),
 ('PNFP', 2),
 ('TBK', 2),
 ('MTDR', 1),
 ('MCS', 1),
 ('SEED', 1),
 ('SNDA', 1),
 ('AMCX', 1),
 ('VRTV', 1),
 ('AJRD', 1),
 ('SFST', 1),
 ('AVEO', 1),
 ('DBI', 1),
 ('JSD', 1),
 ('MATW', 1),
 ('HMST', 1),
 ('KIO', 1),
 ('OPY', 1),
 ('LKFN', 1),
 ('HCI', 1),
 ('DBRG', 1),
 ('TCBI', 1),
 ('VERU', 1),
 ('SLCA', 1),
 ('ATKR', 1),
 ('CASH', 1),
 ('SLM', 1),
 ('INBK', 1),
 ('OZK', 1),
 ('KE', 1),
 ('CUBI', 1),
 ('TBBK', 1),
 ('BOH', 1)]

In [53]:
perf_ranks

{'period-60': {'r_CAGR/UI': array(['GME', 'ACRS', 'MARA', 'PARA', 'HOV', 'WBD', 'SM', 'CPE', 'SBNY',
         'HAFC', 'RICK', 'NMM', 'TMST', 'DAC', 'PBF', 'UNFI', 'MTDR', 'TPL',
         'PDCE', 'MCS'], dtype=object),
  'r_CAGR/retnStd': array(['GME', 'MARA', 'ACRS', 'SM', 'PARA', 'HOV', 'CPE', 'WBD', 'UNFI',
         'DAC', 'MSTR', 'SEED', 'SNDA', 'TPL', 'TMST', 'PBF', 'AMCX',
         'VRTV', 'NMM', 'PDCE'], dtype=object),
  'r_retnStd/UI': array(['ACRS', 'AJRD', 'SFST', 'SBNY', 'FBK', 'AVEO', 'MTZ', 'HAFC',
         'DBI', 'RICK', 'JSD', 'MATW', 'HMST', 'EWBC', 'BANC', 'KIO', 'OPY',
         'LKFN', 'PFC', 'HCI'], dtype=object)},
 'period-120': {'r_CAGR/UI': array(['CHRD', 'MARA', 'GME', 'DAC', 'SM', 'ACRS', 'CPE', 'CTRN', 'WAL',
         'NMM', 'RICK', 'TPL', 'PARA', 'NBR', 'PNFP', 'DBRG', 'SBNY', 'MTZ',
         'TBK', 'TCBI'], dtype=object),
  'r_CAGR/retnStd': array(['GME', 'MARA', 'SM', 'CHRD', 'DAC', 'CPE', 'ACRS', 'CTRN', 'NBR',
         'MSTR', 'NMM', 'PARA', 'WBD', 'TPL', '

In [None]:
for days_lookback in slices_dic:
  slices = slices_dic[days_lookback]
  print(days_lookback, slices)
  for my_slice in slices:
    start_train = my_slice[0]
    end_train = my_slice[1]
    start_val = end_train
    end_val = my_slice[2]
    print(f'start_train: {start_train}')
    print(f'end_train:   {end_train}')
    # print(f'start_val:   {start_val}')
    # print(f'end_val:     {end_val}')
    _df = df_train.iloc[start_train:end_train]
    l_days_lookback = []  # create a list for days_lookbacks
    l_days_lookback.append(days_lookback)
    perf_ranks, ranked_perf_ranks = _4_perf_ranks(_df, days_lookbacks=l_days_lookback)
    perf_ranks_dic[days_lookback] = perf_ranks
    ranked_perf_ranks_dic[days_lookback] = ranked_perf_ranks 
  print('')   

In [None]:
for days_lookback in slices_dic:
  slices = slices_dic[days_lookback]
  print(days_lookback, slices)
  for my_slice in slices:
    start_train = my_slice[0]
    end_train = my_slice[1]
    start_val = end_train
    end_val = my_slice[2]
    print(f'start_train: {start_train}')
    print(f'end_train:   {end_train}')
    # print(f'start_val:   {start_val}')
    # print(f'end_val:     {end_val}')
    _df = df_train.iloc[start_train:end_train]
    perf_ranks, ranked_perf_ranks = _4_perf_ranks_1(_df, days_lookback=days_lookback)
    perf_ranks_dic[days_lookback] = perf_ranks
    ranked_perf_ranks_dic[days_lookback] = ranked_perf_ranks 
  print('')    

In [None]:
perf_ranks_dic

In [None]:
ranked_perf_ranks_dic

In [None]:
# for my_slice in my_slices[0:2]:
for my_slice in my_slices:
  start_train = my_slice[0]
  end_train = my_slice[1]
  start_val = end_train
  end_val = my_slice[2]
  print(f'start_train: {start_train}')
  print(f'end_train: {end_train}')
  print(f'start_val: {start_val}')
  print(f'end_val: {end_val}')

  _df = df_train.iloc[start_train:end_train]
  # print(_df) 
  perf_ranks_dict, ranked_perf_ranks_dict = _4_perf_ranks(_df, days_lookbacks=[60])
  # print(perf_ranks_dict)
  # print(ranked_perf_ranks_dict)  

In [None]:
# print(perf_ranks_dict)
trash = perf_ranks_dict
trash

In [None]:
ranked_perf_ranks_dict