In [3]:

def _4_perf_ranks(df_close, days_lookbacks, n_symbols=500, verbose=False):
    """Rank symbols by performance over several look-back windows.

    For every look-back window the trailing rows of df_close are passed to
    myUtils.symb_perf_stats_vectorized.  The symbols are ranked in descending
    order of 'CAGR/UI', 'CAGR/Std' and 'Std/UI', and the best n_symbols of each
    ranking are kept.  All kept symbols are then pooled and ordered by how often
    they occur in the pool.

    Args:
        df_close(dataframe): dataframe of symbols' close with
         DatetimeIndex e.g. (['2016-12-19', ... '2016-12-22']), symbols as
         column names, and symbols' close as column values.
        days_lookbacks(list of negative integers): list of number of days to look-back,
         e.g. [-15, -30]
        n_symbols(int or None): number of symbols to be returned per ranking and
         in the frequency-ranked list, default=500.  None keeps every symbol.
        verbose(bool): currently unused; kept so existing callers keep working.

    Return:
        perf_ranks_dict(dic): dic. of dic. of symbols ranked in descending performance.
         First dic keys are: 'period' + str(days_lookbacks[0]), ... ,  'period' + str(days_lookbacks[-1])
         Second dic keys are: 'r_CAGR/UI', 'r_CAGR/Std' and 'r_Std/UI'
         Values are numpy arrays of symbols, best symbol first.
         e.g.:
          {
            'period-15': {
                          'r_CAGR/UI':  ['HZNP', ... , 'CB'],
                          'r_CAGR/Std': ['BBW', ... , 'CPRX'],
                          'r_Std/UI':   ['ENR', ... , 'HSY']
                         },
            ... ,
            'period-60': {
                          'r_CAGR/UI':  ['WNC', ... , 'FSLR'],
                          'r_CAGR/Std': ['VCYT', ... , 'BERY'],
                          'r_Std/UI':   ['MYOV', ... , 'NSC']
                         }
          }
        ranked_perf_ranks_dict(dic): dic. of symbols ranked in descending frequency in a combined
         pool of symbols in perf_ranks_dict.  Key is 'ranked_perf_ranks_period' + str(days_lookbacks)
         e.g.:
          {'ranked_perf_ranks_period[-15, -30]': ['HZNP', ... , 'NSC']}
    """

    from collections import Counter

    import pandas as pd
    from myUtils import symb_perf_stats_vectorized

    column_names = [
        "symbol",
        "first date",
        "last date",
        "Year",
        "CAGR",
        "UI",
        "Std/UI",
        "CAGR/Std",
        "CAGR/UI",
    ]
    # metric column -> rank column derived from it (order fixes the order of the result keys)
    rank_columns = {
        "CAGR/UI": "r_CAGR/UI",
        "CAGR/Std": "r_CAGR/Std",
        "Std/UI": "r_Std/UI",
    }

    perf_ranks_dict = {}  # dic of performance ranks
    symbol_freq = Counter()  # how often each symbol appears across all kept rankings

    for days_lookback in days_lookbacks:
        # a negative look-back keeps only the trailing rows (positional slice)
        _df_c = df_close.iloc[days_lookback:]
        (
            symbols,
            period_yr,
            drawdown,
            UI,
            _max_drawdown,
            _returns_std,
            Std_UI,
            CAGR,
            CAGR_Std,
            CAGR_UI,
        ) = symb_perf_stats_vectorized(_df_c)

        # the window's first/last dates are the same for every symbol: format them once
        window_index = drawdown.index
        if len(window_index) > 0:
            date_first = window_index[0].strftime("%Y-%m-%d")
            date_last = window_index[-1].strftime("%Y-%m-%d")
        else:
            date_first = date_last = None

        rows = [
            (
                symbol,
                date_first,
                date_last,
                period_yr,
                CAGR[symbol],
                UI[symbol],
                Std_UI[symbol],
                CAGR_Std[symbol],
                CAGR_UI[symbol],
            )
            for symbol in symbols
        ]

        # write symbols' performance stats to dataframe; rank 1 is the largest metric value
        df_ps = pd.DataFrame(rows, columns=column_names)
        for metric, rank_col in rank_columns.items():
            df_ps[rank_col] = df_ps[metric].rank(ascending=False)

        period_ranks = {}
        for rank_col in rank_columns.values():
            ranked = df_ps.sort_values(by=[rank_col])
            if n_symbols is not None:
                ranked = ranked.head(n_symbols)
            symbols_top_n = ranked["symbol"].values
            symbol_freq.update(symbols_top_n)
            period_ranks[rank_col] = symbols_top_n
        perf_ranks_dict["period" + str(days_lookback)] = period_ranks

    # most frequent symbols first; a slice end of None keeps the whole list
    symbols_ranked_perf_ranks = [
        symbol for symbol, _count in symbol_freq.most_common()
    ][:n_symbols]

    f_name = "ranked_perf_ranks_period" + str(days_lookbacks)  # key name
    ranked_perf_ranks_dict = {f_name: symbols_ranked_perf_ranks}

    return perf_ranks_dict, ranked_perf_ranks_dict


In [20]:

def _4_perf_ranks1(df_close, days_lookbacks, verbose=False):
    """Rank every symbol by performance over several look-back windows (no top-N cut-off).

    Each look-back window (the trailing rows of df_close) is passed to
    myUtils.symb_perf_stats_vectorized.  All symbols are then ordered by their
    descending rank in 'CAGR/UI', 'CAGR/Std' and 'Std/UI'.  The rankings of all
    windows are pooled and the symbols are ordered by how often they occur.

    Args:
        df_close(dataframe): dataframe of symbols' close with
         DatetimeIndex e.g. (['2016-12-19', ... '2016-12-22']), symbols as
         column names, and symbols' close as column values.
        days_lookbacks(list of negative integers): list of number of days to look-back,
         e.g. [-15, -30]
        verbose(bool): not used by this function.

    Return:
        perf_ranks_dict(dic): dic. of dic. of symbols ranked in descending performance.
         First dic keys are: 'period' + str(days_lookbacks[0]), ... ,  'period' + str(days_lookbacks[-1])
         Second dic keys are: 'r_CAGR/UI', 'r_CAGR/Std' and 'r_Std/UI'
         Values are numpy arrays holding every symbol, best symbol first.
         e.g.:
          {
            'period-15': {
                          'r_CAGR/UI':  ['HZNP', ... , 'CB'],
                          'r_CAGR/Std': ['BBW', ... , 'CPRX'],
                          'r_Std/UI':   ['ENR', ... , 'HSY']
                         },
            ... ,
            'period-60': {
                          'r_CAGR/UI':  ['WNC', ... , 'FSLR'],
                          'r_CAGR/Std': ['VCYT', ... , 'BERY'],
                          'r_Std/UI':   ['MYOV', ... , 'NSC']
                         }
          }
        ranked_perf_ranks_dict(dic): dic. of symbols ranked in descending frequency in a combined
         pool of symbols in perf_ranks_dict.  Key is 'ranked_perf_ranks_period' + str(days_lookbacks)
         e.g.:
          {'ranked_perf_ranks_period[-15, -30]': ['HZNP', ... , 'NSC']}
    """

    import pandas as pd
    from myUtils import symb_perf_stats_vectorized
    from collections import Counter

    stat_columns = [
        "symbol",
        "first date",
        "last date",
        "Year",
        "CAGR",
        "UI",
        "Std/UI",
        "CAGR/Std",
        "CAGR/UI",
    ]
    # (metric column, rank column) pairs, in the order the result keys are created
    metric_rank_pairs = (
        ("CAGR/UI", "r_CAGR/UI"),
        ("CAGR/Std", "r_CAGR/Std"),
        ("Std/UI", "r_Std/UI"),
    )

    perf_ranks_dict = {}
    pooled_counts = Counter()  # occurrences of each symbol over all rankings

    for lookback in days_lookbacks:
        window = df_close[lookback:]
        stats = symb_perf_stats_vectorized(window)
        (
            symbols,
            period_yr,
            drawdown,
            UI,
            _max_dd,
            _ret_std,
            Std_UI,
            CAGR,
            CAGR_Std,
            CAGR_UI,
        ) = stats

        # one record per symbol: identity, window dates, window length and its metrics
        records = [
            (
                sym,
                drawdown.index[0].strftime("%Y-%m-%d"),
                drawdown.index[-1].strftime("%Y-%m-%d"),
                period_yr,
                CAGR[sym],
                UI[sym],
                Std_UI[sym],
                CAGR_Std[sym],
                CAGR_UI[sym],
            )
            for sym in symbols
        ]
        stats_frame = pd.DataFrame(records, columns=stat_columns)

        # rank 1 goes to the largest metric value
        for metric_col, rank_col in metric_rank_pairs:
            stats_frame[rank_col] = stats_frame[metric_col].rank(ascending=False)

        ranks_for_window = {}
        for _metric_col, rank_col in metric_rank_pairs:
            ordered_symbols = stats_frame.sort_values(by=[rank_col])["symbol"].values
            pooled_counts.update(list(ordered_symbols))
            ranks_for_window[rank_col] = ordered_symbols
        perf_ranks_dict[f"period{lookback}"] = ranks_for_window

    # symbols only, most frequent first (counts are dropped)
    symbols_by_frequency = [sym for sym, _n in pooled_counts.most_common()]

    result_key = f"ranked_perf_ranks_period{days_lookbacks}"
    ranked_perf_ranks_dict = {result_key: symbols_by_frequency}

    return perf_ranks_dict, ranked_perf_ranks_dict


In [28]:
import yfinance as yf
import yfinance.shared as shared
import time
import pandas as pd
# from datetime import date, timedelta, datetime
from myUtils import pickle_dump, pickle_load, read_symbols_file # NOQA
from myUtils import drop_symbols_all_NaN, chunked_list # NOQA
from myUtils import yf_download_AdjOHLCV_noAutoAdj
verbose = False  # True prints more output

# NOTE(review): absolute, machine-specific directory; adjust before running elsewhere.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Full path of the symbols text file.
filename_symbols = f"{path_data_dump}vg_symbols_4chars_max.csv"

# Names of the pickle files kept in path_data_dump.
# NOTE(review): the first name ends with a space; kept as-is, confirm it is intended.
filename_pickled_df_OHLCVA_downloaded = "df_OHLCVA_downloaded "  # OHLCVA downloaded from Yahoo
filename_pickled_df_adjOHLCV = "df_adjOHLCV"  # adjusted OHLCV
filename_pickled_df_symbols_close = "df_symbols_close"  # symbols' adjusted close
filename_pickled_symbols_df_adjOHLCV = "symbols_df_adjOHLCV"  # symbols in df_adjOHLCV
filename_pickled_df_c = "df_close_clean"

In [None]:
# Load the pickled dataframe named by filename_pickled_df_c from path_data_dump and display it.
# NOTE(review): pickle_load comes from myUtils; presumably it unpickles the file, so only load files you trust.
df_c = pickle_load(path_data_dump, filename_pickled_df_c)
df_c

In [30]:
# Look-back windows as negative row offsets: -15 selects the last 15 rows of the close dataframe.
days_lookbacks = [-15, -30]

In [34]:
# Rank every symbol for each look-back window (no top-N cut-off).
# Alternative that keeps only the top 500 symbols of each ranking:
# perf_ranks_dict, ranked_perf_ranks_dict = _4_perf_ranks(df_c, days_lookbacks, n_symbols=500, verbose=False)
perf_ranks_dict, ranked_perf_ranks_dict = _4_perf_ranks1(df_c, days_lookbacks, verbose=False)

In [32]:
# Bounds of the slice of each ranking shown by the print loop that uses start_slice/end_slice.
start_slice = 0
end_slice = 100

In [36]:
# Print the start_slice:end_slice part of every ranking, grouped by look-back period.
for key0, ranks_by_metric in perf_ranks_dict.items():
    print(f'key0: {key0}')
    for key1, ranked_symbols in ranks_by_metric.items():
        print(f'key1: {key1}')

        shown = ranked_symbols[start_slice:end_slice]
        label = f'perf_ranks_dict[{key0}][{key1}][{start_slice}:{end_slice}]'
        print(f'{label}:\n{shown}')
        print(f'len({label}): {len(shown)}')

    print(' ')

key0: period-15
key1: r_CAGR/UI
perf_ranks_dict[period-15][r_CAGR/UI][0:100]:
['BBW' 'AXSM' 'BURL' 'CTRN' 'OFIX' 'POWL' 'AEO' 'TIPT' 'PEN' 'CVCO' 'AZN'
 'PVH' 'ANF' 'RDNT' 'COUP' 'FE' 'NVS' 'GES' 'ROST' 'DMB' 'ATKR' 'HZNP'
 'IIM' 'WLFC' 'BKE' 'BMRN' 'WEN' 'NVO' 'SJM' 'TBNK' 'MUJ' 'OFLX' 'NUVA'
 'ZTO' 'GIS' 'UHAL' 'ODC' 'ZBH' 'CPB' 'MRK' 'CALM' 'ABBV' 'FL' 'BERY'
 'AVA' 'FIVE' 'CAG' 'BVH' 'CIEN' 'BBY' 'UNVR' 'ULTA' 'BIDU' 'RIO' 'WEYS'
 'NUO' 'DKS' 'EXC' 'MOH' 'RARE' 'VBF' 'HROW' 'NTGR' 'PCN' 'ISEE' 'TFX'
 'PNW' 'KHC' 'CVS' 'GDS' 'ETR' 'ETSY' 'PG' 'RELL' 'ELV' 'NEWT' 'IPAR'
 'SYK' 'ROG' 'CELH' 'VRDN' 'GSK' 'BDX' 'AMBC' 'MTX' 'HSY' 'MATW' 'MMS'
 'HHC' 'MGI' 'STE' 'ENR' 'NI' 'PDI' 'KN' 'NWE' 'AEP' 'EDU' 'IFF' 'SPB']
len(perf_ranks_dict[period-15][r_CAGR/UI][0:100]): 100
key1: r_CAGR/Std
perf_ranks_dict[period-15][r_CAGR/Std][0:100]:
['AXSM' 'BBW' 'POWL' 'CTRN' 'OFIX' 'RDNT' 'BURL' 'PVH' 'AEO' 'ANF' 'COUP'
 'PEN' 'HZNP' 'UHAL' 'ATKR' 'CVCO' 'GDS' 'WLFC' 'BMRN' 'ROST' 'BKE' 'GES'
 'NVO' 'TIP

In [37]:
# Print every ranking in full, together with its length, grouped by look-back period.
for key0, ranks_by_metric in perf_ranks_dict.items():
    print(f'key0: {key0}')
    for key1, ranked_symbols in ranks_by_metric.items():
        print(f'key1: {key1}')

        label = f'perf_ranks_dict[{key0}][{key1}]'
        print(f'{label}:\n{ranked_symbols}')
        print(f'len({label}): {len(ranked_symbols)}')

    print(' ')

key0: period-15
key1: r_CAGR/UI
perf_ranks_dict[period-15][r_CAGR/UI]:
['BBW' 'AXSM' 'BURL' ... 'AROW' 'VVV' 'UNTY']
len(perf_ranks_dict[period-15][r_CAGR/UI]): 2299
key1: r_CAGR/Std
perf_ranks_dict[period-15][r_CAGR/Std]:
['AXSM' 'BBW' 'POWL' ... 'FFIC' 'DKL' 'FNWB']
len(perf_ranks_dict[period-15][r_CAGR/Std]): 2299
key1: r_Std/UI
perf_ranks_dict[period-15][r_Std/UI]:
['AZN' 'TIPT' 'BURL' ... 'PBF' 'DK' 'DKL']
len(perf_ranks_dict[period-15][r_Std/UI]): 2299
 
key0: period-30
key1: r_CAGR/UI
perf_ranks_dict[period-30][r_CAGR/UI]:
['AXSM' 'WYNN' 'BBW' ... 'SAVE' 'MTB' 'EPD']
len(perf_ranks_dict[period-30][r_CAGR/UI]): 2299
key1: r_CAGR/Std
perf_ranks_dict[period-30][r_CAGR/Std]:
['AXSM' 'GDS' 'WYNN' ... 'FNWB' 'IEP' 'IRBT']
len(perf_ranks_dict[period-30][r_CAGR/Std]): 2299
key1: r_Std/UI
perf_ranks_dict[period-30][r_Std/UI]:
['ABMD' 'BVH' 'GILD' ... 'MARA' 'TTGT' 'MPAA']
len(perf_ranks_dict[period-30][r_Std/UI]): 2299
 


In [13]:
# Number of symbols in the last ranking of the last look-back period.
# The keys are looked up explicitly instead of through the loop variables key0/key1
# left behind by an earlier cell, so the result does not depend on which cell ran last.
last_period = list(perf_ranks_dict)[-1]
last_ranking = list(perf_ranks_dict[last_period])[-1]
len(perf_ranks_dict[last_period][last_ranking])


500

In [None]:
def random_slices(df, n_samples, days_lookback, days_eval, seed=0):
    """Draw random row-position windows for training and evaluation.

    Every window consists of days_lookback training rows immediately followed
    by days_eval evaluation rows and lies completely inside df.  The split
    point between the two parts is drawn uniformly from all positions that
    satisfy these constraints.

    Args:
        df: any object supporting len(); only its length is used.
        n_samples(int): number of windows to draw.
        days_lookback(int): number of training rows, must be >= 0.
        days_eval(int): number of evaluation rows, must be >= 0.
        seed(int or None): seed of the private random generator, default=0 for
         reproducible results.  None seeds from system entropy.

    Return:
        l_slices(list of tuples): n_samples tuples
         (start_train, end_train, end_eval) of row positions, where training
         rows are [start_train:end_train] and evaluation rows are
         [end_train:end_eval].

    Raises:
        ValueError: if days_lookback or days_eval is negative.
        SystemExit: if days_lookback + days_eval exceeds len(df) (exception
         type kept for backward compatibility).
    """

    import random

    days_total = days_lookback + days_eval
    print(f'days_lookback: {days_lookback}, days_eval: {days_eval}, days_total: {days_total}, len(df): {len(df)}')

    if days_lookback < 0 or days_eval < 0:
        raise ValueError(
            f'days_lookback: {days_lookback} and days_eval: {days_eval} must be non-negative'
        )

    if days_total > len(df):
        msg_err = f'days_total: {days_total} must be less or equal to len(df): {len(df)}'
        raise SystemExit(msg_err)

    # Seed once, on a private generator: re-seeding before every draw would
    # return the same number each time, and the global random state stays untouched.
    rng = random.Random(seed)

    # Valid split points: enough rows before for training, enough rows after for evaluation.
    first_split = days_lookback
    last_split = len(df) - days_eval

    l_slices = []
    for _ in range(n_samples):
        split = rng.randint(first_split, last_split)  # both ends inclusive
        l_slices.append((split - days_lookback, split, split + days_eval))

    return l_slices

In [None]:
import pandas as pd
import numpy as np
from myUtils import pickle_load, pickle_dump, symb_perf_stats_vectorized

# Data directories for this section.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

In [None]:
# Load the pickled object 'df_train' from path_data_dump, report where it came from and display it.
# NOTE(review): presumably pickle_load unpickles the file, so only load files you trust.
df = pickle_load(path_data_dump, 'df_train')
print(f"Full path to pickled df_train: {path_data_dump}df_train")
df

In [None]:
# Request 100 (start_train, end_train, end_eval) row-position tuples: 120 look-back rows followed by 20 evaluation rows.
l_slices = random_slices(df, n_samples=100, days_lookback=120, days_eval=20)

In [None]:
# Number of sampled windows; the list returned by random_slices is named l_slices.
len(l_slices)

In [None]:
# Split the first sampled window into its training rows and the evaluation rows that follow them.
for l_slice in l_slices[:1]:
    start_train, end_train, end_eval = l_slice[0], l_slice[1], l_slice[2]
    start_eval = end_train  # evaluation starts where training ends

    df_train = df.iloc[start_train:end_train]
    df_eval = df.iloc[start_eval:end_eval]

In [None]:
# Display the training rows selected by the split loop.
df_train

In [None]:
# Display the evaluation rows selected by the split loop.
df_eval