In [13]:
import pandas as pd
from myUtils import pickle_load, pickle_dump
from yf_utils import split_train_val_test, random_slices, lookback_slices
from yf_utils import top_set_sym_freq_cnt, get_grp_top_syms_n_freq
from yf_utils import eval_grp_top_syms_n_freq


# pandas display limits for notebook output; the width matches the
# code-runner format.
pandas_display_options = {
    "display.max_rows": 10,
    "display.max_columns": 12,
    "display.max_colwidth": 12,
    "display.width": 160,  # code-runner format
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

# Root data directory and the sub-directory holding the pickled dumps.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# File name of the pickled, cleaned close-price DataFrame.
fp_df_close_clean = "df_close_clean"

In [14]:
#######################################################################
## SELECT RUN PARAMETERS. Parameters can also be passed using papermill
## by running yf_7_freq_cnt_pm_.ipynb
# NOTE(review): if papermill injects overrides after this cell, the sort
# applied to days_lookbacks below is not re-applied to the injected list -
# confirm the downstream helpers tolerate that.

# True prints more output.
verbose = True
# verbose = False

# True writes the run results to df_eval_results.
store_results = True
# store_results = False

# Which data split the run uses: "train", "validate" or "test".
run_type = "train"
# run_type = 'validate'
# run_type = "test"

# Number of max-lookback tuples to create, each of the form
# iloc_start_train:iloc_end_train:iloc_end_eval, i.e. the number of
# grp_top_set_syms_n_freq and grp_top_set_syms entries.
n_samples = 2
# n_samples = 400

# For training: the numbers of days to look back from iloc_end_train of the
# max-lookback tuple, kept in ascending order.
days_lookbacks = sorted([15, 30, 60, 120])
# days_lookbacks = [15, 30, 60]
# days_lookbacks = [30, 60, 120]

# Number of days in the dataframe used to evaluate the effectiveness of the
# training, days_eval = len(df_eval).
# days_eval = 4 means: buy at close on the 1st day after the signal, hold for
# the 2nd and 3rd day, sell at close on the 4th day.
days_eval = 4
# days_eval = 5

# Number of the most-common symbols from days_lookbacks' performance rankings
# to keep.
n_top_syms = 20
# n_top_syms = 5

# Slice start and end for selecting the best performing symbols.
syms_start = 0
syms_end = 10


##########################################################
# File name of the pickled results DataFrame.
# fp_df_eval_results = f"df_eval_results_{run_type}"
fp_df_eval_results = "test_df_eval"
##########################################################
#######################################################################

In [15]:

# Echo every run parameter so the saved cell output records the
# configuration that produced it. Labels are printed exactly as listed,
# followed by ": " and the value.
run_parameters = (
    ("verbose ", verbose),
    ("store_results", store_results),
    ("run_type", run_type),
    ("n_samples", n_samples),
    ("days_lookbacks", days_lookbacks),
    ("days_eval", days_eval),
    ("n_top_syms", n_top_syms),
    ("syms_start", syms_start),
    ("syms_end", syms_end),
    ("fp_df_eval_results", fp_df_eval_results),
)
for parameter_label, parameter_value in run_parameters:
    print(f"{parameter_label}: {parameter_value}")

# Load the cleaned close-price DataFrame from the dump directory.
df_close_clean = pickle_load(path_data_dump, fp_df_close_clean)

# Split df_close_clean into training (df_train), validation (df_val) and
# test (df_test) sets. The default split is 0.7, 0.2, 0.1 respectively.
df_train, df_val, df_test = split_train_val_test(df_close_clean)

# Longest lookback window requested for this run.
max_days_lookbacks = max(days_lookbacks)
print(f"max_days_lookbacks: {max_days_lookbacks}")

# Choose the split that matches run_type; stop the run on any other value.
splits_by_run_type = {"train": df_train, "validate": df_val, "test": df_test}
if run_type not in splits_by_run_type:
    msg_stop = f"ERROR run_type must be 'train', 'validate', or 'test', run_type is: {run_type}"
    raise SystemExit(msg_stop)
# Work on a copy so the split frames themselves stay untouched.
df = splits_by_run_type[run_type].copy()

# Show the last rows of the frame used for this run, then the row counts of
# that frame and of each split.
print(f"run_type: {run_type}, df.tail(3):\n{df.tail(3)}\n")
len_df, len_df_train, len_df_val, len_df_test = map(
    len, (df, df_train, df_val, df_test)
)
print(f"run_type: {run_type}, len(df): {len_df}")
print(f"len_df_train: {len_df_train}, len_df_val: {len_df_val}, len_df_test: {len_df_test}")

# n_samples slices over the rows of df, sized for the longest lookback
# window and the evaluation period.
max_lookback_slices = random_slices(
    len_df,
    n_samples=n_samples,
    days_lookback=max_days_lookbacks,
    days_eval=days_eval,
    verbose=False,
)
# One set per max-lookback slice, each holding one slice per entry in
# days_lookbacks (n_samples * len(days_lookbacks) slices in total).
sets_lookback_slices = lookback_slices(
    max_slices=max_lookback_slices,
    days_lookbacks=days_lookbacks,
    verbose=False,
)

if verbose:
    # Single print call; the arguments are emitted one per line.
    print(
        f"number of max_lookback_slices is equal to n_samples = {n_samples}",
        f"max_lookback_slices:\n{max_lookback_slices}\n",
        f"number of sets in sets_lookback_slices is equal to n_samples = {n_samples}",
        f"sets_lookback_slices:\n{sets_lookback_slices}\n",
        f"days_lookbacks: {days_lookbacks}",
        f'number of tuples in each "set of lookback slices" is equal to len(days_lookbacks): {len(days_lookbacks)}',
        sep="\n",
    )

# Generate grp_top_set_syms_n_freq, a list of sub-lists, e.g.:
#   [[('AGY', 7), ('PCG', 7), ('KDN', 6), ..., ('CYT', 3)], ...,
#    [('FCN', 9), ('HIG', 9), ('SJR', 8), ..., ('BFH', 2)]]
# There is one sub-list per tuple in max_lookback_slices (n_samples in total).
# Each sub-list holds n_top_syms (symbol, frequency) pairs sorted by
# descending frequency, where the frequency is the number of times the symbol
# appears in the top n_top_syms performance rankings of CAGR/UI, CAGR/retnStd
# and retnStd/UI. The symbols in a sub-list are therefore the best performing
# symbols for the periods in days_lookbacks.
grp_top_set_syms_n_freq, grp_top_set_syms, dates_end_df_train = get_grp_top_syms_n_freq(
    df,
    sets_lookback_slices,
    days_lookbacks,
    n_top_syms,
    syms_start,
    syms_end,
    verbose,
)

# Print the best performing symbols for each set in sets_lookback_slices.
for i, top_set_syms_n_freq in enumerate(grp_top_set_syms_n_freq):
    l_sym_freq_cnt = top_set_sym_freq_cnt(top_set_syms_n_freq)
    if not verbose:
        continue

    print(f"max_lookback_slices:             {max_lookback_slices}")
    print(
        f"set_lookback_slices {i + 1} of {len(sets_lookback_slices):>3}:    {sets_lookback_slices[i]}\n"
    )
    print(f"max_days_lookbacks:              {max_days_lookbacks}")
    print(f"df end date for days_lookbacks:  {dates_end_df_train[i]}")
    print(f"days_lookbacks:                  {days_lookbacks}")
    # Entries 0..13 of l_sym_freq_cnt are printed under the labels
    # sym_freq_15 down to sym_freq_2; values start at column 33 like the
    # lines above, and the last line is followed by a blank line.
    for freq_position, freq_label in enumerate(range(15, 1, -1)):
        line_label = f"sym_freq_{freq_label}:"
        line_end = "\n" if freq_label == 2 else ""
        print(f"{line_label:<33}{l_sym_freq_cnt[freq_position]}{line_end}")


# Evaluate performance of symbols in set_lookback_slices versus SPY.
# The call returns the rows that are printed below (and stored when
# store_results is set).
l_row_add_total = eval_grp_top_syms_n_freq(
    df,
    max_lookback_slices,
    sets_lookback_slices,
    grp_top_set_syms_n_freq,
    days_lookbacks,
    days_eval,
    n_samples,
    n_top_syms,
    syms_start,
    syms_end,
    verbose,
)

# One line of output per result row.
for row_add_total in l_row_add_total:
    print(f"row_add_total: {row_add_total}")

if store_results:  # record results to df
    # Load the previously stored results, append this run's rows, write the
    # frame back, then reload it to show what is now on disk.
    df_eval_results = pickle_load(path_data_dump, fp_df_eval_results)
    print(f"df_eval_results BEFORE store results:\n{df_eval_results}\n")

    for row_add_total in l_row_add_total:
        # NOTE(review): assumes df_eval_results has a 0..n-1 integer index so
        # that len(df_eval_results) is always a new label - confirm.
        df_eval_results.loc[len(df_eval_results)] = row_add_total
        print(f"appended row_add_total to df_eval_results:\n{row_add_total}\n")

    pickle_dump(df_eval_results, path_data_dump, fp_df_eval_results)
    print(f"Save results to: {fp_df_eval_results}")
    # Reload from the dump so the printed frame is the one that was saved.
    df_eval_results = pickle_load(path_data_dump, fp_df_eval_results)
    print(f"df_eval_results AFTER store results:\n{df_eval_results}\n")


verbose : True
store_results: True
run_type: train
n_samples: 2
days_lookbacks: [15, 30, 60, 120]
days_eval: 4
n_top_syms: 20
syms_start: 0
syms_end: 10
fp_df_eval_results: test_df_eval
max_days_lookbacks: 120
run_type: train, df.tail(3):
                     A         AA        AAL         AAP        AAPL         AB  ...          ZG       ZION        ZTO         ZTS       ZUMZ        ZWS
Date                                                                             ...                                                                    
2021-06-15  143.119141  35.880054  22.790001  190.966675  128.315628  39.616650  ...  111.209999  51.980713  30.783392  183.434158  45.570000  50.638294
2021-06-16  142.099823  34.960556  22.830000  187.407333  128.820419  39.607914  ...  108.839996  52.562309  30.199211  181.892944  45.959999  51.023483
2021-06-17  145.395294  33.200668  22.230000  185.943817  130.443665  39.005150  ...  113.660004  49.215782  29.733847  184.194885  44.130001  48.998

In [16]:
# Display the DataFrame selected for this run (the copy chosen by run_type).
df

Unnamed: 0_level_0,A,AA,AAL,AAP,AAPL,AB,...,ZG,ZION,ZTO,ZTS,ZUMZ,ZWS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-04-18,50.734451,31.401224,42.429222,132.239700,33.128162,13.696466,...,35.330002,34.190353,11.961975,51.636120,16.500000,21.610952
2017-04-19,50.772724,30.946419,43.000633,133.656754,33.006157,13.666758,...,35.799999,34.319107,12.065994,51.925400,16.850000,21.659666
2017-04-20,50.830112,31.678062,43.794777,137.542023,33.419090,13.696466,...,35.950001,35.263355,12.122731,52.349674,17.799999,22.088379
2017-04-21,50.964058,31.341906,43.726997,137.804779,33.379200,13.607337,...,36.160000,35.065918,12.302398,52.253254,18.200001,22.068893
2017-04-24,52.083416,32.933723,44.976330,138.048828,33.700642,13.518205,...,36.480000,36.018761,12.368589,53.024654,18.400000,22.702221
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-11,142.040451,36.957741,23.530001,189.805603,126.049034,40.097115,...,113.470001,52.695789,31.288363,180.005920,46.939999,50.638294
2021-06-14,142.891541,36.532593,22.990000,188.996658,129.147064,39.756416,...,116.599998,51.504002,31.219053,182.357269,46.009998,50.440754
2021-06-15,143.119141,35.880054,22.790001,190.966675,128.315628,39.616650,...,111.209999,51.980713,30.783392,183.434158,45.570000,50.638294
2021-06-16,142.099823,34.960556,22.830000,187.407333,128.820419,39.607914,...,108.839996,52.562309,30.199211,181.892944,45.959999,51.023483


In [18]:
# Display the slices returned by random_slices, one tuple per sample.
max_lookback_slices

[(520, 640, 644), (231, 351, 355)]

In [19]:
# Display the sets returned by lookback_slices: one set per max-lookback
# slice, one tuple per entry in days_lookbacks.
sets_lookback_slices

[[(625, 640, 644), (610, 640, 644), (580, 640, 644), (520, 640, 644)],
 [(336, 351, 355), (321, 351, 355), (291, 351, 355), (231, 351, 355)]]