#### The code finds the recent dates in df_picks that are missing from df_model_top_picks. For those missing dates, it finds the stock picks in df_picks that meet 'criteria_model_top_picks' and appends them to df_model_top_picks.

In [1]:
import pandas as pd
from itertools import product
from ast import literal_eval
from datetime import datetime
from myUtils import pickle_load, pickle_dump

# Widen pandas' console display so the wide picks tables print legibly.
for _option, _value in {
    'display.max_rows': 100,
    'display.max_columns': 30,
    'display.max_colwidth': 30,
    'display.width': 900,
}.items():
    pd.set_option(_option, _value)

# Root data directory and the sub-directory passed to pickle_load / pickle_dump.
path_dir = "C:/Users/ping/MyDrive/stocks/yfinance/"
path_data_dump = f"{path_dir}VSCode_dump/"

# Names of the pickled DataFrames inside path_data_dump.
fp_df_picks = 'df_picks'  # stock picks by criteria: CAGR/UI, CAGR/rtn_std, rtd/UI
fp_df_model_top_picks = 'df_model_top_picks'  # top stock picks from model developed by back test

# Verbosity toggle (flip to False to switch it off); no cell visible in this section reads it.
verbose = True

In [2]:
# Ranked (days_lookbacks, sym_freq column) pairs chosen by the back test, best first.
criteria_model_top_picks = [
    ([30, 60, 120], 'sym_freq_9'),
    ([15, 30, 60, 120], 'sym_freq_12'),
    ([30, 60, 120], 'sym_freq_8'),
]

# Human-readable summary of the ranking; the first tuple is the header row.
_ranking_table = [
    ('performance', 'days_lookbacks', 'sym_freq', 'days_eval'),
    ('1st', '[30, 60, 120]', 'sym_freq_9', 'days_eval = 4'),
    ('2nd', '[15, 30, 60, 120]', 'sym_freq_12', 'days_eval = 5'),
    ('3rd', '[30, 60, 120]', 'sym_freq_8', 'days_eval = 4'),
]

print('Model predicts the best performing symbols are from df_filtered_picks  columns:')
for _col_rank, _col_lookbacks, _col_freq, _col_eval in _ranking_table:
    # Fixed column widths (14/20/14/20) keep the table aligned.
    print(f'{_col_rank:<14}{_col_lookbacks:<20}{_col_freq:<14}{_col_eval:<20}')

print(f'\nmodel criteria for top picks:\n{criteria_model_top_picks}')

Model predicts the best performing symbols are from df_filtered_picks  columns:
performance   days_lookbacks      sym_freq      days_eval           
1st           [30, 60, 120]       sym_freq_9    days_eval = 4       
2nd           [15, 30, 60, 120]   sym_freq_12   days_eval = 5       
3rd           [30, 60, 120]       sym_freq_8    days_eval = 4       

model criteria for top picks:
[([30, 60, 120], 'sym_freq_9'), ([15, 30, 60, 120], 'sym_freq_12'), ([30, 60, 120], 'sym_freq_8')]


#### Criteria of the best stocks from the model

In [3]:
# Lookback-period sets (in days). A later cell compares str() of each set
# against the "days_lookbacks" column of df_filtered_picks.
my_days_lookbacks = [[30, 60, 120], [15, 30, 60, 120]]

# Symbol-frequency columns of interest (not referenced by the code cells visible in this section).
my_cols = ['sym_freq_12', 'sym_freq_9', 'sym_freq_8']


In [4]:
# Read the previously saved model top picks from the pickle dump directory.
df_model_top_picks = pickle_load(path_data_dump, fp_df_model_top_picks)

# Report the row count, then preview the first and the last five rows.
print(f'Length of df_model_top_picks: {len(df_model_top_picks)}')
print(f'{df_model_top_picks.head()} \n')  # trailing " \n" leaves a blank line before the tail
print(df_model_top_picks.tail())


Length of df_model_top_picks: 1509
         date  days_lookback   syms_freq         symbols
0  2024-01-26  [30, 60, 120]  sym_freq_9         ['SHV']
1  2024-01-26  [30, 60, 120]  sym_freq_8        ['FTSM']
2  2024-01-25  [30, 60, 120]  sym_freq_9         ['SHV']
3  2024-01-24  [30, 60, 120]  sym_freq_8         ['SHV']
4  2024-01-23  [30, 60, 120]  sym_freq_8  ['ANF', 'SHV'] 

            date  days_lookback   syms_freq   symbols
1504  2018-07-12  [30, 60, 120]  sym_freq_8  ['TNDM']
1505  2018-07-11  [30, 60, 120]  sym_freq_8  ['TNDM']
1506  2018-07-10  [30, 60, 120]  sym_freq_8  ['TNDM']
1507  2018-07-09  [30, 60, 120]  sym_freq_8  ['TNDM']
1508  2018-07-06  [30, 60, 120]  sym_freq_8  ['TNDM']


In [5]:
# Every date string currently stored in the model frame.
dates_in_model = df_model_top_picks['date'].tolist()

# Pick the most recent one; each string is parsed as YYYY-MM-DD for the comparison,
# but the value kept is the original string.
latest_date_in_model = max(
    dates_in_model,
    key=lambda date_str: datetime.strptime(date_str, "%Y-%m-%d"),
)

print(f"Latest date in model: {latest_date_in_model}")

Latest date in model: 2024-01-26


#### Clean df_picks data:<br>- drop duplicates<br>- sort on date<br>- re-index<br>- save

In [6]:
# Columns that identify one pick record; used for both de-duplication and ordering.
_pick_key_cols = ['date_end_df_train', 'max_days_lookbacks', 'days_lookbacks']

# 1. Read the picks from their pickle file
print(f"Load the DataFrame from a pickle file {fp_df_picks}")
df_picks = pickle_load(path_data_dump, fp_df_picks)

# 2. For rows sharing the same key columns keep only the last one
print("Dropping duplicates, keeping last occurrence...")
df_picks = df_picks.drop_duplicates(subset=_pick_key_cols, keep='last')

# 3. Order by the key columns, all descending
print("Sorting by date, max lookback, then lookback (descending)...")
df_picks = df_picks.sort_values(by=_pick_key_cols, ascending=False)

# 4. Renumber the rows 0..n-1, discarding the old index
print("Resetting index...")
df_picks = df_picks.reset_index(drop=True)

# 5. Write the cleaned frame back over the same pickle file it was loaded from
print(f"Save the modified DataFrame back to a pickle file {fp_df_picks}")
pickle_dump(df_picks, path_data_dump, fp_df_picks)

# 6. Show the resulting row count and the frame itself
print(f"Number of rows: {len(df_picks)}")
print(df_picks)

Load the DataFrame from a pickle file df_picks
Dropping duplicates, keeping last occurrence...
Sorting by date, max lookback, then lookback (descending)...
Resetting index...
Save the modified DataFrame back to a pickle file df_picks
Number of rows: 2806
     date_end_df_train  max_days_lookbacks     days_lookbacks sym_freq_15 sym_freq_14 sym_freq_13 sym_freq_12 sym_freq_11 sym_freq_10        sym_freq_9                     sym_freq_8                sym_freq_7                     sym_freq_6                     sym_freq_5                     sym_freq_4                     sym_freq_3 sym_freq_2
0           2024-01-29                 120      [30, 60, 120]          []          []          []          []          []          []           ['SHV']                ['ANF', 'FTSM']                        []                ['CYTK', 'PGR']   ['AGO', 'FATE', 'NTNX', 'X']                             []  ['BK', 'GDDY', 'HA', 'HY',...         []
1           2024-01-29                 120  [15, 30, 60, 

In [7]:
# Build df_filtered_picks: the rows of df_picks dated strictly after the latest
# date already stored in df_model_top_picks.

# Convert the date column to datetime so the comparison is chronological.
# Note: this converts the column of df_picks itself, as before.
df_picks['date_end_df_train'] = pd.to_datetime(df_picks['date_end_df_train'])

# Keep only rows newer than latest_date_in_model (exclusive).
mask_newer_than_model = df_picks['date_end_df_train'] > pd.Timestamp(latest_date_in_model)

# Take an explicit copy of the selection. Assigning into a plain slice is what
# triggers SettingWithCopyWarning; copying removes the cause, so the warning
# no longer has to be switched off for the whole kernel.
df_filtered_picks = df_picks.loc[mask_newer_than_model].copy()

# Convert the date column back to '%Y-%m-%d' strings; later cells compare it with date strings.
df_filtered_picks['date_end_df_train'] = df_filtered_picks['date_end_df_train'].dt.strftime('%Y-%m-%d')

# Print the filtered DataFrame
print(df_filtered_picks)

  date_end_df_train  max_days_lookbacks     days_lookbacks sym_freq_15 sym_freq_14 sym_freq_13 sym_freq_12 sym_freq_11 sym_freq_10 sym_freq_9       sym_freq_8               sym_freq_7        sym_freq_6                    sym_freq_5              sym_freq_4                     sym_freq_3 sym_freq_2
0        2024-01-29                 120      [30, 60, 120]          []          []          []          []          []          []    ['SHV']  ['ANF', 'FTSM']                       []   ['CYTK', 'PGR']  ['AGO', 'FATE', 'NTNX', 'X']                      []  ['BK', 'GDDY', 'HA', 'HY',...         []
1        2024-01-29                 120  [15, 30, 60, 120]          []          []          []          []     ['SHV']          []         []  ['ANF', 'FTSM']  ['FATE', 'NTNX', 'PGR']  ['CYTK', 'JNPR']  ['AGO', 'NVDA', 'SMCI', 'X']  ['AMD', 'NOW', 'SPLK']  ['ASML', 'BK', 'BSIG', 'CB...         []


#### Get the dates in df_filtered_picks where the value in column "days_lookbacks" matches an item in my_days_lookbacks.

In [8]:
# Collect the distinct dates of all rows whose "days_lookbacks" value equals the
# string form of one of my_days_lookbacks, ordered newest first.
dates_in_days_lookbacks = sorted(
    {
        _date_str
        for _lookback in my_days_lookbacks
        for _date_str in df_filtered_picks.loc[
            df_filtered_picks['days_lookbacks'] == str(_lookback),  # column is compared as text
            'date_end_df_train',
        ].tolist()
    },
    reverse=True,
)

# Show the dates together with how many there are
print(f'dates_in_days_lookbacks, len({len(dates_in_days_lookbacks)}):\n{dates_in_days_lookbacks}')


dates_in_days_lookbacks, len(1):
['2024-01-29']


#### Create a list of tuples of all combinations of dates_in_days_lookbacks and criteria_model_top_picks

In [9]:
# Pair every date with every (lookback, sym_freq column) criterion.
# Dates vary slowest and criteria fastest, i.e. the same order as a Cartesian product.
date_lookback_symfreq = [
    (date_str, criterion)
    for date_str in dates_in_days_lookbacks
    for criterion in criteria_model_top_picks
]

# Show the combinations together with how many there are
print(f'date_lookback_symfreq, len({len(date_lookback_symfreq)}):\n{date_lookback_symfreq}')


date_lookback_symfreq, len(3):
[('2024-01-29', ([30, 60, 120], 'sym_freq_9')), ('2024-01-29', ([15, 30, 60, 120], 'sym_freq_12')), ('2024-01-29', ([30, 60, 120], 'sym_freq_8'))]


#### For each combination in date_lookback_symfreq, get the symbols in its sym_freq column from the row where "date_end_df_train" equals the date and "days_lookbacks" equals the lookback list.

In [10]:
# Symbols picked for each (date, (lookback, sym_freq column)) combination.
# Each entry is (position, symbols). The list must stay index-aligned with
# date_lookback_symfreq because the two are zipped together afterwards.
picks = []

# Date of the previous combination; used to print each date only once
prev_date = ''

for i, (_date, (my_days_lookback, my_col)) in enumerate(date_lookback_symfreq):
    # Rows for this date whose "days_lookbacks" text equals this lookback set
    mask = (df_filtered_picks['date_end_df_train'] == _date) & (df_filtered_picks['days_lookbacks'] == str(my_days_lookback))

    # Cell values of the criterion column for the matching row(s)
    matched_cells = df_filtered_picks.loc[mask, my_col].tolist()

    if matched_cells:
        # The cell holds the text form of a list, e.g. "['SHV']"; parse it into a real list
        my_list = literal_eval(matched_cells[0])
        note = ''
    else:
        # dates_in_days_lookbacks is a union over lookback sets, so a date may have
        # no row for this particular lookback set. Record an empty pick list instead
        # of failing on [0]; this also keeps picks aligned with date_lookback_symfreq.
        my_list = []
        note = '  (no matching row in df_filtered_picks)'

    # Print the picks, showing the date only on the first line of each date group
    if _date != prev_date:
        # Header before the very first group, blank separator before later groups
        print("\nModel Results:" if prev_date == '' else "\n")
        date_label = _date
    else:
        date_label = ' '
    print(f"{i:<6}{date_label:<14}{str(my_days_lookback):<21}{my_col:<11}: {my_list}{note}")

    # Store the symbols together with their position
    picks.append((i, my_list))

    # Remember the date for the next iteration
    prev_date = _date


Model Results:
0     2024-01-29    [30, 60, 120]        sym_freq_9 : ['SHV']
1                   [15, 30, 60, 120]    sym_freq_12: []
2                   [30, 60, 120]        sym_freq_8 : ['ANF', 'FTSM']


#### Create iterable z_items from date_lookback_symfreq and picks<br>Add picks to df_model_top_picks if dlb_sym_freq matches criteria_model_top_picks and _syms contains symbol(s)


In [11]:
print(f'before add len(df_model_top_picks): {len(df_model_top_picks)}')

# Pair each (date, (lookback, sym_freq)) combination with its (counter, symbols) entry
z_items = zip(date_lookback_symfreq, picks)

# Unpack the nested tuples directly in the loop header
for i, ((_date, (_days_lookback, _sym_freq)), (_counter, _syms)) in enumerate(z_items):
    # Current row count, used as the index label of the row to append
    # (assumes the index runs 0..n-1 so this label is unused - TODO confirm)
    len_df_model_top_picks = len(df_model_top_picks)

    # Show the item being processed
    print(f"{i}, {_date}, {_days_lookback}, {_sym_freq}, {_syms}")

    # (lookback, sym_freq) pair to check against the model criteria
    dlb_sym_freq = (_days_lookback, _sym_freq)

    # Nothing to add when the pair is not a model criterion or no symbols were picked
    if dlb_sym_freq not in criteria_model_top_picks or not _syms:
        continue

    row_add = [_date, _days_lookback, _sym_freq, _syms]
    df_model_top_picks.loc[len_df_model_top_picks] = row_add  # append as a new row
    print(f"\nAdded row {len_df_model_top_picks} to df_model_top_picks:\n{row_add}\n")

print(f'after add len(df_model_top_picks): {len(df_model_top_picks)}')


before add len(df_model_top_picks): 1509
0, 2024-01-29, [30, 60, 120], sym_freq_9, ['SHV']

Added row 1509 to df_model_top_picks:
['2024-01-29', [30, 60, 120], 'sym_freq_9', ['SHV']]

1, 2024-01-29, [15, 30, 60, 120], sym_freq_12, []
2, 2024-01-29, [30, 60, 120], sym_freq_8, ['ANF', 'FTSM']

Added row 1510 to df_model_top_picks:
['2024-01-29', [30, 60, 120], 'sym_freq_8', ['ANF', 'FTSM']]

after add len(df_model_top_picks): 1511


In [12]:
# Normalise df_model_top_picks in one pass:
#   - cast every column to str, so the list-valued cells of newly added rows become comparable text
#   - drop exact duplicate rows, keeping the last occurrence
#   - order by date, days_lookback and syms_freq, all descending
#   - renumber the rows 0..n-1, discarding the old index
df_model_top_picks = (
    df_model_top_picks
    .astype(str)
    .drop_duplicates(keep='last')
    .sort_values(by=['date', 'days_lookback', 'syms_freq'], ascending=False)
    .reset_index(drop=True)
)

print(f'df_model_top_picks:\n{df_model_top_picks}')

df_model_top_picks:
            date  days_lookback   syms_freq          symbols
0     2024-01-29  [30, 60, 120]  sym_freq_9          ['SHV']
1     2024-01-29  [30, 60, 120]  sym_freq_8  ['ANF', 'FTSM']
2     2024-01-26  [30, 60, 120]  sym_freq_9          ['SHV']
3     2024-01-26  [30, 60, 120]  sym_freq_8         ['FTSM']
4     2024-01-25  [30, 60, 120]  sym_freq_9          ['SHV']
...          ...            ...         ...              ...
1506  2018-07-12  [30, 60, 120]  sym_freq_8         ['TNDM']
1507  2018-07-11  [30, 60, 120]  sym_freq_8         ['TNDM']
1508  2018-07-10  [30, 60, 120]  sym_freq_8         ['TNDM']
1509  2018-07-09  [30, 60, 120]  sym_freq_8         ['TNDM']
1510  2018-07-06  [30, 60, 120]  sym_freq_8         ['TNDM']

[1511 rows x 4 columns]


#### Cleanup df_model_top_picks<br>- convert data to str<br>- drop duplicates, keep last value<br>- sort date, newest first<br>- re-index

In [13]:
# Normalise df_model_top_picks before saving:
#   - cast every column to str
#   - drop exact duplicate rows, keeping the last occurrence
#   - order by date, days_lookback and syms_freq, all descending
#   - renumber the rows 0..n-1, discarding the old index
df_model_top_picks = (
    df_model_top_picks
    .astype(str)
    .drop_duplicates(keep='last')
    .sort_values(by=['date', 'days_lookback', 'syms_freq'], ascending=False)
    .reset_index(drop=True)
)

# Write the frame back over the pickle file it was loaded from
pickle_dump(df_model_top_picks, path_data_dump, fp_df_model_top_picks)

# Report the row count, the file name and the frame itself
print(f'Length of df_model_top_picks: {len(df_model_top_picks)}')
print(f'File path: {fp_df_model_top_picks}\nDataFrame contents:\n{df_model_top_picks}')


Length of df_model_top_picks: 1511
File path: df_model_top_picks
DataFrame contents:
            date  days_lookback   syms_freq          symbols
0     2024-01-29  [30, 60, 120]  sym_freq_9          ['SHV']
1     2024-01-29  [30, 60, 120]  sym_freq_8  ['ANF', 'FTSM']
2     2024-01-26  [30, 60, 120]  sym_freq_9          ['SHV']
3     2024-01-26  [30, 60, 120]  sym_freq_8         ['FTSM']
4     2024-01-25  [30, 60, 120]  sym_freq_9          ['SHV']
...          ...            ...         ...              ...
1506  2018-07-12  [30, 60, 120]  sym_freq_8         ['TNDM']
1507  2018-07-11  [30, 60, 120]  sym_freq_8         ['TNDM']
1508  2018-07-10  [30, 60, 120]  sym_freq_8         ['TNDM']
1509  2018-07-09  [30, 60, 120]  sym_freq_8         ['TNDM']
1510  2018-07-06  [30, 60, 120]  sym_freq_8         ['TNDM']

[1511 rows x 4 columns]


In [14]:
# Preview the first 20 rows (the newest dates, given the descending sort applied before saving),
# labelled with the pickle file name.
_head_preview = df_model_top_picks.head(20)
print(f'{fp_df_model_top_picks}.head(20):\n{_head_preview}')

df_model_top_picks.head(20):
          date      days_lookback    syms_freq           symbols
0   2024-01-29      [30, 60, 120]   sym_freq_9           ['SHV']
1   2024-01-29      [30, 60, 120]   sym_freq_8   ['ANF', 'FTSM']
2   2024-01-26      [30, 60, 120]   sym_freq_9           ['SHV']
3   2024-01-26      [30, 60, 120]   sym_freq_8          ['FTSM']
4   2024-01-25      [30, 60, 120]   sym_freq_9           ['SHV']
5   2024-01-24      [30, 60, 120]   sym_freq_8           ['SHV']
6   2024-01-23      [30, 60, 120]   sym_freq_8    ['ANF', 'SHV']
7   2024-01-22      [30, 60, 120]   sym_freq_8           ['SHV']
8   2024-01-19      [30, 60, 120]   sym_freq_8   ['CYTK', 'SHV']
9   2024-01-18      [30, 60, 120]   sym_freq_9           ['SHV']
10  2024-01-18      [30, 60, 120]   sym_freq_8          ['CYTK']
11  2024-01-17      [30, 60, 120]   sym_freq_9           ['SHV']
12  2024-01-17      [30, 60, 120]   sym_freq_8  ['CYTK', 'FTSM']
13  2024-01-16      [30, 60, 120]   sym_freq_9   ['CYTK', 'SH