In [1]:
import sys
from pathlib import Path

# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up live while the notebook is being worked on.
%load_ext autoreload
%autoreload 2

# Locate the project root: when the kernel's working directory is named
# `notebooks` the root is its parent; otherwise the working directory itself is
# treated as the root.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Make modules under <root>/src importable. The membership check keeps this cell
# idempotent: re-running it does not append a duplicate entry to sys.path.
SRC_DIR = str(ROOT_DIR / 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Print the resulting search path so the src entry can be verified by eye.
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils

# Data files live in the sibling `data` directory, addressed relative to the
# kernel's working directory. The paths are assembled with pathlib instead of
# backslash string literals: '\d' is not a valid Python escape sequence (newer
# interpreters warn about it), and a hard-coded '\' separator only resolves on
# Windows. On Windows these still render as '..\data\<file>'.
DATA_DIR = Path('..') / 'data'

# Input pickles.
SOURCE_PATH_OHLCV = DATA_DIR / 'df_OHLCV_2025-03-14_clean.pkl'
SOURCE_PATH = DATA_DIR / 'df_finviz_n_ratios_merged.pkl'

# Output pickles.
PICKLE_PATH_COVARIANCE = DATA_DIR / 'df_cov_emv_matrix.pkl'
PICKLE_PATH_CORRELATION = DATA_DIR / 'df_corr_emv_matrix.pkl'

Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\python310.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src']


In [2]:
import pandas as pd

# Read the two input frames from their pickle files.
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
df_ohlcv = pd.read_pickle(SOURCE_PATH_OHLCV)
df_finviz = pd.read_pickle(SOURCE_PATH)

# Report the dimensions of each loaded frame, one per line.
print(
    f"Loaded df_ohlcv shape: {df_ohlcv.shape}",
    f"Loaded df_finviz shape: {df_finviz.shape}",
    sep="\n",
)


Loaded df_ohlcv shape: (344750, 9)
Loaded df_finviz shape: (1379, 54)


In [3]:
# Build the ticker universe from the Finviz table's index only (df_ohlcv is not
# consulted here). Index.unique() drops duplicates while keeping first-seen
# order, so the list is the same on every run; the previous list(set(...))
# produced an arbitrary order.
tickers = df_finviz.index.unique().tolist()
print(f"Total unique tickers: {len(tickers)}")

Total unique tickers: 1379


In [4]:
# Reshape the 'Adj Close' column to wide form by moving index level 0 into the
# columns (the rendered frame shows Symbol columns against a Date index).
df_close = df_ohlcv['Adj Close'].unstack(level=0)

# Keep only the columns whose label also appears in `tickers`.
common_tickers = df_close.columns.intersection(tickers)
df_close = df_close.loc[:, common_tickers]

# Entries of `tickers` that ended up without a column in df_close.
missing_tickers = list(set(tickers).difference(df_close.columns))

# Report coverage: either everything was found, or list what is absent.
if not missing_tickers:
    print("All tickers in your list are present in df_close.")
else:
    print(f"{len(missing_tickers)} tickers not found in df_close:")
    print(missing_tickers)

print(f"\nShape of filtered df_close: {df_close.shape}")
display(df_close.head())

All tickers in your list are present in df_close.

Shape of filtered df_close: (250, 1379)


Symbol,UBS,INTC,ADI,IBN,TD,SAN,MELI,SMFG,LRCX,KKR,...,WBA,DTM,EHC,BROS,ERJ,WIX,HESM,AOS,BEPC,PCVX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-14,32.73,24.05,208.75,29.29,58.93,6.77,2021.37,15.9,78.69,113.31,...,11.23,95.79,95.98,62.48,52.33,172.59,42.09,66.85,28.22,74.37
2025-03-13,31.71,23.7,203.8,28.95,57.67,6.45,1989.7,15.48,75.26,108.69,...,11.26,93.33,93.68,59.61,51.0,172.83,40.68,65.45,28.32,73.02
2025-03-12,31.94,20.68,206.62,28.79,58.33,6.5,2004.83,15.29,76.59,114.37,...,11.2,93.43,94.28,62.31,50.6,175.0,40.64,65.64,28.93,73.24
2025-03-11,31.38,19.78,209.08,28.52,57.67,6.39,1990.11,14.89,73.36,112.11,...,11.21,89.9,96.99,60.17,50.59,175.87,40.63,67.11,28.3,69.85
2025-03-10,31.88,19.93,216.26,28.09,58.75,6.39,1944.61,14.91,73.81,108.17,...,11.28,86.95,99.11,58.41,51.0,170.4,40.34,69.61,28.61,68.89


In [5]:
# Order the rows by index (ascending) so each row follows the previous date.
df_close_sorted = df_close.sort_index()

# Daily returns: percentage change from the previous row, after which every row
# that still contains a NaN is dropped (the first row has no predecessor).
df_close_returns = (
    df_close_sorted
    .pct_change()
    .dropna()
)

# Show shape and a preview of the sorted prices, then of the returns.
print(f'df_close_sorted shape: {df_close_sorted.shape}')
display(df_close_sorted.head())
print(f'df_close_returns shape: {df_close_returns.shape}')
display(df_close_returns.head())


df_close_sorted shape: (250, 1379)


Symbol,UBS,INTC,ADI,IBN,TD,SAN,MELI,SMFG,LRCX,KKR,...,WBA,DTM,EHC,BROS,ERJ,WIX,HESM,AOS,BEPC,PCVX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-03-15,31.11,42.2,192.01,25.6,57.05,4.23,1499.51,11.52,88.5,96.14,...,19.46,56.01,74.21,33.94,23.33,135.57,32.66,85.61,22.72,69.7
2024-03-18,30.74,42.27,188.1,25.73,56.29,4.27,1509.05,11.63,89.98,94.6,...,19.34,55.98,75.44,34.15,23.48,138.52,32.78,84.4,22.27,68.04
2024-03-19,30.35,41.62,187.43,25.68,56.55,4.3,1519.44,11.54,90.54,95.21,...,19.36,57.07,76.7,33.71,25.07,136.52,32.84,85.3,22.26,67.6
2024-03-20,30.42,41.76,191.15,25.96,57.11,4.36,1528.84,11.67,92.29,98.03,...,19.72,56.85,76.11,34.24,25.48,141.79,32.8,86.15,22.86,69.13
2024-03-21,30.45,41.98,191.96,25.76,57.17,4.37,1555.69,11.71,95.55,101.05,...,19.45,57.85,76.74,35.62,24.73,139.89,33.25,88.35,22.87,67.64


df_close_returns shape: (249, 1379)


Symbol,UBS,INTC,ADI,IBN,TD,SAN,MELI,SMFG,LRCX,KKR,...,WBA,DTM,EHC,BROS,ERJ,WIX,HESM,AOS,BEPC,PCVX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-03-18,-0.011893,0.001659,-0.020364,0.005078,-0.013322,0.009456,0.006362,0.009549,0.016723,-0.016018,...,-0.006166,-0.000536,0.016575,0.006187,0.006429,0.02176,0.003674,-0.014134,-0.019806,-0.023816
2024-03-19,-0.012687,-0.015377,-0.003562,-0.001943,0.004619,0.007026,0.006885,-0.007739,0.006224,0.006448,...,0.001034,0.019471,0.016702,-0.012884,0.067717,-0.014438,0.00183,0.010664,-0.000449,-0.006467
2024-03-20,0.002306,0.003364,0.019847,0.010903,0.009903,0.013953,0.006186,0.011265,0.019328,0.029619,...,0.018595,-0.003855,-0.007692,0.015722,0.016354,0.038602,-0.001218,0.009965,0.026954,0.022633
2024-03-21,0.000986,0.005268,0.004238,-0.007704,0.001051,0.002294,0.017562,0.003428,0.035323,0.030807,...,-0.013692,0.01759,0.008277,0.040304,-0.029435,-0.0134,0.01372,0.025537,0.000437,-0.021554
2024-03-22,-0.000328,0.003573,-0.008387,0.000776,-0.007871,0.022883,0.010478,0.017079,-0.004814,-0.00861,...,-0.010797,-0.002074,-0.010295,-0.057552,0.080873,-0.003932,-0.003008,-0.006678,0.010057,-0.006062


In [6]:
# Ask the project helper for both matrices, built from the daily returns.
# NOTE(review): judging by its name and the `span` argument, the helper
# presumably applies exponential weighting, and span=21 is presumably a window
# measured in rows (trading days) — confirm against utils.get_cov_corr_ewm_matrices.
cov_emv_matrix, corr_emv_matrix = utils.get_cov_corr_ewm_matrices(
    df_close_returns,
    span=21,
    return_corr=True,
    return_cov=True,
)

# Report shape and a preview of each result, covariance first.
print(f'cov_emv_matrix shape: {cov_emv_matrix.shape}')
display(cov_emv_matrix.head())
print(f'corr_emv_matrix shape: {corr_emv_matrix.shape}')
display(corr_emv_matrix.head())

cov_emv_matrix shape: (1379, 1379)


Symbol,UBS,INTC,ADI,IBN,TD,SAN,MELI,SMFG,LRCX,KKR,...,WBA,DTM,EHC,BROS,ERJ,WIX,HESM,AOS,BEPC,PCVX
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UBS,0.000631,0.000281,0.00035,1.3e-05,0.000193,0.00049,0.000209,0.000284,0.000613,0.000632,...,0.000144,0.000221,9.6e-05,0.000696,0.000159,0.000441,0.000161,8.8e-05,1.7e-05,0.000204
INTC,0.000281,0.002817,6.6e-05,-1.1e-05,4.3e-05,9.4e-05,6.1e-05,0.00034,0.000349,5e-05,...,0.000163,0.000326,-5.2e-05,0.000611,-6.9e-05,0.000257,0.000178,-9.5e-05,-0.000135,0.000246
ADI,0.00035,6.6e-05,0.000589,2.4e-05,0.000144,0.000298,0.000123,0.000163,0.000509,0.000294,...,6.2e-05,0.000132,0.000127,0.000461,6.2e-05,0.000141,0.000125,0.000178,0.00012,0.000129
IBN,1.3e-05,-1.1e-05,2.4e-05,9.9e-05,1.5e-05,4.3e-05,4.9e-05,5.1e-05,5.7e-05,0.000127,...,-0.000124,0.000101,1.5e-05,0.000118,4.5e-05,6.2e-05,2.2e-05,-3e-06,1.6e-05,0.000148
TD,0.000193,4.3e-05,0.000144,1.5e-05,0.000134,0.00019,5.7e-05,0.000105,0.000235,0.000208,...,-1e-06,6.2e-05,6.8e-05,0.000224,6.6e-05,7.8e-05,7.4e-05,8.7e-05,4.6e-05,0.000104


corr_emv_matrix shape: (1379, 1379)


Symbol,UBS,INTC,ADI,IBN,TD,SAN,MELI,SMFG,LRCX,KKR,...,WBA,DTM,EHC,BROS,ERJ,WIX,HESM,AOS,BEPC,PCVX
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UBS,1.0,0.21061,0.573803,0.052719,0.663496,0.778463,0.394446,0.705066,0.75135,0.705928,...,0.154528,0.383975,0.255013,0.532203,0.219512,0.567581,0.438889,0.217148,0.032174,0.284953
INTC,0.21061,1.0,0.051587,-0.019887,0.069431,0.070395,0.054298,0.398674,0.202098,0.02652,...,0.083103,0.268511,-0.065967,0.22129,-0.045056,0.156546,0.229177,-0.110979,-0.121568,0.162411
ADI,0.573803,0.051587,1.0,0.097973,0.511197,0.490459,0.240141,0.417838,0.645005,0.340436,...,0.068776,0.237253,0.349209,0.364788,0.089114,0.188023,0.353009,0.457317,0.236218,0.186363
IBN,0.052719,-0.019887,0.097973,1.0,0.128165,0.1727,0.234305,0.317786,0.17737,0.356167,...,-0.335963,0.440358,0.100849,0.227855,0.155555,0.201748,0.152629,-0.02022,0.074948,0.521594
TD,0.663496,0.069431,0.511197,0.128165,1.0,0.655009,0.231852,0.563382,0.624552,0.503258,...,-0.003187,0.234971,0.391389,0.372019,0.19621,0.216888,0.437436,0.469642,0.191367,0.316155


In [7]:
# Persist both matrices as pickles and print a structural summary of each.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in display() only rendered a stray "None".
cov_emv_matrix.to_pickle(PICKLE_PATH_COVARIANCE)
print(f'cov_emv_matrix pickled to {PICKLE_PATH_COVARIANCE}')
cov_emv_matrix.info()

corr_emv_matrix.to_pickle(PICKLE_PATH_CORRELATION)
print(f'corr_emv_matrix pickled to {PICKLE_PATH_CORRELATION}')
corr_emv_matrix.info()

cov_emv_matrix pickled to ..\data\df_cov_emv_matrix.pkl
<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, UBS to PCVX
Columns: 1379 entries, UBS to PCVX
dtypes: float64(1379)
memory usage: 14.6+ MB


None

corr_emv_matrix pickled to ..\data\df_corr_emv_matrix.pkl
<class 'pandas.core.frame.DataFrame'>
Index: 1379 entries, UBS to PCVX
Columns: 1379 entries, UBS to PCVX
dtypes: float64(1379)
memory usage: 14.6+ MB


None