In [1]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils

SOURCE_PATH_OHLCV = '..\data\df_OHLCV_2025-03-10_clean.pkl'
SOURCE_PATH_STOCK = '..\data\df_finviz_stocks_n_ratios.pkl'
SOURCE_PATH_ETF = '..\data\df_finviz_etfs_n_ratios.pkl'
PICKLE_PATH_COVARIANCE = '..\data\df_covariance_matrix.pkl'
PICKLE_PATH_CORRELATION = '..\data\df_correlation_matrix.pkl'

Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\python310.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src']


In [2]:
import pandas as pd

# Load the three pickled input frames from the paths configured in the setup cell.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
df_ohlcv = pd.read_pickle(SOURCE_PATH_OHLCV)
df_stocks = pd.read_pickle(SOURCE_PATH_STOCK)
df_etfs = pd.read_pickle(SOURCE_PATH_ETF)


# Report the shape of each frame as a quick sanity check on the load.
# (First message previously read "Load ..."; aligned with the other two.)
print(f"Loaded df_ohlcv shape: {df_ohlcv.shape}")
print(f"Loaded df_stocks shape: {df_stocks.shape}")
print(f"Loaded df_etfs shape: {df_etfs.shape}")

Load df_ohlcv shape: (620000, 9)
Loaded df_stocks shape: (1000, 43)
Loaded df_etfs shape: (420, 34)


In [3]:
# Union of the index labels of both frames, with duplicates removed.
# Sorted so the list order is the same on every kernel restart; a bare
# list(set(...)) order depends on string hashing and varies between runs.
# Assumes the labels are mutually comparable (ticker strings) - TODO confirm.
symbols = sorted(set(df_stocks.index) | set(df_etfs.index))
print(f"Total unique symbols: {len(symbols)}")

Total unique symbols: 1420


In [4]:
# Reshape the adjusted close series into a wide table by moving index level 0
# into the columns (the displayed result has one column per Symbol and one
# row per Date).
df_close = (
    df_ohlcv['Adj Close']
    .unstack(level=0)
)

print(f"Shape of df_close: {df_close.shape}")
display(df_close.head())

Shape of df_close: (250, 2480)


Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,BIZD,FPEI,FLQL,NVDY,FENY,SIVR,BSVO,NULG,AVSC,FCOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-10,227.48,106.98,380.16,194.54,167.81,165.87,597.99,222.15,497.1,744944.0,...,16.92,18.77,56.78,15.83,24.01,30.46,19.27,78.32,48.88,57.47
2025-03-07,239.07,112.69,393.31,199.25,175.75,173.86,625.66,262.67,495.62,742901.0,...,17.12,18.81,58.35,16.56,23.86,31.01,19.86,81.16,50.13,59.44
2025-03-06,235.33,110.57,396.89,200.7,174.21,172.35,627.93,263.45,497.84,747110.0,...,16.73,18.78,58.04,16.3,23.5,31.1,19.79,81.39,49.87,59.13
2025-03-05,235.74,117.3,401.02,208.36,174.99,173.02,656.47,279.1,498.5,746940.0,...,17.0,18.83,59.1,17.19,23.46,31.22,19.89,83.54,50.24,60.3
2025-03-04,235.93,115.99,388.61,203.8,172.61,170.92,640.0,272.04,495.86,742800.0,...,17.12,18.82,58.45,16.95,23.81,30.4,19.83,82.44,49.85,59.44


In [5]:
# Restrict the price table to the columns whose symbol is in `symbols`
# (the combined stock/ETF universe built in an earlier cell).
common_symbols = df_close.columns.intersection(symbols)
df_filtered = df_close[common_symbols]

# Universe symbols that have no price column. Sorted so the printed report is
# identical on every run; plain set order changes between kernel restarts.
missing_symbols = sorted(set(symbols) - set(df_filtered.columns))

# Print the missing symbols (if any)
if missing_symbols:
    print(f"{len(missing_symbols)} symbols not found in df_filtered:")
    print(missing_symbols)
else:
    print("All symbols in your list are present in df_filtered.")

print(f"\nShape of filtered df_close: {df_filtered.shape}")
display(df_filtered)

29 symbols not found in df_filtered:
['FER', 'ZK', 'RDY', 'TTAN', 'SOLV', 'SFD', 'TEM', 'ADM', 'LTM', 'BTC', 'GEV', 'SW', 'RDDT', 'SPMO', 'EQT', 'RBRK', 'IUSG', 'JGLO', 'TLN', 'WRB', 'ULS', 'VG', 'FNGA', 'SAIL', 'VIK', 'FNGU', 'ALAB', 'LINE', 'STRK']

Shape of filtered df_close: (250, 1391)


Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,JAVA,EPI,BSCR,IBDS,JPIE,HYMB,CGMU,USRT,VFLO,IWX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-10,227.48,106.98,380.16,194.54,167.81,165.87,597.99,222.15,497.10,744944.0,...,63.12,40.99,19.61,24.13,45.97,25.74,27.20,58.14,34.51,81.40
2025-03-07,239.07,112.69,393.31,199.25,175.75,173.86,625.66,262.67,495.62,742901.0,...,64.13,41.67,19.57,24.08,45.95,25.68,27.16,58.80,34.81,82.42
2025-03-06,235.33,110.57,396.89,200.70,174.21,172.35,627.93,263.45,497.84,747110.0,...,63.88,41.39,19.58,24.11,45.94,25.72,27.13,58.51,34.50,81.99
2025-03-05,235.74,117.30,401.02,208.36,174.99,173.02,656.47,279.10,498.50,746940.0,...,64.44,41.30,19.59,24.09,45.98,25.70,27.18,60.15,34.58,82.68
2025-03-04,235.93,115.99,388.61,203.80,172.61,170.92,640.00,272.04,495.86,742800.0,...,63.81,40.21,19.61,24.11,46.01,25.75,27.20,59.49,34.27,82.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-15,171.81,87.82,413.26,174.42,141.66,140.67,482.77,163.57,408.13,618134.0,...,57.26,42.46,18.43,22.66,42.61,24.51,26.26,51.33,30.82,72.92
2024-03-14,172.19,87.92,422.00,178.75,143.82,142.58,490.48,162.50,406.73,612500.0,...,57.37,42.67,18.43,22.68,42.64,24.54,26.25,51.43,30.72,72.99
2024-03-13,170.33,90.87,411.95,176.56,140.27,139.29,494.21,169.48,408.13,613900.0,...,57.71,42.37,18.48,22.71,42.74,24.60,26.31,52.20,30.87,73.38
2024-03-12,172.42,91.89,412.13,175.39,139.12,138.00,498.38,177.54,404.98,609710.0,...,57.65,43.87,18.49,22.73,42.73,24.54,26.30,52.50,30.61,73.24


In [6]:
# Pairwise covariance between every pair of columns in df_filtered.
# NOTE(review): df_filtered holds adjusted close price levels, so this is a
# covariance of prices rather than of returns - confirm that is intended.
covariance_matrix = df_filtered.cov()

# Render the resulting symbol-by-symbol table.
display(covariance_matrix)

Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,JAVA,EPI,BSCR,IBDS,JPIE,HYMB,CGMU,USRT,VFLO,IWX
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAPL,631.812346,415.372154,106.527309,307.590211,217.859957,216.446006,1241.996649,1689.796882,561.760914,805739.190529,...,67.723093,16.684166,8.522140,10.566939,23.335677,11.014896,7.550196,81.415110,33.567205,77.213681
NVDA,415.372154,368.092260,118.441204,224.963072,166.753004,165.955459,845.470133,1170.364451,345.957926,494868.812565,...,50.166416,12.822856,5.324985,6.617886,14.769030,7.058067,4.440458,54.108854,24.759583,54.659856
MSFT,106.527309,118.441204,245.836298,86.055265,98.742686,99.581638,5.803866,353.627960,-68.175486,-102566.670536,...,2.545830,16.511953,-0.165330,-0.211790,-0.214661,0.138635,-0.135273,0.618197,-0.713097,-2.283211
AMZN,307.590211,224.963072,86.055265,386.028735,222.944111,222.059393,1144.654590,1548.509821,312.282851,453130.287441,...,47.637788,-17.847818,4.357190,5.395184,13.551679,4.963949,3.178414,31.241771,29.874739,51.608556
GOOG,217.859957,166.753004,98.742686,222.944111,201.899019,200.828886,538.800135,919.389029,120.589601,172226.507680,...,22.615857,-3.128666,2.155218,2.686560,7.198898,2.236864,1.371080,10.530466,13.091160,22.517982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HYMB,11.014896,7.058067,0.138635,4.963949,2.236864,2.216476,26.133428,28.120975,12.832354,18469.268669,...,1.472901,0.257441,0.184238,0.228253,0.487298,0.255481,0.174787,1.898061,0.743280,1.750118
CGMU,7.550196,4.440458,-0.135273,3.178414,1.371080,1.355962,17.769889,18.620880,8.890733,12796.935201,...,0.966207,0.173535,0.128080,0.158633,0.337603,0.174787,0.123830,1.299832,0.491104,1.168800
USRT,81.415110,54.108854,0.618197,31.241771,10.530466,10.434340,180.694350,201.509417,100.541385,144848.440201,...,11.590729,3.040904,1.409965,1.746398,3.645410,1.898061,1.299832,15.831749,5.690293,13.748616
VFLO,33.567205,24.759583,-0.713097,29.874739,13.091160,12.979400,114.796173,139.967716,47.053932,68327.579421,...,6.146325,-1.160714,0.588485,0.728913,1.668799,0.743280,0.491104,5.690293,3.703458,6.997797


In [7]:
# Pairwise Pearson correlation between every pair of columns in df_filtered
# (Pearson is the pandas default, spelled out here for the reader).
# NOTE(review): computed on adjusted close price levels, not returns -
# confirm that is intended.
correlation_matrix = df_filtered.corr(method='pearson')

# Render the full symbol-by-symbol table (pandas truncates the view itself).
display(correlation_matrix)

Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,JAVA,EPI,BSCR,IBDS,JPIE,HYMB,CGMU,USRT,VFLO,IWX
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAPL,1.000000,0.861321,0.270299,0.622829,0.609981,0.609152,0.705040,0.772883,0.791605,0.782004,...,0.806033,0.257095,0.890827,0.891352,0.901533,0.866975,0.853594,0.814041,0.693933,0.792634
NVDA,0.861321,1.000000,0.393733,0.596792,0.611686,0.611904,0.628792,0.701320,0.638698,0.629246,...,0.782248,0.258875,0.729254,0.731366,0.747530,0.727827,0.657713,0.708803,0.670596,0.735128
MSFT,0.270299,0.393733,1.000000,0.279347,0.443215,0.449290,0.005282,0.259297,-0.154013,-0.159585,...,0.048575,0.407905,-0.027706,-0.028640,-0.013295,0.017493,-0.024517,0.009909,-0.023633,-0.037575
AMZN,0.622829,0.596792,0.279347,1.000000,0.798581,0.799520,0.831289,0.906103,0.562975,0.562629,...,0.725357,-0.351851,0.582687,0.582225,0.669790,0.499848,0.459714,0.399633,0.790116,0.677773
GOOG,0.609981,0.611686,0.443215,0.798581,1.000000,0.999837,0.541063,0.743885,0.300603,0.295693,...,0.476163,-0.085286,0.398532,0.400889,0.491988,0.311453,0.274210,0.186258,0.478748,0.408917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HYMB,0.866975,0.727827,0.017493,0.499848,0.311453,0.310208,0.737741,0.639623,0.899244,0.891413,...,0.871774,0.197279,0.957721,0.957483,0.936203,1.000000,0.982692,0.943770,0.764133,0.893430
CGMU,0.853594,0.657713,-0.024517,0.459714,0.274210,0.272587,0.720542,0.608360,0.894902,0.887160,...,0.821424,0.191011,0.956328,0.955820,0.931642,0.982692,1.000000,0.928346,0.725199,0.857038
USRT,0.814041,0.708803,0.009909,0.399633,0.186258,0.185511,0.647989,0.582243,0.895017,0.888092,...,0.871477,0.296021,0.931071,0.930621,0.889687,0.943770,0.928346,1.000000,0.743133,0.891594
VFLO,0.693933,0.670596,-0.023633,0.790116,0.478748,0.477113,0.851160,0.836176,0.866050,0.866166,...,0.955480,-0.233617,0.803471,0.803093,0.842084,0.764133,0.725199,0.743133,1.000000,0.938275


In [8]:
# Persist both matrices to their configured pickle paths, covariance first.
for matrix, target_path in (
    (covariance_matrix, PICKLE_PATH_COVARIANCE),
    (correlation_matrix, PICKLE_PATH_CORRELATION),
):
    matrix.to_pickle(target_path)

In [9]:
# Summarise both matrices (index/column ranges, dtypes, memory usage).
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" after each summary.
covariance_matrix.info()
correlation_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1391 entries, AAPL to IWX
Columns: 1391 entries, AAPL to IWX
dtypes: float64(1391)
memory usage: 14.8+ MB


None

<class 'pandas.core.frame.DataFrame'>
Index: 1391 entries, AAPL to IWX
Columns: 1391 entries, AAPL to IWX
dtypes: float64(1391)
memory usage: 14.8+ MB


None