In [2]:
import sys
from pathlib import Path

# Notebook cell: enable IPython's autoreload extension. Mode 2 reloads
# imported modules before each cell executes, so edits to project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Resolve the project root: when the kernel's working directory is named
# 'notebooks', the root is its parent; otherwise the working directory itself
# is used as the root.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Make the src directory importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
SRC_DIR = str(ROOT_DIR / 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils

# Data-file locations, relative to the kernel's working directory.
# Built with pathlib rather than backslash string literals: '\d' is an invalid
# escape sequence in a normal string literal, and a hard-coded backslash
# separator only resolves on Windows. The values stay plain strings, as before.
DATA_DIR = Path('..') / 'data'

SOURCE_PATH_OHLCV = str(DATA_DIR / 'df_OHLCV_2025-03-07_clean.pkl')
SOURCE_PATH_STOCK = str(DATA_DIR / 'df_finviz_stocks_n_ratios.pkl')
SOURCE_PATH_ETF = str(DATA_DIR / 'df_finviz_etfs_n_ratios.pkl')
PICKLE_PATH = str(DATA_DIR / 'df_correlation_matrix.pkl')

Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\python310.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src']


In [3]:
import pandas as pd

# Read the three pickled frames in one pass, in the same order as the
# target names on the left-hand side.
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source.
df_ohlcv, df_stocks, df_etfs = (
    pd.read_pickle(pickle_file)
    for pickle_file in (SOURCE_PATH_OHLCV, SOURCE_PATH_STOCK, SOURCE_PATH_ETF)
)

# Report the (rows, columns) shape of each loaded frame, one per line.
print(
    f"Load df_ohlcv shape: {df_ohlcv.shape}",
    f"Loaded df_stocks shape: {df_stocks.shape}",
    f"Loaded df_etfs shape: {df_etfs.shape}",
    sep="\n",
)

Load df_ohlcv shape: (620750, 9)
Loaded df_stocks shape: (1000, 40)
Loaded df_etfs shape: (420, 31)


In [4]:
# Union of the index labels of both frames, de-duplicated through sets.
# The result is sorted (by string form, so mixed label types cannot raise) to
# give the same list order on every kernel run; the order of a bare
# list(set(...)) depends on per-process string hashing.
symbols = sorted(set(df_stocks.index) | set(df_etfs.index), key=str)
print(f"Total unique symbols: {len(symbols)}")

Total unique symbols: 1420


In [5]:
# Take the 'Adj Close' column and move index level 0 into the columns,
# producing one column per level-0 label.
df_close = df_ohlcv.loc[:, 'Adj Close'].unstack(level=0)
print(f"Shape of df_close: {df_close.shape}")

# Preview the first five rows only.
display(df_close.head(n=5))

Shape of df_close: (250, 2483)


Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,BIZD,FPEI,FLQL,NVDY,FENY,SIVR,BSVO,NULG,AVSC,FCOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-07,239.07,112.69,393.31,199.25,175.75,173.86,625.66,262.67,495.52,743535.0,...,17.12,18.81,58.35,16.56,23.86,31.01,19.86,81.16,50.13,59.44
2025-03-06,235.33,110.57,396.89,200.7,174.21,172.35,627.93,263.45,497.84,747110.0,...,16.73,18.78,58.04,16.3,23.5,31.1,19.79,81.39,49.87,59.13
2025-03-05,235.74,117.3,401.02,208.36,174.99,173.02,656.47,279.1,498.5,746940.0,...,17.0,18.83,59.1,17.19,23.46,31.22,19.89,83.54,50.24,60.3
2025-03-04,235.93,115.99,388.61,203.8,172.61,170.92,640.0,272.04,495.86,742800.0,...,17.12,18.82,58.45,16.95,23.81,30.4,19.83,82.44,49.85,59.44
2025-03-03,238.03,114.06,388.49,205.02,168.66,167.01,655.05,284.65,510.08,765160.0,...,17.47,18.82,59.22,16.67,24.03,30.11,20.2,82.8,50.62,59.95


In [6]:
# Keep only the df_close columns whose label also appears in `symbols`.
common_symbols = df_close.columns.intersection(symbols)
df_filtered = df_close[common_symbols]

# Entries of `symbols` that have no column in df_filtered. Sorted (by string
# form) so the printed list is stable from run to run; raw set iteration order
# is not.
missing_symbols = sorted(set(symbols) - set(df_filtered.columns), key=str)

# Report the missing symbols (if any)
if missing_symbols:
    print(f"{len(missing_symbols)} symbols not found in df_filtered:")
    print(missing_symbols)
else:
    print("All symbols in your list are present in df_filtered.")

print(f"\nShape of filtered df_close: {df_filtered.shape}")
display(df_filtered)

26 symbols not found in df_filtered:
['ADM', 'LINE', 'SAIL', 'TLN', 'VG', 'ULS', 'TTAN', 'WRB', 'RBRK', 'STRK', 'VIK', 'FER', 'SW', 'JGLO', 'TEM', 'ALAB', 'GEV', 'FNGU', 'LTM', 'ZK', 'IUSG', 'SOLV', 'BTC', 'RDDT', 'FNGA', 'SFD']

Shape of filtered df_close: (250, 1394)


Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,JAVA,EPI,BSCR,IBDS,JPIE,HYMB,CGMU,USRT,VFLO,IWX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-07,239.07,112.69,393.31,199.25,175.75,173.86,625.66,262.67,495.52,743535.0,...,64.13,41.67,19.57,24.08,45.95,25.68,27.16,58.80,34.81,82.42
2025-03-06,235.33,110.57,396.89,200.70,174.21,172.35,627.93,263.45,497.84,747110.0,...,63.88,41.39,19.58,24.11,45.94,25.72,27.13,58.51,34.50,81.99
2025-03-05,235.74,117.30,401.02,208.36,174.99,173.02,656.47,279.10,498.50,746940.0,...,64.44,41.30,19.59,24.09,45.98,25.70,27.18,60.15,34.58,82.68
2025-03-04,235.93,115.99,388.61,203.80,172.61,170.92,640.00,272.04,495.86,742800.0,...,63.81,40.21,19.61,24.11,46.01,25.75,27.20,59.49,34.27,82.13
2025-03-03,238.03,114.06,388.49,205.02,168.66,167.01,655.05,284.65,510.08,765160.0,...,65.32,40.08,19.59,24.11,45.99,25.94,27.26,60.29,34.64,83.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-14,172.19,87.92,422.00,178.75,143.82,142.58,490.48,162.50,406.73,612500.0,...,57.37,42.67,18.43,22.68,42.64,24.54,26.33,51.43,30.72,72.99
2024-03-13,170.33,90.87,411.95,176.56,140.27,139.29,494.21,169.48,408.13,613900.0,...,57.71,42.37,18.48,22.71,42.74,24.60,26.39,52.20,30.87,73.38
2024-03-12,172.42,91.89,412.13,175.39,139.12,138.00,498.38,177.54,404.98,609710.0,...,57.65,43.87,18.49,22.73,42.73,24.54,26.38,52.50,30.61,73.24
2024-03-11,171.94,85.75,401.45,171.96,138.44,137.17,482.26,177.77,404.76,607756.0,...,57.54,44.13,18.51,22.76,42.78,24.56,26.38,52.56,30.42,73.01


In [7]:
# Pairwise Pearson correlation between every pair of columns in df_filtered.
# NOTE(review): df_filtered is derived from the 'Adj Close' column, so this
# correlates price levels rather than returns - confirm that is intended.
correlation_matrix = df_filtered.corr(method='pearson')

# Show the resulting matrix (pandas abbreviates large frames when rendering)
display(correlation_matrix)

Symbol,AAPL,NVDA,MSFT,AMZN,GOOG,GOOGL,META,TSLA,BRK-B,BRK-A,...,JAVA,EPI,BSCR,IBDS,JPIE,HYMB,CGMU,USRT,VFLO,IWX
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAPL,1.000000,0.865211,0.285015,0.625198,0.617776,0.617021,0.704202,0.774687,0.796218,0.786532,...,0.807567,0.265630,0.893160,0.893760,0.904572,0.868082,0.856385,0.813389,0.696280,0.794393
NVDA,0.865211,1.000000,0.397209,0.599437,0.617008,0.617164,0.632011,0.702289,0.652391,0.642806,...,0.785788,0.258296,0.739450,0.741677,0.758833,0.735108,0.673766,0.711723,0.677029,0.741286
MSFT,0.285015,0.397209,1.000000,0.286205,0.450420,0.456174,0.015666,0.261834,-0.129055,-0.134927,...,0.058873,0.396537,-0.005601,-0.006358,0.010834,0.035303,0.003453,0.019319,-0.008725,-0.022504
AMZN,0.625198,0.599437,0.286205,1.000000,0.796852,0.797941,0.832422,0.906717,0.569965,0.569578,...,0.726951,-0.351801,0.587875,0.587473,0.675452,0.503809,0.468954,0.401967,0.792396,0.680532
GOOG,0.617776,0.617008,0.450420,0.796852,1.000000,0.999841,0.541711,0.742231,0.315257,0.310032,...,0.482958,-0.081670,0.409732,0.412213,0.503639,0.321817,0.288031,0.193074,0.486420,0.418318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HYMB,0.868082,0.735108,0.035303,0.503809,0.321817,0.320690,0.737543,0.644934,0.900310,0.892436,...,0.873502,0.210867,0.958053,0.957832,0.936670,1.000000,0.984041,0.944220,0.764925,0.893886
CGMU,0.856385,0.673766,0.003453,0.468954,0.288031,0.286701,0.722494,0.621624,0.890277,0.882221,...,0.828467,0.215766,0.955374,0.954831,0.929897,0.984041,1.000000,0.931876,0.729880,0.859495
USRT,0.813389,0.711723,0.019319,0.401967,0.193074,0.192367,0.648267,0.584943,0.898196,0.891321,...,0.871524,0.304297,0.932673,0.932250,0.891458,0.944220,0.931876,1.000000,0.743637,0.891519
VFLO,0.696280,0.677029,-0.008725,0.792396,0.486420,0.484930,0.850610,0.839705,0.868264,0.868359,...,0.956252,-0.224811,0.804598,0.804253,0.843486,0.764925,0.729880,0.743637,1.000000,0.938551


In [9]:
# Persist the correlation matrix as a pickle file at PICKLE_PATH.
correlation_matrix.to_pickle(path=PICKLE_PATH)