In [1]:
# process_files.py
# Run settings are imported from the project's `config` module. Of the three
# names pulled in here, this cell itself only uses `date_str`.
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Echo the configured date string so the run can be sanity-checked at a glance.
print(f"date_str: {date_str}")

date_str: 2025-04-02


In [2]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils

# Data-file locations, relative to the notebook's working directory.
# They are built with pathlib rather than hand-written '..\data\...' literals:
# '\d' is not a valid string escape (Python emits a warning for it) and a
# literal backslash separator only resolves on Windows. str() keeps the
# constants plain strings, so on Windows their values are unchanged.
DATA_DIR = Path('..') / 'data'

SOURCE_PATH_OHLCV = str(DATA_DIR / f'df_OHLCV_{date_str}_clean.parquet')
SOURCE_PATH = str(DATA_DIR / 'df_finviz_n_ratios_merged.parquet')
PARQUET_PATH_COVARIANCE = str(DATA_DIR / 'df_cov_emv_matrix.parquet')
PARQUET_PATH_CORRELATION = str(DATA_DIR / 'df_corr_emv_matrix.parquet')

Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\python311.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\Lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src']


In [3]:
import pandas as pd

# Load both parquet inputs in one pass: OHLCV prices first, then the finviz
# table, using the path constants defined earlier in the notebook.
df_ohlcv, df_finviz = (
    pd.read_parquet(parquet_path)
    for parquet_path in (SOURCE_PATH_OHLCV, SOURCE_PATH)
)

# Report the dimensions of each loaded frame, one line per frame.
print(
    f"Loaded df_ohlcv shape: {df_ohlcv.shape}",
    f"Loaded df_finviz shape: {df_finviz.shape}",
    sep="\n",
)


Loaded df_ohlcv shape: (385536, 9)
Loaded df_finviz shape: (1536, 54)


In [4]:
# Unique tickers taken from the index of df_finviz only (no other frame is
# consulted here). Index.unique() keeps first-appearance order, so the list is
# the same on every run, unlike iterating a set of strings.
tickers = df_finviz.index.unique().tolist()
print(f"Total unique tickers: {len(tickers)}")

Total unique tickers: 1536


In [5]:
# Pivot the 'Adj Close' column to wide form: level 0 of the row index is moved
# into the columns.
adj_close_wide = df_ohlcv['Adj Close'].unstack(level=0)

# Restrict the wide table to the columns that also appear in `tickers`.
common_tickers = adj_close_wide.columns.intersection(tickers)
df_close = adj_close_wide[common_tickers]

# Entries of `tickers` that have no column in the filtered table.
missing_tickers = list(set(tickers) - set(df_close.columns))

# Report coverage: either everything was found, or list what is absent.
if not missing_tickers:
    print("All tickers in your list are present in df_close.")
else:
    print(f"{len(missing_tickers)} tickers not found in df_close:")
    print(missing_tickers)

print(f"\nShape of filtered df_close: {df_close.shape}")
display(df_close.head())

All tickers in your list are present in df_close.

Shape of filtered df_close: (251, 1536)


Symbol,A,AA,AAL,AAON,AAPL,ABBV,ABEV,ABNB,ABT,ACGL,...,YUMC,Z,ZBH,ZBRA,ZG,ZION,ZM,ZS,ZTO,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-03,142.67,36.76,14.16,85.97,168.85,171.1,2.427,159.34,109.06,87.7,...,39.08,46.74,128.87,292.75,45.85,41.13,63.22,186.31,20.59,161.25
2024-04-04,140.3,35.55,13.83,85.33,168.03,162.0,2.427,158.84,107.91,86.62,...,39.07,46.82,126.44,292.96,45.83,40.35,63.28,182.01,20.34,163.77
2024-04-05,143.06,36.08,13.76,89.58,168.78,164.02,2.3873,161.77,108.98,90.24,...,38.31,47.22,126.21,290.38,46.23,40.7,62.94,183.34,19.95,164.11
2024-04-08,143.39,35.93,13.9,92.53,167.66,163.83,2.3873,160.27,108.31,89.91,...,38.01,47.7,125.58,296.87,46.61,41.58,63.01,183.71,19.8,163.94
2024-04-09,146.31,36.41,13.95,89.24,168.87,164.16,2.427,162.66,110.19,86.74,...,38.78,47.99,128.26,306.17,47.19,42.09,63.85,188.5,20.15,165.38


In [6]:
# Order the rows by index so each change is measured against the preceding row.
df_close_sorted = df_close.sort_index()

# Row-over-row percentage change, then drop incomplete rows.
# NOTE(review): dropna() with its defaults removes every row containing at
# least one NaN, so a single ticker with a missing return drops that whole
# date for all tickers. Confirm this is intended.
pct_changes = df_close_sorted.pct_change()
df_close_returns = pct_changes.dropna()

# Show shape and first rows of each frame: sorted prices first, then returns.
for shape_message, frame in (
    (f'df_close_sorted shape: {df_close_sorted.shape}', df_close_sorted),
    (f'df_close_returns shape: {df_close_returns.shape}', df_close_returns),
):
    print(shape_message)
    display(frame.head())


df_close_sorted shape: (251, 1536)


Symbol,A,AA,AAL,AAON,AAPL,ABBV,ABEV,ABNB,ABT,ACGL,...,YUMC,Z,ZBH,ZBRA,ZG,ZION,ZM,ZS,ZTO,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-03,142.67,36.76,14.16,85.97,168.85,171.1,2.427,159.34,109.06,87.7,...,39.08,46.74,128.87,292.75,45.85,41.13,63.22,186.31,20.59,161.25
2024-04-04,140.3,35.55,13.83,85.33,168.03,162.0,2.427,158.84,107.91,86.62,...,39.07,46.82,126.44,292.96,45.83,40.35,63.28,182.01,20.34,163.77
2024-04-05,143.06,36.08,13.76,89.58,168.78,164.02,2.3873,161.77,108.98,90.24,...,38.31,47.22,126.21,290.38,46.23,40.7,62.94,183.34,19.95,164.11
2024-04-08,143.39,35.93,13.9,92.53,167.66,163.83,2.3873,160.27,108.31,89.91,...,38.01,47.7,125.58,296.87,46.61,41.58,63.01,183.71,19.8,163.94
2024-04-09,146.31,36.41,13.95,89.24,168.87,164.16,2.427,162.66,110.19,86.74,...,38.78,47.99,128.26,306.17,47.19,42.09,63.85,188.5,20.15,165.38


df_close_returns shape: (250, 1536)


Symbol,A,AA,AAL,AAON,AAPL,ABBV,ABEV,ABNB,ABT,ACGL,...,YUMC,Z,ZBH,ZBRA,ZG,ZION,ZM,ZS,ZTO,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-04,-0.016612,-0.032916,-0.023305,-0.007444,-0.004856,-0.053185,0.0,-0.003138,-0.010545,-0.012315,...,-0.000256,0.001712,-0.018856,0.000717,-0.000436,-0.018964,0.000949,-0.02308,-0.012142,0.015628
2024-04-05,0.019672,0.014909,-0.005061,0.049807,0.004463,0.012469,-0.016358,0.018446,0.009916,0.041792,...,-0.019452,0.008543,-0.001819,-0.008807,0.008728,0.008674,-0.005373,0.007307,-0.019174,0.002076
2024-04-08,0.002307,-0.004157,0.010174,0.032931,-0.006636,-0.001158,0.0,-0.009272,-0.006148,-0.003657,...,-0.007831,0.010165,-0.004992,0.02235,0.00822,0.021622,0.001112,0.002018,-0.007519,-0.001036
2024-04-09,0.020364,0.013359,0.003597,-0.035556,0.007217,0.002014,0.01663,0.014912,0.017358,-0.035257,...,0.020258,0.00608,0.021341,0.031327,0.012444,0.012266,0.013331,0.026074,0.017677,0.008784
2024-04-10,-0.02194,-0.017303,-0.03871,-0.019162,-0.011133,-0.006518,-0.028595,-0.014632,-0.011253,0.010606,...,-0.013667,-0.059596,-0.012942,-0.020446,-0.059123,-0.055595,-0.025529,-0.024138,-0.002978,-0.023582


In [7]:
# Ask the project helper for both matrices in a single call; the result is
# unpacked as (covariance, correlation).
# NOTE(review): going by the helper's name these are exponentially weighted
# estimates with span=21. Confirm the exact definition in utils.
cov_emv_matrix, corr_emv_matrix = utils.get_cov_corr_ewm_matrices_chunked(
    df_close_returns,
    span=21,
    return_cov=True,
    return_corr=True,
)

# Show shape and first rows of each matrix: covariance first, then correlation.
for shape_message, matrix in (
    (f'cov_emv_matrix shape: {cov_emv_matrix.shape}', cov_emv_matrix),
    (f'corr_emv_matrix shape: {corr_emv_matrix.shape}', corr_emv_matrix),
):
    print(shape_message)
    display(matrix.head())

cov_emv_matrix shape: (1536, 1536)


Symbol,A,AA,AAL,AAON,AAPL,ABBV,ABEV,ABNB,ABT,ACGL,...,YUMC,Z,ZBH,ZBRA,ZG,ZION,ZM,ZS,ZTO,ZTS
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,0.00022,0.000189,0.000277,0.00033,0.000105,3e-05,3.6e-05,0.000119,1.5e-05,6.5e-05,...,7.8e-05,0.000157,4e-05,0.000168,0.000167,0.000156,0.000127,0.00015,4.8e-05,7.6e-05
AA,0.000189,0.000668,0.000316,0.00045,0.000116,-7.4e-05,0.000121,0.000257,-0.000123,8.4e-05,...,0.000148,0.000294,-6.9e-05,0.000188,0.000312,0.000269,0.000211,0.000348,2.7e-05,-1.5e-05
AAL,0.000277,0.000316,0.000785,0.000516,0.000194,2.1e-05,5.7e-05,0.000406,8.1e-05,0.000151,...,0.000158,0.000365,7.6e-05,0.000352,0.000384,0.00028,0.000243,0.00025,0.000114,0.000144
AAON,0.00033,0.00045,0.000516,0.001244,0.000192,-0.000112,-2.5e-05,0.000369,-6.5e-05,5.2e-05,...,9.4e-05,0.000503,-2.7e-05,0.000348,0.000524,0.000292,0.000297,0.00049,1.7e-05,-3.9e-05
AAPL,0.000105,0.000116,0.000194,0.000192,0.000243,-1.9e-05,4.8e-05,0.000182,5.9e-05,7.4e-05,...,6.9e-05,0.000177,1.2e-05,0.00012,0.000181,0.000139,0.000101,0.00013,6e-05,6.3e-05


corr_emv_matrix shape: (1536, 1536)


Symbol,A,AA,AAL,AAON,AAPL,ABBV,ABEV,ABNB,ABT,ACGL,...,YUMC,Z,ZBH,ZBRA,ZG,ZION,ZM,ZS,ZTO,ZTS
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,1.0,0.493081,0.665815,0.63086,0.452365,0.154622,0.166585,0.344348,0.073027,0.353852,...,0.309734,0.494752,0.219803,0.673706,0.497851,0.65254,0.536152,0.419098,0.153362,0.383545
AA,0.493081,1.0,0.436009,0.494166,0.288046,-0.216593,0.317702,0.426003,-0.345658,0.265227,...,0.339217,0.530522,-0.216489,0.431863,0.534262,0.647237,0.512298,0.556286,0.049969,-0.042511
AAL,0.665815,0.436009,1.0,0.522345,0.443506,0.057435,0.137838,0.621501,0.210558,0.438751,...,0.333647,0.608519,0.220662,0.747802,0.605989,0.621101,0.543365,0.368863,0.192371,0.386469
AAON,0.63086,0.494166,0.522345,1.0,0.348452,-0.238707,-0.048658,0.448916,-0.134429,0.118814,...,0.157526,0.66635,-0.061595,0.586416,0.656429,0.514981,0.52923,0.573972,0.022187,-0.083386
AAPL,0.452365,0.288046,0.443506,0.348452,1.0,-0.093006,0.210075,0.501629,0.274899,0.386625,...,0.261833,0.529264,0.060826,0.4584,0.513509,0.553219,0.405768,0.343752,0.182615,0.302639


In [8]:
# Persist both matrices to parquet and print a structural summary of each.
#
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly here. Wrapping it in display() only rendered that None
# return value as a stray "None" line after each summary.
cov_emv_matrix.to_parquet(PARQUET_PATH_COVARIANCE)
print(f'cov_emv_matrix parqueted to {PARQUET_PATH_COVARIANCE}')
cov_emv_matrix.info()

corr_emv_matrix.to_parquet(PARQUET_PATH_CORRELATION)
print(f'corr_emv_matrix parqueted to {PARQUET_PATH_CORRELATION}')
corr_emv_matrix.info()

cov_emv_matrix parqueted to ..\data\df_cov_emv_matrix.parquet
<class 'pandas.core.frame.DataFrame'>
Index: 1536 entries, A to ZTS
Columns: 1536 entries, A to ZTS
dtypes: float64(1536)
memory usage: 18.0+ MB


None

corr_emv_matrix parqueted to ..\data\df_corr_emv_matrix.parquet
<class 'pandas.core.frame.DataFrame'>
Index: 1536 entries, A to ZTS
Columns: 1536 entries, A to ZTS
dtypes: float64(1536)
memory usage: 18.0+ MB


None