In [27]:
# import sys
# from pathlib import Path

# # Notebook cell
# %load_ext autoreload
# %autoreload 2

# # Get root directory (assuming notebook is in root/notebooks/)
# NOTEBOOK_DIR = Path.cwd()
# ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# # Add src directory to Python path
# sys.path.append(str(ROOT_DIR / 'src'))

# # Verify path
# print(f"Python will look in these locations:\n{sys.path}")


# # --- Execute the processor ---
# import utils


# SOURCE_PATH, DEST_PATH = utils.main_processor(
#     data_dir='..\data',  # search project ..\data    
#     downloads_dir=None,  # None searches the Downloads dir, '' omits the search
#     downloads_limit=50,  # search the first 50 files
#     clean_name_override=None,  # override filename
#     start_file_pattern='df_OHLCV_', # search for files starting with 'df_OHLCV_'
# )



In [28]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Echo the settings imported from config so the run documents its own inputs.
print(
    f"date_str: {date_str}",
    f"DOWNLOAD_DIR: {DOWNLOAD_DIR}",
    f"DEST_DIR: {DEST_DIR}\n",
    sep="\n",
)

# The raw pickle is expected in the download directory; the cleaned copy is
# written to the destination directory under the same dated name plus "_clean".
source_path = Path(DOWNLOAD_DIR).joinpath(f'df_OHLCV_{date_str}.pkl')
dest_path = Path(DEST_DIR).joinpath(f'df_OHLCV_{date_str}_clean.pkl')

print(f"source_path: {source_path}", f"dest_path: {dest_path}", sep="\n")

date_str: 2025-03-14
DOWNLOAD_DIR: C:\Users\ping\Downloads
DEST_DIR: ..\data

source_path: C:\Users\ping\Downloads\df_OHLCV_2025-03-14.pkl
dest_path: ..\data\df_OHLCV_2025-03-14_clean.pkl


In [29]:
# --- Data Loading & Initial Inspection ---
import pandas as pd

# Read the raw frame from the pickle located at source_path.
# NOTE(review): unpickling can execute code embedded in the file, so only load
# pickles that come from a source you trust.
df = pd.read_pickle(source_path)

# Preview the first five rows, then summarise the index, dtypes and
# non-null counts.
print("[Raw Data Overview]")
display(df.head(n=5))
df.info()

[Raw Data Overview]


Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Adj Open,Adj High,Adj Low
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
UBS,2025-03-14,32.25,32.85,32.19,32.73,32.73,7312500,32.25,32.85,32.19
UBS,2025-03-13,31.85,31.9,31.56,31.71,31.71,1876700,31.85,31.9,31.56
UBS,2025-03-12,31.96,32.03,31.64,31.94,31.94,3057200,31.96,32.03,31.64
UBS,2025-03-11,31.68,31.81,31.02,31.38,31.38,6007500,31.68,31.81,31.02
UBS,2025-03-10,32.74,32.86,31.65,31.88,31.88,7367000,32.74,32.86,31.65


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 348198 entries, ('UBS', Timestamp('2025-03-14 00:00:00')) to ('SARO', Timestamp('2024-10-02 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       348198 non-null  float64
 1   High       348198 non-null  float64
 2   Low        348198 non-null  float64
 3   Close      348198 non-null  float64
 4   Adj Close  348198 non-null  float64
 5   Volume     348168 non-null  Int64  
 6   Adj Open   348198 non-null  float64
 7   Adj High   348198 non-null  float64
 8   Adj Low    348198 non-null  float64
dtypes: Int64(1), float64(8)
memory usage: 25.6+ MB


In [30]:
import sys
from pathlib import Path

# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2 reloads all modules
# before code is executed). Re-running this cell is harmless: IPython only
# prints a notice that the extension is already loaded.
%load_ext autoreload
%autoreload 2

# Resolve the project root: when the notebook is started from a directory
# named "notebooks", the root is its parent; otherwise the current working
# directory itself is treated as the root.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Make <root>/src importable. The membership check keeps this cell idempotent:
# without it, every re-run appends another duplicate entry to sys.path.
SRC_DIR = str(ROOT_DIR / 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


# --- Data Filtering & Cleaning ---
import utils  # custom project utilities

# Step 1: restrict the frame to the dates of the reference symbol (SPY).
# Note that this rebinds `df`, so the frame loaded earlier is replaced by the
# date-filtered one from here on.
df = utils.filter_df_dates_to_reference_symbol(
    df=df,
    reference_symbol='SPY',
)

# Step 2: split off the symbols reported as having missing values; the helper
# returns the cleaned frame together with the collection of removed symbols.
df_clean, missing_symbols = utils.filter_symbols_with_missing_values(df)

# Report what was removed, then show the structure of the cleaned frame.
print(
    "\n[Cleaning Report]",
    f"Removed {len(missing_symbols)} symbols with missing data: {missing_symbols}",
    "\n[Cleaned Data Structure]",
    sep="\n",
)
df_clean.info()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Original number of symbols: 1399
Number of symbols after filtering: 1380
Number of symbols filtered out: 19

First 10 symbols that were filtered out:
['ZK', 'SAIL', 'LTM', 'TTAN', 'TEM', 'SOLV', 'BTC', 'ULS', 'STRK', 'SW']

Example of dates for first filtered out symbol:

Dates for ZK:
DatetimeIndex(['2025-03-14', '2025-03-13', '2025-03-12', '2025-03-11',
               '2025-03-10', '2025-03-07', '2025-03-06', '2025-03-05',
               '2025-03-04', '2025-03-03',
               ...
               '2024-05-23', '2024-05-22', '2024-05-21', '2024-05-20',
               '2024-05-17', '2024-05-16', '2024-05-15', '2024-05-14',
               '2024-05-13', '2024-05-10'],
              dtype='datetime64[ns]', name='Date', length=211, freq=None)

Filtered DataFrame info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 345000 entries, ('UBS', Timestamp('2025-03-14 00:00:00')) to ('PCVX', Timestamp('2024

In [31]:
# --- Save Cleaned Data ---
# Persist the cleaned frame as a pickle at dest_path, then confirm where the
# file was written.
df_clean.to_pickle(path=dest_path)
print(f"\n[Save Successful] Cleaned data saved to:\n{dest_path}")


[Save Successful] Cleaned data saved to:
..\data\df_OHLCV_2025-03-14_clean.pkl
