In [1]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils


SOURCE_PATH, DEST_PATH = utils.main_processor(
    data_dir='..\data',  # search project ..\data
    downloads_dir=None,  # None searchs Downloads dir, '' omits search
    downloads_limit=10,  # search the first 10 files
    clean_name_override=None,  # override filename
    start_file_pattern='df_', # search for files starting with 'df_'
)



Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\python310.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.10.5', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py310\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py310\\stocks\\src']


<span style='color:#00ffff;font-weight:500'>[Downloads] Scanned latest 10 files • Found 6 'df_' matches</span>

**Available 'df_' files:**

- (1) `[DOWNLOADS]` `df_OHLCV_2025-03-07.pkl` <span style='color:#00ffff'>(46.41 MB, 2025-03-07 16:13)</span>

- (2) `[DOWNLOADS]` `df_OHLCV_2025-03-06.pkl` <span style='color:#00ffff'>(46.37 MB, 2025-03-06 16:34)</span>

- (3) `[DOWNLOADS]` `df_OHLCV_2025-03-05.pkl` <span style='color:#00ffff'>(46.40 MB, 2025-03-05 16:32)</span>

- (4) `[DOWNLOADS]` `df_OHLCV_2025-03-04.pkl` <span style='color:#00ffff'>(46.42 MB, 2025-03-04 16:45)</span>

- (5) `[DOWNLOADS]` `df_OHLCV_2025-03-03.pkl` <span style='color:#00ffff'>(46.30 MB, 2025-03-03 19:14)</span>

- (6) `[DOWNLOADS]` `df_OHLCV_2025-02-28.pkl` <span style='color:#00ffff'>(46.35 MB, 2025-02-28 16:48)</span>


Input a number to select file (1-6)



    **Selected paths:**
    - Source: `C:\Users\ping\Downloads\df_OHLCV_2025-03-07.pkl`  
    - Destination: `..\data\df_OHLCV_2025-03-07_clean.pkl`
    

In [2]:
# --- Load the selected file and take a first look ---
import pandas as pd

# Read the raw frame from the pickle chosen in the first cell.
# CAUTION: unpickling can execute arbitrary code -- only load files from a
# source you trust.
df = pd.read_pickle(SOURCE_PATH)

# Show a header line, the first five rows (rich display), then the
# column / dtype / non-null summary.
print("[Raw Data Overview]")
display(df.head(n=5))
df.info()

[Raw Data Overview]


Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Adj Open,Adj High,Adj Low
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,2025-03-07,235.1,241.37,234.76,239.07,239.07,44099046,235.1,241.37,234.76
AAPL,2025-03-06,234.44,237.86,233.16,235.33,235.33,45170400,234.44,237.86,233.16
AAPL,2025-03-05,235.42,236.55,229.23,235.74,235.74,47227600,235.42,236.55,229.23
AAPL,2025-03-04,237.71,240.07,234.68,235.93,235.93,53798100,237.71,240.07,234.68
AAPL,2025-03-03,241.79,244.03,236.11,238.03,238.03,47184000,241.79,244.03,236.11


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 631716 entries, ('AAPL', Timestamp('2025-03-07 00:00:00')) to ('IBTE', Timestamp('2024-03-08 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       631716 non-null  float64
 1   High       631716 non-null  float64
 2   Low        631716 non-null  float64
 3   Close      631716 non-null  float64
 4   Adj Close  631716 non-null  float64
 5   Volume     631516 non-null  Int64  
 6   Adj Open   631716 non-null  float64
 7   Adj High   631716 non-null  float64
 8   Adj Low    631716 non-null  float64
dtypes: Int64(1), float64(8)
memory usage: 46.4+ MB


In [3]:
# --- Filter and clean the loaded data ---
import utils  # project helper module

# Step 1: align every symbol's dates to those of the reference symbol (SPY).
# Note that this rebinds `df`, so the unfiltered frame is no longer available
# afterwards.
df = utils.filter_df_dates_to_reference_symbol(reference_symbol='SPY', df=df)

# Step 2: drop symbols with missing data points; the helper hands back the
# cleaned frame together with the symbols it removed.
(df_clean, missing_symbols) = utils.filter_symbols_with_missing_values(df)

# Report what was removed, then summarise the cleaned frame.
print(
    "\n[Cleaning Report]",
    f"Removed {len(missing_symbols)} symbols with missing data: {missing_symbols}",
    "\n[Cleaned Data Structure]",
    sep="\n",
)
df_clean.info()

Original number of symbols: 2551
Number of symbols after filtering: 2492
Number of symbols filtered out: 59

First 10 symbols that were filtered out:
['WAY', 'KLMN', 'STRK', 'LB', 'OS', 'LINE', 'KRMN', 'INGM', 'RSSL', 'MRP']

Example of dates for first filtered out symbol:

Dates for WAY:
DatetimeIndex(['2025-03-07', '2025-03-06', '2025-03-05', '2025-03-04',
               '2025-03-03', '2025-02-28', '2025-02-27', '2025-02-26',
               '2025-02-25', '2025-02-24',
               ...
               '2024-06-21', '2024-06-20', '2024-06-18', '2024-06-17',
               '2024-06-14', '2024-06-13', '2024-06-12', '2024-06-11',
               '2024-06-10', '2024-06-07'],
              dtype='datetime64[ns]', name='Date', length=187, freq=None)

Filtered DataFrame info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 623000 entries, ('AAPL', Timestamp('2025-03-07 00:00:00')) to ('FCOM', Timestamp('2024-03-08 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   

In [4]:
# --- Save Cleaned Data ---
# Write the cleaned frame (df_clean, produced by the cleaning cell) to the
# destination path returned by utils.main_processor in the first cell, then
# print that path as confirmation.
# NOTE(review): an existing file at DEST_PATH is presumably overwritten without
# warning -- confirm this is intended.
df_clean.to_pickle(DEST_PATH)
print(f"\n[Save Successful] Cleaned data saved to:\n{DEST_PATH}")


[Save Successful] Cleaned data saved to:
..\data\df_OHLCV_2025-03-07_clean.pkl
