In [1]:
import pandas as pd

# Pandas display options for inspecting wide OHLCV frames.
# Each option is set exactly once so the effective value is unambiguous.
pd.set_option('display.max_columns', None)  # None = never elide columns
# Fixed, generous line width: wide rows stay on a single line instead of wrapping.
# (Setting this to None would ask pandas to auto-detect the terminal width.)
pd.set_option('display.width', 1000)

In [2]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Echo the values pulled from config; the trailing "\n" on the last entry
# leaves a blank line before the path report below.
print(
    f"date_str: {date_str}",
    f"DOWNLOAD_DIR: {DOWNLOAD_DIR}",
    f"DEST_DIR: {DEST_DIR}\n",
    sep="\n",
)

# Input file lives in the download directory; the output file goes to the
# destination directory. Both names are keyed by date_str.
source_path = Path(DOWNLOAD_DIR, f'df_OHLCV_{date_str}.parquet')
dest_path = Path(DEST_DIR, f'{date_str}_df_OHLCV_clean.parquet')

print(f"source_path: {source_path}", f"dest_path: {dest_path}", sep="\n")

date_str: 2025-04-08
DOWNLOAD_DIR: C:\Users\ping\Downloads
DEST_DIR: ..\data

source_path: C:\Users\ping\Downloads\df_OHLCV_2025-04-08.parquet
dest_path: ..\data\2025-04-08_df_OHLCV_clean.parquet


In [3]:
# --- Data Loading & Initial Inspection ---
import pandas as pd

# Read the raw OHLCV file from source_path using the pyarrow parquet engine.
df = pd.read_parquet(source_path, engine="pyarrow")

# First look at the raw frame: a banner, the leading rows (rich display),
# then the column dtypes and non-null counts.
print("[Raw Data Overview]")
display(df.head())
df.info()

[Raw Data Overview]


Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Adj Open,Adj High,Adj Low
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ILMN,2025-04-08,75.76,75.54,69.38,70.3,70.3,3212285,75.76,75.54,69.38
ILMN,2025-04-07,70.61,75.83,69.55,73.45,73.45,5170600,70.61,75.83,69.55
ILMN,2025-04-04,75.4,75.43,70.64,74.16,74.16,3913400,75.4,75.43,70.64
ILMN,2025-04-03,79.51,79.97,76.26,76.42,76.42,2117600,79.51,79.97,76.26
ILMN,2025-04-02,78.27,82.76,77.99,81.88,81.88,1908300,78.27,82.76,77.99


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 389125 entries, ('ILMN', Timestamp('2025-04-08 00:00:00')) to ('HAL', Timestamp('2024-04-09 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       389125 non-null  float64
 1   High       389125 non-null  float64
 2   Low        389125 non-null  float64
 3   Close      389125 non-null  float64
 4   Adj Close  389125 non-null  float64
 5   Volume     389107 non-null  Int64  
 6   Adj Open   389125 non-null  float64
 7   Adj High   389125 non-null  float64
 8   Adj Low    389125 non-null  float64
dtypes: Int64(1), float64(8)
memory usage: 28.6+ MB


In [4]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))


# --- Data Filtering & Cleaning ---
import utils  # Custom utility functions (imported here because src/ must be on sys.path first)

# 1. Align dates across all symbols using SPY as reference.
#    The result gets its own name so `df` keeps holding the raw frame; this
#    cell then starts from the same input every time it is run.
df_aligned = utils.filter_df_dates_to_reference_symbol(df=df, reference_symbol='SPY')

# 2. Remove symbols with missing data points
df_clean, missing_symbols = utils.filter_symbols_with_missing_values(df_aligned)

# Display cleaning results
print("\n[Cleaning Report]")
print(f"Removed {len(missing_symbols)} symbols with missing data: {missing_symbols}")
print("\n[Cleaned Data Structure]")
df_clean.info()

Original number of symbols: 1550
Number of symbols after filtering: 1522
Number of symbols filtered out: 28

First 10 symbols that were filtered out:
['DOCU', 'WAY', 'ULS', 'FLR', 'TTAN', 'DG', 'LINE', 'SARO', 'JHX', 'STRK']

Example of dates for first filtered out symbol:

Dates for DOCU:
DatetimeIndex(['2025-04-08', '2025-04-07', '2025-04-04', '2025-04-03', '2025-04-02', '2025-04-01', '2025-03-31', '2025-03-28', '2025-03-27', '2025-03-26',
               ...
               '2024-04-22', '2024-04-19', '2024-04-18', '2024-04-17', '2024-04-16', '2024-04-15', '2024-04-12', '2024-04-11', '2024-04-10', '2024-04-09'], dtype='datetime64[ns]', name='Date', length=502, freq=None)

Filtered DataFrame info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 382022 entries, ('ILMN', Timestamp('2025-04-08 00:00:00')) to ('RF', Timestamp('2024-04-09 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       382022 non


[Cleaning Report]
Removed 2 symbols with missing data: ['FSEC', 'FER']

[Cleaned Data Structure]
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 381520 entries, ('ILMN', Timestamp('2025-04-08 00:00:00')) to ('RF', Timestamp('2024-04-09 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       381520 non-null  float64
 1   High       381520 non-null  float64
 2   Low        381520 non-null  float64
 3   Close      381520 non-null  float64
 4   Adj Close  381520 non-null  float64
 5   Volume     381520 non-null  Int64  
 6   Adj Open   381520 non-null  float64
 7   Adj High   381520 non-null  float64
 8   Adj Low    381520 non-null  float64
dtypes: Int64(1), float64(8)
memory usage: 28.1+ MB


In [5]:
# --- Save Cleaned Data ---
# Create the destination folder when it is missing, so the save cannot fail on
# a non-existent directory after the cleaning work has already been done.
dest_path.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame as parquet (pyarrow engine, zstd compression).
df_clean.to_parquet(dest_path, engine='pyarrow', compression='zstd')
print(f"\n[Save Successful] Cleaned data saved to:\n{dest_path}")




[Save Successful] Cleaned data saved to:
..\data\2025-04-08_df_OHLCV_clean.parquet
