In [8]:
import pandas as pd

# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', 10)       # Limit to 10 rows for readability
pd.set_option('display.width', None)        # Let the display adjust to the window

# 2. Set the display width (optional but often helpful)
#    'None' tries to detect terminal width. 
#    A large number (e.g., 1000) ensures no wrapping unless absolutely necessary.
pd.set_option('display.width', 1000) 

In [9]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

print(f"date_str: {date_str}")
print(f"DOWNLOAD_DIR: {DOWNLOAD_DIR}")
print(f"DEST_DIR: {DEST_DIR}\n")

# Build paths
source_path = Path(DOWNLOAD_DIR) / f'df_OHLCV_{date_str}_stocks_etfs.parquet'
dest_path = Path(DEST_DIR) / f'{date_str}_df_OHLCV_clean_stocks_etfs.parquet'

print(f"source_path: {source_path}")
print(f"dest_path: {dest_path}")

date_str: 2025-04-25
DOWNLOAD_DIR: C:\Users\ping\Downloads
DEST_DIR: ..\data

source_path: C:\Users\ping\Downloads\df_OHLCV_2025-04-25_stocks_etfs.parquet
dest_path: ..\data\2025-04-25_df_OHLCV_clean_stocks_etfs.parquet


In [10]:
# --- Data Loading & Initial Inspection ---
import pandas as pd

# # Load raw data from pickle file
# df = pd.read_pickle(source_path)

df = pd.read_parquet(source_path, engine='pyarrow')

# Display initial data structure
print("[Raw Data Overview]")
display(df.head(10))
df.info()

[Raw Data Overview]


Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Adj Open,Adj High,Adj Low
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
WTFC,2025-04-25,110.09,111.35,109.29,110.79,110.79,451000,110.09,111.35,109.29
WTFC,2025-04-24,107.33,111.67,106.86,111.38,111.38,485000,107.33,111.67,106.86
WTFC,2025-04-23,109.98,112.94,106.82,107.68,107.68,763800,109.98,112.94,106.82
WTFC,2025-04-22,100.64,106.94,100.64,106.66,106.66,843100,100.64,106.94,100.64
WTFC,2025-04-21,102.2,102.9,100.37,101.43,101.43,437700,102.2,102.9,100.37
WTFC,2025-04-17,101.76,103.6,101.51,102.51,102.51,654000,101.76,103.6,101.51
WTFC,2025-04-16,101.75,103.18,99.72,101.61,101.61,509900,101.75,103.18,99.72
WTFC,2025-04-15,101.24,104.42,101.24,102.97,102.97,503000,101.24,104.42,101.24
WTFC,2025-04-14,100.14,103.61,97.68,101.0,101.0,529600,100.14,103.61,97.68
WTFC,2025-04-11,101.08,101.94,94.99,98.61,98.61,751100,101.08,101.94,94.99


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 387929 entries, ('WTFC', Timestamp('2025-04-25 00:00:00')) to ('AEG', Timestamp('2024-04-26 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       387929 non-null  float64
 1   High       387929 non-null  float64
 2   Low        387929 non-null  float64
 3   Close      387929 non-null  float64
 4   Adj Close  387929 non-null  float64
 5   Volume     387916 non-null  Int64  
 6   Adj Open   387929 non-null  float64
 7   Adj High   387929 non-null  float64
 8   Adj Low    387929 non-null  float64
dtypes: Int64(1), float64(8)
memory usage: 28.5+ MB


In [11]:
# Assuming your DataFrame is named 'df'
unique_symbols = df.index.get_level_values(0).unique().tolist()

# Print the list (optional)
print(unique_symbols)
len(unique_symbols)

['WTFC', 'CFR', 'SIRI', 'CWST', 'LW', 'DLB', 'OBDC', 'EXP', 'CVLT', 'ESAB', 'ENSG', 'PSN', 'MDGL', 'EXLS', 'RVMD', 'SKX', 'FN', 'AES', 'FOUR', 'AAON', 'HPE', 'WBD', 'ES', 'TSN', 'DOW', 'HBAN', 'NVR', 'CINF', 'VG', 'CDW', 'NTRA', 'TOST', 'INVH', 'EXPE', 'STM', 'DG', 'QSR', 'PHM', 'DVN', 'WAT', 'COR', 'HWM', 'OKE', 'DLR', 'MPLX', 'FCX', 'JCI', 'COIN', 'SNOW', 'HLT', 'CSX', 'MFC', 'AMX', 'LNG', 'CARR', 'NWG', 'PAYX', 'TRP', 'MET', 'ALL', 'NTAP', 'WY', 'HAL', 'VIK', 'OKTA', 'L', 'SSNC', 'NTNX', 'TECK', 'NTRS', 'KGC', 'GPN', 'PINS', 'STX', 'BIIB', 'ULTA', 'DLTR', 'SMMT', 'SHG', 'UMC', 'HMC', 'EXC', 'DFS', 'FAST', 'AXON', 'KR', 'ROST', 'HLN', 'GM', 'RBLX', 'D', 'AMP', 'EW', 'KVUE', 'SU', 'TGT', 'HOOD', 'KMB', 'CCI', 'NDAQ', 'LNW', 'OTEX', 'UWMC', 'AZEK', 'RRX', 'EAT', 'NFG', 'ALV', 'MIDD', 'UGI', 'APPF', 'LNTH', 'CMA', 'AXTA', 'UMBF', 'BFAM', 'WTS', 'BBIO', 'KBR', 'AN', 'IBDR', 'SUSA', 'VNQI', 'DXJ', 'UCON', 'SPYI', 'PTLC', 'NEAR', 'EWU', 'DFLV', 'IYR', 'EMLP', 'EUFN', 'JAVA', 'AIRR', 'IBDQ'

1558

In [12]:
symbol_to_check = 'AAPL'  # Example symbol to check
is_present = symbol_to_check in unique_symbols
print(f"Is '{symbol_to_check}' in the list? {is_present}")

Is 'AAPL' in the list? True


In [13]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))


# --- Data Filtering & Cleaning ---
import utils  # Custom utility functions

# 1. Align dates across all symbols using AAPL as reference
df = utils.filter_df_dates_to_reference_symbol(df=df, reference_symbol='AAPL')

# 2. Remove symbols with missing data points
df_clean, missing_symbols = utils.filter_symbols_with_missing_values(df)

# Display cleaning results
print("\n[Cleaning Report]")
print(f"Removed {len(missing_symbols)} symbols with missing data: {missing_symbols}")
print("\n[Cleaned Data Structure]")
df_clean.info()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Original number of symbols: 1558
Number of symbols after filtering: 1540
Number of symbols filtered out: 18

First 10 symbols that were filtered out:
['SARO', 'NBIS', 'LINE', 'ZG', 'VIK', 'LB', 'SAIL', 'TEM', 'SFD', 'WAY']

Example of dates for first filtered out symbol:

Dates for SARO:
DatetimeIndex(['2025-04-25', '2025-04-24', '2025-04-23', '2025-04-22', '2025-04-21', '2025-04-17', '2025-04-16', '2025-04-15', '2025-04-14', '2025-04-11',
               ...
               '2024-10-15', '2024-10-14', '2024-10-11', '2024-10-10', '2024-10-09', '2024-10-08', '2024-10-07', '2024-10-04', '2024-10-03', '2024-10-02'], dtype='datetime64[ns]', name='Date', length=141, freq=None)

Filtered DataFrame info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 385000 entries, ('WTFC', Timestamp('2025-04-25 00:00:00')) to ('AEG', Timestamp('2024-04-26 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-N

In [14]:
# --- Save Cleaned Data ---
# # Save processed data to pickle file
# df_clean.to_pickle(dest_path)

# Using PyArrow (default, recommended for most cases)
df_clean.to_parquet(dest_path, engine='pyarrow', compression='zstd')
print(f"\n[Save Successful] Cleaned data saved to:\n{dest_path}")




[Save Successful] Cleaned data saved to:
..\data\2025-04-25_df_OHLCV_clean_stocks_etfs.parquet
