#### Read OHLCV data from source_path
#### Use VOO's date index as a reference for dates
#### Filter out symbols whose date index does not match VOO's date index
#### Filter out symbols from a MultiIndex DataFrame that have:
1. Any missing values in any column
2. Missing any dates present in the original DataFrame's date index
#### Save dataframe to dest_path    

In [23]:
import pandas as pd

# Widen pandas' text rendering so wide frames print without column truncation
display_options = {
    'display.max_columns': None,  # None = show every column
    # 'display.max_rows': 10,     # uncomment to limit printed rows for readability
    'display.width': 2000,        # wide enough that rows are not wrapped
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [24]:
# Input (raw OHLCV) and output (cleaned OHLCV) parquet locations, relative to the notebook
source_path = '../data/df_OHLCV_stocks_etfs.parquet'
dest_path = '../data/df_OHLCV_clean_stocks_etfs.parquet'

# Echo both paths so the run log records which files were used
print(
    f'source_path: {source_path}',
    f'dest_path: {dest_path}',
    sep='\n',
)

source_path: ../data/df_OHLCV_stocks_etfs.parquet
dest_path: ../data/df_OHLCV_clean_stocks_etfs.parquet


In [25]:
# --- Data Loading & Initial Inspection ---
import pandas as pd

# Load the raw OHLCV data from the parquet file at source_path
df_raw = pd.read_parquet(source_path, engine='pyarrow')
# Number of distinct values in index level 0 (the ticker level) before any filtering
raw_ticker_count = len(df_raw.index.get_level_values(0).unique())

# Display initial data structure
print(f'df_raw.head(3):\n{df_raw.head(3)}\n')
print(f'\ndf_raw.tail(3):\n{df_raw.tail(3)}\n')
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be interpolated into an f-string (that emits the report before the
# header and then prints "None"). Print the header, then call info() directly.
print('\ndf_raw.info():')
df_raw.info()
print(f"\nNumber of tickers in df_raw data: {raw_ticker_count}")

df_raw.head(3):
                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Ticker Date                                                       
A      2025-05-09    108.96    109.86   106.80     106.93  1124390
       2025-05-08    108.00    110.65   106.55     108.70  2093300
       2025-05-07    106.69    107.60   104.79     107.52  2143700


df_raw.tail(3):
                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Ticker Date                                                       
ZWS    2024-02-05   30.2039   31.2311  30.1348    30.8064  2476154
       2024-02-02   29.5816   30.5397  29.5421    30.4805  1078362
       2024-02-01   29.6113   30.1446  29.3347    29.8878   981876

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 498420 entries, ('A', Timestamp('2025-05-09 00:00:00')) to ('ZWS', Timestamp('2024-02-01 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Open   498420 non-nul

In [26]:
# Collect the distinct values of index level 0 of df_raw (the ticker level) as a plain list
ticker_level = df_raw.index.get_level_values(0)
unique_symbols = ticker_level.unique().tolist()

# Show the full symbol list and how many symbols it contains
print("unique_symbols: {}".format(unique_symbols))
print("Number of unique symbols: {}".format(len(unique_symbols)))


unique_symbols: ['A', 'AA', 'AAL', 'AAON', 'AAPL', 'ABBV', 'ABEV', 'ABNB', 'ABT', 'ACGL', 'ACI', 'ACIW', 'ACM', 'ACN', 'ACWI', 'ACWV', 'ACWX', 'ADBE', 'ADC', 'ADI', 'ADM', 'ADMA', 'ADP', 'ADSK', 'ADT', 'AEE', 'AEG', 'AEM', 'AEP', 'AER', 'AES', 'AFG', 'AFL', 'AFRM', 'AGCO', 'AGG', 'AGI', 'AGNC', 'AIG', 'AIRR', 'AIT', 'AIZ', 'AJG', 'AKAM', 'AL', 'ALAB', 'ALB', 'ALC', 'ALGN', 'ALK', 'ALL', 'ALLE', 'ALLY', 'ALNY', 'ALSN', 'ALV', 'AM', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN', 'AMH', 'AMLP', 'AMP', 'AMT', 'AMX', 'AMZN', 'AN', 'ANET', 'ANSS', 'AON', 'AOS', 'APA', 'APD', 'APG', 'APH', 'APO', 'APP', 'APPF', 'APTV', 'AR', 'ARCC', 'ARE', 'ARES', 'ARGX', 'ARKB', 'ARKK', 'ARM', 'ARMK', 'ARW', 'AS', 'ASML', 'ASND', 'ASR', 'ASTS', 'ASX', 'ATI', 'ATO', 'ATR', 'AU', 'AUR', 'AVAV', 'AVB', 'AVDE', 'AVDV', 'AVEM', 'AVGO', 'AVLV', 'AVTR', 'AVUS', 'AVUV', 'AVY', 'AWI', 'AWK', 'AXON', 'AXP', 'AXS', 'AXTA', 'AYI', 'AZEK', 'AZN', 'AZO', 'BA', 'BABA', 'BAC', 'BAH', 'BALL', 'BAM', 'BAP', 'BAX', 'BBAX', 'BBCA', 'BBD

In [27]:
# Symbol whose presence in the loaded data is verified here
symbol_to_check = 'VOO'
is_present = symbol_to_check in unique_symbols
print("Is '{}' in the list? {}".format(symbol_to_check, is_present))

Is 'VOO' in the list? True


In [28]:
import sys
from pathlib import Path

# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))


# --- Data Filtering & Cleaning ---
import utils  # Custom utility functions

# 1. Align dates across all symbols using AAPL as reference
df_filter, filtered_out_symbols = utils.filter_df_dates_to_reference_symbol(df=df_raw, reference_symbol=symbol_to_check)

# 2. Remove symbols with missing data points
df_clean, missing_values_symbols = utils.filter_symbols_with_missing_values(df=df_filter)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using 'Ticker' as the symbol identifier.
Original number of Tickers: 1573
Number of Tickers after filtering: 1544
Number of Tickers filtered out: 29

First 10 Tickers that were filtered out:
['ALAB', 'TLX', 'WAY', 'LINE', 'LOAR', 'ULS', 'TEM', 'RDDT', 'GEV', 'SMBS']

Example of dates for first filtered out Ticker:

Dates for ALAB:
DatetimeIndex(['2025-05-09', '2025-05-08', '2025-05-07', '2025-05-06', '2025-05-05', '2025-05-02', '2025-05-01', '2025-04-30', '2025-04-29', '2025-04-28',
               ...
               '2024-04-03', '2024-04-02', '2024-04-01', '2024-03-28', '2024-03-27', '2024-03-26', '2024-03-25', '2024-03-22', '2024-03-21', '2024-03-20'], dtype='datetime64[ns]', name='Date', length=286, freq=None)

Filtered DataFrame info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 492536 entries, ('A', Timestamp('2025-05-09 00:00:00')) to ('ZWS', Timestamp('2024-02-01 00:00:00'))
Data columns

In [29]:
# Persist the cleaned frame as zstd-compressed parquet at dest_path
df_clean.to_parquet(
    dest_path,
    engine='pyarrow',
    compression='zstd',
)
print('\nsaved df_clean to {}'.format(dest_path))


saved df_clean to ../data/df_OHLCV_clean_stocks_etfs.parquet


In [30]:
# Count the distinct tickers (index level 0) remaining after each cleaning stage
filter_ticker_count = len(df_filter.index.get_level_values(0).unique())
cleaned_ticker_count = len(df_clean.index.get_level_values(0).unique())

# Recap which files were read and written
print(
    f'read df_raw from  {source_path}',
    f'saved df_clean to {dest_path}',
    sep='\n',
)

# List the symbols dropped at each stage
print(
    f'\n{len(filtered_out_symbols)} filtered_out_symbols: {filtered_out_symbols}',
    f'{len(missing_values_symbols)} missing_values_symbols: {missing_values_symbols}',
    sep='\n',
)

# Ticker counts: raw -> date-aligned -> cleaned
print(
    f"\nNumber of tickers in raw data:      {raw_ticker_count}",
    f"Number of tickers in filtered data: {filter_ticker_count}",
    f"Number of tickers in cleaned data:  {cleaned_ticker_count}",
    sep='\n',
)

read df_raw from  ../data/df_OHLCV_stocks_etfs.parquet
saved df_clean to ../data/df_OHLCV_clean_stocks_etfs.parquet

29 filtered_out_symbols: ['ALAB', 'TLX', 'WAY', 'LINE', 'LOAR', 'ULS', 'TEM', 'RDDT', 'GEV', 'SMBS', 'SOLV', 'ZK', 'BTC', 'JGLO', 'SW', 'NBIS', 'VG', 'SAIL', 'LB', 'OS', 'SFD', 'SARO', 'LTM', 'GOLD', 'CRWV', 'TTAN', 'MSTY', 'RBRK', 'VIK']
0 missing_values_symbols: []

Number of tickers in raw data:      1573
Number of tickers in filtered data: 1544
Number of tickers in cleaned data:  1544
