### **1. The New Pre-processing Function**

This function encapsulates the entire cleaning and filtering logic we designed.

In [1]:
def _show_removed_tickers(summary_df, tickers, reason):
    """Print the per-ticker summary rows of the tickers removed for `reason`.

    Does nothing when `tickers` is empty. Inside a notebook the rows are
    rendered as a formatted Styler via IPython's `display`; when IPython (or
    jinja2, which `DataFrame.style` needs) is not installed, the same rows are
    printed as plain text so the function stays usable from scripts and tests.
    """
    if not tickers:
        return

    print(f"Tickers removed for {reason}:")
    formats = {
        'data_points': '{:,.0f}', 'nan_pct': '{:.2%}', 'median_dollar_volume': '${:,.0f}'
    }
    removed = summary_df.loc[tickers]
    try:
        # `display` is only a builtin inside an IPython kernel; importing it
        # explicitly avoids a NameError everywhere else.
        from IPython.display import display
        display(removed.style.format(formats))
    except ImportError:
        print(removed.to_string(
            formatters={col: fmt.format for col, fmt in formats.items()}
        ))


def preprocess_data(df_ohlcv, min_days_history=60, max_nan_pct=0.15, min_median_dollar_volume=100000):
    """
    Cleans and filters the raw OHLCV DataFrame.

    Phase 1: Sanitizes bad trading days (zero volume, or High == Low) by
             setting the adjusted OHLC prices of those rows to NaN.
    Phase 2: Filters out entire tickers, in this order:
             1. fewer than `min_days_history` non-NaN 'Adj Close' values,
             2. more than `max_nan_pct` of rows with a NaN 'Adj Close',
             3. median dollar volume ('Adj Close' * 'Volume') below
                `min_median_dollar_volume`, or not computable (NaN).
             Each filter only looks at tickers that survived the previous one.

    Parameters
    ----------
    df_ohlcv : pandas.DataFrame
        Must have an index level named 'Ticker' and the columns 'Adj Open',
        'Adj High', 'Adj Low', 'Adj Close' and 'Volume'. It is not modified.
    min_days_history : int
        Minimum number of valid (non-NaN) closes a ticker needs to be kept.
    max_nan_pct : float
        Maximum tolerated fraction (0-1) of NaN closes per ticker.
    min_median_dollar_volume : float
        Minimum median daily dollar volume a ticker needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving tickers only. Sanitized rows are kept,
        with NaN prices and their original 'Volume'.
    """
    print("--- Starting Data Pre-processing ---")

    # --- Initial Setup ---
    df = df_ohlcv.copy()  # work on a copy so the caller's frame is untouched
    initial_tickers = df.index.get_level_values('Ticker').unique()
    print(f"Initial number of tickers: {len(initial_tickers)}")

    # --- Phase 1: Row-Level Sanitization (Mark bad days as NaN) ---
    # A row is treated as non-trading when nothing traded or the price never moved.
    mask_bad_days = (df['Volume'] == 0) | (df['Adj High'] == df['Adj Low'])

    # Nullify only the prices; Volume is kept for the dollar-volume calculation.
    columns_to_nan = ['Adj Open', 'Adj High', 'Adj Low', 'Adj Close']
    df.loc[mask_bad_days, columns_to_nan] = np.nan
    print(f"Phase 1: Identified {mask_bad_days.sum()} stale/non-trading days and marked OHLC as NaN.")

    # --- Phase 2: Ticker-Level Filtering (Remove bad tickers) ---
    print("\n--- Phase 2: Analyzing Tickers for Removal ---")

    # Computed after Phase 1, so sanitized days contribute NaN and are ignored
    # by the median.
    df['DollarVolume'] = df['Adj Close'] * df['Volume']

    # One row per ticker; named aggregation yields flat column names directly.
    summary_df = df.groupby(level='Ticker').agg(
        data_points=('Adj Close', 'count'),  # 'count' ignores NaNs
        nan_pct=('Adj Close', lambda x: x.isna().mean()),
        median_dollar_volume=('DollarVolume', 'median'),
    )

    # -- Filter 1: Insufficient History --
    history_mask = summary_df['data_points'] < min_days_history
    remove_history = summary_df[history_mask].index.tolist()
    print(f"\n[Filter 1] Removing {len(remove_history)} tickers with < {min_days_history} data points.")
    _show_removed_tickers(summary_df, remove_history, "INSUFFICIENT HISTORY")

    # -- Filter 2: Excessive Stale Data --
    remaining_tickers_df = summary_df.drop(remove_history)
    nan_mask = remaining_tickers_df['nan_pct'] > max_nan_pct
    remove_nan = remaining_tickers_df[nan_mask].index.tolist()
    print(f"\n[Filter 2] Removing {len(remove_nan)} tickers with > {max_nan_pct:.0%} stale/NaN days.")
    _show_removed_tickers(summary_df, remove_nan, "EXCESSIVE STALE DATA")

    # -- Filter 3: Low Liquidity --
    remaining_tickers_df = remaining_tickers_df.drop(remove_nan)
    # Written as "not >=" rather than "<" so that a NaN median (no usable
    # volume data at all) is rejected instead of silently passing the filter.
    liq_mask = ~(remaining_tickers_df['median_dollar_volume'] >= min_median_dollar_volume)
    remove_liq = remaining_tickers_df[liq_mask].index.tolist()
    print(f"\n[Filter 3] Removing {len(remove_liq)} tickers with < ${min_median_dollar_volume:,.0f} median dollar volume.")
    _show_removed_tickers(summary_df, remove_liq, "LOW LIQUIDITY")

    # --- Final Execution ---
    tickers_to_remove = set(remove_history + remove_nan + remove_liq)
    final_tickers = initial_tickers.drop(list(tickers_to_remove))

    df_clean = df[df.index.get_level_values('Ticker').isin(final_tickers)]

    print("\n--- Pre-processing Summary ---")
    print(f"Total tickers removed: {len(tickers_to_remove)}")
    print(f"Final number of tickers: {len(final_tickers)}")

    # The helper column was only needed for the liquidity summary.
    return df_clean.drop(columns=['DollarVolume'])

### **2. The Isolated Test Script**

This script generates sample data, deliberately adds "bad" tickers, runs our new function, and allows us to inspect the verbose output.


In [9]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display settings: show up to 200 rows, never truncate
# columns, and allow very wide lines so frames are not wrapped.
pd.set_option(
    'display.max_rows', 200,
    'display.max_columns', None,
    'display.width', 3000,
)

In [6]:
# --- ISOLATED TEST SCRIPT FOR PRE-PROCESSING ---
# Builds a synthetic (Ticker, Date)-indexed OHLCV frame containing known-good
# and deliberately bad tickers, runs preprocess_data on it with the default
# thresholds, and asserts which tickers survive.

# Use the same sample data generation from our main script
date_rng = pd.date_range(start='2023-01-01', end='2023-10-31', freq='B')
tickers = ['GOOD1', 'GOOD2', 'GOOD3'] # Start with known good tickers
data = []
# Fixed seed so the random walks below are reproducible. The order of the
# np.random calls matters: reordering them changes the generated data.
np.random.seed(42)
for ticker in tickers:
    # Random-walk close price around 100
    price = 100 + (np.random.randn(len(date_rng)).cumsum() * 0.5)
    # Ensure High is always >= Low
    high_add = np.random.uniform(0, 2, size=len(date_rng))
    low_sub = np.random.uniform(0, 2, size=len(date_rng))
    high = price + high_add
    low = price - low_sub
    open_price = (high + low) / 2
    volume = np.random.randint(50000, 1000000, size=len(date_rng))
    ticker_df = pd.DataFrame({'Date': date_rng,'Ticker': ticker,'Adj Open': open_price,'Adj High': high,'Adj Low': low,'Adj Close': price,'Volume': volume})
    data.append(ticker_df)
df_sample = pd.concat(data).set_index(['Ticker', 'Date'])

# --- 1. Inject Known "Bad" Data for Testing ---
print("--- Injecting known 'bad' data for test ---")
bad_data = []

# Ticker A: Insufficient History (will be filtered by Filter 1)
# Only 20 business days, below the default min_days_history of 60.
hist_rng = pd.date_range(start='2023-10-01', periods=20, freq='B')
bad_data.append(pd.DataFrame({
    'Date': hist_rng, 'Ticker': 'BAD_HIST', 'Adj Open': 10, 'Adj High': 10.1, 
    'Adj Low': 9.9, 'Adj Close': 10, 'Volume': 500000
}))

# Ticker B: Low Liquidity (will be filtered by Filter 3)
# Price of 1 with volume under 20,000 keeps the median dollar volume well
# below the default 100,000 threshold.
bad_data.append(pd.DataFrame({
    'Date': date_rng, 'Ticker': 'BAD_LIQ', 'Adj Open': 1, 'Adj High': 1.1, 
    'Adj Low': 0.9, 'Adj Close': 1, 'Volume': np.random.randint(1000, 20000, size=len(date_rng))
}))

# Ticker C: Zero Volume on every day, with flat prices (High == Low), so both
# Phase 1 conditions hit every row and all of its prices become NaN.
# Because Filter 1 counts only non-NaN closes, this ticker ends up with 0 data
# points and is removed by Filter 1 (insufficient history), not by Filter 2.
# NOTE(review): as a result no ticker in this script exercises Filter 2; a
# ticker with enough valid days but more than 15% stale days would be needed.
bad_data.append(pd.DataFrame({
    'Date': date_rng, 'Ticker': 'BAD_VOL_NAN', 'Adj Open': 50, 'Adj High': 50, 
    'Adj Low': 50, 'Adj Close': 50, 'Volume': 0
}))

# Ticker D: Another good ticker for comparison that will NOT be filtered
# Constant prices but High != Low and non-zero volume, so Phase 1 leaves it alone.
good_data_df = pd.DataFrame({
    'Date': date_rng, 'Ticker': 'GOOD4', 'Adj Open': 200, 'Adj High': 202, 
    'Adj Low': 198, 'Adj Close': 200, 'Volume': 200000
})

# Combine all data into a single test DataFrame
df_test = pd.concat([df_sample.reset_index()] + bad_data + [good_data_df]).set_index(['Ticker', 'Date']).sort_index()

print(f"Test DataFrame created with {len(df_test.index.get_level_values('Ticker').unique())} tickers.")
print("-" * 50)

# --- 2. Run the Pre-processing Function ---
df_OHLCV_clean = preprocess_data(df_test)

# --- 3. Final Verification ---
print("\n--- Final Verification ---")
final_tickers_list = df_OHLCV_clean.index.get_level_values('Ticker').unique().tolist()
print(f"Tickers remaining in the cleaned DataFrame: {final_tickers_list}")

# These checks only verify that each bad ticker is gone, not which filter removed it.
assert 'BAD_HIST' not in final_tickers_list, "BAD_HIST should have been removed."
assert 'BAD_LIQ' not in final_tickers_list, "BAD_LIQ should have been removed."
assert 'BAD_VOL_NAN' not in final_tickers_list, "BAD_VOL_NAN should have been removed."
assert 'GOOD1' in final_tickers_list, "GOOD1 should have been kept."
assert 'GOOD4' in final_tickers_list, "GOOD4 should have been kept."

print("\n✅ All assertions passed. The function correctly identified and removed the bad tickers.")

--- Injecting known 'bad' data for test ---
Test DataFrame created with 7 tickers.
--------------------------------------------------
--- Starting Data Pre-processing ---
Initial number of tickers: 7
Phase 1: Identified 217 stale/non-trading days and marked OHLC as NaN.

--- Phase 2: Analyzing Tickers for Removal ---

[Filter 1] Removing 2 tickers with < 60 data points.
Tickers removed for INSUFFICIENT HISTORY:


Unnamed: 0_level_0,data_points,nan_pct,median_dollar_volume
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BAD_HIST,20,0.00%,"$5,000,000"
BAD_VOL_NAN,0,100.00%,$nan



[Filter 2] Removing 0 tickers with > 15% stale/NaN days.

[Filter 3] Removing 1 tickers with < $100,000 median dollar volume.
Tickers removed for LOW LIQUIDITY:


Unnamed: 0_level_0,data_points,nan_pct,median_dollar_volume
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BAD_LIQ,217,0.00%,"$10,227"



--- Pre-processing Summary ---
Total tickers removed: 3
Final number of tickers: 4

--- Final Verification ---
Tickers remaining in the cleaned DataFrame: ['GOOD1', 'GOOD2', 'GOOD3', 'GOOD4']

✅ All assertions passed. The function correctly identified and removed the bad tickers.


In [13]:
# Print the test frame's index range, columns, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1322 entries, ('BAD_HIST', Timestamp('2023-10-02 00:00:00')) to ('GOOD4', Timestamp('2023-10-31 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Adj Open   1322 non-null   float64
 1   Adj High   1322 non-null   float64
 2   Adj Low    1322 non-null   float64
 3   Adj Close  1322 non-null   float64
 4   Volume     1322 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 98.1+ KB


In [17]:
# Walk every ticker found in the first index level and print the first and
# last five rows of its slice, so each synthetic series can be checked by eye.
tickers = df_test.index.unique(level=0)

for tic in tickers:
    ticker_rows = df_test.loc[tic]
    print(f'--- {tic} ---')
    # One print with a newline separator emits the same lines as separate prints.
    print('head:', ticker_rows.head(), 'tail:', ticker_rows.tail(), '======', sep='\n')

--- BAD_HIST ---
head:
            Adj Open  Adj High  Adj Low  Adj Close  Volume
Date                                                      
2023-10-02      10.0      10.1      9.9       10.0  500000
2023-10-03      10.0      10.1      9.9       10.0  500000
2023-10-04      10.0      10.1      9.9       10.0  500000
2023-10-05      10.0      10.1      9.9       10.0  500000
2023-10-06      10.0      10.1      9.9       10.0  500000
tail:
            Adj Open  Adj High  Adj Low  Adj Close  Volume
Date                                                      
2023-10-23      10.0      10.1      9.9       10.0  500000
2023-10-24      10.0      10.1      9.9       10.0  500000
2023-10-25      10.0      10.1      9.9       10.0  500000
2023-10-26      10.0      10.1      9.9       10.0  500000
2023-10-27      10.0      10.1      9.9       10.0  500000
--- BAD_LIQ ---
head:
            Adj Open  Adj High  Adj Low  Adj Close  Volume
Date                                                      
2023-

In [18]:
# Show the Date-indexed rows of the short-history ticker 'BAD_HIST'.
df_test.loc['BAD_HIST']

Unnamed: 0_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-02,10.0,10.1,9.9,10.0,500000
2023-10-03,10.0,10.1,9.9,10.0,500000
2023-10-04,10.0,10.1,9.9,10.0,500000
2023-10-05,10.0,10.1,9.9,10.0,500000
2023-10-06,10.0,10.1,9.9,10.0,500000
2023-10-09,10.0,10.1,9.9,10.0,500000
2023-10-10,10.0,10.1,9.9,10.0,500000
2023-10-11,10.0,10.1,9.9,10.0,500000
2023-10-12,10.0,10.1,9.9,10.0,500000
2023-10-13,10.0,10.1,9.9,10.0,500000


In [19]:
# Show the raw rows of 'BAD_VOL_NAN' before pre-processing: this is df_test,
# so the prices are still the injected flat 50 with zero volume, not NaN.
df_test.loc['BAD_VOL_NAN']

Unnamed: 0_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-02,50.0,50.0,50.0,50.0,0
2023-01-03,50.0,50.0,50.0,50.0,0
2023-01-04,50.0,50.0,50.0,50.0,0
2023-01-05,50.0,50.0,50.0,50.0,0
2023-01-06,50.0,50.0,50.0,50.0,0
...,...,...,...,...,...
2023-10-25,50.0,50.0,50.0,50.0,0
2023-10-26,50.0,50.0,50.0,50.0,0
2023-10-27,50.0,50.0,50.0,50.0,0
2023-10-30,50.0,50.0,50.0,50.0,0
