In [12]:
import pandas as pd
import numpy as np

from datetime import datetime, date
from IPython.display import display, Markdown
from pathlib import Path

# Notebook-wide pandas display settings: raise the row/column limits and the
# character width used when frames are printed in the cells below.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 3000)

In [13]:
download_path = Path.home() / "Downloads"
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
# OHLCV_file_path = r'c:\Users\ping\Files_win10\python\py311\stocks\data\df_OHLCV_clean_stocks_etfs.parquet'
OHLCV_file_path = r'c:\Users\ping\Files_win10\python\py311\stocks\data\df_OHLCV_stocks_etfs.parquet'

df_OHLCV = pd.read_parquet(OHLCV_file_path, engine='pyarrow')

# DataFrame.info() writes its report to stdout and returns None, so it must be
# called directly; embedding it in an f-string prints the report first and
# then the literal text "None" under the header.
print('df_OHLCV.info() :')
df_OHLCV.info()
print(f'\ndf_OHLCV.head():\n{df_OHLCV.head()}')
print(f'\ndf_OHLCV.tail():\n{df_OHLCV.tail()}')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2115382 entries, ('A', Timestamp('2020-01-02 00:00:00')) to ('ZWS', Timestamp('2025-10-03 00:00:00'))
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Adj Open   float64
 1   Adj High   float64
 2   Adj Low    float64
 3   Adj Close  float64
 4   Volume     int64  
dtypes: float64(4), int64(1)
memory usage: 88.9+ MB
df_OHLCV.info() :
None

df_OHLCV.head():
                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Ticker Date                                                       
A      2020-01-02   82.4973   82.9295  81.8250    82.5453  1468677
       2020-01-03   81.3160   81.9499  81.1527    81.2200  1164425
       2020-01-06   80.6725   81.4600  80.2884    81.4600  2075412
       2020-01-07   80.6341   81.8826  80.6149    81.7098  1754187
       2020-01-08   82.5549   83.0447  81.8250    82.5165  1923806

df_OHLCV.tail():
                   Adj Open  Adj High  Adj Low  Adj Close  Volume
Ti

In [14]:
# --- B. DYNAMIC DATA QUALITY FILTER FUNCTIONS ---
# FINAL RECOMMENDED VERSION v2
def calculate_rolling_quality_metrics(df_ohlcv, window=252, min_periods=126, debug=False):
    """
    Compute per-ticker rolling data-quality and liquidity metrics.

    The input is copied (and sorted by index if it is not already monotonic),
    three per-row flags/values are derived, and each is aggregated over a
    rolling window within every ticker.

    Args:
        df_ohlcv (pd.DataFrame): Frame indexed by a ('Ticker', 'Date')
            MultiIndex with 'Adj High', 'Adj Low', 'Adj Close' and 'Volume'
            columns.
        window (int): Number of rows in each rolling window. Defaults to 252.
        min_periods (int): Minimum rows required in a window before a value
            is emitted (NaN otherwise). Defaults to 126.
        debug (bool): When True, return the working frame (original columns,
            intermediate columns and two extra 'Debug_*' columns) joined with
            the rolling metrics. Defaults to False.

    Returns:
        pd.DataFrame: Indexed by ('Ticker', 'Date') with columns

            RollingStalePct
                Rolling mean of the stale flag (a row is stale when Volume is
                0 or Adj High equals Adj Low). Lower is better.
            RollingMedianVolume
                Rolling median of dollar volume (Adj Close * Volume).
                Higher is better.
            RollingSameVolCount
                Rolling count of rows whose Volume equals the previous row's
                Volume for the same ticker. Lower is better.

        With debug=True the original and intermediate columns precede these.
    """
    print(f"--- Calculating Rolling Quality Metrics (Window: {window} days) ---")
    df = df_ohlcv.copy()

    # Rolling windows and diff() both rely on chronological order per ticker.
    if not df.index.is_monotonic_increasing:
        print("ℹ️ Data is not sorted. Sorting index chronologically...")
        df.sort_index(inplace=True)

    # --- Per-row building blocks ---
    zero_volume = df['Volume'] == 0
    flat_range = df['Adj High'] == df['Adj Low']
    df['IsStale'] = np.where(zero_volume | flat_range, 1, 0)
    df['DollarVolume'] = df['Adj Close'] * df['Volume']
    # diff() is taken inside each ticker so the first row of a ticker is never
    # compared with the last row of the previous one.
    volume_change = df.groupby(level='Ticker')['Volume'].diff()
    df['HasSameVolumeAsPrevDay'] = (volume_change == 0).astype(int)

    if debug:
        print("...Adding debug component columns for 'IsStale'.")
        df['Debug_HasZeroVolume'] = zero_volume.astype(int)
        df['Debug_IsHighEqLow'] = flat_range.astype(int)

    # --- Rolling aggregation within each ticker ---
    roll_kwargs = {'window': window, 'min_periods': min_periods}
    per_ticker = df.groupby(level='Ticker')
    rolled = [
        per_ticker['IsStale'].rolling(**roll_kwargs).mean(),
        per_ticker['DollarVolume'].rolling(**roll_kwargs).median(),
        per_ticker['HasSameVolumeAsPrevDay'].rolling(**roll_kwargs).sum(),
    ]

    quality_df = pd.concat(rolled, axis=1)
    quality_df.columns = ['RollingStalePct', 'RollingMedianVolume', 'RollingSameVolCount']
    # groupby().rolling() prepends the group key as an extra index level; drop it
    # to get back to ('Ticker', 'Date').
    quality_df.index = quality_df.index.droplevel(0)

    print("✅ Rolling metrics calculation complete.")

    if not debug:
        return quality_df

    print("...Debug mode enabled, returning full calculation trace.")
    return df.join(quality_df)


def get_eligible_universe(quality_metrics_df, filter_date, thresholds):
    """
    Return the tickers whose quality metrics pass every threshold on one date.

    Args:
        quality_metrics_df (pd.DataFrame): ('Ticker', 'Date')-indexed frame with
            'RollingMedianVolume', 'RollingStalePct' and 'RollingSameVolCount'
            columns (the output of calculate_rolling_quality_metrics).
        filter_date (pd.Timestamp or str): Date whose cross-section is filtered.
        thresholds (dict): Must contain 'min_median_dollar_volume',
            'max_stale_pct' and 'max_same_vol_count'. All bounds are inclusive.

    Returns:
        list: Ticker symbols passing all three checks on filter_date. Rows with
        NaN metrics fail the comparisons and are excluded. If filter_date is
        not present in the index, a warning is printed and every ticker in the
        frame is returned.
    """
    try:
        snapshot = quality_metrics_df.xs(pd.to_datetime(filter_date), level='Date')
    except KeyError:
        # Fail-open: an unknown date yields the unfiltered ticker list.
        print(f"Warning: Filter date {pd.to_datetime(filter_date).date()} not found in quality metrics index. Returning all tickers.")
        return quality_metrics_df.index.get_level_values('Ticker').unique().tolist()

    liquid_enough = snapshot['RollingMedianVolume'] >= thresholds['min_median_dollar_volume']
    fresh_enough = snapshot['RollingStalePct'] <= thresholds['max_stale_pct']
    few_repeats = snapshot['RollingSameVolCount'] <= thresholds['max_same_vol_count']
    passes = liquid_enough & fresh_enough & few_repeats

    eligible_tickers = snapshot.index[passes.to_numpy()].tolist()
    total = len(snapshot.index)

    print(f"Dynamic Filter ({pd.to_datetime(filter_date).date()}): Kept {len(eligible_tickers)} of {total} tickers.")
    return eligible_tickers


    Metrics Description:
        RollingStalePct (float):
            The rolling percentage (0.0 to 1.0) of days considered 'stale' within
            the lookback window. A day is flagged as stale if its trading volume is
            zero OR its high price is equal to its low price. This metric helps
            identify non-trading assets or assets with poor quality data feeds.
            **A lower value is better.**

        RollingMedianVolume (float):
            The rolling median of the daily dollar volume (Adj Close * Volume). The
            median is used to provide a robust measure of typical liquidity that is
            insensitive to single-day volume spikes (outliers). This metric is crucial
            for filtering out illiquid stocks where trades could incur significant
            slippage.
            **A higher value is better.**

        RollingSameVolCount (float):
            A rolling count of the number of times a day's volume was exactly equal
            to the previous day's volume. This is a heuristic for detecting
            potential low-quality data feeds, as this event is statistically rare
            for actively traded assets and may indicate improper forward-filling of
            missing data.
            **A lower value is better.**

In [15]:
# Inspection rules consumed by get_eligible_universe (all bounds inclusive)
my_thresholds = dict(
    min_median_dollar_volume=10_600_000,  # liquidity floor on rolling median dollar volume
    max_stale_pct=0.1,                    # at most 10% stale days in the window
    max_same_vol_count=1,                 # at most 1 repeated-volume day in the window
)

In [16]:
quality_df = calculate_rolling_quality_metrics(
    df_ohlcv=df_OHLCV,
    window=252,
    min_periods=126,
    # debug=True  # <-- uncomment to get the full calculation trace
)

# Show the cross-section for the most recent date in the data. The date is
# derived from the index rather than hard-coded so the cell keeps working (and
# the "Last Day" label stays true) after the parquet file is refreshed.
last_date = quality_df.index.get_level_values('Date').max()
print(f"--- Quality Metrics for {last_date.date()} (Last Day) ---")
print(quality_df.xs(last_date, level='Date'))


--- Calculating Rolling Quality Metrics (Window: 252 days) ---
✅ Rolling metrics calculation complete.
--- Quality Metrics for 2025-10-03 (Last Day) ---
        RollingStalePct  RollingMedianVolume  RollingSameVolCount
Ticker                                                           
A                   0.0         2.184756e+08                  0.0
AA                  0.0         1.724810e+08                  0.0
AAL                 0.0         6.185211e+08                  0.0
AAON                0.0         7.029373e+07                  0.0
AAPL                0.0         1.056353e+10                  0.0
...                 ...                  ...                  ...
ZM                  0.0         1.973018e+08                  0.0
ZS                  0.0         3.879914e+08                  0.0
ZTO                 0.0         4.067204e+07                  0.0
ZTS                 0.0         4.400619e+08                  0.0
ZWS                 0.0         3.283692e+07           

### Code to Generate the Test DataFrame

In [None]:
def create_test_dataframe():
    """
    Generates a synthetic DataFrame with specific edge cases to test the
    quality metric calculation logic.

    Every ticker starts from the same clean ramp (prices and volume increase by
    a fixed step per business day). Edge cases are then injected:

        GOOD          untouched baseline.
        VOL_0         zero volume on days 4 and 7.
        H_EQ_L        Adj High == Adj Low on day 5.
        SAME_VOL      day 4 volume repeats day 3; days 7-9 share one volume
                      (three repeated-volume events in total).
        MIXED_BAD     day 4 has zero volume and High == Low; day 6 volume
                      repeats day 5.
        AT_THE_START  only the first 4 days of data.

    Rows are emitted ticker by ticker in the order listed in `tickers`, so the
    resulting index is not sorted.

    Returns:
        pd.DataFrame: A DataFrame with a ('Ticker', 'Date') MultiIndex, ready for testing.
    """
    print("--- Creating synthetic test DataFrame ---")

    # Ten consecutive business days ('B' frequency) starting 2024-01-01.
    dates = pd.date_range(start='2024-01-01', periods=10, freq='B')

    data = []
    tickers = ['GOOD', 'VOL_0', 'H_EQ_L', 'SAME_VOL', 'MIXED_BAD', 'AT_THE_START']

    for ticker in tickers:
        # Ticker 'AT_THE_START' only has 4 days of data
        num_days = 4 if ticker == 'AT_THE_START' else 10
        for i in range(num_days):
            # Clean default row; edge cases are injected afterwards.
            data.append({
                'Ticker': ticker,
                'Date': dates[i],
                'Adj Open': 100 + i,
                'Adj High': 102 + i,
                'Adj Low': 98 + i,
                'Adj Close': 100 + i,
                'Volume': 100000 + (i * 1000)
            })

    df = pd.DataFrame(data)

    def row(ticker, day):
        """Boolean mask selecting `ticker` on the 1-based trading `day`."""
        return (df['Ticker'] == ticker) & (df['Date'] == dates[day - 1])

    def volume_on(ticker, day):
        """Current Volume of `ticker` on the 1-based trading `day`."""
        return df.loc[row(ticker, day), 'Volume'].values[0]

    # --- Inject the specific edge cases ---

    # 2. Ticker: VOL_0
    df.loc[row('VOL_0', 4), 'Volume'] = 0
    df.loc[row('VOL_0', 7), 'Volume'] = 0

    # 3. Ticker: H_EQ_L
    df.loc[row('H_EQ_L', 5), ['Adj High', 'Adj Low']] = [50.0, 50.0]

    # 4. Ticker: SAME_VOL
    # Day 4 must repeat Day 3's volume. Copy it from the frame instead of
    # hard-coding a number: the previous literal (100000) was Day 1's volume,
    # not Day 3's (102000), so no repeated-volume event was actually created.
    df.loc[row('SAME_VOL', 4), 'Volume'] = volume_on('SAME_VOL', 3)
    df.loc[row('SAME_VOL', 7), 'Volume'] = 250000
    df.loc[row('SAME_VOL', 8), 'Volume'] = 250000  # matches Day 7
    df.loc[row('SAME_VOL', 9), 'Volume'] = 250000  # matches Day 8

    # 5. Ticker: MIXED_BAD
    df.loc[row('MIXED_BAD', 4), ['Volume', 'Adj High', 'Adj Low']] = [0, 60.0, 60.0]
    df.loc[row('MIXED_BAD', 6), 'Volume'] = volume_on('MIXED_BAD', 5)  # Day 6 vol matches Day 5

    # Set the MultiIndex
    df.set_index(['Ticker', 'Date'], inplace=True)

    print("✅ Synthetic test DataFrame created successfully.")
    return df

# --- How to use this code ---

# 1. Generate the test DataFrame
test_df_ohlcv = create_test_dataframe()

# 2. Display the raw input data to see our setup
print("\n--- Raw Test Input Data ---")
print(test_df_ohlcv.to_string())

### How to Run the Test

Run the `calculate_rolling_quality_metrics` function with `test_df_ohlcv` as the input. Use small window parameters (`window=5`, `min_periods=3`, as in the next cell), because the synthetic data has at most 10 rows per ticker.

In [18]:
# Small window/min_periods because the synthetic data has at most 10 rows per ticker.
test_quality_df = calculate_rolling_quality_metrics(test_df_ohlcv, window=5, min_periods=3)
# Pass debug=True above to get the full calculation trace instead.

# Cross-section for the final synthetic date; this is what the eligibility
# filter will see.
print("--- Quality Metrics for 2024-01-12 (Last Day) ---")
print(test_quality_df.xs('2024-01-12', level='Date'))


--- Calculating Rolling Quality Metrics (Window: 5 days) ---
ℹ️ Data is not sorted. Sorting index chronologically...
✅ Rolling metrics calculation complete.
--- Quality Metrics for 2024-01-12 (Last Day) ---
           RollingStalePct  RollingMedianVolume  RollingSameVolCount
Ticker                                                              
GOOD                   0.0           11449000.0                  0.0
H_EQ_L                 0.0           11449000.0                  0.0
MIXED_BAD              0.0           11449000.0                  1.0
SAME_VOL               0.0           26500000.0                  2.0
VOL_0                  0.2           11449000.0                  0.0


In [35]:
# Rich display of the rolling metrics computed from the synthetic test data.
test_quality_df

Unnamed: 0_level_0,Unnamed: 1_level_0,RollingStalePct,RollingMedianVolume,RollingSameVolCount
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT_THE_START,2024-01-01,,,
AT_THE_START,2024-01-02,,,
AT_THE_START,2024-01-03,0.0,10201000.0,0.0
AT_THE_START,2024-01-04,0.0,10302500.0,0.0
GOOD,2024-01-01,,,
GOOD,2024-01-02,,,
GOOD,2024-01-03,0.0,10201000.0,0.0
GOOD,2024-01-04,0.0,10302500.0,0.0
GOOD,2024-01-05,0.0,10404000.0,0.0
GOOD,2024-01-08,0.0,10609000.0,0.0


#### Step 2: Define the Rules (The `thresholds`)

Next, we need a set of inspection rules. This walkthrough reuses the `my_thresholds` dictionary defined earlier in the notebook; with those values, some of our test tickers pass the filter and others are excluded.

#### Step 3: Choose a Date to Inspect (The `filter_date`)

The inspector only works on one day at a time. Let's pick a late date in our test series so all the rolling windows are full.

In [20]:
# Date whose cross-section will be filtered into an eligible universe
my_filter_date = pd.Timestamp('2024-01-12')
my_filter_date

Timestamp('2024-01-12 00:00:00')

#### Step 4: Run the Inspector (`get_eligible_universe`)

With all the inputs ready, we can now call the function.

In [21]:
# Apply the inspection rules to the synthetic metrics on the chosen date
eligible_tickers = get_eligible_universe(test_quality_df, my_filter_date, my_thresholds)

print("\n--- Inspection Results ---")
print(f"Eligible Tickers for {my_filter_date.date()}: {eligible_tickers}")

Dynamic Filter (2024-01-12): Kept 3 of 5 tickers.

--- Inspection Results ---
Eligible Tickers for 2024-01-12: ['GOOD', 'H_EQ_L', 'MIXED_BAD']


In [22]:
# Print the complete metrics table so every row can be checked by eye.
print("\n--- Full Verification Output ---")
print(test_quality_df.to_string())

# Optional: Save to CSV for easy viewing.
# to_csv does not create missing directories, so make sure the export folder
# exists first; otherwise this cell fails on a fresh checkout.
export_dir = Path("export_csv")
export_dir.mkdir(parents=True, exist_ok=True)
test_quality_df.to_csv(export_dir / "full_verification_output.csv")


--- Full Verification Output ---
                         RollingStalePct  RollingMedianVolume  RollingSameVolCount
Ticker       Date                                                                 
AT_THE_START 2024-01-01              NaN                  NaN                  NaN
             2024-01-02              NaN                  NaN                  NaN
             2024-01-03             0.00           10201000.0                  0.0
             2024-01-04             0.00           10302500.0                  0.0
GOOD         2024-01-01              NaN                  NaN                  NaN
             2024-01-02              NaN                  NaN                  NaN
             2024-01-03             0.00           10201000.0                  0.0
             2024-01-04             0.00           10302500.0                  0.0
             2024-01-05             0.00           10404000.0                  0.0
             2024-01-08             0.00           10

In [23]:
# Recompute the metrics on the full data set using the function's default
# window and min_periods.
quality_df = calculate_rolling_quality_metrics(df_OHLCV)

--- Calculating Rolling Quality Metrics (Window: 252 days) ---
✅ Rolling metrics calculation complete.


In [24]:
# DataFrame.info() prints its report itself and returns None; calling it inside
# an f-string emitted the report first and then "None" under the header.
print('quality_df.info() :')
quality_df.info()
print(f'\nquality_df.head():\n{quality_df.head()}')
print(f'\nquality_df.tail(50):\n{quality_df.tail(50)}')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2115382 entries, ('A', Timestamp('2020-01-02 00:00:00')) to ('ZWS', Timestamp('2025-10-03 00:00:00'))
Data columns (total 3 columns):
 #   Column               Dtype  
---  ------               -----  
 0   RollingStalePct      float64
 1   RollingMedianVolume  float64
 2   RollingSameVolCount  float64
dtypes: float64(3)
memory usage: 56.6+ MB
quality_df.info() :
None

quality_df.head():
                   RollingStalePct  RollingMedianVolume  RollingSameVolCount
Ticker Date                                                                 
A      2020-01-02              NaN                  NaN                  NaN
       2020-01-03              NaN                  NaN                  NaN
       2020-01-06              NaN                  NaN                  NaN
       2020-01-07              NaN                  NaN                  NaN
       2020-01-08              NaN                  NaN                  NaN

quality_df.tail(50)

### Verification Code

In [25]:
def verify_ticker_rolling_quality_metrics_calculations(df_ohlcv, ticker_symbol, window=252, min_periods=126, output_filename=None):
    """
    Isolates a single ticker and runs the quality metric calculations step-by-step,
    outputting all intermediate and final results to a CSV for verification.

    The calculations are deliberately re-implemented here without groupby so
    the result can serve as an independent cross-check of
    calculate_rolling_quality_metrics.

    Args:
        df_ohlcv (pd.DataFrame): The original, raw OHLCV DataFrame with a
                                 ('Ticker', 'Date') MultiIndex.
        ticker_symbol (str): The symbol of the ticker you want to verify (e.g., 'A').
        window (int): The same window size used in the main function.
        min_periods (int): The same min_periods value used in the main function.
        output_filename (str, optional): The name of the output CSV file.
                                         If None, a default name under
                                         'export_csv/' is created. Missing parent
                                         directories are created automatically.

    Returns:
        pd.DataFrame or None: The ticker's rows (indexed by Date) with the
        intermediate columns 'IsStale', 'DollarVolume', 'HasSameVolumeAsPrevDay'
        and the rolling columns 'RollingStalePct', 'RollingMedianVolume',
        'RollingSameVolCount'. None if the ticker is not in df_ohlcv.
    """
    print(f"--- Verifying calculations for ticker: {ticker_symbol} ---")

    # 1. Isolate the data for the single ticker
    try:
        # Using .loc[ticker_symbol] on the first level of the MultiIndex
        ticker_df = df_ohlcv.loc[ticker_symbol].copy()
    except KeyError:
        print(f"❌ Error: Ticker '{ticker_symbol}' not found in the DataFrame.")
        return

    # 2. Ensure data is sorted chronologically (critical for rolling/diff)
    ticker_df.sort_index(inplace=True)

    print(f"Found {len(ticker_df)} data points for {ticker_symbol}.")

    # 3. Re-create the intermediate calculations (same logic as the main function)
    # Since this is for a single ticker, we don't need groupby()
    print("Step 1: Calculating intermediate values (IsStale, DollarVolume, etc.)...")
    ticker_df['IsStale'] = np.where((ticker_df['Volume'] == 0) | (ticker_df['Adj High'] == ticker_df['Adj Low']), 1, 0)
    ticker_df['DollarVolume'] = ticker_df['Adj Close'] * ticker_df['Volume']
    ticker_df['HasSameVolumeAsPrevDay'] = (ticker_df['Volume'].diff() == 0).astype(int)

    # 4. Re-create the final rolling window calculations
    print(f"Step 2: Calculating rolling metrics (window={window}, min_periods={min_periods})...")
    ticker_df['RollingStalePct'] = ticker_df['IsStale'].rolling(window=window, min_periods=min_periods).mean()
    ticker_df['RollingMedianVolume'] = ticker_df['DollarVolume'].rolling(window=window, min_periods=min_periods).median()
    ticker_df['RollingSameVolCount'] = ticker_df['HasSameVolumeAsPrevDay'].rolling(window=window, min_periods=min_periods).sum()

    # 5. Save the combined DataFrame to a CSV file
    if output_filename is None:
        output_filename = f"export_csv/verification_rolling_quality_metrics_{ticker_symbol}.csv"

    # to_csv does not create missing directories; without this the default
    # 'export_csv/' target fails on a fresh checkout. Only path-like targets are
    # touched so that a file-like buffer can still be passed through.
    if isinstance(output_filename, (str, Path)):
        Path(output_filename).parent.mkdir(parents=True, exist_ok=True)

    ticker_df.to_csv(output_filename)

    print(f"✅ Verification complete. All calculations saved to '{output_filename}'")

    return ticker_df # Optionally return the DataFrame for use in a notebook

In [26]:
# Write the step-by-step verification CSV for NVDA (default output filename).
ticker_df = verify_ticker_rolling_quality_metrics_calculations(
    df_ohlcv=df_OHLCV,
    ticker_symbol='NVDA',
    window=252,
    min_periods=126,
    output_filename=None,
)

--- Verifying calculations for ticker: NVDA ---
Found 1447 data points for NVDA.
Step 1: Calculating intermediate values (IsStale, DollarVolume, etc.)...
Step 2: Calculating rolling metrics (window=252, min_periods=126)...
✅ Verification complete. All calculations saved to 'export_csv/verification_rolling_quality_metrics_NVDA.csv'


### Step 1: Find Tickers with Triggered Flags

This function will do the heavy lifting of calculating the intermediate values across your entire `df_OHLCV` and then reporting which tickers are worth investigating.

In [27]:
def find_tickers_with_flags(df_ohlcv):
    """
    List the tickers that trigger each data-quality flag at least once.

    A row is 'stale' when Volume is 0 or Adj High equals Adj Low. A row has
    'same volume' when its Volume equals the previous row's Volume for the
    same ticker. The input frame is not modified.

    Args:
        df_ohlcv (pd.DataFrame): Raw OHLCV frame with a ('Ticker', 'Date')
            MultiIndex and 'Volume', 'Adj High', 'Adj Low' columns.

    Returns:
        dict: {'stale_tickers': [...], 'same_volume_tickers': [...]}, each a
        list of ticker symbols in sorted group order.
    """
    print("--- Scanning for tickers with data quality flags ---")

    # diff() compares consecutive rows, so rows must be in index order.
    if df_ohlcv.index.is_monotonic_increasing:
        frame = df_ohlcv
    else:
        frame = df_ohlcv.sort_index()

    # --- Row-level flags ---
    volume = frame['Volume']
    is_stale = (volume == 0) | (frame['Adj High'] == frame['Adj Low'])
    repeats_prev_volume = volume.groupby(level='Ticker').diff().eq(0)

    # --- Tickers with at least one flagged row ---
    print("Finding tickers with 'IsStale' flags...")
    any_stale = is_stale.groupby(level='Ticker').any()
    stale_tickers = any_stale[any_stale].index.tolist()
    print(f"✅ Found {len(stale_tickers)} tickers with at least one stale day.")

    print("\nFinding tickers with 'HasSameVolumeAsPrevDay' flags...")
    any_repeat = repeats_prev_volume.groupby(level='Ticker').any()
    same_volume_tickers = any_repeat[any_repeat].index.tolist()
    print(f"✅ Found {len(same_volume_tickers)} tickers with at least one day of repeated volume.")

    return {
        'stale_tickers': stale_tickers,
        'same_volume_tickers': same_volume_tickers
    }

In [28]:
# Step 1: Scan the full data set for tickers that trigger a quality flag.
flagged_tickers = find_tickers_with_flags(df_OHLCV)

# Step 2: Write a verification CSV for the first ticker of each flag type.

# --- 'IsStale' example ---
if not flagged_tickers['stale_tickers']:
    print("\nNo tickers with stale data were found to verify.")
else:
    ticker_to_check_stale = flagged_tickers['stale_tickers'][0]
    print(f"\n--- Now generating verification file for a STALE ticker: {ticker_to_check_stale} ---")
    verify_ticker_rolling_quality_metrics_calculations(
        df_ohlcv=df_OHLCV,
        ticker_symbol=ticker_to_check_stale,
    )


# --- 'HasSameVolumeAsPrevDay' example ---
if not flagged_tickers['same_volume_tickers']:
    print("\nNo tickers with repeated volume were found to verify.")
else:
    ticker_to_check_same_vol = flagged_tickers['same_volume_tickers'][0]
    print(f"\n--- Now generating verification file for a REPEATED VOLUME ticker: {ticker_to_check_same_vol} ---")
    verify_ticker_rolling_quality_metrics_calculations(
        df_ohlcv=df_OHLCV,
        ticker_symbol=ticker_to_check_same_vol,
    )

--- Scanning for tickers with data quality flags ---
Finding tickers with 'IsStale' flags...
✅ Found 25 tickers with at least one stale day.

Finding tickers with 'HasSameVolumeAsPrevDay' flags...
✅ Found 315 tickers with at least one day of repeated volume.

--- Now generating verification file for a STALE ticker: ALNY ---
--- Verifying calculations for ticker: ALNY ---
Found 1447 data points for ALNY.
Step 1: Calculating intermediate values (IsStale, DollarVolume, etc.)...
Step 2: Calculating rolling metrics (window=252, min_periods=126)...
✅ Verification complete. All calculations saved to 'export_csv/verification_rolling_quality_metrics_ALNY.csv'

--- Now generating verification file for a REPEATED VOLUME ticker: AAON ---
--- Verifying calculations for ticker: AAON ---
Found 1447 data points for AAON.
Step 1: Calculating intermediate values (IsStale, DollarVolume, etc.)...
Step 2: Calculating rolling metrics (window=252, min_periods=126)...
✅ Verification complete. All calculations

In [29]:
# First and last raw rows for one of the flagged tickers
_ticker = 'AAON'
display(
    df_OHLCV.loc[_ticker].head(),
    df_OHLCV.loc[_ticker].tail(),
)

Unnamed: 0_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,32.2274,32.2727,31.3802,32.2662,295942
2020-01-03,31.7876,32.609,31.4513,32.5637,283418
2020-01-06,32.208,32.5637,31.9234,32.5055,141941
2020-01-07,32.3891,32.5637,32.0593,32.4344,109780
2020-01-08,32.4344,32.7901,32.3115,32.3438,190800


Unnamed: 0_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-09-29,91.81,93.0,91.3,91.49,907400
2025-09-30,91.47,94.53,90.73,93.44,867700
2025-10-01,92.77,96.68,92.1,95.92,861700
2025-10-02,96.6,98.73,96.06,98.35,735700
2025-10-03,98.35,100.73,97.35,98.32,831700


In [33]:
# Eligible universe on the full data set for a single historical date
eligible_tickers = get_eligible_universe(
    quality_metrics_df=quality_df,
    filter_date='2021-06-01',
    thresholds=my_thresholds,
)

Dynamic Filter (2021-06-01): Kept 1229 of 1408 tickers.


In [34]:
# Header on its own line, followed by the list of eligible tickers
print('eligible_tickers:', eligible_tickers, sep='\n')

eligible_tickers:
['A', 'AA', 'AAL', 'AAPL', 'ABBV', 'ABEV', 'ABT', 'ACGL', 'ACI', 'ACM', 'ACN', 'ACWI', 'ACWX', 'ADBE', 'ADC', 'ADI', 'ADM', 'ADP', 'ADSK', 'ADT', 'AEE', 'AEIS', 'AEM', 'AEP', 'AER', 'AES', 'AFG', 'AFL', 'AGCO', 'AGG', 'AGI', 'AGNC', 'AIG', 'AIT', 'AIZ', 'AJG', 'AKAM', 'AL', 'ALB', 'ALC', 'ALGN', 'ALK', 'ALL', 'ALLE', 'ALLY', 'ALNY', 'ALSN', 'ALV', 'AM', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMG', 'AMGN', 'AMH', 'AMKR', 'AMLP', 'AMP', 'AMT', 'AMX', 'AMZN', 'AN', 'ANET', 'AON', 'AOS', 'APA', 'APD', 'APG', 'APH', 'APO', 'APPF', 'APTV', 'AR', 'ARCC', 'ARE', 'ARES', 'ARGX', 'ARKK', 'ARMK', 'ASML', 'ASND', 'ATI', 'ATO', 'AU', 'AVAV', 'AVB', 'AVGO', 'AVTR', 'AVY', 'AWI', 'AWK', 'AXON', 'AXP', 'AXS', 'AXTA', 'AYI', 'AZN', 'AZO', 'B', 'BA', 'BABA', 'BAC', 'BAH', 'BALL', 'BAP', 'BAX', 'BBD', 'BBEU', 'BBIO', 'BBJP', 'BBVA', 'BBY', 'BCE', 'BCS', 'BDX', 'BE', 'BEKE', 'BEN', 'BEP', 'BF-B', 'BG', 'BHP', 'BIDU', 'BIIB', 'BIL', 'BILI', 'BIO', 'BIP', 'BIV', 'BJ', 'BK', 'BKLN', 'BKNG', 'BKR', 

In [32]:
# Rich display of the full-data metrics frame (the rendered output is truncated).
quality_df

Unnamed: 0_level_0,Unnamed: 1_level_0,RollingStalePct,RollingMedianVolume,RollingSameVolCount
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2020-01-02,,,
A,2020-01-03,,,
A,2020-01-06,,,
A,2020-01-07,,,
A,2020-01-08,,,
...,...,...,...,...
ZWS,2025-09-29,0.0,3.305977e+07,0.0
ZWS,2025-09-30,0.0,3.299628e+07,0.0
ZWS,2025-10-01,0.0,3.283692e+07,0.0
ZWS,2025-10-02,0.0,3.283692e+07,0.0
