In [1]:
import sys
from pathlib import Path


def add_project_root_to_path():
    """Locate the ``notebooks_RLVR`` directory and make it importable.

    Walks from the current working directory up through each parent. At every
    level the directory itself is checked first (is it named
    ``notebooks_RLVR``?), then a direct child directory of that name, so the
    notebook can be started from inside the project or from the folder that
    contains it.

    The directory is inserted at the front of ``sys.path`` only when it is not
    already listed there, so re-running the cell does not accumulate
    duplicate entries.

    Returns:
        pathlib.Path: The ``notebooks_RLVR`` directory that was found.

    Raises:
        RuntimeError: If no such directory exists at or above the current
            working directory.
    """
    target_name = "notebooks_RLVR"
    current = Path.cwd()

    # Search upward for notebooks_RLVR folder
    for path in (current, *current.parents):
        if path.name == target_name:
            root = path
        elif (path / target_name).is_dir():
            # Child lookup (for running from stocks/). is_dir() rather than
            # exists() so a stray *file* with that name is not put on sys.path.
            root = path / target_name
        else:
            continue

        root_str = str(root)
        if root_str not in sys.path:
            sys.path.insert(0, root_str)
            print(f"✓ Added to path: {root}")
        else:
            print(f"✓ Already on path: {root}")
        return root

    raise RuntimeError("Could not find notebooks_RLVR directory")


# Run once at notebook start, before the import cell below — presumably so the
# `core` package imported there resolves from the directory added to sys.path.
add_project_root_to_path()

✓ Added to path: c:\Users\ping\Files_win10\python\py311\stocks\notebooks_RLVR


WindowsPath('c:/Users/ping/Files_win10/python/py311/stocks/notebooks_RLVR')

In [2]:
import os
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from IPython.display import display
from typing import List, Union, Tuple
from core.settings import GLOBAL_SETTINGS
from core.paths import OUTPUT_DIR

# Notebook-wide pandas display settings, applied from one table.
# (Set "display.max_rows" to None instead to display all rows.)
_display_options = {
    "display.max_rows": 100,
    "display.max_columns": None,
    "display.width": 1000,
    "display.max_colwidth": 50,
    "display.precision": 4,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

NOTEBOOKS_RLVR_ROOT: C:\Users\ping\Files_win10\python\py311\stocks\notebooks_RLVR

OUTPUT_DIR: C:\Users\ping\Files_win10\python\py311\stocks\notebooks_RLVR\output



In [3]:
def load_env_and_get_path() -> Tuple[str, str]:
    """Load the project's .env file and return the configured data paths.

    Walks from the current working directory (not this notebook's file
    location) up through its parents and loads the first
    ``.env/my_api_key.env`` it finds, so the notebook works from any
    subdirectory. If no such file exists a notice is printed and the lookup
    falls back to whatever is already set in the process environment.

    Returns:
        Tuple[str, str]: The values of ``DATA_PATH_OHLCV`` and
        ``DATA_PATH_INDICES``, in that order.

    Raises:
        ValueError: If either variable is unset or empty after loading.
    """
    start_dir = Path.cwd()

    # Search upward for the .env folder
    for folder in (start_dir, *start_dir.parents):
        env_file = folder / ".env" / "my_api_key.env"
        if env_file.exists():
            load_dotenv(env_file)
            print(f"✓ Loaded .env from: {env_file}")
            break
    else:
        # Make the miss visible: otherwise the errors below read as if a
        # .env file had been found but lacked the key.
        print(
            f"⚠ No .env/my_api_key.env found at or above {start_dir}; "
            "using existing environment variables"
        )

    data_ohlcv_path = os.getenv("DATA_PATH_OHLCV")
    if not data_ohlcv_path:
        raise ValueError("DATA_PATH_OHLCV not found in .env file")

    data_indices_path = os.getenv("DATA_PATH_INDICES")
    if not data_indices_path:
        raise ValueError("DATA_PATH_INDICES not found in .env file")

    return data_ohlcv_path, data_indices_path


def generate_features(
    df_ohlcv: pd.DataFrame,
    df_indices: Union[pd.DataFrame, None] = None,
    benchmark_ticker: str = GLOBAL_SETTINGS["benchmark_ticker"],
    atr_period: int = GLOBAL_SETTINGS["atr_period"],
    rsi_period: int = GLOBAL_SETTINGS["rsi_period"],
    win_5d: int = GLOBAL_SETTINGS["5d_window"],
    win_21d: int = GLOBAL_SETTINGS["21d_window"],
    win_63d: int = GLOBAL_SETTINGS["63d_window"],
    feature_zscore_clip: float = GLOBAL_SETTINGS["feature_zscore_clip"],
    quality_window: int = GLOBAL_SETTINGS["quality_window"],
    quality_min_periods: int = GLOBAL_SETTINGS["quality_min_periods"],
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build per-ticker features and market-wide macro features from OHLCV data.

    Parameters
    ----------
    df_ohlcv : pd.DataFrame
        Price data with index levels named "Ticker" and "Date". The columns
        "Adj High", "Adj Low", "Adj Close" and "Volume" are read.
    df_indices : pd.DataFrame, optional
        Index data whose first index level holds the index symbol and which
        has an "Adj Close" column. Only "^VIX" and "^VIX3M" are used; when
        absent the VIX features keep their neutral defaults (0.0 and 1.0).
    benchmark_ticker : str
        Ticker in ``df_ohlcv`` treated as the market. If it is not present,
        "Mkt_Ret" and "Macro_Trend" are all zero.
    atr_period, rsi_period : int
        Smoothing periods; used as ``alpha = 1 / period`` in the EWM means.
    win_5d : int
        Rolling window for "Consistency".
    win_21d : int
        Window for "Mom_21", "DD_21" and the lag of "Macro_Trend_Vel".
    win_63d : int
        Window for "IR_63", "Beta_63", the "Macro_Trend_Vel_Z" scale and the
        VIX z-score.
    feature_zscore_clip : float
        Symmetric clip bound applied to the z-score style macro features.
    quality_window, quality_min_periods : int
        Rolling window and minimum observations for the quality columns.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``(features, macro_df)``. ``features`` is indexed like the sorted
        ``df_ohlcv`` and holds ATR, ATRP, TRP, RSI, Mom_21, Consistency,
        IR_63, Beta_63, DD_21, Ret_1d, RollingStalePct, RollMedDollarVol and
        RollingSameVolCount. ``macro_df`` is indexed by the unique dates and
        holds Mkt_Ret, Macro_Trend, Macro_Trend_Vel, Macro_Trend_Vel_Z,
        Macro_Trend_Mom, Macro_Vix_Z and Macro_Vix_Ratio, with NaN filled
        by 0.0.
    """

    print(f"⚡ Generating Decoupled Features (Benchmark: {benchmark_ticker})...")

    # --- 0. PREP ---
    # Sorting matters: the quality block at the end relies on row order.
    df_ohlcv = df_ohlcv.sort_index(level=["Ticker", "Date"])
    all_dates = df_ohlcv.index.get_level_values("Date").unique().sort_values()

    # --- 1. MACRO ENGINE ---
    macro_df = pd.DataFrame(index=all_dates)
    if benchmark_ticker in df_ohlcv.index.get_level_values("Ticker"):
        mkt_close = (
            df_ohlcv.xs(benchmark_ticker, level="Ticker")["Adj Close"]
            .reindex(all_dates)
            .ffill()
        )
        macro_df["Mkt_Ret"] = mkt_close.pct_change().fillna(0.0)
        # Relative distance of the benchmark from its 200-row moving average
        macro_df["Macro_Trend"] = (mkt_close / mkt_close.rolling(200).mean()) - 1.0
    else:
        macro_df["Mkt_Ret"] = 0.0
        macro_df["Macro_Trend"] = 0.0

    # --- TREND VELOCITY & MOMENTUM ---
    macro_df["Macro_Trend_Vel"] = macro_df["Macro_Trend"].diff(win_21d)
    macro_df["Macro_Trend_Vel_Z"] = (
        macro_df["Macro_Trend_Vel"] / macro_df["Macro_Trend"].rolling(win_63d).std()
    ).clip(-feature_zscore_clip, feature_zscore_clip)
    # Velocity signed by the trend direction: positive when the move extends
    # the current trend, negative when it works against it.
    macro_df["Macro_Trend_Mom"] = (
        np.sign(macro_df["Macro_Trend"])
        * np.sign(macro_df["Macro_Trend_Vel"])
        * np.abs(macro_df["Macro_Trend_Vel"])
    ).fillna(0)

    # VIX extraction; neutral defaults apply when the indices are unavailable
    macro_df["Macro_Vix_Z"] = 0.0
    macro_df["Macro_Vix_Ratio"] = 1.0
    if df_indices is not None:
        idx_names = df_indices.index.get_level_values(0).unique()
        if "^VIX" in idx_names:
            v = df_indices.xs("^VIX", level=0)["Adj Close"].reindex(all_dates).ffill()
            # Uses win_63d (previously a literal 63) so this z-score follows
            # the same configurable window as the other 63-day statistics.
            macro_df["Macro_Vix_Z"] = (
                (v - v.rolling(win_63d).mean()) / v.rolling(win_63d).std()
            ).clip(-feature_zscore_clip, feature_zscore_clip)
        if "^VIX" in idx_names and "^VIX3M" in idx_names:
            v3 = (
                df_indices.xs("^VIX3M", level=0)["Adj Close"].reindex(all_dates).ffill()
            )
            macro_df["Macro_Vix_Ratio"] = (v / v3).fillna(1.0)
    macro_df.fillna(0.0, inplace=True)

    # --- 2. TICKER ENGINE ---
    grouped = df_ohlcv.groupby(level="Ticker")
    rets = grouped["Adj Close"].pct_change()
    mkt_ret_series = macro_df["Mkt_Ret"]  # The "Master" market vector

    # A. Hybrid Metrics (Beta & IR)
    # 1. IR_63: rolling mean / rolling std of the return in excess of the market
    active_ret = rets.sub(mkt_ret_series, axis=0, level="Date")
    roll_active = active_ret.groupby(level="Ticker").rolling(win_63d)
    ir_63 = (
        (roll_active.mean() / roll_active.std())
        .reset_index(level=0, drop=True)
        .fillna(0)
    )

    # 2. Beta_63: rolling covariance with the market over the market variance
    #    (variance is computed once and shared across tickers)
    mkt_var = mkt_ret_series.rolling(win_63d).var()

    def calc_rolling_beta(ticker_rets):
        # Restrict the market series to this ticker's own dates
        dates = ticker_rets.index.get_level_values("Date")
        m = mkt_ret_series.reindex(dates)
        return ticker_rets.rolling(win_63d).cov(m) / mkt_var.reindex(dates)

    beta_63 = (
        rets.groupby(level="Ticker", group_keys=False)
        .apply(calc_rolling_beta)
        .fillna(1.0)
    )

    # B. Volatility (ATR / TRP)
    prev_close = grouped["Adj Close"].shift(1)

    # Vectorized True Range without pd.concat memory overhead
    high_low = df_ohlcv["Adj High"] - df_ohlcv["Adj Low"]
    high_close = (df_ohlcv["Adj High"] - prev_close).abs()
    low_close = (df_ohlcv["Adj Low"] - prev_close).abs()

    # Nested np.maximum avoids creating a 3-column DataFrame. np.maximum
    # propagates NaN, so TR is NaN on each ticker's first row (no prev_close).
    tr = np.maximum(np.maximum(high_low, high_close), low_close)

    atr = (
        tr.groupby(level="Ticker")
        .ewm(alpha=1 / atr_period, adjust=False)
        .mean()
        .reset_index(level=0, drop=True)
    )
    natr = (atr / df_ohlcv["Adj Close"]).fillna(0)
    trp = (tr / df_ohlcv["Adj Close"]).fillna(0)

    # C. Momentum & Consistency
    mom_21 = grouped["Adj Close"].pct_change(win_21d)
    # Share of up days within the last win_5d rows
    consistency = (
        (rets > 0)
        .astype(float)
        .groupby(level="Ticker")
        .rolling(win_5d)
        .mean()
        .reset_index(level=0, drop=True)
    )
    # Drawdown from the rolling win_21d high
    dd_21 = (
        df_ohlcv["Adj Close"]
        / grouped["Adj Close"].rolling(win_21d).max().reset_index(level=0, drop=True)
    ) - 1.0

    # D. RSI (Wilder-style smoothing via EWM with alpha = 1 / rsi_period)
    delta = grouped["Adj Close"].diff()
    up, down = delta.clip(lower=0), -1 * delta.clip(upper=0)
    ma_up = (
        up.groupby(level="Ticker")
        .ewm(alpha=1 / rsi_period, adjust=False)
        .mean()
        .reset_index(level=0, drop=True)
    )
    ma_down = (
        down.groupby(level="Ticker")
        .ewm(alpha=1 / rsi_period, adjust=False)
        .mean()
        .reset_index(level=0, drop=True)
    )
    # The zero denominator is kept on purpose:
    # - No down moves (ma_down == 0, ma_up > 0): rs is inf, and
    #   100 - 100 / (1 + inf) already evaluates to 100.
    # - No movement at all (0 / 0), or the first row of a ticker where diff()
    #   is NaN: rs is NaN and RSI falls back to the neutral value 50.
    # The replace() is a defensive guard for any infinity left in raw_rsi.
    rs = ma_up / ma_down  # Keep zero denominator → inf
    raw_rsi = 100 - (100 / (1 + rs))
    rsi = raw_rsi.replace({np.inf: 100, -np.inf: 0}).fillna(50)

    # E. Assemble Features
    features_df = pd.DataFrame(
        {
            "ATR": atr,
            "ATRP": natr,
            "TRP": trp,
            "RSI": rsi,
            "Mom_21": mom_21,
            "Consistency": consistency,
            "IR_63": ir_63,
            "Beta_63": beta_63,
            "DD_21": dd_21.fillna(0),
            "Ret_1d": rets,
        }
    )

    # F. Quality (Universe Filtering)
    quality_temp = pd.DataFrame(
        {
            # Stale bar: no volume traded, or high equals low
            "IsStale": np.where(
                (df_ohlcv["Volume"] == 0)
                | (df_ohlcv["Adj High"] == df_ohlcv["Adj Low"]),
                1,
                0,
            ),
            "DollarVolume": df_ohlcv["Adj Close"] * df_ohlcv["Volume"],
            # Volume identical to the previous row of the same ticker
            "HasSameVolume": (grouped["Volume"].diff() == 0).astype(int),
        },
        index=df_ohlcv.index,
    )

    # Rolling stats are computed per column (avoids slow dict agg). `.values`
    # skips index alignment: this relies on the grouped-rolling output being in
    # the same (Ticker, Date) order as the frame sorted in step 0.
    grp = quality_temp.groupby(level="Ticker")
    rolling_quality = pd.DataFrame(
        {
            "RollingStalePct": grp["IsStale"]
            .rolling(window=quality_window, min_periods=quality_min_periods)
            .mean()
            .values,
            "RollMedDollarVol": grp["DollarVolume"]
            .rolling(window=quality_window, min_periods=quality_min_periods)
            .median()
            .values,
            "RollingSameVolCount": grp["HasSameVolume"]
            .rolling(window=quality_window, min_periods=quality_min_periods)
            .sum()
            .values,
        },
        index=quality_temp.index,
    )

    return pd.concat([features_df, rolling_quality], axis=1).sort_index(), macro_df


def create_combined_dict(
    df_ohlcv: pd.DataFrame,
    features_df: pd.DataFrame,
    tickers: Union[str, List[str]],
    date_start: str,
    date_end: str,
    verbose: bool = True,
) -> dict:
    """
    Create a combined dictionary with both OHLCV and features data for each ticker.

    Parameters:
    -----------
    df_ohlcv : pd.DataFrame
        DataFrame with OHLCV data (MultiIndex: ticker, date)
    features_df : pd.DataFrame
        DataFrame with features data (MultiIndex: ticker, date)
    tickers : str or list of str
        Ticker symbol(s) to retrieve
    date_start : str
        Start date in 'YYYY-MM-DD' format. Forwarded unchanged to
        get_ticker_OHLCV / get_ticker_features.
    date_end : str
        End date in 'YYYY-MM-DD' format. Forwarded unchanged as well.
    verbose : bool, optional
        Whether to print progress information (default: True)

    Returns:
    --------
    dict
        Dictionary with tickers as keys and combined DataFrames (OHLCV + features)
        as values. A ticker that is missing from either source, or whose
        data is empty in the range, maps to an empty DataFrame.
    """
    # Convert single ticker to list
    if isinstance(tickers, str):
        tickers = [tickers]

    if verbose:
        print(f"Creating combined dictionary for {len(tickers)} ticker(s)")
        print(f"Date range: {date_start} to {date_end}")
        print("=" * 60)

    # Get OHLCV data as dictionary
    ohlcv_dict = get_ticker_OHLCV(
        df_ohlcv, tickers, date_start, date_end, return_format="dict", verbose=verbose
    )

    # Get features data as dictionary
    features_dict = get_ticker_features(
        features_df,
        tickers,
        date_start,
        date_end,
        return_format="dict",
        verbose=verbose,
    )

    # A getter may hand back a bare (empty) DataFrame instead of a dict when
    # nothing matched. `ticker in <DataFrame>` tests COLUMN names, so a ticker
    # such as "ATR" or "RSI" would be mistaken for a dictionary key. Treat any
    # non-dict result as "no tickers found".
    if not isinstance(ohlcv_dict, dict):
        ohlcv_dict = {}
    if not isinstance(features_dict, dict):
        features_dict = {}

    combined_dict = {}

    for ticker in tickers:
        if verbose:
            print(f"\nProcessing {ticker}...")

        in_ohlcv = ticker in ohlcv_dict
        in_features = ticker in features_dict

        # Guard 1: the ticker must be present in both sources
        if not (in_ohlcv and in_features):
            if verbose:
                print("  ✗ Ticker not found in both dictionaries")
                if not in_ohlcv:
                    print("    Not in OHLCV data")
                if not in_features:
                    print("    Not in features data")
            combined_dict[ticker] = pd.DataFrame()
            continue

        ohlcv_data = ohlcv_dict[ticker]
        features_data = features_dict[ticker]

        # Guard 2: both frames must actually contain rows
        if ohlcv_data.empty or features_data.empty:
            if verbose:
                print("  ✗ Cannot combine: One or both dataframes are empty")
                print(f"    OHLCV empty: {ohlcv_data.empty}")
                print(f"    Features empty: {features_data.empty}")
            combined_dict[ticker] = pd.DataFrame()
            continue

        # Both frames are indexed by date, so a column-wise concat aligns them
        combined_df = pd.concat([ohlcv_data, features_data], axis=1)

        # Ensure proper index naming
        combined_df.index.name = "Date"
        combined_dict[ticker] = combined_df

        if verbose:
            print("  ✓ Successfully combined data")
            print(f"  OHLCV shape: {ohlcv_data.shape}")
            print(f"  Features shape: {features_data.shape}")
            print(f"  Combined shape: {combined_df.shape}")
            print(
                f"  Date range: {combined_df.index.min()} to {combined_df.index.max()}"
            )

    # Print summary
    if verbose:
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total tickers processed: {len(tickers)}")

        tickers_with_data = [
            ticker for ticker, df in combined_dict.items() if not df.empty
        ]
        print(f"Tickers with combined data: {len(tickers_with_data)}")

        if tickers_with_data:
            print("\nTicker details:")
            for ticker in tickers_with_data:
                df = combined_dict[ticker]
                print(f"  {ticker}: {df.shape} - {df.index.min()} to {df.index.max()}")
                print(f"    Columns: {len(df.columns)}")

        empty_tickers = [ticker for ticker, df in combined_dict.items() if df.empty]
        if empty_tickers:
            print(f"\nTickers with no data: {', '.join(empty_tickers)}")

    return combined_dict


def get_ticker_OHLCV(
    df_ohlcv: pd.DataFrame,
    tickers: Union[str, List[str]],
    date_start: str,
    date_end: str,
    return_format: str = "dataframe",
    verbose: bool = True,
) -> Union[pd.DataFrame, dict, list]:
    """
    Get OHLCV data for specified tickers within a date range.

    Parameters
    ----------
    df_ohlcv : pd.DataFrame
        DataFrame with MultiIndex of (ticker, date) and OHLCV columns
    tickers : str or list of str
        Ticker symbol(s) to retrieve
    date_start : str
        Start date in 'YYYY-MM-DD' format (used as the lower label-slice bound)
    date_end : str
        End date in 'YYYY-MM-DD' format (used as the upper label-slice bound)
    return_format : str, optional
        Format to return data in. Options:
        - 'dataframe': Single DataFrame with MultiIndex (default)
        - 'dict': Dictionary with tickers as keys and DataFrames as values
        - 'separate': List of separate DataFrames for each ticker
    verbose : bool, optional
        Whether to print summary information (default: True)

    Returns
    -------
    Union[pd.DataFrame, dict, list]
        Filtered OHLCV data in specified format. When no rows fall in the
        range the result still has the requested shape: an empty DataFrame,
        a dict mapping every ticker to an empty DataFrame, or a list with one
        empty DataFrame per ticker.

    Raises
    ------
    TypeError
        If ``df_ohlcv`` is not a DataFrame or ``tickers`` is neither a string
        nor a list
    ValueError
        If input parameters are invalid (index layout, dates, return_format)
        or the filtering itself fails
    KeyError
        If tickers not found in DataFrame

    Examples
    --------
    >>> # Get data for single ticker
    >>> vlo_data = get_ticker_OHLCV(df_ohlcv, 'VLO', '2025-08-13', '2025-09-04')

    >>> # Get data for multiple tickers
    >>> multi_data = get_ticker_OHLCV(df_ohlcv, ['VLO', 'JPST'], '2025-08-13', '2025-09-04')

    >>> # Get data as dictionary
    >>> data_dict = get_ticker_OHLCV(df_ohlcv, ['VLO', 'JPST'], '2025-08-13',
    ...                              '2025-09-04', return_format='dict')
    """

    # Input validation
    if not isinstance(df_ohlcv, pd.DataFrame):
        raise TypeError("df_ohlcv must be a pandas DataFrame")

    if not isinstance(df_ohlcv.index, pd.MultiIndex):
        raise ValueError("DataFrame must have MultiIndex of (ticker, date)")

    if df_ohlcv.index.nlevels != 2:
        raise ValueError("MultiIndex must have exactly 2 levels: (ticker, date)")

    # Convert single ticker to list for consistent processing
    if isinstance(tickers, str):
        tickers = [tickers]
    elif not isinstance(tickers, list):
        raise TypeError("tickers must be a string or list of strings")

    # Convert dates to Timestamps (validation only; the original values are
    # what gets used for slicing below)
    try:
        start_date = pd.Timestamp(date_start)
        end_date = pd.Timestamp(date_end)
    except ValueError as e:
        raise ValueError(f"Invalid date format. Use 'YYYY-MM-DD': {e}") from e

    if start_date > end_date:
        raise ValueError("date_start must be before or equal to date_end")

    # Check if tickers exist in the DataFrame
    available_tickers = df_ohlcv.index.get_level_values(0).unique()
    missing_tickers = [t for t in tickers if t not in available_tickers]

    if missing_tickers:
        raise KeyError(f"Ticker(s) not found in DataFrame: {missing_tickers}")

    # Reject an unknown format before doing any work, so the error is raised
    # even when the date range turns out to be empty.
    if return_format not in ("dataframe", "dict", "separate"):
        raise ValueError(
            f"Invalid return_format: {return_format}. "
            f"Must be 'dataframe', 'dict', or 'separate'"
        )

    # Filter the data using MultiIndex slicing
    try:
        filtered_data = df_ohlcv.loc[(tickers, slice(date_start, date_end)), :]
    except Exception as e:
        raise ValueError(f"Error filtering data: {e}") from e

    # Handle empty results, honouring the requested return format
    if filtered_data.empty:
        if verbose:
            print(
                f"No data found for tickers {tickers} in date range {date_start} to {date_end}"
            )
        if return_format == "dict":
            return {ticker: pd.DataFrame() for ticker in tickers}
        if return_format == "separate":
            return [pd.DataFrame() for _ in tickers]
        return filtered_data

    # Print summary if verbose
    if verbose:
        print(
            f"Data retrieved for {len(tickers)} ticker(s) from {date_start} to {date_end}"
        )
        print(f"Total rows: {len(filtered_data)}")
        print(
            f"Date range in data: {filtered_data.index.get_level_values(1).min()} to "
            f"{filtered_data.index.get_level_values(1).max()}"
        )

        # Print ticker-specific counts
        ticker_counts = filtered_data.index.get_level_values(0).value_counts()
        for ticker in tickers:
            count = ticker_counts.get(ticker, 0)
            if count > 0:
                print(f"  {ticker}: {count} rows")
            else:
                print(f"  {ticker}: No data in range")

    def _rows_for(ticker):
        """Date-indexed rows of one ticker; empty frame if it has none in range."""
        try:
            return filtered_data.xs(ticker, level=0).loc[date_start:date_end]
        except KeyError:
            return pd.DataFrame()

    # Return in requested format
    if return_format == "dict":
        return {ticker: _rows_for(ticker) for ticker in tickers}

    if return_format == "separate":
        return [_rows_for(ticker) for ticker in tickers]

    return filtered_data


def get_ticker_features(
    features_df: pd.DataFrame,
    tickers: Union[str, List[str]],
    date_start: str,
    date_end: str,
    return_format: str = "dataframe",
    verbose: bool = True,
) -> Union[pd.DataFrame, dict, list]:
    """
    Get features data for specified tickers within a date range.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame with MultiIndex of (ticker, date) and feature columns
    tickers : str or list of str
        Ticker symbol(s) to retrieve
    date_start : str
        Start date in 'YYYY-MM-DD' format (used as the lower label-slice bound)
    date_end : str
        End date in 'YYYY-MM-DD' format (used as the upper label-slice bound)
    return_format : str, optional
        Format to return data in. Options:
        - 'dataframe': Single DataFrame with MultiIndex (default)
        - 'dict': Dictionary with tickers as keys and DataFrames as values
        - 'separate': List of separate DataFrames for each ticker
    verbose : bool, optional
        Whether to print summary information (default: True)

    Returns
    -------
    Union[pd.DataFrame, dict, list]
        Filtered features data in specified format.

        - If the filtering step itself fails, the error is reported (when
          verbose) rather than raised, and an empty container of the
          requested type is returned: ``pd.DataFrame()``, ``{}`` or ``[]``.
        - If no rows fall in the range, the result has the requested shape
          with an empty DataFrame per ticker (or the empty filtered frame
          for 'dataframe').

    Raises
    ------
    ValueError
        If ``return_format`` is not one of the three supported values
    """
    # Reject an unknown format up front so it is reported on every path,
    # including the empty and error paths below.
    if return_format not in ("dataframe", "dict", "separate"):
        raise ValueError(
            f"Invalid return_format: {return_format}. "
            f"Must be 'dataframe', 'dict', or 'separate'"
        )

    # Convert single ticker to list for consistent processing
    if isinstance(tickers, str):
        tickers = [tickers]

    # Filter the data using MultiIndex slicing. Failures are deliberately
    # best-effort: report and return an empty result instead of raising.
    try:
        filtered_data = features_df.loc[(tickers, slice(date_start, date_end)), :]
    except Exception as e:
        if verbose:
            print(f"Error filtering data: {e}")
        if return_format == "dataframe":
            return pd.DataFrame()
        if return_format == "separate":
            return []
        return {}

    # Handle empty results, honouring the requested return format
    if filtered_data.empty:
        if verbose:
            print(
                f"No data found for tickers {tickers} in date range {date_start} to {date_end}"
            )
        if return_format == "dict":
            return {ticker: pd.DataFrame() for ticker in tickers}
        if return_format == "separate":
            return [pd.DataFrame() for _ in tickers]
        return filtered_data

    # Print summary if verbose
    if verbose:
        print(
            f"Features data retrieved for {len(tickers)} ticker(s) from {date_start} to {date_end}"
        )
        print(f"Total rows: {len(filtered_data)}")
        print(
            f"Date range in data: {filtered_data.index.get_level_values(1).min()} to "
            f"{filtered_data.index.get_level_values(1).max()}"
        )
        print(f"Available features: {', '.join(filtered_data.columns.tolist())}")

        # Print ticker-specific counts
        ticker_counts = filtered_data.index.get_level_values(0).value_counts()
        for ticker in tickers:
            count = ticker_counts.get(ticker, 0)
            if count > 0:
                print(f"  {ticker}: {count} rows")
            else:
                print(f"  {ticker}: No data in range")

    def _rows_for(ticker):
        """Date-indexed rows of one ticker; empty frame if it has none in range."""
        try:
            return filtered_data.xs(ticker, level=0).loc[date_start:date_end]
        except KeyError:
            return pd.DataFrame()

    # Return in requested format
    if return_format == "dict":
        return {ticker: _rows_for(ticker) for ticker in tickers}

    if return_format == "separate":
        return [_rows_for(ticker) for ticker in tickers]

    return filtered_data


#

In [4]:
# Load the .env file (if one is found) and fetch the two data file locations
# read by the next cells: OHLCV first, indices second.
data_ohlcv_path, data_indices_path = load_env_and_get_path()

✓ Loaded .env from: c:\Users\ping\Files_win10\python\py311\.env\my_api_key.env


In [5]:
# Read the OHLCV history from the configured parquet file and print a preview.
df_ohlcv = pd.read_parquet(data_ohlcv_path, engine="pyarrow")
print(f"df_ohlcv:\n{df_ohlcv}")

df_ohlcv:
                   Adj Open  Adj High  Adj Low  Adj Close    Volume
Ticker Date                                                        
A      1999-11-18   27.1966   29.8864  23.9091    26.3000  74849954
       1999-11-19   25.6649   25.7023  23.7970    24.1333  18230876
       1999-11-22   24.6936   26.3000  23.9465    26.3000   7871810
       1999-11-23   25.4034   26.0759  23.9091    23.9091   7151080
       1999-11-24   23.9838   25.0672  23.9091    24.5442   5795947
...                     ...       ...      ...        ...       ...
ZWS    2026-02-12   52.5800   53.1700  51.1700    51.3800    956200
       2026-02-13   51.2400   51.5700  50.7200    51.3000    768600
       2026-02-17   51.3800   51.4900  50.5600    51.1200    793500
       2026-02-18   50.7700   51.7800  50.6600    51.2100    667200
       2026-02-19   50.8400   51.1550  50.5000    50.9100    379762

[9503519 rows x 5 columns]


In [6]:
# Read the index history from the configured parquet file and print a preview.
# generate_features looks up "^VIX" and "^VIX3M" in this frame.
df_indices = pd.read_parquet(data_indices_path, engine="pyarrow")
print(f"df_indices:\n{df_indices}")

df_indices:
                   Adj Open  Adj High  Adj Low  Adj Close  Volume
Ticker Date                                                      
^AXJO  1992-11-22   1455.00   1455.00  1455.00    1455.00       0
       1992-11-23   1458.40   1458.40  1458.40    1458.40       0
       1992-11-24   1467.90   1467.90  1467.90    1467.90       0
       1992-11-25   1459.00   1459.00  1459.00    1459.00       0
       1992-11-26   1458.90   1458.90  1458.90    1458.90       0
...                     ...       ...      ...        ...     ...
^VIX3M 2026-02-12     20.19     22.35    20.03      22.17       0
       2026-02-13     22.24     23.01    21.06      22.17       0
       2026-02-17     22.52     23.18    21.33      21.60       0
       2026-02-18     21.55     21.72    20.73      21.39       0
       2026-02-19     21.86     22.37    21.71      21.88       0

[144200 rows x 5 columns]


In [7]:
# Heads-up for the reader: this is the slow cell of the notebook.
print(f"Takes about 2.5 minutes to generate_features")

# benchmark_ticker is passed explicitly; every other parameter keeps the
# default taken from GLOBAL_SETTINGS.
features_df, macro_df = generate_features(
    df_ohlcv=df_ohlcv,
    df_indices=df_indices,
    benchmark_ticker="SPY",
)

Takes about 2.5 minutes to generate_features
⚡ Generating Decoupled Features (Benchmark: SPY)...


In [8]:
# Keep rows where BOTH the rolling same-volume count and the rolling stale
# share are strictly positive (NaN compares False, so NaN rows drop out too).
nonzero_samevol = features_df[
    (features_df["RollingSameVolCount"] > 0) & (features_df["RollingStalePct"] > 0)
]

# Get unique tickers
tickers_with_samevol = nonzero_samevol.index.get_level_values(0).unique()
print(f"date_ranges with same volume: {len(tickers_with_samevol)}")
print(tickers_with_samevol.tolist())

# Get date range per ticker: a (first, last) tuple of the dates that matched
date_ranges = nonzero_samevol.groupby(level=0).apply(
    lambda x: (x.index.get_level_values(1).min(), x.index.get_level_values(1).max())
)

# Show only the first 100 tickers; display.max_rows is 100 in this notebook.
display(date_ranges.head(100))

date_ranges with same volume: 668
['AA', 'AAON', 'ABEV', 'ACGL', 'ACWX', 'ADC', 'ADI', 'ADSK', 'AEG', 'AEIS', 'AEM', 'AEP', 'AFG', 'AFL', 'AG', 'AGCO', 'AGI', 'AIG', 'AIRR', 'AIT', 'AJG', 'ALLY', 'AMAT', 'AMCR', 'AME', 'AMG', 'AMGN', 'AN', 'AOS', 'APA', 'APH', 'ARE', 'ARGX', 'ARKK', 'ARWR', 'ASR', 'ASTS', 'ASX', 'ATO', 'ATR', 'AVB', 'AVDE', 'AVDV', 'AVY', 'AXP', 'AZN', 'BAC', 'BALL', 'BAP', 'BBAX', 'BBEU', 'BBIN', 'BBJP', 'BBVA', 'BBY', 'BCE', 'BCH', 'BCS', 'BDX', 'BEN', 'BEP', 'BF-A', 'BF-B', 'BHP', 'BIIB', 'BIL', 'BIO', 'BK', 'BKLC', 'BLD', 'BMO', 'BN', 'BNS', 'BNT', 'BOKF', 'BOXX', 'BP', 'BPOP', 'BRK-A', 'BRO', 'BSAC', 'BSCQ', 'BSCR', 'BTI', 'BVN', 'BWA', 'BYD', 'C', 'CACI', 'CADE', 'CAE', 'CAG', 'CAH', 'CASY', 'CB', 'CBSH', 'CCJ', 'CCK', 'CDE', 'CDNS', 'CDP', 'CEF', 'CELH', 'CFR', 'CHD', 'CHDN', 'CHT', 'CHTR', 'CIB', 'CIGI', 'CINF', 'CLF', 'CLH', 'CM', 'CMA', 'CMC', 'CMCSA', 'CMI', 'CMS', 'CNA', 'CNP', 'COHR', 'COKE', 'COLB', 'COO', 'CORT', 'COWZ', 'CP', 'CPB', 'CPRT', 'CRH', 'CRK'

Ticker
AA       (1962-06-29 00:00:00, 1967-12-05 00:00:00)
AAON     (1993-06-16 00:00:00, 2002-03-01 00:00:00)
ABEV     (1997-09-02 00:00:00, 2010-12-30 00:00:00)
ACGL     (1996-07-26 00:00:00, 2003-02-11 00:00:00)
ACWX     (2008-09-26 00:00:00, 2009-06-18 00:00:00)
ADC      (1994-10-13 00:00:00, 2002-02-20 00:00:00)
ADI      (1981-05-05 00:00:00, 1991-10-04 00:00:00)
ADSK     (1986-12-26 00:00:00, 1987-05-26 00:00:00)
AEG      (1986-01-23 00:00:00, 1997-09-23 00:00:00)
AEIS     (1996-11-26 00:00:00, 1998-04-21 00:00:00)
AEM      (1974-03-26 00:00:00, 1997-05-22 00:00:00)
AEP      (1974-04-22 00:00:00, 1975-01-02 00:00:00)
AFG      (1981-07-22 00:00:00, 1996-10-24 00:00:00)
AFL      (1984-08-28 00:00:00, 1985-08-26 00:00:00)
AG       (2008-04-14 00:00:00, 2009-04-13 00:00:00)
AGCO     (1992-10-14 00:00:00, 1994-05-04 00:00:00)
AGI      (2003-10-29 00:00:00, 2013-11-14 00:00:00)
AIG      (1973-07-02 00:00:00, 1981-03-13 00:00:00)
AIRR     (2015-11-25 00:00:00, 2017-07-06 00:00:00)
AIT  

In [9]:
# Pick one entry of date_ranges by position: its index label is the ticker
# and its value is the (first, last) date tuple for that ticker.
item = 73
ticker = date_ranges.index[item]
date_range = date_ranges.iloc[item]
start_date, end_date = date_range[0], date_range[1]

for label, value in (
    ("ticker", ticker),
    ("Start date", start_date),
    ("End date", end_date),
):
    print(f"{label}: {value}")

ticker: BNT
Start date: 2024-03-18 00:00:00
End date: 2025-08-25 00:00:00


In [10]:
# ticker data to retrieve (the single ticker picked in the previous cell)
tickers = [ticker]

# number of leading rows to print and export (used with .head() below)
first_nrows = 300

# NOTE(review): not referenced by the cells visible in this notebook, which
# build the export path from OUTPUT_DIR instead — confirm before relying on it.
output_dir = "output"

In [11]:
# Generate the combined dict (OHLCV + features per ticker).
# df_ohlcv is passed as-is: create_combined_dict and its getters only select
# rows and concatenate into new frames, so a defensive .copy() of the full
# frame would just duplicate millions of rows.
# date_end=None leaves the upper bound open, so rows run through the last
# available date rather than stopping at end_date.
combined = create_combined_dict(
    df_ohlcv=df_ohlcv,
    features_df=features_df,
    tickers=tickers,
    date_start=start_date,
    date_end=None,
    verbose=True,
)

Creating combined dictionary for 1 ticker(s)
Date range: 2024-03-18 00:00:00 to None
Data retrieved for 1 ticker(s) from 2024-03-18 00:00:00 to None
Total rows: 483
Date range in data: 2024-03-18 00:00:00 to 2026-02-19 00:00:00
  BNT: 483 rows
Features data retrieved for 1 ticker(s) from 2024-03-18 00:00:00 to None
Total rows: 483
Date range in data: 2024-03-18 00:00:00 to 2026-02-19 00:00:00
Available features: ATR, ATRP, TRP, RSI, Mom_21, Consistency, IR_63, Beta_63, DD_21, Ret_1d, RollingStalePct, RollMedDollarVol, RollingSameVolCount
  BNT: 483 rows

Processing BNT...
  ✓ Successfully combined data
  OHLCV shape: (483, 5)
  Features shape: (483, 13)
  Combined shape: (483, 18)
  Date range: 2024-03-18 00:00:00 to 2026-02-19 00:00:00

SUMMARY
Total tickers processed: 1
Tickers with combined data: 1

Ticker details:
  BNT: (483, 18) - 2024-03-18 00:00:00 to 2026-02-19 00:00:00
    Columns: 18


In [None]:
# Build the CSV destination inside OUTPUT_DIR (nothing is written in this cell;
# the export itself happens in the next cell).
f_name = f"combined_{ticker}_with_SPY_head({first_nrows}).csv"
file_path = os.path.join(OUTPUT_DIR, f_name)
print(f"file_path: {file_path}")

file_path: C:\Users\ping\Files_win10\python\py311\stocks\notebooks_RLVR\output\combined_BNT_with_SPY_head(300).csv


In [None]:
# Get SPY's Adj Close and rename
spy_close = df_ohlcv.loc["SPY"][["Adj Close"]].rename(
    columns={"Adj Close": "SPY_Adj_Close"}
)

# Left join: keeps only dates from combined[ticker]
combined_with_spy = combined[ticker].join(spy_close, how="left")

# Verify. DataFrame.info() prints its own report and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
combined_with_spy.info()
print(combined_with_spy.head(first_nrows))

# Export to CSV in the output directory. The destination is rebuilt here so
# this cell does not silently depend on a file_path computed in another cell,
# and the directory is created if missing so to_csv cannot fail on it.
f_name = f"combined_{ticker}_with_SPY_head({first_nrows}).csv"
file_path = os.path.join(OUTPUT_DIR, f_name)
os.makedirs(OUTPUT_DIR, exist_ok=True)
combined_with_spy.head(first_nrows).to_csv(
    file_path,
    index=True,  # keep the Date index as the first CSV column
)
print(f"file saved as: {file_path}")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 483 entries, 2024-03-18 to 2026-02-19
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Adj Open             483 non-null    float64
 1   Adj High             483 non-null    float64
 2   Adj Low              483 non-null    float64
 3   Adj Close            483 non-null    float64
 4   Volume               483 non-null    int64  
 5   ATR                  483 non-null    float64
 6   ATRP                 483 non-null    float64
 7   TRP                  483 non-null    float64
 8   RSI                  483 non-null    float64
 9   Mom_21               483 non-null    float64
 10  Consistency          483 non-null    float64
 11  IR_63                483 non-null    float64
 12  Beta_63              483 non-null    float64
 13  DD_21                483 non-null    float64
 14  Ret_1d               483 non-null    float64
 15  RollingStalePct      