In [1]:
import pandas as pd
import numpy as np
import os
import yfinance as yf
from datetime import datetime

# ===============================================================
# CONFIGURATION
# ===============================================================

# Date window passed to yfinance for both tickers.
START_DATE = "1998-01-01"
# NOTE(review): yfinance documents `end` as exclusive, so the END_DATE session
# itself is not downloaded — confirm whether 2025-12-31 should be included.
END_DATE   = "2025-12-31"

# Output locations. Every path is built with os.path.join so that paths
# derived from these directories use one separator style on every platform.
OUTPUT_DIR = "8-SPY_200DMA_market_regime"
VER_DIR    = os.path.join("system_verification", "8-SPY_200DMA_market_regime")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "8-SPY_200DMA_regime.parquet")

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(VER_DIR, exist_ok=True)

# Cross-validation tolerances (SPY vs ^GSPC daily returns).
RET_DIFF_THRESHOLD = 0.03     # >3% mismatch is suspicious
CORR_MIN_THRESHOLD = 0.95     # SPY vs SPX daily-return correlation must exceed this

print("=== MARKET REGIME FILTER (SPY-only regime + SPX cross-validation) ===")

# ===============================================================
# VALIDATION LOGGING
# ===============================================================

# Accumulates one dict per validation finding; turned into a report later.
validation_rows = []


def add_validation(source, issue, detail=""):
    """Record a validation finding and echo it to stdout.

    Args:
        source: Label of the data source the finding belongs to.
        issue: Short identifier of the problem.
        detail: Optional free-text description (defaults to an empty string).

    The finding is appended to the module-level ``validation_rows`` list as a
    dict with the keys ``timestamp``, ``source``, ``issue`` and ``detail``;
    the timestamp is the current local time in ISO format, to the second.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    entry = dict(timestamp=stamp, source=source, issue=issue, detail=detail)
    validation_rows.append(entry)
    print(f"⚠ VALIDATION [{source}]: {issue} | {detail}")


# ===============================================================
# 1. DOWNLOAD SPY
# ===============================================================

def _download_prices(ticker: str) -> pd.DataFrame:
    """Download price history for ``ticker`` over START_DATE..END_DATE.

    Raises:
        ValueError: if yfinance returns an empty frame.
    """
    frame = yf.download(
        ticker,
        start=START_DATE,
        end=END_DATE,
        auto_adjust=True,   # << adjusted OHLC (Close = adjusted close)
        progress=False
    )

    if frame.empty:
        raise ValueError(f"Could not download {ticker}")

    # Drop any timezone so both tickers share a tz-naive date index.
    frame.index = frame.index.tz_localize(None)
    return frame


def _extract_close(frame: pd.DataFrame, name: str) -> pd.Series:
    """Return the Close column of ``frame`` as a float Series named ``name``.

    Handles both flat columns and MultiIndex columns whose level 0 is the
    price field; in the MultiIndex case the first remaining column is used.
    """
    if isinstance(frame.columns, pd.MultiIndex):
        close = frame.xs("Close", axis=1, level=0)
        # Multi-ticker layout still yields a DataFrame: pick the first column
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]
    else:
        close = frame["Close"]

    close = close.astype(float)
    close.name = name
    return close


print(f"\nDownloading SPY {START_DATE} → {END_DATE} ...")

spy = _download_prices("SPY")
spy_close = _extract_close(spy, "SPY_close")


# ===============================================================
# 2. DOWNLOAD SPX INDEX (^GSPC)
# ===============================================================

print(f"Downloading ^GSPC {START_DATE} → {END_DATE} ...")

spx = _download_prices("^GSPC")
spx_close = _extract_close(spx, "SPX_close")


# ===============================================================
# VALIDATION HELPERS
# ===============================================================

def validate_series(s: pd.Series, label: str):
    """Sanity-check a price series and log every anomaly via add_validation.

    Only the index is repaired: the series is sorted when its index is not
    monotonic increasing, and duplicate index entries are dropped keeping the
    first occurrence. All other findings (missing prices, non-positive
    prices, calendar gaps longer than 7 days, a maximum above 20x the median)
    are logged only; the values themselves are returned untouched.

    Args:
        s: Price series. The gap check uses ``.dt.days`` on index
            differences, so the index must be datetime-like.
        label: Source label attached to every logged finding.

    Returns:
        The series, sorted and de-duplicated if that was necessary.
    """

    # Ensure sorted & deduped
    if not s.index.is_monotonic_increasing:
        add_validation(label, "non_monotonic_dates", "sorting applied")
        s = s.sort_index()

    if s.index.duplicated().any():
        n = int(s.index.duplicated().sum())
        add_validation(label, "duplicate_dates", f"{n} duplicates removed")
        s = s[~s.index.duplicated(keep="first")]

    # Missing prices: NaN compares False against 0, so the non-positive check
    # below never sees them, yet a single NaN blanks every rolling window with
    # full min_periods that contains it.
    n_missing = int(s.isna().sum())
    if n_missing:
        add_validation(label, "missing_prices", f"{n_missing} NaN rows")

    # Non-positive prices
    if (s <= 0).any():
        n = int((s <= 0).sum())
        add_validation(label, "non_positive_prices", f"{n} invalid rows")

    # Large gaps (calendar days between consecutive index entries)
    gaps = s.index.to_series().diff().dt.days
    if (gaps > 7).any():
        add_validation(label, "large_calendar_gaps", f"max gap {int(gaps.max())} days")

    # Spike detection: only attempted when there are more than 50 points
    if len(s) > 50:
        med = s.median()
        maxv = s.max()
        if maxv > med * 20:
            add_validation(label, "price_spike_detected",
                           f"max={maxv:.2f}, median={med:.2f}")

    return s

spy_close = validate_series(spy_close, "SPY")
spx_close = validate_series(spx_close, "SPX")


# ===============================================================
# 3. CROSS-VALIDATE DAILY RETURNS
# ===============================================================

print("\nCross-validating SPY vs SPX...")

# Daily simple returns of each series.
spy_ret = spy_close.pct_change()
spx_ret = spx_close.pct_change()

# Compare only the dates present in both series.
common_dates = spy_ret.index.intersection(spx_ret.index)
spy_r = spy_ret.loc[common_dates]
spx_r = spx_ret.loc[common_dates]

corr = spy_r.corr(spx_r)

# A NaN correlation is treated the same as one below the threshold.
if np.isnan(corr) or corr < CORR_MIN_THRESHOLD:
    add_validation("CROSS", "low_return_correlation", f"corr={corr:.3f}")
else:
    print(f"Return correlation SPY vs SPX: {corr:.3f}")

ret_diff = (spy_r - spx_r).abs()
bad_days = ret_diff[ret_diff > RET_DIFF_THRESHOLD]

if not bad_days.empty:
    # Derive the percentage from the constant so the logged message cannot
    # drift from the threshold actually applied above.
    add_validation("CROSS", "large_return_mismatches",
                   f"{len(bad_days)} days exceed threshold {RET_DIFF_THRESHOLD * 100:g}%")
    print("\nExample mismatched days:\n", bad_days.head())


# ===============================================================
# 4. COMPUTE MA200 REGIME (SPY-ONLY)
# ===============================================================

# 200-row simple moving average; NaN until a full 200-row window exists.
spy_ma200 = spy_close.rolling(window=200, min_periods=200).mean()

# Regime is 1 only when the close is above a defined moving average, else 0.
above_ma = spy_close.gt(spy_ma200)
ma_defined = spy_ma200.notna()
regime = (above_ma & ma_defined).astype(int)

regime_df = pd.DataFrame(
    dict(
        spy_close=spy_close,
        spy_ma200=spy_ma200,
        market_regime=regime,
    )
)


# ===============================================================
# 5. SAVE RESULTS
# ===============================================================

# Persist the regime table, then report its size and covered date range.
regime_df.to_parquet(OUTPUT_FILE)

first_day = regime_df.index.min().date()
last_day = regime_df.index.max().date()

print(f"\n✓ Saved regime to {OUTPUT_FILE}")
print(f"Rows: {len(regime_df)}   Range: {first_day} → {last_day}")


# ===============================================================
# 6. SAVE VALIDATION REPORT
# ===============================================================

# Pin the column schema so a clean run (no findings) still writes a CSV with
# a header row instead of an empty, column-less file.
val_df = pd.DataFrame(
    validation_rows,
    columns=["timestamp", "source", "issue", "detail"],
)
# One report file per run, distinguished by a local-time stamp in its name.
val_file = os.path.join(
    VER_DIR,
    f"SPY_200DMA_market_regime_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)
val_df.to_csv(val_file, index=False)

print("\nValidation Summary:")
print(val_df)
print("\nSaved validation log →", val_file)
print("==============================================================")


=== MARKET REGIME FILTER (SPY-only regime + SPX cross-validation) ===

Downloading SPY 1998-01-01 → 2025-12-31 ...
Downloading ^GSPC 1998-01-01 → 2025-12-31 ...

Cross-validating SPY vs SPX...
Return correlation SPY vs SPX: 0.986
⚠ VALIDATION [CROSS]: large_return_mismatches | 1 days exceed threshold 3%

Example mismatched days:
 Date
2000-01-07    0.030986
dtype: float64

✓ Saved regime to 8-SPY_200DMA_market_regime\8-SPY_200DMA_regime.parquet
Rows: 7042   Range: 1998-01-02 → 2025-12-30

Validation Summary:
             timestamp source                    issue  \
0  2025-12-31T08:45:32  CROSS  large_return_mismatches   

                       detail  
0  1 days exceed threshold 3%  

Saved validation log → system_verification/8-SPY_200DMA_market_regime\SPY_200DMA_market_regime_validation-20251231-084532.csv


In [2]:
# Read the saved regime table back and print a quick overview of it.
regime_data = pd.read_parquet(OUTPUT_FILE)

first_ts = regime_data.index.min()
last_ts = regime_data.index.max()

print("=== MARKET REGIME DATA INSPECTION ===\n")
print(f"Shape: {regime_data.shape}")
print(f"Date range: {first_ts} to {last_ts}")
print(f"\nColumns: {list(regime_data.columns)}")
print(f"\nData types:\n{regime_data.dtypes}")

print("\n--- First 10 rows ---")
print(regime_data.head(10))
print("\n--- Last 10 rows ---")
print(regime_data.tail(10))
print("\n--- Summary statistics ---")
print(regime_data.describe())

# Split of days between regime 1 and regime 0.
regime_col = regime_data['market_regime']
is_bullish = regime_col == 1
is_bearish = regime_col == 0

print("\n--- Market regime distribution ---")
print(regime_col.value_counts().sort_index())
print(f"\nBullish days (regime=1): {is_bullish.sum()}")
print(f"Bearish days (regime=0): {is_bearish.sum()}")
print(f"Percentage bullish: {is_bullish.mean() * 100:.2f}%")

# Check for any metadata
frame_attrs = getattr(regime_data, 'attrs', None)
if frame_attrs:
    print("\n--- Metadata ---")
    for key, value in frame_attrs.items():
        print(f"{key}: {value}")

=== MARKET REGIME DATA INSPECTION ===

Shape: (7042, 3)
Date range: 1998-01-02 00:00:00 to 2025-12-30 00:00:00

Columns: ['spy_close', 'spy_ma200', 'market_regime']

Data types:
spy_close        float64
spy_ma200        float64
market_regime      int32
dtype: object

--- First 10 rows ---
            spy_close  spy_ma200  market_regime
Date                                           
1998-01-02  60.025837        NaN              0
1998-01-05  60.160397        NaN              0
1998-01-06  59.199081        NaN              0
1998-01-07  59.352856        NaN              0
1998-01-08  58.833786        NaN              0
1998-01-09  56.795715        NaN              0
1998-01-12  57.833958        NaN              0
1998-01-13  58.641506        NaN              0
1998-01-14  58.910667        NaN              0
1998-01-15  58.420383        NaN              0

--- Last 10 rows ---
             spy_close   spy_ma200  market_regime
Date                                             
2025-12-16  