In [22]:
import pandas as pd
import numpy as np
import polars as pl
from pathlib import Path
from gmsm.models.mdsv.src.mdsv import MDSV
from gmsm.models.mdsv.src.estimation import MDSVEstimator
from gmsm.models.mdsv.src.forecasting import MDSVForecaster

# Load data with Polars and convert to pandas
root = Path().resolve().parent
cboe_path = root / 'data' / 'cboe'

print("Loading data with Polars...")
lazy_df = pl.scan_parquet(cboe_path / '2023' / '*.gzip.parquet')
df = lazy_df.collect().to_pandas()

print(f"Loaded {len(df)} rows")
print("Processing underlying prices...")
underlying_prices = (
    df[['quote_datetime', 'active_underlying_price']]
    .drop_duplicates(subset=['quote_datetime'])
    .sort_values('quote_datetime')
    .reset_index(drop=True)
)

print(f"Total observations: {len(underlying_prices)}")
print(f"Date range: {underlying_prices['quote_datetime'].min()} to {underlying_prices['quote_datetime'].max()}")

# Step 2: Calculate intraday log returns (30-minute returns)
# IMPORTANT: This is calculated on INTRADAY data (every 30 minutes)
underlying_prices['log_return'] = (
    np.log(underlying_prices['active_underlying_price']) -
    np.log(underlying_prices['active_underlying_price'].shift(1))
)

# Calculate squared log returns for realized variance
underlying_prices['squared_log_return'] = underlying_prices['log_return'] ** 2

# Step 3: Extract date for grouping
underlying_prices['date'] = pd.to_datetime(underlying_prices['quote_datetime']).dt.date

# Step 4: Calculate DAILY realized variance
# Following the paper: RV_t = sum of squared intraday returns, scaled by 100^2
print("Calculating daily realized variance...")
daily_data = underlying_prices.groupby('date').agg({
    'squared_log_return': 'sum',  # Sum of squared intraday returns
    'quote_datetime': 'count'      # Number of observations per day
}).reset_index()

daily_data.columns = ['date', 'sum_squared_returns', 'n_obs']

# Scale by 100^2 as per the paper
daily_data['realized_variance'] = daily_data['sum_squared_returns'] * (100 ** 2)

# Step 5: Calculate DAILY (close-to-close) log returns
# Following the paper: "Daily returns... are computed on a close to close basis"
# Get the last price of each day (close price)
print("Calculating daily close-to-close returns...")
daily_close_prices = underlying_prices.groupby('date').agg({
    'active_underlying_price': 'last'  # Last price of the day = close price
}).reset_index()

daily_close_prices.columns = ['date', 'close_price']

# Calculate close-to-close log returns
daily_close_prices['daily_log_return'] = (
    np.log(daily_close_prices['close_price']) -
    np.log(daily_close_prices['close_price'].shift(1))
)

# Scale by 100 (in percentage terms) as per the paper
daily_close_prices['daily_log_return_pct'] = daily_close_prices['daily_log_return'] * 100

# Step 6: Merge daily returns with realized variance
daily_data = daily_data.merge(
    daily_close_prices[['date', 'daily_log_return_pct']],
    on='date',
    how='left'
)

# Step 7: Demean the daily returns as per the paper
# "demeaned, and expressed in percentages"
mean_return = daily_data['daily_log_return_pct'].mean()
daily_data['demeaned_log_return'] = daily_data['daily_log_return_pct'] - mean_return

# Step 8: Create forward-looking realized variance (prediction target)
# RV_{t+1} is what we want to predict based on information available at time t
daily_data['forward_realized_variance'] = daily_data['realized_variance'].shift(-1)

# Step 9: Remove first row (no return) and last row (no forward RV)
daily_data = daily_data.iloc[1:-1].reset_index(drop=True)


Loading data with Polars...
Loaded 38802410 rows
Processing underlying prices...
Total observations: 2988
Date range: 2023-01-03 09:31:00-05:00 to 2023-12-29 16:00:00-05:00
Calculating daily realized variance...
Calculating daily close-to-close returns...


In [15]:
phdaily_data

Unnamed: 0,date,sum_squared_returns,n_obs,realized_variance,daily_log_return_pct,demeaned_log_return,forward_realized_variance
0,2023-01-04,0.000179,12,1.786900,0.748327,0.659417,0.837126
1,2023-01-05,0.000084,12,0.837126,-1.167722,-1.256631,1.537528
2,2023-01-06,0.000154,12,1.537528,2.234695,2.145785,0.944130
3,2023-01-09,0.000094,12,0.944130,-0.054197,-0.143106,0.211935
4,2023-01-10,0.000021,12,0.211935,0.694113,0.605203,0.765736
...,...,...,...,...,...,...,...
243,2023-12-21,0.000099,12,0.986300,1.033222,0.944312,0.502645
244,2023-12-22,0.000050,12,0.502645,0.173011,0.084101,0.068773
245,2023-12-26,0.000007,12,0.068773,0.418657,0.329748,0.084854
246,2023-12-27,0.000008,12,0.084854,0.151397,0.062487,0.044457


# Fit MDSV

In [19]:
mdsv_input = daily_data[['date', 'demeaned_log_return', 'realized_variance']]

test_size = len(mdsv_input) // 3
train_data = mdsv_input.iloc[:-test_size].copy()
test_data = mdsv_input.iloc[-test_size:].copy()

train_returns = train_data['demeaned_log_return'].values
train_rv = train_data['realized_variance'].values
train_joint = np.column_stack([train_returns, train_rv])

In [24]:
model = MDSV(N=3, D=10, model_type=2, leverage=True)
estimator = MDSVEstimator(model)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Add MDSV package to path
root = Path().resolve().parent
mdsv_path = root / 'mdsv-main'
sys.path.append(str(mdsv_path))

from gmsm.models.mdsv.src.mdsv import MDSV, MDSVResult
from gmsm.models.mdsv.src.estimation import MDSVEstimator, EstimationOptions
from gmsm.models.mdsv.src.forecasting import MDSVForecaster

# ============================================================================
# STEP 1: Load Preprocessed Data
# ============================================================================
print("="*80)
print("STEP 1: LOADING PREPROCESSED DATA")
print("="*80)

data_path = root / 'data' / 'processed' / 'daily_data_2015_2024.csv'
print(f"Loading data from: {data_path}")

mdsv_input = pd.read_csv(data_path)
mdsv_input['date'] = pd.to_datetime(mdsv_input['date'])

print(f"\nTotal days: {len(mdsv_input)}")
print(f"Date range: {mdsv_input['date'].min()} to {mdsv_input['date'].max()}")

# Check for NaN values
print(f"\nChecking for missing values:")
print(mdsv_input[['demeaned_log_return', 'realized_variance']].isna().sum())

# Remove any rows with NaN values
mdsv_input_clean = mdsv_input.dropna(subset=['demeaned_log_return', 'realized_variance']).reset_index(drop=True)
print(f"\nAfter removing NaN: {len(mdsv_input_clean)} days")
print(f"Date range: {mdsv_input_clean['date'].min()} to {mdsv_input_clean['date'].max()}")

# ============================================================================
# STEP 2: Train/Test Split
# ============================================================================
print("\n" + "="*80)
print("STEP 2: TRAIN/TEST SPLIT")
print("="*80)

# Use last year as test set
test_size = 252  # Approximately 1 trading year
train_data = mdsv_input_clean.iloc[:-test_size].copy()
test_data = mdsv_input_clean.iloc[-test_size:].copy()

print(f"Train set: {len(train_data)} days ({train_data['date'].min()} to {train_data['date'].max()})")
print(f"Test set: {len(test_data)} days ({test_data['date'].min()} to {test_data['date'].max()})")

# ============================================================================
# STEP 3: Fit MDSV Model using MDSVEstimator
# ============================================================================
print("\n" + "="*80)
print("STEP 3: FITTING MDSV MODEL")
print("="*80)

# Prepare training data: returns and RV as columns
train_returns = train_data['demeaned_log_return'].values
train_rv = train_data['realized_variance'].values
train_joint = np.column_stack([train_returns, train_rv])

print(f"\nTraining data shape: {train_joint.shape}")
print(f"  Column 0 (returns): mean={train_returns.mean():.4f}, std={train_returns.std():.4f}")
print(f"  Column 1 (RV): mean={train_rv.mean():.4f}, std={train_rv.std():.4f}")
print(f"  Returns range: [{train_returns.min():.4f}, {train_returns.max():.4f}]")
print(f"  RV range: [{train_rv.min():.4f}, {train_rv.max():.4f}]")

# Initialize MDSV model with smaller dimensions for faster fitting
print("\nInitializing MDSV(2, 5) model with leverage...")
model = MDSV(
    N=2,              # Number of components
    D=5,              # States per component
    model_type=2,     # Joint model (0=returns only, 1=RV only, 2=joint)
    leverage=True     # Include leverage effect
)

# Use MDSVEstimator for proper estimation (following the example)
print("\nSetting up MDSVEstimator with options...")
estimator = MDSVEstimator(model)

# Create estimation options
options = EstimationOptions(
    method='L-BFGS-B',
    maxiter=1000,
    verbose=True,
)

print("\nFitting model using MDSVEstimator (this will take several minutes)...")
print("This is the proper way to fit the model - please be patient...\n")

try:
    result = estimator.estimate(
        data=train_joint,
        options=options
    )

    print("\n" + "="*80)
    print("MODEL FITTING RESULTS")
    print("="*80)
    print(f"Success: {result.success}")
    print(f"Log-likelihood: {result.log_likelihood:.2f}")
    print(f"BIC: {result.bic:.2f}")
    print(f"AIC: {result.aic:.2f}")
    print(f"Number of iterations: {result.nit if hasattr(result, 'nit') else 'N/A'}")

    print("\nEstimated Parameters:")
    for param_name, param_value in result.parameters.items():
        if isinstance(param_value, (int, float, np.floating)):
            print(f"  {param_name}: {param_value:.6f}")
        elif isinstance(param_value, np.ndarray):
            if param_value.size <= 10:
                print(f"  {param_name}: {param_value}")
            else:
                print(f"  {param_name}: shape={param_value.shape}, mean={param_value.mean():.6f}")
        else:
            print(f"  {param_name}: {type(param_value)}")

    # Check if model fitted successfully
    if not np.isfinite(result.log_likelihood):
        raise ValueError("Model fitting failed - log-likelihood is not finite")

except Exception as e:
    print(f"\nError during fitting: {e}")
    import traceback
    traceback.print_exc()
    print("\nTrying simpler model (no leverage)...")

    # Fallback: simpler model without leverage
    model = MDSV(N=2, D=5, model_type=2, leverage=False)
    estimator = MDSVEstimator(model)

    result = estimator.estimate(
        data=train_joint,
        options=options
    )
    print(f"\nFallback model results:")
    print(f"Log-likelihood: {result.log_likelihood:.2f}")
    print(f"BIC: {result.bic:.2f}")

# ============================================================================
# STEP 4: One-Step Ahead Forecasting
# ============================================================================
print("\n" + "="*80)
print("STEP 4: ONE-STEP AHEAD FORECASTING")
print("="*80)

# Create forecaster with the fitted model
print("\nInitializing forecaster with fitted model...")
forecaster = MDSVForecaster(model)

# Generate rolling forecasts
print("\nGenerating one-step ahead forecasts...")
print("Using expanding window approach (re-estimating is too slow)...\n")

predictions_rv = []
actuals_rv = []
prediction_errors = []

# For each test point, use all data up to that point to forecast
for i in range(len(test_data)):
    try:
        # Data available up to time t (for forecasting t+1)
        if i == 0:
            # First forecast: use only training data
            available_returns = train_returns
            available_rv = train_rv
        else:
            # Subsequent forecasts: add observed test data up to (but not including) current point
            available_returns = np.concatenate([
                train_returns,
                test_data['demeaned_log_return'].iloc[:i].values
            ])
            available_rv = np.concatenate([
                train_rv,
                test_data['realized_variance'].iloc[:i].values
            ])

        available_data = np.column_stack([available_returns, available_rv])

        # Get last observation for conditioning (leverage effect)
        last_obs = available_data[-1:]

        # Forecast 1-step ahead
        forecast = forecaster.forecast(
            n_ahead=1,
            last_obs=last_obs,
            n_simulations=10000  # More simulations for better accuracy
        )

        # Extract RV forecast (handle different return formats)
        if isinstance(forecast, dict):
            if 'rv' in forecast:
                pred_rv = forecast['rv'][0] if hasattr(forecast['rv'], '__len__') else forecast['rv']
            elif 'volatility' in forecast:
                pred_rv = forecast['volatility'][0] if hasattr(forecast['volatility'], '__len__') else forecast['volatility']
            else:
                # Try to get first value from forecast dict
                pred_rv = list(forecast.values())[0]
                if hasattr(pred_rv, '__len__'):
                    pred_rv = pred_rv[0]
        else:
            pred_rv = forecast[0] if hasattr(forecast, '__len__') else forecast

        predictions_rv.append(pred_rv)
        actuals_rv.append(test_data['realized_variance'].iloc[i])

        if (i + 1) % 50 == 0 or i == 0:
            print(f"  Forecast {i+1}/{len(test_data)}: predicted={pred_rv:.4f}, actual={test_data['realized_variance'].iloc[i]:.4f}")

    except Exception as e:
        print(f"  Error at forecast {i+1}: {e}")
        predictions_rv.append(np.nan)
        actuals_rv.append(test_data['realized_variance'].iloc[i])
        prediction_errors.append(str(e))

predictions_rv = np.array(predictions_rv)
actuals_rv = np.array(actuals_rv)

print(f"\nCompleted {len(predictions_rv)} forecasts")

# ============================================================================
# STEP 5: Evaluate Performance
# ============================================================================
print("\n" + "="*80)
print("STEP 5: FORECAST EVALUATION")
print("="*80)

# Remove NaN predictions
valid_mask = ~np.isnan(predictions_rv) & ~np.isnan(actuals_rv)
predictions_valid = predictions_rv[valid_mask]
actuals_valid = actuals_rv[valid_mask]

print(f"\nValid forecasts: {len(predictions_valid)}/{len(predictions_rv)}")

if len(predictions_valid) > 10:  # Need reasonable sample size
    # Calculate error metrics
    errors = predictions_valid - actuals_valid
    squared_errors = errors ** 2
    abs_errors = np.abs(errors)

    rmse = np.sqrt(np.mean(squared_errors))
    mae = np.mean(abs_errors)
    mape = np.mean(np.abs(errors / actuals_valid)) * 100

    # Median metrics (more robust to outliers)
    median_ae = np.median(abs_errors)

    print(f"\nMDSV Forecast Performance:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  Median AE: {median_ae:.4f}")
    print(f"  MAPE: {mape:.2f}%")

    print(f"\nActual RV statistics (test period):")
    print(f"  Mean: {actuals_valid.mean():.4f}")
    print(f"  Std: {actuals_valid.std():.4f}")
    print(f"  Median: {np.median(actuals_valid):.4f}")

    print(f"\nPredicted RV statistics:")
    print(f"  Mean: {predictions_valid.mean():.4f}")
    print(f"  Std: {predictions_valid.std():.4f}")
    print(f"  Median: {np.median(predictions_valid):.4f}")

    # Naive benchmark (persistence/random walk model)
    print(f"\n" + "-"*80)
    print("BENCHMARK COMPARISON")
    print("-"*80)

    # Persistence: RV_t predicts RV_{t+1}
    naive_predictions = test_data['realized_variance'].iloc[:-1].values
    naive_actuals = test_data['realized_variance'].iloc[1:].values
    naive_mask = ~np.isnan(naive_predictions) & ~np.isnan(naive_actuals)

    naive_rmse = np.sqrt(np.mean((naive_predictions[naive_mask] - naive_actuals[naive_mask]) ** 2))
    naive_mae = np.mean(np.abs(naive_predictions[naive_mask] - naive_actuals[naive_mask]))

    print(f"\nNaive (Persistence) Forecast:")
    print(f"  RMSE: {naive_rmse:.4f}")
    print(f"  MAE: {naive_mae:.4f}")

    improvement_rmse = ((naive_rmse - rmse) / naive_rmse * 100)
    improvement_mae = ((naive_mae - mae) / naive_mae * 100)

    print(f"\nMDSV vs Naive:")
    if improvement_rmse > 0:
        print(f"  ✓ RMSE improvement: {improvement_rmse:.2f}%")
    else:
        print(f"  ✗ RMSE worse by: {abs(improvement_rmse):.2f}%")

    if improvement_mae > 0:
        print(f"  ✓ MAE improvement: {improvement_mae:.2f}%")
    else:
        print(f"  ✗ MAE worse by: {abs(improvement_mae):.2f}%")

else:
    print("\n✗ Insufficient valid forecasts for evaluation!")
    if len(prediction_errors) > 0:
        print(f"\nSample errors encountered:")
        for err in prediction_errors[:5]:
            print(f"  - {err}")

# ============================================================================
# STEP 6: Save Results
# ============================================================================
print("\n" + "="*80)
print("STEP 6: SAVING RESULTS")
print("="*80)

output_path = root / 'data' / 'results'
output_path.mkdir(parents=True, exist_ok=True)

# Save predictions
forecast_df = pd.DataFrame({
    'date': test_data['date'].values,
    'actual_rv': actuals_rv,
    'predicted_rv': predictions_rv,
    'error': actuals_rv - predictions_rv,
    'squared_error': (actuals_rv - predictions_rv) ** 2,
    'abs_error': np.abs(actuals_rv - predictions_rv)
})

forecast_df.to_csv(output_path / 'mdsv_forecasts_2015_2024.csv', index=False)
print(f"\n✓ Forecasts saved to: {output_path / 'mdsv_forecasts_2015_2024.csv'}")

# Save model info
model_info = {
    'model_specification': {
        'N': model.N,
        'D': model.D,
        'model_type': model.model_type,
        'leverage': model.leverage
    },
    'estimation_results': {
        'log_likelihood': float(result.log_likelihood),
        'aic': float(result.aic),
        'bic': float(result.bic),
        'success': result.success if hasattr(result, 'success') else None
    },
    'data_info': {
        'train_size': len(train_data),
        'test_size': len(test_data),
        'train_period': f"{train_data['date'].min()} to {train_data['date'].max()}",
        'test_period': f"{test_data['date'].min()} to {test_data['date'].max()}"
    },
    'forecast_performance': {
        'valid_forecasts': int(len(predictions_valid)),
        'total_forecasts': int(len(predictions_rv)),
        'rmse': float(rmse) if len(predictions_valid) > 10 else None,
        'mae': float(mae) if len(predictions_valid) > 10 else None,
        'mape': float(mape) if len(predictions_valid) > 10 else None,
        'naive_rmse': float(naive_rmse) if len(predictions_valid) > 10 else None,
        'improvement_pct': float(improvement_rmse) if len(predictions_valid) > 10 else None
    }
}

import json
with open(output_path / 'mdsv_model_info_2015_2024.json', 'w') as f:
    json.dump(model_info, f, indent=2)
print(f"✓ Model info saved to: {output_path / 'mdsv_model_info_2015_2024.json'}")

print("\n" + "="*80)
print("PIPELINE COMPLETED")
print("="*80)

STEP 1: LOADING PREPROCESSED DATA
Loading data from: /Users/gregruyoga/gmoneycodes/gmsm/gmsm/data/processed/daily_data_2015_2024.csv

Total days: 2498
Date range: 2015-01-05 00:00:00 to 2024-12-05 00:00:00

Checking for missing values:
demeaned_log_return    9
realized_variance      0
dtype: int64

After removing NaN: 2489 days
Date range: 2015-01-05 00:00:00 to 2024-12-05 00:00:00

STEP 2: TRAIN/TEST SPLIT
Train set: 2237 days (2015-01-05 00:00:00 to 2023-12-04 00:00:00)
Test set: 252 days (2023-12-05 00:00:00 to 2024-12-05 00:00:00)

STEP 3: FITTING MDSV MODEL

Training data shape: (2237, 2)
  Column 0 (returns): mean=-0.0081, std=1.1578
  Column 1 (RV): mean=1.1815, std=3.8702
  Returns range: [-12.8532, 8.9716]
  RV range: [0.0111, 90.9579]

Initializing MDSV(2, 5) model with leverage...

Setting up MDSVEstimator with options...

Fitting model using MDSVEstimator (this will take several minutes)...
This is the proper way to fit the model - please be patient...



  lognorm.pdf(rv, s=np.sqrt(shape), scale=np.exp(mu_rv)))
