In [3]:
# Display the current value of `root`; this cell does not assign it, so it
# relies on `root` already existing in the kernel namespace.
root

PosixPath('/Users/gregruyoga/gmoneycodes/gmsm/gmsm')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats

# Report banner.
for banner_line in ("=" * 80, "PREPROCESSED DATA DIAGNOSTICS", "=" * 80):
    print(banner_line)

# Load data
# The CSV is located relative to the kernel's working directory: two levels
# up, then gmsm/data/processed/. Starting the kernel elsewhere changes `root`.
working_dir = Path().resolve()
root = working_dir.parent.parent
data_path = root.joinpath('gmsm', 'data', 'processed', 'daily_data_2015_2024.csv')
print(f"\nLoading data from: {data_path}")

# Read the file and convert the 'date' column to datetimes in one expression.
df = pd.read_csv(data_path).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Quick overview: row count, covered date range and available columns.
first_date, last_date = df['date'].min(), df['date'].max()
print("\nDataset info:")
print(f"  Total observations: {df.shape[0]}")
print(f"  Date range: {first_date} to {last_date}")
print(f"  Columns: {df.columns.tolist()}")

# ============================================================================
# 1. MISSING VALUES CHECK
# ============================================================================
print("\n" + "="*80)
print("1. MISSING VALUES")
print("="*80)
# Per-column count of null cells. `missing` is read again by the summary
# section at the end of the script, so the name must stay bound.
missing = df.isnull().sum()
if missing.sum() == 0:
    print("✓ No missing values found")
else:
    # List only the columns that actually contain nulls. Printing this
    # unconditionally would emit an empty-Series repr on clean data.
    print(missing[missing > 0])

# ============================================================================
# 2. REALIZED VARIANCE ANALYSIS
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("2. REALIZED VARIANCE DIAGNOSTICS")
print(section_rule)

# Realized-variance column with nulls removed; later sections reuse `rv`.
rv = df['realized_variance'].dropna()
rv_low, rv_high = rv.min(), rv.max()

print("\nBasic statistics:")
print(f"  Count: {rv.shape[0]}")
# All remaining summary figures share the same 4-decimal format.
for stat_label, stat_value in (
    ("Mean", rv.mean()),
    ("Median", rv.median()),
    ("Std", rv.std()),
    ("Min", rv_low),
    ("Max", rv_high),
    ("Range", rv_high - rv_low),
):
    print(f"  {stat_label}: {stat_value:.4f}")

print("\nPercentiles:")
percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99, 99.5, 99.9]
# Map each percentile level to its value, then print in the listed order.
percentile_values = {level: np.percentile(rv, level) for level in percentiles}
for p, val in percentile_values.items():
    print(f"  {p:5.1f}%: {val:10.4f}")

print("\nDistribution shape:")
# scipy's kurtosis() defaults to the Fisher (excess) definition.
rv_skew, rv_kurt = stats.skew(rv), stats.kurtosis(rv)
print(f"  Skewness: {rv_skew:.4f}")
print(f"  Kurtosis: {rv_kurt:.4f}")

# Check for extreme values: observations above three times the 99th percentile.
print("\nExtreme values check:")
q99 = np.percentile(rv, 99)
extreme_count = rv.gt(q99 * 3).sum()
extreme_share = extreme_count / len(rv) * 100
print(f"  Values > 3 * 99th percentile: {extreme_count} ({extreme_share:.2f}%)")

# Warn when the largest observation exceeds 1000.
max_val = rv_high
if max_val > 1000:
    for warning_line in (
        f"  ⚠ WARNING: Maximum RV ({max_val:.2f}) is very large!",
        "  This may cause numerical overflow in exp() operations",
    ):
        print(warning_line)

# ============================================================================
# 3. LOG-SCALE ANALYSIS
# ============================================================================
print("\n" + "="*80)
print("3. LOG-SCALE TRANSFORMATION ANALYSIS")
print("="*80)

# Check if log transformation helps.
# np.log gives -inf for 0 and NaN for negative inputs, which would corrupt
# every statistic below (mean, std, skewness, ...). Only strictly positive RV
# values are therefore transformed, and any exclusion is reported.
positive_rv = rv[rv > 0]
n_nonpositive = len(rv) - len(positive_rv)
if n_nonpositive > 0:
    print(f"\n⚠ Excluded {n_nonpositive} non-positive RV value(s) before taking logs")

# `log_rv` is reused by the scaling-recommendations section further down.
log_rv = np.log(positive_rv)
print(f"\nLog(RV) statistics:")
print(f"  Mean: {log_rv.mean():.4f}")
print(f"  Median: {log_rv.median():.4f}")
print(f"  Std: {log_rv.std():.4f}")
print(f"  Min: {log_rv.min():.4f}")
print(f"  Max: {log_rv.max():.4f}")
print(f"  Range: {log_rv.max() - log_rv.min():.4f}")

print(f"\nLog(RV) distribution shape:")
print(f"  Skewness: {stats.skew(log_rv):.4f}")
print(f"  Kurtosis: {stats.kurtosis(log_rv):.4f}")

# Check if log-normal distribution is appropriate: a heuristic threshold of 10
# on the standard deviation of log(RV).
if log_rv.std() < 10:
    print(f"  ✓ Log(RV) has reasonable std for log-normal modeling")
else:
    print(f"  ⚠ Log(RV) std is large, may cause numerical issues")

# ============================================================================
# 4. RETURNS ANALYSIS
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("4. RETURNS DIAGNOSTICS")
print(section_rule)

# Demeaned log returns with nulls removed; `returns` is reused further down.
returns = df['demeaned_log_return'].dropna()
n_returns = len(returns)
ret_mean = returns.mean()
ret_std = returns.std()

print("\nBasic statistics:")
print(f"  Count: {n_returns}")
# Location statistics are shown with 6 decimals, dispersion with 4.
print(f"  Mean: {ret_mean:.6f}")
print(f"  Median: {returns.median():.6f}")
print(f"  Std: {ret_std:.4f}")
print(f"  Min: {returns.min():.4f}")
print(f"  Max: {returns.max():.4f}")

print("\nPercentiles:")
return_percentiles = {level: np.percentile(returns, level) for level in [1, 5, 25, 50, 75, 95, 99]}
for p, val in return_percentiles.items():
    print(f"  {p:5.1f}%: {val:10.4f}")

print("\nDistribution shape:")
ret_skew, ret_kurt = stats.skew(returns), stats.kurtosis(returns)
print(f"  Skewness: {ret_skew:.4f}")
print(f"  Kurtosis: {ret_kurt:.4f}")

# Check for outliers: observations more than three standard deviations
# above or below the sample mean.
print("\nOutlier check (>3 std):")
upper_bound = ret_mean + 3*ret_std
lower_bound = ret_mean - 3*ret_std
outliers_pos = returns.gt(upper_bound).sum()
outliers_neg = returns.lt(lower_bound).sum()
print(f"  Positive outliers: {outliers_pos} ({outliers_pos/n_returns*100:.2f}%)")
print(f"  Negative outliers: {outliers_neg} ({outliers_neg/n_returns*100:.2f}%)")

# ============================================================================
# 5. SCALING RECOMMENDATIONS
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("5. SCALING RECOMMENDATIONS")
print(section_rule)

# Check current scale: a mean RV above 100 triggers the rescaling advice.
mean_rv = rv.mean()
if mean_rv > 100:
    print("\n⚠ ISSUE DETECTED: Realized variance is on a large scale")
    print(f"   Current mean RV: {mean_rv:.2f}")
    print(f"   Current max RV: {rv.max():.2f}")

    print("\n📊 RECOMMENDED FIXES:")

    # Options 1 and 2: rescale by a constant divisor.
    scaled_rv_1 = rv / 100
    scaled_rv_2 = rv / 10000
    for heading, scaled in (
        ("\n1. Divide RV by 100:", scaled_rv_1),
        ("\n2. Divide RV by 10,000:", scaled_rv_2),
    ):
        print(heading)
        print(f"   New mean: {scaled.mean():.4f}")
        print(f"   New max: {scaled.max():.4f}")
        print(f"   New std: {scaled.std():.4f}")

    # Options 3 and 4: transform instead of rescale. Option 3 is
    # sqrt(RV * 252); option 4 reuses `log_rv` from the log-scale section.
    ann_vol = np.sqrt(rv * 252)
    for heading, series, unit in (
        ("\n3. Convert to annualized volatility % (sqrt(RV * 252)):", ann_vol, "%"),
        ("\n4. Use log(RV) instead of RV:", log_rv, ""),
    ):
        print(heading)
        print(f"   Mean: {series.mean():.4f}{unit}")
        print(f"   Max: {series.max():.4f}{unit}")
        print(f"   Std: {series.std():.4f}{unit}")

    print("\n".join([
        "\n💡 BEST PRACTICE:",
        "   Option 2 (divide by 10,000) is recommended because:",
        "   - Puts RV in variance units (not percentage^2)",
        "   - Results in values mostly between 0-1",
        "   - Prevents numerical overflow in exp() operations",
        "   - Standard in academic literature",
    ]))

else:
    print("✓ Realized variance scale appears reasonable")
    print(f"  Mean RV: {mean_rv:.4f}")
    print("  This should not cause numerical issues")

# ============================================================================
# 6. TIME SERIES CHECKS
# ============================================================================
print("\n" + "="*80)
print("6. TIME SERIES PROPERTIES")
print("="*80)

# Check for gaps in dates: sort chronologically, difference consecutive dates,
# and keep the differences longer than five calendar days.
# `gaps` is read again by the summary section at the end of the script.
df_sorted = df.sort_values('date').reset_index(drop=True)
date_diffs = df_sorted['date'].diff()
gaps = date_diffs[date_diffs > pd.Timedelta(days=5)]

# Calendar span of the sample in years. It is 0 when all rows share one date
# and NaN for an empty frame; both cases would break the division below.
span_years = (df_sorted['date'].max() - df_sorted['date'].min()).days / 365.25

print(f"\nTemporal coverage:")
print(f"  Total days: {len(df_sorted)}")
if span_years > 0:
    print(f"  Expected trading days (~252/year): {len(df_sorted) / span_years:.1f} days/year")
else:
    print("  Expected trading days (~252/year): n/a (sample spans less than one day)")
print(f"  Large gaps (>5 days): {len(gaps)}")

if len(gaps) > 0:
    print(f"\nLargest gaps:")
    # After reset_index the labels are 0..n-1, so idx-1 is the preceding row.
    for idx in gaps.nlargest(5).index:
        print(f"  {df_sorted.loc[idx-1, 'date'].date()} -> {df_sorted.loc[idx, 'date'].date()} ({date_diffs[idx].days} days)")

# Autocorrelation
# NOTE(review): autocorrelation_plot is not used in the visible code; the
# import is kept in case later cells rely on it.
from pandas.plotting import autocorrelation_plot
# Lag-1 autocorrelation depends on row order, so it is computed on the
# date-sorted frame rather than on the series taken in file order.
returns_by_date = df_sorted['demeaned_log_return'].dropna()
rv_by_date = df_sorted['realized_variance'].dropna()
print(f"\nAutocorrelation (lag 1):")
print(f"  Returns: {returns_by_date.autocorr(lag=1):.4f}")
print(f"  RV: {rv_by_date.autocorr(lag=1):.4f}")

# ============================================================================
# 7. YEAR-BY-YEAR COMPARISON
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("7. YEAR-BY-YEAR STATISTICS")
print(section_rule)

# Derive the calendar year from the date column (writes df['year'] in place),
# then aggregate each series per year.
df['year'] = df['date'].dt.year
rv_stat_names = ['count', 'mean', 'std', 'min', 'max']
return_stat_names = ['mean', 'std', 'min', 'max']
yearly_stats = df.groupby('year').agg({
    'realized_variance': rv_stat_names,
    'demeaned_log_return': return_stat_names,
})

# Print one table per series, each with its own rounding precision.
for table_title, column_name, decimals in (
    ("\nRealized Variance by year:", 'realized_variance', 2),
    ("\nReturns by year:", 'demeaned_log_return', 4),
):
    print(table_title)
    print(yearly_stats[column_name].round(decimals))

# ============================================================================
# 8. CORRELATION ANALYSIS
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("8. CORRELATION ANALYSIS")
print(section_rule)

# Pairwise correlation on the rows where both columns are present.
corr_data = df.loc[:, ['demeaned_log_return', 'realized_variance']].dropna()
correlation = corr_data.corr()

print("\nCorrelation matrix:")
print(correlation)

# Leverage effect (negative returns -> higher volatility)
# Rows are split by the sign of the return; exact zeros fall in neither group.
return_column = df['demeaned_log_return']
neg_returns = df[return_column < 0]
pos_returns = df[return_column > 0]

print("\nLeverage effect check:")
mean_fwd_rv_neg = neg_returns['forward_realized_variance'].mean()
mean_fwd_rv_pos = pos_returns['forward_realized_variance'].mean()
print(f"  Mean RV after negative returns: {mean_fwd_rv_neg:.4f}")
print(f"  Mean RV after positive returns: {mean_fwd_rv_pos:.4f}")

# A ratio above 1.1 is reported as evidence of the effect.
ratio = mean_fwd_rv_neg / mean_fwd_rv_pos
print(f"  Ratio: {ratio:.4f}")
if ratio > 1.1:
    print("  ✓ Leverage effect present (neg returns -> higher vol)")

# ============================================================================
# SUMMARY AND RECOMMENDATIONS
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("SUMMARY AND RECOMMENDATIONS")
print(section_rule)

# Each finding is (triggered?, issue text, recommendation). The order here is
# the order in which findings are numbered in the report.
findings = [
    (
        rv.max() > 1000,
        "RV values are very large (max > 1000)",
        "Rescale RV by dividing by 10,000",
    ),
    (
        missing.sum() > 0,
        f"{missing.sum()} missing values detected",
        "Remove or impute missing values",
    ),
    (
        len(gaps) > 10,
        f"{len(gaps)} large gaps in dates",
        "Check data continuity across year boundaries",
    ),
]
issues = [issue_text for triggered, issue_text, advice in findings if triggered]
recommendations = [advice for triggered, issue_text, advice in findings if triggered]

if not issues:
    print("\n✓ No critical issues detected")
    print("  Data appears ready for modeling")
else:
    print("\n⚠ ISSUES FOUND:")
    for i, issue in enumerate(issues, start=1):
        print(f"  {i}. {issue}")

    print("\n💡 RECOMMENDATIONS:")
    for i, rec in enumerate(recommendations, start=1):
        print(f"  {i}. {rec}")

print("\n" + section_rule)
print("DIAGNOSTICS COMPLETE")
print(section_rule)

PREPROCESSED DATA DIAGNOSTICS

Loading data from: /Users/gregruyoga/gmoneycodes/gmsm/gmsm/data/processed/daily_data_2015_2024.csv

Dataset info:
  Total observations: 2498
  Date range: 2015-01-05 00:00:00 to 2024-12-05 00:00:00
  Columns: ['date', 'sum_squared_returns', 'n_obs', 'realized_variance', 'daily_log_return_pct', 'year', 'demeaned_log_return', 'forward_realized_variance']

1. MISSING VALUES
Series([], dtype: int64)
✓ No missing values found

2. REALIZED VARIANCE DIAGNOSTICS

Basic statistics:
  Count: 2498
  Mean: 1.1232
  Median: 0.4303
  Std: 3.6995
  Min: 0.0111
  Max: 90.9579
  Range: 90.9468

Percentiles:
    1.0%:     0.0321
    5.0%:     0.0651
   10.0%:     0.0937
   25.0%:     0.1885
   50.0%:     0.4303
   75.0%:     0.9953
   90.0%:     2.1644
   95.0%:     3.4970
   99.0%:    11.1747
   99.5%:    17.7492
   99.9%:    61.7917

Distribution shape:
  Skewness: 14.5900
  Kurtosis: 271.6202

Extreme values check:
  Values > 3 * 99th percentile: 7 (0.28%)

3. LOG-SCALE