# Options Surface NaN Analysis

This notebook scans the derived options surface outputs, loads each Parquet file with pandas, and summarizes missing-data patterns. Use it to verify which columns contain NaN values across multiple symbols and time ranges.

In [None]:
from pathlib import Path
import os
from collections import Counter, defaultdict
import pandas as pd
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the plots drawn in this notebook.
plt.style.use('ggplot')
# Let pandas show up to 100 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 100)

In [None]:
# Data root: taken from the QUANTO_DATA_ROOT environment variable when it is
# set, otherwise a `.quanto_data` directory under the current working directory.
DATA_ROOT = Path(os.getenv('QUANTO_DATA_ROOT', Path.cwd() / '.quanto_data'))
# Directory that is scanned for the derived options-surface parquet files.
SURFACE_ROOT = DATA_ROOT.joinpath('derived', 'options_surface_v1')
SURFACE_ROOT

In [None]:
# Fail fast with a clear message when the surface directory is missing.
if not SURFACE_ROOT.exists():
    raise FileNotFoundError(f'No derived options surface data found under {SURFACE_ROOT}')
# Collect every parquet file exactly two directory levels below SURFACE_ROOT,
# in sorted path order so repeated runs visit files in the same sequence.
files = list(SURFACE_ROOT.glob('*/*/*.parquet'))
files.sort()
len(files)

In [None]:
# Accumulators filled while walking the parquet files:
#   nan_counts        - column name -> total NaN cells seen across all files
#   symbol_nan_counts - column name -> set of symbols with at least one NaN
#   total_rows        - number of rows across all non-empty files
nan_counts = Counter()
symbol_nan_counts = defaultdict(set)
total_rows = 0
for path in files:
    df = pd.read_parquet(path)
    if df.empty:
        # Nothing to count in an empty frame; it also adds no rows.
        continue
    total_rows += len(df)
    # The symbol is taken from the directory two levels above the file.
    symbol = path.parent.parent.name
    counts = df.isna().sum()
    # Only columns that actually contain NaNs are recorded.
    for col, cnt in counts[counts > 0].items():
        nan_counts[col] += int(cnt)
        symbol_nan_counts[col].add(symbol)
sum(nan_counts.values())

In [None]:
# Build one summary row per column that had at least one NaN.
#   nan_count        - total NaN cells recorded for the column
#   nan_pct          - nan_count divided by total_rows (a 0-1 fraction, not a
#                      percentage); 0.0 when no rows were counted
#   symbols_with_nan - number of distinct symbols recorded for the column
summary_rows = []
for col, cnt in nan_counts.items():
    summary_rows.append({
        'column': col,
        'nan_count': cnt,
        'nan_pct': cnt / total_rows if total_rows else 0.0,
        'symbols_with_nan': len(symbol_nan_counts[col]),
    })
# Pass the column names explicitly: with no NaNs at all `summary_rows` is
# empty, and a column-less DataFrame would make sort_values raise KeyError
# instead of yielding the empty summary that the later cells check for.
nan_summary = pd.DataFrame(
    summary_rows,
    columns=['column', 'nan_count', 'nan_pct', 'symbols_with_nan'],
).sort_values('nan_count', ascending=False)
nan_summary.head(20)

In [None]:
# Plot the 20 first rows of the summary as a horizontal bar chart, or report
# that there is nothing to plot.
top_k = pd.DataFrame() if nan_summary.empty else nan_summary.head(20)
if top_k.empty:
    print('No NaN values detected in the loaded data.')
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(y=top_k['column'], width=top_k['nan_count'])
    ax.set(xlabel='NaN Count', title='Top Columns by NaN Count')
    # Flip the y-axis so the first row of top_k is drawn at the top.
    ax.invert_yaxis()
    plt.show()

In [None]:
# Show the 20 columns whose NaNs are spread across the most symbols.
if not nan_summary.empty:
    by_symbol_spread = nan_summary.sort_values('symbols_with_nan', ascending=False)
    display(by_symbol_spread.head(20))