## **Data Collection - Polygon.io Integration**

**Purpose:** Get raw data from Polygon.io and save it to disk

**What it does:**

- Downloads/loads trade, quote (NBBO), and L2 LOB data from Polygon.io
- Runs initial data quality checks (missing values, timestamps, gaps)
- Saves the cleaned data to `data/interim/`
- Performs basic exploratory data analysis (EDA) to understand the data

**Data Sources:**
- **Trades**: Individual trade tick data from `/v3/trades/{ticker}`
- **Quotes**: NBBO (National Best Bid/Offer) quote data
- **LOB**: L2 aggregated order book snapshots

In [None]:
# ============================================================================
# 00_data_collection.ipynb
# Purpose: Load and validate raw data from Polygon.io
# ============================================================================

# %% Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Visualization settings: seaborn "paper" theme, bold serif text, and axes
# drawn without the top/right spines.
plt.style.use('seaborn-v0_8-paper')
plt.rc('font', family='serif', weight='bold')
plt.rc('axes', labelweight='bold', titleweight='bold', linewidth=1.2)
plt.rc('axes.spines', top=False, right=False)
%matplotlib inline

from src.config import RAW_DATA_DIR, INTERIM_DATA_DIR, FIGURES_DIR
from src.data.polygon_trade_loader import load_polygon_trades, PolygonTradeLoader
from src.data.polygon_quote_loader import load_polygon_quotes, PolygonQuoteLoader
from src.data.polygon_lob_loader import load_polygon_lob, PolygonLOBLoader
from src.data.synchronizer import align_trades_quotes
from src.utils.data_quality import (
    check_data_quality,
    print_quality_report,
    detect_gaps,
    remove_outliers
)

In [None]:
# %% Configuration
# Notebook-wide parameters; the loading cell passes TICKER, START_DATE and
# END_DATE to every loader call.
TICKER = "AAPL"  # U.S. equity ticker
START_DATE = "2024-01-01"  # Optional: specify date range
END_DATE = None  # None = use most recent data

# Data source selection
# NOTE(review): DATA_SOURCE is only echoed below; the loader calls in this
# notebook do not receive it -- confirm how "local" vs "api" is actually chosen.
DATA_SOURCE = "local"  # "local" = load from disk, "api" = fetch from Polygon.io

# Echo the active configuration so it is recorded in the cell output.
print(f"Ticker: {TICKER}", f"Data source: {DATA_SOURCE}", sep="\n")
if START_DATE:
    # A falsy END_DATE (None) is reported as an open-ended range.
    print(f"Date range: {START_DATE} to {END_DATE or 'latest'}")

In [None]:
# %% Note about data collection
# To collect new data from Polygon.io, you can:
# 1. Use the API directly (requires Polygon.io API key)
# 2. Or run data collection scripts (to be implemented):
#    python scripts/collect_polygon_data.py --ticker AAPL --start-date 2024-01-01
#
# This notebook assumes data has already been collected and saved to disk.

In [None]:
# %% Load data from Polygon.io


def _load_or_none(loader, count_noun, error_noun, **loader_kwargs):
    """Run one loader for the configured ticker and date range.

    On success, prints the row count and the column names and returns the
    loaded frame. Any exception raised while loading or reporting is printed
    and converted to ``None`` so the remaining datasets can still be loaded.

    Args:
        loader: Callable accepting ``ticker``, ``start_date``, ``end_date``
            plus the extra keyword arguments given here.
        count_noun: Noun used in the "Loaded N ..." message.
        error_noun: Noun used in the "Error loading ..." message.
        **loader_kwargs: Loader-specific options forwarded unchanged.
    """
    try:
        frame = loader(
            ticker=TICKER,
            start_date=START_DATE,
            end_date=END_DATE,
            **loader_kwargs,
        )
        print(f"   Loaded {len(frame):,} {count_noun}")
        print(f"   Columns: {frame.columns.tolist()}")
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(f"   Error loading {error_noun}: {e}")
        return None
    return frame


print(f"\n{'=' * 60}")
print("LOADING POLYGON.IO DATA")
print("=" * 60)

# Load trade data
print("\n1. Loading trade data...")
trades_df = _load_or_none(
    load_polygon_trades,
    "trades",
    "trades",
    filter_regular=True,  # Exclude odd-lots, out-of-sequence, etc.
)

# Load quote data (NBBO)
print("\n2. Loading NBBO quote data...")
quotes_df = _load_or_none(
    load_polygon_quotes,
    "quotes",
    "quotes",
    compute_features=True,  # Compute spread, mid-price, etc.
)

# Load LOB data (L2 snapshots)
print("\n3. Loading L2 order book snapshots...")
lob_df = _load_or_none(
    load_polygon_lob,
    "snapshots",
    "LOB",
    depth=10,  # Load top 10 levels
    compute_features=True,  # Compute book shape features
)

print(f"\n{'=' * 60}")

In [None]:
# %% Inspect quote data (NBBO)
# Show the first rows, shape, dtypes and summary statistics of the NBBO
# price/size columns, or a placeholder message when no quotes were loaded.
print(f"\n{'=' * 60}")
print("QUOTE DATA INSPECTION (NBBO)")
print("=" * 60)

if quotes_df is None:
    print("No quote data available")
else:
    display(quotes_df.head())
    print(f"\nShape: {quotes_df.shape}")
    print(f"\nData types:\n{quotes_df.dtypes}")
    print("\nSample statistics:")
    nbbo_cols = ['bid_price', 'ask_price', 'bid_size', 'ask_size']
    print(quotes_df[nbbo_cols].describe())

In [None]:
# %% Inspect trade data
# Show the first rows, shape, dtypes and price/size statistics of the trade
# data, plus the aggressive buy/sell split when those flags are present.
print("\n" + "=" * 60)
print("TRADE DATA INSPECTION")
print("=" * 60)

if trades_df is not None:
    display(trades_df.head())
    print(f"\nShape: {trades_df.shape}")
    print(f"\nData types:\n{trades_df.dtypes}")
    print("\nTrade statistics:")
    print(trades_df[['price', 'size']].describe())

    # Check trade classification if available. Each flag column is tested
    # on its own so that a frame carrying only one of them does not raise
    # KeyError when the other is read.
    has_buy_flag = 'is_aggressive_buy' in trades_df.columns
    has_sell_flag = 'is_aggressive_sell' in trades_df.columns
    if has_buy_flag or has_sell_flag:
        n_trades = len(trades_df)
        print("\nTrade classification:")
        if has_buy_flag:
            n_buys = trades_df['is_aggressive_buy'].sum()
            # Avoid dividing by zero when the frame is empty.
            buy_pct = n_buys / n_trades * 100 if n_trades else 0.0
            print(f"  Aggressive buys: {n_buys:,} ({buy_pct:.1f}%)")
        if has_sell_flag:
            n_sells = trades_df['is_aggressive_sell'].sum()
            sell_pct = n_sells / n_trades * 100 if n_trades else 0.0
            print(f"  Aggressive sells: {n_sells:,} ({sell_pct:.1f}%)")
else:
    print("No trade data available")

In [None]:
# %% Inspect LOB data (L2 snapshots)
# Show the first rows, shape and the first 20 dtypes of the order book
# frame, then averages of the spread/depth features that are present.
print(f"\n{'=' * 60}")
print("ORDER BOOK DATA INSPECTION (L2)")
print("=" * 60)

if lob_df is None:
    print("No LOB data available")
else:
    display(lob_df.head())
    print(f"\nShape: {lob_df.shape}")
    print(f"\nData types:\n{lob_df.dtypes.head(20)}")

    # Check for computed features; the relative-spread and depth figures
    # are reported only when the quoted spread itself is available.
    if 'quoted_spread' in lob_df.columns:
        print("\nOrder book statistics:")
        mean_spread = lob_df['quoted_spread'].mean()
        print(f"  Average spread: ${mean_spread:.4f}")
        if 'relative_spread' in lob_df.columns:
            mean_spread_bps = lob_df['relative_spread'].mean() * 10000
            print(f"  Average spread (bps): {mean_spread_bps:.2f}")
        if 'depth_at_best' in lob_df.columns:
            mean_depth = lob_df['depth_at_best'].mean()
            print(f"  Average depth at best: {mean_depth:.0f} shares")

In [None]:
# %% Data quality checks
# Build and print a quality report for the quote frame and then the trade
# frame, each keyed on its "timestamp" column.
print(f"\n{'=' * 60}")
print("DATA QUALITY ANALYSIS")
print("=" * 60)

# Check quote data quality (most comprehensive)
if quotes_df is None:
    print("No quote data available for quality checks")
else:
    print("\nChecking quote (NBBO) data quality...")
    quality_report = check_data_quality(quotes_df, timestamp_col="timestamp")
    print_quality_report(quality_report)

# Check trade data quality (silently skipped when no trades were loaded)
if trades_df is not None:
    print("\n\nChecking trade data quality...")
    quality_report = check_data_quality(trades_df, timestamp_col="timestamp")
    print_quality_report(quality_report)

In [None]:
# %% Check for time gaps
# Look for pauses between consecutive timestamps: quotes are checked with a
# 60 s limit and trades with a 300 s limit.
print(f"\n{'=' * 60}")
print("TIME GAP ANALYSIS")
print("=" * 60)

# Check gaps in quote data
if quotes_df is not None:
    print("\nAnalyzing gaps in quote data...")
    gaps = detect_gaps(
        quotes_df,
        timestamp_col="timestamp",
        max_gap_seconds=60.0,
    )
    if len(gaps) == 0:
        print("   No significant time gaps found in quotes")
    else:
        print(f"   Found {len(gaps)} gaps > 60s:")
        display(gaps.head())

# Check gaps in trade data
if trades_df is not None:
    print("\nAnalyzing gaps in trade data...")
    gaps = detect_gaps(
        trades_df,
        timestamp_col="timestamp",
        max_gap_seconds=300.0,
    )
    if len(gaps) == 0:
        print("   No significant time gaps found in trades")
    else:
        print(f"   Found {len(gaps)} gaps > 300s:")
        display(gaps.head())

In [None]:
# %% Visualize quote data
# Draw a three-panel overview of the NBBO quotes (prices, spread, sizes),
# save it under FIGURES_DIR and show it inline.
print("\n" + "=" * 60)
print("VISUALIZATIONS")
print("=" * 60)

if quotes_df is not None and len(quotes_df) > 0:
    fig, axes = plt.subplots(3, 1, figsize=(15, 10))
    ax_price, ax_spread, ax_size = axes
    quote_times = quotes_df["timestamp"]

    # Best bid/ask prices (NBBO); the mid-price is drawn only when present.
    ax_price.plot(quote_times, quotes_df["bid_price"], label="Bid", linewidth=1.0, alpha=0.7)
    ax_price.plot(quote_times, quotes_df["ask_price"], label="Ask", linewidth=1.0, alpha=0.7)
    if 'mid_price' in quotes_df.columns:
        ax_price.plot(quote_times, quotes_df["mid_price"], label="Mid", linewidth=0.8, alpha=0.5)
    ax_price.set_title(f"{TICKER} - Best Bid/Ask Prices Over Time (NBBO)")
    ax_price.set_ylabel("Price ($)")
    ax_price.legend()
    ax_price.grid(True, alpha=0.3)

    # Spread: use the precomputed column when available, otherwise derive it.
    if 'quoted_spread' in quotes_df.columns:
        spread = quotes_df['quoted_spread']
    else:
        spread = quotes_df["ask_price"] - quotes_df["bid_price"]
    ax_spread.plot(quote_times, spread, linewidth=1.0, color="orange", alpha=0.7)
    ax_spread.set_title("Bid-Ask Spread Over Time")
    ax_spread.set_ylabel("Spread ($)")
    ax_spread.grid(True, alpha=0.3)

    # Volume at best bid/ask
    ax_size.plot(quote_times, quotes_df["bid_size"], label="Bid Size", linewidth=0.5, alpha=0.7)
    ax_size.plot(quote_times, quotes_df["ask_size"], label="Ask Size", linewidth=0.5, alpha=0.7)
    ax_size.set_title("Size at Best Bid/Ask (NBBO)")
    ax_size.set_ylabel("Size (shares)")
    ax_size.set_xlabel("Time")
    ax_size.legend()
    ax_size.grid(True, alpha=0.3)

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target
    # folder exists (same treatment INTERIM_DATA_DIR gets before saving).
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)
    figure_path = FIGURES_DIR / f"{TICKER}_quote_data_overview.png"
    plt.savefig(figure_path, dpi=300, bbox_inches="tight")
    plt.show()
    print(f"\nSaved visualization to: {figure_path}")
else:
    print("No quote data available for visualization")

In [None]:
# %% Handle missing values and outliers
# Produce quotes_df_clean / trades_df_clean / lob_df_clean (or None when the
# corresponding dataset was not loaded) for the saving and alignment cells.
#
# NOTE(review): dropna() with default arguments discards a row when ANY
# column is NaN. If the loaders add derived columns that are legitimately
# NaN for some rows, this may remove more data than intended -- confirm.
print(f"\n{'=' * 60}")
print("DATA CLEANING")
print("=" * 60)

# Clean quote data
if quotes_df is None:
    quotes_df_clean = None
    print("No quote data to clean")
else:
    print("\nCleaning quote data...")
    print(f"  Before cleaning: {len(quotes_df):,} rows")

    # Drop missing values
    quotes_df_clean = quotes_df.dropna()
    print(f"  After dropping NaN: {len(quotes_df_clean):,} rows")

    # Remove outliers (optional - be conservative)
    price_cols = ["bid_price", "ask_price"]
    quotes_df_clean = remove_outliers(
        quotes_df_clean,
        columns=price_cols,
        method="iqr",
        threshold=3.0,
        verbose=True,
    )
    print(f"  After outlier removal: {len(quotes_df_clean):,} rows")

# Clean trade data
if trades_df is None:
    trades_df_clean = None
    print("No trade data to clean")
else:
    print("\nCleaning trade data...")
    print(f"  Before cleaning: {len(trades_df):,} rows")

    # Drop missing values
    trades_df_clean = trades_df.dropna()
    print(f"  After dropping NaN: {len(trades_df_clean):,} rows")

    # Remove outliers
    trades_df_clean = remove_outliers(
        trades_df_clean,
        columns=['price'],
        method="iqr",
        threshold=3.0,
        verbose=True,
    )
    print(f"  After outlier removal: {len(trades_df_clean):,} rows")

# Clean LOB data (missing values only; no outlier filter is applied here)
if lob_df is None:
    lob_df_clean = None
    print("No LOB data to clean")
else:
    print("\nCleaning LOB data...")
    print(f"  Before cleaning: {len(lob_df):,} rows")

    # Drop missing values
    lob_df_clean = lob_df.dropna()
    print(f"  After dropping NaN: {len(lob_df_clean):,} rows")

In [None]:
# %% Save cleaned data
# Write each cleaned dataset that exists to
# INTERIM_DATA_DIR/<TICKER>_<kind>_cleaned.parquet and report its size on
# disk and its row count.
print(f"\n{'=' * 60}")
print("SAVING CLEANED DATA")
print("=" * 60)

INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)

# (file-name part, label used in the message, cleaned frame or None)
datasets_to_save = (
    ("quotes", "quote", quotes_df_clean),
    ("trades", "trade", trades_df_clean),
    ("lob", "LOB", lob_df_clean),
)

for file_kind, label, cleaned in datasets_to_save:
    if cleaned is None:
        continue  # nothing was loaded for this dataset
    output_file = INTERIM_DATA_DIR / f"{TICKER}_{file_kind}_cleaned.parquet"
    cleaned.to_parquet(output_file, index=False)
    print(f"\n Saved cleaned {label} data to: {output_file}")
    print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
    print(f"  Records: {len(cleaned):,}")

# %% Summary
# Recap the row count and timestamp range of every cleaned dataset that is
# available, then point to the output folder and the next notebook.
print(f"\n{'=' * 60}")
print("SUMMARY")
print("=" * 60)

# (description used in the message, cleaned frame or None)
summary_rows = (
    ("quote snapshots (NBBO)", quotes_df_clean),
    ("trades", trades_df_clean),
    ("LOB snapshots", lob_df_clean),
)

for description, cleaned in summary_rows:
    if cleaned is None:
        continue
    print(f"\n Loaded {len(cleaned):,} clean {description}")
    print(f"  Time range: {cleaned['timestamp'].min()} to {cleaned['timestamp'].max()}")

print(f"\n Data saved to: {INTERIM_DATA_DIR}")
print("\n Next notebook: Run 10_basic_features.ipynb")

In [None]:
# %% Optional: Align trades with quotes (for trade classification)
# Attach quote columns to each cleaned trade via align_trades_quotes, preview
# the result and store it as <TICKER>_trades_aligned.parquet.
print("\n" + "=" * 60)
print("TRADE-QUOTE ALIGNMENT (OPTIONAL)")
print("=" * 60)

if trades_df_clean is not None and quotes_df_clean is not None:
    print("\nAligning trades with quotes using Lee-Ready method...")

    # Align trades with the most recent quote before each trade (as-of merge)
    aligned_df = align_trades_quotes(
        trades_df_clean,
        quotes_df_clean,
        method='asof',
        tolerance='10ms'  # Maximum time difference allowed
    )

    print(f"  Aligned {len(aligned_df):,} trades with quotes")
    print("\nSample aligned data:")

    # Preview only the columns that exist: 'mid_price' (and other derived
    # columns) are optional elsewhere in this notebook, so selecting them
    # unconditionally could raise KeyError.
    preview_cols = ['timestamp', 'price', 'size', 'bid_price', 'ask_price', 'mid_price']
    available_cols = [col for col in preview_cols if col in aligned_df.columns]
    missing_cols = [col for col in preview_cols if col not in aligned_df.columns]
    if missing_cols:
        print(f"  (columns not present in aligned data: {missing_cols})")
    display(aligned_df[available_cols].head())

    # Save aligned data; make sure the folder exists in case this cell is
    # run without the save cell having been executed first.
    INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_file = INTERIM_DATA_DIR / f"{TICKER}_trades_aligned.parquet"
    aligned_df.to_parquet(output_file, index=False)
    print(f"\n Saved aligned data to: {output_file}")
else:
    print("Both trade and quote data are required for alignment")