# Multi-Symbol Arbitrage Research Skeleton

This notebook provides a flexible framework for arbitrage research using VectorBT.

**Key Features:**
- Schema-agnostic CSV/JSONL/Parquet loading with automatic timestamp and price column discovery
- Multi-symbol time alignment utilities
- Spread calculation helpers for various arbitrage strategies
- VectorBT portfolio integration points

**Data Sources:**
- `depth.jsonl` - Order book snapshots/diffs
- `trades.jsonl` - Trade events
- Any CSV with timestamp and price columns

---
## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional, Callable
import json
import warnings

# VectorBT imports (install with: pip install vectorbt)
try:
    import vectorbt as vbt
    VBT_AVAILABLE = True
except ImportError:
    warnings.warn("VectorBT not installed. Portfolio features disabled. Install with: pip install vectorbt")
    VBT_AVAILABLE = False

# Visualization
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

---
## 2. Configuration

Define your data paths and trading parameters here.

In [None]:
# === DATA CONFIGURATION ===
# Directory that contains one sub-directory per recording session.
# The examples below build the session path as DATA_ROOT / SESSION_NAME.
DATA_ROOT = Path("../data/sessions")  # Adjust as needed
SESSION_NAME = "profile1_2h_20260122_2138"  # Adjust to your session

# Symbols for arbitrage analysis. Each symbol is expected to have its own
# sub-directory inside the session directory (see load_symbol_data).
SYMBOLS = ["BTCUSDT", "ETHUSDT", "ETHBTC"]  # Example: triangular arb

# === TRADING PARAMETERS ===
# Fee rates are fractions of notional (0.001 == 0.1%), charged per trade leg.
TRADING_FEES = {
    "maker": 0.001,  # 0.1% maker fee (adjust per exchange)
    "taker": 0.001,  # 0.1% taker fee
}

# Time alignment parameters
RESAMPLE_FREQ = "100ms"  # Bin width of the common time grid (pandas offset alias)
FFILL_LIMIT = 10  # Max consecutive bins to forward-fill (10 x 100ms = 1s of stale price)

---
## 3. Flexible Data Loading

Schema-agnostic loaders that discover columns dynamically from CSV, JSONL, or Parquet files.

In [None]:
class FlexibleDataLoader:
    """
    Schema-agnostic data loader with automatic column discovery.

    Supports JSONL, CSV and Parquet files; the format is chosen from the file
    extension. After ``load()`` the discovered layout (column names, dtypes,
    shape, and the most likely timestamp / price columns) is available in
    ``self.schema``.
    """

    # Common timestamp column names to try, in priority order
    TIMESTAMP_CANDIDATES = ['ts', 'timestamp', 'time', 'datetime', 'date', 't', 'event_time']
    # Common price column names to try, in priority order
    PRICE_CANDIDATES = ['price', 'close', 'last', 'mid', 'mark_price', 'ltp']

    def __init__(self, path: Path):
        self.path = Path(path)
        self.df: Optional[pd.DataFrame] = None
        self.schema: Dict = {}

    def load(self, **read_kwargs) -> pd.DataFrame:
        """
        Load data from file, auto-detecting format from the extension.

        Args:
            **read_kwargs: Forwarded to the underlying reader. ``nrows`` is
                honoured for every format (for Parquet it is applied after
                reading, because ``pd.read_parquet`` has no such option).

        Returns:
            The loaded DataFrame (also stored on ``self.df``).

        Raises:
            ValueError: If the file extension is not supported.
        """
        suffix = self.path.suffix.lower()

        if suffix == '.jsonl':
            self.df = self._load_jsonl(**read_kwargs)
        elif suffix == '.csv':
            self.df = pd.read_csv(self.path, **read_kwargs)
        elif suffix == '.parquet':
            # read_parquet would reject an unknown ``nrows`` keyword, so
            # truncate after reading instead.
            nrows = read_kwargs.pop('nrows', None)
            frame = pd.read_parquet(self.path, **read_kwargs)
            self.df = frame if nrows is None else frame.head(nrows)
        else:
            raise ValueError(f"Unsupported file format: {suffix}")

        self._discover_schema()
        return self.df

    def _load_jsonl(self, nrows: Optional[int] = None, **kwargs) -> pd.DataFrame:
        """
        Load a JSONL file line by line.

        Blank lines are skipped. ``nrows`` limits the number of records
        returned; ``None`` means no limit and ``0`` yields an empty frame.
        Extra keyword arguments are accepted and ignored.
        """
        records = []
        with open(self.path, 'r', encoding='utf-8') as f:
            for line in f:
                # ``is not None`` so that nrows=0 is respected rather than
                # being treated as "no limit".
                if nrows is not None and len(records) >= nrows:
                    break
                line = line.strip()
                if not line:
                    # Tolerate blank/trailing lines instead of failing in json.loads
                    continue
                records.append(json.loads(line))
        return pd.DataFrame(records)

    def _discover_schema(self):
        """Populate ``self.schema`` from the loaded DataFrame (no-op if nothing is loaded)."""
        if self.df is None:
            return

        self.schema = {
            'columns': list(self.df.columns),
            'dtypes': self.df.dtypes.to_dict(),
            'shape': self.df.shape,
            'timestamp_col': self._find_timestamp_col(),
            'price_col': self._find_price_col(),
        }

    def _find_timestamp_col(self) -> Optional[str]:
        """
        Find the most likely timestamp column.

        Well-known names win first; otherwise the first column that either has
        a datetime dtype (naive or timezone-aware) or whose first non-null
        value is a date-looking string that ``pd.to_datetime`` can parse.
        """
        columns = self.df.columns
        for candidate in self.TIMESTAMP_CANDIDATES:
            if candidate in columns:
                return candidate

        # Fall back to inspecting the data itself
        for col in columns:
            series = self.df[col]
            if pd.api.types.is_datetime64_any_dtype(series):
                return col

            non_null = series.dropna()
            if non_null.empty:
                continue

            sample = non_null.iloc[0]
            # Cheap pre-filter before attempting a real parse
            if isinstance(sample, str) and ('T' in sample or '-' in sample):
                try:
                    pd.to_datetime(sample)
                except (ValueError, TypeError, OverflowError):
                    continue
                return col
        return None

    def _find_price_col(self) -> Optional[str]:
        """Return the first well-known price column name present, or None."""
        for candidate in self.PRICE_CANDIDATES:
            if candidate in self.df.columns:
                return candidate
        return None

    def describe_schema(self):
        """
        Print discovered schema information and show the first rows.

        Uses the notebook's ``display`` helper, so call it from an
        IPython/Jupyter session after ``load()``.
        """
        print(f"File: {self.path.name}")
        print(f"Shape: {self.schema.get('shape')}")
        print(f"Detected timestamp column: {self.schema.get('timestamp_col')}")
        print(f"Detected price column: {self.schema.get('price_col')}")
        print(f"\nColumns: {self.schema.get('columns')}")
        print(f"\nSample data:")
        display(self.df.head(3))

In [None]:
def load_symbol_data(
    session_path: Path,
    symbol: str,
    data_type: str = "trades",  # "trades" or "depth"
    nrows: Optional[int] = None
) -> pd.DataFrame:
    """
    Read one symbol's JSONL file from a session directory.

    The file is expected at ``<session_path>/<symbol>/<data_type>.jsonl``.

    Args:
        session_path: Path to session directory
        symbol: Trading symbol (e.g., "BTCUSDT")
        data_type: File stem to read, "trades" or "depth"
        nrows: Limit number of rows (for testing)

    Returns:
        DataFrame with an added ``symbol`` column. When the loader detects a
        timestamp column it is parsed with ``pd.to_datetime``, used as the
        index, and the rows are sorted by it; otherwise the index is left
        untouched.

    Raises:
        FileNotFoundError: If the expected file does not exist.
    """
    source = session_path / symbol / f"{data_type}.jsonl"
    if not source.exists():
        raise FileNotFoundError(f"Data file not found: {source}")

    reader = FlexibleDataLoader(source)
    frame = reader.load(nrows=nrows)

    time_col = reader.schema.get('timestamp_col')
    if time_col:
        # NOTE(review): numeric timestamps are parsed with pandas' default
        # unit; confirm the recorder's epoch unit matches before relying on it.
        frame[time_col] = pd.to_datetime(frame[time_col])
        frame = frame.set_index(time_col).sort_index()

    # Tag every row so frames from several symbols can be combined later
    frame['symbol'] = symbol
    return frame

In [None]:
# === EXAMPLE: Load and inspect data ===
# Uncomment to test with your data

# session_path = DATA_ROOT / SESSION_NAME
# btc_trades = load_symbol_data(session_path, "BTCUSDT", "trades", nrows=1000)
# print(f"Loaded {len(btc_trades)} rows")
# display(btc_trades.head())

---
## 4. Multi-Symbol Time Alignment

Utilities for synchronizing data across multiple symbols with different timestamps.

In [None]:
class MultiSymbolAligner:
    """
    Align multiple symbol DataFrames to a common time grid.

    Each symbol's price series is resampled to ``resample_freq`` bins, the
    per-symbol series are joined column-wise, short gaps are forward-filled,
    and rows that still lack a price for any symbol are dropped. Essential
    for arbitrage calculations requiring synchronized prices.
    """

    def __init__(
        self,
        resample_freq: str = "100ms",
        ffill_limit: int = 10,
        agg_method: str = "last"  # How to aggregate within each bin
    ):
        self.resample_freq = resample_freq
        self.ffill_limit = ffill_limit
        self.agg_method = agg_method
        # Result of the most recent align() call; None until align() has run
        self.aligned_data: Optional[pd.DataFrame] = None

    def align(
        self,
        data_dict: Dict[str, pd.DataFrame],
        price_col: str = "price",
        price_exponent_col: Optional[str] = "price_exponent"
    ) -> pd.DataFrame:
        """
        Align multiple symbols to common time grid.

        Args:
            data_dict: Dict of {symbol: DataFrame} with datetime index
            price_col: Column containing price data
            price_exponent_col: Optional column with the base-10 exponent for
                fixed-point prices; when present, each row's price is scaled
                by ``10 ** exponent`` of that same row

        Returns:
            DataFrame with one price column per symbol, aligned in time

        Raises:
            ValueError: If ``price_col`` is missing from any symbol's frame.
        """
        aligned_series = {}

        for symbol, df in data_dict.items():
            if price_col not in df.columns:
                raise ValueError(f"Price column '{price_col}' not found in {symbol} data")

            prices = df[price_col]
            if price_exponent_col and price_exponent_col in df.columns:
                # Float base is required: NumPy refuses to raise an integer
                # to a negative integer power, and fixed-point exponents are
                # typically negative.
                prices = prices * (10.0 ** df[price_exponent_col])

            # Resample and aggregate within each bin
            aligned_series[symbol] = prices.resample(self.resample_freq).agg(self.agg_method)

        # Combine into single DataFrame (outer-joined on the time index)
        aligned_df = pd.DataFrame(aligned_series)

        # Bridge short gaps only, so stale prices are not carried indefinitely
        aligned_df = aligned_df.ffill(limit=self.ffill_limit)

        # Drop rows where any symbol is NaN (no valid data)
        aligned_df = aligned_df.dropna()

        self.aligned_data = aligned_df
        return aligned_df

    def get_alignment_stats(self) -> Dict:
        """
        Return statistics about the alignment quality.

        Returns:
            Empty dict if ``align()`` has not produced any rows; otherwise a
            dict with ``total_rows``, ``time_range``, ``symbols`` and
            ``null_counts``.
        """
        if self.aligned_data is None or len(self.aligned_data) == 0:
            return {}

        return {
            'total_rows': len(self.aligned_data),
            'time_range': (self.aligned_data.index.min(), self.aligned_data.index.max()),
            'symbols': list(self.aligned_data.columns),
            'null_counts': self.aligned_data.isnull().sum().to_dict(),
        }

In [None]:
# === EXAMPLE: Align multiple symbols ===
# Uncomment to test with your data

# session_path = DATA_ROOT / SESSION_NAME
# data_dict = {}
# for symbol in SYMBOLS:
#     data_dict[symbol] = load_symbol_data(session_path, symbol, "trades", nrows=10000)
#
# aligner = MultiSymbolAligner(resample_freq=RESAMPLE_FREQ, ffill_limit=FFILL_LIMIT)
# aligned_prices = aligner.align(data_dict)
# print(f"Aligned data shape: {aligned_prices.shape}")
# display(aligned_prices.head())

---
## 5. Spread Calculation Utilities

Helper functions for computing various arbitrage spreads.

In [None]:
class SpreadCalculator:
    """
    Calculate various spread types for arbitrage analysis.

    All helpers are static and operate element-wise on pandas Series that
    share an index.
    """

    @staticmethod
    def simple_spread(
        prices_a: pd.Series,
        prices_b: pd.Series,
        spread_type: str = "ratio"  # "ratio", "diff", "log_diff"
    ) -> pd.Series:
        """
        Calculate simple two-asset spread.

        Args:
            prices_a: First asset prices
            prices_b: Second asset prices
            spread_type: "ratio" (a / b), "diff" (a - b) or
                "log_diff" (log a - log b)

        Returns:
            Spread series

        Raises:
            ValueError: If ``spread_type`` is not one of the supported names.
        """
        if spread_type == "ratio":
            return prices_a / prices_b
        elif spread_type == "diff":
            return prices_a - prices_b
        elif spread_type == "log_diff":
            return np.log(prices_a) - np.log(prices_b)
        else:
            raise ValueError(f"Unknown spread type: {spread_type}")

    @staticmethod
    def z_score(
        spread: pd.Series,
        lookback: int = 100,
        min_periods: int = 20
    ) -> pd.Series:
        """
        Calculate rolling z-score of spread for mean reversion signals.

        Args:
            spread: Spread series
            lookback: Rolling window size
            min_periods: Minimum periods for valid calculation; capped at
                ``lookback`` so short windows work with the default

        Returns:
            Z-score series. Entries are NaN while fewer than ``min_periods``
            observations are available and wherever the rolling standard
            deviation is zero (flat window).
        """
        # pandas rejects min_periods > window, which would make any
        # lookback below the default of 20 unusable.
        effective_min = min(min_periods, lookback)
        window = spread.rolling(window=lookback, min_periods=effective_min)
        rolling_mean = window.mean()
        rolling_std = window.std()
        # A zero std would turn tiny numerical noise into +/-inf z-scores and
        # spurious signals; mark those points as undefined instead.
        rolling_std = rolling_std.where(rolling_std > 0)
        return (spread - rolling_mean) / rolling_std

    @staticmethod
    def fee_adjusted_spread(
        spread: pd.Series,
        fee_rate: float = 0.001,  # Fee rate per trade leg
        num_legs: int = 2  # Number of trades in the arb
    ) -> pd.Series:
        """
        Adjust spread for trading fees.

        For arbitrage to be profitable, spread must exceed fees.

        Args:
            spread: Raw spread series (as ratio, e.g., 1.001 = 0.1% spread)
            fee_rate: Fee rate per trade (fraction, 0.001 = 0.1%)
            num_legs: Number of trades required

        Returns:
            Fee-adjusted spread in percent (negative = unprofitable)
        """
        total_fees = fee_rate * num_legs
        # Convert ratio spread to percentage, subtract fees
        spread_pct = (spread - 1) * 100  # Convert to percentage
        return spread_pct - (total_fees * 100)

---
## 6. Triangular Arbitrage Spread Calculation [PLACEHOLDER]

Calculate triangular arbitrage opportunities across three currency pairs.

**Example:** BTC/USDT, ETH/USDT, ETH/BTC
- Buy ETH with USDT
- Sell ETH for BTC
- Sell BTC for USDT
- If final USDT > initial USDT (minus fees), arbitrage exists

In [None]:
def calculate_triangular_spread(
    aligned_prices: pd.DataFrame,
    base: str = "USDT",  # Quote currency shared by leg1 and leg2
    leg1: str = "BTCUSDT",  # Column name of the first base-quoted pair
    leg2: str = "ETHUSDT",  # Column name of the second base-quoted pair
    leg3: str = "ETHBTC",   # Column name of the cross pair
) -> pd.Series:
    """
    [PLACEHOLDER] Triangular arbitrage spread: actual vs implied cross rate.

    Not implemented yet; calling this always raises ``NotImplementedError``.

    Intended calculation for the default BTC/USDT, ETH/USDT, ETH/BTC triangle:
    - Implied ETH/BTC = ETH/USDT / BTC/USDT
    - Spread = Actual ETH/BTC / Implied ETH/BTC

    Args:
        aligned_prices: DataFrame with aligned prices for all symbols
        base: Base currency for the triangle (not used by the sketch below)
        leg1, leg2, leg3: Column names in ``aligned_prices`` for the three legs

    Returns:
        Series with triangular spread (>1 = arb opportunity direction A, <1 = direction B)

    Raises:
        NotImplementedError: Always, until the calculation is filled in.
    """
    # TODO: replace the raise below with the calculation, for example:
    #
    #   implied_cross = aligned_prices[leg2] / aligned_prices[leg1]
    #   actual_cross = aligned_prices[leg3]
    #   return actual_cross / implied_cross
    #
    # A value of exactly 1.0 means the quoted cross rate matches the rate
    # implied by the two base-quoted pairs.

    raise NotImplementedError("Implement triangular arbitrage spread calculation")

In [None]:
# === TRIANGULAR ARB ANALYSIS PLACEHOLDER ===
# Uncomment and complete after implementing calculate_triangular_spread

# tri_spread = calculate_triangular_spread(aligned_prices)
# print(f"Triangular spread stats:")
# print(f"  Mean: {tri_spread.mean():.6f}")
# print(f"  Std:  {tri_spread.std():.6f}")
# print(f"  Min:  {tri_spread.min():.6f}")
# print(f"  Max:  {tri_spread.max():.6f}")

---
## 7. Statistical Arbitrage Z-Score Calculation [PLACEHOLDER]

Calculate z-scores for pairs trading / statistical arbitrage strategies.

**Concept:** Two correlated assets should maintain a stable price ratio. When the ratio deviates significantly (high z-score), we expect mean reversion.

In [None]:
def calculate_stat_arb_zscore(
    aligned_prices: pd.DataFrame,
    symbol_a: str,
    symbol_b: str,
    lookback: int = 500,
    spread_type: str = "log_diff"
) -> pd.DataFrame:
    """
    [PLACEHOLDER] Rolling z-score of the spread between two symbols.

    Not implemented yet; calling this always raises ``NotImplementedError``.

    Args:
        aligned_prices: DataFrame with aligned prices
        symbol_a: Column name of the first symbol
        symbol_b: Column name of the second symbol
        lookback: Rolling window for z-score calculation
        spread_type: Spread definition passed to ``SpreadCalculator.simple_spread``

    Returns:
        DataFrame with columns: ['spread', 'zscore', 'rolling_mean', 'rolling_std']

    Raises:
        NotImplementedError: Always, until the calculation is filled in.
    """
    # TODO: replace the raise below with the calculation, for example:
    #
    #   spread = SpreadCalculator.simple_spread(
    #       aligned_prices[symbol_a],
    #       aligned_prices[symbol_b],
    #       spread_type=spread_type,
    #   )
    #   zscore = SpreadCalculator.z_score(spread, lookback=lookback)
    #   window = spread.rolling(lookback)
    #
    #   return pd.DataFrame({
    #       'spread': spread,
    #       'zscore': zscore,
    #       'rolling_mean': window.mean(),
    #       'rolling_std': window.std(),
    #   })
    #
    # NOTE: z_score() takes its own min_periods, while rolling(lookback)
    # needs a full window; pass matching min_periods if the mean/std columns
    # should line up with the z-score.

    raise NotImplementedError("Implement statistical arbitrage z-score calculation")

In [None]:
# === STAT ARB ANALYSIS PLACEHOLDER ===
# Uncomment and complete after implementing calculate_stat_arb_zscore

# stat_arb_df = calculate_stat_arb_zscore(aligned_prices, "BTCUSDT", "ETHUSDT")
# print(f"Z-score distribution:")
# print(stat_arb_df['zscore'].describe())
#
# # Count extreme z-scores (potential signals)
# extreme_long = (stat_arb_df['zscore'] < -2).sum()
# extreme_short = (stat_arb_df['zscore'] > 2).sum()
# print(f"\nExtreme z-scores (|z| > 2): Long signals: {extreme_long}, Short signals: {extreme_short}")

---
## 8. Fee-Adjusted Return Calculation [PLACEHOLDER]

Calculate net returns after accounting for trading fees.

**Important:** Arbitrage opportunities often appear profitable before fees but become unprofitable after accounting for:
- Maker/taker fees
- Slippage
- Network/gas fees (for DeFi)

In [None]:
def calculate_fee_adjusted_returns(
    spread: pd.Series,
    entry_threshold: float = 0.002,  # 0.2% spread to enter
    exit_threshold: float = 0.0,     # Exit when spread returns to 0
    fee_per_leg: float = 0.001,      # 0.1% per trade
    num_legs: int = 2,               # Number of trades per arb
    slippage_bps: float = 1.0,       # 1 bps slippage per trade
) -> pd.DataFrame:
    """
    [PLACEHOLDER] Net arbitrage return after fees and slippage.

    Not implemented yet; calling this always raises ``NotImplementedError``.

    Args:
        spread: Spread series (as ratio where 1.0 = no spread)
        entry_threshold: Minimum spread to enter position
        exit_threshold: Spread level to exit position
        fee_per_leg: Fee rate per trade leg (fraction, 0.001 = 0.1%)
        num_legs: Number of trade legs in the arbitrage
        slippage_bps: Expected slippage in basis points per trade

    Returns:
        DataFrame with columns: ['gross_return', 'fees', 'slippage', 'net_return', 'profitable']

    Raises:
        NotImplementedError: Always, until the calculation is filled in.
    """
    # TODO: replace the raise below with the calculation, for example
    # (all quantities expressed in percent):
    #
    #   gross_return = (spread - 1) * 100
    #   total_fees = fee_per_leg * num_legs * 100
    #   total_slippage = (slippage_bps / 100) * num_legs   # 1 bps = 0.01%
    #   net_return = gross_return - total_fees - total_slippage
    #
    #   return pd.DataFrame({
    #       'gross_return': gross_return,
    #       'fees': total_fees,
    #       'slippage': total_slippage,
    #       'net_return': net_return,
    #       'profitable': net_return > 0,
    #   })
    #
    # The sketch does not use entry_threshold / exit_threshold yet; apply
    # them when turning per-row returns into actual trades.

    raise NotImplementedError("Implement fee-adjusted return calculation")

In [None]:
# === FEE-ADJUSTED RETURNS PLACEHOLDER ===
# Uncomment and complete after implementing calculate_fee_adjusted_returns

# returns_df = calculate_fee_adjusted_returns(tri_spread)
# print(f"Return analysis:")
# print(f"  Gross return mean: {returns_df['gross_return'].mean():.4f}%")
# print(f"  Net return mean:   {returns_df['net_return'].mean():.4f}%")
# print(f"  Profitable periods: {returns_df['profitable'].sum()} / {len(returns_df)}")

---
## 9. Basic Visualization Scaffolding

Visualization helpers for arbitrage analysis.

In [None]:
class ArbVisualizer:
    """
    Matplotlib plotting helpers for arbitrage research.

    Every method creates a new figure and returns it together with its axes
    so callers can customise or save the plot.
    """

    @staticmethod
    def plot_aligned_prices(
        aligned_prices: pd.DataFrame,
        normalize: bool = True,
        figsize: tuple = (14, 6)
    ):
        """
        Draw one line per symbol from an aligned price frame.

        With ``normalize=True`` every column is rebased so that its first row
        equals 100, which makes differently-priced symbols comparable.

        Returns:
            (fig, ax) tuple
        """
        fig, ax = plt.subplots(figsize=figsize)

        series = aligned_prices.copy()
        if normalize:
            series = series / series.iloc[0] * 100  # Rebase to 100 at the first row
            y_label = 'Normalized Price (start=100)'
        else:
            y_label = 'Price'
        ax.set_ylabel(y_label)

        for name in series.columns:
            ax.plot(series.index, series[name], label=name, alpha=0.8)

        ax.set_xlabel('Time')
        ax.set_title('Aligned Prices')
        ax.legend()
        plt.tight_layout()
        return fig, ax

    @staticmethod
    def plot_spread_zscore(
        spread: pd.Series,
        zscore: pd.Series,
        entry_threshold: float = 2.0,
        figsize: tuple = (14, 8)
    ):
        """
        Two stacked panels: the spread with its mean, and the z-score with
        symmetric entry-threshold bands.

        Returns:
            (fig, axes) tuple, where axes holds the two panels
        """
        fig, axes = plt.subplots(2, 1, figsize=figsize, sharex=True)
        spread_ax, z_ax = axes[0], axes[1]

        # Top panel: raw spread and its overall mean
        spread_ax.plot(spread.index, spread, label='Spread', color='blue', alpha=0.8)
        spread_ax.axhline(y=spread.mean(), color='red', linestyle='--', label='Mean')
        spread_ax.set_ylabel('Spread')
        spread_ax.set_title('Spread Time Series')
        spread_ax.legend()

        # Bottom panel: z-score, zero line, and the +/- entry thresholds
        upper = entry_threshold
        lower = -entry_threshold
        z_ax.plot(zscore.index, zscore, label='Z-Score', color='green', alpha=0.8)
        z_ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        z_ax.axhline(y=upper, color='red', linestyle='--', label=f'+{entry_threshold} threshold')
        z_ax.axhline(y=lower, color='red', linestyle='--', label=f'-{entry_threshold} threshold')
        z_ax.fill_between(zscore.index, lower, upper, alpha=0.1, color='gray')
        z_ax.set_ylabel('Z-Score')
        z_ax.set_xlabel('Time')
        z_ax.set_title('Z-Score with Entry Thresholds')
        z_ax.legend()

        plt.tight_layout()
        return fig, axes

    @staticmethod
    def plot_spread_distribution(
        spread: pd.Series,
        bins: int = 100,
        figsize: tuple = (10, 6)
    ):
        """
        Histogram of the spread (NaNs dropped) with mean and median markers.

        Returns:
            (fig, ax) tuple
        """
        fig, ax = plt.subplots(figsize=figsize)

        mean_value = spread.mean()
        median_value = spread.median()

        ax.hist(spread.dropna(), bins=bins, edgecolor='black', alpha=0.7)
        ax.axvline(x=mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.6f}')
        ax.axvline(x=median_value, color='green', linestyle='--', label=f'Median: {median_value:.6f}')

        ax.set_xlabel('Spread Value')
        ax.set_ylabel('Frequency')
        ax.set_title('Spread Distribution')
        ax.legend()
        plt.tight_layout()
        return fig, ax

In [None]:
# === VISUALIZATION EXAMPLE ===
# Uncomment to test with your data

# viz = ArbVisualizer()
# fig, ax = viz.plot_aligned_prices(aligned_prices, normalize=True)
# plt.show()

---
## 10. VectorBT Portfolio Integration

Integration points for VectorBT backtesting and portfolio management.

In [None]:
class VBTArbPortfolio:
    """
    VectorBT portfolio wrapper for arbitrage backtesting.

    This class provides integration points for running arbitrage strategies
    through VectorBT's portfolio simulation engine.
    """

    def __init__(self, prices: pd.DataFrame, fees: float = 0.001):
        """
        Initialize VBT portfolio wrapper.

        Args:
            prices: DataFrame with price columns for each symbol
            fees: Trading fee rate (applies to both entry and exit)

        Raises:
            ImportError: If VectorBT could not be imported.
        """
        if not VBT_AVAILABLE:
            raise ImportError("VectorBT not installed. Run: pip install vectorbt")

        self.prices = prices
        self.fees = fees
        self.portfolio = None  # Set by run_spread_backtest()

    def generate_signals_from_zscore(
        self,
        zscore: pd.Series,
        entry_threshold: float = 2.0,
        exit_threshold: float = 0.5,
    ) -> Dict[str, pd.Series]:
        """
        Generate entry/exit signals from z-score series.

        Args:
            zscore: Z-score series
            entry_threshold: Z-score magnitude at which to enter a position
            exit_threshold: Z-score magnitude at which to exit a position

        Returns:
            Dict of boolean Series with keys 'long_entries', 'long_exits',
            'short_entries' and 'short_exits'. NaN z-scores compare as False
            in every series.
        """
        # Long spread when z-score is very negative (expect reversion up)
        long_entries = zscore < -entry_threshold
        long_exits = zscore > -exit_threshold

        # Short spread when z-score is very positive (expect reversion down)
        short_entries = zscore > entry_threshold
        short_exits = zscore < exit_threshold

        return {
            'long_entries': long_entries,
            'long_exits': long_exits,
            'short_entries': short_entries,
            'short_exits': short_exits,
        }

    def run_spread_backtest(
        self,
        spread: pd.Series,
        entries: pd.Series,
        exits: pd.Series,
        init_cash: float = 100000,
        size: float = 1.0,
        freq: str = '1min',
    ):
        """
        Run backtest on spread using VectorBT.

        Note: This is a simplified example. For real arbitrage,
        you'd need to track positions in multiple assets.

        Args:
            spread: Spread series to trade
            entries: Boolean entry signals
            exits: Boolean exit signals
            init_cash: Initial cash
            size: Position size
            freq: Bar frequency handed to VectorBT. Defaults to one minute;
                set it to the resampling frequency of ``spread`` (e.g.
                "100ms").

        Returns:
            VectorBT Portfolio object
        """
        self.portfolio = vbt.Portfolio.from_signals(
            close=spread,
            entries=entries,
            exits=exits,
            init_cash=init_cash,
            size=size,
            fees=self.fees,
            freq=freq,
        )
        return self.portfolio

    def get_stats(self) -> pd.Series:
        """
        Get portfolio statistics.

        Raises:
            ValueError: If no backtest has been run yet.
        """
        if self.portfolio is None:
            raise ValueError("Run backtest first")
        return self.portfolio.stats()

    def plot_equity(self):
        """
        Plot equity curve.

        Raises:
            ValueError: If no backtest has been run yet.
        """
        if self.portfolio is None:
            raise ValueError("Run backtest first")
        return self.portfolio.plot()

In [None]:
# === VECTORBT BACKTEST EXAMPLE ===
# Uncomment to test with your data

# if VBT_AVAILABLE:
#     # Calculate spread and z-score
#     calc = SpreadCalculator()
#     spread = calc.simple_spread(aligned_prices['BTCUSDT'], aligned_prices['ETHUSDT'], 'log_diff')
#     zscore = calc.z_score(spread, lookback=500)
#     
#     # Initialize portfolio
#     vbt_portfolio = VBTArbPortfolio(aligned_prices, fees=0.001)
#     
#     # Generate signals
#     signals = vbt_portfolio.generate_signals_from_zscore(zscore, entry_threshold=2.0)
#     
#     # Run backtest (using long signals only for simplicity)
#     portfolio = vbt_portfolio.run_spread_backtest(
#         spread=spread,
#         entries=signals['long_entries'],
#         exits=signals['long_exits'],
#     )
#     
#     # Display stats
#     print(vbt_portfolio.get_stats())
#     vbt_portfolio.plot_equity()

---
## 11. Utility Functions

Additional helper functions for common operations.

In [None]:
def discover_session_symbols(session_path: Path) -> List[str]:
    """
    List the symbols recorded in a session directory.

    A symbol is any immediate sub-directory of ``session_path`` that contains
    a ``trades.jsonl`` file.

    Args:
        session_path: Path to session directory

    Returns:
        Alphabetically sorted list of symbol (directory) names
    """
    return sorted(
        entry.name
        for entry in session_path.iterdir()
        if entry.is_dir() and (entry / "trades.jsonl").exists()
    )


def list_available_sessions(data_root: Path) -> List[str]:
    """
    List the session directories under a data root.

    If ``data_root`` contains a ``sessions`` entry, that is searched;
    otherwise ``data_root`` itself is treated as the sessions directory.

    Args:
        data_root: Root data directory

    Returns:
        Alphabetically sorted list of session (directory) names
    """
    nested = data_root / "sessions"
    base_dir = nested if nested.exists() else data_root
    return sorted(child.name for child in base_dir.iterdir() if child.is_dir())


def sample_data_head(
    session_path: Path,
    symbol: str,
    data_type: str = "trades",
    n: int = 5
) -> None:
    """
    Print a quick preview of one symbol's data file.

    Always prints the resolved path and whether it exists. When the file is
    present, the first ``n`` rows are loaded and the discovered schema is
    printed via ``FlexibleDataLoader.describe_schema``.

    Args:
        session_path: Path to session directory
        symbol: Symbol name
        data_type: "trades" or "depth"
        n: Number of rows to load for the preview
    """
    target = session_path / symbol / f"{data_type}.jsonl"
    found = target.exists()
    print(f"File: {target}")
    print(f"Exists: {found}")
    if not found:
        return

    preview = FlexibleDataLoader(target)
    preview.load(nrows=n)  # Populates preview.df and preview.schema
    preview.describe_schema()

In [None]:
# === DISCOVER AVAILABLE DATA ===
# Uncomment to explore your data

# session_path = DATA_ROOT / SESSION_NAME
# available_symbols = discover_session_symbols(session_path)
# print(f"Available symbols: {available_symbols}")
#
# # Preview sample data
# sample_data_head(session_path, available_symbols[0], "trades", n=3)

---
## 12. Research Workflow Template

Follow this workflow for arbitrage research:

### Step 1: Data Discovery
```python
# List sessions and symbols
sessions = list_available_sessions(DATA_ROOT)
session_path = DATA_ROOT / sessions[0]  # Used by the following steps
symbols = discover_session_symbols(session_path)
```

### Step 2: Load and Inspect Data
```python
# Load data for selected symbols
data_dict = {}
for symbol in SYMBOLS:
    data_dict[symbol] = load_symbol_data(session_path, symbol, "trades")
```

### Step 3: Align Data
```python
# Align to common time grid
aligner = MultiSymbolAligner(resample_freq="100ms")
aligned_prices = aligner.align(data_dict)
```

### Step 4: Calculate Spreads
```python
# Calculate your arbitrage spread
spread = calculate_triangular_spread(aligned_prices, ...)  # Implement this
```

### Step 5: Analyze and Visualize
```python
# Visualize spread distribution
viz = ArbVisualizer()
viz.plot_spread_distribution(spread)
```

### Step 6: Backtest with VectorBT
```python
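# Derive entry/exit signals from the spread's z-score, then run the backtest
zscore = SpreadCalculator.z_score(spread, lookback=500)
vbt_portfolio = VBTArbPortfolio(aligned_prices)
signals = vbt_portfolio.generate_signals_from_zscore(zscore)
portfolio = vbt_portfolio.run_spread_backtest(
    spread, signals['long_entries'], signals['long_exits']
)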
```

In [None]:
# === YOUR RESEARCH STARTS HERE ===
# Use the utilities above to conduct your arbitrage research

pass