# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

def load_data(path='data.csv'):
    """Load the order book data from a CSV file.

    Args:
        path: Location of the CSV file to read. Defaults to ``'data.csv'``,
            resolved against the current working directory.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the file cannot be
        opened or parsed (the reason is printed).
    """
    try:
        df = pd.read_csv(path)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas' EmptyDataError and
        # ParserError, as well as UnicodeDecodeError, all derive from
        # ValueError. Anything else is a programming error and should surface.
        print(f"Error loading data: {e}")
        return None

    print(f"Data loaded successfully. Shape: {df.shape}")
    return df

def calculate_obi(df, level=10):
    """
    Calculate Order Book Imbalance (OBI) over the first ``level`` book levels.

    OBI = (Bid Quantity - Ask Quantity) / (Bid Quantity + Ask Quantity)

    A level ``i`` contributes only when both ``bid_qty_{i}`` and
    ``ask_qty_{i}`` are present in ``df``; levels missing either side are
    skipped so the two totals always cover the same depth.

    Args:
        df: DataFrame holding ``bid_qty_N`` / ``ask_qty_N`` columns.
        level: Deepest book level to include (levels ``1..level``).

    Returns:
        Tuple ``(obi, total_bid_qty, total_ask_qty)`` of row-aligned Series.
        Rows whose bid and ask totals are both zero produce NaN in ``obi``.

    Raises:
        ValueError: If no complete bid/ask quantity pair exists in the
            requested range.
    """
    paired_levels = [
        (f'bid_qty_{i}', f'ask_qty_{i}')
        for i in range(1, level + 1)
        if f'bid_qty_{i}' in df.columns and f'ask_qty_{i}' in df.columns
    ]
    if not paired_levels:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(
            f"No bid_qty_N/ask_qty_N column pairs found for levels 1..{level}"
        )

    bid_cols = [bid for bid, _ in paired_levels]
    ask_cols = [ask for _, ask in paired_levels]

    # Row-wise totals across the selected depth
    total_bid_qty = df[bid_cols].sum(axis=1)
    total_ask_qty = df[ask_cols].sum(axis=1)

    obi = (total_bid_qty - total_ask_qty) / (total_bid_qty + total_ask_qty)

    return obi, total_bid_qty, total_ask_qty

def calculate_spread(df):
    """Return the top-of-book spread, ``ask_price_1 - bid_price_1``.

    When either price column is absent a notice is printed and ``None`` is
    returned instead of a Series.
    """
    required = ('ask_price_1', 'bid_price_1')
    if not all(name in df.columns for name in required):
        print("Required price columns not found.")
        return None

    best_ask = df['ask_price_1']
    best_bid = df['bid_price_1']
    return best_ask - best_bid

def classify_spread_regime(spread, tick_size=0.1):
    """
    Classify each spread value into a regime label.

    - 'Tight':  spread <= 1 tick (within a small floating-point tolerance)
    - 'Wide':   spread > 2.5 ticks (midpoint of the 2-3 tick band)
    - 'Medium': everything else; NaN spreads satisfy neither comparison and
      therefore also land here

    Args:
        spread: Series (or array-like) of spread values.
        tick_size: Minimum price increment, in the same units as ``spread``.

    Returns:
        NumPy array of regime labels, one per element of ``spread``.
    """
    # A spread obtained as ask - bid in binary floating point rarely lands
    # exactly on a tick multiple (100.2 - 100.1 is slightly above 0.1), so the
    # one-tick boundary gets a slack that is tiny relative to a tick.
    tolerance = abs(tick_size) * 1e-6
    tight_threshold = 1 * tick_size + tolerance
    wide_threshold = 2.5 * tick_size  # Using 2.5 as the middle of 2-3

    conditions = [
        spread <= tight_threshold,
        spread > wide_threshold,
    ]
    choices = ['Tight', 'Wide']

    # Anything matching neither condition is labelled 'Medium'
    return np.select(conditions, choices, default='Medium')

def _estimate_tick_size(df, default=0.1):
    """Guess the tick size as the smallest positive gap between adjacent price levels.

    Samples rows 1-9 of the ``bid_price_N`` / ``ask_price_N`` columns for
    levels 1-10 and returns ``default`` when no positive gap is found.
    """
    sample = df.iloc[1:10]
    gaps = set()
    for prefix in ('bid_price_', 'ask_price_'):
        for lvl in range(1, 10):
            near, far = f"{prefix}{lvl}", f"{prefix}{lvl + 1}"
            if near in df.columns and far in df.columns:
                diffs = (sample[near] - sample[far]).abs()
                # NaN gaps fail the > 0 test and are dropped with the zeros
                gaps.update(diffs[diffs > 0].tolist())
    return min(gaps) if gaps else default


def analyze_orderbook(
    output_file='/Users/ranjanshahajishitole/Desktop/GIthub/Realtime-Data-API-Websocket/orderbook_analysis.csv',
):
    """Analyze order book data and calculate OBI and spread regime.

    Loads the data via ``load_data()``, adds the columns ``OBI_L10``,
    ``Spread``, ``SpreadRegime``, ``datetime`` (when a ``timestamp`` column
    exists) and ``PriceChange``, prints summary statistics, and writes the
    enriched frame to ``output_file`` as CSV.

    Args:
        output_file: Destination path of the enriched CSV.

    Returns:
        The enriched DataFrame, or ``None`` when the data could not be loaded
        or the best bid/ask price columns are missing.
    """
    # Load the data
    df = load_data()
    if df is None:
        return None

    # Calculate OBI for level 10
    print("Calculating Order Book Imbalance (OBI)...")
    obi_10, _, _ = calculate_obi(df, level=10)
    df['OBI_L10'] = obi_10

    # Calculate spread
    print("Calculating spread...")
    spread = calculate_spread(df)
    if spread is None:
        # calculate_spread has already reported the missing columns; stop here
        # rather than storing None and failing later in regime classification.
        return None
    df['Spread'] = spread

    # Determine tick size from data (assuming minimum price increment)
    tick_size = _estimate_tick_size(df)
    print(f"Estimated tick size: {tick_size}")

    # Classify spread regime
    print("Classifying spread regimes...")
    df['SpreadRegime'] = classify_spread_regime(df['Spread'], tick_size)

    # Add timestamp column in a more usable format (values are interpreted
    # as epoch seconds)
    if 'timestamp' in df.columns:
        df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')

    # Basic statistics
    print("\n====== Order Book Analysis ======")
    print(f"Average OBI (L10): {df['OBI_L10'].mean():.4f}")
    print(f"Average Spread: {df['Spread'].mean():.4f}")
    print("\nSpread Regime Distribution:")
    print(df['SpreadRegime'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))

    # OBI statistics by spread regime
    print("\nOBI by Spread Regime:")
    regime_stats = df.groupby('SpreadRegime')['OBI_L10'].agg(['mean', 'std', 'min', 'max'])
    print(regime_stats)

    # Price movement analysis based on OBI
    if 'bid_price_1' in df.columns:
        df['PriceChange'] = df['bid_price_1'].diff()

        # diff() is the move INTO each row. Shift it back one row so every OBI
        # reading is paired with the move that FOLLOWS it, which is what the
        # "after ... pressure" statistics below report.
        next_change = df['PriceChange'].shift(-1)
        high_buy_pressure = next_change[df['OBI_L10'] > 0.3].mean()
        high_sell_pressure = next_change[df['OBI_L10'] < -0.3].mean()

        print("\nPrice Movement Analysis:")
        print(f"Avg price change after high buy pressure (OBI>0.3): {high_buy_pressure:.4f}")
        print(f"Avg price change after high sell pressure (OBI<-0.3): {high_sell_pressure:.4f}")

    # Save enriched data
    df.to_csv(output_file, index=False)
    print(f"\nEnriched data saved to: {output_file}")

    return df

def plot_obi_analysis(
    df,
    output_dir='/Users/ranjanshahajishitole/Desktop/GIthub/Realtime-Data-API-Websocket',
):
    """Generate plots to visualize OBI and spread regime analysis.

    Produces two image files inside ``output_dir``:

    - ``obi_analysis.png``: OBI over time, spread coloured by regime, and
      best bid price coloured by OBI.
    - ``obi_distribution.png``: OBI histograms, one per spread regime.

    Args:
        df: DataFrame with ``OBI_L10`` and ``SpreadRegime`` columns;
            ``Spread`` and ``bid_price_1`` are plotted when present. If it has
            no ``datetime`` column but has ``timestamp``, a ``datetime``
            column is added to ``df`` in place.
        output_dir: Directory that receives the two PNG files.

    Returns:
        None. Nothing is drawn when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        return

    # Ensure we have a datetime column
    if 'datetime' not in df.columns and 'timestamp' in df.columns:
        df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    # Fall back to the row index so frames without any time column still plot
    times = df['datetime'] if 'datetime' in df.columns else df.index
    obi = df['OBI_L10']

    # Create figure with subplots
    fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)
    obi_ax, spread_ax, price_ax = axes

    # Plot 1: OBI over time, shaded green above zero and red below
    obi_ax.plot(times, obi, 'b-', linewidth=1)
    obi_ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    obi_ax.fill_between(times, obi, 0, where=(obi > 0), color='green', alpha=0.3)
    obi_ax.fill_between(times, obi, 0, where=(obi < 0), color='red', alpha=0.3)
    obi_ax.set_title('Order Book Imbalance (Level 10)')
    obi_ax.set_ylabel('OBI')
    obi_ax.grid(True, alpha=0.3)

    # Plot 2: Spread over time with regime color coding
    if 'Spread' in df.columns:
        regimes = pd.Categorical(df['SpreadRegime'])
        scatter = spread_ax.scatter(times, df['Spread'],
                                    c=regimes.codes,
                                    cmap='viridis', s=10, alpha=0.7)

        # Legend labels must follow the categorical's own ordering: code i
        # colours the points of regimes.categories[i], which is generally not
        # the order in which the regimes first appear in the data.
        legend_labels = list(regimes.categories)
        handles = [plt.Line2D([0], [0], marker='o', color='w',
                              markerfacecolor=scatter.cmap(scatter.norm(code)),
                              markersize=8)
                   for code in range(len(legend_labels))]
        spread_ax.legend(handles, legend_labels, loc='upper right')

        spread_ax.set_title('Spread and Regime Classification')
        spread_ax.set_ylabel('Spread')
        spread_ax.grid(True, alpha=0.3)

    # Plot 3: Price levels with OBI heatmap
    if 'bid_price_1' in df.columns:
        sc = price_ax.scatter(times, df['bid_price_1'], c=obi,
                              cmap='coolwarm', vmin=-1, vmax=1, s=10)
        fig.colorbar(sc, ax=price_ax, label='OBI')
        price_ax.set_title('Price Level with OBI Heatmap')
        price_ax.set_ylabel('Price')
        price_ax.grid(True, alpha=0.3)

    # X-axis formatting
    price_ax.set_xlabel('Time')
    fig.tight_layout()

    # Save the figure
    fig.savefig(f"{output_dir}/obi_analysis.png", dpi=300)
    print("Analysis plot saved to: obi_analysis.png")

    # Additional plot: OBI distribution by spread regime
    plt.figure(figsize=(10, 6))
    for regime in df['SpreadRegime'].unique():
        regime_data = df.loc[df['SpreadRegime'] == regime, 'OBI_L10']
        plt.hist(regime_data, bins=50, alpha=0.5, label=regime)

    plt.title('OBI Distribution by Spread Regime')
    plt.xlabel('OBI')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/obi_distribution.png", dpi=300)
    print("OBI distribution plot saved to: obi_distribution.png")

if __name__ == "__main__":
    # Script entry point: run the analysis, then plot its result.
    print("Starting Order Book Imbalance (OBI) Analysis")
    # analyze_orderbook() returns None when the input data could not be loaded
    df = analyze_orderbook()
    if df is not None:
        # Only plot when the analysis actually produced an enriched frame
        plot_obi_analysis(df)
    print("Analysis complete!")