# Exploratory Data Analysis - Preprocessing Pipeline

This notebook demonstrates the data preprocessing pipeline and provides exploratory analysis of the processed market data.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Add src to path
sys.path.append('../')

from src.data.preprocess_pipeline import MarketDataPipeline
from src.utils.config_loader import load_config

# Set up plotting: apply matplotlib's 'default' style sheet, then use the
# IPython magic so figures are rendered inline in the notebook output.
plt.style.use('default')
%matplotlib inline

## 1. Load Configuration and Run Pipeline

In [None]:
# Read the project configuration (path is relative to the notebook's directory)
# and echo it so the settings used for this run are visible in the output.
config = load_config('../configs/config.yaml')
print("Configuration loaded:", config, sep="\n")

In [None]:
# Build the preprocessing pipeline from the loaded configuration
pipeline = MarketDataPipeline(config)

# Input/output locations, relative to the notebook's directory
input_path = '../data/raw/sample_data.csv'
output_path = '../data/processed/eda_sample.csv'

# Make sure the output directory exists so this cell also works on a fresh
# checkout (Restart & Run All). Presumably the pipeline writes its result to
# `output_path`; creating the folder up front is harmless if it already
# exists or if the pipeline creates it itself.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Run preprocessing; the returned frame is what every later cell analyzes
df_processed = pipeline.run_full_pipeline(input_path, output_path)
print(f"Processed {len(df_processed)} rows")

## 2. Data Overview

In [None]:
# Quick structural overview of the processed frame: shape, column names, dtypes.
# Each heading and its value are emitted together, one per line.
print(f"Dataset Shape: {df_processed.shape}")
print("\nColumn Names:", df_processed.columns.tolist(), sep="\n")
print("\nData Types:", df_processed.dtypes, sep="\n")

In [None]:
# Preview the leading rows; as the cell's last expression the frame is
# rendered by the notebook as a table.
print("First 5 rows:")
df_processed.head(n=5)

In [None]:
# Descriptive statistics of the processed frame, with the quartiles that
# describe() reports by default spelled out explicitly.
print("Summary Statistics:")
df_processed.describe(percentiles=[0.25, 0.5, 0.75])

## 3. Time Series Visualization

In [None]:
# 2x2 overview of the processed series:
#   top-left: mid price      top-right: bid-ask spread
#   bottom-left: trade volume (optional)   bottom-right: bid/ask sizes
fig, axes = plt.subplots(2, 2, figsize=(15, 10))


def _format_panel(ax, title, xlabel, ylabel):
    """Apply the title, axis labels, and light grid shared by every panel."""
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)


# Mid price
axes[0, 0].plot(df_processed['timestamp'], df_processed['mid_price'], 'b-', linewidth=1)
_format_panel(axes[0, 0], 'Mid Price Over Time', 'Time', 'Mid Price')

# Spread
axes[0, 1].plot(df_processed['timestamp'], df_processed['spread'], 'r-', linewidth=1)
_format_panel(axes[0, 1], 'Bid-Ask Spread Over Time', 'Time', 'Spread')

# Trade volume (the column is optional)
if 'trade_volume' in df_processed.columns:
    # Bars are positioned by row number rather than by timestamp,
    # which is why the x-axis is labeled 'Time Bin'.
    axes[1, 0].bar(range(len(df_processed)), df_processed['trade_volume'], alpha=0.7)
    _format_panel(axes[1, 0], 'Trade Volume', 'Time Bin', 'Volume')
else:
    # No volume data: hide the panel instead of leaving blank, unlabeled axes
    axes[1, 0].set_axis_off()

# Bid/Ask sizes
axes[1, 1].plot(df_processed['timestamp'], df_processed['bid_size'], 'g-', label='Bid Size', alpha=0.7)
axes[1, 1].plot(df_processed['timestamp'], df_processed['ask_size'], 'orange', label='Ask Size', alpha=0.7)
_format_panel(axes[1, 1], 'Order Book Sizes', 'Time', 'Size')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

## 4. Interactive Plotly Visualization

In [None]:
# Interactive Plotly chart: bid, ask, and mid prices drawn as lines against
# the timestamp column. The mid price gets a thicker line than the other two.
fig = go.Figure()

for column, label, line_style in (
    ('bid_price', 'Bid Price', dict(color='green')),
    ('ask_price', 'Ask Price', dict(color='red')),
    ('mid_price', 'Mid Price', dict(color='blue', width=2)),
):
    price_trace = go.Scatter(
        x=df_processed['timestamp'],
        y=df_processed[column],
        mode='lines',
        name=label,
        line=line_style,
    )
    fig.add_trace(price_trace)

# 'x unified' shows a single hover box listing every trace at the hovered x value
layout_options = dict(
    title='Order Book Prices Over Time',
    xaxis_title='Time',
    yaxis_title='Price',
    hovermode='x unified',
)
fig.update_layout(**layout_options)

fig.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between all numeric columns
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
correlation_matrix = df_processed[numeric_cols].corr()

# Plot correlation heatmap.
# The color scale is pinned to [-1, 1] so the diverging colormap is centered
# on zero; without vmin/vmax matplotlib scales colors to the observed
# min/max, which makes the colors (and the text-contrast rule below)
# misleading.
plt.figure(figsize=(12, 8))
plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
plt.colorbar()
tick_positions = range(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=45, ha='right')
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Feature Correlation Matrix')

# Add correlation values: white text on strongly colored cells (|r| >= 0.5),
# black otherwise. Written so that NaN (e.g. from a constant column) falls
# into the black branch, because NaN cells are normally drawn blank and white
# text would be invisible there.
for i, row_values in enumerate(correlation_matrix.to_numpy()):
    for j, value in enumerate(row_values):
        text_color = 'white' if abs(value) >= 0.5 else 'black'
        plt.text(j, i, f'{value:.2f}', ha='center', va='center', color=text_color)

plt.tight_layout()
plt.show()

print("\nCorrelation Matrix:")
correlation_matrix

## 6. Normalized Features Analysis

In [None]:
# Find columns that carry the '_normalized' suffix
normalized_suffix = '_normalized'
normalized_cols = [col for col in df_processed.columns if col.endswith(normalized_suffix)]
print(f"Normalized columns: {normalized_cols}")

# Pair every normalized column with the column it was derived from.
# Only the trailing suffix is stripped (str.replace would also remove the
# text from the middle of a name), and columns whose original is not in the
# frame are skipped instead of raising a KeyError.
column_pairs = []
for norm_col in normalized_cols:
    orig_col = norm_col[:-len(normalized_suffix)]
    if orig_col in df_processed.columns:
        column_pairs.append((orig_col, norm_col))
    else:
        print(f"Skipping {norm_col}: original column '{orig_col}' not found")

if column_pairs:
    # One row per feature: original distribution on the left, normalized on
    # the right. squeeze=False keeps `axes` two-dimensional even for one row.
    fig, axes = plt.subplots(
        len(column_pairs), 2,
        figsize=(15, 4 * len(column_pairs)),
        squeeze=False,
    )

    for axes_row, (orig_col, norm_col) in zip(axes, column_pairs):
        panels = (
            (axes_row[0], orig_col, f'Original {orig_col}', 'Value', 'blue'),
            (axes_row[1], norm_col, f'Normalized {orig_col}', 'Normalized Value', 'red'),
        )
        for ax, col, title, xlabel, color in panels:
            values = df_processed[col]

            # Drop missing values before binning; NaNs break the histogram range
            ax.hist(values.dropna(), bins=20, alpha=0.7, color=color)
            ax.set_title(title)
            ax.set_xlabel(xlabel)
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3)

            # Add statistics (pandas mean/std skip missing values by default)
            ax.text(0.05, 0.95, f'μ={values.mean():.4f}\nσ={values.std():.4f}',
                    transform=ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()

## 7. Data Quality Summary

In [None]:
# Data quality summary: size, memory footprint, missing values, time span,
# and basic price statistics of the processed frame.
print("=== DATA QUALITY SUMMARY ===")
print(f"Total rows: {len(df_processed)}")
print(f"Total columns: {len(df_processed.columns)}")
# deep=True also counts the contents of object (e.g. string) columns
print(f"Memory usage: {df_processed.memory_usage(deep=True).sum() / 1024:.2f} KB")

print("\nMissing values:")
missing_counts = df_processed.isnull().sum()
if missing_counts.sum() == 0:
    print("✅ No missing values found")
else:
    # Only list the columns that actually contain missing values
    print(missing_counts[missing_counts > 0])

print("\nTime range:")
# Compute the bounds once and reuse them for the duration
time_start = df_processed['timestamp'].min()
time_end = df_processed['timestamp'].max()
print(f"Start: {time_start}")
print(f"End: {time_end}")
print(f"Duration: {time_end - time_start}")

# Guard each price column separately: previously 'spread' was read whenever
# 'mid_price' existed, which raises a KeyError if only 'mid_price' is present.
has_mid_price = 'mid_price' in df_processed.columns
has_spread = 'spread' in df_processed.columns
if has_mid_price or has_spread:
    print("\nPrice statistics:")
if has_mid_price:
    print(f"Mid price range: {df_processed['mid_price'].min():.4f} - {df_processed['mid_price'].max():.4f}")
if has_spread:
    print(f"Average spread: {df_processed['spread'].mean():.4f}")
    print(f"Max spread: {df_processed['spread'].max():.4f}")

print("\n=== PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY ===")