# Sequence Preparation Demo

This notebook demonstrates the complete feature engineering and sequence preparation pipeline for market data.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import torch
from pathlib import Path

# Add src to path
sys.path.append('../')

from src.features.feature_engineering import compute_base_features, normalize_features, select_feature_columns
from src.data.sequence_builder import build_sequences, build_targets, MarketSequenceDataset, save_sequences, load_sequences
from src.data.dataloader_factory import get_dataloader, create_train_val_test_loaders
from src.utils.config_loader import load_config

# Set up plotting
# Select matplotlib's built-in 'default' style sheet for all figures below.
plt.style.use('default')
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output. Kept after the style call; the order is left untouched.
%matplotlib inline

# Only reached if every import above, including the project-local `src`
# modules located via the sys.path entry, resolved without raising.
print("✅ All imports successful")

## 1. Load Configuration and Data

In [None]:
# Read the project configuration and echo the two sections this notebook reports
config = load_config('../configs/config.yaml')
print("Configuration loaded:")
for section_label, section_key in (("Features", "features"), ("Preprocessing", "preprocessing")):
    # A missing section is displayed as an empty dict instead of raising
    print(f"{section_label} config: {config.get(section_key, {})}")

In [None]:
# Read the preprocessed sample CSV into a DataFrame
data_path = '../data/processed/sample_preprocessed.csv'
df = pd.read_csv(data_path)

# Report the frame's dimensions and column names in a single print call
column_names = list(df.columns)
print(
    f"Loaded data shape: {df.shape}",
    f"Columns: {column_names}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell: rendered as a rich table by the notebook
df.head()

## 2. Feature Engineering

In [None]:
# Derive the engineered feature columns from the loaded frame
print("Computing base features...")
df_features = compute_base_features(df)

print(f"After feature engineering: {df_features.shape}")
# Columns present after feature engineering but absent from the input frame,
# kept in the order they appear in df_features
is_new_column = ~df_features.columns.isin(df.columns)
new_columns = df_features.columns[is_new_column].tolist()
print(f"New feature columns: {new_columns}")

# Preview a fixed subset of the computed features
preview_columns = [
    'mid_price',
    'spread',
    'log_return',
    'order_imbalance',
    'trade_intensity',
    'rolling_vol_10',
]
feature_sample = df_features.loc[:, preview_columns].head()
print("\nSample of computed features:")
feature_sample

In [None]:
# Columns handed to normalize_features
feature_cols_to_normalize = [
    'mid_price',
    'spread',
    'order_imbalance',
    'trade_intensity',
    'rolling_vol_10',
]
print(f"Normalizing features: {feature_cols_to_normalize}")

# normalize_features returns the frame plus a mapping whose keys are printed below
df_normalized, scalers = normalize_features(df_features, feature_cols_to_normalize)

print(f"After normalization: {df_normalized.shape}")
scaler_names = list(scalers.keys())
print(f"Scalers created for: {scaler_names}")

# This cell relies on the normalized frame exposing a '<col>_z' column per input column
norm_cols = [col + '_z' for col in feature_cols_to_normalize]
norm_sample = df_normalized.loc[:, norm_cols].head()
print("\nSample of normalized features:")
norm_sample

In [None]:
# Sanity check: each normalized column should have mean ≈ 0 and std ≈ 1
print("Normalization verification:")
for col in norm_cols:
    # Skip any expected normalized column that is not in the frame
    if col not in df_normalized.columns:
        continue
    normalized_series = df_normalized[col]
    mean_val = normalized_series.mean()
    std_val = normalized_series.std()
    print(f"{col}: mean={mean_val:.6f}, std={std_val:.6f}")

## 3. Feature Visualization

In [None]:
# Plot original vs normalized features, one panel per feature.
# The grid is sized from the number of features so that no feature is silently
# skipped when the list grows beyond six entries (the previous fixed 2x3 grid
# dropped anything past the sixth). For the current five features this yields
# the same 2x3 layout and the same (18, 10) figure size as before.
n_plot_cols = 3
n_plot_features = len(feature_cols_to_normalize)
n_plot_rows = max(1, -(-n_plot_features // n_plot_cols))  # ceiling division

fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(6 * n_plot_cols, 5 * n_plot_rows),
    squeeze=False,  # always return a 2-D array so flatten() is valid for any grid
)
axes = axes.flatten()

for ax, col in zip(axes, feature_cols_to_normalize):
    # Plot original and normalized series on the same axes
    ax.plot(df_normalized[col], label=f'Original {col}', alpha=0.7)
    ax.plot(df_normalized[f'{col}_z'], label=f'Normalized {col}', alpha=0.7)

    ax.set_title(f'{col} - Original vs Normalized')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Remove the unused panels at the end of the last grid row
for unused_ax in axes[n_plot_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 4. Sequence Building

In [None]:
# Use the normalized ('_z') version of every feature as sequence input
feature_cols = [col + '_z' for col in feature_cols_to_normalize]
print(f"Feature columns for sequences: {feature_cols}")

# Windowing parameters passed to build_sequences
seq_len = 50
step = 5

print(f"Building sequences with seq_len={seq_len}, step={step}")
window_options = dict(
    seq_len=seq_len,
    step=step,
    mode='sliding',
    feature_cols=feature_cols,
)
sequences = build_sequences(df_normalized, **window_options)

# Report the resulting array; it is indexed below on three axes
print(f"Sequences shape: {sequences.shape}")
for dimension_label, dimension_size in (
    ("Number of sequences", len(sequences)),
    ("Sequence length", sequences.shape[1]),
    ("Number of features", sequences.shape[2]),
):
    print(f"{dimension_label}: {dimension_size}")

In [None]:
# Build targets
print("Building targets...")
targets = build_targets(
    df_normalized,
    horizon=1,
    target_col='mid_price',
    target_type='return'
)

# Align targets with sequences.
# NOTE(review): assumes build_targets returns one forward-looking target per
# input row and that 'sliding' mode starts window i at row i * step — confirm
# against src.data.sequence_builder.
# Under that assumption the label for window i is the target of the LAST row
# inside the window (row i * step + seq_len - 1). Simply taking the first
# len(sequences) targets would pair window i with row i, which lies inside or
# before the window and therefore leaks information into the label.
target_values = np.asarray(targets)
window_end_rows = np.arange(len(sequences)) * step + (seq_len - 1)

# A window is usable only if its end row has a target and that target is finite
# (the last rows typically have no future value to compute a return from).
in_range = window_end_rows < len(target_values)
has_target = np.zeros(len(sequences), dtype=bool)
has_target[in_range] = np.isfinite(target_values[window_end_rows[in_range]])

# Keep the leading run of usable windows. Trimming only from the tail keeps
# "sequence i <-> window i" true, so re-running this cell gives the same result.
n_usable = len(sequences) if has_target.all() else int(np.argmin(has_target))
if has_target[n_usable:].any():
    print(f"⚠️ Window {n_usable} has no valid target; later windows are dropped as well")
if n_usable < len(sequences):
    print(f"Dropping {len(sequences) - n_usable} trailing sequence(s) without a valid target")

sequences = sequences[:n_usable]
targets_aligned = target_values[window_end_rows[:n_usable]]
assert len(sequences) == len(targets_aligned), "Sequences and targets are misaligned!"

print(f"Targets shape: {targets_aligned.shape}")
print(f"Target statistics:")
print(f"  Mean: {targets_aligned.mean():.6f}")
print(f"  Std: {targets_aligned.std():.6f}")
print(f"  Min: {targets_aligned.min():.6f}")
print(f"  Max: {targets_aligned.max():.6f}")

## 5. Save and Load Sequences

In [None]:
# Save sequences together with the parameters needed to interpret them
output_path = '../data/processed/sequences/sample_sequences.npz'
metadata = {
    'feature_cols': feature_cols,
    'seq_len': seq_len,
    'step': step,
    'n_original_samples': len(df_normalized)
}

# Make sure the destination directory exists so the save cannot fail on a
# fresh checkout. Path is already imported in the notebook's import cell, so
# it is not re-imported here.
output_file = Path(output_path)
output_file.parent.mkdir(parents=True, exist_ok=True)

print(f"Saving sequences to {output_path}")
save_sequences(sequences, targets_aligned, output_path, metadata)

# Verify file was created
if output_file.exists():
    file_size = output_file.stat().st_size / 1024  # KB
    print(f"✅ Sequences saved successfully ({file_size:.1f} KB)")
else:
    print("❌ Failed to save sequences")

In [None]:
# Round-trip check: read the saved file back and compare with the in-memory arrays
print("Loading sequences from file...")
loaded_sequences, loaded_targets, loaded_metadata = load_sequences(output_path)

print(
    f"Loaded sequences shape: {loaded_sequences.shape}",
    f"Loaded targets shape: {loaded_targets.shape}",
    f"Loaded metadata: {loaded_metadata}",
    sep="\n",
)

# Verify data integrity: what was written must equal what was read
for in_memory, from_disk, failure_message in (
    (sequences, loaded_sequences, "Sequences don't match!"),
    (targets_aligned, loaded_targets, "Targets don't match!"),
):
    assert np.array_equal(in_memory, from_disk), failure_message
print("✅ Data integrity verified")

## 6. PyTorch Dataset and DataLoader

In [None]:
# Wrap the loaded arrays and metadata in the project's dataset class
dataset = MarketSequenceDataset(loaded_sequences, loaded_targets, loaded_metadata)

print(f"Dataset length: {len(dataset)}")

# Fetch the first item; indexing yields a (sequence, target, metadata) triple
sample_seq, sample_target, sample_metadata = dataset[0]

for item_label, item in (("sequence", sample_seq), ("target", sample_target)):
    print(f"Sample {item_label} shape: {item.shape}")
    print(f"Sample {item_label} dtype: {item.dtype}")
print(f"Sample metadata: {sample_metadata}")

In [None]:
# Build a shuffled DataLoader straight from the saved sequence file
dataloader = get_dataloader(output_path, batch_size=8, shuffle=True)

print(f"DataLoader created with {len(dataloader.dataset)} samples")
print(f"Number of batches: {len(dataloader)}")

# Inspect only the first batch; nothing is printed if the loader yields none
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    batch_sequences, batch_targets, batch_metadata = first_batch
    print(f"\nSample batch:")
    print(f"  Batch sequences shape: {batch_sequences.shape}")
    print(f"  Batch targets shape: {batch_targets.shape}")
    print(f"  Batch metadata keys: {list(batch_metadata.keys())}")

    # Value ranges of the batch
    sequence_low, sequence_high = batch_sequences.min(), batch_sequences.max()
    target_low, target_high = batch_targets.min(), batch_targets.max()
    print(f"  Sequence value range: [{sequence_low:.4f}, {sequence_high:.4f}]")
    print(f"  Target value range: [{target_low:.4f}, {target_high:.4f}]")

## 7. Train/Val/Test Split

In [None]:
# Split the loaded sequences into train / validation / test DataLoaders
split_options = dict(
    batch_size=16,
    train_frac=0.7,
    val_frac=0.2,
    save_dir='../data/processed/sequences',
)
train_loader, val_loader, test_loader = create_train_val_test_loaders(
    loaded_sequences,
    loaded_targets,
    **split_options,
)

named_loaders = (('Train', train_loader), ('Val', val_loader), ('Test', test_loader))

# Size of each split
for name, loader in named_loaders:
    print(f"{name} loader: {len(loader.dataset)} samples, {len(loader)} batches")

# Test each loader by pulling its first batch (skipped if a loader yields none)
for name, loader in named_loaders:
    first_batch = next(iter(loader), None)
    if first_batch is None:
        continue
    batch_seq, batch_targets, batch_metadata = first_batch
    print(f"{name} batch shape: {batch_seq.shape}")

## 8. Sequence Visualization

In [None]:
# Pick one sequence and its target for an interactive per-feature view
sample_idx = 0
sample_sequence = loaded_sequences[sample_idx]  # indexed below as [time step, feature]
sample_target = loaded_targets[sample_idx]

print(f"Visualizing sequence {sample_idx}")
print(f"Sequence shape: {sample_sequence.shape}")
print(f"Target value: {sample_target:.6f}")

# One stacked Plotly panel per feature, all sharing the x axis
n_panels = len(feature_cols)
time_steps = list(range(seq_len))

fig = make_subplots(
    rows=n_panels,
    cols=1,
    subplot_titles=feature_cols,
    shared_xaxes=True,
    vertical_spacing=0.02,
)

# Plotly rows are 1-based, feature columns in the array are 0-based
for panel_row, feature_name in enumerate(feature_cols, start=1):
    feature_trace = go.Scatter(
        x=time_steps,
        y=sample_sequence[:, panel_row - 1],
        mode='lines+markers',
        name=feature_name,
        line={'width': 2},
        marker={'size': 4},
    )
    fig.add_trace(feature_trace, row=panel_row, col=1)

fig.update_layout(
    height=200 * n_panels,
    title=f'Sample Sequence {sample_idx} (Target: {sample_target:.6f})',
    showlegend=False,
)

# Only the bottom panel carries the x-axis label
fig.update_xaxes(title_text="Time Step", row=n_panels, col=1)

fig.show()

In [None]:
# Overlay the first few sequences for a single feature
n_sequences_to_plot = 3
feature_to_plot = 0  # Index into feature_cols / the last axis of each sequence

fig = go.Figure()
time_steps = list(range(seq_len))

# Slicing caps the count at the number of available sequences
for i, sequence in enumerate(loaded_sequences[:n_sequences_to_plot]):
    sequence_trace = go.Scatter(
        x=time_steps,
        y=sequence[:, feature_to_plot],
        mode='lines',
        name=f'Sequence {i} (target: {loaded_targets[i]:.4f})',
        line={'width': 2},
    )
    fig.add_trace(sequence_trace)

layout_options = {
    'title': f'Multiple Sequences - {feature_cols[feature_to_plot]}',
    'xaxis_title': 'Time Step',
    'yaxis_title': 'Normalized Value',
    'hovermode': 'x unified',
}
fig.update_layout(**layout_options)

fig.show()

## 9. Summary and Next Steps

In [None]:
# Final report: pipeline statistics, intended uses, and follow-up work.
# Section headers after the first start with a newline to leave a blank line.
train_size = len(train_loader.dataset)
val_size = len(val_loader.dataset)
test_size = len(test_loader.dataset)

report_lines = [
    "=== SEQUENCE PREPARATION PIPELINE SUMMARY ===",
    f"✅ Original data: {df.shape[0]} samples, {df.shape[1]} columns",
    f"✅ Feature engineering: Added {len(new_columns)} new features",
    f"✅ Normalization: {len(feature_cols)} features normalized",
    f"✅ Sequences: {len(loaded_sequences)} sequences of length {seq_len}",
    f"✅ Features per sequence: {len(feature_cols)}",
    f"✅ Targets: {len(loaded_targets)} return predictions",
    f"✅ Data splits: Train({train_size}), Val({val_size}), Test({test_size})",
    f"✅ Saved to: {output_path}",
    "\n=== READY FOR RL AGENT TRAINING ===",
    "The sequences are now ready to be used for:",
    "• Reinforcement Learning agent training",
    "• Supervised learning models",
    "• Time series forecasting",
    "• Anomaly detection",
    "\n=== NEXT STEPS ===",
    "1. Implement RL agent architecture",
    "2. Define reward functions",
    "3. Set up training loop",
    "4. Integrate with rule-based systems",
    "5. Add explainability components",
]

for report_line in report_lines:
    print(report_line)