# Large Data Loading Example

This notebook demonstrates how to use the `LargeDataLoader` to:
1. Load subsets from large institutional datasets (5000+ securities)
2. Apply corporate action adjustments (splits, dividends)
3. Handle time-varying sector classifications
4. Convert data to backtester format
5. Run a backtest with the loaded data

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from pathlib import Path

from backtesting import (
    LargeDataLoader,
    convert_to_backtester_format,
    Backtester,
    BacktestConfig,
    DataManager,
    SignalBasedTradeGenerator
)

## Step 1: Create Sample Large Data Files

In practice, you would have these files from your data provider. Here we create synthetic data for demonstration.

In [None]:
# Directory that will hold the synthetic "large" data files written below.
large_data_dir = Path('../data/large_data')
large_data_dir.mkdir(parents=True, exist_ok=True)

# Demo universe: TICK0000 .. TICK0099, standing in for the 5000+ names of a
# production data set.
all_tickers = ['TICK%04d' % idx for idx in range(100)]

# Business-day calendar (freq='B': weekdays only, no holiday calendar).
dates = pd.date_range(start='2023-01-01', end='2024-12-31', freq='B')

first_day, last_day = dates[0], dates[-1]
print(f"Created universe of {len(all_tickers)} tickers")
print(f"Date range: {first_day} to {last_day} ({len(dates)} trading days)")

In [None]:
# Synthetic daily prices: one geometric random walk per ticker, scaled by 100.
# The seed is set only here; the generation cells that follow draw from the
# same global NumPy RNG stream, so their output depends on running in order.
np.random.seed(42)
price_data = []

for symbol in all_tickers:
    # Daily log-returns ~ N(0.0005, 0.02), compounded into a price path.
    log_returns = np.random.normal(0.0005, 0.02, len(dates))
    price_path = 100 * np.exp(np.cumsum(log_returns))

    # Long format: one record per (date, ticker) pair.
    price_data.extend(
        {'date': day, 'ticker': symbol, 'price': level}
        for day, level in zip(dates, price_path)
    )

prices_df = pd.DataFrame(price_data)
print(f"Created price data: {len(prices_df)} rows")

# Persist as Parquet (the format recommended here for large files).
prices_path = large_data_dir / 'prices_large.parquet'
prices_df.to_parquet(prices_path, index=False)
print(f"Saved to {prices_path}")

In [None]:
# Synthetic corporate actions: a single 2-for-1 split (factor 0.5) for each of
# the first 20 tickers, dated at random within the second half of the calendar.
second_half_dates = dates[len(dates)//2:]

adjustment_data = [
    {
        'date': np.random.choice(second_half_dates),
        'ticker': symbol,
        'adjustment_factor': 0.5,  # 2-for-1 split
        'event_type': 'split',
    }
    for symbol in all_tickers[:20]
]

adjustments_df = pd.DataFrame(adjustment_data)
adjustments_path = large_data_dir / 'price_adjustments.parquet'
print(f"Created {len(adjustments_df)} price adjustments")
adjustments_df.to_parquet(adjustments_path, index=False)
print(f"Saved to {adjustments_path}")

print("\nSample adjustments:")
print(adjustments_df.head())

In [None]:
# Synthetic ADV (Average Daily Volume): each ticker gets a base level drawn
# from U(500k, 10M), then every date applies an independent U(0.8, 1.2) factor.
adv_data = []

for symbol in all_tickers:
    ticker_base_adv = np.random.uniform(500_000, 10_000_000)

    # One scalar draw per date, in calendar order.
    adv_data.extend(
        {
            'date': day,
            'ticker': symbol,
            'adv': ticker_base_adv * np.random.uniform(0.8, 1.2),
        }
        for day in dates
    )

adv_df = pd.DataFrame(adv_data)
adv_df.to_parquet(large_data_dir / 'adv_large.parquet', index=False)
print(f"Created ADV data: {len(adv_df)} rows")

In [None]:
# Synthetic betas: each ticker gets a base beta from U(0.5, 1.5); every date
# adds independent N(0, 0.05) noise around that base (no day-to-day persistence).
beta_data = []

for symbol in all_tickers:
    ticker_base_beta = np.random.uniform(0.5, 1.5)

    # One scalar draw per date, in calendar order.
    beta_data.extend(
        {
            'date': day,
            'ticker': symbol,
            'beta': ticker_base_beta + np.random.normal(0, 0.05),
        }
        for day in dates
    )

beta_df = pd.DataFrame(beta_data)
beta_df.to_parquet(large_data_dir / 'betas_large.parquet', index=False)
print(f"Created beta data: {len(beta_df)} rows")

In [None]:
# Dated sector classifications: every ticker gets a sector effective on the
# first calendar date; roughly 10% also get a second, different sector that
# becomes effective on a random date in the second half of the calendar.
sectors = ['Technology', 'Healthcare', 'Financials', 'Energy', 'Consumer Discretionary']
sector_data = []
late_dates = dates[len(dates)//2:]

for symbol in all_tickers:
    starting_sector = np.random.choice(sectors)
    sector_data.append({'date': dates[0], 'ticker': symbol, 'sector': starting_sector})

    # Most tickers keep their initial sector for the whole period.
    if not (np.random.random() < 0.1):
        continue

    # Reclassification: pick the effective date first, then the new sector.
    effective_date = np.random.choice(late_dates)
    other_sectors = [name for name in sectors if name != starting_sector]
    replacement_sector = np.random.choice(other_sectors)
    sector_data.append({'date': effective_date, 'ticker': symbol, 'sector': replacement_sector})

sector_df = pd.DataFrame(sector_data)
sector_df.to_parquet(large_data_dir / 'sector_mapping_dated.parquet', index=False)

# A ticker with more than one dated entry changed sector at some point.
entries_per_ticker = sector_df.groupby('ticker').size()
print(f"Created sector mapping: {len(sector_df)} entries")
print(f"Tickers with sector changes: {entries_per_ticker.gt(1).sum()}")

## Step 2: Load Data for a Specific Universe and Date Range

Now we'll use the `LargeDataLoader` to load data for a subset of securities.

In [None]:
# Backtest scope: the first 30 tickers of the full universe over H2 2023.
backtest_universe = all_tickers[:30]
start_date, end_date = '2023-06-01', '2023-12-31'

# Loader pointed at the directory of Parquet files written above.
# use_float32=True is requested to save memory.
loader = LargeDataLoader(data_dir=str(large_data_dir), use_float32=True)

print(f"Loading data for {len(backtest_universe)} securities")
print(f"Date range: {start_date} to {end_date}")

In [None]:
# Request prices for the backtest universe/window, asking the loader to apply
# the corporate-action adjustments stored alongside the raw prices.
price_request = {
    'universe': backtest_universe,
    'start_date': start_date,
    'end_date': end_date,
    'prices_file': 'prices_large.parquet',
    'adjustments_file': 'price_adjustments.parquet',
    'apply_adjustments': True,
}
prices = loader.load_prices_with_adjustments(**price_request)

first_loaded, last_loaded = prices.index[0], prices.index[-1]
print(f"Loaded prices: {prices.shape}")
print(f"Date range: {first_loaded} to {last_loaded}")
print(f"\nFirst few rows:")
print(prices.head())

In [None]:
# ADV for the same universe and window as the prices.
adv_request = {
    'universe': backtest_universe,
    'start_date': start_date,
    'end_date': end_date,
    'adv_file': 'adv_large.parquet',
}
adv = loader.load_adv(**adv_request)

print(f"Loaded ADV: {adv.shape}")
print(f"\nSample ADV values:")
# First row of the loaded frame.
print(adv.iloc[0])

In [None]:
# Betas for the same universe and window as the prices.
beta_request = {
    'universe': backtest_universe,
    'start_date': start_date,
    'end_date': end_date,
    'beta_file': 'betas_large.parquet',
}
betas = loader.load_betas(**beta_request)

print(f"Loaded betas: {betas.shape}")
print(f"\nSample beta values:")
# First row of the loaded frame.
print(betas.iloc[0])

In [None]:
# Sector classification for the universe as of the end of the backtest window
# (the sector file is dated, so the as-of date matters).
sector_request = {
    'universe': backtest_universe,
    'date': end_date,
    'sector_file': 'sector_mapping_dated.parquet',
}
sector_mapping = loader.load_sector_mapping_with_dates(**sector_request)

sector_counts = sector_mapping['sector'].value_counts()
print(f"Loaded sector mapping: {len(sector_mapping)} securities")
print(f"\nSector distribution:")
print(sector_counts)
print(f"\nSample mappings:")
print(sector_mapping.head(10))

## Step 3: Verify Price Adjustments

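Let's verify that corporate action adjustments were applied correctly. Note that the synthetic split dates generated above are drawn from the second half of the 2023–2024 calendar (i.e. 2024), while the loaded window ends on 2023-12-31, so the adjustment date itself will not appear in the loaded prices; with backward adjustment, every loaded price of an affected ticker is scaled by the adjustment factor instead.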

In [None]:
# Corporate actions from the synthetic adjustments table that touch our universe.
universe_adjustments = adjustments_df[adjustments_df['ticker'].isin(backtest_universe)]

if len(universe_adjustments) > 0:
    print(f"Found {len(universe_adjustments)} adjustments in our universe:")
    print(universe_adjustments)

    # Use the first adjustment as the worked example.
    first_event = universe_adjustments.iloc[0]
    example_ticker = first_event['ticker']
    adj_date = first_event['date']
    adj_factor = first_event['adjustment_factor']

    print(f"\nExample: {example_ticker} had {adj_factor}x adjustment on {adj_date}")
    print(f"This means prices BEFORE {adj_date} were multiplied by {adj_factor}")

    if example_ticker not in prices.columns:
        # Previously this case printed nothing, leaving the check silently skipped.
        print(f"\n{example_ticker} is not in the loaded prices; nothing to verify")
    else:
        ticker_prices = prices[example_ticker].dropna()
        adj_date_pd = pd.Timestamp(adj_date)

        if adj_date_pd in ticker_prices.index:
            # Show up to five observations on either side of the adjustment date.
            idx = ticker_prices.index.get_loc(adj_date_pd)
            window = ticker_prices.iloc[max(0, idx-5):min(len(ticker_prices), idx+6)]
            print(f"\nPrices around adjustment date:")
            print(window)
        elif ticker_prices.empty:
            print(f"\nNo non-missing loaded prices for {example_ticker}; nothing to verify")
        else:
            # The synthetic split dates come from the second half of the full
            # calendar, so they can fall outside the loaded window. Say so
            # explicitly instead of printing nothing.
            print(
                f"\nAdjustment date {adj_date_pd} is outside the loaded window "
                f"({ticker_prices.index[0]} to {ticker_prices.index[-1]})"
            )

            # Compare against the raw synthetic prices still in memory from the
            # data-generation step. If adjustments are applied backward, the
            # ratio should equal the adjustment factor on every loaded date
            # that precedes the split (up to float32 rounding).
            raw_prices = (
                prices_df.loc[prices_df['ticker'] == example_ticker]
                .set_index('date')['price']
            )
            ratio = (ticker_prices / raw_prices.reindex(ticker_prices.index)).dropna()
            if not ratio.empty:
                print(
                    f"Adjusted/raw price ratio over the loaded window: "
                    f"min {ratio.min():.4f}, max {ratio.max():.4f}"
                )
else:
    print("No adjustments in our selected universe")

## Step 4: Convert to Backtester Format

Convert the loaded data to CSV format compatible with the backtester.

In [None]:
# Write the loaded frames out in the layout the backtester reads.
output_dir = Path('../data/converted_backtest_data')

conversion_inputs = {
    'prices': prices,
    'adv': adv,
    'betas': betas,
    'sector_mapping': sector_mapping,
    'factor_exposures': None,  # no factor model in this example
    'output_dir': str(output_dir),
}
file_paths = convert_to_backtester_format(**conversion_inputs)

# Report where each converted data set was written.
print("Converted data saved to:")
for kind, location in file_paths.items():
    print(f"  {kind}: {location}")

## Step 5: Run Backtest with Loaded Data

Now use the converted data in a backtest.

In [None]:
# Read the converted files back through the backtester's own DataManager.
data_manager = DataManager(str(output_dir))

# Sanity check: report the size of everything the manager picked up.
loaded_prices = data_manager.prices
loaded_adv = data_manager.adv
loaded_betas = data_manager.betas
loaded_sectors = data_manager.sector_mapping

print(f"Loaded prices: {loaded_prices.shape}")
print(f"Loaded ADV: {loaded_adv.shape}")
print(f"Loaded betas: {loaded_betas.shape}")
print(f"Loaded sector mapping: {len(loaded_sectors)} securities")

In [None]:
# Create a simple momentum signal
def momentum_signal(prices_df, lookback=20):
    """Cross-sectional momentum signal from the latest trailing return.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Prices with one row per date and one column per security. Rows are
        assumed to be in chronological order, since the last row is treated
        as the most recent observation.
    lookback : int, default 20
        Number of rows over which the trailing percentage return is measured.

    Returns
    -------
    pandas.Series
        One value per column of ``prices_df``: the percentile rank of the
        trailing return, rescaled as ``2 * (rank - 0.5)``. The best performer
        maps to 1.0 and values lie in (-1, 1]. These are rescaled ranks, not
        z-scores. Securities whose trailing return is NaN get NaN.

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 1, or if ``prices_df`` has fewer than
        ``lookback + 1`` rows. Without this check the function would return an
        all-NaN signal (or fail with an opaque IndexError on an empty frame).
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")
    if len(prices_df) <= lookback:
        raise ValueError(
            f"need at least {lookback + 1} rows of prices for lookback={lookback}, "
            f"got {len(prices_df)}"
        )

    # Trailing return over `lookback` rows; only the most recent row is used.
    returns = prices_df.pct_change(lookback)
    latest_returns = returns.iloc[-1]

    # Percentile rank across securities (higher return -> higher rank).
    ranks = latest_returns.rank(pct=True)

    # Centre and rescale the percentile ranks.
    signals = (ranks - 0.5) * 2

    return signals

# Compute one static signal from the full price history held by the manager.
# NOTE(review): the signal is taken from the LAST row of that history, so if
# the backtest below trades on it over the same window this introduces
# look-ahead -- confirm how SignalBasedTradeGenerator consumes the signal.
signal = momentum_signal(data_manager.prices, lookback=20)

strongest = signal.nlargest(5)
weakest = signal.nsmallest(5)

print(f"Generated signal for {len(signal)} securities")
print(f"\nTop 5 signals:")
print(strongest)
print(f"\nBottom 5 signals:")
print(weakest)

In [None]:
# Backtest settings, grouped by concern.
sizing_and_costs = dict(
    initial_capital=1_000_000,
    max_position_size=0.15,       # 15% per position
    max_adv_participation=0.10,   # 10% ADV limit
    transaction_cost_bps=10,
)
hedging = dict(
    enable_beta_hedge=True,
    target_beta=0.0,
    enable_sector_hedge=True,
    sector_hedge_method='proportional',
)
config = BacktestConfig(**sizing_and_costs, **hedging)

# Trade generator driven by the momentum signal
# (target_positions=15: long the top 15, per the original example).
trade_gen = SignalBasedTradeGenerator(signal=signal, target_positions=15)

print("Backtest configuration:")
print(f"  Initial capital: ${config.initial_capital:,.0f}")
print(f"  Max position size: {config.max_position_size:.1%}")
print(f"  Beta hedge: {config.enable_beta_hedge}")
print(f"  Sector hedge: {config.enable_sector_hedge}")

In [None]:
# Wire the configuration and data together, then run with weekly rebalancing.
backtester = Backtester(config, data_manager)
results = backtester.run(trade_gen, rebalance_frequency='weekly')

# Headline performance figures reported by the results object.
trade_count = len(results.trades)
print("\nBacktest completed!")
print(f"Total return: {results.total_return:.2%}")
print(f"Sharpe ratio: {results.sharpe_ratio:.2f}")
print(f"Max drawdown: {results.max_drawdown:.2%}")
print(f"Total trades: {trade_count}")

In [None]:
# Show portfolio evolution
import matplotlib.pyplot as plt

# One (date, total value) observation per recorded portfolio state.
portfolio_values = [state.total_value for state in results.states]
dates_list = [state.date for state in results.states]

if portfolio_values:
    # Explicit figure/axes interface rather than the pyplot state machine.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(dates_list, portfolio_values)
    ax.set_title('Portfolio Value Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Portfolio Value ($)')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    print(f"\nFinal portfolio value: ${portfolio_values[-1]:,.2f}")
    print(f"Initial capital: ${config.initial_capital:,.2f}")
    print(f"P&L: ${portfolio_values[-1] - config.initial_capital:,.2f}")
else:
    # Without this guard, portfolio_values[-1] raises IndexError when the
    # backtest recorded no states.
    print("\nNo portfolio states were recorded; nothing to plot")

## Step 6: Performance Breakdown

In [None]:
# Analyze trades
# Fix the column schema up front so the frame has the expected columns even
# when the backtest produced no trades (otherwise trades_df['value'] below
# raises KeyError on an empty, column-less frame).
trade_columns = ['date', 'ticker', 'shares', 'price', 'value', 'cost']
trades_df = pd.DataFrame(
    [
        {
            'date': t.date,
            'ticker': t.ticker,
            'shares': t.shares,
            'price': t.price,
            'value': abs(t.shares * t.price),  # traded notional, sign-free
            'cost': t.transaction_cost,
        }
        for t in results.trades
    ],
    columns=trade_columns,
)

print("Trade Statistics:")
print(f"  Total trades: {len(trades_df)}")

if trades_df.empty:
    print("  No trades were executed")
else:
    print(f"  Total value traded: ${trades_df['value'].sum():,.2f}")
    print(f"  Total transaction costs: ${trades_df['cost'].sum():,.2f}")
    print(f"  Average cost per trade: ${trades_df['cost'].mean():.2f}")

    print(f"\nMost traded securities:")
    print(trades_df['ticker'].value_counts().head(10))

## Summary

This notebook demonstrated the complete workflow for using the `LargeDataLoader`:

1. **Created sample large data files** in Parquet format
2. **Loaded subsets** for a specific universe and date range
3. **Applied corporate action adjustments** backward from adjustment date
4. **Loaded time-varying sector classifications**
5. **Converted to backtester format** (CSV files)
6. **Ran a backtest** with the loaded data

### Key Advantages

- **Memory efficient**: Only loads needed securities and dates
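- **Fast**: Parquet is a typed, columnar binary format that is typically much faster to read than CSV, especially when only a subset of columns is needed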
- **Accurate**: Corporate actions applied correctly
- **Flexible**: Supports time-varying classifications
- **Reusable**: Converted data can be used in multiple backtests

### Production Usage

In production with 5000+ securities:
- Store data in Parquet format
- Use `use_float32=True` to save memory
- Load only the universe and date range needed
- Cache converted data for reuse
- Consider partitioning by date for even faster loading