In [26]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pathlib import Path


INPUT_CURRENT_TIME = "19:45:0"      
INPUT_WINDOW_SIZE = 500              
INPUT_LOOK_AHEAD_SECONDS = 10         
INPUT_BUY_POWER = 100000.0           
FILE_PATH = "Dataset/BTC_1sec.csv"   
MAX_ROWS = 60_000                    
COLLINEARITY_BLEND = 0.99999999999     



In [27]:
# Mapping of candidate column names for different datasets
BOOK_SIZE_CANDIDATES = {
    1: {
        "bid": ["Bid_Sz_1", "bs1", "bids_1", "bids_notional_0", "bids_limit_notional_0", "bids_market_notional_0"],
        "ask": ["Ask_Sz_1", "as1", "asks_1", "asks_notional_0", "asks_limit_notional_0", "asks_market_notional_0"],
    },
    2: {
        "bid": ["Bid_Sz_2", "bs2", "bids_2", "bids_notional_1", "bids_limit_notional_1", "bids_market_notional_1"],
        "ask": ["Ask_Sz_2", "as2", "asks_2", "asks_notional_1", "asks_limit_notional_1", "asks_market_notional_1"],
    },
}

# ==========================================
# 1. DATA LOADING & PREPARATION
# ==========================================
def pick_column(df, candidates):
    for name in candidates:
        if name in df.columns:
            return name
    return None


def require_column(df, candidates, label):
    col = pick_column(df, candidates)
    if col is None:
        raise ValueError(f"Missing required column for {label}. Tried: {', '.join(candidates)}")
    return col


def load_and_prep_data(filepath, max_rows=None, blend=COLLINEARITY_BLEND):
    path = Path(filepath)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    df = pd.read_csv(path, nrows=max_rows)

    for extra in ["Unnamed: 0", "Unnamed: 0.1"]:
        if extra in df.columns:
            df.drop(columns=extra, inplace=True)

    time_col = require_column(df, ["Time", "time", "timestamp", "system_time"], "time")
    df.rename(columns={time_col: "Time"}, inplace=True)
    df["Time"] = pd.to_datetime(df["Time"], utc=True).dt.tz_localize(None)

    price_col = pick_column(df, ["Mid_Price", "midpoint", "midprice"])
    if price_col:
        if price_col != "Mid_Price":
            df.rename(columns={price_col: "Mid_Price"}, inplace=True)
    else:
        bid_col = require_column(df, ["Bid_Px_1", "bp1"], "top-of-book bid price")
        ask_col = require_column(df, ["Ask_Px_1", "ap1"], "top-of-book ask price")
        df["Mid_Price"] = (df[bid_col] + df[ask_col]) / 2

    df.sort_values("Time", inplace=True)
    df.reset_index(drop=True, inplace=True)

    for level, names in BOOK_SIZE_CANDIDATES.items():
        bid_sz = require_column(df, names["bid"], f"bid size level {level}")
        ask_sz = require_column(df, names["ask"], f"ask size level {level}")
        df[f"OFI_L{level}"] = df[bid_sz].diff().fillna(0) - df[ask_sz].diff().fillna(0)

    df["Feature_Main"] = df["OFI_L1"]
    df["Feature_Shadow"] = (blend * df["OFI_L1"]) + ((1 - blend) * df["OFI_L2"])

    df["d_Price_Next"] = df["Mid_Price"].diff().shift(-1)
    df.dropna(inplace=True)

    return df

# ==========================================
# 2. THE MATH ENGINES
# ==========================================

def solve_naive(X, y):
    XtX = X.T @ X
    try:
        XtX_inv = np.linalg.inv(XtX)
        return XtX_inv @ X.T @ y
    except np.linalg.LinAlgError:
        return np.zeros(X.shape[1])


def solve_qr(X, y):
    try:
        Q, R = np.linalg.qr(X, mode="reduced")
        return np.linalg.solve(R, Q.T @ y)
    except np.linalg.LinAlgError:
        return np.zeros(X.shape[1])



def run_simulation(df, target_time_str, window, look_ahead, capital):
    print()
    print("=" * 60)
    print("HFT SIMULATION REPORT")
    print(f"Time: {target_time_str} | Window: {window} ticks | Capital: ${capital:,.2f}")
    print("=" * 60)

    base_date = df["Time"].dt.date.iloc[0]
    target_dt = pd.Timestamp(f"{base_date} {target_time_str}")

    idx_now = df["Time"].searchsorted(target_dt)
    if idx_now >= len(df):
        raise ValueError(
            f"Requested time {target_time_str} is beyond the loaded data. "
            f"Increase MAX_ROWS (currently {MAX_ROWS}) or pick an earlier time before {df['Time'].iloc[-1].time()}"
        )
    if idx_now < window:
        raise ValueError("Not enough history for this window; reduce INPUT_WINDOW_SIZE or increase MAX_ROWS.")

    window_df = df.iloc[idx_now - window : idx_now]
    y_train = window_df["d_Price_Next"].values
    X_train = np.column_stack((
        np.ones(len(window_df)),
        window_df["Feature_Main"].values,
        window_df["Feature_Shadow"].values,
    ))

    cond_number = np.linalg.cond(X_train)
    print(f"MATRIX CONDITION NUMBER: {cond_number:.2e}")
    danger_cond = cond_number > 1e12
    warning_cond = cond_number > 1e10
    if danger_cond:
        print("  [!] EXTREME ILL-CONDITIONING: Naive inversion is likely garbage.")
    elif warning_cond:
        print("  [!] WARNING: Ill-conditioned; Naive inversion may be unreliable.")
    else:
        print("  [i] Matrix looks stable. (Increase COLLINEARITY_BLEND to stress it)")

    print("-" * 60)

    beta_naive = solve_naive(X_train, y_train)
    beta_qr = solve_qr(X_train, y_train)
    naive_nonfinite = not np.isfinite(beta_naive).all()
    naive_zeroed = np.allclose(beta_naive, 0, atol=1e-12)
    if (warning_cond or danger_cond) and (naive_nonfinite or naive_zeroed):
        print("  >>> WARNING: Naive solution collapsed (non-finite or all zeros) under ill-conditioning.")

    print("MODEL COEFFICIENTS (Weights):")
    print(f"  Naive (Main vs Shadow): {beta_naive[1]:.4f}  vs  {beta_naive[2]:.4f}")
    print(f"  QR    (Main vs Shadow): {beta_qr[1]:.4f}  vs  {beta_qr[2]:.4f}")
    if abs(beta_naive[1]) > 1000:
        print("  >>> LOOK! Naive coefficients exploded (e.g. 10,000). This is the math error.")

    row_now = df.iloc[idx_now]
    x_now = np.array([1, row_now["Feature_Main"], row_now["Feature_Shadow"]])
    pred_naive = x_now @ beta_naive
    pred_qr = x_now @ beta_qr

    current_price = row_now["Mid_Price"]
    shares_to_buy = capital / current_price

    print("-" * 60)
    print(f"TRADING SIGNAL @ ${current_price:.2f}:")
    print(f"  Naive Prediction: {pred_naive:+.6f}  -> {'BUY' if pred_naive > 0 else 'SELL'}")
    print(f"  QR Prediction:    {pred_qr:+.6f}  -> {'BUY' if pred_qr > 0 else 'SELL'}")

    idx_future = df["Time"].searchsorted(target_dt + timedelta(seconds=look_ahead))
    if idx_future >= len(df):
        print("Error: Future time is outside dataset.")
        return

    price_future = df.iloc[idx_future]["Mid_Price"]
    actual_move = price_future - current_price

    print()
    print("=" * 60)
    print(f"RESULT ({look_ahead}s later @ {df.iloc[idx_future]['Time'].time()})")
    print(f"Future Price: ${price_future:.2f}")
    print(f"Actual Move:  {actual_move:+.4f}")
    print("-" * 60)

    direction_naive = 1 if pred_naive > 0 else -1
    pnl_naive = shares_to_buy * actual_move * direction_naive

    direction_qr = 1 if pred_qr > 0 else -1
    pnl_qr = shares_to_buy * actual_move * direction_qr

    print(f"NAIVE STRATEGY P&L:  ${pnl_naive:,.2f}  {'✅ WIN' if pnl_naive > 0 else '❌ LOSS'}")
    print(f"QR STRATEGY P&L:     ${pnl_qr:,.2f}  {'✅ WIN' if pnl_qr > 0 else '❌ LOSS'}")
    print("=" * 60)

if __name__ == "__main__":
    try:
        print("Loading Dataset...")
        df = load_and_prep_data(FILE_PATH, MAX_ROWS, COLLINEARITY_BLEND)
        run_simulation(
            df,
            INPUT_CURRENT_TIME,
            INPUT_WINDOW_SIZE,
            INPUT_LOOK_AHEAD_SECONDS,
            INPUT_BUY_POWER,
        )
    except Exception as e:
        print()
        print(f"CRITICAL ERROR: {e}")


Loading Dataset...

HFT SIMULATION REPORT
Time: 19:45:0 | Window: 500 ticks | Capital: $100,000.00
MATRIX CONDITION NUMBER: 2.88e+11
------------------------------------------------------------
MODEL COEFFICIENTS (Weights):
  Naive (Main vs Shadow): 0.0000  vs  0.0000
  QR    (Main vs Shadow): -478323.4314  vs  478323.4314
------------------------------------------------------------
TRADING SIGNAL @ $55869.28:
  Naive Prediction: +0.000000  -> SELL
  QR Prediction:    +1.610306  -> BUY

RESULT (10s later @ 19:45:10.626760)
Future Price: $55900.01
Actual Move:  +30.7300
------------------------------------------------------------
NAIVE STRATEGY P&L:  $-55.00  ❌ LOSS
QR STRATEGY P&L:     $55.00  ✅ WIN
