In [1]:
from __future__ import annotations
import polars as pl
import seaborn as sns
import tqdm
from databento_dbn import FIXED_PRICE_SCALE, UNDEF_PRICE
# plot price of each over time
import plotly.graph_objs as go
import plotly.offline as pyo
from helpers import * 
import pandas as pd
import numpy as np
import datetime as dt  # For plotting x-axis as dates
import matplotlib.pyplot as plt
import statsmodels.api as sm
from signals import  transfer_entropy

In [2]:
# Load the MBP dump, then narrow it down in a single pipeline.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable DATA_DIR so the notebook runs elsewhere.
data = pl.read_parquet("/Users/danny/trading/databento/mbp.parquet")
data = (
    data
    # Keep only the two symbols studied in this notebook.
    .filter((pl.col("symbol") == "GOOGL") | (pl.col("symbol") == "GOOG"))
    # Drop rows whose price is the "undefined" sentinel.
    .filter(pl.col("price") != UNDEF_PRICE)
    # Re-base event timestamps so they start at zero (elapsed time since the
    # earliest remaining event).
    .with_columns(pl.col("ts_event") - pl.col("ts_event").min())
    # NOTE(review): magic upper bound on the raw price; presumably an outlier
    # cut-off expressed in fixed-point price units -- confirm and name it.
    .filter(pl.col("price") < 200370000000.0)
)
data.shape

(9562459, 75)

In [3]:
# Show the column names of the filtered frame.
data.columns

['ts_recv',
 'ts_event',
 'rtype',
 'publisher_id',
 'instrument_id',
 'action',
 'side',
 'depth',
 'price',
 'size',
 'flags',
 'ts_in_delta',
 'sequence',
 'bid_px_00',
 'ask_px_00',
 'bid_sz_00',
 'ask_sz_00',
 'bid_ct_00',
 'ask_ct_00',
 'bid_px_01',
 'ask_px_01',
 'bid_sz_01',
 'ask_sz_01',
 'bid_ct_01',
 'ask_ct_01',
 'bid_px_02',
 'ask_px_02',
 'bid_sz_02',
 'ask_sz_02',
 'bid_ct_02',
 'ask_ct_02',
 'bid_px_03',
 'ask_px_03',
 'bid_sz_03',
 'ask_sz_03',
 'bid_ct_03',
 'ask_ct_03',
 'bid_px_04',
 'ask_px_04',
 'bid_sz_04',
 'ask_sz_04',
 'bid_ct_04',
 'ask_ct_04',
 'bid_px_05',
 'ask_px_05',
 'bid_sz_05',
 'ask_sz_05',
 'bid_ct_05',
 'ask_ct_05',
 'bid_px_06',
 'ask_px_06',
 'bid_sz_06',
 'ask_sz_06',
 'bid_ct_06',
 'ask_ct_06',
 'bid_px_07',
 'ask_px_07',
 'bid_sz_07',
 'ask_sz_07',
 'bid_ct_07',
 'ask_ct_07',
 'bid_px_08',
 'ask_px_08',
 'bid_sz_08',
 'ask_sz_08',
 'bid_ct_08',
 'ask_ct_08',
 'bid_px_09',
 'ask_px_09',
 'bid_sz_09',
 'ask_sz_09',
 'bid_ct_09',
 'ask_ct_09',
 '

In [1]:
def get_pairs(data: pl.DataFrame, symbols: list[str] = ["GOOGL", "GOOG"], time_unit: str = "s"):
    """Build log-return series for the first two entries of ``symbols``.

    For each symbol the rows are bucketed by ``ts_event`` (converted to whole
    milliseconds or seconds according to ``time_unit``), the mean price per
    bucket is taken, and the log return between consecutive buckets is
    computed.

    Args:
        data: Frame with ``symbol``, ``ts_event`` and ``price`` columns.
            ``ts_event`` must support the ``.dt.total_*`` duration accessors
            when ``time_unit`` is ``"ms"`` or ``"s"``.
        symbols: Symbols to extract; only the first two series are returned.
            The default list is never mutated.
        time_unit: ``"ms"`` or ``"s"`` selects the bucket width. Any other
            value leaves ``ts_event`` untouched (buckets are then the distinct
            raw ``ts_event`` values).

    Returns:
        A tuple ``(first, second)`` of NumPy arrays holding the log returns
        of ``symbols[0]`` and ``symbols[1]``, truncated to a common length.

    NOTE(review): the two series are paired by position, not joined on
    ``ts_event``; if a bucket exists for one symbol but not the other the
    series drift out of alignment. Confirm whether a join is wanted.
    """
    series = []
    for symbol in symbols:
        symbol_data = data.filter(pl.col("symbol") == symbol)
        if time_unit == "ms":
            symbol_data = symbol_data.with_columns(pl.col("ts_event").dt.total_milliseconds())
        elif time_unit == "s":
            symbol_data = symbol_data.with_columns(pl.col("ts_event").dt.total_seconds())
        symbol_data = (
            symbol_data
            .group_by("ts_event")
            .agg(pl.col("price").mean())
            # group_by does not preserve row order; without an explicit sort
            # shift(1) would pair arbitrary buckets instead of consecutive ones.
            .sort("ts_event")
            # Convert the fixed-point price first; the log return itself is
            # dimensionless and must not be divided by the price scale.
            .with_columns(pl.col("price") / FIXED_PRICE_SCALE)
            .with_columns((pl.col("price") / pl.col("price").shift(1)).log())
        )
        series.append(symbol_data.to_pandas()["price"].values)

    for values in series:
        print(len(values))

    # Truncate to the shortest series. The first element is the undefined
    # return produced by shift(1); the original code also skipped the second
    # element, which is kept as-is.
    length = min(len(values) for values in series)
    series = [values[2:length] for values in series]

    return series[0], series[1]

# Millisecond-bucketed series for the default symbols; x is the first
# symbol's series (GOOGL) and y the second's (GOOG).
pairs = get_pairs(data, time_unit="ms")
x, y = pairs[0], pairs[1]

NameError: name 'pl' is not defined

In [None]:
from sklearn.linear_model import LinearRegression   
from sklearn.metrics import mean_squared_error, r2_score

def pred(x, y, offset=0):
    """Score how well ``x`` linearly predicts ``y`` shifted ``offset`` steps ahead.

    Each series is split in half; a one-feature linear regression is fitted on
    the second half and scored on the first half. Within each half, ``x[i]``
    is paired with ``y[i + offset]``.

    Args:
        x: 1-D NumPy array of predictor values.
        y: 1-D NumPy array of target values.
        offset: Non-negative lag, in samples, between predictor and target.

    Returns:
        The R^2 score on the held-out half, multiplied by 100.

    Raises:
        ValueError: If ``offset`` is negative or leaves no samples to fit or
            score on.

    NOTE(review): the model is fitted on the later half and evaluated on the
    earlier half, as in the original code; confirm this is the intended
    direction for a time series.
    """
    if offset < 0:
        raise ValueError(f"offset must be non-negative, got {offset}")

    x_mid = len(x) // 2
    y_mid = len(y) // 2
    x_test, x_train = x[:x_mid], x[x_mid:]
    y_test, y_train = y[:y_mid], y[y_mid:]

    shortest = min(len(x_test), len(x_train), len(y_test), len(y_train))
    if offset >= shortest:
        raise ValueError(
            f"offset {offset} leaves no samples (shortest split has {shortest})"
        )

    def lagged(features, targets):
        # Slice by explicit length: `features[:-offset]` would be empty for
        # offset == 0, which made the default argument unusable.
        return features[: len(features) - offset], targets[offset:]

    x_train, y_train = lagged(x_train, y_train)
    x_test, y_test = lagged(x_test, y_test)

    model = LinearRegression()
    model.fit(x_train.reshape(-1, 1), y_train)

    y_pred = model.predict(x_test.reshape(-1, 1))
    r2 = r2_score(y_test, y_pred)
    return r2 * 100

# Sweep the lag at which x is used to predict y and track the best R^2 scores.
scores = []
step = 10
for i in range(step, 10000, step):
    if i % 1000 == 0 and len(scores) > 0:
        # Progress report: best score so far and the five best lags.
        temp = np.array(scores)
        print(temp.max())
        top5 = temp.argsort()[-5:][::-1]
        # scores[k] was computed with offset (k + 1) * step because the sweep
        # starts at `step`, not 0; `top5 * step` under-reported by one step.
        print((top5 + 1) * step)
        print(temp[top5])
    scores.append(pred(x, y, i))

# print top 5 scores and their offsets
scores = np.array(scores)
top5 = scores.argsort()[-5:][::-1]
print((top5 + 1) * step)
print(scores[top5])


In [None]:
# Reverse direction: sweep the lag at which y is used to predict x.
scores = []
for i in range(2, 20000, 2):
    if i % 500 == 0:
        # Progress report: best score so far and the five best lags.
        temp = np.array(scores)
        print(temp.max())
        top5 = temp.argsort()[-5:][::-1]
        # Report lags rather than array positions: scores[k] was computed
        # with offset (k + 1) * 2 because the sweep starts at 2 in steps of 2.
        print((top5 + 1) * 2)
        print(temp[top5])
    scores.append(pred(y, x, i))

# print top 5 scores and their offsets
scores = np.array(scores)
top5 = scores.argsort()[-5:][::-1]
print((top5 + 1) * 2)
print(scores[top5])

In [None]:

def sliding_mean(val, window_size):
    """Return the simple moving average of ``val`` over ``window_size`` samples.

    Only full windows are produced, so the result has
    ``len(val) - window_size + 1`` elements.

    Args:
        val: 1-D sequence or array of numbers.
        window_size: Number of consecutive samples averaged per output value.

    Returns:
        NumPy array of window means; empty when ``val`` is shorter than
        ``window_size``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    samples = np.asarray(val)
    # np.convolve swaps its operands when the kernel is longer than the data,
    # which would return window_size - len(val) + 1 meaningless values instead
    # of "no full window fits".
    if window_size > len(samples):
        return np.empty(0, dtype=float)
    return np.convolve(samples, np.ones(window_size) / window_size, mode='valid')


def copula_pairs(data: pl.DataFrame, symbols: list[str]):
    """Compare the millisecond log returns of the first two ``symbols``.

    Prints the length of the return series, then the Pearson correlation of
    the two series after smoothing each with a sliding mean, for window sizes
    5, 10, ..., 995. Finally prints the mean and standard deviation of the
    difference between the two unsmoothed return series and draws its
    histogram.

    Args:
        data: Frame with ``symbol``, ``ts_event`` and ``price`` columns;
            ``ts_event`` must support ``.dt.total_milliseconds()``.
        symbols: Symbols to compare; only the first two are used.

    Returns:
        None. Results are printed and plotted.

    NOTE(review): the two series are paired by position, not joined on
    ``ts_event``; confirm whether a timestamp join is wanted.
    """
    series = []
    for symbol in symbols:
        symbol_data = (
            data.filter(pl.col("symbol") == symbol)
            .with_columns(pl.col("ts_event").dt.total_milliseconds())
            .group_by("ts_event")
            .agg(pl.col("price").mean())
            # group_by does not preserve row order; sort so shift(1) pairs
            # consecutive buckets.
            .sort("ts_event")
            # Scale the fixed-point price before taking the (dimensionless)
            # log return, not after.
            .with_columns(pl.col("price") / FIXED_PRICE_SCALE)
            .with_columns((pl.col("price") / pl.col("price").shift(1)).log())
        )
        series.append(symbol_data.to_pandas()["price"].values)

    # Truncate to a common length; the leading elements (including the
    # undefined first return) are skipped as in the original code.
    length = min(len(values) for values in series)
    series = [values[2:length] for values in series]

    val1 = series[0]
    val2 = series[1]
    print(len(val1))
    for window_size in range(5, 1000, 5):
        # Smooth the raw series each time. Re-assigning val1/val2 here would
        # stack every earlier window on top of the current one, so the value
        # reported for "Window N" would not be an N-sample mean.
        smooth1 = sliding_mean(val1, window_size)
        smooth2 = sliding_mean(val2, window_size)
        print(f"Window {window_size} Pearson correlation: {np.corrcoef(smooth1, smooth2)[0, 1]}")

    # print the mean difference, and std deviation
    diff = val1 - val2
    print(np.mean(diff))
    print(np.std(diff))
    plt.hist(diff, bins=100)

# Run the comparison for the two share classes.
copula_pairs(data, symbols=["GOOGL", "GOOG"])

In [None]:

def plot_price(data: pl.DataFrame, symbols: list[str]):
    """Plot per-second log returns for ``symbols`` and print their dependence.

    For each symbol the rows are bucketed by whole seconds of ``ts_event``,
    the mean price per bucket is taken, and the log return between consecutive
    buckets is plotted as one line. The distance correlation and the Pearson
    correlation of the first two return series are printed before the figure
    is shown.

    Args:
        data: Frame with ``symbol``, ``ts_event`` and ``price`` columns;
            ``ts_event`` must support ``.dt.total_seconds()``.
        symbols: Symbols to plot; the correlations use the first two.

    Returns:
        None. The figure is rendered with ``pyo.iplot``.

    NOTE(review): for the correlations the two series are paired by position,
    not joined on ``ts_event``; confirm whether a timestamp join is wanted.
    """
    traces = []
    series = []
    for symbol in symbols:
        symbol_data = (
            data.filter(pl.col("symbol") == symbol)
            .with_columns(pl.col("ts_event").dt.total_seconds())
            .group_by("ts_event")
            .agg(pl.col("price").mean())
            # group_by does not preserve row order; sort so shift(1) pairs
            # consecutive buckets and the line is drawn in time order.
            .sort("ts_event")
            # Scale the fixed-point price before taking the (dimensionless)
            # log return, not after.
            .with_columns(pl.col("price") / FIXED_PRICE_SCALE)
            .with_columns((pl.col("price") / pl.col("price").shift(1)).log())
        )
        merged = symbol_data.to_pandas()
        series.append(merged["price"].values)

        traces.append(
            go.Scatter(
                # Plot against the bucket timestamp rather than the row
                # position, so the axis really is time.
                x=merged["ts_event"],
                y=merged["price"],
                mode='lines',
                name=symbol,
            )
        )

    # Truncate to a common length; the leading elements (including the
    # undefined first return) are skipped as in the original code.
    length = min(len(values) for values in series)
    series = [values[2:length] for values in series]
    print(distance_correlation(series[0], series[1]))
    print(np.corrcoef(series[0], series[1])[0, 1])

    # The traces hold log returns, so the labels say so.
    layout = go.Layout(
        title='Interactive Plot of Log Returns',
        xaxis=dict(title='Time (s)'),
        yaxis=dict(title='Log return'),
        hovermode='closest'
    )

    fig = go.Figure(data=traces, layout=layout)
    pyo.iplot(fig)

# Draw the interactive figure for the two share classes.
plot_price(data, symbols=["GOOGL", "GOOG"])