In [1]:
import cudf 
import os 
from databento_dbn import FIXED_PRICE_SCALE, UNDEF_PRICE

base_path = os.getcwd()
from typing import Literal
def build_symbol_df(symbol: Literal['GOOG', 'GOOGL']):
    goog_dir = "processed/GOOG" if symbol == 'GOOG' else "processed/GOOGL"
    goog_df_paths = [f"{base_path}/{goog_dir}/{file}" for file in os.listdir(f"{base_path}/{goog_dir}")]
    keep_cols = ["ts_recv", 'price']
    goog_df = cudf.read_parquet(goog_df_paths[0])
    orig_cols = goog_df.columns
    goog_df = goog_df.drop(columns=[col for col in goog_df.columns if col not in keep_cols])
    for file in goog_df_paths[1:]:
        df = cudf.read_parquet(file).drop(columns=[col for col in orig_cols if col not in keep_cols])
        goog_df = cudf.concat([goog_df, df])
    goog_df = goog_df.sort_values(by='ts_recv')
    return goog_df

# goog_df = build_symbol_df('GOOG')
# googl_df = build_symbol_df('GOOGL')

# goog_df['price'] = goog_df['price'] / FIXED_PRICE_SCALE
# googl_df['price'] = googl_df['price'] / FIXED_PRICE_SCALE

# goog_df.to_parquet(goog_path)
# googl_df.to_parquet(googl_path)

In [2]:
def merge(time_unit: str): 
        
    goog_path = f"{base_path}/GOOG.parquet"
    googl_path = f"{base_path}/GOOGL.parquet"
    goog_df = cudf.read_parquet(goog_path)
    googl_df = cudf.read_parquet(googl_path)

    goog_df = goog_df[goog_df['price'] < 500]
    googl_df = googl_df[googl_df['price'] < 500]
    merged_goog = goog_df
    merged_googl = googl_df 
    merged_goog['ts_recv'] = merged_goog['ts_recv'].astype('datetime64[ms]').dt.floor(time_unit)
    merged_goog = merged_goog.groupby('ts_recv').mean()
    merged_googl['ts_recv'] = merged_googl['ts_recv'].astype('datetime64[ms]').dt.floor(time_unit)
    merged_googl = merged_googl.groupby('ts_recv').mean()

    merged_df = cudf.merge(merged_goog, merged_googl, on='ts_recv', how='inner', suffixes=('_goog', '_googl'))
    merged_df = merged_df.reset_index()
    merged_df = merged_df.sort_values(by='ts_recv')

    return merged_df


In [3]:
# Calculate the correlation between the two prices
for time_unit in ['ms', 's', 'min']:
    for offset in [5, 10, 30]: 
        merged_df = merge(time_unit)
        print(f"time unit: {time_unit}")
        print(merged_df['price_goog'].shift(offset).dropna().corr(merged_df['price_googl']))
        print(merged_df['price_goog'].shift(offset).dropna().corr(merged_df['price_goog']))

        merged_df["returns_googl"] = (merged_df['price_googl'] / merged_df['price_googl'].shift(1))
        merged_df["returns_goog"] = (merged_df['price_goog'] / merged_df['price_goog'].shift(1))

        print(merged_df["returns_googl"].shift(offset).dropna().corr(merged_df["returns_goog"]))
        print(merged_df["returns_googl"].shift(offset).dropna().corr(merged_df["returns_goog"]))

time unit: ms
0.02332943996730617
0.33312774165899783
0.01189051853222879
0.01189051853222879
time unit: ms
0.02194541405474677
0.2923794160117022
0.01068284651761066
0.01068284651761066
time unit: ms
0.02031878822802722
0.24461767209971258
0.009598129682295172
0.009598129682295172
time unit: s
0.05731241635065476
0.479832108754642
0.018053828856783565
0.018053828856783565
time unit: s
0.04091184421598063
0.42760912395035366
0.011022784813658162
0.011022784813658162
time unit: s
0.006352982063794
0.3807220503096546
0.005803145453220256
0.005803145453220256
time unit: min
0.19735651658637596
0.3744441637006528
0.08952314624178433
0.08952314624178433
time unit: min
0.12646743402981092
0.2704386574781307
0.041233922932599795
0.041233922932599795
time unit: min
0.015334875158037454
0.11300122467017139
0.0010790387342186267
0.0010790387342186267


In [4]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import adfuller
import numpy as np 
# Function to perform Augmented Dickey-Fuller test
def adf_test(series, title=''):
    result = adfuller(series, autolag='AIC')
    print(f'ADF Statistic for {title}: {result[0]}')
    print(f'p-value: {result[1]}')
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'\t{key}: {value}')

# Performing ADF test on both series
subsample_percent = 0.05
goog_price = merged_df['price_goog'].to_numpy().astype(np.float16)
googl_price = merged_df['price_googl'].to_numpy().astype(np.float16)
# adf_test(goog_price, title='price_goog')
# adf_test(googl_price, title='price_googl')



In [5]:

# Setting up data for Johansen cointegration test
data_for_test = merged_df[['price_googl', 'price_goog']].dropna()
data_for_test = data_for_test.to_numpy().astype(np.float16)

# Perform the Johansen cointegration test
# The 'det_order' parameter specifies the deterministic part of the setup; 0 means no deterministic part
# The 'k_ar_diff' specifies the lag, here using 1 for simplicity
johansen_test = coint_johansen(data_for_test, det_order=0, k_ar_diff=1)

# Display the results
eig = johansen_test.eig  # Eigenvalues
lr1 = johansen_test.lr1  # Test statistic
cvt = johansen_test.cvt  # Critical values for the test statistic at the 90%, 95%, 99% confidence levels

johansen_results = {
    "Eigenvalues": eig,
    "Test Statistic": lr1,
    "Critical Values (90%, 95%, 99%)": cvt
}

johansen_results

{'Eigenvalues': array([0.16074036, 0.09819602]),
 'Test Statistic': array([9778.90208348, 3627.97282724]),
 'Critical Values (90%, 95%, 99%)': array([[13.4294, 15.4943, 19.9349],
        [ 2.7055,  3.8415,  6.6349]])}