In [112]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from scipy.stats import skew, kurtosis
import statsmodels.tsa.vector_ar.vecm as vecm
from statsmodels.tsa.vector_ar.vecm import coint_johansen, VECM
from statsmodels.tsa.stattools import adfuller, acf
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.stats.sandwich_covariance import cov_hac
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch, breaks_cusumolsresid
import statsmodels.stats.sandwich_covariance as sw
from statsmodels.stats.sandwich_covariance import cov_hac, weights_bartlett


In [6]:
btc = pd.read_csv('BTCUSDT_filtered.csv')
btc_f = pd.read_csv('BTCUSDT_futures_filtered.csv')
eth = pd.read_csv('ETHUSDT_filtered.csv')
eth_f = pd.read_csv('ETHUSDT_futures_filtered.csv')
bch = pd.read_csv('BCHUSDT_filtered.csv')
bch_f = pd.read_csv('BCHUSDT_futures_filtered.csv')
doge = pd.read_csv('DOGEUSDT_filtered.csv')
doge_f = pd.read_csv('DOGEUSDT_futures_filtered.csv')
spot_data = btc.copy()
futures_data = btc_f.copy()

In [7]:

def describe_returns(data):
    mean = data['daily_return'].mean()
    std_dev = data['daily_return'].std()
    skewness = skew(data['daily_return'].dropna())
    kurt = kurtosis(data['daily_return'].dropna())
    autocorr = data['daily_return'].autocorr()

    return {'mean': mean, 'std_dev': std_dev, 'skewness': skewness, 'kurtosis': kurt, 'autocorrelation': autocorr}

# Selecting optimal lag order using BIC
def select_k_ar_diff(data, maxlags=168, trend='ct'):
    bic_values = []
    for lag in range(1, maxlags + 1):
        model = VAR(data)
        result = model.fit(lag, trend=trend)
        bic_values.append(result.bic)
    
    optimal_lag = bic_values.index(min(bic_values)) + 1
    return optimal_lag

#ADF Test
def adf_test(series):
    result = adfuller(series)
    print(f"ADF Statistic: {result[0]}")
    print(f"p-value: {result[1]}")
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    print("")

# Johansen cointegration test
def johansen_test(s1, s2):
    df = pd.concat([s1, s2], axis=1).dropna()
    df.columns = ['spot_log_price', 'futures_log_price']
    k_ar_diff = select_k_ar_diff(df)
    print(f"Optimal lag order (k_ar_diff) based on BIC: {k_ar_diff}")

    result = coint_johansen(df, det_order=1, k_ar_diff=k_ar_diff)
    print("Johansen Cointegration Test:")
    print("Trace Statistic:")
    print(result.lr1)
    print("Critical Values (90%, 95%, 99%):")
    print(result.cvt)
    print("")
    print("Eigen Statistic:")
    print(result.lr2)
    print("Critical Values (90%, 95%, 99%):")
    print(result.cvm)
    print("")
    return k_ar_diff

# Ljung-Box Q test
def ljung_box_test(series, lags=None, significance_level=0.05, title=''):
    result = acorr_ljungbox(series, lags=lags, return_df=True)
    print(f"Ljung-Box Q Test ({title}):")
    print(result)

    autocorrelated_lags = result[result['lb_pvalue'] < significance_level].index
    if autocorrelated_lags.empty:
        print(f"No autocorrelation found up to lag {lags} at significance level {significance_level}")
    else:
        print(f"Autocorrelation found for lags: {autocorrelated_lags.tolist()} at significance level {significance_level}")
    print("")

# Engle's ARCH test
def arch_test(series, lags=None, title=''):
    test_stat, p_value, _, _ = het_arch(series, nlags=lags)
    print(f"Engle's ARCH Test ({title}):")
    print(f"Test Statistic: {test_stat}")
    print(f"P-value: {p_value}")

    significance_level = 0.05
    if p_value < significance_level:
        print(f"Reject the null hypothesis: Conditional heteroscedasticity is present at significance level {significance_level}")
    else:
        print(f"Fail to reject the null hypothesis: No evidence of conditional heteroscedasticity at significance level {significance_level}")
    print("")


In [8]:
# Convert the timestamp to datetime
spot_data['Date'] = pd.to_datetime(spot_data['Date'])
futures_data['Date'] = pd.to_datetime(futures_data['Date'])

# Calculate log prices
spot_data['log_price'] = np.log(spot_data['Close'])
futures_data['log_price'] = np.log(futures_data['Close'])

# Resample hourly data to daily data
spot_daily = spot_data.resample('D', on='Date').last()
futures_daily = futures_data.resample('D', on='Date').last()

# Calculate daily returns
spot_daily['daily_return'] = spot_daily['log_price'].pct_change()
futures_daily['daily_return'] = futures_daily['log_price'].pct_change()
spot_daily = spot_daily.dropna()
futures_daily = futures_daily.dropna()


In [9]:
spot_stats = describe_returns(spot_daily)
futures_stats = describe_returns(futures_daily)

print("Spot Daily Returns Description:", spot_stats)
print("Futures Daily Returns Description:", futures_stats)

Spot Daily Returns Description: {'mean': 0.0001044536975734611, 'std_dev': 0.003982678112315126, 'skewness': -2.3605228101750932, 'kurtosis': 35.29873047836384, 'autocorrelation': -0.1008207431729649}
Futures Daily Returns Description: {'mean': 0.00010455724288804958, 'std_dev': 0.00400591110023789, 'skewness': -2.4227222286289307, 'kurtosis': 36.65827517240885, 'autocorrelation': -0.10345752433055017}


In [10]:
# Test for stationarity
print("ADF Test for Spot Hourly Log Prices:")
adf_test(spot_data['log_price'])
print("\nADF Test for Futures Hourly Log Prices:")
adf_test(futures_data['log_price'])

ADF Test for Spot Hourly Log Prices:
ADF Statistic: -1.754509901496998
p-value: 0.40326847319769105
	1%: -3.431
	5%: -2.862
	10%: -2.567


ADF Test for Futures Hourly Log Prices:
ADF Statistic: -1.7552874075734748
p-value: 0.4028751283693091
	1%: -3.431
	5%: -2.862
	10%: -2.567



In [11]:
spot_data['log_price_diff'] = spot_data['log_price'].diff()
futures_data['log_price_diff'] = futures_data['log_price'].diff()

spot_data = spot_data.dropna()
futures_data = futures_data.dropna()
print("ADF Test for Spot Hourly Log Prices:")
adf_test(spot_data['log_price_diff'])
print("\nADF Test for Futures Hourly Log Prices:")
adf_test(futures_data['log_price_diff'])




ADF Test for Spot Hourly Log Prices:
ADF Statistic: -23.11644979318639
p-value: 0.0
	1%: -3.431
	5%: -2.862
	10%: -2.567


ADF Test for Futures Hourly Log Prices:
ADF Statistic: -23.14064450408425
p-value: 0.0
	1%: -3.431
	5%: -2.862
	10%: -2.567



In [12]:
warnings.filterwarnings("ignore", category=ValueWarning)
o_lag = johansen_test(spot_data['log_price_diff'], futures_data['log_price_diff'])

Optimal lag order (k_ar_diff) based on BIC: 21
Johansen Cointegration Test:
Trace Statistic:
[3882.79146045 1163.72401442]
Critical Values (90%, 95%, 99%):
[[16.1619 18.3985 23.1485]
 [ 2.7055  3.8415  6.6349]]

Eigen Statistic:
[2719.06744603 1163.72401442]
Critical Values (90%, 95%, 99%):
[[15.0006 17.1481 21.7465]
 [ 2.7055  3.8415  6.6349]]



In [13]:

ljung_box_test(spot_data['log_price_diff'], lags=o_lag, significance_level = 0.01, title="Spot Log-Price Differences")
ljung_box_test(futures_data['log_price_diff'], lags=o_lag, significance_level = 0.01, title="Futures Log-Price Differences")

Ljung-Box Q Test (Spot Log-Price Differences):
      lb_stat     lb_pvalue
1   13.712185  2.130676e-04
2   22.712109  1.169845e-05
3   22.731068  4.594531e-05
4   28.117837  1.180546e-05
5   28.143223  3.412576e-05
6   28.540639  7.430256e-05
7   29.627859  1.111029e-04
8   30.231060  1.923908e-04
9   30.232103  4.005523e-04
10  37.398031  4.828632e-05
11  41.073892  2.340488e-05
12  48.442843  2.616935e-06
13  63.363577  1.305048e-08
14  63.612355  2.702109e-08
15  69.499393  5.484409e-09
16  71.781883  4.853942e-09
17  74.140286  4.129205e-09
18  78.633224  1.484646e-09
19  79.236178  2.517227e-09
20  90.282351  6.616106e-11
21  91.337995  9.475728e-11
Autocorrelation found for lags: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] at significance level 0.01

Ljung-Box Q Test (Futures Log-Price Differences):
      lb_stat     lb_pvalue
1    6.588849  1.026195e-02
2   19.468258  5.922724e-05
3   19.534986  2.118899e-04
4   25.699495  3.638039e-05
5   25.7122

In [14]:
arch_test(spot_data['log_price_diff'], lags=o_lag, title="Spot Log-Price Differences")
arch_test(futures_data['log_price_diff'], lags=o_lag, title="Futures Log-Price Differences")

Engle's ARCH Test (Spot Log-Price Differences):
Test Statistic: 4965.199309781874
P-value: 0.0
Reject the null hypothesis: Conditional heteroscedasticity is present at significance level 0.05

Engle's ARCH Test (Futures Log-Price Differences):
Test Statistic: 5006.158486549715
P-value: 0.0
Reject the null hypothesis: Conditional heteroscedasticity is present at significance level 0.05



In [104]:
# Fit VECM model
data = pd.concat([spot_data['log_price_diff'], futures_data['log_price_diff']], axis=1)
data.columns = ['spot', 'future']
vecm_model = vecm.VECM(data, k_ar_diff=o_lag , coint_rank=1, deterministic='coli')
vecm_fit = vecm_model.fit()
vecm_resid = vecm_fit.resid
print(vecm_fit.summary())

Det. terms outside the coint. relation & lagged endog. parameters for equation spot
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -9.135e-06   4.87e-05     -0.187      0.851      -0.000    8.64e-05
L1.spot      -14.8586      2.207     -6.731      0.000     -19.185     -10.532
L1.future     13.8149      2.197      6.288      0.000       9.508      18.121
L2.spot      -13.8946      2.151     -6.459      0.000     -18.111      -9.678
L2.future     12.8758      2.141      6.013      0.000       8.679      17.072
L3.spot      -12.8415      2.084     -6.161      0.000     -16.926      -8.757
L3.future     11.8712      2.074      5.723      0.000       7.806      15.937
L4.spot      -11.9198      2.008     -5.936      0.000     -15.856      -7.984
L4.future     10.9855      1.999      5.497      0.000       7.068      14.903
L5.spot      -10.7203      1.923     -5.574    

In [61]:
print("Shape of the residuals:", vecm_resid.shape)
print("First few rows of the residuals:\n", vecm_resid[:5])

Shape of the residuals: (27589, 2)
First few rows of the residuals:
 [[ 0.00188919  0.00200593]
 [-0.00252174 -0.00237328]
 [-0.00259513 -0.00250129]
 [-0.00543952 -0.00531038]
 [ 0.00233545  0.00240188]]


In [93]:
def newey_west_single_eq(residuals, nlags, kernel='bartlett', use_correction=True):
    nobs = residuals.shape[0]
    gamma0 = np.cov(residuals, bias=True)
    gamma_sum = 0

    if kernel == 'bartlett':
        weights_func = lambda h, nlags: 1 - h / (nlags + 1)
    else:
        raise NotImplementedError(f"Kernel '{kernel}' not implemented")

    for h in range(1, nlags + 1):
        gamma_h = np.cov(residuals[h:], residuals[:-h], bias=True)
        weight = weights_func(h, nlags)
        gamma_sum += weight * (gamma_h + gamma_h)

    nw_cov = gamma0 + gamma_sum

    if use_correction:
        correction = nobs / (nobs - nlags)
        nw_cov *= correction

    return nw_cov

nlags = o_lag
kernel = 'bartlett'
# Extract the residuals for each equation
spot_residuals = vecm_resid[:, 0]
future_residuals = vecm_resid[:, 1]

# Calculate the Newey-West estimator for each equation
nw_cov_spot = newey_west_single_eq(spot_residuals, nlags, kernel=kernel, use_correction=True)
nw_cov_future = newey_west_single_eq(future_residuals, nlags, kernel=kernel, use_correction=True)

# Print the Newey-West robust covariance estimates
print("Newey-West robust covariance estimate for spot equation:", nw_cov_spot)
print("Newey-West robust covariance estimate for future equation:", nw_cov_future)

Newey-West robust covariance estimate for spot equation: [[1.44184225e-03 4.89812568e-05]
 [4.89812568e-05 1.44174856e-03]]
Newey-West robust covariance estimate for future equation: [[1.44681460e-03 4.93770804e-05]
 [4.93770804e-05 1.44671775e-03]]


In [105]:
ljung_box_test(spot_residuals, lags=o_lag, significance_level = 0.01, title="Spot Residuals")

Ljung-Box Q Test (Spot Residuals):
       lb_stat     lb_pvalue
1     0.082136  7.744239e-01
2     0.209619  9.004960e-01
3     0.333576  9.535949e-01
4     0.743709  9.458314e-01
5     1.597328  9.015723e-01
6     3.045860  8.030693e-01
7     5.556687  5.923555e-01
8     8.975851  3.443372e-01
9    13.139354  1.564014e-01
10   20.141745  2.794017e-02
11   29.195617  2.116560e-03
12   38.263260  1.390583e-04
13   53.978580  6.100496e-07
14   72.997834  5.501315e-10
15   95.673056  8.570894e-14
16  122.452220  1.852253e-18
17  154.705085  2.925692e-24
18  186.689284  4.514189e-30
19  224.616072  4.082847e-37
20  265.115909  1.006896e-44
21  307.841036  8.046765e-53
Autocorrelation found for lags: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] at significance level 0.01



In [106]:
ljung_box_test(future_residuals, lags=o_lag, significance_level = 0.01, title="Future Residuals")

Ljung-Box Q Test (Future Residuals):
       lb_stat     lb_pvalue
1     0.079701  7.777024e-01
2     0.201925  9.039670e-01
3     0.320345  9.561579e-01
4     0.726819  9.479809e-01
5     1.548123  9.074505e-01
6     2.926065  8.180677e-01
7     5.376977  6.140612e-01
8     8.621077  3.752622e-01
9    12.677384  1.777558e-01
10   19.452919  3.487304e-02
11   28.201408  3.015078e-03
12   36.944867  2.280513e-04
13   52.326051  1.183337e-06
14   70.873157  1.340453e-09
15   93.210479  2.488110e-13
16  119.388239  7.200846e-18
17  151.116867  1.479236e-23
18  182.386218  3.227730e-29
19  219.803094  3.774781e-36
20  259.670296  1.273548e-43
21  301.903224  1.303757e-51
Autocorrelation found for lags: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] at significance level 0.01



In [113]:
o_lag = 10
print(breaks_cusumolsresid(vecm_resid))

(0.4481880424951512, 0.9879685106934546, [(1, 1.63), (5, 1.36), (10, 1.22)])


In [116]:
# Calculate the innovation covariance matrix
omega = np.cov(np.stack([spot_residuals, future_residuals]), bias=True)
# Calculate the innovation correlation matrix
sigma_s = np.std(spot_residuals)
sigma_f = np.std(future_residuals)
rho = omega[0, 1] / (sigma_s * sigma_f)
phi = np.array([[1, rho], [rho, 1]])

# Find the eigenvalues and eigenvectors of the correlation matrix
lamda, G = np.linalg.eig(phi)

# Calculate the matrix V
V = np.diag([sigma_s, sigma_f])

# Calculate the matrix M*
M_star = np.linalg.inv(np.dot(np.dot(G, np.diag(np.sqrt(lamda))), np.dot(G.T, np.linalg.inv(V))))

# Get the error correction coefficient vector
alpha_s, alpha_f = vecm_fit.alpha

# Calculate the MIS for spot and future markets
MIS_s = ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2) / ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2 + (alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2)
MIS_f = ((alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2) / ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2 + (alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2)

# Print the MIS for spot and future markets
print("MIS for spot market:", MIS_s)
print("MIS for future market:", MIS_f)

MIS for spot market: [0.36198286]
MIS for future market: [0.63801714]
