In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from scipy.stats import skew, kurtosis
import statsmodels.tsa.vector_ar.vecm as vecm
from statsmodels.tsa.vector_ar.vecm import coint_johansen, VECM
from statsmodels.tsa.stattools import adfuller, acf
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.stats.sandwich_covariance import cov_hac
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch, breaks_cusumolsresid
import statsmodels.stats.sandwich_covariance as sw
from statsmodels.stats.sandwich_covariance import cov_hac, weights_bartlett



In [2]:
btc = pd.read_csv('BTCUSDT_filtered.csv')
btc_f = pd.read_csv('BTCUSDT_futures_filtered.csv')
eth = pd.read_csv('ETHUSDT_filtered.csv')
eth_f = pd.read_csv('ETHUSDT_futures_filtered.csv')
bch = pd.read_csv('BCHUSDT_filtered.csv')
bch_f = pd.read_csv('BCHUSDT_futures_filtered.csv')
doge = pd.read_csv('DOGEUSDT_filtered.csv')
doge_f = pd.read_csv('DOGEUSDT_futures_filtered.csv')
spot_data = doge.copy()
futures_data = doge_f.copy()

In [3]:

def describe_returns(data):
    mean = data['daily_return'].mean()
    std_dev = data['daily_return'].std()
    skewness = skew(data['daily_return'].dropna())
    kurt = kurtosis(data['daily_return'].dropna())
    autocorr = data['daily_return'].autocorr()

    return {'mean': mean, 'std_dev': std_dev, 'skewness': skewness, 'kurtosis': kurt, 'autocorrelation': autocorr}

# Selecting optimal lag order using BIC
def select_k_ar_diff(data, maxlags=168, trend='ct'):
    bic_values = []
    for lag in range(1, maxlags + 1):
        model = VAR(data)
        result = model.fit(lag, trend=trend)
        bic_values.append(result.bic)
    
    optimal_lag = bic_values.index(min(bic_values)) + 1
    return optimal_lag

#ADF Test
def adf_test(series):
    result = adfuller(series)
    print(f"ADF Statistic: {result[0]}")
    print(f"p-value: {result[1]}")
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    print("")

# Johansen cointegration test
def johansen_test(s1, s2):
    df = pd.concat([s1, s2], axis=1).dropna()
    df.columns = ['spot_log_price', 'futures_log_price']
    k_ar_diff = select_k_ar_diff(df)
    print(f"Optimal lag order (k_ar_diff) based on BIC: {k_ar_diff}")

    result = coint_johansen(df, det_order=1, k_ar_diff=k_ar_diff)
    print("Johansen Cointegration Test:")
    print("Trace Statistic:")
    print(result.lr1)
    print("Critical Values (90%, 95%, 99%):")
    print(result.cvt)
    print("")
    print("Eigen Statistic:")
    print(result.lr2)
    print("Critical Values (90%, 95%, 99%):")
    print(result.cvm)
    print("")
    return k_ar_diff

# Ljung-Box Q test
def ljung_box_test(series, lags=None, significance_level=0.05, title=''):
    result = acorr_ljungbox(series, lags=lags, return_df=True)
    print(f"Ljung-Box Q Test ({title}):")
    print(result)

    autocorrelated_lags = result[result['lb_pvalue'] < significance_level].index
    if autocorrelated_lags.empty:
        print(f"No autocorrelation found up to lag {lags} at significance level {significance_level}")
    else:
        print(f"Autocorrelation found for lags: {autocorrelated_lags.tolist()} at significance level {significance_level}")
    print("")

# Engle's ARCH test
def arch_test(series, lags=None, title=''):
    test_stat, p_value, _, _ = het_arch(series, nlags=lags)
    print(f"Engle's ARCH Test ({title}):")
    print(f"Test Statistic: {test_stat}")
    print(f"P-value: {p_value}")

    significance_level = 0.05
    if p_value < significance_level:
        print(f"Reject the null hypothesis: Conditional heteroscedasticity is present at significance level {significance_level}")
    else:
        print(f"Fail to reject the null hypothesis: No evidence of conditional heteroscedasticity at significance level {significance_level}")
    print("")


In [4]:
# Convert the timestamp to datetime
spot_data['Date'] = pd.to_datetime(spot_data['Date'])
futures_data['Date'] = pd.to_datetime(futures_data['Date'])

# Calculate log prices
spot_data['log_price'] = np.log(spot_data['Close'])
futures_data['log_price'] = np.log(futures_data['Close'])

# Resample hourly data to daily data
spot_daily = spot_data.resample('D', on='Date').last()
futures_daily = futures_data.resample('D', on='Date').last()

# Calculate daily returns
spot_daily['daily_return'] = spot_daily['log_price'].pct_change()
futures_daily['daily_return'] = futures_daily['log_price'].pct_change()
spot_daily = spot_daily.dropna()
futures_daily = futures_daily.dropna()


In [5]:
spot_stats = describe_returns(spot_daily)
futures_stats = describe_returns(futures_daily)

print("Spot Daily Returns Description:", spot_stats)
print("Futures Daily Returns Description:", futures_stats)

Spot Daily Returns Description: {'mean': 0.0006445611845912323, 'std_dev': 0.05272630615089562, 'skewness': 0.8290809215228713, 'kurtosis': 26.778863464961795, 'autocorrelation': -0.12454469979608652}
Futures Daily Returns Description: {'mean': 0.0006526516825369261, 'std_dev': 0.05287790882351795, 'skewness': 0.8419160232710591, 'kurtosis': 26.825706532871557, 'autocorrelation': -0.12524190678782737}


In [6]:
# Test for stationarity
print("ADF Test for Spot Hourly Log Prices:")
adf_test(spot_data['log_price'])
print("ADF Test for Futures Hourly Log Prices:")
adf_test(futures_data['log_price'])

ADF Test for Spot Hourly Log Prices:
ADF Statistic: -1.7777057520901665
p-value: 0.3915849841685572
	1%: -3.431
	5%: -2.862
	10%: -2.567

ADF Test for Futures Hourly Log Prices:
ADF Statistic: -1.7771896249756545
p-value: 0.39184376103447915
	1%: -3.431
	5%: -2.862
	10%: -2.567



In [7]:
spot_data['log_price_diff'] = spot_data['log_price'].diff()
futures_data['log_price_diff'] = futures_data['log_price'].diff()

spot_data = spot_data.dropna()
futures_data = futures_data.dropna()
print("ADF Test for Spot Hourly Log Prices:")
adf_test(spot_data['log_price_diff'])
print("\nADF Test for Futures Hourly Log Prices:")
adf_test(futures_data['log_price_diff'])




ADF Test for Spot Hourly Log Prices:
ADF Statistic: -23.934617922772787
p-value: 0.0
	1%: -3.431
	5%: -2.862
	10%: -2.567


ADF Test for Futures Hourly Log Prices:
ADF Statistic: -23.920919255302085
p-value: 0.0
	1%: -3.431
	5%: -2.862
	10%: -2.567



In [8]:
warnings.filterwarnings("ignore", category=ValueWarning)
o_lag = johansen_test(spot_data['log_price_diff'], futures_data['log_price_diff'])

Optimal lag order (k_ar_diff) based on BIC: 29
Johansen Cointegration Test:
Trace Statistic:
[2578.01873618  904.9444291 ]
Critical Values (90%, 95%, 99%):
[[16.1619 18.3985 23.1485]
 [ 2.7055  3.8415  6.6349]]

Eigen Statistic:
[1673.07430709  904.9444291 ]
Critical Values (90%, 95%, 99%):
[[15.0006 17.1481 21.7465]
 [ 2.7055  3.8415  6.6349]]



In [9]:

ljung_box_test(spot_data['log_price_diff'], lags=o_lag, significance_level = 0.01, title="Spot Log-Price Differences")
ljung_box_test(futures_data['log_price_diff'], lags=o_lag, significance_level = 0.01, title="Futures Log-Price Differences")

Ljung-Box Q Test (Spot Log-Price Differences):
       lb_stat      lb_pvalue
1    16.979366   3.778826e-05
2    41.929112   7.856135e-10
3    43.486023   1.940476e-09
4    57.702233   8.812110e-12
5    60.974333   7.643963e-12
6    73.141976   9.255598e-14
7    73.264649   3.226187e-13
8   117.564411   1.054740e-21
9   164.247009   9.724843e-31
10  177.318686   8.437938e-33
11  177.533429   3.309025e-32
12  216.689996   1.152254e-39
13  303.926322   3.626429e-57
14  308.141385   2.366300e-57
15  308.605319   9.079553e-57
16  308.621679   4.168906e-56
17  317.693085   2.482146e-57
18  325.987015   2.120192e-58
19  333.074753   3.181112e-59
20  335.898803   3.557228e-59
21  347.146259   7.297740e-61
22  364.535603   8.199690e-64
23  369.647198   3.039561e-64
24  552.652635  1.841063e-101
25  558.518260  5.374722e-102
26  561.951431  4.970156e-102
27  589.600379  4.213243e-107
28  615.023919  1.037225e-111
29  652.677113  7.282996e-119
Autocorrelation found for lags: [1, 2, 3, 4, 5, 6, 7,

In [10]:
arch_test(spot_data['log_price_diff'], lags=o_lag, title="Spot Log-Price Differences")
arch_test(futures_data['log_price_diff'], lags=o_lag, title="Futures Log-Price Differences")

Engle's ARCH Test (Spot Log-Price Differences):
Test Statistic: 6351.108220717202
P-value: 0.0
Reject the null hypothesis: Conditional heteroscedasticity is present at significance level 0.05

Engle's ARCH Test (Futures Log-Price Differences):
Test Statistic: 6179.077399215191
P-value: 0.0
Reject the null hypothesis: Conditional heteroscedasticity is present at significance level 0.05



In [11]:
# Fit VECM model
data = pd.concat([spot_data['log_price_diff'], futures_data['log_price_diff']], axis=1)
data.columns = ['spot', 'future']
vecm_model = vecm.VECM(data, k_ar_diff=o_lag , coint_rank=1, deterministic='coli')
vecm_fit = vecm_model.fit()
vecm_resid = vecm_fit.resid
print(vecm_fit.summary())

Det. terms outside the coint. relation & lagged endog. parameters for equation spot
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.114e-08      0.000   9.82e-05      1.000      -0.000       0.000
L1.spot       12.0658      3.232      3.733      0.000       5.731      18.401
L1.future    -13.0600      3.230     -4.043      0.000     -19.392      -6.728
L2.spot       12.1413      3.187      3.809      0.000       5.894      18.388
L2.future    -13.1370      3.186     -4.124      0.000     -19.381      -6.893
L3.spot       12.8926      3.129      4.120      0.000       6.760      19.025
L3.future    -13.8688      3.127     -4.435      0.000     -19.998      -7.739
L4.spot       13.0342      3.060      4.260      0.000       7.037      19.031
L4.future    -13.9610      3.058     -4.566      0.000     -19.954      -7.968
L5.spot       12.5494      2.981      4.209    

In [12]:
print("Shape of the residuals:", vecm_resid.shape)
print("First few rows of the residuals:\n", vecm_resid[:5])

Shape of the residuals: (23102, 2)
First few rows of the residuals:
 [[-0.00243766 -0.003524  ]
 [ 0.00847916  0.01009953]
 [ 0.04311156  0.04265013]
 [-0.01796734 -0.01717423]
 [-0.01587348 -0.01275963]]


In [13]:
def newey_west_single_eq(residuals, nlags, kernel='bartlett', use_correction=True):
    nobs = residuals.shape[0]
    gamma0 = np.cov(residuals, bias=True)
    gamma_sum = 0

    if kernel == 'bartlett':
        weights_func = lambda h, nlags: 1 - h / (nlags + 1)
    else:
        raise NotImplementedError(f"Kernel '{kernel}' not implemented")

    for h in range(1, nlags + 1):
        gamma_h = np.cov(residuals[h:], residuals[:-h], bias=True)
        weight = weights_func(h, nlags)
        gamma_sum += weight * (gamma_h + gamma_h)

    nw_cov = gamma0 + gamma_sum

    if use_correction:
        correction = nobs / (nobs - nlags)
        nw_cov *= correction

    return nw_cov

nlags = o_lag
kernel = 'bartlett'
# Extract the residuals for each equation
spot_residuals = vecm_resid[:, 0]
future_residuals = vecm_resid[:, 1]

# Calculate the Newey-West estimator for each equation
nw_cov_spot = newey_west_single_eq(spot_residuals, nlags, kernel=kernel, use_correction=True)
nw_cov_future = newey_west_single_eq(future_residuals, nlags, kernel=kernel, use_correction=True)

# Print the Newey-West robust covariance estimates
print("Newey-West robust covariance estimate for spot equation:", nw_cov_spot)
print("Newey-West robust covariance estimate for future equation:", nw_cov_future)

Newey-West robust covariance estimate for spot equation: [[0.00893799 0.00023053]
 [0.00023053 0.00894012]]
Newey-West robust covariance estimate for future equation: [[0.00889163 0.00022827]
 [0.00022827 0.00889361]]


In [14]:
ljung_box_test(spot_residuals, lags=o_lag, significance_level = 0.01, title="Spot Residuals")

Ljung-Box Q Test (Spot Residuals):
       lb_stat     lb_pvalue
1     0.013603  9.071505e-01
2     0.025217  9.874707e-01
3     0.041717  9.977620e-01
4     0.113139  9.984590e-01
5     0.127117  9.997071e-01
6     0.134831  9.999514e-01
7     0.171213  9.999852e-01
8     0.193754  9.999966e-01
9     0.306777  9.999963e-01
10    0.307552  9.999994e-01
11    0.784169  9.999855e-01
12    2.968411  9.957630e-01
13    6.135391  9.410862e-01
14    8.524686  8.602424e-01
15   12.106422  6.709565e-01
16   17.532086  3.519992e-01
17   23.289201  1.400703e-01
18   28.375339  5.656831e-02
19   35.125962  1.348683e-02
20   44.703638  1.210243e-03
21   57.983941  2.565875e-05
22   76.908120  5.190087e-08
23   97.130727  4.386156e-11
24  123.371287  2.424716e-15
25  156.276448  5.817839e-21
26  188.564393  1.333389e-26
27  216.406260  1.801260e-31
28  243.078419  3.731516e-36
29  282.027801  2.840270e-43
Autocorrelation found for lags: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29] at significance level 

In [15]:
ljung_box_test(future_residuals, lags=o_lag, significance_level = 0.01, title="Future Residuals")

Ljung-Box Q Test (Future Residuals):
       lb_stat     lb_pvalue
1     0.014673  9.035854e-01
2     0.026627  9.867749e-01
3     0.044234  9.975583e-01
4     0.127287  9.980587e-01
5     0.149934  9.995611e-01
6     0.160931  9.999182e-01
7     0.191013  9.999785e-01
8     0.223230  9.999941e-01
9     0.351159  9.999934e-01
10    0.352493  9.999988e-01
11    0.859948  9.999767e-01
12    3.098457  9.948081e-01
13    6.387785  9.308945e-01
14    8.870225  8.392730e-01
15   12.586414  6.342090e-01
16   18.220294  3.111558e-01
17   23.920195  1.216196e-01
18   29.033044  4.797672e-02
19   35.840762  1.104088e-02
20   45.730795  8.774630e-04
21   59.144144  1.719496e-05
22   78.268528  3.115192e-08
23   99.002286  2.092118e-11
24  125.352500  1.069380e-15
25  158.448600  2.296326e-21
26  190.911809  4.774653e-27
27  218.889771  5.992472e-32
28  245.472164  1.279143e-36
29  283.896247  1.219126e-43
Autocorrelation found for lags: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29] at significance leve

In [16]:
o_lag = 19
print(breaks_cusumolsresid(vecm_resid))

(0.8009563610363691, 0.5425871778269112, [(1, 1.63), (5, 1.36), (10, 1.22)])


In [17]:
# Calculate the innovation covariance matrix
omega = np.cov(np.stack([spot_residuals, future_residuals]), bias=True)
# Calculate the innovation correlation matrix
sigma_s = np.std(spot_residuals)
sigma_f = np.std(future_residuals)
rho = omega[0, 1] / (sigma_s * sigma_f)
phi = np.array([[1, rho], [rho, 1]])

# Find the eigenvalues and eigenvectors of the correlation matrix
lamda, G = np.linalg.eig(phi)

# Calculate the matrix V
V = np.diag([sigma_s, sigma_f])

# Calculate the matrix M*
M_star = np.linalg.inv(np.dot(np.dot(G, np.diag(np.sqrt(lamda))), np.dot(G.T, np.linalg.inv(V))))

# Get the error correction coefficient vector
alpha_s, alpha_f = vecm_fit.alpha

# Calculate the MIS for spot and future markets
MIS_s = ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2) / ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2 + (alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2)
MIS_f = ((alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2) / ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2 + (alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2)

# Print the MIS for spot and future markets
print("MIS for spot market:", MIS_s)
print("MIS for future market:", MIS_f)

MIS for spot market: [0.56896505]
MIS for future market: [0.43103495]
