In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from scipy.stats import skew, kurtosis
import statsmodels.tsa.vector_ar.vecm as vecm
from statsmodels.tsa.vector_ar.vecm import coint_johansen, VECM
from statsmodels.tsa.stattools import adfuller, acf
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.stats.sandwich_covariance import cov_hac
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch, breaks_cusumolsresid
import statsmodels.stats.sandwich_covariance as sw
from statsmodels.stats.sandwich_covariance import cov_hac, weights_bartlett



In [2]:
btc = pd.read_csv('BTCUSDT_filtered.csv')
btc_f = pd.read_csv('BTCUSDT_futures_filtered.csv')
eth = pd.read_csv('ETHUSDT_filtered.csv')
eth_f = pd.read_csv('ETHUSDT_futures_filtered.csv')
bch = pd.read_csv('BCHUSDT_filtered.csv')
bch_f = pd.read_csv('BCHUSDT_futures_filtered.csv')
doge = pd.read_csv('DOGEUSDT_filtered.csv')
doge_f = pd.read_csv('DOGEUSDT_futures_filtered.csv')
spot_data = eth.copy()
futures_data = eth_f.copy()

In [3]:

def describe_returns(data):
    mean = data['daily_return'].mean()
    std_dev = data['daily_return'].std()
    skewness = skew(data['daily_return'].dropna())
    kurt = kurtosis(data['daily_return'].dropna())
    autocorr = data['daily_return'].autocorr()

    return {'mean': mean, 'std_dev': std_dev, 'skewness': skewness, 'kurtosis': kurt, 'autocorrelation': autocorr}

# Selecting optimal lag order using BIC
def select_k_ar_diff(data, maxlags=168, trend='ct'):
    bic_values = []
    for lag in range(1, maxlags + 1):
        model = VAR(data)
        result = model.fit(lag, trend=trend)
        bic_values.append(result.bic)
    
    optimal_lag = bic_values.index(min(bic_values)) + 1
    return optimal_lag

#ADF Test
def adf_test(series):
    result = adfuller(series)
    print(f"ADF Statistic: {result[0]}")
    print(f"p-value: {result[1]}")
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    print("")

# Johansen cointegration test
def johansen_test(s1, s2):
    df = pd.concat([s1, s2], axis=1).dropna()
    df.columns = ['spot_log_price', 'futures_log_price']
    k_ar_diff = select_k_ar_diff(df)
    print(f"Optimal lag order (k_ar_diff) based on BIC: {k_ar_diff}")

    result = coint_johansen(df, det_order=1, k_ar_diff=k_ar_diff)
    print("Johansen Cointegration Test:")
    print("Trace Statistic:")
    print(result.lr1)
    print("Critical Values (90%, 95%, 99%):")
    print(result.cvt)
    print("")
    print("Eigen Statistic:")
    print(result.lr2)
    print("Critical Values (90%, 95%, 99%):")
    print(result.cvm)
    print("")
    return k_ar_diff

# Ljung-Box Q test
def ljung_box_test(series, lags=None, significance_level=0.05, title=''):
    result = acorr_ljungbox(series, lags=lags, return_df=True)
    print(f"Ljung-Box Q Test ({title}):")
    print(result)

    autocorrelated_lags = result[result['lb_pvalue'] < significance_level].index
    if autocorrelated_lags.empty:
        print(f"No autocorrelation found up to lag {lags} at significance level {significance_level}")
    else:
        print(f"Autocorrelation found for lags: {autocorrelated_lags.tolist()} at significance level {significance_level}")
    print("")

# Engle's ARCH test
def arch_test(series, lags=None, title=''):
    test_stat, p_value, _, _ = het_arch(series, nlags=lags)
    print(f"Engle's ARCH Test ({title}):")
    print(f"Test Statistic: {test_stat}")
    print(f"P-value: {p_value}")

    significance_level = 0.05
    if p_value < significance_level:
        print(f"Reject the null hypothesis: Conditional heteroscedasticity is present at significance level {significance_level}")
    else:
        print(f"Fail to reject the null hypothesis: No evidence of conditional heteroscedasticity at significance level {significance_level}")
    print("")


In [4]:
# Convert the timestamp to datetime
spot_data['Date'] = pd.to_datetime(spot_data['Date'])
futures_data['Date'] = pd.to_datetime(futures_data['Date'])

# Calculate log prices
spot_data['log_price'] = np.log(spot_data['Close'])
futures_data['log_price'] = np.log(futures_data['Close'])

# Resample hourly data to daily data
spot_daily = spot_data.resample('D', on='Date').last()
futures_daily = futures_data.resample('D', on='Date').last()

# Calculate daily returns
spot_daily['daily_return'] = spot_daily['log_price'].pct_change()
futures_daily['daily_return'] = futures_daily['log_price'].pct_change()
spot_daily = spot_daily.dropna()
futures_daily = futures_daily.dropna()


In [5]:
spot_stats = describe_returns(spot_daily)
futures_stats = describe_returns(futures_daily)

print("Spot Daily Returns Description:", spot_stats)
print("Futures Daily Returns Description:", futures_stats)

Spot Daily Returns Description: {'mean': 0.0003813296269478374, 'std_dev': 0.007962728963726653, 'skewness': -2.290599501562552, 'kurtosis': 36.66785892830594, 'autocorrelation': -0.12817170858017124}
Futures Daily Returns Description: {'mean': 0.0003819435973488823, 'std_dev': 0.00802084713688604, 'skewness': -2.3500107774949988, 'kurtosis': 38.39720118210355, 'autocorrelation': -0.13286766005554326}


In [6]:
# Test for stationarity
print("ADF Test for Spot Hourly Log Prices:")
adf_test(spot_data['log_price'])
print("\nADF Test for Futures Hourly Log Prices:")
adf_test(futures_data['log_price'])

ADF Test for Spot Hourly Log Prices:
ADF Statistic: -2.1394517943286204
p-value: 0.22894946632329571
	1%: -3.431
	5%: -2.862
	10%: -2.567


ADF Test for Futures Hourly Log Prices:
ADF Statistic: -2.139353099800625
p-value: 0.22898765329175103
	1%: -3.431
	5%: -2.862
	10%: -2.567



In [7]:
spot_data['log_price_diff'] = spot_data['log_price'].diff()
futures_data['log_price_diff'] = futures_data['log_price'].diff()

spot_data = spot_data.dropna()
futures_data = futures_data.dropna()
print("ADF Test for Spot Hourly Log Prices:")
adf_test(spot_data['log_price_diff'])
print("\nADF Test for Futures Hourly Log Prices:")
adf_test(futures_data['log_price_diff'])




ADF Test for Spot Hourly Log Prices:
ADF Statistic: -23.15504740599775
p-value: 0.0
	1%: -3.431
	5%: -2.862
	10%: -2.567


ADF Test for Futures Hourly Log Prices:
ADF Statistic: -23.196092229002826
p-value: 0.0
	1%: -3.431
	5%: -2.862
	10%: -2.567



In [8]:
warnings.filterwarnings("ignore", category=ValueWarning)
o_lag = johansen_test(spot_data['log_price_diff'], futures_data['log_price_diff'])

Optimal lag order (k_ar_diff) based on BIC: 24
Johansen Cointegration Test:
Trace Statistic:
[3506.93613791 1159.27651096]
Critical Values (90%, 95%, 99%):
[[16.1619 18.3985 23.1485]
 [ 2.7055  3.8415  6.6349]]

Eigen Statistic:
[2347.65962695 1159.27651096]
Critical Values (90%, 95%, 99%):
[[15.0006 17.1481 21.7465]
 [ 2.7055  3.8415  6.6349]]



In [9]:

ljung_box_test(spot_data['log_price_diff'], lags=o_lag, significance_level = 0.01, title="Spot Log-Price Differences")
ljung_box_test(futures_data['log_price_diff'], lags=o_lag, significance_level = 0.01, title="Futures Log-Price Differences")

Ljung-Box Q Test (Spot Log-Price Differences):
       lb_stat     lb_pvalue
1     1.499492  2.207496e-01
2    18.728489  8.573543e-05
3    19.706211  1.952790e-04
4    26.932413  2.051424e-05
5    29.449640  1.892255e-05
6    32.174422  1.510812e-05
7    32.626864  3.106752e-05
8    36.980440  1.160463e-05
9    37.187156  2.436933e-05
10   40.979117  1.138063e-05
11   41.726849  1.805843e-05
12   44.550563  1.230124e-05
13   65.055869  6.440627e-09
14   65.363508  1.317441e-08
15   66.784378  1.659432e-08
16   66.805122  3.594453e-08
17   68.997326  3.207164e-08
18   70.112171  4.326029e-08
19   70.194103  8.539289e-08
20   73.648872  4.569970e-08
21   82.270289  3.358589e-09
22   90.229194  3.132831e-10
23   92.761410  2.433924e-10
24  148.596061  6.036816e-20
Autocorrelation found for lags: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] at significance level 0.01

Ljung-Box Q Test (Futures Log-Price Differences):
       lb_stat     lb_pvalue
1   

In [10]:
arch_test(spot_data['log_price_diff'], lags=o_lag, title="Spot Log-Price Differences")
arch_test(futures_data['log_price_diff'], lags=o_lag, title="Futures Log-Price Differences")

Engle's ARCH Test (Spot Log-Price Differences):
Test Statistic: 4090.7403899546275
P-value: 0.0
Reject the null hypothesis: Conditional heteroscedasticity is present at significance level 0.05

Engle's ARCH Test (Futures Log-Price Differences):
Test Statistic: 4240.112056617033
P-value: 0.0
Reject the null hypothesis: Conditional heteroscedasticity is present at significance level 0.05



In [11]:
# Fit VECM model
data = pd.concat([spot_data['log_price_diff'], futures_data['log_price_diff']], axis=1)
data.columns = ['spot', 'future']
vecm_model = vecm.VECM(data, k_ar_diff=o_lag , coint_rank=1, deterministic='coli')
vecm_fit = vecm_model.fit()
vecm_resid = vecm_fit.resid
print(vecm_fit.summary())

Det. terms outside the coint. relation & lagged endog. parameters for equation spot
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.179e-05   6.18e-05     -0.191      0.849      -0.000       0.000
L1.spot      -23.6710      3.105     -7.623      0.000     -29.757     -17.585
L1.future     22.6486      3.098      7.311      0.000      16.577      28.720
L2.spot      -22.5412      3.044     -7.406      0.000     -28.507     -16.576
L2.future     21.5367      3.036      7.094      0.000      15.586      27.487
L3.spot      -21.0855      2.967     -7.107      0.000     -26.900     -15.271
L3.future     20.1201      2.959      6.799      0.000      14.320      25.920
L4.spot      -19.3158      2.877     -6.713      0.000     -24.956     -13.676
L4.future     18.3833      2.870      6.405      0.000      12.758      24.009
L5.spot      -17.8856      2.779     -6.436    

In [12]:
print("Shape of the residuals:", vecm_resid.shape)
print("First few rows of the residuals:\n", vecm_resid[:5])

Shape of the residuals: (27658, 2)
First few rows of the residuals:
 [[-0.00506587 -0.00473405]
 [ 0.00514941  0.00550065]
 [ 0.00154336  0.00143446]
 [-0.00089724 -0.00070662]
 [-0.00023368  0.00031823]]


In [13]:
def newey_west_single_eq(residuals, nlags, kernel='bartlett', use_correction=True):
    nobs = residuals.shape[0]
    gamma0 = np.cov(residuals, bias=True)
    gamma_sum = 0

    if kernel == 'bartlett':
        weights_func = lambda h, nlags: 1 - h / (nlags + 1)
    else:
        raise NotImplementedError(f"Kernel '{kernel}' not implemented")

    for h in range(1, nlags + 1):
        gamma_h = np.cov(residuals[h:], residuals[:-h], bias=True)
        weight = weights_func(h, nlags)
        gamma_sum += weight * (gamma_h + gamma_h)

    nw_cov = gamma0 + gamma_sum

    if use_correction:
        correction = nobs / (nobs - nlags)
        nw_cov *= correction

    return nw_cov

nlags = o_lag
kernel = 'bartlett'
# Extract the residuals for each equation
spot_residuals = vecm_resid[:, 0]
future_residuals = vecm_resid[:, 1]

# Calculate the Newey-West estimator for each equation
nw_cov_spot = newey_west_single_eq(spot_residuals, nlags, kernel=kernel, use_correction=True)
nw_cov_future = newey_west_single_eq(future_residuals, nlags, kernel=kernel, use_correction=True)

# Print the Newey-West robust covariance estimates
print("Newey-West robust covariance estimate for spot equation:", nw_cov_spot)
print("Newey-West robust covariance estimate for future equation:", nw_cov_future)

Newey-West robust covariance estimate for spot equation: [[2.64127351e-03 7.46594144e-05]
 [7.46594144e-05 2.64098823e-03]]
Newey-West robust covariance estimate for future equation: [[2.66767838e-03 7.57600196e-05]
 [7.57600196e-05 2.66739857e-03]]


In [14]:
ljung_box_test(spot_residuals, lags=o_lag, significance_level = 0.01, title="Spot Residuals")

Ljung-Box Q Test (Spot Residuals):
       lb_stat     lb_pvalue
1     0.000447  9.831364e-01
2     0.001622  9.991892e-01
3     0.087236  9.933239e-01
4     0.346526  9.866161e-01
5     0.959759  9.657458e-01
6     2.239712  8.963853e-01
7     4.610833  7.073333e-01
8     6.984219  5.383362e-01
9    10.525421  3.096452e-01
10   15.228374  1.239549e-01
11   21.151173  3.183826e-02
12   28.734238  4.313982e-03
13   41.048373  9.357909e-05
14   54.498192  1.055608e-06
15   70.409166  3.776564e-09
16   91.166447  1.524561e-12
17  115.709214  1.008032e-16
18  143.005306  1.684797e-21
19  174.325471  4.035394e-27
20  207.754024  3.270642e-33
21  246.477884  2.090752e-40
22  290.372990  1.087839e-48
23  337.403833  1.176661e-57
24  383.555093  1.766844e-66
Autocorrelation found for lags: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] at significance level 0.01



In [15]:
ljung_box_test(future_residuals, lags=o_lag, significance_level = 0.01, title="Future Residuals")

Ljung-Box Q Test (Future Residuals):
       lb_stat     lb_pvalue
1     0.000311  9.859346e-01
2     0.001533  9.992339e-01
3     0.083532  9.937376e-01
4     0.334117  9.875067e-01
5     0.929468  9.680517e-01
6     2.181724  9.022496e-01
7     4.507639  7.197972e-01
8     6.804116  5.579072e-01
9    10.212356  3.335691e-01
10   14.772931  1.405623e-01
11   20.521780  3.867784e-02
12   27.865935  5.787322e-03
13   39.903663  1.432580e-04
14   53.078180  1.845068e-06
15   68.701204  7.601847e-09
16   89.062302  3.722428e-12
17  113.121198  3.113106e-16
18  139.866231  6.796145e-21
19  170.641515  2.128071e-26
20  203.579677  2.200758e-32
21  241.934705  1.701188e-39
22  285.366485  1.118757e-47
23  331.941432  1.523474e-56
24  377.912255  2.524123e-65
Autocorrelation found for lags: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] at significance level 0.01



In [16]:
o_lag = 10
print(breaks_cusumolsresid(vecm_resid))

(0.3206530841066789, 0.9999519128907016, [(1, 1.63), (5, 1.36), (10, 1.22)])


In [17]:
# Calculate the innovation covariance matrix
omega = np.cov(np.stack([spot_residuals, future_residuals]), bias=True)
# Calculate the innovation correlation matrix
sigma_s = np.std(spot_residuals)
sigma_f = np.std(future_residuals)
rho = omega[0, 1] / (sigma_s * sigma_f)
phi = np.array([[1, rho], [rho, 1]])

# Find the eigenvalues and eigenvectors of the correlation matrix
lamda, G = np.linalg.eig(phi)

# Calculate the matrix V
V = np.diag([sigma_s, sigma_f])

# Calculate the matrix M*
M_star = np.linalg.inv(np.dot(np.dot(G, np.diag(np.sqrt(lamda))), np.dot(G.T, np.linalg.inv(V))))

# Get the error correction coefficient vector
alpha_s, alpha_f = vecm_fit.alpha

# Calculate the MIS for spot and future markets
MIS_s = ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2) / ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2 + (alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2)
MIS_f = ((alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2) / ((alpha_s * M_star[0, 0] + alpha_f * M_star[1, 0]) ** 2 + (alpha_s * M_star[0, 1] + alpha_f * M_star[1, 1]) ** 2)

# Print the MIS for spot and future markets
print("MIS for spot market:", MIS_s)
print("MIS for future market:", MIS_f)

MIS for spot market: [0.33710612]
MIS for future market: [0.66289388]
