In [1]:
%matplotlib inline

import numpy as np 
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

from statsmodels.tsa.stattools import adfuller 
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.vector_ar.vecm import coint_johansen #Johansen Cointegration test

In [2]:
There are two types of Johansen’s test: one uses trace (from linear algebra), the other a maximum eigenvalue approach (an eigenvalue is a special scalar; When you multiply a matrix by a vector and get the same vector as an answer, along with a new scalar, the scalar is called an eigenvalue).

Both forms of the test will determine if cointegration is present. The null hypothesis for both forms of test is that there are no cointegrating equations. The difference is in the alternate hypothesis: the trace test alternate hypothesis is simply that the number of cointegrating relationships is at least one (shown by the number of linear combinations). The maximum eigenvalue test has an alternate hypothesis of K0 + 1 (instead of K > K0). Rejecting the null hypothesis in this situation is basically stating there is only one combination of the non-stationary variables that gives a stationary process.

SyntaxError: invalid syntax (<ipython-input-2-f8811be2873e>, line 1)

In [3]:
dta = sm.datasets.webuse('lutkepohl2', 'https://www.stata-press.com/data/r12/') #German macro dataset by H.lutkepohl
dta.index = dta.qtr
dta.index.freq = dta.index.inferred_freq

dta.head()

Unnamed: 0_level_0,inv,inc,consump,qtr,ln_inv,dln_inv,ln_inc,dln_inc,ln_consump,dln_consump
qtr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1960-01-01,180,451,415,1960-01-01,5.192957,,6.111467,,6.028278,
1960-04-01,179,465,421,1960-04-01,5.187386,-0.005571,6.142037,0.03057,6.042633,0.014355
1960-07-01,185,485,434,1960-07-01,5.220356,0.03297,6.184149,0.042111,6.073044,0.030411
1960-10-01,192,493,448,1960-10-01,5.257495,0.037139,6.200509,0.01636,6.104793,0.031749
1961-01-01,211,509,459,1961-01-01,5.351858,0.094363,6.232448,0.031939,6.12905,0.024257


In [4]:
#if det_order=0 and f=0, then we take out the constant from endog, and then we take out the constant from the diff equation. the first removes the mean of y, the second removes the mean of delta y, i.e. takes out the linear trend in y.
#if det_order=1 and f=0, the first removes the linear trend, the second removes the constant in the first differences, which is again the linear trend.
#if det_order=-1, then f=-1 and we don't remove any deterministic trend parts.


joh_model1 = coint_johansen(dta[['ln_inv','ln_inc','ln_consump']],-1,1) # k_ar_diff +1 = K


In [5]:
joh_model1

<statsmodels.tsa.vector_ar.vecm.JohansenTestResult at 0x21cbfa15c88>

In [6]:
joh_model1.lr2 #Max Eign Value Stats

array([43.03085717,  9.35556926,  5.66105395])

In [7]:
joh_model1.lr1 #Trace Stats

array([58.04748038, 15.01662321,  5.66105395])

In [10]:
joh_model1.trace_stat #Trace Stats

array([58.04748038, 15.01662321,  5.66105395])

In [11]:
dta[['ln_inv','ln_inc','ln_consump']].shape #Shape of Data

(92, 3)

In [8]:
def joh_output(res):
    output = pd.DataFrame([res.lr2,res.lr1],
                          index=['max_eig_stat',"trace_stat"])
    print(output.T,'\n')
    print("Critical values(90%, 95%, 99%) of max_eig_stat\n",res.cvm,'\n')
    print("Critical values(90%, 95%, 99%) of trace_stat\n",res.cvt,'\n')

In [9]:
joh_output(joh_model1)

   max_eig_stat  trace_stat
0     43.030857   58.047480
1      9.355569   15.016623
2      5.661054    5.661054 

Critical values(90%, 95%, 99%) of max_eig_stat
 [[15.7175 17.7961 22.2519]
 [ 9.4748 11.2246 15.0923]
 [ 2.9762  4.1296  6.9406]] 

Critical values(90%, 95%, 99%) of trace_stat
 [[21.7781 24.2761 29.5147]
 [10.4741 12.3212 16.364 ]
 [ 2.9762  4.1296  6.9406]] 



In [None]:
#In the table, you should see 4 columns. The test column contains the test statistics, while the three other columns contain the critical values at a 10 percent, 5 percent, and a 1 percent level. For this case, as standard practice, we often use the 5 percent critical value as reference. The r in the table represents the rank and we know that this is some indication of the number of cointegrating relationships. When r = 0, the test stat 87.77 > 22. This means that we reject the null hypothesis which suggests that r > 0. As such, there is some cointegration present. When r <1, 21.64 > 15.67. This again means that we reject the null hypothesis which suggests that r > 1. Lastly, when r < 2, we fail to reject the null hypothesis because 7.89 < 9.24. Therefore, we conclude that there are at most 2 cointegrating relationships present.

In [13]:
#cvm = #Critical values (90%, 95%, 99%) of maximum eigenvalue statistic.

#cvt = #Critical values (90%, 95%, 99%) of trace statistic
traces = joh_model1.lr1
maxeig=joh_model1.lr2
cvts = joh_model1.cvt  ## 0: 90%  1:95% 2: 99%
cvms = joh_model1.cvm   ## 0: 90%  1:95% 2: 99%

In [16]:
cvts[0,1]

24.2761

In [36]:
#Using Traces

for i in range(3):
    if traces[i] > cvts[i, 1]:
        r = i + 1
print(r)

3


In [37]:
#Using Eign Values

for i in range(3):
    if maxeig[i] > cvms[i, 1]:
        r = i + 1
print(r)

3


In [38]:
from statsmodels.tsa.vector_ar.vecm import select_coint_rank

In [41]:
rank=select_coint_rank(dta[['ln_inv','ln_inc','ln_consump']],-1,1)
print(rank.rank)
#result=stt.coint(endog['Sales'], endog['Customers'])

3


In [34]:
from statsmodels.tsa.stattools import coint

In [35]:
coint(dta['ln_inv'],dta['ln_inc'])

(-3.861632869897167,
 0.01119643616982325,
 array([-4.02083922, -3.40409789, -3.09138506]))

In [None]:
There are two types of Johansen’s test: one uses trace (from linear algebra), the other a maximum eigenvalue approach (an eigenvalue is a special scalar; When you multiply a matrix by a vector and get the same vector as an answer, along with a new scalar, the scalar is called an eigenvalue).

Both forms of the test will determine if cointegration is present. The null hypothesis for both forms of test is that there are no cointegrating equations. The difference is in the alternate hypothesis: the trace test alternate hypothesis is simply that the number of cointegrating relationships is at least one (shown by the number of linear combinations). The maximum eigenvalue test has an alternate hypothesis of K0 + 1 (instead of K > K0). Rejecting the null hypothesis in this situation is basically stating there is only one combination of the non-stationary variables that gives a stationary process.

In [None]:
https://notes.quantecon.org/submission/5db25b54831cf4001af7e506
    
https://www.kaggle.com/saritm/vector-error-correction-fx