diff --git a/statsmodels/compatnp/py3k.py b/statsmodels/compatnp/py3k.py index aab8807205e..11359fd8f43 100644 --- a/statsmodels/compatnp/py3k.py +++ b/statsmodels/compatnp/py3k.py @@ -34,6 +34,7 @@ def isfileobj(f): def open_latin1(filename, mode='r'): return open(filename, mode=mode, encoding='iso-8859-1') strchar = 'U' + string_types = basestring from io import BytesIO, StringIO #statsmodels else: bytes = str @@ -52,6 +53,7 @@ def open_latin1(filename, mode='r'): return open(filename, mode=mode) from StringIO import StringIO BytesIO = StringIO + string_types = str def getexception(): return sys.exc_info()[1] diff --git a/statsmodels/sandbox/stats/diagnostic.py b/statsmodels/sandbox/stats/diagnostic.py index 01617142cc2..cf46da9d722 100644 --- a/statsmodels/sandbox/stats/diagnostic.py +++ b/statsmodels/sandbox/stats/diagnostic.py @@ -34,7 +34,8 @@ from scipy import stats from statsmodels.regression.linear_model import OLS from statsmodels.tools.tools import add_constant -from statsmodels.tsa.stattools import acf, adfuller +from statsmodels.tsa.stattools import acf +from statsmodels.tsa.unitroot import adfuller from statsmodels.tsa.tsatools import lagmat #get the old signature back so the examples work diff --git a/statsmodels/tsa/adfvalues.py b/statsmodels/tsa/adfvalues.py deleted file mode 100644 index 0aab1887007..00000000000 --- a/statsmodels/tsa/adfvalues.py +++ /dev/null @@ -1,387 +0,0 @@ -from scipy.stats import norm -from numpy import array, polyval, inf, asarray - -__all__ = ['mackinnonp','mackinnoncrit'] - -# These are the cut-off values for the left-tail vs. the rest of the -# tau distribution, for getting the p-values - -tau_star_nc = [-1.04, -1.53, -2.68, -3.09, -3.07, -3.77] -tau_min_nc = [-19.04,-19.62,-21.21,-23.25,-21.63,-25.74] -tau_max_nc = [inf,1.51,0.86,0.88,1.05,1.24] -tau_star_c = [-1.61, -2.62, -3.13, -3.47, -3.78, -3.93] -tau_min_c = [-18.83,-18.86,-23.48,-28.07,-25.96,-23.27] -tau_max_c = [2.74,0.92,0.55,0.61,0.79,1] -tau_star_ct = [-2.89, -3.19, -3.50, -3.65, -3.80, -4.36] -tau_min_ct = [-16.18,-21.15,-25.37,-26.63,-26.53,-26.18] -tau_max_ct = [0.7,0.63,0.71,0.93,1.19,1.42] -tau_star_ctt = [-3.21,-3.51,-3.81,-3.83,-4.12,-4.63] -tau_min_ctt = [-17.17,-21.1,-24.33,-24.03,-24.33,-28.22] -tau_max_ctt = [0.54,0.79,1.08,1.43,3.49,1.92] - -small_scaling = array([1,1,1e-2]) -tau_nc_smallp = [ [0.6344,1.2378,3.2496], - [1.9129,1.3857,3.5322], - [2.7648,1.4502,3.4186], - [3.4336,1.4835,3.19], - [4.0999,1.5533,3.59], - [4.5388,1.5344,2.9807]] -tau_nc_smallp = asarray(tau_nc_smallp)*small_scaling - -tau_c_smallp = [ [2.1659,1.4412,3.8269], - [2.92,1.5012,3.9796], - [3.4699,1.4856,3.164], - [3.9673,1.4777,2.6315], - [4.5509,1.5338,2.9545], - [5.1399,1.6036,3.4445]] -tau_c_smallp = asarray(tau_c_smallp)*small_scaling - -tau_ct_smallp = [ [3.2512,1.6047,4.9588], - [3.6646,1.5419,3.6448], - [4.0983,1.5173,2.9898], - [4.5844,1.5338,2.8796], - [5.0722,1.5634,2.9472], - [5.53,1.5914,3.0392]] -tau_ct_smallp = asarray(tau_ct_smallp)*small_scaling - -tau_ctt_smallp = [ [4.0003,1.658,4.8288], - [4.3534,1.6016,3.7947], - [4.7343,1.5768,3.2396], - [5.214,1.6077,3.3449], - [5.6481,1.6274,3.3455], - [5.9296,1.5929,2.8223]] -tau_ctt_smallp = asarray(tau_ctt_smallp)*small_scaling - -large_scaling = array([1,1e-1,1e-1,1e-2]) -tau_nc_largep = [ [0.4797,9.3557,-0.6999,3.3066], - [1.5578,8.558,-2.083,-3.3549], - [2.2268,6.8093,-3.2362,-5.4448], - [2.7654,6.4502,-3.0811,-4.4946], - [3.2684,6.8051,-2.6778,-3.4972], - [3.7268,7.167,-2.3648,-2.8288]] -tau_nc_largep = 
asarray(tau_nc_largep)*large_scaling - -tau_c_largep = [ [1.7339,9.3202,-1.2745,-1.0368], - [2.1945,6.4695,-2.9198,-4.2377], - [2.5893,4.5168,-3.6529,-5.0074], - [3.0387,4.5452,-3.3666,-4.1921], - [3.5049,5.2098,-2.9158,-3.3468], - [3.9489,5.8933,-2.5359,-2.721]] -tau_c_largep = asarray(tau_c_largep)*large_scaling - -tau_ct_largep = [ [2.5261,6.1654,-3.7956,-6.0285], - [2.85,5.272,-3.6622,-5.1695], - [3.221,5.255,-3.2685,-4.1501], - [3.652,5.9758,-2.7483,-3.2081], - [4.0712,6.6428,-2.3464,-2.546], - [4.4735,7.1757,-2.0681,-2.1196] ] -tau_ct_largep = asarray(tau_ct_largep)*large_scaling - -tau_ctt_largep = [ [3.0778,4.9529,-4.1477,-5.9359], - [3.4713,5.967,-3.2507,-4.2286], - [3.8637,6.7852,-2.6286,-3.1381], - [4.2736,7.6199,-2.1534,-2.4026], - [4.6679,8.2618,-1.822,-1.9147], - [5.0009,8.3735,-1.6994,-1.6928]] -tau_ctt_largep = asarray(tau_ctt_largep)*large_scaling - - -#NOTE: The Z-statistic is used when lags are included to account for -# serial correlation in the error term - -z_star_nc = [-2.9,-8.7,-14.8,-20.9,-25.7,-30.5] -z_star_c = [-8.9,-14.3,-19.5,-25.1,-29.6,-34.4] -z_star_ct = [-15.0,-19.6,-25.3,-29.6,-31.8,-38.4] -z_star_ctt = [-20.7,-25.3,-29.9,-34.4,-38.5,-44.2] - - -# These are Table 5 from MacKinnon (1994) -# small p is defined as p in .005 to .150 ie p = .005 up to z_star -# Z* is the largest value for which it is appropriate to use these -# approximations -# the left tail approximation is -# p = norm.cdf(d_0 + d_1*log(abs(z)) + d_2*log(abs(z))**2 + d_3*log(abs(z))**3 -# there is no Z-min, ie., it is well-behaved in the left tail - -z_nc_smallp = array([[.0342, -.6376,0,-.03872], - [1.3426,-.7680,0,-.04104], - [3.8607,-2.4159,.51293,-.09835], - [6.1072,-3.7250,.85887,-.13102], - [7.7800,-4.4579,1.00056,-.14014], - [4.0253, -.8815,0,-.04887]]) - -z_c_smallp = array([[2.2142,-1.7863,.32828,-.07727], - [1.1662,.1814,-.36707,0], - [6.6584,-4.3486,1.04705,-.15011], - [3.3249,-.8456,0,-.04818], - [4.0356,-.9306,0,-.04776], - [13.9959,-8.4314,1.97411,-.22234]]) - -z_ct_smallp = array([ [4.6476,-2.8932,0.5832,-0.0999], - [7.2453,-4.7021,1.127,-.15665], - [3.4893,-0.8914,0,-.04755], - [1.6604,1.0375,-0.53377,0], - [2.006,1.1197,-0.55315,0], - [11.1626,-5.6858,1.21479,-.15428]]) - -z_ctt_smallp = array([ [3.6739,-1.1549,0,-0.03947], - [3.9783,-1.0619,0,-0.04394], - [2.0062,0.8907,-0.51708,0], - [4.9218,-1.0663,0,-0.04691], - [5.1433,-0.9877,0,-0.04993], - [23.6812,-14.6485,3.42909,-.33794]]) -# These are Table 6 from MacKinnon (1994). -# These are well-behaved in the right tail. 
-# the approximation function is -# p = norm.cdf(d_0 + d_1 * z + d_2*z**2 + d_3*z**3 + d_4*z**4) -z_large_scaling = array([1,1e-1,1e-2,1e-3,1e-5]) -z_nc_largep = array([ [0.4927,6.906,13.2331,12.099,0], - [1.5167,4.6859,4.2401,2.7939,7.9601], - [2.2347,3.9465,2.2406,0.8746,1.4239], - [2.8239,3.6265,1.6738,0.5408,0.7449], - [3.3174,3.3492,1.2792,0.3416,0.3894], - [3.729,3.0611,0.9579,0.2087,0.1943]]) -z_nc_largep *= z_large_scaling - -z_c_largep = array([ [1.717,5.5243,4.3463,1.6671,0], - [2.2394,4.2377,2.432,0.9241,0.4364], - [2.743,3.626,1.5703,0.4612,0.567], - [3.228,3.3399,1.2319,0.3162,0.3482], - [3.6583,3.0934,0.9681,0.2111,0.1979], - [4.0379,2.8735,0.7694,0.1433,0.1146]]) -z_c_largep *= z_large_scaling - -z_ct_largep = array([ [2.7117,4.5731,2.2868,0.6362,0.5], - [3.0972,4.0873,1.8982,0.5796,0.7384], - [3.4594,3.6326,1.4284,0.3813,0.4325], - [3.806,3.2634,1.0689,0.2402,0.2304], - [4.1402,2.9867,0.8323,0.16,0.1315], - [4.4497,2.7534,0.6582,0.1089,0.0773]]) -z_ct_largep *= z_large_scaling - -z_ctt_largep = array([ [3.4671,4.3476,1.9231,0.5381,0.6216], - [3.7827,3.9421,1.5699,0.4093,0.4485], - [4.052,3.4947,1.1772,0.2642,0.2502], - [4.3311,3.1625,0.9126,0.1775,0.1462], - [4.594,2.8739,0.707,0.1181,0.0838], - [4.8479,2.6447,0.5647,0.0827,0.0518]]) -z_ctt_largep *= z_large_scaling - -#TODO: finish this and then integrate them into adf function -def mackinnonp(teststat, regression="c", N=1, lags=None): - """ - Returns MacKinnon's approximate p-value for teststat. - - Parameters - ---------- - teststat : float - "T-value" from an Augmented Dickey-Fuller regression. - regression : str {"c", "nc", "ct", "ctt"} - This is the method of regression that was used. Following MacKinnon's - notation, this can be "c" for constant, "nc" for no constant, "ct" for - constant and trend, and "ctt" for constant, trend, and trend-squared. - N : int - The number of series believed to be I(1). For (Augmented) Dickey- - Fuller N = 1. - - Returns - ------- - p-value : float - The p-value for the ADF statistic estimated using MacKinnon 1994. - - References - ---------- - MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for - Unit-Root and Cointegration Tests." Journal of Business & Economics - Statistics, 12.2, 167-76. 
- - Notes - ----- - For (A)DF - H_0: AR coefficient = 1 - H_a: AR coefficient < 1 - """ - maxstat = eval("tau_max_"+regression) - minstat = eval("tau_min_"+regression) - starstat = eval("tau_star_"+regression) - if teststat > maxstat[N-1]: - return 1.0 - elif teststat < minstat[N-1]: - return 0.0 - if teststat <= starstat[N-1]: - tau_coef = eval("tau_" + regression + "_smallp["+str(N-1)+"]") -# teststat = np.log(np.abs(teststat)) -#above is only for z stats - else: - tau_coef = eval("tau_" + regression + "_largep["+str(N-1)+"]") - return norm.cdf(polyval(tau_coef[::-1], teststat)) - -# These are the new estimates from MacKinnon 2010 -# the first axis is N -1 -# the second axis is 1 %, 5 %, 10 % -# the last axis is the coefficients - -tau_nc_2010 = [[ [-2.56574,-2.2358,-3.627,0], # N = 1 - [-1.94100,-0.2686,-3.365,31.223], - [-1.61682, 0.2656, -2.714, 25.364]]] -tau_nc_2010 = asarray(tau_nc_2010) - -tau_c_2010 = [[ [-3.43035,-6.5393,-16.786,-79.433], # N = 1, 1% - [-2.86154,-2.8903,-4.234,-40.040], # 5 % - [-2.56677,-1.5384,-2.809,0]], # 10 % - [ [-3.89644,-10.9519,-33.527,0], # N = 2 - [-3.33613,-6.1101,-6.823,0], - [-3.04445,-4.2412,-2.720,0]], - [ [-4.29374,-14.4354,-33.195,47.433], # N = 3 - [-3.74066,-8.5632,-10.852,27.982], - [-3.45218,-6.2143,-3.718,0]], - [ [-4.64332,-18.1031,-37.972,0], # N = 4 - [-4.09600,-11.2349,-11.175,0], - [-3.81020,-8.3931,-4.137,0]], - [ [-4.95756,-21.8883,-45.142,0], # N = 5 - [-4.41519,-14.0405,-12.575,0], - [-4.13157,-10.7417,-3.784,0]], - [ [-5.24568,-25.6688,-57.737,88.639], # N = 6 - [-4.70693,-16.9178,-17.492,60.007], - [-4.42501,-13.1875,-5.104,27.877]], - [ [-5.51233,-29.5760,-69.398,164.295],# N = 7 - [-4.97684,-19.9021,-22.045,110.761], - [-4.69648,-15.7315,-5.104,27.877]], - [ [-5.76202,-33.5258,-82.189,256.289], # N = 8 - [-5.22924,-23.0023,-24.646,144.479], - [-4.95007,-18.3959,-7.344,94.872]], - [ [-5.99742,-37.6572,-87.365,248.316],# N = 9 - [-5.46697,-26.2057,-26.627,176.382], - [-5.18897,-21.1377,-9.484,172.704]], - [ [-6.22103,-41.7154,-102.680,389.33],# N = 10 - [-5.69244,-29.4521,-30.994,251.016], - [-5.41533,-24.0006,-7.514,163.049]], - [ [-6.43377,-46.0084,-106.809,352.752],# N = 11 - [-5.90714,-32.8336,-30.275,249.994], - [-5.63086,-26.9693,-4.083,151.427]], - [ [-6.63790,-50.2095,-124.156,579.622],# N = 12 - [-6.11279,-36.2681,-32.505,314.802], - [-5.83724,-29.9864,-2.686,184.116]]] -tau_c_2010 = asarray(tau_c_2010) - -tau_ct_2010 = [[ [-3.95877,-9.0531,-28.428,-134.155], # N = 1 - [-3.41049,-4.3904,-9.036,-45.374], - [-3.12705,-2.5856,-3.925,-22.380]], - [ [-4.32762,-15.4387,-35.679,0], # N = 2 - [-3.78057,-9.5106,-12.074,0], - [-3.49631,-7.0815,-7.538,21.892]], - [ [-4.66305,-18.7688,-49.793,104.244], # N = 3 - [-4.11890,-11.8922,-19.031,77.332], - [-3.83511,-9.0723,-8.504,35.403]], - [ [-4.96940,-22.4694,-52.599,51.314], # N = 4 - [-4.42871,-14.5876,-18.228,39.647], - [-4.14633,-11.2500,-9.873,54.109]], - [ [-5.25276,-26.2183,-59.631,50.646], # N = 5 - [-4.71537,-17.3569,-22.660,91.359], - [-4.43422,-13.6078,-10.238,76.781]], - [ [-5.51727,-29.9760,-75.222,202.253], # N = 6 - [-4.98228,-20.3050,-25.224,132.03], - [-4.70233,-16.1253,-9.836,94.272]], - [ [-5.76537,-33.9165,-84.312,245.394], # N = 7 - [-5.23299,-23.3328,-28.955,182.342], - [-4.95405,-18.7352,-10.168,120.575]], - [ [-6.00003,-37.8892,-96.428,335.92], # N = 8 - [-5.46971,-26.4771,-31.034,220.165], - [-5.19183,-21.4328,-10.726,157.955]], - [ [-6.22288,-41.9496,-109.881,466.068], # N = 9 - [-5.69447,-29.7152,-33.784,273.002], - [-5.41738,-24.2882,-8.584,169.891]], - [ 
[-6.43551,-46.1151,-120.814,566.823], # N = 10 - [-5.90887,-33.0251,-37.208,346.189], - [-5.63255,-27.2042,-6.792,177.666]], - [ [-6.63894,-50.4287,-128.997,642.781], # N = 11 - [-6.11404,-36.4610,-36.246,348.554], - [-5.83850,-30.1995,-5.163,210.338]], - [ [-6.83488,-54.7119,-139.800,736.376], # N = 12 - [-6.31127,-39.9676,-37.021,406.051], - [-6.03650,-33.2381,-6.606,317.776]]] -tau_ct_2010 = asarray(tau_ct_2010) - -tau_ctt_2010 = [[ [-4.37113,-11.5882,-35.819,-334.047], # N = 1 - [-3.83239,-5.9057,-12.490,-118.284], - [-3.55326,-3.6596,-5.293,-63.559]], - [ [-4.69276,-20.2284,-64.919,88.884], # N =2 - [-4.15387,-13.3114,-28.402,72.741], - [-3.87346,-10.4637,-17.408,66.313]], - [ [-4.99071,-23.5873,-76.924,184.782], # N = 3 - [-4.45311,-15.7732,-32.316,122.705], - [-4.17280,-12.4909,-17.912,83.285]], - [ [-5.26780,-27.2836,-78.971,137.871], # N = 4 - [-4.73244,-18.4833,-31.875,111.817], - [-4.45268,-14.7199,-17.969,101.92]], - [ [-5.52826,-30.9051,-92.490,248.096], # N = 5 - [-4.99491,-21.2360,-37.685,194.208], - [-4.71587,-17.0820,-18.631,136.672]], - [ [-5.77379,-34.7010,-105.937,393.991], # N = 6 - [-5.24217,-24.2177,-39.153,232.528], - [-4.96397,-19.6064,-18.858,174.919]], - [ [-6.00609,-38.7383,-108.605,365.208], # N = 7 - [-5.47664,-27.3005,-39.498,246.918], - [-5.19921,-22.2617,-17.910,208.494]], - [ [-6.22758,-42.7154,-119.622,421.395], # N = 8 - [-5.69983,-30.4365,-44.300,345.48], - [-5.42320,-24.9686,-19.688,274.462]], - [ [-6.43933,-46.7581,-136.691,651.38], # N = 9 - [-5.91298,-33.7584,-42.686,346.629], - [-5.63704,-27.8965,-13.880,236.975]], - [ [-6.64235,-50.9783,-145.462,752.228], # N = 10 - [-6.11753,-37.056,-48.719,473.905], - [-5.84215,-30.8119,-14.938,316.006]], - [ [-6.83743,-55.2861,-152.651,792.577], # N = 11 - [-6.31396,-40.5507,-46.771,487.185], - [-6.03921,-33.8950,-9.122,285.164]], - [ [-7.02582,-59.6037,-166.368,989.879], # N = 12 - [-6.50353,-44.0797,-47.242,543.889], - [-6.22941,-36.9673,-10.868,418.414]]] -tau_ctt_2010 = asarray(tau_ctt_2010) - -def mackinnoncrit(N=1, regression ="c", nobs=inf): - """ - Returns the critical values for cointegrating and the ADF test. - - In 2010 MacKinnon updated the values of his 1994 paper with critical values - for the augmented Dickey-Fuller tests. These new values are to be - preferred and are used here. - - Parameters - ---------- - N : int - The number of series of I(1) series for which the null of - non-cointegration is being tested. For N > 12, the critical values - are linearly interpolated (not yet implemented). For the ADF test, - N = 1. - reg : str {'c', 'tc', 'ctt', 'nc'} - Following MacKinnon (1996), these stand for the type of regression run. - 'c' for constant and no trend, 'tc' for constant with a linear trend, - 'ctt' for constant with a linear and quadratic trend, and 'nc' for - no constant. The values for the no constant case are taken from the - 1996 paper, as they were not updated for 2010 due to the unrealistic - assumptions that would underlie such a case. - nobs : int or np.inf - This is the sample size. If the sample size is numpy.inf, then the - asymptotic critical values are returned. - - References - ---------- - MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for - Unit-Root and Cointegration Tests." Journal of Business & Economics - Statistics, 12.2, 167-76. - MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." - Queen's University, Dept of Economics Working Papers 1227. 
- http://ideas.repec.org/p/qed/wpaper/1227.html - """ - reg = regression - if reg not in ['c','ct','nc','ctt']: - raise ValueError("regression keyword %s not understood") % reg - if nobs is inf: - return eval("tau_"+reg+"_2010["+str(N-1)+",:,0]") - else: - return polyval(eval("tau_"+reg+"_2010["+str(N-1)+",:,::-1].T"),1./nobs) - -if __name__=="__main__": - pass diff --git a/statsmodels/tsa/cointegration.py b/statsmodels/tsa/cointegration.py new file mode 100644 index 00000000000..bb36f294831 --- /dev/null +++ b/statsmodels/tsa/cointegration.py @@ -0,0 +1,70 @@ +""" +Estimation and testing of cointegrated time series +""" +from __future__ import division + +import numpy as np + +from statsmodels.regression.linear_model import OLS +from statsmodels.tools.tools import add_constant +from statsmodels.tsa.unitroot import mackinnonp, mackinnoncrit + + +def coint(y1, y2, regression="c"): + """ + This is a simple cointegration test. Uses unit-root test on residuals to + test for cointegrated relationship + + See Hamilton (1994) 19.2 + + Parameters + ---------- + y1 : array_like, 1d + first element in cointegrating vector + y2 : array_like + remaining elements in cointegrating vector + c : str {'c'} + Included in regression + * 'c' : Constant + + Returns + ------- + coint_t : float + t-statistic of unit-root test on residuals + pvalue : float + MacKinnon's approximate p-value based on MacKinnon (1994) + crit_value : dict + Critical values for the test statistic at the 1 %, 5 %, and 10 % + levels. + + Notes + ----- + The Null hypothesis is that there is no cointegration, the alternative + hypothesis is that there is cointegrating relationship. If the pvalue is + small, below a critical size, then we can reject the hypothesis that there + is no cointegrating relationship. + + P-values are obtained through regression surface approximation from + MacKinnon 1994. + + References + ---------- + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. `Journal of Business and Economic + Statistics` 12, 167-76. + + """ + regression = regression.lower() + if regression not in ['c', 'nc', 'ct', 'ctt']: + raise ValueError("regression option %s not understood") % regression + y1 = np.asarray(y1) + y2 = np.asarray(y2) + if regression == 'c': + y2 = add_constant(y2, prepend=False) + st1_resid = OLS(y1, y2).fit().resid # stage one residuals + lgresid_cons = add_constant(st1_resid[0:-1], prepend=False) + uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit() + coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0] + pvalue = mackinnonp(coint_t, regression="c", num_unit_roots=2) + crit_value = mackinnoncrit(num_unit_roots=1, regression="c", nobs=len(y1)) + return coint_t, pvalue, crit_value diff --git a/statsmodels/tsa/critical_values/__init__.py b/statsmodels/tsa/critical_values/__init__.py new file mode 100644 index 00000000000..0afb2859844 --- /dev/null +++ b/statsmodels/tsa/critical_values/__init__.py @@ -0,0 +1 @@ +__author__ = 'kevin.sheppard' diff --git a/statsmodels/tsa/critical_values/dfgls.py b/statsmodels/tsa/critical_values/dfgls.py new file mode 100644 index 00000000000..4390ba7a012 --- /dev/null +++ b/statsmodels/tsa/critical_values/dfgls.py @@ -0,0 +1,34 @@ +""" +Contains values used to approximate the critical value and +p-value from DFGLS statistics + +These have been computed using the methodology of MacKinnon (1994) and (2010). +simulation. 
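Roughly, dfgls_cv_approx stores, for each deterministic specification ('c' or 'ct'),
the coefficients of a cubic in the reciprocal of the effective sample size whose
value approximates the 1%, 5% and 10% critical values. dfgls_small_p and
dfgls_large_p store polynomial coefficients that map the test statistic to a
p-value through the standard normal CDF; dfgls_tau_star marks the cross-over
between the small-p and large-p fits, and dfgls_tau_min / dfgls_tau_max record
the limits of the region covered by those fits.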
+""" + +from numpy import array + +dfgls_cv_approx = {'c': array([[-2.56781793e+00, -2.05575392e+01, 1.82727674e+02, + -1.77866664e+03], + [-1.94363325e+00, -2.17272746e+01, 2.60815068e+02, + -2.26914916e+03], + [-1.61998241e+00, -2.32734708e+01, 3.06474378e+02, + -2.57483557e+03]]), + 'ct': array([[-3.40689134, -21.69971242, 27.26295939, -816.84404772], + [-2.84677178, -19.69109364, 84.7664136, -799.40722401], + [-2.55890707, -19.42621991, 116.53759752, -840.31342847]])} + +dfgls_tau_max = {'c': 13.365361509140614, + 'ct': 8.73743383728356} + +dfgls_tau_min = {'c': -17.561302895074206, + 'ct': -13.681153542634465} + +dfgls_tau_star = {'c': -0.4795076091714674, + 'ct': -2.1960404365401298} + +dfgls_large_p = {'c': array([0.50612497, 0.98305664, -0.05648525, 0.00140875]), + 'ct': array([2.60561421, 1.67850224, 0.0373599, -0.01017936])} + +dfgls_small_p = {'c': array([0.67422739, 1.25475826, 0.03572509]), + 'ct': array([2.38767685, 1.57454737, 0.05754439])} \ No newline at end of file diff --git a/statsmodels/tsa/critical_values/dickey_fuller.py b/statsmodels/tsa/critical_values/dickey_fuller.py new file mode 100644 index 00000000000..ddd4473aced --- /dev/null +++ b/statsmodels/tsa/critical_values/dickey_fuller.py @@ -0,0 +1,265 @@ +""" +Contains values used to approximate the critical value and +p-value from statistics that follow the Dickey-Fuller distribution. + +Most values are from MacKinnon (1994) or (2010). A small number of these +did not appear in the original paper and have been computed using an identical +simulation. +""" +from numpy import asarray, inf + +small_scaling = asarray([1, 1, 1e-2]) +tau_small_p = {} +tau_large_p = {} + +tau_small_p['nc'] = [[0.6344, 1.2378, 3.2496], + [1.9129, 1.3857, 3.5322], + [2.7648, 1.4502, 3.4186], + [3.4336, 1.4835, 3.1900], + [4.0999, 1.5533, 3.5900], + [4.5388, 1.5344, 2.9807]] +tau_small_p['nc'] = asarray(tau_small_p['nc']) * small_scaling + +tau_small_p['c'] = [[2.1659, 1.4412, 3.8269], + [2.9200, 1.5012, 3.9796], + [3.4699, 1.4856, 3.1640], + [3.9673, 1.4777, 2.6315], + [4.5509, 1.5338, 2.9545], + [5.1399, 1.6036, 3.4445]] +tau_small_p['c'] = asarray(tau_small_p['c']) * small_scaling + +tau_small_p['ct'] = [[3.2512, 1.6047, 4.9588], + [3.6646, 1.5419, 3.6448], + [4.0983, 1.5173, 2.9898], + [4.5844, 1.5338, 2.8796], + [5.0722, 1.5634, 2.9472], + [5.5300, 1.5914, 3.0392]] +tau_small_p['ct'] = asarray(tau_small_p['ct']) * small_scaling + +tau_small_p['ctt'] = [[4.0003, 1.6580, 4.8288], + [4.3534, 1.6016, 3.7947], + [4.7343, 1.5768, 3.2396], + [5.2140, 1.6077, 3.3449], + [5.6481, 1.6274, 3.3455], + [5.9296, 1.5929, 2.8223]] +tau_small_p['ctt'] = asarray(tau_small_p['ctt']) * small_scaling + +large_scaling = asarray([1, 1e-1, 1e-1, 1e-2]) +tau_large_p['nc'] = [[0.4797, 9.3557, -0.6999, 3.3066], + [1.5578, 8.5580, -2.0830, -3.3549], + [2.2268, 6.8093, -3.2362, -5.4448], + [2.7654, 6.4502, -3.0811, -4.4946], + [3.2684, 6.8051, -2.6778, -3.4972], + [3.7268, 7.1670, -2.3648, -2.8288]] +tau_large_p['nc'] = asarray(tau_large_p['nc']) * large_scaling + +tau_large_p['c'] = [[1.7339, 9.3202, -1.2745, -1.0368], + [2.1945, 6.4695, -2.9198, -4.2377], + [2.5893, 4.5168, -3.6529, -5.0074], + [3.0387, 4.5452, -3.3666, -4.1921], + [3.5049, 5.2098, -2.9158, -3.3468], + [3.9489, 5.8933, -2.5359, -2.721]] +tau_large_p['c'] = asarray(tau_large_p['c']) * large_scaling + +tau_large_p['ct'] = [[2.5261, 6.1654, -3.7956, -6.0285], + [2.8500, 5.2720, -3.6622, -5.1695], + [3.2210, 5.2550, -3.2685, -4.1501], + [3.6520, 5.9758, -2.7483, -3.2081], + [4.0712, 6.6428, 
-2.3464, -2.5460], + [4.4735, 7.1757, -2.0681, -2.1196]] +tau_large_p['ct'] = asarray(tau_large_p['ct']) * large_scaling + +tau_large_p['ctt'] = [[3.0778, 4.9529, -4.1477, -5.9359], + [3.4713, 5.9670, -3.2507, -4.2286], + [3.8637, 6.7852, -2.6286, -3.1381], + [4.2736, 7.6199, -2.1534, -2.4026], + [4.6679, 8.2618, -1.8220, -1.9147], + [5.0009, 8.3735, -1.6994, -1.6928]] +tau_large_p['ctt'] = asarray(tau_large_p['ctt']) * large_scaling + +# These are the new estimates from MacKinnon 2010 +# the first axis is N -1 +# the second axis is 1 %, 5 %, 10 % +# the last axis is the coefficients + +tau_2010 = {} + +tau_2010['nc'] = [[[-2.56574, -2.2358, -3.627, 0], # N = 1 + [-1.94100, -0.2686, -3.365, 31.223], + [-1.61682, 0.2656, -2.714, 25.364]]] +tau_2010['nc'] = asarray(tau_2010['nc']) + +tau_2010['c'] = [[[-3.43035, -6.5393, -16.786, -79.433], # N = 1, 1% + [-2.86154, -2.8903, -4.234, -40.040], # 5 % + [-2.56677, -1.5384, -2.809, 0]], # 10 % + [[-3.89644, -10.9519, -33.527, 0], # N = 2 + [-3.33613, -6.1101, -6.823, 0], + [-3.04445, -4.2412, -2.720, 0]], + [[-4.29374, -14.4354, -33.195, 47.433], # N = 3 + [-3.74066, -8.5632, -10.852, 27.982], + [-3.45218, -6.2143, -3.718, 0]], + [[-4.64332, -18.1031, -37.972, 0], # N = 4 + [-4.09600, -11.2349, -11.175, 0], + [-3.81020, -8.3931, -4.137, 0]], + [[-4.95756, -21.8883, -45.142, 0], # N = 5 + [-4.41519, -14.0405, -12.575, 0], + [-4.13157, -10.7417, -3.784, 0]], + [[-5.24568, -25.6688, -57.737, 88.639], # N = 6 + [-4.70693, -16.9178, -17.492, 60.007], + [-4.42501, -13.1875, -5.104, 27.877]], + [[-5.51233, -29.5760, -69.398, 164.295], # N = 7 + [-4.97684, -19.9021, -22.045, 110.761], + [-4.69648, -15.7315, -5.104, 27.877]], + [[-5.76202, -33.5258, -82.189, 256.289], # N = 8 + [-5.22924, -23.0023, -24.646, 144.479], + [-4.95007, -18.3959, -7.344, 94.872]], + [[-5.99742, -37.6572, -87.365, 248.316], # N = 9 + [-5.46697, -26.2057, -26.627, 176.382], + [-5.18897, -21.1377, -9.484, 172.704]], + [[-6.22103, -41.7154, -102.680, 389.33], # N = 10 + [-5.69244, -29.4521, -30.994, 251.016], + [-5.41533, -24.0006, -7.514, 163.049]], + [[-6.43377, -46.0084, -106.809, 352.752], # N = 11 + [-5.90714, -32.8336, -30.275, 249.994], + [-5.63086, -26.9693, -4.083, 151.427]], + [[-6.63790, -50.2095, -124.156, 579.622], # N = 12 + [-6.11279, -36.2681, -32.505, 314.802], + [-5.83724, -29.9864, -2.686, 184.116]]] +tau_2010['c'] = asarray(tau_2010['c']) + +tau_2010['ct'] = [[[-3.95877, -9.0531, -28.428, -134.155], # N = 1 + [-3.41049, -4.3904, -9.036, -45.374], + [-3.12705, -2.5856, -3.925, -22.380]], + [[-4.32762, -15.4387, -35.679, 0], # N = 2 + [-3.78057, -9.5106, -12.074, 0], + [-3.49631, -7.0815, -7.538, 21.892]], + [[-4.66305, -18.7688, -49.793, 104.244], # N = 3 + [-4.11890, -11.8922, -19.031, 77.332], + [-3.83511, -9.0723, -8.504, 35.403]], + [[-4.96940, -22.4694, -52.599, 51.314], # N = 4 + [-4.42871, -14.5876, -18.228, 39.647], + [-4.14633, -11.2500, -9.873, 54.109]], + [[-5.25276, -26.2183, -59.631, 50.646], # N = 5 + [-4.71537, -17.3569, -22.660, 91.359], + [-4.43422, -13.6078, -10.238, 76.781]], + [[-5.51727, -29.9760, -75.222, 202.253], # N = 6 + [-4.98228, -20.3050, -25.224, 132.03], + [-4.70233, -16.1253, -9.836, 94.272]], + [[-5.76537, -33.9165, -84.312, 245.394], # N = 7 + [-5.23299, -23.3328, -28.955, 182.342], + [-4.95405, -18.7352, -10.168, 120.575]], + [[-6.00003, -37.8892, -96.428, 335.92], # N = 8 + [-5.46971, -26.4771, -31.034, 220.165], + [-5.19183, -21.4328, -10.726, 157.955]], + [[-6.22288, -41.9496, -109.881, 466.068], # N = 9 + [-5.69447, -29.7152, 
-33.784, 273.002], + [-5.41738, -24.2882, -8.584, 169.891]], + [[-6.43551, -46.1151, -120.814, 566.823], # N = 10 + [-5.90887, -33.0251, -37.208, 346.189], + [-5.63255, -27.2042, -6.792, 177.666]], + [[-6.63894, -50.4287, -128.997, 642.781], # N = 11 + [-6.11404, -36.4610, -36.246, 348.554], + [-5.83850, -30.1995, -5.163, 210.338]], + [[-6.83488, -54.7119, -139.800, 736.376], # N = 12 + [-6.31127, -39.9676, -37.021, 406.051], + [-6.03650, -33.2381, -6.606, 317.776]]] +tau_2010['ct'] = asarray(tau_2010['ct']) + +tau_2010['ctt'] = [[[-4.37113, -11.5882, -35.819, -334.047], # N = 1 + [-3.83239, -5.9057, -12.490, -118.284], + [-3.55326, -3.6596, -5.293, -63.559]], + [[-4.69276, -20.2284, -64.919, 88.884], # N =2 + [-4.15387, -13.3114, -28.402, 72.741], + [-3.87346, -10.4637, -17.408, 66.313]], + [[-4.99071, -23.5873, -76.924, 184.782], # N = 3 + [-4.45311, -15.7732, -32.316, 122.705], + [-4.17280, -12.4909, -17.912, 83.285]], + [[-5.26780, -27.2836, -78.971, 137.871], # N = 4 + [-4.73244, -18.4833, -31.875, 111.817], + [-4.45268, -14.7199, -17.969, 101.92]], + [[-5.52826, -30.9051, -92.490, 248.096], # N = 5 + [-4.99491, -21.2360, -37.685, 194.208], + [-4.71587, -17.0820, -18.631, 136.672]], + [[-5.77379, -34.7010, -105.937, 393.991], # N = 6 + [-5.24217, -24.2177, -39.153, 232.528], + [-4.96397, -19.6064, -18.858, 174.919]], + [[-6.00609, -38.7383, -108.605, 365.208], # N = 7 + [-5.47664, -27.3005, -39.498, 246.918], + [-5.19921, -22.2617, -17.910, 208.494]], + [[-6.22758, -42.7154, -119.622, 421.395], # N = 8 + [-5.69983, -30.4365, -44.300, 345.48], + [-5.42320, -24.9686, -19.688, 274.462]], + [[-6.43933, -46.7581, -136.691, 651.38], # N = 9 + [-5.91298, -33.7584, -42.686, 346.629], + [-5.63704, -27.8965, -13.880, 236.975]], + [[-6.64235, -50.9783, -145.462, 752.228], # N = 10 + [-6.11753, -37.056, -48.719, 473.905], + [-5.84215, -30.8119, -14.938, 316.006]], + [[-6.83743, -55.2861, -152.651, 792.577], # N = 11 + [-6.31396, -40.5507, -46.771, 487.185], + [-6.03921, -33.8950, -9.122, 285.164]], + [[-7.02582, -59.6037, -166.368, 989.879], # N = 12 + [-6.50353, -44.0797, -47.242, 543.889], + [-6.22941, -36.9673, -10.868, 418.414]]] +tau_2010['ctt'] = asarray(tau_2010['ctt']) + +# These are the cut-off values for the left-tail vs. the rest of the +# tau distribution, for getting the p-values +tau_star = {'nc': [-1.04, -1.53, -2.68, -3.09, -3.07, -3.77], + 'c': [-1.61, -2.62, -3.13, -3.47, -3.78, -3.93], + 'ct': [-2.89, -3.19, -3.50, -3.65, -3.80, -4.36], + 'ctt': [-3.21, -3.51, -3.81, -3.83, -4.12, -4.63]} + +tau_min = {'nc': [-19.04, -19.62, -21.21, -23.25, -21.63, -25.74], + 'c': [-18.83, -18.86, -23.48, -28.07, -25.96, -23.27], + 'ct': [-16.18, -21.15, -25.37, -26.63, -26.53, -26.18], + 'ctt': [-17.17, -21.1, -24.33, -24.03, -24.33, -28.22]} + +tau_max = {'nc': [inf, 1.51, 0.86, 0.88, 1.05, 1.24], + 'c': [2.74, 0.92, 0.55, 0.61, 0.79, 1], + 'ct': [0.7, 0.63, 0.71, 0.93, 1.19, 1.42], + 'ctt': [0.54, 0.79, 1.08, 1.43, 3.49, 1.92]} + +# Z values from MacKinnon(1994), tables 5 and 6 +adf_z_max = {'c': inf, + 'ct': inf, + 'ctt': inf} + +adf_z_min = {'c': -22.03, + 'ct': -32.85, + 'ctt': -41.18} + +adf_z_star = {'c': -7.96, 'ct': -13.46, 'ctt': -16.27} + +adf_z_small_p = {'nc': [.0342, -.6376, 0, -.03872], + 'c': [2.2142, -1.7863, 0.3283, -0.07727], + 'ct': [4.6476, -2.8932, 0.5832, -0.09990], + 'ctt': [4.4599, -1.8635, 0.2126, -0.06070]} +# These are Table 6 from MacKinnon (1994). +# These are well-behaved in the right tail. 
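# A minimal sketch of how the tau_* tables above are consumed, mirroring the
# removed mackinnonp in adfvalues.py earlier in this diff; the helper name
# _tau_pvalue is illustrative only and not part of the patch:
from numpy import polyval
from scipy.stats import norm

def _tau_pvalue(stat, trend='c', num_unit_roots=1):
    idx = num_unit_roots - 1
    if stat > tau_max[trend][idx]:   # beyond the tabulated right tail
        return 1.0
    if stat < tau_min[trend][idx]:   # beyond the tabulated left tail
        return 0.0
    # the left tail uses the small-p fit, the remainder the large-p fit
    if stat <= tau_star[trend][idx]:
        coef = tau_small_p[trend][idx]
    else:
        coef = tau_large_p[trend][idx]
    return norm.cdf(polyval(coef[::-1], stat))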
+# the approximation function is +# p = norm.cdf(d_0 + d_1 * z + d_2*z**2 + d_3*z**3 + d_4*z**4) +adf_z_large_p = {'nc': [0.4927, 6.906, 13.2331, 12.099, 0], + 'c': [1.7157, 0.5536, 4.5518, 2.2466, 4.2537], + 'ct': [2.7119, 0.4594, 2.3747, 0.7488, 0.9333], + 'ctt': [3.4216, 0.4170, 1.6939, 0.4203, 0.4153]} + +adf_z_large_p_scale = asarray([1.0, 1.0, 1e-2, 1e-3, 1e-5]) +for k, v in adf_z_large_p.iteritems(): + adf_z_large_p[k] = asarray(adf_z_large_p[k]) * adf_z_large_p_scale + +adf_z_cv_approx = { + 'ctt': asarray([[-36.57197198, 351.1444586, -2377.68403687, 8040.27872297], + [-28.11093834, 213.04812463, -1271.25554199, 4487.9988725], + [-24.18276878, 158.37799447, -806.81475966, + 2173.15644788]]), + 'c': asarray([[-20.62490333, 118.99498032, -536.04374012, 1429.81685652], + [-14.0928388, 57.82357334, -185.65594822, 279.8800972], + [-11.25010719, 38.15946541, -109.52148583, 206.38656327]]), + 'nc': asarray( + [[-13.2887112, 3.22727093e+02, -8.29915035e+03, 7.96961213e+04], + [-7.81935288e+00, 2.17103351e+02, -5.53972290e+03, 5.24347804e+04], + [-5.56707483e+00, 1.66592983e+02, -4.18616134e+03, 3.91896116e+04]]), + 'ct': asarray([[-29.33684958, 230.67000702, -1301.64844712, 0.], + [-21.70521047, 128.65518728, -508.58212394, 0.], + [-18.24843409, 93.18181231, -355.29469296, 0.]])} diff --git a/statsmodels/tsa/critical_values/kpss.py b/statsmodels/tsa/critical_values/kpss.py new file mode 100644 index 00000000000..0111797f22f --- /dev/null +++ b/statsmodels/tsa/critical_values/kpss.py @@ -0,0 +1,40 @@ +from numpy import asarray + +kpss_critical_values = {} +kpss_critical_values['c'] = ((100.0, 0.0000), (99.5, 0.0219), (99.0, 0.0249), + (98.0, 0.0289), (97.0, 0.0319), (96.0, 0.0344), + (94.5, 0.0377), (93.0, 0.0407), (89.5, 0.0470), + (86.0, 0.0527), (74.0, 0.0720), (70.5, 0.0778), + (67.0, 0.0839), (64.0, 0.0894), (61.5, 0.0941), + (58.5, 0.1001), (56.0, 0.1053), (53.0, 0.1119), + (50.0, 0.1190), (47.0, 0.1266), (44.5, 0.1333), + (42.0, 0.1406), (39.5, 0.1484), (37.0, 0.1568), + (34.5, 0.1659), (32.0, 0.1758), (30.0, 0.1845), + (28.0, 0.1938), (26.0, 0.2041), (24.0, 0.2152), + (22.5, 0.2244), (21.0, 0.2343), (19.5, 0.2451), + (18.0, 0.2569), (16.5, 0.2699), (15.0, 0.2842), + (14.0, 0.2948), (12.0, 0.3187), (11.0, 0.3324), + (10.0, 0.3475), (8.5, 0.3737), (7.0, 0.4054), + (6.0, 0.4309), (5.0, 0.4614), (4.0, 0.4993), + (3.5, 0.5222), (2.5, 0.5806), (2.0, 0.6197), + (1.5, 0.6706), (1.0, 0.7430), (0.7, 0.8075), + (0.4, 0.9097), (0.2, 1.0375), (0.1, 1.1661)) +kpss_critical_values['c'] = asarray(kpss_critical_values['c']) +kpss_critical_values['ct'] = ((100.0, 0.0000), (99.5, 0.0156), (99.0, 0.0173), + (98.5, 0.0185), (97.5, 0.0203), (96.0, 0.0224), + (94.5, 0.0240), (93.0, 0.0254), (91.0, 0.0272), + (89.0, 0.0287), (86.5, 0.0306), (81.5, 0.0340), + (68.5, 0.0424), (62.5, 0.0464), (59.5, 0.0485), + (56.5, 0.0507), (53.5, 0.0529), (51.0, 0.0548), + (48.0, 0.0573), (45.0, 0.0598), (42.5, 0.0621), + (40.0, 0.0645), (37.5, 0.0670), (35.0, 0.0697), + (33.0, 0.0720), (31.0, 0.0744), (29.0, 0.0770), + (27.5, 0.0791), (25.5, 0.0820), (24.0, 0.0844), + (22.5, 0.0869), (21.0, 0.0897), (18.5, 0.0946), + (16.0, 0.1004), (14.0, 0.1057), (12.0, 0.1119), + (10.0, 0.1193), (8.5, 0.1259), (7.0, 0.1339), + (6.0, 0.1403), (5.0, 0.1479), (4.0, 0.1573), + (3.5, 0.1630), (2.5, 0.1775), (2.0, 0.1871), + (1.5, 0.1997), (1.0, 0.2177), (0.7, 0.2335), + (0.4, 0.2587), (0.2, 0.2904), (0.1, 0.3225)) +kpss_critical_values['ct'] = asarray(kpss_critical_values['ct']) \ No newline at end of file diff --git 
a/statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py b/statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py new file mode 100644 index 00000000000..bd2171c6a72 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py @@ -0,0 +1,51 @@ +import numpy as np +from scipy.stats import norm + +from statsmodels.regression.linear_model import OLS, WLS + + +trends = ('nc', 'c', 'ct', 'ctt') +critical_values = (1.0, 5.0, 10.0) +adf_z_cv_approx = {} +for t in trends: + print t + data = np.load('adf_z_' + t + '.npz') + percentiles = data['percentiles'] + trend = data['trend'] + results = data['results'] + # T = data['T'] + data.close() + + # Remove later + T = np.array( + (20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 180, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, + 1000, 1200, 1400, 2000)) + T = T[::-1] + + # For percentiles 1, 5 and 10, regress on a constant, and powers of 1/T + out = [] + for cv in critical_values: + num_ex = results.shape[2] + loc = np.where(percentiles == cv)[0][0] + lhs = np.squeeze(results[loc, :, :]) + # Adjust for effective sample size, this is what lookup the code uses + tau = np.ones((num_ex, 1)).dot(T[None, :]) - 1.0 + tau = tau.T + lhs = lhs.ravel() + tau = tau.ravel() + tau = tau[:, None] + n = lhs.shape[0] + rhs = np.ones((n, 1)) + rhs = np.hstack((rhs, 1.0 / tau)) + rhs = np.hstack((rhs, (1.0 / tau) ** 2.0)) + rhs = np.hstack((rhs, (1.0 / tau) ** 3.0)) + res = OLS(lhs, rhs).fit() + res.params[np.abs(res.tvalues) < 1.96] = 0.0 + out.append(res.params) + + adf_z_cv_approx[t] = np.array(out) + +print 'from numpy import array' +print '' +print 'adf_z_cv_approx = ' + str(adf_z_cv_approx) diff --git a/statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py b/statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py new file mode 100644 index 00000000000..54f5b06bae7 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py @@ -0,0 +1,116 @@ +from __future__ import division +import datetime + +from numpy import ones, vstack, arange, diff, cumsum, sqrt, sum +from numpy.linalg import pinv +import numpy as np + +from statsmodels.tools.parallel import parallel_func + +# Controls memory use, in MiB +MAX_MEMORY_SIZE = 100 +NUM_JOBS = 3 + + +def wrapper(n, deterministic, b, seed=0): + """ + Wraps and blocks the main simulation so that the maximum amount of memory + can be controlled on multi processor systems when executing in parallel + """ + rng = np.random.RandomState() + rng.seed(seed) + remaining = b + res = np.zeros(b) + finished = 0 + block_size = int(2 ** 20.0 * MAX_MEMORY_SIZE / (8.0 * n)) + for j in xrange(0, b, block_size): + if block_size < remaining: + count = block_size + else: + count = remaining + st = finished + en = finished + count + res[st:en] = dfgsl_simulation(n, deterministic, count, rng) + finished += count + remaining -= count + + return res + + +def dfgsl_simulation(n, deterministic, b, rng=None): + """ + Simulates the empirical distribution of the DFGLS test statistic + """ + if rng is None: + np.random.seed(0) + from numpy.random import standard_normal + else: + standard_normal = rng.standard_normal + + nobs = n + if deterministic == 'c': + c = -7.0 + z = ones((nobs, 1)) + else: + c = -13.5 + z = vstack((ones(nobs), arange(1, nobs + 1))).T + + ct = c / nobs + + delta_z = np.copy(z) + delta_z[1:, :] = delta_z[1:, :] - (1 + ct) * delta_z[:-1, :] + delta_z_inv = pinv(delta_z) 
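    # GLS detrending as in Elliott, Rothenberg and Stock (1996): the first
    # observation is kept as is and the remaining observations of both the
    # deterministic terms and the simulated series are quasi-differenced with
    # rho_bar = 1 + c_bar/T (c_bar = -7.0 with a constant, -13.5 with a linear
    # trend); the quasi-differenced series is then regressed on the
    # quasi-differenced deterministics and the fitted deterministic component
    # is removed from the level of the series before the Dickey-Fuller
    # regression below.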
+ y = standard_normal((n + 50, b)) + y = cumsum(y, axis=0) + y = y[50:, :] + delta_y = y.copy() + delta_y[1:, :] = delta_y[1:, :] - (1 + ct) * delta_y[:-1, :] + detrend_coef = delta_z_inv.dot(delta_y) + y_detrended = y - z.dot(detrend_coef) + + delta_y_detrended = diff(y_detrended, axis=0) + rhs = y_detrended[:-1, :] + lhs = delta_y_detrended + + xpy = sum(rhs * lhs, 0) + xpx = sum(rhs ** 2.0, 0) + gamma = xpy / xpx + e = lhs - rhs * gamma + sigma2 = sum(e ** 2.0, axis=0) / (n - 1) # DOF correction? + gamma_var = sigma2 / xpx + + stat = gamma / sqrt(gamma_var) + return stat + + +if __name__ == '__main__': + deterministics = ('c', 'ct') + ex_num = 500 + ex_size = 200000 + T = np.array( + (20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 180, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, + 1000, 1200, 1400, 2000)) + T = T[::-1] + percentiles = list(np.arange(0.5, 100.0, 0.5)) + seeds = np.arange(0, 2 ** 32, step=2 ** 23) + for d in deterministics: + results = np.zeros((len(percentiles), len(T), ex_num)) + + for i in xrange(ex_num): + print "Experiment Number {0}".format(i + 1) + now = datetime.datetime.now() + parallel, p_func, n_jobs = parallel_func(wrapper, + n_jobs=NUM_JOBS, + verbose=2) + out = parallel(p_func(t, d, ex_size, seed=seeds[i]) for t in T) + q = lambda x: np.percentile(x, percentiles) + quantiles = map(q, out) + results[:, :, i] = np.array(quantiles).T + print datetime.datetime.now() - now + + np.savez('dfgls_' + d + '.npz', + deterministic=d, + results=results, + percentiles=percentiles, + T=T) \ No newline at end of file diff --git a/statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py b/statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py new file mode 100644 index 00000000000..827ca204632 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py @@ -0,0 +1,118 @@ +import numpy as np +from scipy.stats import norm + +from statsmodels.regression.linear_model import OLS, WLS + + +trends = ('c', 'ct') +critical_values = (1.0, 5.0, 10.0) +dfgls_cv_approx = {} +for t in trends: + print t + data = np.load('dfgls_' + t + '.npz') + percentiles = data['percentiles'] + deterministic = data['deterministic'] + results = data['results'] + # T = data['T'] + data.close() + + # Remove later + T = np.array( + (20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 180, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, + 1000, 1200, 1400, 2000)) + T = T[::-1] + + # For percentiles 1, 5 and 10, regress on a constant, and powers of 1/T + out = [] + for cv in critical_values: + num_ex = results.shape[2] + loc = np.where(percentiles == cv)[0][0] + lhs = np.squeeze(results[loc, :, :]) + # Adjust for effective sample size, this is what lookup the code uses + tau = np.ones((num_ex, 1)).dot(T[None, :]) - 1.0 + tau = tau.T + lhs = lhs.ravel() + tau = tau.ravel() + tau = tau[:, None] + n = lhs.shape[0] + rhs = np.ones((n, 1)) + rhs = np.hstack((rhs, 1.0 / tau)) + rhs = np.hstack((rhs, (1.0 / tau) ** 2.0)) + rhs = np.hstack((rhs, (1.0 / tau) ** 3.0)) + res = OLS(lhs, rhs).fit() + out.append(res.params) + + dfgls_cv_approx[t] = np.array(out) + +trends = ('c', 'ct') +dfgls_large_p = {} +dfgls_small_p = {} +dfgls_tau_star = {} +dfgls_tau_max = {} +dfgls_tau_min = {} +for t in trends: + data = np.load('dfgls_' + t + '.npz') + percentiles = data['percentiles'] + results = data['results'] # Remove later + # LHS is norm cdf inv of percentiles + lhs = norm().ppf(percentiles / 100.0) + lhs_large = lhs 
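    # Response-surface fit in the spirit of MacKinnon (1994): the simulated
    # quantiles give (statistic, probability) pairs, so regressing
    # norm.ppf(probability) on powers of the average simulated statistic,
    # weighted by its simulation standard deviation, yields the coefficients
    # stored as dfgls_large_p / dfgls_small_p; a p-value can later be
    # recovered as norm.cdf(polyval(coef[::-1], stat)).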
+ # RHS is made up of avg test stats for largest T, which is in pos 1 + avg_test_stats = results[:, 1, :].mean(axis=1) + avg_test_std = results[:, 1, :].std(axis=1) + avg_test_stats = avg_test_stats[:, None] + m = lhs.shape[0] + rhs = np.ones((m, 1)) + rhs = np.hstack((rhs, avg_test_stats)) + rhs = np.hstack((rhs, avg_test_stats ** 2.0)) + rhs = np.hstack((rhs, avg_test_stats ** 3.0)) + rhs_large = rhs + res_large = WLS(lhs, rhs, weights=1.0 / avg_test_std).fit() + dfgls_large_p[t] = res_large.params + # Compute tau_max, by finding the func maximum + p = res_large.params + poly_roots = np.roots(np.array([3, 2, 1.0]) * p[:0:-1]) + dfgls_tau_max[t] = float(np.squeeze(np.real(np.max(poly_roots)))) + + # Small p regression using only p<=15% + cutoff = np.where(percentiles <= 15.0)[0] + avg_test_stats = results[cutoff, 1, :].mean(axis=1) + avg_test_std = results[cutoff, 1, :].std(axis=1) + avg_test_stats = avg_test_stats[:, None] + lhs = lhs[cutoff] + m = lhs.shape[0] + rhs = np.ones((m, 1)) + rhs = np.hstack((rhs, avg_test_stats)) + rhs = np.hstack((rhs, avg_test_stats ** 2.0)) + res_small = WLS(lhs, rhs, weights=1.0 / avg_test_std).fit() + dfgls_small_p[t] = res_small.params + + # Compute tau star + err_large = res_large.resid + # Missing 1 parameter here, replace with 0 + params = np.append(res_small.params, 0.0) + err_small = lhs_large - rhs_large.dot(params) + # Find the location that minimizes the total absolute error + m = lhs_large.shape[0] + abs_err = np.zeros((m, 1)) + for i in xrange(m): + abs_err[i] = np.abs(err_large[i:]).sum() + np.abs(err_small[:i]).sum() + loc = np.argmin(abs_err) + dfgls_tau_star[t] = rhs_large[loc, 1] + # Compute tau min + dfgls_tau_min[t] = -params[1] / (2 * params[2]) + +print 'from numpy import array' +print '' +print 'dfgls_cv_approx = ' + str(dfgls_cv_approx) +print '' +print 'dfgls_tau_max = ' + str(dfgls_tau_max) +print '' +print 'dfgls_tau_min = ' + str(dfgls_tau_min) +print '' +print 'dfgls_tau_star = ' + str(dfgls_tau_star) +print '' +print 'dfgls_large_p = ' + str(dfgls_large_p) +print '' +print 'dfgls_small_p = ' + str(dfgls_small_p) diff --git a/statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py b/statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py new file mode 100644 index 00000000000..f97e31ddd69 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py @@ -0,0 +1,98 @@ +""" +Calculates quantiles of the KPSS test statistic for both the constant +and constant plus trend scenarios. 
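The statistic simulated below is computed under the null with i.i.d. standard
normal data: the deterministic component ('c' or 'ct') is removed by least
squares, the residuals are accumulated into partial sums S_t, and the statistic
is sum_t S_t**2 / (nobs**2 * lambda_hat), where lambda_hat is the mean squared
residual.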
+""" +from __future__ import division +import os +from numpy.random import RandomState +import numpy as np +import pandas as pd +from statsmodels.tsa.tsatools import add_trend + + +def simulate_kpss(nobs, B, deterministic='c', rng=None): + if rng is None: + rng = RandomState() + rng.seed(0) + + standard_normal = rng.standard_normal + + e = standard_normal((nobs, B)) + z = np.ones((nobs, 1)) + if deterministic == 'ct': + z = add_trend(z, trend='t') + zinv = np.linalg.pinv(z) + trend_coef = zinv.dot(e) + resid = e - z.dot(trend_coef) + s = np.cumsum(resid, axis=0) + lam = np.mean(resid ** 2.0, axis=0) + kpss = 1 / (nobs ** 2.0) * np.sum(s ** 2.0, axis=0) / lam + return kpss + + +def block_simulate_kpss(nobs, b, deterministic='c', max_memory=250): + rng = RandomState() + rng.seed(0) + memory = max_memory * 2 ** 20 + b_max_memory = memory // 8 // nobs + b_max_memory = max(b_max_memory, 1) + remaining = b + results = np.zeros(b) + while remaining > 0: + b_eff = min(remaining, b_max_memory) + completed = b - remaining + results[completed:completed + b_eff] = \ + simulate_kpss(nobs, b_eff, deterministic=deterministic, rng=rng) + remaining -= b_max_memory + + return results + + +if __name__ == '__main__': + import datetime as dt + import cStringIO as cio + + nobs = 2000 + B = 10 * 10e6 + + percentiles = np.concatenate((np.arange(0.0, 99.0, 0.5), + np.arange(99.0, 100.0, 0.1))) + + critical_values = 100 - percentiles + critical_values_string = map(lambda x: '{0:0.1f}'.format(x), critical_values) + sio = cio.StringIO() + sio.write("kpss_critical_values = {}\n") + + hdf_filename = 'kpss_critical_values.h5' + try: + os.remove(hdf_filename) + except OSError: + pass + + for d in ('c', 'ct'): + now = dt.datetime.now() + kpss = block_simulate_kpss(nobs, B, deterministic=d) + print dt.datetime.now() - now + quantiles = np.percentile(kpss, list(percentiles)) + df = pd.DataFrame(quantiles, index=critical_values,columns=[d]) + df.to_hdf(hdf_filename, key=d, mode='a') + quantiles = map(lambda x: '{0:0.5f}'.format(x), quantiles) + + sio.write("kpss_critical_values['" + d + "'] = (") + count = 0 + for c, q in zip(critical_values_string, quantiles): + sio.write('(' + c + ', ' + q + ')') + count += 1 + if count % 3 == 0: + sio.write(',\n ') + else: + sio.write(', ') + sio.write(")") + sio.write("\n") + + sio.seek(0) + print sio.read() + + + + diff --git a/statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py b/statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py new file mode 100644 index 00000000000..aa1ff63e475 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py @@ -0,0 +1,87 @@ +import cStringIO as cio + +import numpy as np +import pandas as pd + + +sio = cio.StringIO() +c = pd.read_hdf('kpss_critical_values.h5', 'c') +ct = pd.read_hdf('kpss_critical_values.h5', 'ct') + +data = {'c': c, 'ct': ct} +for k, v in data.iteritems(): + n = v.shape[0] + selected = np.zeros((n, 1), dtype=np.bool) + selected[0] = True + selected[-1] = True + selected[v.index == 10.0] = True + selected[v.index == 5.0] = True + selected[v.index == 2.5] = True + selected[v.index == 1.0] = True + max_diff = 1.0 + while max_diff > 0.05: + xp = np.squeeze(v[selected].values) + yp = np.asarray(v[selected].index, dtype=np.float64) + x = np.squeeze(v.values) + y = np.asarray(v.index, dtype=np.float64) + yi = np.interp(x, xp, yp) + abs_diff = np.abs(y - yi) + max_diff = np.max(abs_diff) + if max_diff > 0.05: + print selected.sum() + print np.where(abs_diff == max_diff) + 
selected[np.where(abs_diff == max_diff)] = True + + quantiles = list(np.squeeze(v[selected].index.values)) + critical_values = list(np.squeeze(v[selected].values)) + # Fix for first CV + critical_values[0] = 0.0 + sio.write("kpss_critical_values['" + k + "'] = (") + count = 0 + for c, q in zip(critical_values, quantiles): + sio.write( + '(' + '{0:0.1f}'.format(q) + ', ' + '{0:0.4f}'.format(c) + ')') + count += 1 + if count % 3 == 0: + sio.write(',\n ') + else: + sio.write(', ') + sio.write(")\n") + sio.write("kpss_critical_values['" + k + "'] = ") + sio.write("np.array(kpss_critical_values['" + k + "'])") + sio.write("\n") + +sio.seek(0) +print sio.read() + + + + + +kpss_critical_values = {} +kpss_critical_values['c'] = ((100.0, 0.0091), (99.5, 0.0218), (99.0, 0.0248), + (97.5, 0.0304), (95.5, 0.0354), (92.5, 0.0416), + (89.5, 0.0470), (73.5, 0.0726), (69.0, 0.0802), + (63.5, 0.0900), (58.5, 0.0999), (53.5, 0.1108), + (48.5, 0.1229), (43.0, 0.1376), (39.5, 0.1483), + (35.5, 0.1622), (31.5, 0.1780), (29.5, 0.1867), + (26.5, 0.2017), (24.5, 0.2127), (20.5, 0.2378), + (18.5, 0.2524), (16.0, 0.2738), (13.5, 0.2993), + (12.0, 0.3183), (9.5, 0.3549), (8.0, 0.3833), + (6.5, 0.4173), (4.5, 0.4793), (3.5, 0.5237), + (2.5, 0.5828), (1.5, 0.6734), (0.7, 0.8157), + (0.3, 0.9674), (0.1, 1.1692), ) +kpss_critical_values['c'] = np.array(kpss_critical_values['c']) +kpss_critical_values['ct'] = ((100.0, 0.0077), (99.5, 0.0156), (99.0, 0.0173), + (98.0, 0.0194), (97.0, 0.0210), (95.5, 0.0229), + (94.0, 0.0245), (91.0, 0.0271), (88.0, 0.0295), + (85.5, 0.0313), (80.5, 0.0347), (67.0, 0.0434), + (60.0, 0.0480), (50.0, 0.0555), (46.5, 0.0584), + (40.0, 0.0643), (35.5, 0.0691), (32.0, 0.0731), + (29.0, 0.0769), (25.5, 0.0817), (21.0, 0.0894), + (18.5, 0.0943), (17.0, 0.0978), (15.5, 0.1015), + (13.0, 0.1088), (11.0, 0.1153), (8.5, 0.1257), + (7.0, 0.1337), (6.0, 0.1401), (4.0, 0.1570), + (2.5, 0.1770), (1.5, 0.1990), (0.7, 0.2350), + (0.3, 0.2738), (0.1, 0.3231), ) +kpss_critical_values['ct'] = np.array(kpss_critical_values['ct']) \ No newline at end of file diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py index 02f4e8d1469..37750222d21 100644 --- a/statsmodels/tsa/stattools.py +++ b/statsmodels/tsa/stattools.py @@ -1,19 +1,17 @@ """ Statistical tools for time series analysis """ - +from __future__ import division import numpy as np from numpy.linalg import LinAlgError from scipy import stats from statsmodels.regression.linear_model import OLS, yule_walker from statsmodels.tools.tools import add_constant, Bunch from tsatools import lagmat, lagmat2ds, add_trend -from adfvalues import mackinnonp, mackinnoncrit from statsmodels.tsa.arima_model import ARMA __all__ = ['acovf', 'acf', 'pacf', 'pacf_yw', 'pacf_ols', 'ccovf', 'ccf', - 'periodogram', 'q_stat', 'coint', 'arma_order_select_ic', - 'adfuller'] + 'periodogram', 'q_stat', 'arma_order_select_ic', 'cov_nw'] #NOTE: now in two places to avoid circular import @@ -23,6 +21,56 @@ def __str__(self): return self._str # pylint: disable=E1101 +def cov_nw(y, lags=0, demean=True, axis=0): + """ + Computes Newey-West covariance for 1-d and 2-d arrays + + Parameters + ---------- + y : array-like, 1d or 2d + Values to use when computing the Newey-West covariance estimator. + When u is 2d, default behavior is to treat columns as variables and + rows as observations. + lags : int, non-negative + Number of lags to include in the Newey-West covariance estimator + demean : bool + Indicates whether to subtract the mean. 
Default is True + axis : int, (0, 1) + The axis to use when y is 2d + + Returns + ------- + cov : array + The estimated covariance + + """ + z = y + is_1d = False + if axis > z.ndim: + raise ValueError('axis must be less than the dimension of y') + if z.ndim == 1: + is_1d = True + z = z[:, None] + if axis == 1: + z = z.T + n = z.shape[0] + if lags > n: + error = 'lags must be weakly smaller than the number of observations' + raise ValueError(error) + + if demean: + z = z - z.mean(0) + cov = z.T.dot(z) + for j in xrange(1, lags + 1): + w = (1 - j / (lags + 1)) + gamma = z[j:].T.dot(z[:-j]) + cov += w * (gamma + gamma.T) + cov = cov / n + if is_1d: + cov = float(cov) + return cov + + def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), fitargs=(), regresults=False): """ @@ -50,7 +98,7 @@ def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), icbest : float Best information criteria. bestlag : int - The lag length that maximizes the information criterion. + The lag length that minimizes the information criterion. Notes @@ -92,195 +140,6 @@ def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), return icbest, bestlag, results -#this needs to be converted to a class like HetGoldfeldQuandt, -# 3 different returns are a mess -# See: -#Ng and Perron(2001), Lag length selection and the construction of unit root -#tests with good size and power, Econometrica, Vol 69 (6) pp 1519-1554 -#TODO: include drift keyword, only valid with regression == "c" -# just changes the distribution of the test statistic to a t distribution -#TODO: autolag is untested -def adfuller(x, maxlag=None, regression="c", autolag='AIC', - store=False, regresults=False): - ''' - Augmented Dickey-Fuller unit root test - - The Augmented Dickey-Fuller test can be used to test for a unit root in a - univariate process in the presence of serial correlation. - - Parameters - ---------- - x : array_like, 1d - data series - maxlag : int - Maximum lag which is included in test, default 12*(nobs/100)^{1/4} - regression : str {'c','ct','ctt','nc'} - Constant and trend order to include in regression - * 'c' : constant only (default) - * 'ct' : constant and trend - * 'ctt' : constant, and linear and quadratic trend - * 'nc' : no constant, no trend - autolag : {'AIC', 'BIC', 't-stat', None} - * if None, then maxlag lags are used - * if 'AIC' (default) or 'BIC', then the number of lags is chosen - to minimize the corresponding information criterium - * 't-stat' based choice of maxlag. Starts with maxlag and drops a - lag until the t-statistic on the last lag length is significant at - the 95 % level. - store : bool - If True, then a result instance is returned additionally to - the adf statistic (default is False) - regresults : bool - If True, the full regression results are returned (default is False) - - Returns - ------- - adf : float - Test statistic - pvalue : float - MacKinnon's approximate p-value based on MacKinnon (1994) - usedlag : int - Number of lags used. - nobs : int - Number of observations used for the ADF regression and calculation of - the critical values. - critical values : dict - Critical values for the test statistic at the 1 %, 5 %, and 10 % - levels. Based on MacKinnon (2010) - icbest : float - The maximized information criterion if autolag is not None. 
- regresults : RegressionResults instance - The - resstore : (optional) instance of ResultStore - an instance of a dummy class with results attached as attributes - - Notes - ----- - The null hypothesis of the Augmented Dickey-Fuller is that there is a unit - root, with the alternative that there is no unit root. If the pvalue is - above a critical size, then we cannot reject that there is a unit root. - - The p-values are obtained through regression surface approximation from - MacKinnon 1994, but using the updated 2010 tables. - If the p-value is close to significant, then the critical values should be - used to judge whether to accept or reject the null. - - The autolag option and maxlag for it are described in Greene. - - Examples - -------- - see example script - - References - ---------- - Greene - Hamilton - - - P-Values (regression surface approximation) - MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for - unit-root and cointegration tests. `Journal of Business and Economic - Statistics` 12, 167-76. - - Critical values - MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's - University, Dept of Economics, Working Papers. Available at - http://ideas.repec.org/p/qed/wpaper/1227.html - - ''' - - if regresults: - store = True - - trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'} - if regression is None or isinstance(regression, int): - regression = trenddict[regression] - regression = regression.lower() - if regression not in ['c', 'nc', 'ct', 'ctt']: - raise ValueError("regression option %s not understood") % regression - x = np.asarray(x) - nobs = x.shape[0] - - if maxlag is None: - #from Greene referencing Schwert 1989 - maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.))) - - xdiff = np.diff(x) - xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in') - nobs = xdall.shape[0] # pylint: disable=E1103 - - xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x - xdshort = xdiff[-nobs:] - - if store: - resstore = ResultsStore() - if autolag: - if regression != 'nc': - fullRHS = add_trend(xdall, regression, prepend=True) - else: - fullRHS = xdall - startlag = fullRHS.shape[1] - xdall.shape[1] + 1 # 1 for level # pylint: disable=E1103 - #search for lag length with smallest information criteria - #Note: use the same number of observations to have comparable IC - #aic and bic: smaller is better - - if not regresults: - icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, - maxlag, autolag) - else: - icbest, bestlag, alres = _autolag(OLS, xdshort, fullRHS, startlag, - maxlag, autolag, - regresults=regresults) - resstore.autolag_results = alres - - bestlag -= startlag # convert to lag not column index - - #rerun ols with best autolag - xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in') - nobs = xdall.shape[0] # pylint: disable=E1103 - xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x - xdshort = xdiff[-nobs:] - usedlag = bestlag - else: - usedlag = maxlag - icbest = None - if regression != 'nc': - resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1], - regression)).fit() - else: - resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit() - - adfstat = resols.tvalues[0] -# adfstat = (resols.params[0]-1.0)/resols.bse[0] - # the "asymptotically correct" z statistic is obtained as - # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1) - # I think this is the statistic that is used for series that are integrated - # for orders higher than I(1), ie., not ADF but 
cointegration tests. - - # Get approx p-value and critical values - pvalue = mackinnonp(adfstat, regression=regression, N=1) - critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs) - critvalues = {"1%" : critvalues[0], "5%" : critvalues[1], - "10%" : critvalues[2]} - if store: - resstore.resols = resols - resstore.maxlag = maxlag - resstore.usedlag = usedlag - resstore.adfstat = adfstat - resstore.critvalues = critvalues - resstore.nobs = nobs - resstore.H0 = ("The coefficient on the lagged level equals 1 - " - "unit root") - resstore.HA = "The coefficient on the lagged level < 1 - stationary" - resstore.icbest = icbest - return adfstat, pvalue, critvalues, resstore - else: - if not autolag: - return adfstat, pvalue, usedlag, nobs, critvalues - else: - return adfstat, pvalue, usedlag, nobs, critvalues, icbest - - def acovf(x, unbiased=False, demean=True, fft=False): ''' Autocovariance for 1D @@ -884,66 +743,6 @@ def grangercausalitytests(x, maxlag, addconst=True, verbose=True): return resli -def coint(y1, y2, regression="c"): - """ - This is a simple cointegration test. Uses unit-root test on residuals to - test for cointegrated relationship - - See Hamilton (1994) 19.2 - - Parameters - ---------- - y1 : array_like, 1d - first element in cointegrating vector - y2 : array_like - remaining elements in cointegrating vector - c : str {'c'} - Included in regression - * 'c' : Constant - - Returns - ------- - coint_t : float - t-statistic of unit-root test on residuals - pvalue : float - MacKinnon's approximate p-value based on MacKinnon (1994) - crit_value : dict - Critical values for the test statistic at the 1 %, 5 %, and 10 % - levels. - - Notes - ----- - The Null hypothesis is that there is no cointegration, the alternative - hypothesis is that there is cointegrating relationship. If the pvalue is - small, below a critical size, then we can reject the hypothesis that there - is no cointegrating relationship. - - P-values are obtained through regression surface approximation from - MacKinnon 1994. - - References - ---------- - MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for - unit-root and cointegration tests. `Journal of Business and Economic - Statistics` 12, 167-76. 
- - """ - regression = regression.lower() - if regression not in ['c', 'nc', 'ct', 'ctt']: - raise ValueError("regression option %s not understood") % regression - y1 = np.asarray(y1) - y2 = np.asarray(y2) - if regression == 'c': - y2 = add_constant(y2, prepend=False) - st1_resid = OLS(y1, y2).fit().resid # stage one residuals - lgresid_cons = add_constant(st1_resid[0:-1], prepend=False) - uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit() - coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0] - pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None) - crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1)) - return coint_t, pvalue, crit_value - - def _safe_arma_fit(y, order, model_kw, trend, fit_kw, start_params=None): try: return ARMA(y, order=order, **model_kw).fit(disp=0, trend=trend, diff --git a/statsmodels/tsa/tests/test_adfuller_lag.py b/statsmodels/tsa/tests/test_adfuller_lag.py deleted file mode 100644 index 9b77633e1d6..00000000000 --- a/statsmodels/tsa/tests/test_adfuller_lag.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -"""Test for autolag of adfuller, unitroot_adf - -Created on Wed May 30 21:39:46 2012 -Author: Josef Perktold -""" - -import numpy as np -from numpy.testing import assert_equal, assert_almost_equal -import statsmodels.tsa.stattools as tsast -from statsmodels.datasets import macrodata - -def test_adf_autolag(): - #see issue #246 - #this is mostly a unit test - d2 = macrodata.load().data - - for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']): - #[None:'nc', 0:'c', 1:'ct', 2:'ctt'] - x = np.log(d2['realgdp']) - xd = np.diff(x) - - #check exog - adf3 = tsast.adfuller(x, maxlag=None, autolag='aic', - regression=tr, store=True, regresults=True) - st2 = adf3[-1] - - assert_equal(len(st2.autolag_results), 15 + 1) #+1 for lagged level - for l, res in sorted(st2.autolag_results.iteritems())[:5]: - lag = l-k_trend - #assert correct design matrices in _autolag - assert_equal(res.model.exog[-10:,k_trend], x[-11:-1]) - assert_equal(res.model.exog[-1,k_trend+1:], xd[-lag:-1][::-1]) - #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend - assert_equal(st2.usedlag, 2) - - #same result with lag fixed at usedlag of autolag - adf2 = tsast.adfuller(x, maxlag=2, autolag=None, regression=tr) - assert_almost_equal(adf3[:2], adf2[:2], decimal=12) - - - tr = 'c' - #check maxlag with autolag - adf3 = tsast.adfuller(x, maxlag=5, autolag='aic', - regression=tr, store=True, regresults=True) - assert_equal(len(adf3[-1].autolag_results), 5 + 1) - adf3 = tsast.adfuller(x, maxlag=0, autolag='aic', - regression=tr, store=True, regresults=True) - assert_equal(len(adf3[-1].autolag_results), 0 + 1) diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py index 775af8ec8f0..069bd1510e1 100644 --- a/statsmodels/tsa/tests/test_stattools.py +++ b/statsmodels/tsa/tests/test_stattools.py @@ -1,12 +1,12 @@ -from statsmodels.tsa.stattools import (adfuller, acf, pacf_ols, pacf_yw, - pacf, grangercausalitytests, - coint, acovf, - arma_order_select_ic) +from statsmodels.tsa.stattools import (acf, pacf_yw, pacf, + grangercausalitytests, acovf, + arma_order_select_ic, cov_nw) +from statsmodels.tsa.cointegration import coint from statsmodels.tsa.base.datetools import dates_from_range import numpy as np from numpy.testing import (assert_almost_equal, assert_equal, assert_raises, dec, assert_) -from numpy import genfromtxt#, concatenate +from numpy import genfromtxt, log, diff from statsmodels.datasets import macrodata, sunspots 
from pandas import Series, Index, DataFrame import os @@ -20,92 +20,6 @@ DECIMAL_2 = 2 DECIMAL_1 = 1 -class CheckADF(object): - """ - Test Augmented Dickey-Fuller - - Test values taken from Stata. - """ - levels = ['1%', '5%', '10%'] - data = macrodata.load() - x = data.data['realgdp'] - y = data.data['infl'] - - def test_teststat(self): - assert_almost_equal(self.res1[0], self.teststat, DECIMAL_5) - - def test_pvalue(self): - assert_almost_equal(self.res1[1], self.pvalue, DECIMAL_5) - - def test_critvalues(self): - critvalues = [self.res1[4][lev] for lev in self.levels] - assert_almost_equal(critvalues, self.critvalues, DECIMAL_2) - -class TestADFConstant(CheckADF): - """ - Dickey-Fuller test for unit root - """ - def __init__(self): - self.res1 = adfuller(self.x, regression="c", autolag=None, - maxlag=4) - self.teststat = .97505319 - self.pvalue = .99399563 - self.critvalues = [-3.476, -2.883, -2.573] - -class TestADFConstantTrend(CheckADF): - """ - """ - def __init__(self): - self.res1 = adfuller(self.x, regression="ct", autolag=None, - maxlag=4) - self.teststat = -1.8566374 - self.pvalue = .67682968 - self.critvalues = [-4.007, -3.437, -3.137] - -#class TestADFConstantTrendSquared(CheckADF): -# """ -# """ -# pass -#TODO: get test values from R? - -class TestADFNoConstant(CheckADF): - """ - """ - def __init__(self): - self.res1 = adfuller(self.x, regression="nc", autolag=None, - maxlag=4) - self.teststat = 3.5227498 - self.pvalue = .99999 # Stata does not return a p-value for noconstant. - # Tau^max in MacKinnon (1994) is missing, so it is - # assumed that its right-tail is well-behaved - self.critvalues = [-2.587, -1.950, -1.617] - -# No Unit Root - -class TestADFConstant2(CheckADF): - def __init__(self): - self.res1 = adfuller(self.y, regression="c", autolag=None, - maxlag=1) - self.teststat = -4.3346988 - self.pvalue = .00038661 - self.critvalues = [-3.476, -2.883, -2.573] - -class TestADFConstantTrend2(CheckADF): - def __init__(self): - self.res1 = adfuller(self.y, regression="ct", autolag=None, - maxlag=1) - self.teststat = -4.425093 - self.pvalue = .00199633 - self.critvalues = [-4.006, -3.437, -3.137] - -class TestADFNoConstant2(CheckADF): - def __init__(self): - self.res1 = adfuller(self.y, regression="nc", autolag=None, - maxlag=1) - self.teststat = -2.4511596 - self.pvalue = 0.013747 # Stata does not return a p-value for noconstant - # this value is just taken from our results - self.critvalues = [-2.587,-1.950,-1.617] class CheckCorrGram(object): """ @@ -321,6 +235,43 @@ def test_arma_order_select_ic_failure(): res = arma_order_select_ic(y) +class TestVarNW(object): + @classmethod + def setupClass(cls): + from statsmodels.datasets.macrodata import load + + cls.cpi = log(load().data['cpi']) + cls.inflation = diff(cls.cpi) + + def test_cov_nw(self): + y = self.inflation + simple_cov = cov_nw(y, lags=0) + e = y - y.mean() + assert_almost_equal(e.dot(e) / e.shape[0], simple_cov) + + def test_cov_nw_no_demean(self): + y = self.inflation + simple_cov = cov_nw(y, lags=0, demean=False) + assert_almost_equal(y.dot(y) / y.shape[0], simple_cov) + + def test_cov_nw_2d(self): + y = np.random.randn(100, 2) + simple_cov = cov_nw(y, lags=0) + e = y - y.mean(0) + assert_almost_equal(e.T.dot(e) / e.shape[0], simple_cov) + + def test_cov_nw_2d_2lags(self): + y = np.random.randn(100, 2) + e = y - y.mean(0) + gamma_0 = e.T.dot(e) + gamma_1 = e[1:].T.dot(e[:-1]) + gamma_2 = e[2:].T.dot(e[:-2]) + w1, w2 = 1.0 - (1.0 / 3.0), 1.0 - (2.0 / 3.0) + expected = (gamma_0 + w1 * (gamma_1 + gamma_1.T) + w2 * 
( + gamma_2 + gamma_2.T)) / 100.0 + assert_almost_equal(cov_nw(y, lags=2), expected) + + if __name__=="__main__": import nose # nose.runmodule(argv=[__file__, '-vvs','-x','-pdb'], exit=False) diff --git a/statsmodels/tsa/tests/test_tsa_tools.py b/statsmodels/tsa/tests/test_tsa_tools.py index c8b7fc73873..ec7a19568dc 100644 --- a/statsmodels/tsa/tests/test_tsa_tools.py +++ b/statsmodels/tsa/tests/test_tsa_tools.py @@ -2,26 +2,51 @@ ''' +import warnings + import numpy as np -from numpy.testing import assert_array_almost_equal, assert_equal +from numpy.random import randn +from numpy.testing import assert_array_almost_equal, assert_equal, assert_raises +import pandas as pd +from nose import SkipTest +from nose.tools import nottest +from pandas.util.testing import assert_frame_equal, assert_produces_warning + +import statsmodels.tsa.tsatools as tools +from statsmodels.tsa.tsatools import vec, vech, reintegrate, unvec import statsmodels.api as sm import statsmodels.tsa.stattools as tsa -import statsmodels.tsa.tsatools as tools -from statsmodels.tsa.tsatools import vec, vech +from statsmodels.regression.linear_model import OLS +from statsmodels.tsa.tests.results import savedrvs +from statsmodels.tsa.tests.results.datamlw_tls import mlacf, mlccf, mlpacf, \ + mlywar -from results import savedrvs -from results.datamlw_tls import mlacf, mlccf, mlpacf, mlywar xo = savedrvs.rvsdata.xar2 -x100 = xo[-100:]/1000. -x1000 = xo/1000. +x100 = xo[-100:] / 1000. +x1000 = xo / 1000. + +pdversion = pd.version.version.split('.') +pdmajor = int(pdversion[0]) +pdminor = int(pdversion[1]) + +macro_data = sm.datasets.macrodata.load().data[['year', + 'quarter', + 'realgdp', + 'cpi']] + + +def skip_if_early_pandas(): + if pdmajor == 0 and pdminor <= 12: + raise SkipTest("known failure of test on early pandas") def test_acf(): acf_x = tsa.acf(x100, unbiased=False)[:21] - assert_array_almost_equal(mlacf.acf100.ravel(), acf_x, 8) #why only dec=8 + assert_array_almost_equal(mlacf.acf100.ravel(), acf_x, 8) # why only dec=8 acf_x = tsa.acf(x1000, unbiased=False)[:21] - assert_array_almost_equal(mlacf.acf1000.ravel(), acf_x, 8) #why only dec=9 + assert_array_almost_equal(mlacf.acf1000.ravel(), acf_x, 8) # why only dec=9 + def test_ccf(): ccf_x = tsa.ccf(x100[4:], x100[:-4], unbiased=False)[:21] @@ -29,6 +54,7 @@ def test_ccf(): ccf_x = tsa.ccf(x1000[4:], x1000[:-4], unbiased=False)[:21] assert_array_almost_equal(mlccf.ccf1000.ravel()[:21][::-1], ccf_x, 8) + def test_pacf_yw(): pacfyw = tsa.pacf_yw(x100, 20, method='mle') assert_array_almost_equal(mlpacf.pacf100.ravel(), pacfyw, 1) @@ -36,6 +62,8 @@ def test_pacf_yw(): assert_array_almost_equal(mlpacf.pacf1000.ravel(), pacfyw, 2) #assert False + +@nottest def test_pacf_ols(): pacfols = tsa.pacf_ols(x100, 20) assert_array_almost_equal(mlpacf.pacf100.ravel(), pacfols, 8) @@ -43,112 +71,154 @@ def test_pacf_ols(): assert_array_almost_equal(mlpacf.pacf1000.ravel(), pacfols, 8) #assert False + def test_ywcoef(): - assert_array_almost_equal(mlywar.arcoef100[1:], - -sm.regression.yule_walker(x100, 10, method='mle')[0], 8) - assert_array_almost_equal(mlywar.arcoef1000[1:], - -sm.regression.yule_walker(x1000, 20, method='mle')[0], 8) + yw = -sm.regression.yule_walker(x100, 10, method='mle')[0] + ar_coef = mlywar.arcoef100[1:] + assert_array_almost_equal(ar_coef, yw, 8) + yw = -sm.regression.yule_walker(x1000, 20, method='mle')[0] + ar_coef = mlywar.arcoef1000[1:] + assert_array_almost_equal(yw, ar_coef, 8) + def test_duplication_matrix(): for k in range(2, 10): - m = 
tools.unvech(np.random.randn(k * (k + 1) / 2)) - Dk = tools.duplication_matrix(k) - assert(np.array_equal(vec(m), np.dot(Dk, vech(m)))) + m = tools.unvech(randn(k * (k + 1) / 2)) + duplication = tools.duplication_matrix(k) + assert_equal(vec(m), np.dot(duplication, vech(m))) + def test_elimination_matrix(): for k in range(2, 10): - m = np.random.randn(k, k) - Lk = tools.elimination_matrix(k) - assert(np.array_equal(vech(m), np.dot(Lk, vec(m)))) + m = randn(k, k) + elimination = tools.elimination_matrix(k) + assert_equal(vech(m), np.dot(elimination, vec(m))) + def test_commutation_matrix(): - m = np.random.randn(4, 3) - K = tools.commutation_matrix(4, 3) - assert(np.array_equal(vec(m.T), np.dot(K, vec(m)))) + m = randn(4, 3) + commutation = tools.commutation_matrix(4, 3) + assert_equal(vec(m.T), np.dot(commutation, vec(m))) + def test_vec(): arr = np.array([[1, 2], [3, 4]]) - assert(np.array_equal(vec(arr), [1, 3, 2, 4])) + assert_equal(vec(arr), [1, 3, 2, 4]) + def test_vech(): arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - assert(np.array_equal(vech(arr), [1, 4, 7, 5, 8, 9])) + assert_equal(vech(arr), [1, 4, 7, 5, 8, 9]) + + +def test_unvec(): + arr = np.array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]) + assert_equal(unvec(vec(arr)), arr) + + +def test_lagmat_trim(): + x = np.arange(20.0) + 1.0 + lags = 5 + base_lagmat = sm.tsa.lagmat(x, lags, trim=None) + forward_lagmat = sm.tsa.lagmat(x, lags, trim='forward') + assert_equal(base_lagmat[:-lags], forward_lagmat) + backward_lagmat = sm.tsa.lagmat(x, lags, trim='backward') + assert_equal(base_lagmat[lags:], backward_lagmat) + both_lagmat = sm.tsa.lagmat(x, lags, trim='both') + assert_equal(base_lagmat[lags:-lags], both_lagmat) + + +def test_lagmat_bad_trim(): + x = randn(10, 1) + assert_raises(ValueError, sm.tsa.lagmat, x, 1, trim='unknown') def test_add_lag_insert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:3],lagmat,nddata[3:,-1])) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :3], lagmat, nddata[3:, -1])) + data = pd.DataFrame(data) lag_data = sm.tsa.add_lag(data, 'realgdp', 3) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) + def test_add_lag_noinsert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) + data = pd.DataFrame(data) lag_data = sm.tsa.add_lag(data, 'realgdp', 3, insert=False) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) + def test_add_lag_noinsert_atend(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,-1],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, -1], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) + data = pd.DataFrame(data) lag_data = sm.tsa.add_lag(data, 'cpi', 3, insert=False) - 
assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) # should be the same as insert lag_data2 = sm.tsa.add_lag(data, 'cpi', 3, insert=True) - assert_equal(lag_data2.view((float,len(lag_data2.dtype.names))), results) + assert_equal(lag_data2.values, results) + def test_add_lag_ndarray(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:3],lagmat,nddata[3:,-1])) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :3], lagmat, nddata[3:, -1])) lag_data = sm.tsa.add_lag(nddata, 2, 3) assert_equal(lag_data, results) + def test_add_lag_noinsert_ndarray(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) lag_data = sm.tsa.add_lag(nddata, 2, 3, insert=False) assert_equal(lag_data, results) + def test_add_lag_noinsertatend_ndarray(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,-1],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, -1], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) lag_data = sm.tsa.add_lag(nddata, 3, 3, insert=False) assert_equal(lag_data, results) # should be the same as insert also check negative col number lag_data2 = sm.tsa.add_lag(nddata, -1, 3, insert=True) assert_equal(lag_data2, results) + def test_add_lag1d(): - data = np.random.randn(100) - lagmat = sm.tsa.lagmat(data,3,trim='Both') - results = np.column_stack((data[3:],lagmat)) + data = randn(100) + lagmat = sm.tsa.lagmat(data, 3, trim='Both') + results = np.column_stack((data[3:], lagmat)) lag_data = sm.tsa.add_lag(data, lags=3, insert=True) assert_equal(results, lag_data) # add index - data = data[:,None] - lagmat = sm.tsa.lagmat(data,3,trim='Both') # test for lagmat too - results = np.column_stack((data[3:],lagmat)) - lag_data = sm.tsa.add_lag(data,lags=3, insert=True) + data = data[:, None] + lagmat = sm.tsa.lagmat(data, 3, trim='Both') # test for lagmat too + results = np.column_stack((data[3:], lagmat)) + lag_data = sm.tsa.add_lag(data, lags=3, insert=True) assert_equal(results, lag_data) + def test_add_lag1d_drop(): - data = np.random.randn(100) - lagmat = sm.tsa.lagmat(data,3,trim='Both') + data = randn(100) + lagmat = sm.tsa.lagmat(data, 3, trim='Both') lag_data = sm.tsa.add_lag(data, lags=3, drop=True, insert=True) assert_equal(lagmat, lag_data) @@ -156,45 +226,52 @@ def test_add_lag1d_drop(): lag_data = sm.tsa.add_lag(data, lags=3, drop=True, insert=False) assert_equal(lagmat, lag_data) + def test_add_lag1d_struct(): - data = np.zeros(100, dtype=[('variable',float)]) - nddata = np.random.randn(100) + data = np.zeros(100, dtype=[('variable', float)]) + nddata = randn(100) data['variable'] = nddata - - lagmat = sm.tsa.lagmat(nddata,3,trim='Both', original='in') + data = pd.DataFrame(data) + lagmat = sm.tsa.lagmat(nddata, 3, trim='Both', original='in') lag_data = 
sm.tsa.add_lag(data, 'variable', lags=3, insert=True) - assert_equal(lagmat, lag_data.view((float,4))) + assert_equal(lagmat, lag_data.values) lag_data = sm.tsa.add_lag(data, 'variable', lags=3, insert=False) - assert_equal(lagmat, lag_data.view((float,4))) + assert_equal(lagmat, lag_data.values) lag_data = sm.tsa.add_lag(data, lags=3, insert=True) - assert_equal(lagmat, lag_data.view((float,4))) + assert_equal(lagmat, lag_data.values) + def test_add_lag_1d_drop_struct(): - data = np.zeros(100, dtype=[('variable',float)]) - nddata = np.random.randn(100) + data = np.zeros(100, dtype=[('variable', float)]) + nddata = randn(100) data['variable'] = nddata + data = pd.DataFrame(data) - lagmat = sm.tsa.lagmat(nddata,3,trim='Both') + lagmat = sm.tsa.lagmat(nddata, 3, trim='Both') lag_data = sm.tsa.add_lag(data, lags=3, drop=True) - assert_equal(lagmat, lag_data.view((float,3))) + assert_equal(lagmat, lag_data.values) + def test_add_lag_drop_insert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:2],lagmat,nddata[3:,-1])) + data = macro_data + nddata = data.view((float, 4)) + data = pd.DataFrame(data) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :2], lagmat, nddata[3:, -1])) lag_data = sm.tsa.add_lag(data, 'realgdp', 3, drop=True) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) + def test_add_lag_drop_noinsert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,np.array([0,1,3])],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + data = pd.DataFrame(data) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, np.array([0, 1, 3])], lagmat)) lag_data = sm.tsa.add_lag(data, 'realgdp', 3, insert=False, drop=True) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) def test_freq_to_period(): from pandas.tseries.frequencies import to_offset @@ -204,13 +281,159 @@ def test_freq_to_period(): assert_equal(tools.freq_to_period(i), j) assert_equal(tools.freq_to_period(to_offset(i)), j) -if __name__ == '__main__': - #running them directly - # test_acf() - # test_ccf() - # test_pacf_yw() - # test_pacf_ols() - # test_ywcoef() +def test_add_lag_drop_insert_early(): + x = randn(5, 3) + x_lags_incl = sm.tsa.add_lag(x, lags=2, insert=0, drop=False) + x_lags_excl = sm.tsa.add_lag(x, lags=2, insert=0, drop=True) + assert_equal(x_lags_incl[:, :2], x_lags_excl[:, :2]) + assert_equal(x_lags_incl[:, 3:], x_lags_excl[:, 2:]) + + +def test_add_lag_negative_index(): + x = randn(5, 3) + x_lags_incl = sm.tsa.add_lag(x, lags=2, insert=-1, drop=False) + assert_equal(x_lags_incl[:, 0], x[2:, 0]) + assert_equal(x_lags_incl[:, 3], x[1:4, 0]) + assert_equal(x_lags_incl[:, 4], x[:3, 0]) + + +def test_add_trend_prepend(): + n = 10 + x = randn(n, 1) + trend_1 = sm.tsa.add_trend(x, trend='ct', prepend=True) + trend_2 = sm.tsa.add_trend(x, trend='ct', prepend=False) + assert_equal(trend_1[:, :2], trend_2[:, 1:]) + + +def test_add_time_trend_dataframe(): + n = 10 + x = randn(n, 1) + x = pd.DataFrame(x, columns=['col1']) + trend_1 = sm.tsa.add_trend(x, trend='t') + 
assert_array_almost_equal(np.asarray(trend_1['trend']), + np.arange(1.0, n + 1)) + + +def test_add_trend_prepend_dataframe(): + # Skipped on pandas < 13.1 it seems + skip_if_early_pandas() + n = 10 + x = randn(n, 1) + x = pd.DataFrame(x, columns=['col1']) + trend_1 = sm.tsa.add_trend(x, trend='ct', prepend=True) + trend_2 = sm.tsa.add_trend(x, trend='ct', prepend=False) + assert_frame_equal(trend_1.iloc[:, :2], trend_2.iloc[:, 1:]) + + +def test_add_trend_duplicate_name(): + x = pd.DataFrame(np.zeros((10, 1)), columns=['trend']) + with warnings.catch_warnings(record=True) as w: + assert_produces_warning(sm.tsa.add_trend(x, trend='ct'), + tools.ColumnNameConflict) + y = sm.tsa.add_trend(x, trend='ct') + # should produce a single warning + np.testing.assert_equal(len(w), 1) + assert 'const' in y.columns + assert 'trend_0' in y.columns + + +def test_add_trend_c(): + x = np.zeros((10, 1)) + y = sm.tsa.add_trend(x, trend='c') + assert np.all(y[:, 1] == 1.0) + + +def test_add_trend_ct(): + n = 20 + x = np.zeros((20, 1)) + y = sm.tsa.add_trend(x, trend='ct') + assert np.all(y[:, 1] == 1.0) + assert_equal(y[0, 2], 1.0) + assert_array_almost_equal(np.diff(y[:, 2]), np.ones((n - 1))) + + +def test_add_trend_ctt(): + n = 10 + x = np.zeros((n, 1)) + y = sm.tsa.add_trend(x, trend='ctt') + assert np.all(y[:, 1] == 1.0) + assert y[0, 2] == 1.0 + assert_array_almost_equal(np.diff(y[:, 2]), np.ones((n - 1))) + assert y[0, 3] == 1.0 + assert_array_almost_equal(np.diff(y[:, 3]), np.arange(3.0, 2.0 * n, 2.0)) + + +def test_add_trend_t(): + n = 20 + x = np.zeros((20, 1)) + y = sm.tsa.add_trend(x, trend='t') + assert y[0, 1] == 1.0 + assert_array_almost_equal(np.diff(y[:, 1]), np.ones((n - 1))) + +def test_add_trend_no_input(): + n = 100 + y = sm.tsa.add_trend(x=None, trend='ct', nobs=n) + assert np.all(y[:, 0] == 1.0) + assert y[0, 1] == 1.0 + assert_array_almost_equal(np.diff(y[:, 1]), np.ones((n - 1))) + + +def test_reintegrate_1_diff(): + x = randn(10, 1) + y = np.cumsum(x) + 1.0 + assert_array_almost_equal(y, reintegrate(np.diff(y), [y[0]])) + + +def test_reintegrate_2_diff(): + x = randn(10, 1) + y = np.cumsum(x) + 1.0 + z = np.cumsum(y) + 1.0 + levels = [z[0], np.diff(z, 1)[0]] + assert_array_almost_equal(z, reintegrate(np.diff(z, 2), levels)) + + +def test_detrend_1d_order_0(): + x = randn(100) + assert_array_almost_equal(x - x.mean(), sm.tsa.detrend(x, order=0)) + + +def test_detrend_1d_order_1(): + n = 100 + x = randn(n) + z = sm.tsa.add_trend(x=None, trend='ct', nobs=n) + resid = OLS(x, z).fit().resid + detrended = sm.tsa.detrend(x, order=1) + assert_array_almost_equal(resid, detrended) + + +def test_detrend_2d_order_0(): + n = 100 + x = randn(n, 1) + assert_array_almost_equal(x - x.mean(), sm.tsa.detrend(x, order=0)) + + +def test_detrend_2d_order_2(): + n = 100 + x = randn(n, 1) + z = sm.tsa.add_trend(x=None, trend='ctt', nobs=n) + resid = OLS(x, z).fit().resid[:, None] + detrended = sm.tsa.detrend(x, order=2) + assert_array_almost_equal(resid, detrended) + + +def test_detrend_2d_order_2_axis_1(): + n = 100 + x = randn(1, n) + z = sm.tsa.add_trend(x=None, trend='ctt', nobs=n) + resid = OLS(x.T, z).fit().resid[None, :] + detrended = sm.tsa.detrend(x, order=2, axis=1) + assert_array_almost_equal(resid, detrended) + + pass + + +if __name__ == '__main__': import nose - nose.runmodule() + + nose.runmodule(argv=[__file__, '-vvs'], exit=False) diff --git a/statsmodels/tsa/tests/test_unitroot.py b/statsmodels/tsa/tests/test_unitroot.py new file mode 100644 index 00000000000..84cecfb035f --- /dev/null +++ 
b/statsmodels/tsa/tests/test_unitroot.py @@ -0,0 +1,380 @@ +# TODO: Tests for features that are just called +# TODO: Test for trend='ctt' +from __future__ import division + +import numpy as np +from numpy.testing import (assert_almost_equal, assert_equal, assert_raises) +from numpy import log, polyval, diff, ceil, ones + +from statsmodels.tsa.unitroot import (ADF, adfuller, DFGLS, ensure_1d, + PhillipsPerron, KPSS, UnitRootTest, + VarianceRatio) +from statsmodels.tsa.critical_values.dickey_fuller import tau_2010 +from statsmodels.datasets import macrodata +import warnings + +DECIMAL_5 = 5 +DECIMAL_4 = 4 +DECIMAL_3 = 3 +DECIMAL_2 = 2 +DECIMAL_1 = 1 + + +def test_adf_autolag(): + #see issue #246 + #this is mostly a unit test + d2 = macrodata.load().data + x = log(d2['realgdp']) + xd = diff(x) + + for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']): + #check exog + adf3 = adfuller(x, maxlag=None, autolag='aic', + regression=tr, store=True, regresults=True) + st2 = adf3[-1] + + assert_equal(len(st2.autolag_results), 15 + 1) # +1 for lagged level + for l, res in sorted(st2.autolag_results.iteritems())[:5]: + lag = l - k_trend + #assert correct design matrices in _autolag + assert_equal(res.model.exog[-10:, k_trend], x[-11:-1]) + assert_equal(res.model.exog[-1, k_trend + 1:], xd[-lag:-1][::-1]) + #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend + assert_equal(st2.usedlag, 2) + + #same result with lag fixed at usedlag of autolag + adf2 = adfuller(x, maxlag=2, autolag=None, regression=tr) + assert_almost_equal(adf3[:2], adf2[:2], decimal=12) + + tr = 'c' + #check maxlag with autolag + adf3 = adfuller(x, maxlag=5, autolag='aic', + regression=tr, store=True, regresults=True) + assert_equal(len(adf3[-1].autolag_results), 5 + 1) + adf3 = adfuller(x, maxlag=0, autolag='aic', + regression=tr, store=True, regresults=True) + assert_equal(len(adf3[-1].autolag_results), 0 + 1) + + +class CheckADF(object): + """ + Test Augmented Dickey-Fuller + + Test values taken from Stata. + """ + levels = ['1%', '5%', '10%'] + data = macrodata.load() + x = data.data['realgdp'] + y = data.data['infl'] + + def test_teststat(self): + assert_almost_equal(self.res1[0], self.teststat, DECIMAL_5) + + def test_pvalue(self): + assert_almost_equal(self.res1[1], self.pvalue, DECIMAL_5) + + def test_critvalues(self): + critvalues = [self.res1[4][lev] for lev in self.levels] + assert_almost_equal(critvalues, self.critvalues, DECIMAL_2) + + +class TestADFConstant(CheckADF): + """ + Dickey-Fuller test for unit root + """ + + def __init__(self): + self.res1 = adfuller(self.x, regression="c", autolag=None, + maxlag=4) + self.teststat = .97505319 + self.pvalue = .99399563 + self.critvalues = [-3.476, -2.883, -2.573] + + +class TestADFConstantTrend(CheckADF): + """ + """ + + def __init__(self): + self.res1 = adfuller(self.x, regression="ct", autolag=None, + maxlag=4) + self.teststat = -1.8566374 + self.pvalue = .67682968 + self.critvalues = [-4.007, -3.437, -3.137] + + +#class TestADFConstantTrendSquared(CheckADF): +# """ +# """ +# pass +#TODO: get test values from R? + +class TestADFNoConstant(CheckADF): + """ + """ + + def __init__(self): + self.res1 = adfuller(self.x, regression="nc", autolag=None, + maxlag=4) + self.teststat = 3.5227498 + self.pvalue = .99999 # Stata does not return a p-value for noconstant. 
+ # Tau^max in MacKinnon (1994) is missing, so it is + # assumed that its right-tail is well-behaved + self.critvalues = [-2.587, -1.950, -1.617] + + +# No Unit Root + +class TestADFConstant2(CheckADF): + def __init__(self): + self.res1 = adfuller(self.y, regression="c", autolag=None, + maxlag=1) + self.teststat = -4.3346988 + self.pvalue = .00038661 + self.critvalues = [-3.476, -2.883, -2.573] + + +class TestADFConstantTrend2(CheckADF): + def __init__(self): + self.res1 = adfuller(self.y, regression="ct", autolag=None, + maxlag=1) + self.teststat = -4.425093 + self.pvalue = .00199633 + self.critvalues = [-4.006, -3.437, -3.137] + + +class TestADFNoConstant2(CheckADF): + def __init__(self): + self.res1 = adfuller(self.y, regression="nc", autolag=None, + maxlag=1) + self.teststat = -2.4511596 + self.pvalue = 0.013747 # Stata does not return a p-value for noconstant + # this value is just taken from our results + self.critvalues = [-2.587, -1.950, -1.617] + + +class TestUnitRoot(object): + @classmethod + def setupClass(cls): + from statsmodels.datasets.macrodata import load + + cls.cpi = log(load().data['cpi']) + cls.inflation = diff(cls.cpi) + cls.inflation_change = diff(cls.inflation) + + def test_adf_no_options(self): + adf = ADF(self.inflation) + assert_almost_equal(adf.stat, -3.09310, DECIMAL_4) + assert_equal(adf.lags, 2) + assert_almost_equal(adf.pvalue, .027067, DECIMAL_4) + adf.regression.summary() + + def test_adf_no_lags(self): + adf = ADF(self.inflation, lags=0).stat + assert_almost_equal(adf, -6.56880, DECIMAL_4) + + def test_adf_nc_no_lags(self): + adf = ADF(self.inflation, trend='nc', lags=0) + assert_almost_equal(adf.stat, -3.88845, DECIMAL_4) + # 16.239 + + def test_adf_c_no_lags(self): + adf = ADF(self.inflation, trend='c', lags=0) + assert_almost_equal(adf.stat, -6.56880, DECIMAL_4) + assert_equal(adf.nobs, self.inflation.shape[0] - adf.lags - 1) + + def test_adf_ct_no_lags(self): + adf = ADF(self.inflation, trend='ct', lags=0) + assert_almost_equal(adf.stat, -6.66705, DECIMAL_4) + + def test_adf_lags_10(self): + adf = ADF(self.inflation, lags=10) + assert_almost_equal(adf.stat, -2.28375, DECIMAL_4) + adf.summary() + + def test_adf_auto_bic(self): + adf = ADF(self.inflation, method='BIC') + assert_equal(adf.lags, 2) + + def test_adf_critical_value(self): + adf = ADF(self.inflation, trend='c', lags=3) + adf_cv = adf.critical_values + temp = polyval(tau_2010['c'][0, :, ::-1].T, 1. / adf.nobs) + cv = {'1%': temp[0], '5%': temp[1], '10%': temp[2]} + for k, v in cv.iteritems(): + assert_almost_equal(v, adf_cv[k]) + + def test_adf_auto_t_stat(self): + adf = ADF(self.inflation, method='t-stat') + assert_equal(adf.lags, 10) + old_stat = adf.stat + adf.lags += 1 + assert adf.stat != old_stat + old_stat = adf.stat + assert_equal(adf.y, self.inflation) + adf.trend = 'ctt' + assert adf.stat != old_stat + assert adf.trend == 'ctt' + assert len(adf.valid_trends) == len(('nc', 'c', 'ct', 'ctt')) + for d in adf.valid_trends: + assert d in ('nc', 'c', 'ct', 'ctt') + assert adf.h0 == 'The process contains a unit root.' + assert adf.hA == 'The process is weakly stationary.' 
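The test above exercises the mutable properties of the new ADF class (stat, pvalue, lags, trend, valid_trends, h0, hA). A minimal usage sketch of that interface outside the test suite might look as follows; this is illustrative only, uses the same macrodata inflation series the tests build, and assumes the default lag-selection behaviour shown indirectly by the tests rather than any documented default:

    from numpy import diff, log
    from statsmodels.datasets.macrodata import load
    from statsmodels.tsa.unitroot import ADF

    inflation = diff(log(load().data['cpi']))   # same series as in setupClass above
    adf = ADF(inflation)                        # lags chosen automatically
    print adf.stat, adf.pvalue, adf.lags        # statistic, MacKinnon p-value, selected lag length
    adf.trend = 'ct'                            # changing the trend triggers recomputation of the statistic
    print adf.summary()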
+ + def test_kpss_auto(self): + kpss = KPSS(self.inflation) + m = self.inflation.shape[0] + lags = np.ceil(12.0 * (m / 100) ** (1.0 / 4)) + assert_equal(kpss.lags, lags) + + def test_kpss(self): + kpss = KPSS(self.inflation, trend='ct', lags=12) + assert_almost_equal(kpss.stat, .235581902996454, DECIMAL_4) + assert_equal(self.inflation.shape[0], kpss.nobs) + kpss.summary() + + def test_kpss_c(self): + kpss = KPSS(self.inflation, trend='c', lags=12) + assert_almost_equal(kpss.stat, .3276290340191141, DECIMAL_4) + + def test_pp(self): + pp = PhillipsPerron(self.inflation, lags=12) + assert_almost_equal(pp.stat, -7.8076512, DECIMAL_4) + assert pp.test_type == 'tau' + pp.test_type = 'rho' + assert_almost_equal(pp.stat, -108.1552688, DECIMAL_2) + with assert_raises(ValueError): + pp.test_type = 'unknown' + pp.summary() + + def test_pp_auto(self): + pp = PhillipsPerron(self.inflation) + n = self.inflation.shape[0] - 1 + lags = ceil(12.0 * ((n / 100.0) ** (1.0 / 4.0))) + assert_equal(pp.lags, lags) + assert_almost_equal(pp.stat, -8.135547778, DECIMAL_4) + pp.test_type = 'rho' + assert_almost_equal(pp.stat, -118.7746451, DECIMAL_2) + + def test_dfgls_c(self): + dfgls = DFGLS(self.inflation, trend='c', lags=0) + assert_almost_equal(dfgls.stat, -6.017304, DECIMAL_4) + dfgls.summary() + dfgls.regression.summary() + + def test_dfgls(self): + dfgls = DFGLS(self.inflation, trend='ct', lags=0) + assert_almost_equal(dfgls.stat, -6.300927, DECIMAL_4) + dfgls.summary() + dfgls.regression.summary() + + def test_dfgls_auto(self): + dfgls = DFGLS(self.inflation, trend='ct', method='BIC', max_lags=3) + assert_equal(dfgls.lags, 2) + assert_almost_equal(dfgls.stat, -2.9035369, DECIMAL_4) + with assert_raises(ValueError): + dfgls.trend = 'nc' + + assert dfgls != 0.0 + + def test_ensure_1d(self): + assert_raises(ValueError, ensure_1d, *(ones((2, 2)),)) + + def test_negative_lag(self): + adf = ADF(self.inflation) + with assert_raises(ValueError): + adf.lags = -1 + + def test_invalid_determinstic(self): + adf = ADF(self.inflation) + with assert_raises(ValueError): + adf.trend = 'bad-value' + + def test_variance_ratio(self): + vr = VarianceRatio(self.inflation, debiased=False) + y = self.inflation + dy = np.diff(y) + mu = dy.mean() + dy2 = y[2:] - y[:-2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / (nq) + num = np.sum((dy2 - 2 * mu) ** 2.0) / (nq * 2) + ratio = num / denom + + assert_almost_equal(ratio, vr.vr) + + def test_variance_ratio_no_overlap(self): + vr = VarianceRatio(self.inflation, overlap=False) + + with warnings.catch_warnings(record=True) as w: + computed_value = vr.vr + assert_equal(len(w), 1) + + y = self.inflation + # Adjust due ot sample size + y = y[:-1] + dy = np.diff(y) + mu = dy.mean() + dy2 = y[2::2] - y[:-2:2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / nq + num = np.sum((dy2 - 2 * mu) ** 2.0) / nq + ratio = num / denom + assert_equal(ratio, computed_value) + + vr.overlap = True + assert_equal(vr.overlap, True) + vr2 = VarianceRatio(self.inflation) + assert_almost_equal(vr.stat, vr2.stat) + + def test_variance_ratio_non_robust(self): + vr = VarianceRatio(self.inflation, robust=False, debiased=False) + y = self.inflation + dy = np.diff(y) + mu = dy.mean() + dy2 = y[2:] - y[:-2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / nq + num = np.sum((dy2 - 2 * mu) ** 2.0) / (nq * 2) + ratio = num / denom + variance = 3.0 / 2.0 + stat = np.sqrt(nq) * (ratio - 1) / np.sqrt(variance) + assert_almost_equal(stat, vr.stat) + orig_stat = vr.stat + vr.robust = True + 
assert_equal(vr.robust, True) + assert vr.stat != orig_stat + + def test_variance_ratio_no_constant(self): + y = np.random.randn(100) + vr = VarianceRatio(y, trend='nc', debiased=False) + dy = np.diff(y) + mu = 0.0 + dy2 = y[2:] - y[:-2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / nq + num = np.sum((dy2 - 2 * mu) ** 2.0) / (nq * 2) + ratio = num / denom + assert_almost_equal(ratio, vr.vr) + assert_equal(vr.debiased, False) + + def test_variance_ratio_invalid_lags(self): + y = self.inflation + assert_raises(ValueError, VarianceRatio, y, lags=1) + + def test_variance_ratio_generic(self): + # TODO: Currently not a test, just makes sure code runs at all + y = np.random.randn(100, 1).cumsum() + vr = VarianceRatio(y) + print vr.summary() + + def test_base_class(self): + assert_raises(NotImplementedError, UnitRootTest, + self.inflation, 0, 'nc') + + +if __name__ == "__main__": + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x'], exit=False) + diff --git a/statsmodels/tsa/tsatools.py b/statsmodels/tsa/tsatools.py index ac22af0bd98..757316c4b8e 100644 --- a/statsmodels/tsa/tsatools.py +++ b/statsmodels/tsa/tsatools.py @@ -1,74 +1,123 @@ import numpy as np -import numpy.lib.recfunctions as nprf -from statsmodels.tools.tools import add_constant +from statsmodels.compatnp.py3k import string_types +import numpy as np +import pandas as pd + +from statsmodels.compatnp.py3k import string_types + +import pandas as pd from pandas.tseries import offsets from pandas.tseries.frequencies import to_offset -def add_trend(X, trend="c", prepend=False): +__all__ = ['lagmat', 'lagmat2ds', 'add_trend', 'duplication_matrix', + 'elimination_matrix', 'commutation_matrix', + 'vec', 'vech', 'unvec', 'unvech', 'reintegrate'] + + +class ColumnNameConflict(Warning): + pass + + +column_name_conflict_doc = """ +Some of the column named being added were not unique and have been renamed. + + {0} +""" + + +def _enforce_unique_col_name(existing, new): + converted_names = [] + unique_names = list(new[:]) + for i, n in enumerate(new): + if n in existing: + original_name = n + fixed_name = n + duplicate_count = 0 + while fixed_name in existing: + fixed_name = n + '_' + str(duplicate_count) + duplicate_count += 1 + unique_names[i] = fixed_name + converted_names.append('{0} -> {1}'.format(original_name, fixed_name)) + if converted_names: + import warnings + + ws = column_name_conflict_doc.format('\n '.join(converted_names)) + warnings.warn(ws, ColumnNameConflict) + + return unique_names + + +def add_trend(x=None, trend='c', prepend=False, nobs=None): """ Adds a trend and/or constant to an array. Parameters ---------- - X : array-like - Original array of data. + x : array-like or None + Original array of data. If None, then nobs must be a positive integer trend : str {"c","t","ct","ctt"} "c" add constant only "t" add trend only "ct" add constant and linear trend "ctt" add constant and linear and quadratic trend. prepend : bool - If True, prepends the new data to the columns of X. + If True, prepends the new data to the columns of x. + n : int, positive + Positive integer containing the length of the trend series. Only used + if x is none. Notes ----- - Returns columns as ["ctt","ct","c"] whenever applicable. There is currently + Returns columns as ["ctt","ct","c"] whenever applicable. There is no checking for an existing constant or trend. 
See also -------- - statsmodels.add_constant + statsmodels.tools.tools.add_constant """ #TODO: could be generalized for trend of aribitrary order trend = trend.lower() - if trend == "c": # handles structured arrays - return add_constant(X, prepend=prepend) + if trend == "c": # handles structured arrays + trend_order = 0 elif trend == "ct" or trend == "t": - trendorder = 1 + trend_order = 1 elif trend == "ctt": - trendorder = 2 + trend_order = 2 else: raise ValueError("trend %s not understood" % trend) - X = np.asanyarray(X) - nobs = len(X) - trendarr = np.vander(np.arange(1,nobs+1, dtype=float), trendorder+1) + if x is not None: + nobs = len(np.asanyarray(x)) + elif nobs <= 0: + raise ValueError("nobs must be a positive integer if x is None") + trend_array = np.vander(np.arange(1, nobs + 1, dtype=np.float64), + trend_order + 1) # put in order ctt - trendarr = np.fliplr(trendarr) + trend_array = np.fliplr(trend_array) if trend == "t": - trendarr = trendarr[:,1] - if not X.dtype.names: - if not prepend: - X = np.column_stack((X, trendarr)) + trend_array = trend_array[:, 1:] + if x is None: + return trend_array + if isinstance(x, pd.DataFrame): + columns = ('const', 'trend', 'quadratic_trend') + if trend == 't': + columns = (columns[1],) else: - X = np.column_stack((trendarr, X)) + columns = columns[0:trend_order + 1] + columns = _enforce_unique_col_name(x.columns, columns) + trend_array = pd.DataFrame(trend_array, index=x.index, columns=columns) + if prepend: + x = trend_array.join(x) + else: + x = x.join(trend_array) else: - return_rec = data.__clas__ is np.recarray - if trendorder == 1: - if trend == "ct": - dt = [('const',float),('trend',float)] - else: - dt = [('trend', float)] - elif trendorder == 2: - dt = [('const',float),('trend',float),('trend_squared', float)] - trendarr = trendarr.view(dt) if prepend: - X = nprf.append_fields(trendarr, X.dtype.names, [X[i] for i - in data.dtype.names], usemask=False, asrecarray=return_rec) + x = np.column_stack((trend_array, x)) else: - X = nprf.append_fields(X, trendarr.dtype.names, [trendarr[i] for i - in trendarr.dtype.names], usemask=false, asrecarray=return_rec) - return X + x = np.column_stack((x, trend_array)) + + return x + def add_lag(x, col=None, lags=1, drop=False, insert=True): """ @@ -80,10 +129,9 @@ def add_lag(x, col=None, lags=1, drop=False, insert=True): An array or NumPy ndarray subclass. Can be either a 1d or 2d array with observations in columns. col : 'string', int, or None - If data is a structured array or a recarray, `col` can be a string - that is the name of the column containing the variable. Or `col` can - be an int of the zero-based column index. If it's a 1d array `col` - can be None. + If data is a pandas DataFrame, `col` can be a string that is the name + of the column containing the variable. Or `col` can be an int of the + zero-based column index. If it's a 1d array `col` can be None. lags : int The number of lags desired. drop : bool @@ -111,134 +159,108 @@ def add_lag(x, col=None, lags=1, drop=False, insert=True): so that the length of the returned array is len(`X`) - lags. 
The lags are returned in increasing order, ie., t-1,t-2,...,t-lags """ - if x.dtype.names: - names = x.dtype.names - if not col and np.squeeze(x).ndim > 1: - raise IndexError, "col is None and the input array is not 1d" - elif len(names) == 1: - col = names[0] - if isinstance(col, int): - col = x.dtype.names[col] - contemp = x[col] - - # make names for lags - tmp_names = [col + '_'+'L(%i)' % i for i in range(1,lags+1)] - ndlags = lagmat(contemp, maxlag=lags, trim='Both') - - # get index for return - if insert is True: - ins_idx = list(names).index(col) + 1 - elif insert is False: - ins_idx = len(names) + 1 - else: # insert is an int - if insert > len(names): - raise Warning("insert > number of variables, inserting at the"+ - " last position") - ins_idx = insert - - first_names = list(names[:ins_idx]) - last_names = list(names[ins_idx:]) - - if drop: - if col in first_names: - first_names.pop(first_names.index(col)) - else: - last_names.pop(last_names.index(col)) - - if first_names: # only do this if x isn't "empty" - first_arr = nprf.append_fields(x[first_names][lags:],tmp_names, - ndlags.T, usemask=False) - else: - first_arr = np.zeros(len(x)-lags, dtype=zip(tmp_names, - (x[col].dtype,)*lags)) - for i,name in enumerate(tmp_names): - first_arr[name] = ndlags[:,i] - if last_names: - return nprf.append_fields(first_arr, last_names, - [x[name][lags:] for name in last_names], usemask=False) - else: # lags for last variable - return first_arr - - else: # we have an ndarray - - if x.ndim == 1: # make 2d if 1d - x = x[:,None] - if col is None: - col = 0 - - # handle negative index - if col < 0: - col = x.shape[1] + col - - contemp = x[:,col] - - if insert is True: - ins_idx = col + 1 - elif insert is False: - ins_idx = x.shape[1] + df = None + if isinstance(x, pd.DataFrame): + df = x + if isinstance(col, string_types): + col = list(df.columns).index(col) + + x = np.asarray(x) + if x.ndim == 1: # make 2d if 1d + x = x[:, None] + if col is None: + col = 0 + # handle negative index + if col < 0: + col = x.shape[1] + col + contemporary = x[:, col] + + if insert is True: + ins_idx = col + 1 + elif insert is False: + ins_idx = x.shape[1] + else: + if insert < 0: # handle negative index + insert = x.shape[1] + insert + 1 + if insert > x.shape[1]: + raise ValueError("insert greater than the number of variables") + ins_idx = insert + + ndlags = lagmat(contemporary, lags, trim='Both') + first_cols = range(ins_idx) + last_cols = range(ins_idx, x.shape[1]) + if drop: + if col in first_cols: + first_cols.pop(first_cols.index(col)) else: - if insert < 0: # handle negative index - insert = x.shape[1] + insert + 1 - if insert > x.shape[1]: - insert = x.shape[1] - raise Warning("insert > number of variables, inserting at the"+ - " last position") - ins_idx = insert - - ndlags = lagmat(contemp, lags, trim='Both') - first_cols = range(ins_idx) - last_cols = range(ins_idx,x.shape[1]) - if drop: - if col in first_cols: - first_cols.pop(first_cols.index(col)) - else: - last_cols.pop(last_cols.index(col)) - return np.column_stack((x[lags:,first_cols],ndlags, - x[lags:,last_cols])) + last_cols.pop(last_cols.index(col)) + out = np.column_stack((x[lags:, first_cols], ndlags, x[lags:, last_cols])) + if df is not None: + columns = list(df.columns) + index = df.index + # Create new column labels + lag_columns = [str(columns[col]) + '_L_' + str(i + 1) for i in + xrange(lags)] + out_columns = [columns[col_idx] for col_idx in first_cols] + out_columns.extend(lag_columns) + for col_idx in last_cols: + 
out_columns.append(columns[col_idx]) + # Alter index for correct length + index = index[lags:] + lag_columns = _enforce_unique_col_name(df.columns, lag_columns) + ndlags = pd.DataFrame(ndlags, columns=lag_columns, index=index) + df = df.join(ndlags) + out = df[out_columns].reindex(index) + + return out + def detrend(x, order=1, axis=0): - '''detrend an array with a trend of given order along axis 0 or 1 + """ + Detrend an array with a trend of given order along axis 0 or 1 Parameters ---------- x : array_like, 1d or 2d data, if 2d, then each row or column is independently detrended with the - same trendorder, but independent trend estimates + same trend order, but independent trend estimates order : int specifies the polynomial order of the trend, zero is constant, one is linear trend, two is quadratic trend axis : int - for detrending with order > 0, axis can be either 0 observations by rows, - or 1, observations by columns + for detrending with order > 0, axis can be either 0 observations by + rows, or 1, observations by columns Returns ------- - detrended data series : ndarray + y : array The detrended series is the residual of the linear regression of the data on the trend of given order. - - - ''' + """ x = np.asarray(x) - nobs = x.shape[0] + ndim = x.ndim if order == 0: - return x - np.expand_dims(x.mean(ax), x) - else: - if x.ndim == 2 and range(2)[axis]==1: - x = x.T - elif x.ndim > 2: - raise NotImplementedError('x.ndim>2 is not implemented until it is needed') - #could use a polynomial, but this should work also with 2d x, but maybe not yet - trends = np.vander(np.arange(nobs).astype(float), N=order+1) - beta = np.linalg.lstsq(trends, x)[0] - resid = x - np.dot(trends, beta) - if x.ndim == 2 and range(2)[axis]==1: - resid = resid.T - return resid - - -def lagmat(x, maxlag, trim='forward', original='ex'): - '''create 2d array of lags + return x - np.expand_dims(x.mean(axis), axis) + if ndim > 2: + raise NotImplementedError('x must be 1d or 2d') + if ndim == 2 and axis == 1: + x = x.T + elif ndim == 1: + x = x[:, None] + nobs = x.shape[0] + trends = np.vander(np.arange(nobs).astype(np.float64), N=order + 1) + beta = np.linalg.pinv(trends).dot(x) + resid = x - np.dot(trends, beta) + if x.ndim == 2 and axis == 1: + resid = resid.T + elif ndim == 1: + resid = resid.ravel() + + return resid + + +def lagmat(x, maxlag, trim='forward', original='ex', fill_value=np.nan): + """create 2d array of lags Parameters ---------- @@ -258,6 +280,8 @@ def lagmat(x, maxlag, trim='forward', original='ex'): * 'sep' : returns a tuple (original array, lagged values). The original array is truncated to have the same number of rows as the returned lagmat. + fill_value : float + the value to use for filling missing values. 
Default is np.nan Returns ------- @@ -293,22 +317,23 @@ def lagmat(x, maxlag, trim='forward', original='ex'): Notes ----- - TODO: - * allow list of lags additional to maxlag - * create varnames for columns - ''' + If trim is not 'both', new values are 0-filled + """ + # TODO: allow list of lags additional to maxlag + # TODO: create varnames for columns x = np.asarray(x) dropidx = 0 if x.ndim == 1: - x = x[:,None] + x = x[:, None] nobs, nvar = x.shape - if original in ['ex','sep']: + if original in ['ex', 'sep']: dropidx = nvar if maxlag >= nobs: raise ValueError("maxlag should be < nobs") - lm = np.zeros((nobs+maxlag, nvar*(maxlag+1))) - for k in range(0, int(maxlag+1)): - lm[maxlag-k:nobs+maxlag-k, nvar*(maxlag-k):nvar*(maxlag-k+1)] = x + lm = np.tile(fill_value, (nobs + maxlag, nvar * (maxlag + 1))) + for k in range(0, int(maxlag + 1)): + lm[maxlag - k:nobs + maxlag - k, + nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x if trim: trimlower = trim.lower() else: @@ -318,10 +343,10 @@ def lagmat(x, maxlag, trim='forward', original='ex'): stopobs = len(lm) elif trimlower == 'forward': startobs = 0 - stopobs = nobs+maxlag-k + stopobs = nobs + maxlag - k elif trimlower == 'both': startobs = maxlag - stopobs = nobs+maxlag-k + stopobs = nobs + maxlag - k elif trimlower == 'backward': startobs = maxlag stopobs = len(lm) @@ -329,12 +354,13 @@ def lagmat(x, maxlag, trim='forward', original='ex'): else: raise ValueError('trim option not valid') if original == 'sep': - return lm[startobs:stopobs,dropidx:], x[startobs:stopobs] + return lm[startobs:stopobs, dropidx:], x[startobs:stopobs] else: - return lm[startobs:stopobs,dropidx:] + return lm[startobs:stopobs, dropidx:] + def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward'): - '''generate lagmatrix for 2d array, columns arranged by variables + """Generate lagmatrix for 2d array, columns arranged by variables Parameters ---------- @@ -343,7 +369,7 @@ def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward'): maxlag0 : int for first variable all lags from zero to maxlag are included maxlagex : None or int - max lag for all other variables all lags from zero to maxlag are included + max lag for all other variables; all lags from zero to maxlag are included dropex : int (default is 0) exclude first dropex lags from other variables for all variables, except the first, lags from dropex to maxlagex are @@ -362,42 +388,50 @@ def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward'): Notes ----- very inefficient for unequal lags, just done for convenience - ''' + """ if maxlagex is None: maxlagex = maxlag0 maxlag = max(maxlag0, maxlagex) nobs, nvar = x.shape - lagsli = [lagmat(x[:,0], maxlag, trim=trim, original='in')[:,:maxlag0+1]] - for k in range(1,nvar): - lagsli.append(lagmat(x[:,k], maxlag, trim=trim, original='in')[:,dropex:maxlagex+1]) + lagsli = [ + lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]] + for k in range(1, nvar): + lagsli.append(lagmat(x[:, k], maxlag, trim=trim, original='in')[:, + dropex:maxlagex + 1]) return np.column_stack(lagsli) + def vec(mat): return mat.ravel('F') + def vech(mat): # Gets Fortran-order return mat.T.take(_triu_indices(len(mat))) -# tril/triu/diag, suitable for ndarray.take +# tril/triu/diag, suitable for ndarray.take def _tril_indices(n): rows, cols = np.tril_indices(n) return rows * n + cols + def _triu_indices(n): rows, cols = np.triu_indices(n) return rows * n + cols + def _diag_indices(n): rows, cols = np.diag_indices(n) return rows * n + cols + def unvec(v): k = 
int(np.sqrt(len(v))) - assert(k * k == len(v)) + assert (k * k == len(v)) return v.reshape((k, k), order='F') + def unvech(v): # quadratic formula, correct fp error rows = .5 * (-1 + np.sqrt(1 + 8 * len(v))) @@ -412,18 +446,26 @@ def unvech(v): return result + def duplication_matrix(n): """ Create duplication matrix D_n which satisfies vec(S) = D_n vech(S) for symmetric matrix S + Parameters + ---------- + n : int + The length of vech(S) + Returns ------- - D_n : ndarray + D_n : array + The duplication array """ tmp = np.eye(n * (n + 1) / 2) return np.array([unvech(x).ravel() for x in tmp]).T + def elimination_matrix(n): """ Create the elimination matrix L_n which satisfies vech(M) = L_n vec(M) for @@ -431,31 +473,36 @@ def elimination_matrix(n): Parameters ---------- + n : int + The length of vec(M) Returns ------- + L : array + The commutation matrix """ vech_indices = vec(np.tril(np.ones((n, n)))) return np.eye(n * n)[vech_indices != 0] + def commutation_matrix(p, q): """ Create the commutation matrix K_{p,q} satisfying vec(A') = K_{p,q} vec(A) Parameters ---------- - p : int - q : int + p, q : int Returns ------- - K : ndarray (pq x pq) + K : ndarray, (pq, pq) """ K = np.eye(p * q) indices = np.arange(p * q).reshape((p, q), order='F') return K.take(indices.ravel(), axis=0) + def _ar_transparams(params): """ Transforms params to induce stationarity/invertability. @@ -469,16 +516,15 @@ def _ar_transparams(params): --------- Jones(1980) """ - newparams = ((1-np.exp(-params))/ - (1+np.exp(-params))).copy() - tmp = ((1-np.exp(-params))/ - (1+np.exp(-params))).copy() - for j in range(1,len(params)): - a = newparams[j] - for kiter in range(j): - tmp[kiter] -= a * newparams[j-kiter-1] - newparams[:j] = tmp[:j] - return newparams + trans_params = ((1 - np.exp(-params)) / (1 + np.exp(-params))) + tmp = trans_params.copy() + for j in range(1, len(params)): + a = trans_params[j] + for k in range(j): + tmp[k] -= a * trans_params[j - k - 1] + trans_params[:j] = tmp[:j] + return trans_params + def _ar_invtransparams(params): """ @@ -489,16 +535,17 @@ def _ar_invtransparams(params): params : array The transformed AR coefficients """ - # AR coeffs + # AR coefficients tmp = params.copy() - for j in range(len(params)-1,0,-1): + for j in range(len(params) - 1, 0, -1): a = params[j] - for kiter in range(j): - tmp[kiter] = (params[kiter] + a * params[j-kiter-1])/\ - (1-a**2) + for k in range(j): + tmp[k] = (params[k] + a * params[j - k - 1]) / \ + (1 - a ** 2) params[:j] = tmp[:j] - invarcoefs = -np.log((1-params)/(1+params)) - return invarcoefs + inv_ar_coefs = -np.log((1 - params) / (1 + params)) + return inv_ar_coefs + def _ma_transparams(params): """ @@ -507,22 +554,23 @@ def _ma_transparams(params): Parameters ---------- params : array - The ma coeffecients of an (AR)MA model. + The MA coeffecients of an (AR)MA model. 
Reference --------- Jones(1980) """ - newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy() - tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy() + trans_params = ((1 - np.exp(-params)) / (1 + np.exp(-params))).copy() + tmp = trans_params.copy() # levinson-durbin to get macf - for j in range(1,len(params)): - b = newparams[j] - for kiter in range(j): - tmp[kiter] += b * newparams[j-kiter-1] - newparams[:j] = tmp[:j] - return newparams + for j in range(1, len(params)): + b = trans_params[j] + for k in range(j): + tmp[k] += b * trans_params[j - k - 1] + trans_params[:j] = tmp[:j] + return trans_params + def _ma_invtransparams(macoefs): """ @@ -534,15 +582,17 @@ def _ma_invtransparams(macoefs): The transformed MA coefficients """ tmp = macoefs.copy() - for j in range(len(macoefs)-1,0,-1): + for j in range(len(macoefs) - 1, 0, -1): b = macoefs[j] - for kiter in range(j): - tmp[kiter] = (macoefs[kiter]-b *macoefs[j-kiter-1])/(1-b**2) + scale = (1 - b ** 2) + for k in range(j): + tmp[k] = (macoefs[k] - b * macoefs[j - k - 1]) / scale macoefs[:j] = tmp[:j] - invmacoefs = -np.log((1-macoefs)/(1+macoefs)) - return invmacoefs + inv_ma_coefs = -np.log((1 - macoefs) / (1 + macoefs)) + return inv_ma_coefs -def unintegrate(x, levels): + +def reintegrate(x, levels): """ After taking n-differences of a series, return the original series @@ -551,25 +601,25 @@ def unintegrate(x, levels): x : array-like The n-th differenced series levels : list - A list of the first-value in each differenced series, for + A list of the initial-value in each differenced series, for [first-difference, second-difference, ..., n-th difference] Returns ------- y : array-like - The original series de-differenced + The original series, reintegrated Examples -------- >>> x = np.array([1, 3, 9., 19, 8.]) >>> levels = [x[0], np.diff(x, 1)[0]] - >>> _unintegrate(np.diff(x, 2), levels) + >>> reintegrate(np.diff(x, 2), levels) array([ 1., 3., 9., 19., 8.]) """ - levels = levels[:] # copy + levels = levels[:] # copy if len(levels) > 1: x0 = levels.pop(-1) - return _unintegrate(np.cumsum(np.r_[x0, x]), levels) + return reintegrate(np.cumsum(np.r_[x0, x]), levels) x0 = levels[0] return np.cumsum(np.r_[x0, x]) @@ -609,13 +659,5 @@ def freq_to_period(freq): "think this in error.".format(freq)) -__all__ = ['lagmat', 'lagmat2ds','add_trend', 'duplication_matrix', - 'elimination_matrix', 'commutation_matrix', - 'vec', 'vech', 'unvec', 'unvech'] - if __name__ == '__main__': - # sanity check, mainly for imports - x = np.random.normal(size=(100,2)) - tmp = lagmat(x,2) - tmp = lagmat2ds(x,2) -# grangercausalitytests(x, 2) + pass diff --git a/statsmodels/tsa/unitroot.py b/statsmodels/tsa/unitroot.py new file mode 100644 index 00000000000..3889fde09c4 --- /dev/null +++ b/statsmodels/tsa/unitroot.py @@ -0,0 +1,1297 @@ +from __future__ import division + +import warnings + +from numpy import (diff, ceil, power, squeeze, sqrt, sum, cumsum, int32, int64, + interp, abs, log, arange, sort) +from numpy.linalg import pinv +from scipy.stats import norm +from numpy import polyval + +from statsmodels.regression.linear_model import OLS +from statsmodels.tsa.tsatools import lagmat, add_trend +from statsmodels.tsa.stattools import _autolag, cov_nw +from statsmodels.tsa.critical_values.dickey_fuller import * +from statsmodels.tsa.critical_values.kpss import * +from statsmodels.tsa.critical_values.dfgls import * +from statsmodels.iolib.summary import Summary +from statsmodels.iolib.table import SimpleTable + + +class InvalidLengthWarning(Warning): + pass + + 
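Given the public names exported in __all__ above, a brief sketch of how the function-style adfuller and the class-based tests are meant to be called side by side; this is an illustrative example only (the random-walk input is made up) and relies on the signatures exercised in the test suite:

    import numpy as np
    from statsmodels.tsa.unitroot import adfuller, ADF, KPSS

    y = np.random.randn(250).cumsum()            # a random walk, so the ADF null should not be rejected
    stat, pvalue, usedlag, nobs, crit = adfuller(y, regression='c', maxlag=4, autolag=None)
    adf = ADF(y, trend='c')                       # the same test through the class interface
    kpss = KPSS(y, trend='c')                     # KPSS takes stationarity as its null instead
    print stat, pvalue, adf.stat, kpss.stat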
+invalid_length_doc = """ +The length of {var} is not an exact multiple of {block}, and so the final {drop} +observations have been dropped. +""" + +deprication_doc = """ +{old_func} has been depricated. Please use {new_func}. +""" + +__all__ = ['ADF', 'DFGLS', 'PhillipsPerron', 'KPSS', 'VarianceRatio', + 'adfuller', 'kpss_crit', 'mackinnoncrit', 'mackinnonp'] + +TREND_MAP = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'} + +TREND_DESCRIPTION = {'nc': 'No Trend', + 'c': 'Constant', + 'ct': 'Constant and Linear Time Trend', + 'ctt': 'Constant, Linear and Quadratic Time Trends'} + + +def _df_select_lags(y, trend, max_lags, method): + """ + Helper method to determine the best lag length in DF-like regressions + + Parameters + ---------- + y : array-like, (nobs,) + The data for the lag selection exercise + trend : str, {'nc','c','ct','ctt'} + The trend order + max_lags : int + The maximum number of lags to check. This setting affects all + estimation since the sample is adjusted by max_lags when + fitting the models + method : str, {'AIC','BIC','t-stat'} + The method to use when estimating the model + + Returns + ------- + best_ic : float + The information criteria at the selected lag + best_lag : int + The selected lag + all_res : list + List of OLS results from fitting max_lag + 1 models + + Notes + ----- + See statsmodels.tsa.tsatools._autolag for details + """ + nobs = y.shape[0] + delta_y = diff(y) + + if max_lags is None: + max_lags = int(ceil(12. * power(nobs / 100., 1 / 4.))) + + rhs = lagmat(delta_y[:, None], max_lags, trim='both', original='in') + nobs = rhs.shape[0] # pylint: disable=E1103 + rhs[:, 0] = y[-nobs - 1:-1] # replace 0 xdiff with level of x + lhs = delta_y[-nobs:] + + if trend != 'nc': + full_rhs = add_trend(rhs, trend, prepend=True) + else: + full_rhs = rhs + + start_lag = full_rhs.shape[1] - rhs.shape[1] + 1 + # TODO: Remove all_res after adfuller deprication + ic_best, best_lag, all_res = _autolag(OLS, lhs, full_rhs, start_lag, + max_lags, + method, regresults=True) + # To get the correct number of lags, subtract the start_lag since + # lags 0,1,...,start_lag-1 were not actual lags, but other variables + best_lag -= start_lag + return ic_best, best_lag, all_res + + +def _estimate_df_regression(y, trend, lags): + """Helper function that estimates the core (A)DF regression + + Parameters + ---------- + y : array-like, (nobs,) + The data for the lag selection + trend : str, {'nc','c','ct','ctt'} + The trend order + lags : int + The number of lags to include in the ADF regression + + Returns + ------- + ols_res : OLSResults + A results class object produced by OLS.fit() + + Notes + ----- + See statsmodels.regression.linear_model.OLS for details on the + value returned + """ + delta_y = diff(y) + + rhs = lagmat(delta_y[:, None], lags, trim='both', original='in') + nobs = rhs.shape[0] + lhs = rhs[:, 0].copy() # lag-0 values are lhs, Is copy() necessary? + rhs[:, 0] = y[-nobs - 1:-1] # replace lag 0 with level of y + + if trend != 'nc': + rhs = add_trend(rhs[:, :lags + 1], trend) + + return OLS(lhs, rhs).fit() + + +def ensure_1d(y): + """Returns a 1d array if the input is squeezable to 1d. 
Otherwise raises + and error + + Parameters + ---------- + y : array-like + The array to squeeze to 1d, or raise an error if not compatible + + Returns + ------- + y_1d : array + A 1d version of the input, returned as an array + """ + y = squeeze(asarray(y)) + if y.ndim != 1: + raise ValueError('Input must be 1d or squeezable to 1d.') + return y + + +class UnitRootTest(object): + """Base class to be used for inheritance in unit root tests""" + + def __init__(self, y, lags, trend): + self._y = ensure_1d(y) + self._delta_y = diff(y) + self._nobs = self._y.shape[0] + self._lags = None + self.lags = lags + self._trend = None + self.trend = trend + self._stat = None + self._critical_values = None + self._pvalue = None + self.trend = trend + self._h0 = 'The process contains a unit root.' + self._h1 = 'The process is weakly stationary.' + self._test_name = None + self._title = None + self._summary_text = None + + def _compute_statistic(self): + raise NotImplementedError("Subclass must implement") + + def _reset(self): + """Resets the unit root test so that it will be recomputed""" + self._stat = None + + def _compute_if_needed(self): + """Checks if statistic needs to be computed""" + if self._stat is None: + self._compute_statistic() + + @property + def h0(self): + """The null hypothesis""" + return self._h0 + + @property + def hA(self): + """The alternative hypothesis""" + return self._h1 + + @property + def nobs(self): + """The number of observations used in the test statistic. + Account for loss of data due to lags.""" + return self._nobs + + @property + def valid_trends(self): + """List of valid trend terms. Must be overridden""" + raise NotImplementedError("Subclass must implement") + + @property + def pvalue(self): + """Returns the p-value for the test statistic""" + self._compute_if_needed() + return self._pvalue + + @property + def stat(self): + """The test statistic for a unit root""" + self._compute_if_needed() + return self._stat + + @property + def critical_values(self): + """Dictionary containing critical values""" + self._compute_if_needed() + return self._critical_values + + def summary(self): + """Summary of test, containing statistic, p-value and critical values + """ + data = [('Test Statistic', '{0:0.3f}'.format(self.stat)), + ('P-value', '{0:0.3f}'.format(self.pvalue)), + ('Lags', '{0:d}'.format(self.lags))] + width = 15 + formatted_text = '{0:<' + str(width) + '} {1:>' + str(width) + '}' + table_data = [] + for label, val in data: + table_data.append([formatted_text.format(label, val)]) + title = self._title + if not title: + title = self._test_name + " Results" + table = SimpleTable(table_data, stubs=None, title=title) + + smry = Summary() + smry.tables.append(table) + + cv_string = 'Critical Values: ' + cv = self._critical_values.keys() + g = lambda x: float(x.split('%')[0]) + cv_numeric = array(map(g, cv)) + cv_numeric = sort(cv_numeric) + for val in cv_numeric: + p = str(int(val)) + '%' + cv_string += '{0:0.2f}'.format(self._critical_values[p]) + cv_string += ' (' + p + ')' + if val != cv_numeric[-1]: + cv_string += ', ' + + extra_text = ['Trend: ' + TREND_DESCRIPTION[self._trend], + cv_string, + 'Null Hypothesis: ' + self.h0, + 'Alternative Hypothesis: ' + self.hA] + + smry.add_extra_txt(extra_text) + if self._summary_text: + smry.add_extra_txt(self._summary_text) + return smry + + @property + def lags(self): + """Sets or gets the number of lags used in the model. + WHen tests use DF-type regressions, lags is the number of lags in the + regression model. 
When tests use long-run variance estimators, lags + is the number of lags used in the long-run variance estimator.""" + self._compute_if_needed() + return self._lags + + @lags.setter + def lags(self, value): + types = (int, long, int32, int64) + if value is not None and not isinstance(value, types) or \ + (isinstance(value, types) and value < 0): + raise ValueError('lags must be a non-negative integer or None') + if self._lags != value: + self._reset() + self._lags = value + + @property + def y(self): + """Returns the data used in the test""" + return self._y + + @property + def trend(self): + """Sets or gets the trend term""" + return self._trend + + @trend.setter + def trend(self, value): + if value not in self.valid_trends: + raise ValueError('trend not understood') + if self._trend != value: + self._reset() + self._trend = value + + +class ADF(UnitRootTest): + """Augmented Dickey Fuller test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a unit root + lags : int, non-negative, optional + The number of lags to use in the ADF regression. If omitted or None, + `method` is used to automatically select the lag length with no more + than `max_lags` are included. + trend : str, {'nc', 'c', 'ct', 'ctt'}, optional + The trend component to include in the ADF test + 'nc' - No trend components + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + 'ctt' - Include a constant and linear and quadratic time trends + max_lags : int, non-negative, optional + The maximum number of lags to use when selecting lag length + method : str, {'AIC', 'BIC', 't-stat'}, optional + The method to use when selecting the lag length + 'AIC' - Select the minimum of the Akaike IC + 'BIC' - Select the minimum of the Schwarz/Bayesian IC + 't-stat' - Select the minimum of the Schwarz/Bayesian IC + + Attributes + ---------- + stat + pvalues + critical_values + h0 + hA + summary + regression + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the Augmented Dickey-Fuller is that there is a unit + root, with the alternative that there is no unit root. If the pvalue is + above a critical size, then the null cannot be rejected that there + and the series appears to be a unit root. + + The p-values are obtained through regression surface approximation from + MacKinnon (1994) using the updated 2010 tables. + If the p-value is close to significant, then the critical values should be + used to judge whether to reject the null. + + The autolag option and maxlag for it are described in Greene. + + Examples + -------- + see example script + + References + ---------- + Greene + + Hamilton, J. D. 1994. Time Series Analysis. Princeton: Princeton + University Press. + + P-Values (regression surface approximation) + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. `Journal of Business and Economic + Statistics` 12, 167-76. + + Critical values + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's + University, Dept of Economics, Working Papers. 
Available at + http://ideas.repec.org/p/qed/wpaper/1227.html + """ + + def __init__(self, y, lags=None, trend='c', + max_lags=None, method='AIC'): + super(ADF, self).__init__(y, lags, trend) + self._max_lags = max_lags + self._method = method + self._test_name = 'Augmented Dickey-Fuller' + self._regression = None + # TODO: Remove when adfuller is depricated + self._ic_best = None # For compat with adfuller + self._autolag_results = None # For compat with adfuller + + def _select_lag(self): + ic_best, best_lag, all_res = _df_select_lags(self._y, + self._trend, + self._max_lags, + self._method) + # TODO: Remove when adfuller is depricated + self._autolag_results = all_res + self._ic_best = ic_best + self._lags = best_lag + + def _compute_statistic(self): + if self._lags is None: + self._select_lag() + y, trend, lags = self._y, self._trend, self._lags + resols = _estimate_df_regression(y, trend, lags) + self._regression = resols + self._stat = stat = resols.tvalues[0] + self._nobs = int(resols.nobs) + self._pvalue = mackinnonp(stat, regression=trend, + num_unit_roots=1) + critical_values = mackinnoncrit(num_unit_roots=1, + regression=trend, + nobs=resols.nobs) + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'nc', 'c', 'ct', 'ctt' + + @property + def regression(self): + """Returns the OLS regression results from the ADF model estimated""" + self._compute_if_needed() + return self._regression + + +class DFGLS(UnitRootTest): + """Elliott, Rothenberg and Stock's GLS version of the Dickey-Fuller test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a unit root + lags : int, non-negative, optional + The number of lags to use in the ADF regression. If omitted or None, + `method` is used to automatically select the lag length with no more + than `max_lags` are included. + trend : str, {'c', 'ct'}, optional + The trend component to include in the ADF test + 'nc' - No trend components + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + 'ctt' - Include a constant and linear and quadratic time trends + max_lags : int, non-negative, optional + The maximum number of lags to use when selecting lag length + method : str, {'AIC', 'BIC', 't-stat'}, optional + The method to use when selecting the lag length + 'AIC' - Select the minimum of the Akaike IC + 'BIC' - Select the minimum of the Schwarz/Bayesian IC + 't-stat' - Select the minimum of the Schwarz/Bayesian IC + + Attributes + ---------- + stat + pvalues + critical_values + h0 + hA + summary + regression + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the Dickey-Fuller GLS is that there is a unit + root, with the alternative that there is no unit root. If the pvalue is + above a critical size, then the null cannot be rejected that there + and the series appears to be a unit root. + + DFGLS differs from the ADF test in that an initial GLS detrending step + is used before a trend-less ADF regression is run. + + Critical values and p-values when trend is 'c' are identical to + the ADF. When trend is set to 'ct, they are from ... + + Examples + -------- + see example script + + References + ---------- + Elliott, G. R., T. J. Rothenberg, and J. H. Stock. 1996. Efficient tests + for an autoregressive unit root. 
Econometrica 64: 813-836 + """ + + def __init__(self, y, lags=None, trend='c', + max_lags=None, method='AIC'): + super(DFGLS, self).__init__(y, lags, trend) + self._max_lags = max_lags + self._method = method + self._regression = None + self._test_name = 'Dickey-Fuller GLS' + if trend == 'c': + self._c = -7.0 + else: + self._c = -13.5 + + def _compute_statistic(self): + """Core routine to estimate DF-GLS test statistic""" + # 1. GLS detrend + trend, c = self._trend, self._c + + nobs = self._y.shape[0] + ct = c / nobs + z = add_trend(nobs=nobs, trend=trend) + + delta_z = z.copy() + delta_z[1:, :] = delta_z[1:, :] - (1 + ct) * delta_z[:-1, :] + delta_y = self._y.copy()[:, None] + delta_y[1:] = delta_y[1:] - (1 + ct) * delta_y[:-1] + detrend_coef = pinv(delta_z).dot(delta_y) + y = self._y + y_detrended = y - z.dot(detrend_coef).ravel() + + # 2. determine lag length, if needed + if self._lags is None: + max_lags, method = self._max_lags, self._method + icbest, bestlag, all_res = _df_select_lags(y_detrended, 'nc', + max_lags, method) + self._lags = bestlag + + # 3. Run Regression + lags = self._lags + + resols = _estimate_df_regression(y_detrended, + lags=lags, + trend='nc') + self._regression = resols + self._nobs = int(resols.nobs) + self._stat = resols.tvalues[0] + self._pvalue = mackinnonp(self._stat, + regression=trend, + dist_type='DFGLS') + critical_values = mackinnoncrit(regression=trend, + nobs=self._nobs, + dist_type='DFGLS') + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'c', 'ct' + + @UnitRootTest.trend.setter + def trend(self, value): + if value not in self.valid_trends: + raise ValueError('trend not understood') + if self._trend != value: + self._reset() + self._trend = value + if value == 'c': + self._c = -7.0 + else: + self._c = -13.5 + + @property + def regression(self): + """Returns the OLS regression results from the ADF model estimated""" + self._compute_if_needed() + return self._regression + + +class PhillipsPerron(UnitRootTest): + """Phillips-Perron unit root test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a unit root + lags : int, non-negative, optional + The number of lags to use in the Newey-West estimator of the long-run + covariance. If omitted or None, the lag length is set automatically to + 12 * (nobs/100) ** (1/4) + trend : str, {'nc', 'c', 'ct'}, optional + The trend component to include in the ADF test + 'nc' - No trend components + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + test_type : str, {'tau', 'rho'} + The test to use when computing the test statistic. 'tau' is based on + the t-stat and 'rho' uses a test based on nobs times the re-centered + regression coefficient + + Attributes + ---------- + stat + pvalues + critical_values + test_type + h0 + hA + summary + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the Phillips-Perron (PP) test is that there is a + unit root, with the alternative that there is no unit root. If the pvalue + is above a critical size, then the null cannot be rejected that there + and the series appears to be a unit root. + + Unlike the ADF test, the regression estimated includes only one lag of + the dependant variable, in addition to trend terms. Any serial + correlation in the regression errors is accounted for using a long-run + variance estimator (currently Newey-West). 
+ + The p-values are obtained through regression surface approximation from + MacKinnon (1994) using the updated 2010 tables. + If the p-value is close to significant, then the critical values should be + used to judge whether to reject the null. + + Examples + -------- + see example script + + References + ---------- + Hamilton, J. D. 1994. Time Series Analysis. Princeton: Princeton + University Press. + + Newey, W. K., and K. D. West. 1987. A simple, positive semidefinite, + heteroskedasticity and autocorrelation consistent covariance matrix. + Econometrica 55, 703-708. + + Phillips, P. C. B., and P. Perron. 1988. Testing for a unit root in + time series regression. Biometrika 75, 335-346. + + P-Values (regression surface approximation) + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. `Journal of Business and Economic + Statistics` 12, 167-76. + + Critical values + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's + University, Dept of Economics, Working Papers. Available at + http://ideas.repec.org/p/qed/wpaper/1227.html + """ + + def __init__(self, y, lags=None, trend='c', test_type='tau'): + super(PhillipsPerron, self).__init__(y, lags, trend) + self._test_type = test_type + self._stat_rho = None + self._stat_tau = None + self._test_name = 'Phillips-Perron' + self._lags = lags + + def _compute_statistic(self): + """Core routine to estimate PP test statistics""" + # 1. Estimate Regression + y, trend = self._y, self._trend + nobs = y.shape[0] + + if self._lags is None: + self._lags = int(ceil(12. * power(nobs / 100., 1 / 4.))) + lags = self._lags + + rhs = y[:-1, None] + lhs = y[1:, None] + if trend != 'nc': + rhs = add_trend(rhs, trend) + + resols = OLS(lhs, rhs).fit() + k = rhs.shape[1] + n, u = resols.nobs, resols.resid + lam2 = cov_nw(u, lags, demean=False) + lam = sqrt(lam2) + # 2. Compute components + s2 = u.dot(u) / (n - k) + s = sqrt(s2) + gamma0 = s2 * (n - k) / n + sigma = resols.bse[0] + sigma2 = sigma ** 2.0 + rho = resols.params[0] + # 3. Compute statistics + self._stat_tau = sqrt(gamma0 / lam2) * ((rho - 1) / sigma) \ + - 0.5 * ((lam2 - gamma0) / lam) * (n * sigma / s) + self._stat_rho = n * (rho - 1) \ + - 0.5 * (n ** 2.0 * sigma2 / s2) * (lam2 - gamma0) + + self._nobs = int(resols.nobs) + if self._test_type == 'rho': + self._stat = self._stat_rho + dist_type = 'ADF-z' + else: + self._stat = self._stat_tau + dist_type = 'ADF-t' + + self._pvalue = mackinnonp(self._stat, + regression=trend, + dist_type=dist_type) + critical_values = mackinnoncrit(regression=trend, + nobs=n, + dist_type=dist_type) + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + self._title = self._test_name + ', Test Type Z-' + self._test_type + + @property + def test_type(self): + """Gets or sets the test type returned by stat. 
+        Valid values are 'tau' or 'rho'."""
+        return self._test_type
+
+    @test_type.setter
+    def test_type(self, value):
+        if value not in ('rho', 'tau'):
+            raise ValueError("test_type must be either 'rho' or 'tau'.")
+        self._reset()
+        self._test_type = value
+
+    @property
+    def valid_trends(self):
+        """Returns the list of valid trend terms"""
+        return 'nc', 'c', 'ct'
+
+
+class KPSS(UnitRootTest):
+    """Kwiatkowski, Phillips, Schmidt and Shin (1992, KPSS) stationarity test
+
+    Parameters
+    ----------
+    y : array-like, (nobs,)
+        The data to test for stationarity
+    lags : int, non-negative, optional
+        The number of lags to use in the Newey-West estimator of the long-run
+        covariance. If omitted or None, the lag length is set automatically to
+        12 * (nobs/100) ** (1/4)
+    trend : str, {'c', 'ct'}, optional
+        The trend component to include in the KPSS regression
+        'c' - Include a constant (Default)
+        'ct' - Include a constant and linear time trend
+
+    Attributes
+    ----------
+    stat
+    pvalue
+    critical_values
+    h0
+    hA
+    summary
+    valid_trends
+    y
+    trend
+    lags
+
+    Notes
+    -----
+    The null hypothesis of the KPSS test is that the series is weakly
+    stationary and the alternative is that it is non-stationary. If the
+    p-value is above a critical size, then the null cannot be rejected and
+    the series appears to be stationary.
+
+    The p-values and critical values were computed using an extensive
+    simulation based on 100,000,000 replications using series with 2,000
+    observations.
+
+    Examples
+    --------
+    see example script
+
+    References
+    ----------
+    Kwiatkowski, D.; Phillips, P. C. B.; Schmidt, P.; Shin, Y. (1992).
+    "Testing the null hypothesis of stationarity against the alternative of
+    a unit root". Journal of Econometrics 54 (1-3), 159-178
+    """
+
+    def __init__(self, y, lags=None, trend='c'):
+        super(KPSS, self).__init__(y, lags, trend)
+        self._test_name = 'KPSS Stationarity Test'
+        self._h0 = 'The process is weakly stationary.'
+        self._h1 = 'The process contains a unit root.'
+
+    def _compute_statistic(self):
+        # 1. Estimate model with trend
+        nobs, y, trend = self._nobs, self._y, self._trend
+        z = add_trend(nobs=nobs, trend=trend)
+        res = OLS(y, z).fit()
+        # 2. Compute KPSS test
+        u = res.resid
+        if self._lags is None:
+            self._lags = int(ceil(12. * power(nobs / 100., 1 / 4.)))
+        lam = cov_nw(u, self._lags, demean=False)
+        s = cumsum(u)
+        self._stat = 1 / (nobs ** 2.0) * sum(s ** 2.0) / lam
+        self._nobs = u.shape[0]
+        self._pvalue, critical_values = kpss_crit(self._stat, trend)
+        self._critical_values = {"1%": critical_values[0],
+                                 "5%": critical_values[1],
+                                 "10%": critical_values[2]}
+
+    @property
+    def valid_trends(self):
+        """Returns the list of valid trend terms"""
+        return 'c', 'ct'
+
+
+class VarianceRatio(UnitRootTest):
+    """
+    Variance Ratio test of a random walk.
+
+    Parameters
+    ----------
+    y : array-like, (nobs,)
+        The data to test for a random walk
+    lags : int, >=2
+        The number of periods used in the multi-period variance, which is
+        the numerator of the test statistic. Must be at least 2
+    trend : str, {'nc', 'c'}, optional
+        'c' allows for a non-zero drift in the random walk, while 'nc'
+        requires that the increments to y are mean 0
+    overlap : bool, optional
+        Indicates whether to use all overlapping blocks. Default is True. If
+        False, the number of observations in y minus 1 must be an exact
+        multiple of lags. If this condition is not satisfied, some values at
+        the end of y will be discarded.
+ robust : bool, optional + Indicates whether to use heteroskedasticity robust inference. Default is + True. + debiased : bool, optional + Indicates whether to use a debiased version of the test. Default is + True. Only applicable if overlap is True. + + Attributes + ---------- + stat + pvalues + critical_values + test_type + h0 + hA + summary + valid_trends + y + trend + lags + overlap + robust + debiased + + Notes + ----- + The null hypothesis of a VR is that the process is a random walk, possibly + plus drift. Rejection of the null with a positive test statistic indicates + the presence of positive serial correlation in the time series. + + Examples + -------- + see example script + + References + ---------- + Campbell, John Y., Lo, Andrew W. and MacKinlay, A. Craig. (1997) The + Econometrics of Financial Markets. Princeton, NJ: Princeton University + Press. + + """ + + def __init__(self, y, lags=2, trend='c', debiased=True, + robust=True, overlap=True): + if lags < 2: + raise ValueError('lags must be an integer larger than 2') + super(VarianceRatio, self).__init__(y, lags, trend) + self._test_name = 'Variance-Ratio test' + self._h0 = 'The process is a random walk.' + self._h1 = 'The process is not a random walk.' + self._robust = robust + self._debiased = debiased + self._overlap = overlap + self._vr = None + self._stat_variance = None + quantiles = array([.01, .05, .1, .9, .95, .99]) + self._critical_values = {} + for q, cv in zip(quantiles, norm.ppf(quantiles)): + self._critical_values[str(int(100 * q)) + '%'] = cv + + @property + def vr(self): + """The ratio of the long block lags-period variance + to the 1-period variance""" + self._compute_if_needed() + return self._vr + + @property + def overlap(self): + """Sets of gets the indicator to use overlaping returns in the + long-period vairance estimator""" + return self._overlap + + @overlap.setter + def overlap(self, value): + self._reset() + self._overlap = bool(value) + + @property + def robust(self): + """Sets of gets the indicator to use a heteroskedasticity robust + variance estimator """ + return self._robust + + @robust.setter + def robust(self, value): + self._reset() + self._robust = bool(value) + + @property + def debiased(self): + """Sets of gets the indicator to use debiased variances in the ratio""" + return self._debiased + + @debiased.setter + def debiased(self, value): + self._reset() + self._debiased = bool(value) + + def _compute_statistic(self): + overlap, debiased, robust = self._overlap, self._debiased, self._robust + y, nobs, q, trend = self._y, self._nobs, self._lags, self._trend + + nq = nobs - 1 + if not overlap: + # Check length of y + if nq % q != 0: + extra = nq % q + y = y[:-extra] + warnings.warn(invalid_length_doc.format(var='y', + block=q, + drop=extra), + InvalidLengthWarning) + + nobs = y.shape[0] + if trend == 'nc': + mu = 0 + else: + mu = (y[-1] - y[0]) / (nobs - 1) + + delta_y = diff(y) + nq = delta_y.shape[0] + sigma2_1 = sum((delta_y - mu) ** 2.0) / nq + + if not overlap: + delta_y_q = y[q::q] - y[0:-q:q] + sigma2_q = sum((delta_y_q - q * mu) ** 2.0) / nq + else: + delta_y_q = y[q:] - y[:-q] + sigma2_q = sum((delta_y_q - q * mu) ** 2.0) / (nq * q) + + if debiased and overlap: + sigma2_1 *= nq / (nq - 1) + m = q * (nq - q + 1) * (1 - (q / nq)) + sigma2_q *= (nq * q) / m + + if not overlap: + self._stat_variance = 2.0 * (q - 1) + elif not robust: + self._stat_variance = (2 * (2 * q - 1) * (q - 1)) / (2 * q) + else: + z2 = (delta_y - mu) ** 2.0 + scale = sum(z2) ** 2.0 + theta = 0.0 + for k in 
arange(1.0, q): + delta = nq * z2[k:].dot(z2[:-k]) / scale + theta += (1 - k / q) ** 2.0 * delta + self._stat_variance = theta + self._vr = sigma2_q / sigma2_1 + self._stat = sqrt(nq) * (self._vr - 1) / sqrt(self._stat_variance) + self._pvalue = 2 - 2 * norm.cdf(abs(self._stat)) + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'nc', 'c' + + +# Wrapper before deprication +# See: +# Ng and Perron(2001), Lag length selection and the construction of unit root +# tests with good size and power, Econometrica, Vol 69 (6) pp 1519-1554 +# TODO: include drift keyword, only valid with regression == "c" which will +# TODO: change the distribution of the test statistic to a t distribution +def adfuller(x, maxlag=None, regression="c", autolag='AIC', + store=False, regresults=False): + """ + Augmented Dickey-Fuller unit root test + + The Augmented Dickey-Fuller test can be used to test for a unit root in a + univariate process in the presence of serial correlation. + + Parameters + ---------- + x : array_like, 1d + data series + maxlag : int + Maximum lag which is included in test, default 12*(nobs/100)^{1/4} + regression : str {'c','ct','ctt','nc'} + Constant and trend order to include in regression + * 'c' : constant only (default) + * 'ct' : constant and trend + * 'ctt' : constant, and linear and quadratic trend + * 'nc' : no constant, no trend + autolag : {'AIC', 'BIC', 't-stat', None} + * if None, then maxlag lags are used + * if 'AIC' (default) or 'BIC', then the number of lags is chosen + to minimize the corresponding information criterium + * 't-stat' based choice of maxlag. Starts with maxlag and drops a + lag until the t-statistic on the last lag length is significant at + the 95 % level. + store : bool + If True, then an ADF instance is returned additionally to + the adf statistic (default is False) + regresults : bool + If True, the full regression results are returned (default is False) + + Returns + ------- + adf : float + Test statistic + pvalue : float + MacKinnon's approximate p-value based on MacKinnon (1994) + usedlag : int + Number of lags used. + nobs : int + Number of observations used for the ADF regression and calculation of + the critical values. + critical values : dict + Critical values for the test statistic at the 1 %, 5 %, and 10 % + levels. Based on MacKinnon (2010) + icbest : float + The maximized information criterion if autolag is not None. + regresults : RegressionResults instance + The + resstore : (optional) instance of ResultStore + an instance of a dummy class with results attached as attributes + + Notes + ----- + This function has been depricated. Please use ADF instead. + + The null hypothesis of the Augmented Dickey-Fuller is that there is a unit + root, with the alternative that there is no unit root. If the pvalue is + above a critical size, then we cannot reject that there is a unit root. + + The p-values are obtained through regression surface approximation from + MacKinnon 1994, but using the updated 2010 tables. + If the p-value is close to significant, then the critical values should be + used to judge whether to accept or reject the null. + + The autolag option and maxlag for it are described in Greene. + + Examples + -------- + see example script + + References + ---------- + Greene + Hamilton + + + P-Values (regression surface approximation) + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. 
`Journal of Business and Economic + Statistics` 12, 167-76. + + Critical values + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's + University, Dept of Economics, Working Papers. Available at + http://ideas.repec.org/p/qed/wpaper/1227.html + + """ + warnings.warn(deprication_doc.format(old_func='adfuller', new_func='ADF'), + DeprecationWarning) + lags = None + if autolag is None: + lags = maxlag + adf = ADF(x, lags=lags, trend=regression, max_lags=maxlag, method=autolag) + + adfstat = adf.stat + pvalue = adf.pvalue + critvalues = adf.critical_values + usedlag = adf.lags + nobs = adf.nobs + icbest = adf._ic_best + resstore = adf + if regresults: + # Work arounds for missing properties + setattr(resstore, 'autolag_results', resstore._autolag_results) + setattr(resstore, 'usedlag', resstore.lags) + return adfstat, pvalue, critvalues, resstore + elif autolag: + return adfstat, pvalue, usedlag, nobs, critvalues, icbest + else: + return adfstat, pvalue, usedlag, nobs, critvalues + + +def mackinnonp(stat, regression="c", num_unit_roots=1, dist_type='ADF-t'): + """ + Returns MacKinnon's approximate p-value for test stat. + + Parameters + ---------- + stat : float + "T-value" from an Augmented Dickey-Fuller or DFGLS regression. + regression : str {"c", "nc", "ct", "ctt"} + This is the method of regression that was used. Following MacKinnon's + notation, this can be "c" for constant, "nc" for no constant, "ct" for + constant and trend, and "ctt" for constant, trend, and trend-squared. + num_unit_roots : int + The number of series believed to be I(1). For (Augmented) Dickey- + Fuller N = 1. + dist_type: str, {'ADF-t', 'ADF-z', 'DFGLS'} + The test type to use when computing p-values. Options include + 'ADF-t' - ADF t-stat based tests + 'ADF-z' - ADF z tests + 'DFGLS' - GLS detrended Dickey Fuller + + Returns + ------- + p-value : float + The p-value for the ADF statistic estimated using MacKinnon 1994. + + References + ---------- + MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for + Unit-Root and Cointegration Tests." Journal of Business & Economics + Statistics, 12.2, 167-76. + + Notes + ----- + Most values are from MacKinnon (1994). Values for DFGLS test statistics + and the 'nc' version of the ADF z test statistic were computed following + the methodology of MacKinnon (1994). 
+ """ + dist_type = dist_type.lower() + if num_unit_roots > 1 and dist_type.lower() != 'adf-t': + raise ValueError('Cointegration results (num_unit_roots > 1) are' + + 'only available for ADF-t values') + if dist_type == 'adf-t': + maxstat = tau_max[regression][num_unit_roots - 1] + minstat = tau_min[regression][num_unit_roots - 1] + starstat = tau_star[regression][num_unit_roots - 1] + small_p = tau_small_p[regression][num_unit_roots - 1] + large_p = tau_large_p[regression][num_unit_roots - 1] + elif dist_type == 'adf-z': + maxstat = adf_z_max[regression] + minstat = adf_z_min[regression] + starstat = adf_z_star[regression] + small_p = adf_z_small_p[regression] + large_p = adf_z_large_p[regression] + elif dist_type == 'dfgls': + maxstat = dfgls_tau_max[regression] + minstat = dfgls_tau_min[regression] + starstat = dfgls_tau_star[regression] + small_p = dfgls_small_p[regression] + large_p = dfgls_large_p[regression] + else: + raise ValueError('Unknown test type {0}'.format(dist_type)) + + if stat > maxstat: + return 1.0 + elif stat < minstat: + return 0.0 + if stat <= starstat: + poly_coef = small_p + if dist_type == 'adf-z': + stat = log(abs(stat)) # Transform stat for small p ADF-z + else: + poly_coef = large_p + return norm.cdf(polyval(poly_coef[::-1], stat)) + + +def mackinnoncrit(num_unit_roots=1, regression="c", nobs=inf, + dist_type='ADF-t'): + """ + Returns the critical values for cointegrating and the ADF test. + + In 2010 MacKinnon updated the values of his 1994 paper with critical values + for the augmented Dickey-Fuller tests. These new values are to be + preferred and are used here. + + Parameters + ---------- + num_unit_roots : int + The number of series of I(1) series for which the null of + non-cointegration is being tested. For N > 12, the critical values + are linearly interpolated (not yet implemented). For the ADF test, + N = 1. + reg : str {'c', 'tc', 'ctt', 'nc'} + Following MacKinnon (1996), these stand for the type of regression run. + 'c' for constant and no trend, 'tc' for constant with a linear trend, + 'ctt' for constant with a linear and quadratic trend, and 'nc' for + no constant. The values for the no constant case are taken from the + 1996 paper, as they were not updated for 2010 due to the unrealistic + assumptions that would underlie such a case. + nobs : int or np.inf + This is the sample size. If the sample size is numpy.inf, then the + asymptotic critical values are returned. + + Returns + ------- + crit_vals : array + Three critical values corresponding to 1%, 5% and 10% cut-offs. + + Notes + ----- + Results for ADF t-stats from MacKinnon (1994,2010). Results for DFGLS and + ADF z-tests use the same methodology as MacKinnon. + + References + ---------- + MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for + Unit-Root and Cointegration Tests." Journal of Business & Economics + Statistics, 12.2, 167-76. + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." + Queen's University, Dept of Economics Working Papers 1227. 
+    http://ideas.repec.org/p/qed/wpaper/1227.html
+    """
+    dist_type = dist_type.lower()
+    valid_regression = ['c', 'ct', 'nc', 'ctt']
+    if dist_type == 'dfgls':
+        valid_regression = ['c', 'ct']
+    if regression not in valid_regression:
+        raise ValueError(
+            "regression keyword {0} not understood".format(regression))
+
+    if dist_type == 'adf-t':
+        asymptotic_cv = tau_2010[regression][num_unit_roots - 1, :, 0]
+        poly_coef = tau_2010[regression][num_unit_roots - 1, :, :].T
+    elif dist_type == 'adf-z':
+        poly_coef = adf_z_cv_approx[regression].T
+        asymptotic_cv = adf_z_cv_approx[regression][:, 0]
+    elif dist_type == 'dfgls':
+        poly_coef = dfgls_cv_approx[regression].T
+        asymptotic_cv = dfgls_cv_approx[regression][:, 0]
+    else:
+        raise ValueError('Unknown test type {0}'.format(dist_type))
+
+    if nobs is inf:
+        return asymptotic_cv
+    else:
+        # Flip the coefficients so the highest power comes first, as polyval
+        # expects, and evaluate the finite-sample approximation at 1/nobs
+        return polyval(poly_coef[::-1], 1. / nobs)
+
+
+def kpss_crit(stat, trend='c'):
+    """
+    Linear interpolation for KPSS p-values and critical values
+
+    Parameters
+    ----------
+    stat : float
+        The KPSS test statistic.
+    trend : str, {'c','ct'}
+        The trend used when computing the KPSS statistic
+
+    Returns
+    -------
+    pvalue : float
+        The interpolated p-value
+    crit_val : array
+        Three element array containing the 1%, 5% and 10% critical values,
+        in order
+
+    Notes
+    -----
+    The p-values are linearly interpolated from the quantiles of the
+    simulated KPSS test statistic distribution using 100,000,000 replications
+    and 2,000 observations.
+    """
+    table = kpss_critical_values[trend]
+    y = table[:, 0]
+    x = table[:, 1]
+    pvalue = interp(stat, x, y)
+    cv = [1.0, 5.0, 10.0]
+    crit_value = interp(cv, y[::-1], x[::-1])
+
+    return pvalue, crit_value
+
+
+if __name__ == '__main__':
+    pass
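Since every class docstring above defers to an external example script, a short usage sketch of the new object-oriented API may be useful here. It is an illustration only: it assumes the patch is applied so that statsmodels.tsa.unitroot imports cleanly, the simulated random walk is made up for the example, and only constructors, properties and methods defined in this file are used.

    import numpy as np
    from statsmodels.tsa.unitroot import (ADF, DFGLS, PhillipsPerron, KPSS,
                                          VarianceRatio, adfuller)

    # Simulated random walk, purely for illustration.
    rng = np.random.RandomState(12345)
    y = np.cumsum(rng.standard_normal(500))

    # ADF with automatic AIC lag selection; the statistic, p-value and
    # selected lag length are computed lazily on first access.
    adf = ADF(y, trend='c', method='AIC')
    print(adf.summary())
    print('ADF: stat={0:0.3f}, p={1:0.3f}, lags={2:d}'.format(
        adf.stat, adf.pvalue, adf.lags))

    # The GLS-detrended and Phillips-Perron variants share the same interface.
    dfgls = DFGLS(y, trend='c')
    pp = PhillipsPerron(y, trend='c', test_type='tau')
    print('DFGLS p={0:0.3f}, PP p={1:0.3f}'.format(dfgls.pvalue, pp.pvalue))

    # KPSS reverses the hypotheses: the null is weak stationarity.
    kpss = KPSS(y, trend='c')
    print('KPSS stat={0:0.3f}, 5% cv={1:0.3f}'.format(
        kpss.stat, kpss.critical_values['5%']))

    # Variance ratio test of the random walk null using 4-period blocks.
    vr = VarianceRatio(y, lags=4)
    print('VR={0:0.3f}, stat={1:0.3f}, p={2:0.3f}'.format(
        vr.vr, vr.stat, vr.pvalue))

    # The legacy functional interface still works, but now warns and
    # delegates to ADF internally.
    stat, pvalue, usedlag, nobs, crit, icbest = adfuller(y, regression='c')

The properties follow the lazy pattern in UnitRootTest: nothing is estimated until stat, pvalue or summary() is first accessed, and changing lags or trend resets the cached result so the test is recomputed on the next access.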