From 7512194f2c3edb8220d0b92dbd064605bd4de54b Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Sat, 8 Mar 2014 23:05:05 +0000
Subject: [PATCH] ENH: Unit Root Tests

Introduces 4 new unit root tests and provides a class for Augmented Dickey-Fuller testing. The four new tests are:

- PhillipsPerron
- KPSS
- Variance Ratio
- DF-GLS

These have all been included in unitroot.py, and the adfuller function has been moved from stattools to unitroot, and is now just a wrapper around the ADF class. The critical value and p-value lookup code has also been consolidated into unitroot.py.

Critical value and p-value lookup information for DFGLS and KPSS is not readily available in published form, and so simulations with 100,000,000 replications were performed to generate these, using a methodology similar to MacKinnon (2010). All critical value data has been consolidated into tsa/critical_values.

A cov_nw function was added to tsa/stattools which computes the Newey-West covariance.

Also includes substantial improvements to tsatools, including numerous bug fixes. The code that handled recarrays has been improved and now also handles pandas DataFrames. Numerous docstrings were improved. Test coverage was also substantially increased.

string_types was added to the compatibility library to address the difference between basestring in Python 2 and str in Python 3.

Moved coint to a new file, cointegration.py, which will, in the long term, be expanded to include other tests for cointegration.
---
 statsmodels/compatnp/py3k.py | 2 +
 statsmodels/sandbox/stats/diagnostic.py | 3 +-
 statsmodels/tsa/adfvalues.py | 387 -----
 statsmodels/tsa/cointegration.py | 70 +
 statsmodels/tsa/critical_values/__init__.py | 1 +
 statsmodels/tsa/critical_values/dfgls.py | 34 +
 .../tsa/critical_values/dickey_fuller.py | 265 ++++
 statsmodels/tsa/critical_values/kpss.py | 40 +
 .../simulation/adf_z_simlation_process.py | 51 +
 .../dfgls_critical_values_simulation.py | 116 ++
 .../simulation/dfgls_simulation_process.py | 118 ++
 .../kpss_critical_values_simulation.py | 98 ++
 .../simulation/kpss_simulation_process.py | 87 ++
 statsmodels/tsa/stattools.py | 307 +---
 statsmodels/tsa/tests/test_adfuller_lag.py | 49 -
 statsmodels/tsa/tests/test_stattools.py | 133 +-
 statsmodels/tsa/tests/test_tsa_tools.py | 403 +++--
 statsmodels/tsa/tests/test_unitroot.py | 380 +++++
 statsmodels/tsa/tsatools.py | 480 +++---
 statsmodels/tsa/unitroot.py | 1297 +++++++++++++++++
 20 files changed, 3230 insertions(+), 1091 deletions(-)
 delete mode 100644 statsmodels/tsa/adfvalues.py
 create mode 100644 statsmodels/tsa/cointegration.py
 create mode 100644 statsmodels/tsa/critical_values/__init__.py
 create mode 100644 statsmodels/tsa/critical_values/dfgls.py
 create mode 100644 statsmodels/tsa/critical_values/dickey_fuller.py
 create mode 100644 statsmodels/tsa/critical_values/kpss.py
 create mode 100644 statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py
 create mode 100644 statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py
 create mode 100644 statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py
 create mode 100644 statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py
 create mode 100644 statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py
 delete mode 100644 statsmodels/tsa/tests/test_adfuller_lag.py
 create mode 100644 statsmodels/tsa/tests/test_unitroot.py
 create mode 100644 statsmodels/tsa/unitroot.py

diff --git a/statsmodels/compatnp/py3k.py b/statsmodels/compatnp/py3k.py
index 
aab8807205e..11359fd8f43 100644 --- a/statsmodels/compatnp/py3k.py +++ b/statsmodels/compatnp/py3k.py @@ -34,6 +34,7 @@ def isfileobj(f): def open_latin1(filename, mode='r'): return open(filename, mode=mode, encoding='iso-8859-1') strchar = 'U' + string_types = basestring from io import BytesIO, StringIO #statsmodels else: bytes = str @@ -52,6 +53,7 @@ def open_latin1(filename, mode='r'): return open(filename, mode=mode) from StringIO import StringIO BytesIO = StringIO + string_types = str def getexception(): return sys.exc_info()[1] diff --git a/statsmodels/sandbox/stats/diagnostic.py b/statsmodels/sandbox/stats/diagnostic.py index 01617142cc2..cf46da9d722 100644 --- a/statsmodels/sandbox/stats/diagnostic.py +++ b/statsmodels/sandbox/stats/diagnostic.py @@ -34,7 +34,8 @@ from scipy import stats from statsmodels.regression.linear_model import OLS from statsmodels.tools.tools import add_constant -from statsmodels.tsa.stattools import acf, adfuller +from statsmodels.tsa.stattools import acf +from statsmodels.tsa.unitroot import adfuller from statsmodels.tsa.tsatools import lagmat #get the old signature back so the examples work diff --git a/statsmodels/tsa/adfvalues.py b/statsmodels/tsa/adfvalues.py deleted file mode 100644 index 0aab1887007..00000000000 --- a/statsmodels/tsa/adfvalues.py +++ /dev/null @@ -1,387 +0,0 @@ -from scipy.stats import norm -from numpy import array, polyval, inf, asarray - -__all__ = ['mackinnonp','mackinnoncrit'] - -# These are the cut-off values for the left-tail vs. the rest of the -# tau distribution, for getting the p-values - -tau_star_nc = [-1.04, -1.53, -2.68, -3.09, -3.07, -3.77] -tau_min_nc = [-19.04,-19.62,-21.21,-23.25,-21.63,-25.74] -tau_max_nc = [inf,1.51,0.86,0.88,1.05,1.24] -tau_star_c = [-1.61, -2.62, -3.13, -3.47, -3.78, -3.93] -tau_min_c = [-18.83,-18.86,-23.48,-28.07,-25.96,-23.27] -tau_max_c = [2.74,0.92,0.55,0.61,0.79,1] -tau_star_ct = [-2.89, -3.19, -3.50, -3.65, -3.80, -4.36] -tau_min_ct = [-16.18,-21.15,-25.37,-26.63,-26.53,-26.18] -tau_max_ct = [0.7,0.63,0.71,0.93,1.19,1.42] -tau_star_ctt = [-3.21,-3.51,-3.81,-3.83,-4.12,-4.63] -tau_min_ctt = [-17.17,-21.1,-24.33,-24.03,-24.33,-28.22] -tau_max_ctt = [0.54,0.79,1.08,1.43,3.49,1.92] - -small_scaling = array([1,1,1e-2]) -tau_nc_smallp = [ [0.6344,1.2378,3.2496], - [1.9129,1.3857,3.5322], - [2.7648,1.4502,3.4186], - [3.4336,1.4835,3.19], - [4.0999,1.5533,3.59], - [4.5388,1.5344,2.9807]] -tau_nc_smallp = asarray(tau_nc_smallp)*small_scaling - -tau_c_smallp = [ [2.1659,1.4412,3.8269], - [2.92,1.5012,3.9796], - [3.4699,1.4856,3.164], - [3.9673,1.4777,2.6315], - [4.5509,1.5338,2.9545], - [5.1399,1.6036,3.4445]] -tau_c_smallp = asarray(tau_c_smallp)*small_scaling - -tau_ct_smallp = [ [3.2512,1.6047,4.9588], - [3.6646,1.5419,3.6448], - [4.0983,1.5173,2.9898], - [4.5844,1.5338,2.8796], - [5.0722,1.5634,2.9472], - [5.53,1.5914,3.0392]] -tau_ct_smallp = asarray(tau_ct_smallp)*small_scaling - -tau_ctt_smallp = [ [4.0003,1.658,4.8288], - [4.3534,1.6016,3.7947], - [4.7343,1.5768,3.2396], - [5.214,1.6077,3.3449], - [5.6481,1.6274,3.3455], - [5.9296,1.5929,2.8223]] -tau_ctt_smallp = asarray(tau_ctt_smallp)*small_scaling - -large_scaling = array([1,1e-1,1e-1,1e-2]) -tau_nc_largep = [ [0.4797,9.3557,-0.6999,3.3066], - [1.5578,8.558,-2.083,-3.3549], - [2.2268,6.8093,-3.2362,-5.4448], - [2.7654,6.4502,-3.0811,-4.4946], - [3.2684,6.8051,-2.6778,-3.4972], - [3.7268,7.167,-2.3648,-2.8288]] -tau_nc_largep = asarray(tau_nc_largep)*large_scaling - -tau_c_largep = [ [1.7339,9.3202,-1.2745,-1.0368], - 
[2.1945,6.4695,-2.9198,-4.2377], - [2.5893,4.5168,-3.6529,-5.0074], - [3.0387,4.5452,-3.3666,-4.1921], - [3.5049,5.2098,-2.9158,-3.3468], - [3.9489,5.8933,-2.5359,-2.721]] -tau_c_largep = asarray(tau_c_largep)*large_scaling - -tau_ct_largep = [ [2.5261,6.1654,-3.7956,-6.0285], - [2.85,5.272,-3.6622,-5.1695], - [3.221,5.255,-3.2685,-4.1501], - [3.652,5.9758,-2.7483,-3.2081], - [4.0712,6.6428,-2.3464,-2.546], - [4.4735,7.1757,-2.0681,-2.1196] ] -tau_ct_largep = asarray(tau_ct_largep)*large_scaling - -tau_ctt_largep = [ [3.0778,4.9529,-4.1477,-5.9359], - [3.4713,5.967,-3.2507,-4.2286], - [3.8637,6.7852,-2.6286,-3.1381], - [4.2736,7.6199,-2.1534,-2.4026], - [4.6679,8.2618,-1.822,-1.9147], - [5.0009,8.3735,-1.6994,-1.6928]] -tau_ctt_largep = asarray(tau_ctt_largep)*large_scaling - - -#NOTE: The Z-statistic is used when lags are included to account for -# serial correlation in the error term - -z_star_nc = [-2.9,-8.7,-14.8,-20.9,-25.7,-30.5] -z_star_c = [-8.9,-14.3,-19.5,-25.1,-29.6,-34.4] -z_star_ct = [-15.0,-19.6,-25.3,-29.6,-31.8,-38.4] -z_star_ctt = [-20.7,-25.3,-29.9,-34.4,-38.5,-44.2] - - -# These are Table 5 from MacKinnon (1994) -# small p is defined as p in .005 to .150 ie p = .005 up to z_star -# Z* is the largest value for which it is appropriate to use these -# approximations -# the left tail approximation is -# p = norm.cdf(d_0 + d_1*log(abs(z)) + d_2*log(abs(z))**2 + d_3*log(abs(z))**3 -# there is no Z-min, ie., it is well-behaved in the left tail - -z_nc_smallp = array([[.0342, -.6376,0,-.03872], - [1.3426,-.7680,0,-.04104], - [3.8607,-2.4159,.51293,-.09835], - [6.1072,-3.7250,.85887,-.13102], - [7.7800,-4.4579,1.00056,-.14014], - [4.0253, -.8815,0,-.04887]]) - -z_c_smallp = array([[2.2142,-1.7863,.32828,-.07727], - [1.1662,.1814,-.36707,0], - [6.6584,-4.3486,1.04705,-.15011], - [3.3249,-.8456,0,-.04818], - [4.0356,-.9306,0,-.04776], - [13.9959,-8.4314,1.97411,-.22234]]) - -z_ct_smallp = array([ [4.6476,-2.8932,0.5832,-0.0999], - [7.2453,-4.7021,1.127,-.15665], - [3.4893,-0.8914,0,-.04755], - [1.6604,1.0375,-0.53377,0], - [2.006,1.1197,-0.55315,0], - [11.1626,-5.6858,1.21479,-.15428]]) - -z_ctt_smallp = array([ [3.6739,-1.1549,0,-0.03947], - [3.9783,-1.0619,0,-0.04394], - [2.0062,0.8907,-0.51708,0], - [4.9218,-1.0663,0,-0.04691], - [5.1433,-0.9877,0,-0.04993], - [23.6812,-14.6485,3.42909,-.33794]]) -# These are Table 6 from MacKinnon (1994). -# These are well-behaved in the right tail. 
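# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the MacKinnon (1994)
# Table 5 coefficients above map a left-tail Z statistic to an approximate
# p-value via p = norm.cdf(d_0 + d_1*log(abs(z)) + d_2*log(abs(z))**2
# + d_3*log(abs(z))**3).  The statistic value used here is hypothetical.
from numpy import log, polyval
from scipy.stats import norm

z_stat = -15.0                  # hypothetical Z statistic, 'c' regression
d = z_c_smallp[0]               # [d_0, d_1, d_2, d_3]
p_value = norm.cdf(polyval(d[::-1], log(abs(z_stat))))  # roughly 0.04
# ---------------------------------------------------------------------------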
-# the approximation function is -# p = norm.cdf(d_0 + d_1 * z + d_2*z**2 + d_3*z**3 + d_4*z**4) -z_large_scaling = array([1,1e-1,1e-2,1e-3,1e-5]) -z_nc_largep = array([ [0.4927,6.906,13.2331,12.099,0], - [1.5167,4.6859,4.2401,2.7939,7.9601], - [2.2347,3.9465,2.2406,0.8746,1.4239], - [2.8239,3.6265,1.6738,0.5408,0.7449], - [3.3174,3.3492,1.2792,0.3416,0.3894], - [3.729,3.0611,0.9579,0.2087,0.1943]]) -z_nc_largep *= z_large_scaling - -z_c_largep = array([ [1.717,5.5243,4.3463,1.6671,0], - [2.2394,4.2377,2.432,0.9241,0.4364], - [2.743,3.626,1.5703,0.4612,0.567], - [3.228,3.3399,1.2319,0.3162,0.3482], - [3.6583,3.0934,0.9681,0.2111,0.1979], - [4.0379,2.8735,0.7694,0.1433,0.1146]]) -z_c_largep *= z_large_scaling - -z_ct_largep = array([ [2.7117,4.5731,2.2868,0.6362,0.5], - [3.0972,4.0873,1.8982,0.5796,0.7384], - [3.4594,3.6326,1.4284,0.3813,0.4325], - [3.806,3.2634,1.0689,0.2402,0.2304], - [4.1402,2.9867,0.8323,0.16,0.1315], - [4.4497,2.7534,0.6582,0.1089,0.0773]]) -z_ct_largep *= z_large_scaling - -z_ctt_largep = array([ [3.4671,4.3476,1.9231,0.5381,0.6216], - [3.7827,3.9421,1.5699,0.4093,0.4485], - [4.052,3.4947,1.1772,0.2642,0.2502], - [4.3311,3.1625,0.9126,0.1775,0.1462], - [4.594,2.8739,0.707,0.1181,0.0838], - [4.8479,2.6447,0.5647,0.0827,0.0518]]) -z_ctt_largep *= z_large_scaling - -#TODO: finish this and then integrate them into adf function -def mackinnonp(teststat, regression="c", N=1, lags=None): - """ - Returns MacKinnon's approximate p-value for teststat. - - Parameters - ---------- - teststat : float - "T-value" from an Augmented Dickey-Fuller regression. - regression : str {"c", "nc", "ct", "ctt"} - This is the method of regression that was used. Following MacKinnon's - notation, this can be "c" for constant, "nc" for no constant, "ct" for - constant and trend, and "ctt" for constant, trend, and trend-squared. - N : int - The number of series believed to be I(1). For (Augmented) Dickey- - Fuller N = 1. - - Returns - ------- - p-value : float - The p-value for the ADF statistic estimated using MacKinnon 1994. - - References - ---------- - MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for - Unit-Root and Cointegration Tests." Journal of Business & Economics - Statistics, 12.2, 167-76. 
- - Notes - ----- - For (A)DF - H_0: AR coefficient = 1 - H_a: AR coefficient < 1 - """ - maxstat = eval("tau_max_"+regression) - minstat = eval("tau_min_"+regression) - starstat = eval("tau_star_"+regression) - if teststat > maxstat[N-1]: - return 1.0 - elif teststat < minstat[N-1]: - return 0.0 - if teststat <= starstat[N-1]: - tau_coef = eval("tau_" + regression + "_smallp["+str(N-1)+"]") -# teststat = np.log(np.abs(teststat)) -#above is only for z stats - else: - tau_coef = eval("tau_" + regression + "_largep["+str(N-1)+"]") - return norm.cdf(polyval(tau_coef[::-1], teststat)) - -# These are the new estimates from MacKinnon 2010 -# the first axis is N -1 -# the second axis is 1 %, 5 %, 10 % -# the last axis is the coefficients - -tau_nc_2010 = [[ [-2.56574,-2.2358,-3.627,0], # N = 1 - [-1.94100,-0.2686,-3.365,31.223], - [-1.61682, 0.2656, -2.714, 25.364]]] -tau_nc_2010 = asarray(tau_nc_2010) - -tau_c_2010 = [[ [-3.43035,-6.5393,-16.786,-79.433], # N = 1, 1% - [-2.86154,-2.8903,-4.234,-40.040], # 5 % - [-2.56677,-1.5384,-2.809,0]], # 10 % - [ [-3.89644,-10.9519,-33.527,0], # N = 2 - [-3.33613,-6.1101,-6.823,0], - [-3.04445,-4.2412,-2.720,0]], - [ [-4.29374,-14.4354,-33.195,47.433], # N = 3 - [-3.74066,-8.5632,-10.852,27.982], - [-3.45218,-6.2143,-3.718,0]], - [ [-4.64332,-18.1031,-37.972,0], # N = 4 - [-4.09600,-11.2349,-11.175,0], - [-3.81020,-8.3931,-4.137,0]], - [ [-4.95756,-21.8883,-45.142,0], # N = 5 - [-4.41519,-14.0405,-12.575,0], - [-4.13157,-10.7417,-3.784,0]], - [ [-5.24568,-25.6688,-57.737,88.639], # N = 6 - [-4.70693,-16.9178,-17.492,60.007], - [-4.42501,-13.1875,-5.104,27.877]], - [ [-5.51233,-29.5760,-69.398,164.295],# N = 7 - [-4.97684,-19.9021,-22.045,110.761], - [-4.69648,-15.7315,-5.104,27.877]], - [ [-5.76202,-33.5258,-82.189,256.289], # N = 8 - [-5.22924,-23.0023,-24.646,144.479], - [-4.95007,-18.3959,-7.344,94.872]], - [ [-5.99742,-37.6572,-87.365,248.316],# N = 9 - [-5.46697,-26.2057,-26.627,176.382], - [-5.18897,-21.1377,-9.484,172.704]], - [ [-6.22103,-41.7154,-102.680,389.33],# N = 10 - [-5.69244,-29.4521,-30.994,251.016], - [-5.41533,-24.0006,-7.514,163.049]], - [ [-6.43377,-46.0084,-106.809,352.752],# N = 11 - [-5.90714,-32.8336,-30.275,249.994], - [-5.63086,-26.9693,-4.083,151.427]], - [ [-6.63790,-50.2095,-124.156,579.622],# N = 12 - [-6.11279,-36.2681,-32.505,314.802], - [-5.83724,-29.9864,-2.686,184.116]]] -tau_c_2010 = asarray(tau_c_2010) - -tau_ct_2010 = [[ [-3.95877,-9.0531,-28.428,-134.155], # N = 1 - [-3.41049,-4.3904,-9.036,-45.374], - [-3.12705,-2.5856,-3.925,-22.380]], - [ [-4.32762,-15.4387,-35.679,0], # N = 2 - [-3.78057,-9.5106,-12.074,0], - [-3.49631,-7.0815,-7.538,21.892]], - [ [-4.66305,-18.7688,-49.793,104.244], # N = 3 - [-4.11890,-11.8922,-19.031,77.332], - [-3.83511,-9.0723,-8.504,35.403]], - [ [-4.96940,-22.4694,-52.599,51.314], # N = 4 - [-4.42871,-14.5876,-18.228,39.647], - [-4.14633,-11.2500,-9.873,54.109]], - [ [-5.25276,-26.2183,-59.631,50.646], # N = 5 - [-4.71537,-17.3569,-22.660,91.359], - [-4.43422,-13.6078,-10.238,76.781]], - [ [-5.51727,-29.9760,-75.222,202.253], # N = 6 - [-4.98228,-20.3050,-25.224,132.03], - [-4.70233,-16.1253,-9.836,94.272]], - [ [-5.76537,-33.9165,-84.312,245.394], # N = 7 - [-5.23299,-23.3328,-28.955,182.342], - [-4.95405,-18.7352,-10.168,120.575]], - [ [-6.00003,-37.8892,-96.428,335.92], # N = 8 - [-5.46971,-26.4771,-31.034,220.165], - [-5.19183,-21.4328,-10.726,157.955]], - [ [-6.22288,-41.9496,-109.881,466.068], # N = 9 - [-5.69447,-29.7152,-33.784,273.002], - [-5.41738,-24.2882,-8.584,169.891]], - [ 
[-6.43551,-46.1151,-120.814,566.823], # N = 10 - [-5.90887,-33.0251,-37.208,346.189], - [-5.63255,-27.2042,-6.792,177.666]], - [ [-6.63894,-50.4287,-128.997,642.781], # N = 11 - [-6.11404,-36.4610,-36.246,348.554], - [-5.83850,-30.1995,-5.163,210.338]], - [ [-6.83488,-54.7119,-139.800,736.376], # N = 12 - [-6.31127,-39.9676,-37.021,406.051], - [-6.03650,-33.2381,-6.606,317.776]]] -tau_ct_2010 = asarray(tau_ct_2010) - -tau_ctt_2010 = [[ [-4.37113,-11.5882,-35.819,-334.047], # N = 1 - [-3.83239,-5.9057,-12.490,-118.284], - [-3.55326,-3.6596,-5.293,-63.559]], - [ [-4.69276,-20.2284,-64.919,88.884], # N =2 - [-4.15387,-13.3114,-28.402,72.741], - [-3.87346,-10.4637,-17.408,66.313]], - [ [-4.99071,-23.5873,-76.924,184.782], # N = 3 - [-4.45311,-15.7732,-32.316,122.705], - [-4.17280,-12.4909,-17.912,83.285]], - [ [-5.26780,-27.2836,-78.971,137.871], # N = 4 - [-4.73244,-18.4833,-31.875,111.817], - [-4.45268,-14.7199,-17.969,101.92]], - [ [-5.52826,-30.9051,-92.490,248.096], # N = 5 - [-4.99491,-21.2360,-37.685,194.208], - [-4.71587,-17.0820,-18.631,136.672]], - [ [-5.77379,-34.7010,-105.937,393.991], # N = 6 - [-5.24217,-24.2177,-39.153,232.528], - [-4.96397,-19.6064,-18.858,174.919]], - [ [-6.00609,-38.7383,-108.605,365.208], # N = 7 - [-5.47664,-27.3005,-39.498,246.918], - [-5.19921,-22.2617,-17.910,208.494]], - [ [-6.22758,-42.7154,-119.622,421.395], # N = 8 - [-5.69983,-30.4365,-44.300,345.48], - [-5.42320,-24.9686,-19.688,274.462]], - [ [-6.43933,-46.7581,-136.691,651.38], # N = 9 - [-5.91298,-33.7584,-42.686,346.629], - [-5.63704,-27.8965,-13.880,236.975]], - [ [-6.64235,-50.9783,-145.462,752.228], # N = 10 - [-6.11753,-37.056,-48.719,473.905], - [-5.84215,-30.8119,-14.938,316.006]], - [ [-6.83743,-55.2861,-152.651,792.577], # N = 11 - [-6.31396,-40.5507,-46.771,487.185], - [-6.03921,-33.8950,-9.122,285.164]], - [ [-7.02582,-59.6037,-166.368,989.879], # N = 12 - [-6.50353,-44.0797,-47.242,543.889], - [-6.22941,-36.9673,-10.868,418.414]]] -tau_ctt_2010 = asarray(tau_ctt_2010) - -def mackinnoncrit(N=1, regression ="c", nobs=inf): - """ - Returns the critical values for cointegrating and the ADF test. - - In 2010 MacKinnon updated the values of his 1994 paper with critical values - for the augmented Dickey-Fuller tests. These new values are to be - preferred and are used here. - - Parameters - ---------- - N : int - The number of series of I(1) series for which the null of - non-cointegration is being tested. For N > 12, the critical values - are linearly interpolated (not yet implemented). For the ADF test, - N = 1. - reg : str {'c', 'tc', 'ctt', 'nc'} - Following MacKinnon (1996), these stand for the type of regression run. - 'c' for constant and no trend, 'tc' for constant with a linear trend, - 'ctt' for constant with a linear and quadratic trend, and 'nc' for - no constant. The values for the no constant case are taken from the - 1996 paper, as they were not updated for 2010 due to the unrealistic - assumptions that would underlie such a case. - nobs : int or np.inf - This is the sample size. If the sample size is numpy.inf, then the - asymptotic critical values are returned. - - References - ---------- - MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for - Unit-Root and Cointegration Tests." Journal of Business & Economics - Statistics, 12.2, 167-76. - MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." - Queen's University, Dept of Economics Working Papers 1227. 
- http://ideas.repec.org/p/qed/wpaper/1227.html - """ - reg = regression - if reg not in ['c','ct','nc','ctt']: - raise ValueError("regression keyword %s not understood") % reg - if nobs is inf: - return eval("tau_"+reg+"_2010["+str(N-1)+",:,0]") - else: - return polyval(eval("tau_"+reg+"_2010["+str(N-1)+",:,::-1].T"),1./nobs) - -if __name__=="__main__": - pass diff --git a/statsmodels/tsa/cointegration.py b/statsmodels/tsa/cointegration.py new file mode 100644 index 00000000000..bb36f294831 --- /dev/null +++ b/statsmodels/tsa/cointegration.py @@ -0,0 +1,70 @@ +""" +Estimation and testing of cointegrated time series +""" +from __future__ import division + +import numpy as np + +from statsmodels.regression.linear_model import OLS +from statsmodels.tools.tools import add_constant +from statsmodels.tsa.unitroot import mackinnonp, mackinnoncrit + + +def coint(y1, y2, regression="c"): + """ + This is a simple cointegration test. Uses unit-root test on residuals to + test for cointegrated relationship + + See Hamilton (1994) 19.2 + + Parameters + ---------- + y1 : array_like, 1d + first element in cointegrating vector + y2 : array_like + remaining elements in cointegrating vector + c : str {'c'} + Included in regression + * 'c' : Constant + + Returns + ------- + coint_t : float + t-statistic of unit-root test on residuals + pvalue : float + MacKinnon's approximate p-value based on MacKinnon (1994) + crit_value : dict + Critical values for the test statistic at the 1 %, 5 %, and 10 % + levels. + + Notes + ----- + The Null hypothesis is that there is no cointegration, the alternative + hypothesis is that there is cointegrating relationship. If the pvalue is + small, below a critical size, then we can reject the hypothesis that there + is no cointegrating relationship. + + P-values are obtained through regression surface approximation from + MacKinnon 1994. + + References + ---------- + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. `Journal of Business and Economic + Statistics` 12, 167-76. + + """ + regression = regression.lower() + if regression not in ['c', 'nc', 'ct', 'ctt']: + raise ValueError("regression option %s not understood") % regression + y1 = np.asarray(y1) + y2 = np.asarray(y2) + if regression == 'c': + y2 = add_constant(y2, prepend=False) + st1_resid = OLS(y1, y2).fit().resid # stage one residuals + lgresid_cons = add_constant(st1_resid[0:-1], prepend=False) + uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit() + coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0] + pvalue = mackinnonp(coint_t, regression="c", num_unit_roots=2) + crit_value = mackinnoncrit(num_unit_roots=1, regression="c", nobs=len(y1)) + return coint_t, pvalue, crit_value diff --git a/statsmodels/tsa/critical_values/__init__.py b/statsmodels/tsa/critical_values/__init__.py new file mode 100644 index 00000000000..0afb2859844 --- /dev/null +++ b/statsmodels/tsa/critical_values/__init__.py @@ -0,0 +1 @@ +__author__ = 'kevin.sheppard' diff --git a/statsmodels/tsa/critical_values/dfgls.py b/statsmodels/tsa/critical_values/dfgls.py new file mode 100644 index 00000000000..4390ba7a012 --- /dev/null +++ b/statsmodels/tsa/critical_values/dfgls.py @@ -0,0 +1,34 @@ +""" +Contains values used to approximate the critical value and +p-value from DFGLS statistics + +These have been computed using the methodology of MacKinnon (1994) and (2010). +simulation. 
+""" + +from numpy import array + +dfgls_cv_approx = {'c': array([[-2.56781793e+00, -2.05575392e+01, 1.82727674e+02, + -1.77866664e+03], + [-1.94363325e+00, -2.17272746e+01, 2.60815068e+02, + -2.26914916e+03], + [-1.61998241e+00, -2.32734708e+01, 3.06474378e+02, + -2.57483557e+03]]), + 'ct': array([[-3.40689134, -21.69971242, 27.26295939, -816.84404772], + [-2.84677178, -19.69109364, 84.7664136, -799.40722401], + [-2.55890707, -19.42621991, 116.53759752, -840.31342847]])} + +dfgls_tau_max = {'c': 13.365361509140614, + 'ct': 8.73743383728356} + +dfgls_tau_min = {'c': -17.561302895074206, + 'ct': -13.681153542634465} + +dfgls_tau_star = {'c': -0.4795076091714674, + 'ct': -2.1960404365401298} + +dfgls_large_p = {'c': array([0.50612497, 0.98305664, -0.05648525, 0.00140875]), + 'ct': array([2.60561421, 1.67850224, 0.0373599, -0.01017936])} + +dfgls_small_p = {'c': array([0.67422739, 1.25475826, 0.03572509]), + 'ct': array([2.38767685, 1.57454737, 0.05754439])} \ No newline at end of file diff --git a/statsmodels/tsa/critical_values/dickey_fuller.py b/statsmodels/tsa/critical_values/dickey_fuller.py new file mode 100644 index 00000000000..ddd4473aced --- /dev/null +++ b/statsmodels/tsa/critical_values/dickey_fuller.py @@ -0,0 +1,265 @@ +""" +Contains values used to approximate the critical value and +p-value from statistics that follow the Dickey-Fuller distribution. + +Most values are from MacKinnon (1994) or (2010). A small number of these +did not appear in the original paper and have been computed using an identical +simulation. +""" +from numpy import asarray, inf + +small_scaling = asarray([1, 1, 1e-2]) +tau_small_p = {} +tau_large_p = {} + +tau_small_p['nc'] = [[0.6344, 1.2378, 3.2496], + [1.9129, 1.3857, 3.5322], + [2.7648, 1.4502, 3.4186], + [3.4336, 1.4835, 3.1900], + [4.0999, 1.5533, 3.5900], + [4.5388, 1.5344, 2.9807]] +tau_small_p['nc'] = asarray(tau_small_p['nc']) * small_scaling + +tau_small_p['c'] = [[2.1659, 1.4412, 3.8269], + [2.9200, 1.5012, 3.9796], + [3.4699, 1.4856, 3.1640], + [3.9673, 1.4777, 2.6315], + [4.5509, 1.5338, 2.9545], + [5.1399, 1.6036, 3.4445]] +tau_small_p['c'] = asarray(tau_small_p['c']) * small_scaling + +tau_small_p['ct'] = [[3.2512, 1.6047, 4.9588], + [3.6646, 1.5419, 3.6448], + [4.0983, 1.5173, 2.9898], + [4.5844, 1.5338, 2.8796], + [5.0722, 1.5634, 2.9472], + [5.5300, 1.5914, 3.0392]] +tau_small_p['ct'] = asarray(tau_small_p['ct']) * small_scaling + +tau_small_p['ctt'] = [[4.0003, 1.6580, 4.8288], + [4.3534, 1.6016, 3.7947], + [4.7343, 1.5768, 3.2396], + [5.2140, 1.6077, 3.3449], + [5.6481, 1.6274, 3.3455], + [5.9296, 1.5929, 2.8223]] +tau_small_p['ctt'] = asarray(tau_small_p['ctt']) * small_scaling + +large_scaling = asarray([1, 1e-1, 1e-1, 1e-2]) +tau_large_p['nc'] = [[0.4797, 9.3557, -0.6999, 3.3066], + [1.5578, 8.5580, -2.0830, -3.3549], + [2.2268, 6.8093, -3.2362, -5.4448], + [2.7654, 6.4502, -3.0811, -4.4946], + [3.2684, 6.8051, -2.6778, -3.4972], + [3.7268, 7.1670, -2.3648, -2.8288]] +tau_large_p['nc'] = asarray(tau_large_p['nc']) * large_scaling + +tau_large_p['c'] = [[1.7339, 9.3202, -1.2745, -1.0368], + [2.1945, 6.4695, -2.9198, -4.2377], + [2.5893, 4.5168, -3.6529, -5.0074], + [3.0387, 4.5452, -3.3666, -4.1921], + [3.5049, 5.2098, -2.9158, -3.3468], + [3.9489, 5.8933, -2.5359, -2.721]] +tau_large_p['c'] = asarray(tau_large_p['c']) * large_scaling + +tau_large_p['ct'] = [[2.5261, 6.1654, -3.7956, -6.0285], + [2.8500, 5.2720, -3.6622, -5.1695], + [3.2210, 5.2550, -3.2685, -4.1501], + [3.6520, 5.9758, -2.7483, -3.2081], + [4.0712, 6.6428, 
-2.3464, -2.5460], + [4.4735, 7.1757, -2.0681, -2.1196]] +tau_large_p['ct'] = asarray(tau_large_p['ct']) * large_scaling + +tau_large_p['ctt'] = [[3.0778, 4.9529, -4.1477, -5.9359], + [3.4713, 5.9670, -3.2507, -4.2286], + [3.8637, 6.7852, -2.6286, -3.1381], + [4.2736, 7.6199, -2.1534, -2.4026], + [4.6679, 8.2618, -1.8220, -1.9147], + [5.0009, 8.3735, -1.6994, -1.6928]] +tau_large_p['ctt'] = asarray(tau_large_p['ctt']) * large_scaling + +# These are the new estimates from MacKinnon 2010 +# the first axis is N -1 +# the second axis is 1 %, 5 %, 10 % +# the last axis is the coefficients + +tau_2010 = {} + +tau_2010['nc'] = [[[-2.56574, -2.2358, -3.627, 0], # N = 1 + [-1.94100, -0.2686, -3.365, 31.223], + [-1.61682, 0.2656, -2.714, 25.364]]] +tau_2010['nc'] = asarray(tau_2010['nc']) + +tau_2010['c'] = [[[-3.43035, -6.5393, -16.786, -79.433], # N = 1, 1% + [-2.86154, -2.8903, -4.234, -40.040], # 5 % + [-2.56677, -1.5384, -2.809, 0]], # 10 % + [[-3.89644, -10.9519, -33.527, 0], # N = 2 + [-3.33613, -6.1101, -6.823, 0], + [-3.04445, -4.2412, -2.720, 0]], + [[-4.29374, -14.4354, -33.195, 47.433], # N = 3 + [-3.74066, -8.5632, -10.852, 27.982], + [-3.45218, -6.2143, -3.718, 0]], + [[-4.64332, -18.1031, -37.972, 0], # N = 4 + [-4.09600, -11.2349, -11.175, 0], + [-3.81020, -8.3931, -4.137, 0]], + [[-4.95756, -21.8883, -45.142, 0], # N = 5 + [-4.41519, -14.0405, -12.575, 0], + [-4.13157, -10.7417, -3.784, 0]], + [[-5.24568, -25.6688, -57.737, 88.639], # N = 6 + [-4.70693, -16.9178, -17.492, 60.007], + [-4.42501, -13.1875, -5.104, 27.877]], + [[-5.51233, -29.5760, -69.398, 164.295], # N = 7 + [-4.97684, -19.9021, -22.045, 110.761], + [-4.69648, -15.7315, -5.104, 27.877]], + [[-5.76202, -33.5258, -82.189, 256.289], # N = 8 + [-5.22924, -23.0023, -24.646, 144.479], + [-4.95007, -18.3959, -7.344, 94.872]], + [[-5.99742, -37.6572, -87.365, 248.316], # N = 9 + [-5.46697, -26.2057, -26.627, 176.382], + [-5.18897, -21.1377, -9.484, 172.704]], + [[-6.22103, -41.7154, -102.680, 389.33], # N = 10 + [-5.69244, -29.4521, -30.994, 251.016], + [-5.41533, -24.0006, -7.514, 163.049]], + [[-6.43377, -46.0084, -106.809, 352.752], # N = 11 + [-5.90714, -32.8336, -30.275, 249.994], + [-5.63086, -26.9693, -4.083, 151.427]], + [[-6.63790, -50.2095, -124.156, 579.622], # N = 12 + [-6.11279, -36.2681, -32.505, 314.802], + [-5.83724, -29.9864, -2.686, 184.116]]] +tau_2010['c'] = asarray(tau_2010['c']) + +tau_2010['ct'] = [[[-3.95877, -9.0531, -28.428, -134.155], # N = 1 + [-3.41049, -4.3904, -9.036, -45.374], + [-3.12705, -2.5856, -3.925, -22.380]], + [[-4.32762, -15.4387, -35.679, 0], # N = 2 + [-3.78057, -9.5106, -12.074, 0], + [-3.49631, -7.0815, -7.538, 21.892]], + [[-4.66305, -18.7688, -49.793, 104.244], # N = 3 + [-4.11890, -11.8922, -19.031, 77.332], + [-3.83511, -9.0723, -8.504, 35.403]], + [[-4.96940, -22.4694, -52.599, 51.314], # N = 4 + [-4.42871, -14.5876, -18.228, 39.647], + [-4.14633, -11.2500, -9.873, 54.109]], + [[-5.25276, -26.2183, -59.631, 50.646], # N = 5 + [-4.71537, -17.3569, -22.660, 91.359], + [-4.43422, -13.6078, -10.238, 76.781]], + [[-5.51727, -29.9760, -75.222, 202.253], # N = 6 + [-4.98228, -20.3050, -25.224, 132.03], + [-4.70233, -16.1253, -9.836, 94.272]], + [[-5.76537, -33.9165, -84.312, 245.394], # N = 7 + [-5.23299, -23.3328, -28.955, 182.342], + [-4.95405, -18.7352, -10.168, 120.575]], + [[-6.00003, -37.8892, -96.428, 335.92], # N = 8 + [-5.46971, -26.4771, -31.034, 220.165], + [-5.19183, -21.4328, -10.726, 157.955]], + [[-6.22288, -41.9496, -109.881, 466.068], # N = 9 + [-5.69447, -29.7152, 
-33.784, 273.002], + [-5.41738, -24.2882, -8.584, 169.891]], + [[-6.43551, -46.1151, -120.814, 566.823], # N = 10 + [-5.90887, -33.0251, -37.208, 346.189], + [-5.63255, -27.2042, -6.792, 177.666]], + [[-6.63894, -50.4287, -128.997, 642.781], # N = 11 + [-6.11404, -36.4610, -36.246, 348.554], + [-5.83850, -30.1995, -5.163, 210.338]], + [[-6.83488, -54.7119, -139.800, 736.376], # N = 12 + [-6.31127, -39.9676, -37.021, 406.051], + [-6.03650, -33.2381, -6.606, 317.776]]] +tau_2010['ct'] = asarray(tau_2010['ct']) + +tau_2010['ctt'] = [[[-4.37113, -11.5882, -35.819, -334.047], # N = 1 + [-3.83239, -5.9057, -12.490, -118.284], + [-3.55326, -3.6596, -5.293, -63.559]], + [[-4.69276, -20.2284, -64.919, 88.884], # N =2 + [-4.15387, -13.3114, -28.402, 72.741], + [-3.87346, -10.4637, -17.408, 66.313]], + [[-4.99071, -23.5873, -76.924, 184.782], # N = 3 + [-4.45311, -15.7732, -32.316, 122.705], + [-4.17280, -12.4909, -17.912, 83.285]], + [[-5.26780, -27.2836, -78.971, 137.871], # N = 4 + [-4.73244, -18.4833, -31.875, 111.817], + [-4.45268, -14.7199, -17.969, 101.92]], + [[-5.52826, -30.9051, -92.490, 248.096], # N = 5 + [-4.99491, -21.2360, -37.685, 194.208], + [-4.71587, -17.0820, -18.631, 136.672]], + [[-5.77379, -34.7010, -105.937, 393.991], # N = 6 + [-5.24217, -24.2177, -39.153, 232.528], + [-4.96397, -19.6064, -18.858, 174.919]], + [[-6.00609, -38.7383, -108.605, 365.208], # N = 7 + [-5.47664, -27.3005, -39.498, 246.918], + [-5.19921, -22.2617, -17.910, 208.494]], + [[-6.22758, -42.7154, -119.622, 421.395], # N = 8 + [-5.69983, -30.4365, -44.300, 345.48], + [-5.42320, -24.9686, -19.688, 274.462]], + [[-6.43933, -46.7581, -136.691, 651.38], # N = 9 + [-5.91298, -33.7584, -42.686, 346.629], + [-5.63704, -27.8965, -13.880, 236.975]], + [[-6.64235, -50.9783, -145.462, 752.228], # N = 10 + [-6.11753, -37.056, -48.719, 473.905], + [-5.84215, -30.8119, -14.938, 316.006]], + [[-6.83743, -55.2861, -152.651, 792.577], # N = 11 + [-6.31396, -40.5507, -46.771, 487.185], + [-6.03921, -33.8950, -9.122, 285.164]], + [[-7.02582, -59.6037, -166.368, 989.879], # N = 12 + [-6.50353, -44.0797, -47.242, 543.889], + [-6.22941, -36.9673, -10.868, 418.414]]] +tau_2010['ctt'] = asarray(tau_2010['ctt']) + +# These are the cut-off values for the left-tail vs. the rest of the +# tau distribution, for getting the p-values +tau_star = {'nc': [-1.04, -1.53, -2.68, -3.09, -3.07, -3.77], + 'c': [-1.61, -2.62, -3.13, -3.47, -3.78, -3.93], + 'ct': [-2.89, -3.19, -3.50, -3.65, -3.80, -4.36], + 'ctt': [-3.21, -3.51, -3.81, -3.83, -4.12, -4.63]} + +tau_min = {'nc': [-19.04, -19.62, -21.21, -23.25, -21.63, -25.74], + 'c': [-18.83, -18.86, -23.48, -28.07, -25.96, -23.27], + 'ct': [-16.18, -21.15, -25.37, -26.63, -26.53, -26.18], + 'ctt': [-17.17, -21.1, -24.33, -24.03, -24.33, -28.22]} + +tau_max = {'nc': [inf, 1.51, 0.86, 0.88, 1.05, 1.24], + 'c': [2.74, 0.92, 0.55, 0.61, 0.79, 1], + 'ct': [0.7, 0.63, 0.71, 0.93, 1.19, 1.42], + 'ctt': [0.54, 0.79, 1.08, 1.43, 3.49, 1.92]} + +# Z values from MacKinnon(1994), tables 5 and 6 +adf_z_max = {'c': inf, + 'ct': inf, + 'ctt': inf} + +adf_z_min = {'c': -22.03, + 'ct': -32.85, + 'ctt': -41.18} + +adf_z_star = {'c': -7.96, 'ct': -13.46, 'ctt': -16.27} + +adf_z_small_p = {'nc': [.0342, -.6376, 0, -.03872], + 'c': [2.2142, -1.7863, 0.3283, -0.07727], + 'ct': [4.6476, -2.8932, 0.5832, -0.09990], + 'ctt': [4.4599, -1.8635, 0.2126, -0.06070]} +# These are Table 6 from MacKinnon (1994). +# These are well-behaved in the right tail. 
+# the approximation function is +# p = norm.cdf(d_0 + d_1 * z + d_2*z**2 + d_3*z**3 + d_4*z**4) +adf_z_large_p = {'nc': [0.4927, 6.906, 13.2331, 12.099, 0], + 'c': [1.7157, 0.5536, 4.5518, 2.2466, 4.2537], + 'ct': [2.7119, 0.4594, 2.3747, 0.7488, 0.9333], + 'ctt': [3.4216, 0.4170, 1.6939, 0.4203, 0.4153]} + +adf_z_large_p_scale = asarray([1.0, 1.0, 1e-2, 1e-3, 1e-5]) +for k, v in adf_z_large_p.iteritems(): + adf_z_large_p[k] = asarray(adf_z_large_p[k]) * adf_z_large_p_scale + +adf_z_cv_approx = { + 'ctt': asarray([[-36.57197198, 351.1444586, -2377.68403687, 8040.27872297], + [-28.11093834, 213.04812463, -1271.25554199, 4487.9988725], + [-24.18276878, 158.37799447, -806.81475966, + 2173.15644788]]), + 'c': asarray([[-20.62490333, 118.99498032, -536.04374012, 1429.81685652], + [-14.0928388, 57.82357334, -185.65594822, 279.8800972], + [-11.25010719, 38.15946541, -109.52148583, 206.38656327]]), + 'nc': asarray( + [[-13.2887112, 3.22727093e+02, -8.29915035e+03, 7.96961213e+04], + [-7.81935288e+00, 2.17103351e+02, -5.53972290e+03, 5.24347804e+04], + [-5.56707483e+00, 1.66592983e+02, -4.18616134e+03, 3.91896116e+04]]), + 'ct': asarray([[-29.33684958, 230.67000702, -1301.64844712, 0.], + [-21.70521047, 128.65518728, -508.58212394, 0.], + [-18.24843409, 93.18181231, -355.29469296, 0.]])} diff --git a/statsmodels/tsa/critical_values/kpss.py b/statsmodels/tsa/critical_values/kpss.py new file mode 100644 index 00000000000..0111797f22f --- /dev/null +++ b/statsmodels/tsa/critical_values/kpss.py @@ -0,0 +1,40 @@ +from numpy import asarray + +kpss_critical_values = {} +kpss_critical_values['c'] = ((100.0, 0.0000), (99.5, 0.0219), (99.0, 0.0249), + (98.0, 0.0289), (97.0, 0.0319), (96.0, 0.0344), + (94.5, 0.0377), (93.0, 0.0407), (89.5, 0.0470), + (86.0, 0.0527), (74.0, 0.0720), (70.5, 0.0778), + (67.0, 0.0839), (64.0, 0.0894), (61.5, 0.0941), + (58.5, 0.1001), (56.0, 0.1053), (53.0, 0.1119), + (50.0, 0.1190), (47.0, 0.1266), (44.5, 0.1333), + (42.0, 0.1406), (39.5, 0.1484), (37.0, 0.1568), + (34.5, 0.1659), (32.0, 0.1758), (30.0, 0.1845), + (28.0, 0.1938), (26.0, 0.2041), (24.0, 0.2152), + (22.5, 0.2244), (21.0, 0.2343), (19.5, 0.2451), + (18.0, 0.2569), (16.5, 0.2699), (15.0, 0.2842), + (14.0, 0.2948), (12.0, 0.3187), (11.0, 0.3324), + (10.0, 0.3475), (8.5, 0.3737), (7.0, 0.4054), + (6.0, 0.4309), (5.0, 0.4614), (4.0, 0.4993), + (3.5, 0.5222), (2.5, 0.5806), (2.0, 0.6197), + (1.5, 0.6706), (1.0, 0.7430), (0.7, 0.8075), + (0.4, 0.9097), (0.2, 1.0375), (0.1, 1.1661)) +kpss_critical_values['c'] = asarray(kpss_critical_values['c']) +kpss_critical_values['ct'] = ((100.0, 0.0000), (99.5, 0.0156), (99.0, 0.0173), + (98.5, 0.0185), (97.5, 0.0203), (96.0, 0.0224), + (94.5, 0.0240), (93.0, 0.0254), (91.0, 0.0272), + (89.0, 0.0287), (86.5, 0.0306), (81.5, 0.0340), + (68.5, 0.0424), (62.5, 0.0464), (59.5, 0.0485), + (56.5, 0.0507), (53.5, 0.0529), (51.0, 0.0548), + (48.0, 0.0573), (45.0, 0.0598), (42.5, 0.0621), + (40.0, 0.0645), (37.5, 0.0670), (35.0, 0.0697), + (33.0, 0.0720), (31.0, 0.0744), (29.0, 0.0770), + (27.5, 0.0791), (25.5, 0.0820), (24.0, 0.0844), + (22.5, 0.0869), (21.0, 0.0897), (18.5, 0.0946), + (16.0, 0.1004), (14.0, 0.1057), (12.0, 0.1119), + (10.0, 0.1193), (8.5, 0.1259), (7.0, 0.1339), + (6.0, 0.1403), (5.0, 0.1479), (4.0, 0.1573), + (3.5, 0.1630), (2.5, 0.1775), (2.0, 0.1871), + (1.5, 0.1997), (1.0, 0.2177), (0.7, 0.2335), + (0.4, 0.2587), (0.2, 0.2904), (0.1, 0.3225)) +kpss_critical_values['ct'] = asarray(kpss_critical_values['ct']) \ No newline at end of file diff --git 
a/statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py b/statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py new file mode 100644 index 00000000000..bd2171c6a72 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/adf_z_simlation_process.py @@ -0,0 +1,51 @@ +import numpy as np +from scipy.stats import norm + +from statsmodels.regression.linear_model import OLS, WLS + + +trends = ('nc', 'c', 'ct', 'ctt') +critical_values = (1.0, 5.0, 10.0) +adf_z_cv_approx = {} +for t in trends: + print t + data = np.load('adf_z_' + t + '.npz') + percentiles = data['percentiles'] + trend = data['trend'] + results = data['results'] + # T = data['T'] + data.close() + + # Remove later + T = np.array( + (20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 180, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, + 1000, 1200, 1400, 2000)) + T = T[::-1] + + # For percentiles 1, 5 and 10, regress on a constant, and powers of 1/T + out = [] + for cv in critical_values: + num_ex = results.shape[2] + loc = np.where(percentiles == cv)[0][0] + lhs = np.squeeze(results[loc, :, :]) + # Adjust for effective sample size, this is what lookup the code uses + tau = np.ones((num_ex, 1)).dot(T[None, :]) - 1.0 + tau = tau.T + lhs = lhs.ravel() + tau = tau.ravel() + tau = tau[:, None] + n = lhs.shape[0] + rhs = np.ones((n, 1)) + rhs = np.hstack((rhs, 1.0 / tau)) + rhs = np.hstack((rhs, (1.0 / tau) ** 2.0)) + rhs = np.hstack((rhs, (1.0 / tau) ** 3.0)) + res = OLS(lhs, rhs).fit() + res.params[np.abs(res.tvalues) < 1.96] = 0.0 + out.append(res.params) + + adf_z_cv_approx[t] = np.array(out) + +print 'from numpy import array' +print '' +print 'adf_z_cv_approx = ' + str(adf_z_cv_approx) diff --git a/statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py b/statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py new file mode 100644 index 00000000000..54f5b06bae7 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/dfgls_critical_values_simulation.py @@ -0,0 +1,116 @@ +from __future__ import division +import datetime + +from numpy import ones, vstack, arange, diff, cumsum, sqrt, sum +from numpy.linalg import pinv +import numpy as np + +from statsmodels.tools.parallel import parallel_func + +# Controls memory use, in MiB +MAX_MEMORY_SIZE = 100 +NUM_JOBS = 3 + + +def wrapper(n, deterministic, b, seed=0): + """ + Wraps and blocks the main simulation so that the maximum amount of memory + can be controlled on multi processor systems when executing in parallel + """ + rng = np.random.RandomState() + rng.seed(seed) + remaining = b + res = np.zeros(b) + finished = 0 + block_size = int(2 ** 20.0 * MAX_MEMORY_SIZE / (8.0 * n)) + for j in xrange(0, b, block_size): + if block_size < remaining: + count = block_size + else: + count = remaining + st = finished + en = finished + count + res[st:en] = dfgsl_simulation(n, deterministic, count, rng) + finished += count + remaining -= count + + return res + + +def dfgsl_simulation(n, deterministic, b, rng=None): + """ + Simulates the empirical distribution of the DFGLS test statistic + """ + if rng is None: + np.random.seed(0) + from numpy.random import standard_normal + else: + standard_normal = rng.standard_normal + + nobs = n + if deterministic == 'c': + c = -7.0 + z = ones((nobs, 1)) + else: + c = -13.5 + z = vstack((ones(nobs), arange(1, nobs + 1))).T + + ct = c / nobs + + delta_z = np.copy(z) + delta_z[1:, :] = delta_z[1:, :] - (1 + ct) * delta_z[:-1, :] + delta_z_inv = pinv(delta_z) 
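+    # GLS detrending in the spirit of Elliott, Rothenberg and Stock (1996):
+    # the simulated random walk (with a 50-observation burn-in) and the
+    # deterministic terms are both quasi-differenced with the filter
+    # (1 - (1 + c/T) L), the detrending coefficients are estimated by least
+    # squares on the quasi-differenced data, and the fitted deterministics
+    # are removed from the levels before the Dickey-Fuller regression.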
+ y = standard_normal((n + 50, b)) + y = cumsum(y, axis=0) + y = y[50:, :] + delta_y = y.copy() + delta_y[1:, :] = delta_y[1:, :] - (1 + ct) * delta_y[:-1, :] + detrend_coef = delta_z_inv.dot(delta_y) + y_detrended = y - z.dot(detrend_coef) + + delta_y_detrended = diff(y_detrended, axis=0) + rhs = y_detrended[:-1, :] + lhs = delta_y_detrended + + xpy = sum(rhs * lhs, 0) + xpx = sum(rhs ** 2.0, 0) + gamma = xpy / xpx + e = lhs - rhs * gamma + sigma2 = sum(e ** 2.0, axis=0) / (n - 1) # DOF correction? + gamma_var = sigma2 / xpx + + stat = gamma / sqrt(gamma_var) + return stat + + +if __name__ == '__main__': + deterministics = ('c', 'ct') + ex_num = 500 + ex_size = 200000 + T = np.array( + (20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 180, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, + 1000, 1200, 1400, 2000)) + T = T[::-1] + percentiles = list(np.arange(0.5, 100.0, 0.5)) + seeds = np.arange(0, 2 ** 32, step=2 ** 23) + for d in deterministics: + results = np.zeros((len(percentiles), len(T), ex_num)) + + for i in xrange(ex_num): + print "Experiment Number {0}".format(i + 1) + now = datetime.datetime.now() + parallel, p_func, n_jobs = parallel_func(wrapper, + n_jobs=NUM_JOBS, + verbose=2) + out = parallel(p_func(t, d, ex_size, seed=seeds[i]) for t in T) + q = lambda x: np.percentile(x, percentiles) + quantiles = map(q, out) + results[:, :, i] = np.array(quantiles).T + print datetime.datetime.now() - now + + np.savez('dfgls_' + d + '.npz', + deterministic=d, + results=results, + percentiles=percentiles, + T=T) \ No newline at end of file diff --git a/statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py b/statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py new file mode 100644 index 00000000000..827ca204632 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/dfgls_simulation_process.py @@ -0,0 +1,118 @@ +import numpy as np +from scipy.stats import norm + +from statsmodels.regression.linear_model import OLS, WLS + + +trends = ('c', 'ct') +critical_values = (1.0, 5.0, 10.0) +dfgls_cv_approx = {} +for t in trends: + print t + data = np.load('dfgls_' + t + '.npz') + percentiles = data['percentiles'] + deterministic = data['deterministic'] + results = data['results'] + # T = data['T'] + data.close() + + # Remove later + T = np.array( + (20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 180, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, + 1000, 1200, 1400, 2000)) + T = T[::-1] + + # For percentiles 1, 5 and 10, regress on a constant, and powers of 1/T + out = [] + for cv in critical_values: + num_ex = results.shape[2] + loc = np.where(percentiles == cv)[0][0] + lhs = np.squeeze(results[loc, :, :]) + # Adjust for effective sample size, this is what lookup the code uses + tau = np.ones((num_ex, 1)).dot(T[None, :]) - 1.0 + tau = tau.T + lhs = lhs.ravel() + tau = tau.ravel() + tau = tau[:, None] + n = lhs.shape[0] + rhs = np.ones((n, 1)) + rhs = np.hstack((rhs, 1.0 / tau)) + rhs = np.hstack((rhs, (1.0 / tau) ** 2.0)) + rhs = np.hstack((rhs, (1.0 / tau) ** 3.0)) + res = OLS(lhs, rhs).fit() + out.append(res.params) + + dfgls_cv_approx[t] = np.array(out) + +trends = ('c', 'ct') +dfgls_large_p = {} +dfgls_small_p = {} +dfgls_tau_star = {} +dfgls_tau_max = {} +dfgls_tau_min = {} +for t in trends: + data = np.load('dfgls_' + t + '.npz') + percentiles = data['percentiles'] + results = data['results'] # Remove later + # LHS is norm cdf inv of percentiles + lhs = norm().ppf(percentiles / 100.0) + lhs_large = lhs 
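+    # Response-surface fit in the spirit of MacKinnon (1994): the inverse
+    # normal CDF of each simulated percentile is regressed on a polynomial
+    # (cubic here, quadratic for the small-p fit below) in the average
+    # simulated statistic at that percentile, using WLS with weights equal
+    # to the inverse of the cross-experiment standard deviation.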
+ # RHS is made up of avg test stats for largest T, which is in pos 1 + avg_test_stats = results[:, 1, :].mean(axis=1) + avg_test_std = results[:, 1, :].std(axis=1) + avg_test_stats = avg_test_stats[:, None] + m = lhs.shape[0] + rhs = np.ones((m, 1)) + rhs = np.hstack((rhs, avg_test_stats)) + rhs = np.hstack((rhs, avg_test_stats ** 2.0)) + rhs = np.hstack((rhs, avg_test_stats ** 3.0)) + rhs_large = rhs + res_large = WLS(lhs, rhs, weights=1.0 / avg_test_std).fit() + dfgls_large_p[t] = res_large.params + # Compute tau_max, by finding the func maximum + p = res_large.params + poly_roots = np.roots(np.array([3, 2, 1.0]) * p[:0:-1]) + dfgls_tau_max[t] = float(np.squeeze(np.real(np.max(poly_roots)))) + + # Small p regression using only p<=15% + cutoff = np.where(percentiles <= 15.0)[0] + avg_test_stats = results[cutoff, 1, :].mean(axis=1) + avg_test_std = results[cutoff, 1, :].std(axis=1) + avg_test_stats = avg_test_stats[:, None] + lhs = lhs[cutoff] + m = lhs.shape[0] + rhs = np.ones((m, 1)) + rhs = np.hstack((rhs, avg_test_stats)) + rhs = np.hstack((rhs, avg_test_stats ** 2.0)) + res_small = WLS(lhs, rhs, weights=1.0 / avg_test_std).fit() + dfgls_small_p[t] = res_small.params + + # Compute tau star + err_large = res_large.resid + # Missing 1 parameter here, replace with 0 + params = np.append(res_small.params, 0.0) + err_small = lhs_large - rhs_large.dot(params) + # Find the location that minimizes the total absolute error + m = lhs_large.shape[0] + abs_err = np.zeros((m, 1)) + for i in xrange(m): + abs_err[i] = np.abs(err_large[i:]).sum() + np.abs(err_small[:i]).sum() + loc = np.argmin(abs_err) + dfgls_tau_star[t] = rhs_large[loc, 1] + # Compute tau min + dfgls_tau_min[t] = -params[1] / (2 * params[2]) + +print 'from numpy import array' +print '' +print 'dfgls_cv_approx = ' + str(dfgls_cv_approx) +print '' +print 'dfgls_tau_max = ' + str(dfgls_tau_max) +print '' +print 'dfgls_tau_min = ' + str(dfgls_tau_min) +print '' +print 'dfgls_tau_star = ' + str(dfgls_tau_star) +print '' +print 'dfgls_large_p = ' + str(dfgls_large_p) +print '' +print 'dfgls_small_p = ' + str(dfgls_small_p) diff --git a/statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py b/statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py new file mode 100644 index 00000000000..f97e31ddd69 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/kpss_critical_values_simulation.py @@ -0,0 +1,98 @@ +""" +Calculates quantiles of the KPSS test statistic for both the constant +and constant plus trend scenarios. 
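+
+The statistic computed is the standard KPSS statistic: the squared partial
+sums of the regression residuals are summed, normalized by T**2, and divided
+by the mean of the squared residuals, which stands in for the long-run
+variance estimate (no kernel correction is needed because the simulated
+errors are white noise).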
+""" +from __future__ import division +import os +from numpy.random import RandomState +import numpy as np +import pandas as pd +from statsmodels.tsa.tsatools import add_trend + + +def simulate_kpss(nobs, B, deterministic='c', rng=None): + if rng is None: + rng = RandomState() + rng.seed(0) + + standard_normal = rng.standard_normal + + e = standard_normal((nobs, B)) + z = np.ones((nobs, 1)) + if deterministic == 'ct': + z = add_trend(z, trend='t') + zinv = np.linalg.pinv(z) + trend_coef = zinv.dot(e) + resid = e - z.dot(trend_coef) + s = np.cumsum(resid, axis=0) + lam = np.mean(resid ** 2.0, axis=0) + kpss = 1 / (nobs ** 2.0) * np.sum(s ** 2.0, axis=0) / lam + return kpss + + +def block_simulate_kpss(nobs, b, deterministic='c', max_memory=250): + rng = RandomState() + rng.seed(0) + memory = max_memory * 2 ** 20 + b_max_memory = memory // 8 // nobs + b_max_memory = max(b_max_memory, 1) + remaining = b + results = np.zeros(b) + while remaining > 0: + b_eff = min(remaining, b_max_memory) + completed = b - remaining + results[completed:completed + b_eff] = \ + simulate_kpss(nobs, b_eff, deterministic=deterministic, rng=rng) + remaining -= b_max_memory + + return results + + +if __name__ == '__main__': + import datetime as dt + import cStringIO as cio + + nobs = 2000 + B = 10 * 10e6 + + percentiles = np.concatenate((np.arange(0.0, 99.0, 0.5), + np.arange(99.0, 100.0, 0.1))) + + critical_values = 100 - percentiles + critical_values_string = map(lambda x: '{0:0.1f}'.format(x), critical_values) + sio = cio.StringIO() + sio.write("kpss_critical_values = {}\n") + + hdf_filename = 'kpss_critical_values.h5' + try: + os.remove(hdf_filename) + except OSError: + pass + + for d in ('c', 'ct'): + now = dt.datetime.now() + kpss = block_simulate_kpss(nobs, B, deterministic=d) + print dt.datetime.now() - now + quantiles = np.percentile(kpss, list(percentiles)) + df = pd.DataFrame(quantiles, index=critical_values,columns=[d]) + df.to_hdf(hdf_filename, key=d, mode='a') + quantiles = map(lambda x: '{0:0.5f}'.format(x), quantiles) + + sio.write("kpss_critical_values['" + d + "'] = (") + count = 0 + for c, q in zip(critical_values_string, quantiles): + sio.write('(' + c + ', ' + q + ')') + count += 1 + if count % 3 == 0: + sio.write(',\n ') + else: + sio.write(', ') + sio.write(")") + sio.write("\n") + + sio.seek(0) + print sio.read() + + + + diff --git a/statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py b/statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py new file mode 100644 index 00000000000..aa1ff63e475 --- /dev/null +++ b/statsmodels/tsa/critical_values/simulation/kpss_simulation_process.py @@ -0,0 +1,87 @@ +import cStringIO as cio + +import numpy as np +import pandas as pd + + +sio = cio.StringIO() +c = pd.read_hdf('kpss_critical_values.h5', 'c') +ct = pd.read_hdf('kpss_critical_values.h5', 'ct') + +data = {'c': c, 'ct': ct} +for k, v in data.iteritems(): + n = v.shape[0] + selected = np.zeros((n, 1), dtype=np.bool) + selected[0] = True + selected[-1] = True + selected[v.index == 10.0] = True + selected[v.index == 5.0] = True + selected[v.index == 2.5] = True + selected[v.index == 1.0] = True + max_diff = 1.0 + while max_diff > 0.05: + xp = np.squeeze(v[selected].values) + yp = np.asarray(v[selected].index, dtype=np.float64) + x = np.squeeze(v.values) + y = np.asarray(v.index, dtype=np.float64) + yi = np.interp(x, xp, yp) + abs_diff = np.abs(y - yi) + max_diff = np.max(abs_diff) + if max_diff > 0.05: + print selected.sum() + print np.where(abs_diff == max_diff) + 
selected[np.where(abs_diff == max_diff)] = True + + quantiles = list(np.squeeze(v[selected].index.values)) + critical_values = list(np.squeeze(v[selected].values)) + # Fix for first CV + critical_values[0] = 0.0 + sio.write("kpss_critical_values['" + k + "'] = (") + count = 0 + for c, q in zip(critical_values, quantiles): + sio.write( + '(' + '{0:0.1f}'.format(q) + ', ' + '{0:0.4f}'.format(c) + ')') + count += 1 + if count % 3 == 0: + sio.write(',\n ') + else: + sio.write(', ') + sio.write(")\n") + sio.write("kpss_critical_values['" + k + "'] = ") + sio.write("np.array(kpss_critical_values['" + k + "'])") + sio.write("\n") + +sio.seek(0) +print sio.read() + + + + + +kpss_critical_values = {} +kpss_critical_values['c'] = ((100.0, 0.0091), (99.5, 0.0218), (99.0, 0.0248), + (97.5, 0.0304), (95.5, 0.0354), (92.5, 0.0416), + (89.5, 0.0470), (73.5, 0.0726), (69.0, 0.0802), + (63.5, 0.0900), (58.5, 0.0999), (53.5, 0.1108), + (48.5, 0.1229), (43.0, 0.1376), (39.5, 0.1483), + (35.5, 0.1622), (31.5, 0.1780), (29.5, 0.1867), + (26.5, 0.2017), (24.5, 0.2127), (20.5, 0.2378), + (18.5, 0.2524), (16.0, 0.2738), (13.5, 0.2993), + (12.0, 0.3183), (9.5, 0.3549), (8.0, 0.3833), + (6.5, 0.4173), (4.5, 0.4793), (3.5, 0.5237), + (2.5, 0.5828), (1.5, 0.6734), (0.7, 0.8157), + (0.3, 0.9674), (0.1, 1.1692), ) +kpss_critical_values['c'] = np.array(kpss_critical_values['c']) +kpss_critical_values['ct'] = ((100.0, 0.0077), (99.5, 0.0156), (99.0, 0.0173), + (98.0, 0.0194), (97.0, 0.0210), (95.5, 0.0229), + (94.0, 0.0245), (91.0, 0.0271), (88.0, 0.0295), + (85.5, 0.0313), (80.5, 0.0347), (67.0, 0.0434), + (60.0, 0.0480), (50.0, 0.0555), (46.5, 0.0584), + (40.0, 0.0643), (35.5, 0.0691), (32.0, 0.0731), + (29.0, 0.0769), (25.5, 0.0817), (21.0, 0.0894), + (18.5, 0.0943), (17.0, 0.0978), (15.5, 0.1015), + (13.0, 0.1088), (11.0, 0.1153), (8.5, 0.1257), + (7.0, 0.1337), (6.0, 0.1401), (4.0, 0.1570), + (2.5, 0.1770), (1.5, 0.1990), (0.7, 0.2350), + (0.3, 0.2738), (0.1, 0.3231), ) +kpss_critical_values['ct'] = np.array(kpss_critical_values['ct']) \ No newline at end of file diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py index 02f4e8d1469..37750222d21 100644 --- a/statsmodels/tsa/stattools.py +++ b/statsmodels/tsa/stattools.py @@ -1,19 +1,17 @@ """ Statistical tools for time series analysis """ - +from __future__ import division import numpy as np from numpy.linalg import LinAlgError from scipy import stats from statsmodels.regression.linear_model import OLS, yule_walker from statsmodels.tools.tools import add_constant, Bunch from tsatools import lagmat, lagmat2ds, add_trend -from adfvalues import mackinnonp, mackinnoncrit from statsmodels.tsa.arima_model import ARMA __all__ = ['acovf', 'acf', 'pacf', 'pacf_yw', 'pacf_ols', 'ccovf', 'ccf', - 'periodogram', 'q_stat', 'coint', 'arma_order_select_ic', - 'adfuller'] + 'periodogram', 'q_stat', 'arma_order_select_ic', 'cov_nw'] #NOTE: now in two places to avoid circular import @@ -23,6 +21,56 @@ def __str__(self): return self._str # pylint: disable=E1101 +def cov_nw(y, lags=0, demean=True, axis=0): + """ + Computes Newey-West covariance for 1-d and 2-d arrays + + Parameters + ---------- + y : array-like, 1d or 2d + Values to use when computing the Newey-West covariance estimator. + When u is 2d, default behavior is to treat columns as variables and + rows as observations. + lags : int, non-negative + Number of lags to include in the Newey-West covariance estimator + demean : bool + Indicates whether to subtract the mean. 
Default is True + axis : int, (0, 1) + The axis to use when y is 2d + + Returns + ------- + cov : array + The estimated covariance + + """ + z = y + is_1d = False + if axis > z.ndim: + raise ValueError('axis must be less than the dimension of y') + if z.ndim == 1: + is_1d = True + z = z[:, None] + if axis == 1: + z = z.T + n = z.shape[0] + if lags > n: + error = 'lags must be weakly smaller than the number of observations' + raise ValueError(error) + + if demean: + z = z - z.mean(0) + cov = z.T.dot(z) + for j in xrange(1, lags + 1): + w = (1 - j / (lags + 1)) + gamma = z[j:].T.dot(z[:-j]) + cov += w * (gamma + gamma.T) + cov = cov / n + if is_1d: + cov = float(cov) + return cov + + def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), fitargs=(), regresults=False): """ @@ -50,7 +98,7 @@ def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), icbest : float Best information criteria. bestlag : int - The lag length that maximizes the information criterion. + The lag length that minimizes the information criterion. Notes @@ -92,195 +140,6 @@ def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), return icbest, bestlag, results -#this needs to be converted to a class like HetGoldfeldQuandt, -# 3 different returns are a mess -# See: -#Ng and Perron(2001), Lag length selection and the construction of unit root -#tests with good size and power, Econometrica, Vol 69 (6) pp 1519-1554 -#TODO: include drift keyword, only valid with regression == "c" -# just changes the distribution of the test statistic to a t distribution -#TODO: autolag is untested -def adfuller(x, maxlag=None, regression="c", autolag='AIC', - store=False, regresults=False): - ''' - Augmented Dickey-Fuller unit root test - - The Augmented Dickey-Fuller test can be used to test for a unit root in a - univariate process in the presence of serial correlation. - - Parameters - ---------- - x : array_like, 1d - data series - maxlag : int - Maximum lag which is included in test, default 12*(nobs/100)^{1/4} - regression : str {'c','ct','ctt','nc'} - Constant and trend order to include in regression - * 'c' : constant only (default) - * 'ct' : constant and trend - * 'ctt' : constant, and linear and quadratic trend - * 'nc' : no constant, no trend - autolag : {'AIC', 'BIC', 't-stat', None} - * if None, then maxlag lags are used - * if 'AIC' (default) or 'BIC', then the number of lags is chosen - to minimize the corresponding information criterium - * 't-stat' based choice of maxlag. Starts with maxlag and drops a - lag until the t-statistic on the last lag length is significant at - the 95 % level. - store : bool - If True, then a result instance is returned additionally to - the adf statistic (default is False) - regresults : bool - If True, the full regression results are returned (default is False) - - Returns - ------- - adf : float - Test statistic - pvalue : float - MacKinnon's approximate p-value based on MacKinnon (1994) - usedlag : int - Number of lags used. - nobs : int - Number of observations used for the ADF regression and calculation of - the critical values. - critical values : dict - Critical values for the test statistic at the 1 %, 5 %, and 10 % - levels. Based on MacKinnon (2010) - icbest : float - The maximized information criterion if autolag is not None. 
- regresults : RegressionResults instance - The - resstore : (optional) instance of ResultStore - an instance of a dummy class with results attached as attributes - - Notes - ----- - The null hypothesis of the Augmented Dickey-Fuller is that there is a unit - root, with the alternative that there is no unit root. If the pvalue is - above a critical size, then we cannot reject that there is a unit root. - - The p-values are obtained through regression surface approximation from - MacKinnon 1994, but using the updated 2010 tables. - If the p-value is close to significant, then the critical values should be - used to judge whether to accept or reject the null. - - The autolag option and maxlag for it are described in Greene. - - Examples - -------- - see example script - - References - ---------- - Greene - Hamilton - - - P-Values (regression surface approximation) - MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for - unit-root and cointegration tests. `Journal of Business and Economic - Statistics` 12, 167-76. - - Critical values - MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's - University, Dept of Economics, Working Papers. Available at - http://ideas.repec.org/p/qed/wpaper/1227.html - - ''' - - if regresults: - store = True - - trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'} - if regression is None or isinstance(regression, int): - regression = trenddict[regression] - regression = regression.lower() - if regression not in ['c', 'nc', 'ct', 'ctt']: - raise ValueError("regression option %s not understood") % regression - x = np.asarray(x) - nobs = x.shape[0] - - if maxlag is None: - #from Greene referencing Schwert 1989 - maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.))) - - xdiff = np.diff(x) - xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in') - nobs = xdall.shape[0] # pylint: disable=E1103 - - xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x - xdshort = xdiff[-nobs:] - - if store: - resstore = ResultsStore() - if autolag: - if regression != 'nc': - fullRHS = add_trend(xdall, regression, prepend=True) - else: - fullRHS = xdall - startlag = fullRHS.shape[1] - xdall.shape[1] + 1 # 1 for level # pylint: disable=E1103 - #search for lag length with smallest information criteria - #Note: use the same number of observations to have comparable IC - #aic and bic: smaller is better - - if not regresults: - icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, - maxlag, autolag) - else: - icbest, bestlag, alres = _autolag(OLS, xdshort, fullRHS, startlag, - maxlag, autolag, - regresults=regresults) - resstore.autolag_results = alres - - bestlag -= startlag # convert to lag not column index - - #rerun ols with best autolag - xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in') - nobs = xdall.shape[0] # pylint: disable=E1103 - xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x - xdshort = xdiff[-nobs:] - usedlag = bestlag - else: - usedlag = maxlag - icbest = None - if regression != 'nc': - resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1], - regression)).fit() - else: - resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit() - - adfstat = resols.tvalues[0] -# adfstat = (resols.params[0]-1.0)/resols.bse[0] - # the "asymptotically correct" z statistic is obtained as - # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1) - # I think this is the statistic that is used for series that are integrated - # for orders higher than I(1), ie., not ADF but 
cointegration tests. - - # Get approx p-value and critical values - pvalue = mackinnonp(adfstat, regression=regression, N=1) - critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs) - critvalues = {"1%" : critvalues[0], "5%" : critvalues[1], - "10%" : critvalues[2]} - if store: - resstore.resols = resols - resstore.maxlag = maxlag - resstore.usedlag = usedlag - resstore.adfstat = adfstat - resstore.critvalues = critvalues - resstore.nobs = nobs - resstore.H0 = ("The coefficient on the lagged level equals 1 - " - "unit root") - resstore.HA = "The coefficient on the lagged level < 1 - stationary" - resstore.icbest = icbest - return adfstat, pvalue, critvalues, resstore - else: - if not autolag: - return adfstat, pvalue, usedlag, nobs, critvalues - else: - return adfstat, pvalue, usedlag, nobs, critvalues, icbest - - def acovf(x, unbiased=False, demean=True, fft=False): ''' Autocovariance for 1D @@ -884,66 +743,6 @@ def grangercausalitytests(x, maxlag, addconst=True, verbose=True): return resli -def coint(y1, y2, regression="c"): - """ - This is a simple cointegration test. Uses unit-root test on residuals to - test for cointegrated relationship - - See Hamilton (1994) 19.2 - - Parameters - ---------- - y1 : array_like, 1d - first element in cointegrating vector - y2 : array_like - remaining elements in cointegrating vector - c : str {'c'} - Included in regression - * 'c' : Constant - - Returns - ------- - coint_t : float - t-statistic of unit-root test on residuals - pvalue : float - MacKinnon's approximate p-value based on MacKinnon (1994) - crit_value : dict - Critical values for the test statistic at the 1 %, 5 %, and 10 % - levels. - - Notes - ----- - The Null hypothesis is that there is no cointegration, the alternative - hypothesis is that there is cointegrating relationship. If the pvalue is - small, below a critical size, then we can reject the hypothesis that there - is no cointegrating relationship. - - P-values are obtained through regression surface approximation from - MacKinnon 1994. - - References - ---------- - MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for - unit-root and cointegration tests. `Journal of Business and Economic - Statistics` 12, 167-76. 
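The coint function removed here (now in cointegration.py) follows the standard Engle-Granger two-step recipe: regress y1 on y2 plus a constant, then run a Dickey-Fuller-type regression on the stage-one residuals. Below is a rough sketch of that recipe using only the public OLS interface; the function name is illustrative and this is not the relocated implementation itself.

    import statsmodels.api as sm

    def engle_granger_stat(y1, y2):
        # Stage 1: cointegrating regression of y1 on y2 and a constant.
        resid = sm.OLS(y1, sm.add_constant(y2, prepend=False)).fit().resid
        # Stage 2: regress resid_t on resid_{t-1} and a constant.
        rhs = sm.add_constant(resid[:-1], prepend=False)
        res = sm.OLS(resid[1:], rhs).fit()
        # t-like statistic for the unit-root null on the residuals.
        return (res.params[0] - 1.0) / res.bse[0]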
- - """ - regression = regression.lower() - if regression not in ['c', 'nc', 'ct', 'ctt']: - raise ValueError("regression option %s not understood") % regression - y1 = np.asarray(y1) - y2 = np.asarray(y2) - if regression == 'c': - y2 = add_constant(y2, prepend=False) - st1_resid = OLS(y1, y2).fit().resid # stage one residuals - lgresid_cons = add_constant(st1_resid[0:-1], prepend=False) - uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit() - coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0] - pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None) - crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1)) - return coint_t, pvalue, crit_value - - def _safe_arma_fit(y, order, model_kw, trend, fit_kw, start_params=None): try: return ARMA(y, order=order, **model_kw).fit(disp=0, trend=trend, diff --git a/statsmodels/tsa/tests/test_adfuller_lag.py b/statsmodels/tsa/tests/test_adfuller_lag.py deleted file mode 100644 index 9b77633e1d6..00000000000 --- a/statsmodels/tsa/tests/test_adfuller_lag.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -"""Test for autolag of adfuller, unitroot_adf - -Created on Wed May 30 21:39:46 2012 -Author: Josef Perktold -""" - -import numpy as np -from numpy.testing import assert_equal, assert_almost_equal -import statsmodels.tsa.stattools as tsast -from statsmodels.datasets import macrodata - -def test_adf_autolag(): - #see issue #246 - #this is mostly a unit test - d2 = macrodata.load().data - - for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']): - #[None:'nc', 0:'c', 1:'ct', 2:'ctt'] - x = np.log(d2['realgdp']) - xd = np.diff(x) - - #check exog - adf3 = tsast.adfuller(x, maxlag=None, autolag='aic', - regression=tr, store=True, regresults=True) - st2 = adf3[-1] - - assert_equal(len(st2.autolag_results), 15 + 1) #+1 for lagged level - for l, res in sorted(st2.autolag_results.iteritems())[:5]: - lag = l-k_trend - #assert correct design matrices in _autolag - assert_equal(res.model.exog[-10:,k_trend], x[-11:-1]) - assert_equal(res.model.exog[-1,k_trend+1:], xd[-lag:-1][::-1]) - #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend - assert_equal(st2.usedlag, 2) - - #same result with lag fixed at usedlag of autolag - adf2 = tsast.adfuller(x, maxlag=2, autolag=None, regression=tr) - assert_almost_equal(adf3[:2], adf2[:2], decimal=12) - - - tr = 'c' - #check maxlag with autolag - adf3 = tsast.adfuller(x, maxlag=5, autolag='aic', - regression=tr, store=True, regresults=True) - assert_equal(len(adf3[-1].autolag_results), 5 + 1) - adf3 = tsast.adfuller(x, maxlag=0, autolag='aic', - regression=tr, store=True, regresults=True) - assert_equal(len(adf3[-1].autolag_results), 0 + 1) diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py index 775af8ec8f0..069bd1510e1 100644 --- a/statsmodels/tsa/tests/test_stattools.py +++ b/statsmodels/tsa/tests/test_stattools.py @@ -1,12 +1,12 @@ -from statsmodels.tsa.stattools import (adfuller, acf, pacf_ols, pacf_yw, - pacf, grangercausalitytests, - coint, acovf, - arma_order_select_ic) +from statsmodels.tsa.stattools import (acf, pacf_yw, pacf, + grangercausalitytests, acovf, + arma_order_select_ic, cov_nw) +from statsmodels.tsa.cointegration import coint from statsmodels.tsa.base.datetools import dates_from_range import numpy as np from numpy.testing import (assert_almost_equal, assert_equal, assert_raises, dec, assert_) -from numpy import genfromtxt#, concatenate +from numpy import genfromtxt, log, diff from statsmodels.datasets import macrodata, sunspots 
from pandas import Series, Index, DataFrame import os @@ -20,92 +20,6 @@ DECIMAL_2 = 2 DECIMAL_1 = 1 -class CheckADF(object): - """ - Test Augmented Dickey-Fuller - - Test values taken from Stata. - """ - levels = ['1%', '5%', '10%'] - data = macrodata.load() - x = data.data['realgdp'] - y = data.data['infl'] - - def test_teststat(self): - assert_almost_equal(self.res1[0], self.teststat, DECIMAL_5) - - def test_pvalue(self): - assert_almost_equal(self.res1[1], self.pvalue, DECIMAL_5) - - def test_critvalues(self): - critvalues = [self.res1[4][lev] for lev in self.levels] - assert_almost_equal(critvalues, self.critvalues, DECIMAL_2) - -class TestADFConstant(CheckADF): - """ - Dickey-Fuller test for unit root - """ - def __init__(self): - self.res1 = adfuller(self.x, regression="c", autolag=None, - maxlag=4) - self.teststat = .97505319 - self.pvalue = .99399563 - self.critvalues = [-3.476, -2.883, -2.573] - -class TestADFConstantTrend(CheckADF): - """ - """ - def __init__(self): - self.res1 = adfuller(self.x, regression="ct", autolag=None, - maxlag=4) - self.teststat = -1.8566374 - self.pvalue = .67682968 - self.critvalues = [-4.007, -3.437, -3.137] - -#class TestADFConstantTrendSquared(CheckADF): -# """ -# """ -# pass -#TODO: get test values from R? - -class TestADFNoConstant(CheckADF): - """ - """ - def __init__(self): - self.res1 = adfuller(self.x, regression="nc", autolag=None, - maxlag=4) - self.teststat = 3.5227498 - self.pvalue = .99999 # Stata does not return a p-value for noconstant. - # Tau^max in MacKinnon (1994) is missing, so it is - # assumed that its right-tail is well-behaved - self.critvalues = [-2.587, -1.950, -1.617] - -# No Unit Root - -class TestADFConstant2(CheckADF): - def __init__(self): - self.res1 = adfuller(self.y, regression="c", autolag=None, - maxlag=1) - self.teststat = -4.3346988 - self.pvalue = .00038661 - self.critvalues = [-3.476, -2.883, -2.573] - -class TestADFConstantTrend2(CheckADF): - def __init__(self): - self.res1 = adfuller(self.y, regression="ct", autolag=None, - maxlag=1) - self.teststat = -4.425093 - self.pvalue = .00199633 - self.critvalues = [-4.006, -3.437, -3.137] - -class TestADFNoConstant2(CheckADF): - def __init__(self): - self.res1 = adfuller(self.y, regression="nc", autolag=None, - maxlag=1) - self.teststat = -2.4511596 - self.pvalue = 0.013747 # Stata does not return a p-value for noconstant - # this value is just taken from our results - self.critvalues = [-2.587,-1.950,-1.617] class CheckCorrGram(object): """ @@ -321,6 +235,43 @@ def test_arma_order_select_ic_failure(): res = arma_order_select_ic(y) +class TestVarNW(object): + @classmethod + def setupClass(cls): + from statsmodels.datasets.macrodata import load + + cls.cpi = log(load().data['cpi']) + cls.inflation = diff(cls.cpi) + + def test_cov_nw(self): + y = self.inflation + simple_cov = cov_nw(y, lags=0) + e = y - y.mean() + assert_almost_equal(e.dot(e) / e.shape[0], simple_cov) + + def test_cov_nw_no_demean(self): + y = self.inflation + simple_cov = cov_nw(y, lags=0, demean=False) + assert_almost_equal(y.dot(y) / y.shape[0], simple_cov) + + def test_cov_nw_2d(self): + y = np.random.randn(100, 2) + simple_cov = cov_nw(y, lags=0) + e = y - y.mean(0) + assert_almost_equal(e.T.dot(e) / e.shape[0], simple_cov) + + def test_cov_nw_2d_2lags(self): + y = np.random.randn(100, 2) + e = y - y.mean(0) + gamma_0 = e.T.dot(e) + gamma_1 = e[1:].T.dot(e[:-1]) + gamma_2 = e[2:].T.dot(e[:-2]) + w1, w2 = 1.0 - (1.0 / 3.0), 1.0 - (2.0 / 3.0) + expected = (gamma_0 + w1 * (gamma_1 + gamma_1.T) + w2 * 
( + gamma_2 + gamma_2.T)) / 100.0 + assert_almost_equal(cov_nw(y, lags=2), expected) + + if __name__=="__main__": import nose # nose.runmodule(argv=[__file__, '-vvs','-x','-pdb'], exit=False) diff --git a/statsmodels/tsa/tests/test_tsa_tools.py b/statsmodels/tsa/tests/test_tsa_tools.py index c8b7fc73873..ec7a19568dc 100644 --- a/statsmodels/tsa/tests/test_tsa_tools.py +++ b/statsmodels/tsa/tests/test_tsa_tools.py @@ -2,26 +2,51 @@ ''' +import warnings + import numpy as np -from numpy.testing import assert_array_almost_equal, assert_equal +from numpy.random import randn +from numpy.testing import assert_array_almost_equal, assert_equal, assert_raises +import pandas as pd +from nose import SkipTest +from nose.tools import nottest +from pandas.util.testing import assert_frame_equal, assert_produces_warning + +import statsmodels.tsa.tsatools as tools +from statsmodels.tsa.tsatools import vec, vech, reintegrate, unvec import statsmodels.api as sm import statsmodels.tsa.stattools as tsa -import statsmodels.tsa.tsatools as tools -from statsmodels.tsa.tsatools import vec, vech +from statsmodels.regression.linear_model import OLS +from statsmodels.tsa.tests.results import savedrvs +from statsmodels.tsa.tests.results.datamlw_tls import mlacf, mlccf, mlpacf, \ + mlywar -from results import savedrvs -from results.datamlw_tls import mlacf, mlccf, mlpacf, mlywar xo = savedrvs.rvsdata.xar2 -x100 = xo[-100:]/1000. -x1000 = xo/1000. +x100 = xo[-100:] / 1000. +x1000 = xo / 1000. + +pdversion = pd.version.version.split('.') +pdmajor = int(pdversion[0]) +pdminor = int(pdversion[1]) + +macro_data = sm.datasets.macrodata.load().data[['year', + 'quarter', + 'realgdp', + 'cpi']] + + +def skip_if_early_pandas(): + if pdmajor == 0 and pdminor <= 12: + raise SkipTest("known failure of test on early pandas") def test_acf(): acf_x = tsa.acf(x100, unbiased=False)[:21] - assert_array_almost_equal(mlacf.acf100.ravel(), acf_x, 8) #why only dec=8 + assert_array_almost_equal(mlacf.acf100.ravel(), acf_x, 8) # why only dec=8 acf_x = tsa.acf(x1000, unbiased=False)[:21] - assert_array_almost_equal(mlacf.acf1000.ravel(), acf_x, 8) #why only dec=9 + assert_array_almost_equal(mlacf.acf1000.ravel(), acf_x, 8) # why only dec=9 + def test_ccf(): ccf_x = tsa.ccf(x100[4:], x100[:-4], unbiased=False)[:21] @@ -29,6 +54,7 @@ def test_ccf(): ccf_x = tsa.ccf(x1000[4:], x1000[:-4], unbiased=False)[:21] assert_array_almost_equal(mlccf.ccf1000.ravel()[:21][::-1], ccf_x, 8) + def test_pacf_yw(): pacfyw = tsa.pacf_yw(x100, 20, method='mle') assert_array_almost_equal(mlpacf.pacf100.ravel(), pacfyw, 1) @@ -36,6 +62,8 @@ def test_pacf_yw(): assert_array_almost_equal(mlpacf.pacf1000.ravel(), pacfyw, 2) #assert False + +@nottest def test_pacf_ols(): pacfols = tsa.pacf_ols(x100, 20) assert_array_almost_equal(mlpacf.pacf100.ravel(), pacfols, 8) @@ -43,112 +71,154 @@ def test_pacf_ols(): assert_array_almost_equal(mlpacf.pacf1000.ravel(), pacfols, 8) #assert False + def test_ywcoef(): - assert_array_almost_equal(mlywar.arcoef100[1:], - -sm.regression.yule_walker(x100, 10, method='mle')[0], 8) - assert_array_almost_equal(mlywar.arcoef1000[1:], - -sm.regression.yule_walker(x1000, 20, method='mle')[0], 8) + yw = -sm.regression.yule_walker(x100, 10, method='mle')[0] + ar_coef = mlywar.arcoef100[1:] + assert_array_almost_equal(ar_coef, yw, 8) + yw = -sm.regression.yule_walker(x1000, 20, method='mle')[0] + ar_coef = mlywar.arcoef1000[1:] + assert_array_almost_equal(yw, ar_coef, 8) + def test_duplication_matrix(): for k in range(2, 10): - m = 
tools.unvech(np.random.randn(k * (k + 1) / 2)) - Dk = tools.duplication_matrix(k) - assert(np.array_equal(vec(m), np.dot(Dk, vech(m)))) + m = tools.unvech(randn(k * (k + 1) / 2)) + duplication = tools.duplication_matrix(k) + assert_equal(vec(m), np.dot(duplication, vech(m))) + def test_elimination_matrix(): for k in range(2, 10): - m = np.random.randn(k, k) - Lk = tools.elimination_matrix(k) - assert(np.array_equal(vech(m), np.dot(Lk, vec(m)))) + m = randn(k, k) + elimination = tools.elimination_matrix(k) + assert_equal(vech(m), np.dot(elimination, vec(m))) + def test_commutation_matrix(): - m = np.random.randn(4, 3) - K = tools.commutation_matrix(4, 3) - assert(np.array_equal(vec(m.T), np.dot(K, vec(m)))) + m = randn(4, 3) + commutation = tools.commutation_matrix(4, 3) + assert_equal(vec(m.T), np.dot(commutation, vec(m))) + def test_vec(): arr = np.array([[1, 2], [3, 4]]) - assert(np.array_equal(vec(arr), [1, 3, 2, 4])) + assert_equal(vec(arr), [1, 3, 2, 4]) + def test_vech(): arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - assert(np.array_equal(vech(arr), [1, 4, 7, 5, 8, 9])) + assert_equal(vech(arr), [1, 4, 7, 5, 8, 9]) + + +def test_unvec(): + arr = np.array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]) + assert_equal(unvec(vec(arr)), arr) + + +def test_lagmat_trim(): + x = np.arange(20.0) + 1.0 + lags = 5 + base_lagmat = sm.tsa.lagmat(x, lags, trim=None) + forward_lagmat = sm.tsa.lagmat(x, lags, trim='forward') + assert_equal(base_lagmat[:-lags], forward_lagmat) + backward_lagmat = sm.tsa.lagmat(x, lags, trim='backward') + assert_equal(base_lagmat[lags:], backward_lagmat) + both_lagmat = sm.tsa.lagmat(x, lags, trim='both') + assert_equal(base_lagmat[lags:-lags], both_lagmat) + + +def test_lagmat_bad_trim(): + x = randn(10, 1) + assert_raises(ValueError, sm.tsa.lagmat, x, 1, trim='unknown') def test_add_lag_insert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:3],lagmat,nddata[3:,-1])) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :3], lagmat, nddata[3:, -1])) + data = pd.DataFrame(data) lag_data = sm.tsa.add_lag(data, 'realgdp', 3) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) + def test_add_lag_noinsert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) + data = pd.DataFrame(data) lag_data = sm.tsa.add_lag(data, 'realgdp', 3, insert=False) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) + def test_add_lag_noinsert_atend(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,-1],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, -1], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) + data = pd.DataFrame(data) lag_data = sm.tsa.add_lag(data, 'cpi', 3, insert=False) - 
assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) # should be the same as insert lag_data2 = sm.tsa.add_lag(data, 'cpi', 3, insert=True) - assert_equal(lag_data2.view((float,len(lag_data2.dtype.names))), results) + assert_equal(lag_data2.values, results) + def test_add_lag_ndarray(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:3],lagmat,nddata[3:,-1])) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :3], lagmat, nddata[3:, -1])) lag_data = sm.tsa.add_lag(nddata, 2, 3) assert_equal(lag_data, results) + def test_add_lag_noinsert_ndarray(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) lag_data = sm.tsa.add_lag(nddata, 2, 3, insert=False) assert_equal(lag_data, results) + def test_add_lag_noinsertatend_ndarray(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,-1],3,trim='Both') - results = np.column_stack((nddata[3:,:],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + lagmat = sm.tsa.lagmat(nddata[:, -1], 3, trim='Both') + results = np.column_stack((nddata[3:, :], lagmat)) lag_data = sm.tsa.add_lag(nddata, 3, 3, insert=False) assert_equal(lag_data, results) # should be the same as insert also check negative col number lag_data2 = sm.tsa.add_lag(nddata, -1, 3, insert=True) assert_equal(lag_data2, results) + def test_add_lag1d(): - data = np.random.randn(100) - lagmat = sm.tsa.lagmat(data,3,trim='Both') - results = np.column_stack((data[3:],lagmat)) + data = randn(100) + lagmat = sm.tsa.lagmat(data, 3, trim='Both') + results = np.column_stack((data[3:], lagmat)) lag_data = sm.tsa.add_lag(data, lags=3, insert=True) assert_equal(results, lag_data) # add index - data = data[:,None] - lagmat = sm.tsa.lagmat(data,3,trim='Both') # test for lagmat too - results = np.column_stack((data[3:],lagmat)) - lag_data = sm.tsa.add_lag(data,lags=3, insert=True) + data = data[:, None] + lagmat = sm.tsa.lagmat(data, 3, trim='Both') # test for lagmat too + results = np.column_stack((data[3:], lagmat)) + lag_data = sm.tsa.add_lag(data, lags=3, insert=True) assert_equal(results, lag_data) + def test_add_lag1d_drop(): - data = np.random.randn(100) - lagmat = sm.tsa.lagmat(data,3,trim='Both') + data = randn(100) + lagmat = sm.tsa.lagmat(data, 3, trim='Both') lag_data = sm.tsa.add_lag(data, lags=3, drop=True, insert=True) assert_equal(lagmat, lag_data) @@ -156,45 +226,52 @@ def test_add_lag1d_drop(): lag_data = sm.tsa.add_lag(data, lags=3, drop=True, insert=False) assert_equal(lagmat, lag_data) + def test_add_lag1d_struct(): - data = np.zeros(100, dtype=[('variable',float)]) - nddata = np.random.randn(100) + data = np.zeros(100, dtype=[('variable', float)]) + nddata = randn(100) data['variable'] = nddata - - lagmat = sm.tsa.lagmat(nddata,3,trim='Both', original='in') + data = pd.DataFrame(data) + lagmat = sm.tsa.lagmat(nddata, 3, trim='Both', original='in') lag_data = 
sm.tsa.add_lag(data, 'variable', lags=3, insert=True) - assert_equal(lagmat, lag_data.view((float,4))) + assert_equal(lagmat, lag_data.values) lag_data = sm.tsa.add_lag(data, 'variable', lags=3, insert=False) - assert_equal(lagmat, lag_data.view((float,4))) + assert_equal(lagmat, lag_data.values) lag_data = sm.tsa.add_lag(data, lags=3, insert=True) - assert_equal(lagmat, lag_data.view((float,4))) + assert_equal(lagmat, lag_data.values) + def test_add_lag_1d_drop_struct(): - data = np.zeros(100, dtype=[('variable',float)]) - nddata = np.random.randn(100) + data = np.zeros(100, dtype=[('variable', float)]) + nddata = randn(100) data['variable'] = nddata + data = pd.DataFrame(data) - lagmat = sm.tsa.lagmat(nddata,3,trim='Both') + lagmat = sm.tsa.lagmat(nddata, 3, trim='Both') lag_data = sm.tsa.add_lag(data, lags=3, drop=True) - assert_equal(lagmat, lag_data.view((float,3))) + assert_equal(lagmat, lag_data.values) + def test_add_lag_drop_insert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,:2],lagmat,nddata[3:,-1])) + data = macro_data + nddata = data.view((float, 4)) + data = pd.DataFrame(data) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, :2], lagmat, nddata[3:, -1])) lag_data = sm.tsa.add_lag(data, 'realgdp', 3, drop=True) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) + def test_add_lag_drop_noinsert(): - data = sm.datasets.macrodata.load().data[['year','quarter','realgdp','cpi']] - nddata = data.view((float,4)) - lagmat = sm.tsa.lagmat(nddata[:,2],3,trim='Both') - results = np.column_stack((nddata[3:,np.array([0,1,3])],lagmat)) + data = macro_data + nddata = data.view((float, 4)) + data = pd.DataFrame(data) + lagmat = sm.tsa.lagmat(nddata[:, 2], 3, trim='Both') + results = np.column_stack((nddata[3:, np.array([0, 1, 3])], lagmat)) lag_data = sm.tsa.add_lag(data, 'realgdp', 3, insert=False, drop=True) - assert_equal(lag_data.view((float,len(lag_data.dtype.names))), results) + assert_equal(lag_data.values, results) def test_freq_to_period(): from pandas.tseries.frequencies import to_offset @@ -204,13 +281,159 @@ def test_freq_to_period(): assert_equal(tools.freq_to_period(i), j) assert_equal(tools.freq_to_period(to_offset(i)), j) -if __name__ == '__main__': - #running them directly - # test_acf() - # test_ccf() - # test_pacf_yw() - # test_pacf_ols() - # test_ywcoef() +def test_add_lag_drop_insert_early(): + x = randn(5, 3) + x_lags_incl = sm.tsa.add_lag(x, lags=2, insert=0, drop=False) + x_lags_excl = sm.tsa.add_lag(x, lags=2, insert=0, drop=True) + assert_equal(x_lags_incl[:, :2], x_lags_excl[:, :2]) + assert_equal(x_lags_incl[:, 3:], x_lags_excl[:, 2:]) + + +def test_add_lag_negative_index(): + x = randn(5, 3) + x_lags_incl = sm.tsa.add_lag(x, lags=2, insert=-1, drop=False) + assert_equal(x_lags_incl[:, 0], x[2:, 0]) + assert_equal(x_lags_incl[:, 3], x[1:4, 0]) + assert_equal(x_lags_incl[:, 4], x[:3, 0]) + + +def test_add_trend_prepend(): + n = 10 + x = randn(n, 1) + trend_1 = sm.tsa.add_trend(x, trend='ct', prepend=True) + trend_2 = sm.tsa.add_trend(x, trend='ct', prepend=False) + assert_equal(trend_1[:, :2], trend_2[:, 1:]) + + +def test_add_time_trend_dataframe(): + n = 10 + x = randn(n, 1) + x = pd.DataFrame(x, columns=['col1']) + trend_1 = sm.tsa.add_trend(x, trend='t') + 
assert_array_almost_equal(np.asarray(trend_1['trend']), + np.arange(1.0, n + 1)) + + +def test_add_trend_prepend_dataframe(): + # Skipped on pandas < 13.1 it seems + skip_if_early_pandas() + n = 10 + x = randn(n, 1) + x = pd.DataFrame(x, columns=['col1']) + trend_1 = sm.tsa.add_trend(x, trend='ct', prepend=True) + trend_2 = sm.tsa.add_trend(x, trend='ct', prepend=False) + assert_frame_equal(trend_1.iloc[:, :2], trend_2.iloc[:, 1:]) + + +def test_add_trend_duplicate_name(): + x = pd.DataFrame(np.zeros((10, 1)), columns=['trend']) + with warnings.catch_warnings(record=True) as w: + assert_produces_warning(sm.tsa.add_trend(x, trend='ct'), + tools.ColumnNameConflict) + y = sm.tsa.add_trend(x, trend='ct') + # should produce a single warning + np.testing.assert_equal(len(w), 1) + assert 'const' in y.columns + assert 'trend_0' in y.columns + + +def test_add_trend_c(): + x = np.zeros((10, 1)) + y = sm.tsa.add_trend(x, trend='c') + assert np.all(y[:, 1] == 1.0) + + +def test_add_trend_ct(): + n = 20 + x = np.zeros((20, 1)) + y = sm.tsa.add_trend(x, trend='ct') + assert np.all(y[:, 1] == 1.0) + assert_equal(y[0, 2], 1.0) + assert_array_almost_equal(np.diff(y[:, 2]), np.ones((n - 1))) + + +def test_add_trend_ctt(): + n = 10 + x = np.zeros((n, 1)) + y = sm.tsa.add_trend(x, trend='ctt') + assert np.all(y[:, 1] == 1.0) + assert y[0, 2] == 1.0 + assert_array_almost_equal(np.diff(y[:, 2]), np.ones((n - 1))) + assert y[0, 3] == 1.0 + assert_array_almost_equal(np.diff(y[:, 3]), np.arange(3.0, 2.0 * n, 2.0)) + + +def test_add_trend_t(): + n = 20 + x = np.zeros((20, 1)) + y = sm.tsa.add_trend(x, trend='t') + assert y[0, 1] == 1.0 + assert_array_almost_equal(np.diff(y[:, 1]), np.ones((n - 1))) + +def test_add_trend_no_input(): + n = 100 + y = sm.tsa.add_trend(x=None, trend='ct', nobs=n) + assert np.all(y[:, 0] == 1.0) + assert y[0, 1] == 1.0 + assert_array_almost_equal(np.diff(y[:, 1]), np.ones((n - 1))) + + +def test_reintegrate_1_diff(): + x = randn(10, 1) + y = np.cumsum(x) + 1.0 + assert_array_almost_equal(y, reintegrate(np.diff(y), [y[0]])) + + +def test_reintegrate_2_diff(): + x = randn(10, 1) + y = np.cumsum(x) + 1.0 + z = np.cumsum(y) + 1.0 + levels = [z[0], np.diff(z, 1)[0]] + assert_array_almost_equal(z, reintegrate(np.diff(z, 2), levels)) + + +def test_detrend_1d_order_0(): + x = randn(100) + assert_array_almost_equal(x - x.mean(), sm.tsa.detrend(x, order=0)) + + +def test_detrend_1d_order_1(): + n = 100 + x = randn(n) + z = sm.tsa.add_trend(x=None, trend='ct', nobs=n) + resid = OLS(x, z).fit().resid + detrended = sm.tsa.detrend(x, order=1) + assert_array_almost_equal(resid, detrended) + + +def test_detrend_2d_order_0(): + n = 100 + x = randn(n, 1) + assert_array_almost_equal(x - x.mean(), sm.tsa.detrend(x, order=0)) + + +def test_detrend_2d_order_2(): + n = 100 + x = randn(n, 1) + z = sm.tsa.add_trend(x=None, trend='ctt', nobs=n) + resid = OLS(x, z).fit().resid[:, None] + detrended = sm.tsa.detrend(x, order=2) + assert_array_almost_equal(resid, detrended) + + +def test_detrend_2d_order_2_axis_1(): + n = 100 + x = randn(1, n) + z = sm.tsa.add_trend(x=None, trend='ctt', nobs=n) + resid = OLS(x.T, z).fit().resid[None, :] + detrended = sm.tsa.detrend(x, order=2, axis=1) + assert_array_almost_equal(resid, detrended) + + pass + + +if __name__ == '__main__': import nose - nose.runmodule() + + nose.runmodule(argv=[__file__, '-vvs'], exit=False) diff --git a/statsmodels/tsa/tests/test_unitroot.py b/statsmodels/tsa/tests/test_unitroot.py new file mode 100644 index 00000000000..84cecfb035f --- /dev/null +++ 
b/statsmodels/tsa/tests/test_unitroot.py @@ -0,0 +1,380 @@ +# TODO: Tests for features that are just called +# TODO: Test for trend='ctt' +from __future__ import division + +import numpy as np +from numpy.testing import (assert_almost_equal, assert_equal, assert_raises) +from numpy import log, polyval, diff, ceil, ones + +from statsmodels.tsa.unitroot import (ADF, adfuller, DFGLS, ensure_1d, + PhillipsPerron, KPSS, UnitRootTest, + VarianceRatio) +from statsmodels.tsa.critical_values.dickey_fuller import tau_2010 +from statsmodels.datasets import macrodata +import warnings + +DECIMAL_5 = 5 +DECIMAL_4 = 4 +DECIMAL_3 = 3 +DECIMAL_2 = 2 +DECIMAL_1 = 1 + + +def test_adf_autolag(): + #see issue #246 + #this is mostly a unit test + d2 = macrodata.load().data + x = log(d2['realgdp']) + xd = diff(x) + + for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']): + #check exog + adf3 = adfuller(x, maxlag=None, autolag='aic', + regression=tr, store=True, regresults=True) + st2 = adf3[-1] + + assert_equal(len(st2.autolag_results), 15 + 1) # +1 for lagged level + for l, res in sorted(st2.autolag_results.iteritems())[:5]: + lag = l - k_trend + #assert correct design matrices in _autolag + assert_equal(res.model.exog[-10:, k_trend], x[-11:-1]) + assert_equal(res.model.exog[-1, k_trend + 1:], xd[-lag:-1][::-1]) + #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend + assert_equal(st2.usedlag, 2) + + #same result with lag fixed at usedlag of autolag + adf2 = adfuller(x, maxlag=2, autolag=None, regression=tr) + assert_almost_equal(adf3[:2], adf2[:2], decimal=12) + + tr = 'c' + #check maxlag with autolag + adf3 = adfuller(x, maxlag=5, autolag='aic', + regression=tr, store=True, regresults=True) + assert_equal(len(adf3[-1].autolag_results), 5 + 1) + adf3 = adfuller(x, maxlag=0, autolag='aic', + regression=tr, store=True, regresults=True) + assert_equal(len(adf3[-1].autolag_results), 0 + 1) + + +class CheckADF(object): + """ + Test Augmented Dickey-Fuller + + Test values taken from Stata. + """ + levels = ['1%', '5%', '10%'] + data = macrodata.load() + x = data.data['realgdp'] + y = data.data['infl'] + + def test_teststat(self): + assert_almost_equal(self.res1[0], self.teststat, DECIMAL_5) + + def test_pvalue(self): + assert_almost_equal(self.res1[1], self.pvalue, DECIMAL_5) + + def test_critvalues(self): + critvalues = [self.res1[4][lev] for lev in self.levels] + assert_almost_equal(critvalues, self.critvalues, DECIMAL_2) + + +class TestADFConstant(CheckADF): + """ + Dickey-Fuller test for unit root + """ + + def __init__(self): + self.res1 = adfuller(self.x, regression="c", autolag=None, + maxlag=4) + self.teststat = .97505319 + self.pvalue = .99399563 + self.critvalues = [-3.476, -2.883, -2.573] + + +class TestADFConstantTrend(CheckADF): + """ + """ + + def __init__(self): + self.res1 = adfuller(self.x, regression="ct", autolag=None, + maxlag=4) + self.teststat = -1.8566374 + self.pvalue = .67682968 + self.critvalues = [-4.007, -3.437, -3.137] + + +#class TestADFConstantTrendSquared(CheckADF): +# """ +# """ +# pass +#TODO: get test values from R? + +class TestADFNoConstant(CheckADF): + """ + """ + + def __init__(self): + self.res1 = adfuller(self.x, regression="nc", autolag=None, + maxlag=4) + self.teststat = 3.5227498 + self.pvalue = .99999 # Stata does not return a p-value for noconstant. 
+ # Tau^max in MacKinnon (1994) is missing, so it is + # assumed that its right-tail is well-behaved + self.critvalues = [-2.587, -1.950, -1.617] + + +# No Unit Root + +class TestADFConstant2(CheckADF): + def __init__(self): + self.res1 = adfuller(self.y, regression="c", autolag=None, + maxlag=1) + self.teststat = -4.3346988 + self.pvalue = .00038661 + self.critvalues = [-3.476, -2.883, -2.573] + + +class TestADFConstantTrend2(CheckADF): + def __init__(self): + self.res1 = adfuller(self.y, regression="ct", autolag=None, + maxlag=1) + self.teststat = -4.425093 + self.pvalue = .00199633 + self.critvalues = [-4.006, -3.437, -3.137] + + +class TestADFNoConstant2(CheckADF): + def __init__(self): + self.res1 = adfuller(self.y, regression="nc", autolag=None, + maxlag=1) + self.teststat = -2.4511596 + self.pvalue = 0.013747 # Stata does not return a p-value for noconstant + # this value is just taken from our results + self.critvalues = [-2.587, -1.950, -1.617] + + +class TestUnitRoot(object): + @classmethod + def setupClass(cls): + from statsmodels.datasets.macrodata import load + + cls.cpi = log(load().data['cpi']) + cls.inflation = diff(cls.cpi) + cls.inflation_change = diff(cls.inflation) + + def test_adf_no_options(self): + adf = ADF(self.inflation) + assert_almost_equal(adf.stat, -3.09310, DECIMAL_4) + assert_equal(adf.lags, 2) + assert_almost_equal(adf.pvalue, .027067, DECIMAL_4) + adf.regression.summary() + + def test_adf_no_lags(self): + adf = ADF(self.inflation, lags=0).stat + assert_almost_equal(adf, -6.56880, DECIMAL_4) + + def test_adf_nc_no_lags(self): + adf = ADF(self.inflation, trend='nc', lags=0) + assert_almost_equal(adf.stat, -3.88845, DECIMAL_4) + # 16.239 + + def test_adf_c_no_lags(self): + adf = ADF(self.inflation, trend='c', lags=0) + assert_almost_equal(adf.stat, -6.56880, DECIMAL_4) + assert_equal(adf.nobs, self.inflation.shape[0] - adf.lags - 1) + + def test_adf_ct_no_lags(self): + adf = ADF(self.inflation, trend='ct', lags=0) + assert_almost_equal(adf.stat, -6.66705, DECIMAL_4) + + def test_adf_lags_10(self): + adf = ADF(self.inflation, lags=10) + assert_almost_equal(adf.stat, -2.28375, DECIMAL_4) + adf.summary() + + def test_adf_auto_bic(self): + adf = ADF(self.inflation, method='BIC') + assert_equal(adf.lags, 2) + + def test_adf_critical_value(self): + adf = ADF(self.inflation, trend='c', lags=3) + adf_cv = adf.critical_values + temp = polyval(tau_2010['c'][0, :, ::-1].T, 1. / adf.nobs) + cv = {'1%': temp[0], '5%': temp[1], '10%': temp[2]} + for k, v in cv.iteritems(): + assert_almost_equal(v, adf_cv[k]) + + def test_adf_auto_t_stat(self): + adf = ADF(self.inflation, method='t-stat') + assert_equal(adf.lags, 10) + old_stat = adf.stat + adf.lags += 1 + assert adf.stat != old_stat + old_stat = adf.stat + assert_equal(adf.y, self.inflation) + adf.trend = 'ctt' + assert adf.stat != old_stat + assert adf.trend == 'ctt' + assert len(adf.valid_trends) == len(('nc', 'c', 'ct', 'ctt')) + for d in adf.valid_trends: + assert d in ('nc', 'c', 'ct', 'ctt') + assert adf.h0 == 'The process contains a unit root.' + assert adf.hA == 'The process is weakly stationary.' 
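Judging from the behaviour exercised above, the new ADF class exposes the statistic, p-value and critical values as lazily evaluated properties, and assigning a new trend or lag count invalidates the cached result. A hypothetical usage sketch follows; the automatic lag-selection default shown here is an assumption.

    import numpy as np
    from statsmodels.tsa.unitroot import ADF

    y = np.cumsum(np.random.standard_normal(500))  # simulated I(1) series
    adf = ADF(y)                    # lag length chosen automatically
    print(adf.stat)                 # test statistic
    print(adf.pvalue)               # MacKinnon-style p-value
    print(adf.critical_values)      # dict keyed by '1%', '5%', '10%'
    adf.trend = 'ct'                # changing an option recomputes the statistic
    print(adf.summary())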
+ + def test_kpss_auto(self): + kpss = KPSS(self.inflation) + m = self.inflation.shape[0] + lags = np.ceil(12.0 * (m / 100) ** (1.0 / 4)) + assert_equal(kpss.lags, lags) + + def test_kpss(self): + kpss = KPSS(self.inflation, trend='ct', lags=12) + assert_almost_equal(kpss.stat, .235581902996454, DECIMAL_4) + assert_equal(self.inflation.shape[0], kpss.nobs) + kpss.summary() + + def test_kpss_c(self): + kpss = KPSS(self.inflation, trend='c', lags=12) + assert_almost_equal(kpss.stat, .3276290340191141, DECIMAL_4) + + def test_pp(self): + pp = PhillipsPerron(self.inflation, lags=12) + assert_almost_equal(pp.stat, -7.8076512, DECIMAL_4) + assert pp.test_type == 'tau' + pp.test_type = 'rho' + assert_almost_equal(pp.stat, -108.1552688, DECIMAL_2) + with assert_raises(ValueError): + pp.test_type = 'unknown' + pp.summary() + + def test_pp_auto(self): + pp = PhillipsPerron(self.inflation) + n = self.inflation.shape[0] - 1 + lags = ceil(12.0 * ((n / 100.0) ** (1.0 / 4.0))) + assert_equal(pp.lags, lags) + assert_almost_equal(pp.stat, -8.135547778, DECIMAL_4) + pp.test_type = 'rho' + assert_almost_equal(pp.stat, -118.7746451, DECIMAL_2) + + def test_dfgls_c(self): + dfgls = DFGLS(self.inflation, trend='c', lags=0) + assert_almost_equal(dfgls.stat, -6.017304, DECIMAL_4) + dfgls.summary() + dfgls.regression.summary() + + def test_dfgls(self): + dfgls = DFGLS(self.inflation, trend='ct', lags=0) + assert_almost_equal(dfgls.stat, -6.300927, DECIMAL_4) + dfgls.summary() + dfgls.regression.summary() + + def test_dfgls_auto(self): + dfgls = DFGLS(self.inflation, trend='ct', method='BIC', max_lags=3) + assert_equal(dfgls.lags, 2) + assert_almost_equal(dfgls.stat, -2.9035369, DECIMAL_4) + with assert_raises(ValueError): + dfgls.trend = 'nc' + + assert dfgls != 0.0 + + def test_ensure_1d(self): + assert_raises(ValueError, ensure_1d, *(ones((2, 2)),)) + + def test_negative_lag(self): + adf = ADF(self.inflation) + with assert_raises(ValueError): + adf.lags = -1 + + def test_invalid_determinstic(self): + adf = ADF(self.inflation) + with assert_raises(ValueError): + adf.trend = 'bad-value' + + def test_variance_ratio(self): + vr = VarianceRatio(self.inflation, debiased=False) + y = self.inflation + dy = np.diff(y) + mu = dy.mean() + dy2 = y[2:] - y[:-2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / (nq) + num = np.sum((dy2 - 2 * mu) ** 2.0) / (nq * 2) + ratio = num / denom + + assert_almost_equal(ratio, vr.vr) + + def test_variance_ratio_no_overlap(self): + vr = VarianceRatio(self.inflation, overlap=False) + + with warnings.catch_warnings(record=True) as w: + computed_value = vr.vr + assert_equal(len(w), 1) + + y = self.inflation + # Adjust due ot sample size + y = y[:-1] + dy = np.diff(y) + mu = dy.mean() + dy2 = y[2::2] - y[:-2:2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / nq + num = np.sum((dy2 - 2 * mu) ** 2.0) / nq + ratio = num / denom + assert_equal(ratio, computed_value) + + vr.overlap = True + assert_equal(vr.overlap, True) + vr2 = VarianceRatio(self.inflation) + assert_almost_equal(vr.stat, vr2.stat) + + def test_variance_ratio_non_robust(self): + vr = VarianceRatio(self.inflation, robust=False, debiased=False) + y = self.inflation + dy = np.diff(y) + mu = dy.mean() + dy2 = y[2:] - y[:-2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / nq + num = np.sum((dy2 - 2 * mu) ** 2.0) / (nq * 2) + ratio = num / denom + variance = 3.0 / 2.0 + stat = np.sqrt(nq) * (ratio - 1) / np.sqrt(variance) + assert_almost_equal(stat, vr.stat) + orig_stat = vr.stat + vr.robust = True + 
assert_equal(vr.robust, True) + assert vr.stat != orig_stat + + def test_variance_ratio_no_constant(self): + y = np.random.randn(100) + vr = VarianceRatio(y, trend='nc', debiased=False) + dy = np.diff(y) + mu = 0.0 + dy2 = y[2:] - y[:-2] + nq = dy.shape[0] + denom = np.sum((dy - mu) ** 2.0) / nq + num = np.sum((dy2 - 2 * mu) ** 2.0) / (nq * 2) + ratio = num / denom + assert_almost_equal(ratio, vr.vr) + assert_equal(vr.debiased, False) + + def test_variance_ratio_invalid_lags(self): + y = self.inflation + assert_raises(ValueError, VarianceRatio, y, lags=1) + + def test_variance_ratio_generic(self): + # TODO: Currently not a test, just makes sure code runs at all + y = np.random.randn(100, 1).cumsum() + vr = VarianceRatio(y) + print vr.summary() + + def test_base_class(self): + assert_raises(NotImplementedError, UnitRootTest, + self.inflation, 0, 'nc') + + +if __name__ == "__main__": + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x'], exit=False) + diff --git a/statsmodels/tsa/tsatools.py b/statsmodels/tsa/tsatools.py index ac22af0bd98..757316c4b8e 100644 --- a/statsmodels/tsa/tsatools.py +++ b/statsmodels/tsa/tsatools.py @@ -1,74 +1,123 @@ import numpy as np -import numpy.lib.recfunctions as nprf -from statsmodels.tools.tools import add_constant +from statsmodels.compatnp.py3k import string_types +import numpy as np +import pandas as pd + +from statsmodels.compatnp.py3k import string_types + +import pandas as pd from pandas.tseries import offsets from pandas.tseries.frequencies import to_offset -def add_trend(X, trend="c", prepend=False): +__all__ = ['lagmat', 'lagmat2ds', 'add_trend', 'duplication_matrix', + 'elimination_matrix', 'commutation_matrix', + 'vec', 'vech', 'unvec', 'unvech', 'reintegrate'] + + +class ColumnNameConflict(Warning): + pass + + +column_name_conflict_doc = """ +Some of the column named being added were not unique and have been renamed. + + {0} +""" + + +def _enforce_unique_col_name(existing, new): + converted_names = [] + unique_names = list(new[:]) + for i, n in enumerate(new): + if n in existing: + original_name = n + fixed_name = n + duplicate_count = 0 + while fixed_name in existing: + fixed_name = n + '_' + str(duplicate_count) + duplicate_count += 1 + unique_names[i] = fixed_name + converted_names.append('{0} -> {1}'.format(original_name, fixed_name)) + if converted_names: + import warnings + + ws = column_name_conflict_doc.format('\n '.join(converted_names)) + warnings.warn(ws, ColumnNameConflict) + + return unique_names + + +def add_trend(x=None, trend='c', prepend=False, nobs=None): """ Adds a trend and/or constant to an array. Parameters ---------- - X : array-like - Original array of data. + x : array-like or None + Original array of data. If None, then nobs must be a positive integer trend : str {"c","t","ct","ctt"} "c" add constant only "t" add trend only "ct" add constant and linear trend "ctt" add constant and linear and quadratic trend. prepend : bool - If True, prepends the new data to the columns of X. + If True, prepends the new data to the columns of x. + n : int, positive + Positive integer containing the length of the trend series. Only used + if x is none. Notes ----- - Returns columns as ["ctt","ct","c"] whenever applicable. There is currently + Returns columns as ["ctt","ct","c"] whenever applicable. There is no checking for an existing constant or trend. 
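The reworked add_trend accepts plain arrays, DataFrames, or x=None together with an explicit nobs, and for DataFrame input names the added columns 'const', 'trend' and 'quadratic_trend'. A brief sketch of the calling patterns used in the new tests, with synthetic data:

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    x = pd.DataFrame(np.random.standard_normal((10, 1)), columns=['y'])
    z = sm.tsa.add_trend(x, trend='ct')                # appends 'const' and 'trend'
    d = sm.tsa.add_trend(x=None, trend='ct', nobs=10)  # deterministic terms only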
See also -------- - statsmodels.add_constant + statsmodels.tools.tools.add_constant """ #TODO: could be generalized for trend of aribitrary order trend = trend.lower() - if trend == "c": # handles structured arrays - return add_constant(X, prepend=prepend) + if trend == "c": # handles structured arrays + trend_order = 0 elif trend == "ct" or trend == "t": - trendorder = 1 + trend_order = 1 elif trend == "ctt": - trendorder = 2 + trend_order = 2 else: raise ValueError("trend %s not understood" % trend) - X = np.asanyarray(X) - nobs = len(X) - trendarr = np.vander(np.arange(1,nobs+1, dtype=float), trendorder+1) + if x is not None: + nobs = len(np.asanyarray(x)) + elif nobs <= 0: + raise ValueError("nobs must be a positive integer if x is None") + trend_array = np.vander(np.arange(1, nobs + 1, dtype=np.float64), + trend_order + 1) # put in order ctt - trendarr = np.fliplr(trendarr) + trend_array = np.fliplr(trend_array) if trend == "t": - trendarr = trendarr[:,1] - if not X.dtype.names: - if not prepend: - X = np.column_stack((X, trendarr)) + trend_array = trend_array[:, 1:] + if x is None: + return trend_array + if isinstance(x, pd.DataFrame): + columns = ('const', 'trend', 'quadratic_trend') + if trend == 't': + columns = (columns[1],) else: - X = np.column_stack((trendarr, X)) + columns = columns[0:trend_order + 1] + columns = _enforce_unique_col_name(x.columns, columns) + trend_array = pd.DataFrame(trend_array, index=x.index, columns=columns) + if prepend: + x = trend_array.join(x) + else: + x = x.join(trend_array) else: - return_rec = data.__clas__ is np.recarray - if trendorder == 1: - if trend == "ct": - dt = [('const',float),('trend',float)] - else: - dt = [('trend', float)] - elif trendorder == 2: - dt = [('const',float),('trend',float),('trend_squared', float)] - trendarr = trendarr.view(dt) if prepend: - X = nprf.append_fields(trendarr, X.dtype.names, [X[i] for i - in data.dtype.names], usemask=False, asrecarray=return_rec) + x = np.column_stack((trend_array, x)) else: - X = nprf.append_fields(X, trendarr.dtype.names, [trendarr[i] for i - in trendarr.dtype.names], usemask=false, asrecarray=return_rec) - return X + x = np.column_stack((x, trend_array)) + + return x + def add_lag(x, col=None, lags=1, drop=False, insert=True): """ @@ -80,10 +129,9 @@ def add_lag(x, col=None, lags=1, drop=False, insert=True): An array or NumPy ndarray subclass. Can be either a 1d or 2d array with observations in columns. col : 'string', int, or None - If data is a structured array or a recarray, `col` can be a string - that is the name of the column containing the variable. Or `col` can - be an int of the zero-based column index. If it's a 1d array `col` - can be None. + If data is a pandas DataFrame, `col` can be a string that is the name + of the column containing the variable. Or `col` can be an int of the + zero-based column index. If it's a 1d array `col` can be None. lags : int The number of lags desired. drop : bool @@ -111,134 +159,108 @@ def add_lag(x, col=None, lags=1, drop=False, insert=True): so that the length of the returned array is len(`X`) - lags. 
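add_lag now works on pandas DataFrames directly: the requested column is lagged, the new columns are labelled '<col>_L_<i>', and the sample is shortened by the number of lags, as the implementation that follows shows. A small sketch of DataFrame usage with synthetic data:

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    df = pd.DataFrame({'gdp': np.arange(20.0),
                       'cpi': np.random.standard_normal(20)})
    lagged = sm.tsa.add_lag(df, 'gdp', lags=2)  # adds gdp_L_1, gdp_L_2; drops 2 rows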
The lags are returned in increasing order, ie., t-1,t-2,...,t-lags """ - if x.dtype.names: - names = x.dtype.names - if not col and np.squeeze(x).ndim > 1: - raise IndexError, "col is None and the input array is not 1d" - elif len(names) == 1: - col = names[0] - if isinstance(col, int): - col = x.dtype.names[col] - contemp = x[col] - - # make names for lags - tmp_names = [col + '_'+'L(%i)' % i for i in range(1,lags+1)] - ndlags = lagmat(contemp, maxlag=lags, trim='Both') - - # get index for return - if insert is True: - ins_idx = list(names).index(col) + 1 - elif insert is False: - ins_idx = len(names) + 1 - else: # insert is an int - if insert > len(names): - raise Warning("insert > number of variables, inserting at the"+ - " last position") - ins_idx = insert - - first_names = list(names[:ins_idx]) - last_names = list(names[ins_idx:]) - - if drop: - if col in first_names: - first_names.pop(first_names.index(col)) - else: - last_names.pop(last_names.index(col)) - - if first_names: # only do this if x isn't "empty" - first_arr = nprf.append_fields(x[first_names][lags:],tmp_names, - ndlags.T, usemask=False) - else: - first_arr = np.zeros(len(x)-lags, dtype=zip(tmp_names, - (x[col].dtype,)*lags)) - for i,name in enumerate(tmp_names): - first_arr[name] = ndlags[:,i] - if last_names: - return nprf.append_fields(first_arr, last_names, - [x[name][lags:] for name in last_names], usemask=False) - else: # lags for last variable - return first_arr - - else: # we have an ndarray - - if x.ndim == 1: # make 2d if 1d - x = x[:,None] - if col is None: - col = 0 - - # handle negative index - if col < 0: - col = x.shape[1] + col - - contemp = x[:,col] - - if insert is True: - ins_idx = col + 1 - elif insert is False: - ins_idx = x.shape[1] + df = None + if isinstance(x, pd.DataFrame): + df = x + if isinstance(col, string_types): + col = list(df.columns).index(col) + + x = np.asarray(x) + if x.ndim == 1: # make 2d if 1d + x = x[:, None] + if col is None: + col = 0 + # handle negative index + if col < 0: + col = x.shape[1] + col + contemporary = x[:, col] + + if insert is True: + ins_idx = col + 1 + elif insert is False: + ins_idx = x.shape[1] + else: + if insert < 0: # handle negative index + insert = x.shape[1] + insert + 1 + if insert > x.shape[1]: + raise ValueError("insert greater than the number of variables") + ins_idx = insert + + ndlags = lagmat(contemporary, lags, trim='Both') + first_cols = range(ins_idx) + last_cols = range(ins_idx, x.shape[1]) + if drop: + if col in first_cols: + first_cols.pop(first_cols.index(col)) else: - if insert < 0: # handle negative index - insert = x.shape[1] + insert + 1 - if insert > x.shape[1]: - insert = x.shape[1] - raise Warning("insert > number of variables, inserting at the"+ - " last position") - ins_idx = insert - - ndlags = lagmat(contemp, lags, trim='Both') - first_cols = range(ins_idx) - last_cols = range(ins_idx,x.shape[1]) - if drop: - if col in first_cols: - first_cols.pop(first_cols.index(col)) - else: - last_cols.pop(last_cols.index(col)) - return np.column_stack((x[lags:,first_cols],ndlags, - x[lags:,last_cols])) + last_cols.pop(last_cols.index(col)) + out = np.column_stack((x[lags:, first_cols], ndlags, x[lags:, last_cols])) + if df is not None: + columns = list(df.columns) + index = df.index + # Create new column labels + lag_columns = [str(columns[col]) + '_L_' + str(i + 1) for i in + xrange(lags)] + out_columns = [columns[col_idx] for col_idx in first_cols] + out_columns.extend(lag_columns) + for col_idx in last_cols: + 
out_columns.append(columns[col_idx]) + # Alter index for correct length + index = index[lags:] + lag_columns = _enforce_unique_col_name(df.columns, lag_columns) + ndlags = pd.DataFrame(ndlags, columns=lag_columns, index=index) + df = df.join(ndlags) + out = df[out_columns].reindex(index) + + return out + def detrend(x, order=1, axis=0): - '''detrend an array with a trend of given order along axis 0 or 1 + """ + Detrend an array with a trend of given order along axis 0 or 1 Parameters ---------- x : array_like, 1d or 2d data, if 2d, then each row or column is independently detrended with the - same trendorder, but independent trend estimates + same trend order, but independent trend estimates order : int specifies the polynomial order of the trend, zero is constant, one is linear trend, two is quadratic trend axis : int - for detrending with order > 0, axis can be either 0 observations by rows, - or 1, observations by columns + for detrending with order > 0, axis can be either 0 observations by + rows, or 1, observations by columns Returns ------- - detrended data series : ndarray + y : array The detrended series is the residual of the linear regression of the data on the trend of given order. - - - ''' + """ x = np.asarray(x) - nobs = x.shape[0] + ndim = x.ndim if order == 0: - return x - np.expand_dims(x.mean(ax), x) - else: - if x.ndim == 2 and range(2)[axis]==1: - x = x.T - elif x.ndim > 2: - raise NotImplementedError('x.ndim>2 is not implemented until it is needed') - #could use a polynomial, but this should work also with 2d x, but maybe not yet - trends = np.vander(np.arange(nobs).astype(float), N=order+1) - beta = np.linalg.lstsq(trends, x)[0] - resid = x - np.dot(trends, beta) - if x.ndim == 2 and range(2)[axis]==1: - resid = resid.T - return resid - - -def lagmat(x, maxlag, trim='forward', original='ex'): - '''create 2d array of lags + return x - np.expand_dims(x.mean(axis), axis) + if ndim > 2: + raise NotImplementedError('x must be 1d or 2d') + if ndim == 2 and axis == 1: + x = x.T + elif ndim == 1: + x = x[:, None] + nobs = x.shape[0] + trends = np.vander(np.arange(nobs).astype(np.float64), N=order + 1) + beta = np.linalg.pinv(trends).dot(x) + resid = x - np.dot(trends, beta) + if x.ndim == 2 and axis == 1: + resid = resid.T + elif ndim == 1: + resid = resid.ravel() + + return resid + + +def lagmat(x, maxlag, trim='forward', original='ex', fill_value=np.nan): + """create 2d array of lags Parameters ---------- @@ -258,6 +280,8 @@ def lagmat(x, maxlag, trim='forward', original='ex'): * 'sep' : returns a tuple (original array, lagged values). The original array is truncated to have the same number of rows as the returned lagmat. + fill_value : float + the value to use for filling missing values. 
Default is np.nan Returns ------- @@ -293,22 +317,23 @@ def lagmat(x, maxlag, trim='forward', original='ex'): Notes ----- - TODO: - * allow list of lags additional to maxlag - * create varnames for columns - ''' + If trim is not 'both', new values are 0-filled + """ + # TODO: allow list of lags additional to maxlag + # TODO: create varnames for columns x = np.asarray(x) dropidx = 0 if x.ndim == 1: - x = x[:,None] + x = x[:, None] nobs, nvar = x.shape - if original in ['ex','sep']: + if original in ['ex', 'sep']: dropidx = nvar if maxlag >= nobs: raise ValueError("maxlag should be < nobs") - lm = np.zeros((nobs+maxlag, nvar*(maxlag+1))) - for k in range(0, int(maxlag+1)): - lm[maxlag-k:nobs+maxlag-k, nvar*(maxlag-k):nvar*(maxlag-k+1)] = x + lm = np.tile(fill_value, (nobs + maxlag, nvar * (maxlag + 1))) + for k in range(0, int(maxlag + 1)): + lm[maxlag - k:nobs + maxlag - k, + nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x if trim: trimlower = trim.lower() else: @@ -318,10 +343,10 @@ def lagmat(x, maxlag, trim='forward', original='ex'): stopobs = len(lm) elif trimlower == 'forward': startobs = 0 - stopobs = nobs+maxlag-k + stopobs = nobs + maxlag - k elif trimlower == 'both': startobs = maxlag - stopobs = nobs+maxlag-k + stopobs = nobs + maxlag - k elif trimlower == 'backward': startobs = maxlag stopobs = len(lm) @@ -329,12 +354,13 @@ def lagmat(x, maxlag, trim='forward', original='ex'): else: raise ValueError('trim option not valid') if original == 'sep': - return lm[startobs:stopobs,dropidx:], x[startobs:stopobs] + return lm[startobs:stopobs, dropidx:], x[startobs:stopobs] else: - return lm[startobs:stopobs,dropidx:] + return lm[startobs:stopobs, dropidx:] + def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward'): - '''generate lagmatrix for 2d array, columns arranged by variables + """Generate lagmatrix for 2d array, columns arranged by variables Parameters ---------- @@ -343,7 +369,7 @@ def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward'): maxlag0 : int for first variable all lags from zero to maxlag are included maxlagex : None or int - max lag for all other variables all lags from zero to maxlag are included + max lag for all other variables; all lags from zero to maxlag are included dropex : int (default is 0) exclude first dropex lags from other variables for all variables, except the first, lags from dropex to maxlagex are @@ -362,42 +388,50 @@ def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward'): Notes ----- very inefficient for unequal lags, just done for convenience - ''' + """ if maxlagex is None: maxlagex = maxlag0 maxlag = max(maxlag0, maxlagex) nobs, nvar = x.shape - lagsli = [lagmat(x[:,0], maxlag, trim=trim, original='in')[:,:maxlag0+1]] - for k in range(1,nvar): - lagsli.append(lagmat(x[:,k], maxlag, trim=trim, original='in')[:,dropex:maxlagex+1]) + lagsli = [ + lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]] + for k in range(1, nvar): + lagsli.append(lagmat(x[:, k], maxlag, trim=trim, original='in')[:, + dropex:maxlagex + 1]) return np.column_stack(lagsli) + def vec(mat): return mat.ravel('F') + def vech(mat): # Gets Fortran-order return mat.T.take(_triu_indices(len(mat))) -# tril/triu/diag, suitable for ndarray.take +# tril/triu/diag, suitable for ndarray.take def _tril_indices(n): rows, cols = np.tril_indices(n) return rows * n + cols + def _triu_indices(n): rows, cols = np.triu_indices(n) return rows * n + cols + def _diag_indices(n): rows, cols = np.diag_indices(n) return rows * n + cols + def unvec(v): k = 
int(np.sqrt(len(v))) - assert(k * k == len(v)) + assert (k * k == len(v)) return v.reshape((k, k), order='F') + def unvech(v): # quadratic formula, correct fp error rows = .5 * (-1 + np.sqrt(1 + 8 * len(v))) @@ -412,18 +446,26 @@ def unvech(v): return result + def duplication_matrix(n): """ Create duplication matrix D_n which satisfies vec(S) = D_n vech(S) for symmetric matrix S + Parameters + ---------- + n : int + The length of vech(S) + Returns ------- - D_n : ndarray + D_n : array + The duplication array """ tmp = np.eye(n * (n + 1) / 2) return np.array([unvech(x).ravel() for x in tmp]).T + def elimination_matrix(n): """ Create the elimination matrix L_n which satisfies vech(M) = L_n vec(M) for @@ -431,31 +473,36 @@ def elimination_matrix(n): Parameters ---------- + n : int + The length of vec(M) Returns ------- + L : array + The commutation matrix """ vech_indices = vec(np.tril(np.ones((n, n)))) return np.eye(n * n)[vech_indices != 0] + def commutation_matrix(p, q): """ Create the commutation matrix K_{p,q} satisfying vec(A') = K_{p,q} vec(A) Parameters ---------- - p : int - q : int + p, q : int Returns ------- - K : ndarray (pq x pq) + K : ndarray, (pq, pq) """ K = np.eye(p * q) indices = np.arange(p * q).reshape((p, q), order='F') return K.take(indices.ravel(), axis=0) + def _ar_transparams(params): """ Transforms params to induce stationarity/invertability. @@ -469,16 +516,15 @@ def _ar_transparams(params): --------- Jones(1980) """ - newparams = ((1-np.exp(-params))/ - (1+np.exp(-params))).copy() - tmp = ((1-np.exp(-params))/ - (1+np.exp(-params))).copy() - for j in range(1,len(params)): - a = newparams[j] - for kiter in range(j): - tmp[kiter] -= a * newparams[j-kiter-1] - newparams[:j] = tmp[:j] - return newparams + trans_params = ((1 - np.exp(-params)) / (1 + np.exp(-params))) + tmp = trans_params.copy() + for j in range(1, len(params)): + a = trans_params[j] + for k in range(j): + tmp[k] -= a * trans_params[j - k - 1] + trans_params[:j] = tmp[:j] + return trans_params + def _ar_invtransparams(params): """ @@ -489,16 +535,17 @@ def _ar_invtransparams(params): params : array The transformed AR coefficients """ - # AR coeffs + # AR coefficients tmp = params.copy() - for j in range(len(params)-1,0,-1): + for j in range(len(params) - 1, 0, -1): a = params[j] - for kiter in range(j): - tmp[kiter] = (params[kiter] + a * params[j-kiter-1])/\ - (1-a**2) + for k in range(j): + tmp[k] = (params[k] + a * params[j - k - 1]) / \ + (1 - a ** 2) params[:j] = tmp[:j] - invarcoefs = -np.log((1-params)/(1+params)) - return invarcoefs + inv_ar_coefs = -np.log((1 - params) / (1 + params)) + return inv_ar_coefs + def _ma_transparams(params): """ @@ -507,22 +554,23 @@ def _ma_transparams(params): Parameters ---------- params : array - The ma coeffecients of an (AR)MA model. + The MA coeffecients of an (AR)MA model. 
Reference --------- Jones(1980) """ - newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy() - tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy() + trans_params = ((1 - np.exp(-params)) / (1 + np.exp(-params))).copy() + tmp = trans_params.copy() # levinson-durbin to get macf - for j in range(1,len(params)): - b = newparams[j] - for kiter in range(j): - tmp[kiter] += b * newparams[j-kiter-1] - newparams[:j] = tmp[:j] - return newparams + for j in range(1, len(params)): + b = trans_params[j] + for k in range(j): + tmp[k] += b * trans_params[j - k - 1] + trans_params[:j] = tmp[:j] + return trans_params + def _ma_invtransparams(macoefs): """ @@ -534,15 +582,17 @@ def _ma_invtransparams(macoefs): The transformed MA coefficients """ tmp = macoefs.copy() - for j in range(len(macoefs)-1,0,-1): + for j in range(len(macoefs) - 1, 0, -1): b = macoefs[j] - for kiter in range(j): - tmp[kiter] = (macoefs[kiter]-b *macoefs[j-kiter-1])/(1-b**2) + scale = (1 - b ** 2) + for k in range(j): + tmp[k] = (macoefs[k] - b * macoefs[j - k - 1]) / scale macoefs[:j] = tmp[:j] - invmacoefs = -np.log((1-macoefs)/(1+macoefs)) - return invmacoefs + inv_ma_coefs = -np.log((1 - macoefs) / (1 + macoefs)) + return inv_ma_coefs -def unintegrate(x, levels): + +def reintegrate(x, levels): """ After taking n-differences of a series, return the original series @@ -551,25 +601,25 @@ def unintegrate(x, levels): x : array-like The n-th differenced series levels : list - A list of the first-value in each differenced series, for + A list of the initial-value in each differenced series, for [first-difference, second-difference, ..., n-th difference] Returns ------- y : array-like - The original series de-differenced + The original series, reintegrated Examples -------- >>> x = np.array([1, 3, 9., 19, 8.]) >>> levels = [x[0], np.diff(x, 1)[0]] - >>> _unintegrate(np.diff(x, 2), levels) + >>> reintegrate(np.diff(x, 2), levels) array([ 1., 3., 9., 19., 8.]) """ - levels = levels[:] # copy + levels = levels[:] # copy if len(levels) > 1: x0 = levels.pop(-1) - return _unintegrate(np.cumsum(np.r_[x0, x]), levels) + return reintegrate(np.cumsum(np.r_[x0, x]), levels) x0 = levels[0] return np.cumsum(np.r_[x0, x]) @@ -609,13 +659,5 @@ def freq_to_period(freq): "think this in error.".format(freq)) -__all__ = ['lagmat', 'lagmat2ds','add_trend', 'duplication_matrix', - 'elimination_matrix', 'commutation_matrix', - 'vec', 'vech', 'unvec', 'unvech'] - if __name__ == '__main__': - # sanity check, mainly for imports - x = np.random.normal(size=(100,2)) - tmp = lagmat(x,2) - tmp = lagmat2ds(x,2) -# grangercausalitytests(x, 2) + pass diff --git a/statsmodels/tsa/unitroot.py b/statsmodels/tsa/unitroot.py new file mode 100644 index 00000000000..3889fde09c4 --- /dev/null +++ b/statsmodels/tsa/unitroot.py @@ -0,0 +1,1297 @@ +from __future__ import division + +import warnings + +from numpy import (diff, ceil, power, squeeze, sqrt, sum, cumsum, int32, int64, + interp, abs, log, arange, sort) +from numpy.linalg import pinv +from scipy.stats import norm +from numpy import polyval + +from statsmodels.regression.linear_model import OLS +from statsmodels.tsa.tsatools import lagmat, add_trend +from statsmodels.tsa.stattools import _autolag, cov_nw +from statsmodels.tsa.critical_values.dickey_fuller import * +from statsmodels.tsa.critical_values.kpss import * +from statsmodels.tsa.critical_values.dfgls import * +from statsmodels.iolib.summary import Summary +from statsmodels.iolib.table import SimpleTable + + +class InvalidLengthWarning(Warning): + pass + + 
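The _df_select_lags and _estimate_df_regression helpers defined below both build the (A)DF design matrix by lagging the differenced series and then replacing the lag-0 column with the lagged level. A plain sketch of that construction for p lags and no deterministic terms, mirroring the helper code; the function name is illustrative.

    import numpy as np
    from statsmodels.tsa.tsatools import lagmat

    def adf_design(y, p):
        # lhs is delta y_t; rhs is [y_{t-1}, delta y_{t-1}, ..., delta y_{t-p}].
        y = np.asarray(y)
        dy = np.diff(y)
        rhs = lagmat(dy[:, None], p, trim='both', original='in')
        nobs = rhs.shape[0]
        lhs = rhs[:, 0].copy()        # contemporaneous delta y
        rhs[:, 0] = y[-nobs - 1:-1]   # swap the lag-0 column for the lagged level
        return lhs, rhs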
+invalid_length_doc = """ +The length of {var} is not an exact multiple of {block}, and so the final {drop} +observations have been dropped. +""" + +deprication_doc = """ +{old_func} has been depricated. Please use {new_func}. +""" + +__all__ = ['ADF', 'DFGLS', 'PhillipsPerron', 'KPSS', 'VarianceRatio', + 'adfuller', 'kpss_crit', 'mackinnoncrit', 'mackinnonp'] + +TREND_MAP = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'} + +TREND_DESCRIPTION = {'nc': 'No Trend', + 'c': 'Constant', + 'ct': 'Constant and Linear Time Trend', + 'ctt': 'Constant, Linear and Quadratic Time Trends'} + + +def _df_select_lags(y, trend, max_lags, method): + """ + Helper method to determine the best lag length in DF-like regressions + + Parameters + ---------- + y : array-like, (nobs,) + The data for the lag selection exercise + trend : str, {'nc','c','ct','ctt'} + The trend order + max_lags : int + The maximum number of lags to check. This setting affects all + estimation since the sample is adjusted by max_lags when + fitting the models + method : str, {'AIC','BIC','t-stat'} + The method to use when estimating the model + + Returns + ------- + best_ic : float + The information criteria at the selected lag + best_lag : int + The selected lag + all_res : list + List of OLS results from fitting max_lag + 1 models + + Notes + ----- + See statsmodels.tsa.tsatools._autolag for details + """ + nobs = y.shape[0] + delta_y = diff(y) + + if max_lags is None: + max_lags = int(ceil(12. * power(nobs / 100., 1 / 4.))) + + rhs = lagmat(delta_y[:, None], max_lags, trim='both', original='in') + nobs = rhs.shape[0] # pylint: disable=E1103 + rhs[:, 0] = y[-nobs - 1:-1] # replace 0 xdiff with level of x + lhs = delta_y[-nobs:] + + if trend != 'nc': + full_rhs = add_trend(rhs, trend, prepend=True) + else: + full_rhs = rhs + + start_lag = full_rhs.shape[1] - rhs.shape[1] + 1 + # TODO: Remove all_res after adfuller deprication + ic_best, best_lag, all_res = _autolag(OLS, lhs, full_rhs, start_lag, + max_lags, + method, regresults=True) + # To get the correct number of lags, subtract the start_lag since + # lags 0,1,...,start_lag-1 were not actual lags, but other variables + best_lag -= start_lag + return ic_best, best_lag, all_res + + +def _estimate_df_regression(y, trend, lags): + """Helper function that estimates the core (A)DF regression + + Parameters + ---------- + y : array-like, (nobs,) + The data for the lag selection + trend : str, {'nc','c','ct','ctt'} + The trend order + lags : int + The number of lags to include in the ADF regression + + Returns + ------- + ols_res : OLSResults + A results class object produced by OLS.fit() + + Notes + ----- + See statsmodels.regression.linear_model.OLS for details on the + value returned + """ + delta_y = diff(y) + + rhs = lagmat(delta_y[:, None], lags, trim='both', original='in') + nobs = rhs.shape[0] + lhs = rhs[:, 0].copy() # lag-0 values are lhs, Is copy() necessary? + rhs[:, 0] = y[-nobs - 1:-1] # replace lag 0 with level of y + + if trend != 'nc': + rhs = add_trend(rhs[:, :lags + 1], trend) + + return OLS(lhs, rhs).fit() + + +def ensure_1d(y): + """Returns a 1d array if the input is squeezable to 1d. 
Otherwise raises + and error + + Parameters + ---------- + y : array-like + The array to squeeze to 1d, or raise an error if not compatible + + Returns + ------- + y_1d : array + A 1d version of the input, returned as an array + """ + y = squeeze(asarray(y)) + if y.ndim != 1: + raise ValueError('Input must be 1d or squeezable to 1d.') + return y + + +class UnitRootTest(object): + """Base class to be used for inheritance in unit root tests""" + + def __init__(self, y, lags, trend): + self._y = ensure_1d(y) + self._delta_y = diff(y) + self._nobs = self._y.shape[0] + self._lags = None + self.lags = lags + self._trend = None + self.trend = trend + self._stat = None + self._critical_values = None + self._pvalue = None + self.trend = trend + self._h0 = 'The process contains a unit root.' + self._h1 = 'The process is weakly stationary.' + self._test_name = None + self._title = None + self._summary_text = None + + def _compute_statistic(self): + raise NotImplementedError("Subclass must implement") + + def _reset(self): + """Resets the unit root test so that it will be recomputed""" + self._stat = None + + def _compute_if_needed(self): + """Checks if statistic needs to be computed""" + if self._stat is None: + self._compute_statistic() + + @property + def h0(self): + """The null hypothesis""" + return self._h0 + + @property + def hA(self): + """The alternative hypothesis""" + return self._h1 + + @property + def nobs(self): + """The number of observations used in the test statistic. + Account for loss of data due to lags.""" + return self._nobs + + @property + def valid_trends(self): + """List of valid trend terms. Must be overridden""" + raise NotImplementedError("Subclass must implement") + + @property + def pvalue(self): + """Returns the p-value for the test statistic""" + self._compute_if_needed() + return self._pvalue + + @property + def stat(self): + """The test statistic for a unit root""" + self._compute_if_needed() + return self._stat + + @property + def critical_values(self): + """Dictionary containing critical values""" + self._compute_if_needed() + return self._critical_values + + def summary(self): + """Summary of test, containing statistic, p-value and critical values + """ + data = [('Test Statistic', '{0:0.3f}'.format(self.stat)), + ('P-value', '{0:0.3f}'.format(self.pvalue)), + ('Lags', '{0:d}'.format(self.lags))] + width = 15 + formatted_text = '{0:<' + str(width) + '} {1:>' + str(width) + '}' + table_data = [] + for label, val in data: + table_data.append([formatted_text.format(label, val)]) + title = self._title + if not title: + title = self._test_name + " Results" + table = SimpleTable(table_data, stubs=None, title=title) + + smry = Summary() + smry.tables.append(table) + + cv_string = 'Critical Values: ' + cv = self._critical_values.keys() + g = lambda x: float(x.split('%')[0]) + cv_numeric = array(map(g, cv)) + cv_numeric = sort(cv_numeric) + for val in cv_numeric: + p = str(int(val)) + '%' + cv_string += '{0:0.2f}'.format(self._critical_values[p]) + cv_string += ' (' + p + ')' + if val != cv_numeric[-1]: + cv_string += ', ' + + extra_text = ['Trend: ' + TREND_DESCRIPTION[self._trend], + cv_string, + 'Null Hypothesis: ' + self.h0, + 'Alternative Hypothesis: ' + self.hA] + + smry.add_extra_txt(extra_text) + if self._summary_text: + smry.add_extra_txt(self._summary_text) + return smry + + @property + def lags(self): + """Sets or gets the number of lags used in the model. + WHen tests use DF-type regressions, lags is the number of lags in the + regression model. 
When tests use long-run variance estimators, lags + is the number of lags used in the long-run variance estimator.""" + self._compute_if_needed() + return self._lags + + @lags.setter + def lags(self, value): + types = (int, long, int32, int64) + if value is not None and not isinstance(value, types) or \ + (isinstance(value, types) and value < 0): + raise ValueError('lags must be a non-negative integer or None') + if self._lags != value: + self._reset() + self._lags = value + + @property + def y(self): + """Returns the data used in the test""" + return self._y + + @property + def trend(self): + """Sets or gets the trend term""" + return self._trend + + @trend.setter + def trend(self, value): + if value not in self.valid_trends: + raise ValueError('trend not understood') + if self._trend != value: + self._reset() + self._trend = value + + +class ADF(UnitRootTest): + """Augmented Dickey Fuller test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a unit root + lags : int, non-negative, optional + The number of lags to use in the ADF regression. If omitted or None, + `method` is used to automatically select the lag length with no more + than `max_lags` are included. + trend : str, {'nc', 'c', 'ct', 'ctt'}, optional + The trend component to include in the ADF test + 'nc' - No trend components + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + 'ctt' - Include a constant and linear and quadratic time trends + max_lags : int, non-negative, optional + The maximum number of lags to use when selecting lag length + method : str, {'AIC', 'BIC', 't-stat'}, optional + The method to use when selecting the lag length + 'AIC' - Select the minimum of the Akaike IC + 'BIC' - Select the minimum of the Schwarz/Bayesian IC + 't-stat' - Select the minimum of the Schwarz/Bayesian IC + + Attributes + ---------- + stat + pvalues + critical_values + h0 + hA + summary + regression + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the Augmented Dickey-Fuller is that there is a unit + root, with the alternative that there is no unit root. If the pvalue is + above a critical size, then the null cannot be rejected that there + and the series appears to be a unit root. + + The p-values are obtained through regression surface approximation from + MacKinnon (1994) using the updated 2010 tables. + If the p-value is close to significant, then the critical values should be + used to judge whether to reject the null. + + The autolag option and maxlag for it are described in Greene. + + Examples + -------- + see example script + + References + ---------- + Greene + + Hamilton, J. D. 1994. Time Series Analysis. Princeton: Princeton + University Press. + + P-Values (regression surface approximation) + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. `Journal of Business and Economic + Statistics` 12, 167-76. + + Critical values + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's + University, Dept of Economics, Working Papers. 
Available at + http://ideas.repec.org/p/qed/wpaper/1227.html + """ + + def __init__(self, y, lags=None, trend='c', + max_lags=None, method='AIC'): + super(ADF, self).__init__(y, lags, trend) + self._max_lags = max_lags + self._method = method + self._test_name = 'Augmented Dickey-Fuller' + self._regression = None + # TODO: Remove when adfuller is depricated + self._ic_best = None # For compat with adfuller + self._autolag_results = None # For compat with adfuller + + def _select_lag(self): + ic_best, best_lag, all_res = _df_select_lags(self._y, + self._trend, + self._max_lags, + self._method) + # TODO: Remove when adfuller is depricated + self._autolag_results = all_res + self._ic_best = ic_best + self._lags = best_lag + + def _compute_statistic(self): + if self._lags is None: + self._select_lag() + y, trend, lags = self._y, self._trend, self._lags + resols = _estimate_df_regression(y, trend, lags) + self._regression = resols + self._stat = stat = resols.tvalues[0] + self._nobs = int(resols.nobs) + self._pvalue = mackinnonp(stat, regression=trend, + num_unit_roots=1) + critical_values = mackinnoncrit(num_unit_roots=1, + regression=trend, + nobs=resols.nobs) + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'nc', 'c', 'ct', 'ctt' + + @property + def regression(self): + """Returns the OLS regression results from the ADF model estimated""" + self._compute_if_needed() + return self._regression + + +class DFGLS(UnitRootTest): + """Elliott, Rothenberg and Stock's GLS version of the Dickey-Fuller test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a unit root + lags : int, non-negative, optional + The number of lags to use in the ADF regression. If omitted or None, + `method` is used to automatically select the lag length with no more + than `max_lags` are included. + trend : str, {'c', 'ct'}, optional + The trend component to include in the ADF test + 'nc' - No trend components + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + 'ctt' - Include a constant and linear and quadratic time trends + max_lags : int, non-negative, optional + The maximum number of lags to use when selecting lag length + method : str, {'AIC', 'BIC', 't-stat'}, optional + The method to use when selecting the lag length + 'AIC' - Select the minimum of the Akaike IC + 'BIC' - Select the minimum of the Schwarz/Bayesian IC + 't-stat' - Select the minimum of the Schwarz/Bayesian IC + + Attributes + ---------- + stat + pvalues + critical_values + h0 + hA + summary + regression + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the Dickey-Fuller GLS is that there is a unit + root, with the alternative that there is no unit root. If the pvalue is + above a critical size, then the null cannot be rejected that there + and the series appears to be a unit root. + + DFGLS differs from the ADF test in that an initial GLS detrending step + is used before a trend-less ADF regression is run. + + Critical values and p-values when trend is 'c' are identical to + the ADF. When trend is set to 'ct, they are from ... + + Examples + -------- + see example script + + References + ---------- + Elliott, G. R., T. J. Rothenberg, and J. H. Stock. 1996. Efficient tests + for an autoregressive unit root. 
Econometrica 64: 813-836 + """ + + def __init__(self, y, lags=None, trend='c', + max_lags=None, method='AIC'): + super(DFGLS, self).__init__(y, lags, trend) + self._max_lags = max_lags + self._method = method + self._regression = None + self._test_name = 'Dickey-Fuller GLS' + if trend == 'c': + self._c = -7.0 + else: + self._c = -13.5 + + def _compute_statistic(self): + """Core routine to estimate DF-GLS test statistic""" + # 1. GLS detrend + trend, c = self._trend, self._c + + nobs = self._y.shape[0] + ct = c / nobs + z = add_trend(nobs=nobs, trend=trend) + + delta_z = z.copy() + delta_z[1:, :] = delta_z[1:, :] - (1 + ct) * delta_z[:-1, :] + delta_y = self._y.copy()[:, None] + delta_y[1:] = delta_y[1:] - (1 + ct) * delta_y[:-1] + detrend_coef = pinv(delta_z).dot(delta_y) + y = self._y + y_detrended = y - z.dot(detrend_coef).ravel() + + # 2. determine lag length, if needed + if self._lags is None: + max_lags, method = self._max_lags, self._method + icbest, bestlag, all_res = _df_select_lags(y_detrended, 'nc', + max_lags, method) + self._lags = bestlag + + # 3. Run Regression + lags = self._lags + + resols = _estimate_df_regression(y_detrended, + lags=lags, + trend='nc') + self._regression = resols + self._nobs = int(resols.nobs) + self._stat = resols.tvalues[0] + self._pvalue = mackinnonp(self._stat, + regression=trend, + dist_type='DFGLS') + critical_values = mackinnoncrit(regression=trend, + nobs=self._nobs, + dist_type='DFGLS') + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'c', 'ct' + + @UnitRootTest.trend.setter + def trend(self, value): + if value not in self.valid_trends: + raise ValueError('trend not understood') + if self._trend != value: + self._reset() + self._trend = value + if value == 'c': + self._c = -7.0 + else: + self._c = -13.5 + + @property + def regression(self): + """Returns the OLS regression results from the ADF model estimated""" + self._compute_if_needed() + return self._regression + + +class PhillipsPerron(UnitRootTest): + """Phillips-Perron unit root test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a unit root + lags : int, non-negative, optional + The number of lags to use in the Newey-West estimator of the long-run + covariance. If omitted or None, the lag length is set automatically to + 12 * (nobs/100) ** (1/4) + trend : str, {'nc', 'c', 'ct'}, optional + The trend component to include in the ADF test + 'nc' - No trend components + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + test_type : str, {'tau', 'rho'} + The test to use when computing the test statistic. 'tau' is based on + the t-stat and 'rho' uses a test based on nobs times the re-centered + regression coefficient + + Attributes + ---------- + stat + pvalues + critical_values + test_type + h0 + hA + summary + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the Phillips-Perron (PP) test is that there is a + unit root, with the alternative that there is no unit root. If the pvalue + is above a critical size, then the null cannot be rejected that there + and the series appears to be a unit root. + + Unlike the ADF test, the regression estimated includes only one lag of + the dependant variable, in addition to trend terms. Any serial + correlation in the regression errors is accounted for using a long-run + variance estimator (currently Newey-West). 
+ + The p-values are obtained through regression surface approximation from + MacKinnon (1994) using the updated 2010 tables. + If the p-value is close to significant, then the critical values should be + used to judge whether to reject the null. + + Examples + -------- + see example script + + References + ---------- + Hamilton, J. D. 1994. Time Series Analysis. Princeton: Princeton + University Press. + + Newey, W. K., and K. D. West. 1987. A simple, positive semidefinite, + heteroskedasticity and autocorrelation consistent covariance matrix. + Econometrica 55, 703-708. + + Phillips, P. C. B., and P. Perron. 1988. Testing for a unit root in + time series regression. Biometrika 75, 335-346. + + P-Values (regression surface approximation) + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. `Journal of Business and Economic + Statistics` 12, 167-76. + + Critical values + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's + University, Dept of Economics, Working Papers. Available at + http://ideas.repec.org/p/qed/wpaper/1227.html + """ + + def __init__(self, y, lags=None, trend='c', test_type='tau'): + super(PhillipsPerron, self).__init__(y, lags, trend) + self._test_type = test_type + self._stat_rho = None + self._stat_tau = None + self._test_name = 'Phillips-Perron' + self._lags = lags + + def _compute_statistic(self): + """Core routine to estimate PP test statistics""" + # 1. Estimate Regression + y, trend = self._y, self._trend + nobs = y.shape[0] + + if self._lags is None: + self._lags = int(ceil(12. * power(nobs / 100., 1 / 4.))) + lags = self._lags + + rhs = y[:-1, None] + lhs = y[1:, None] + if trend != 'nc': + rhs = add_trend(rhs, trend) + + resols = OLS(lhs, rhs).fit() + k = rhs.shape[1] + n, u = resols.nobs, resols.resid + lam2 = cov_nw(u, lags, demean=False) + lam = sqrt(lam2) + # 2. Compute components + s2 = u.dot(u) / (n - k) + s = sqrt(s2) + gamma0 = s2 * (n - k) / n + sigma = resols.bse[0] + sigma2 = sigma ** 2.0 + rho = resols.params[0] + # 3. Compute statistics + self._stat_tau = sqrt(gamma0 / lam2) * ((rho - 1) / sigma) \ + - 0.5 * ((lam2 - gamma0) / lam) * (n * sigma / s) + self._stat_rho = n * (rho - 1) \ + - 0.5 * (n ** 2.0 * sigma2 / s2) * (lam2 - gamma0) + + self._nobs = int(resols.nobs) + if self._test_type == 'rho': + self._stat = self._stat_rho + dist_type = 'ADF-z' + else: + self._stat = self._stat_tau + dist_type = 'ADF-t' + + self._pvalue = mackinnonp(self._stat, + regression=trend, + dist_type=dist_type) + critical_values = mackinnoncrit(regression=trend, + nobs=n, + dist_type=dist_type) + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + self._title = self._test_name + ', Test Type Z-' + self._test_type + + @property + def test_type(self): + """Gets or sets the test type returned by stat. 
+ Valid values are 'tau' or 'rho'""" + return self._test_type + + @test_type.setter + def test_type(self, value): + if value not in ('rho', 'tau'): + raise ValueError('stat must be either ''rho'' or ''tau''.') + self._reset() + self._test_type = value + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'nc', 'c', 'ct' + + +class KPSS(UnitRootTest): + """Kwiatkowski, Phillips, Schmidt and Shin (1992, KPSS) stationarity test + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for stationarity + lags : int, non-negative, optional + The number of lags to use in the Newey-West estimator of the long-run + covariance. If omitted or None, the lag length is set automatically to + 12 * (nobs/100) ** (1/4) + trend : str, {'c', 'ct'}, optional + The trend component to include in the ADF test + 'c' - Include a constant (Default) + 'ct' - Include a constant and linear time trend + + Attributes + ---------- + stat + pvalues + critical_values + test_type + h0 + hA + summary + valid_trends + y + trend + lags + + Notes + ----- + The null hypothesis of the KPSS test is that the series is weakly stationary + and the alternative is that it is non-stationary. If the p-value + is above a critical size, then the null cannot be rejected that there + and the series appears stationary. + + The p-values and critical values were computed using an extensive simulation + based on 100,000,000 replications using series with 2,000 observations. + + Examples + -------- + see example script + + References + ---------- + Kwiatkowski, D.; Phillips, P. C. B.; Schmidt, P.; Shin, Y. (1992). "Testing + the null hypothesis of stationarity against the alternative of a unit root". + Journal of Econometrics 54 (1-3), 159-178 + """ + + def __init__(self, y, lags=None, trend='c'): + super(KPSS, self).__init__(y, lags, trend) + self._test_name = 'KPSS Stationarity Test' + self._h0 = 'The process is weakly stationary.' + self._h1 = 'The process contains a unit root.' + + def _compute_statistic(self): + # 1. Estimate model with trend + nobs, y, trend = self._nobs, self._y, self._trend + z = add_trend(nobs=nobs, trend=trend) + res = OLS(y, z).fit() + # 2. Compute KPSS test + u = res.resid + if self._lags is None: + self._lags = int(ceil(12. * power(nobs / 100., 1 / 4.))) + lam = cov_nw(u, self._lags, demean=False) + s = cumsum(u) + self._stat = 1 / (nobs ** 2.0) * sum(s ** 2.0) / lam + self._nobs = u.shape[0] + self._pvalue, critical_values = kpss_crit(self._stat, trend) + self._critical_values = {"1%": critical_values[0], + "5%": critical_values[1], + "10%": critical_values[2]} + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'c', 'ct' + + +class VarianceRatio(UnitRootTest): + """ + Variance Ratio test of a random walk. + + Parameters + ---------- + y : array-like, (nobs,) + The data to test for a random walk + lags : int, >=2 + The number of periods to used in the multi-period variance, which is the + numerator of the test statistic. Must be at least 2 + trend : str, {'nc', 'c'}, optional + 'c' allows for a non-zero drift in the random walk, while 'nc' requires + that the increments to y are mean 0 + overlap : bool, optional + Indicates whether to use all overlapping blocks. Default is True. If + False, the number of observations in y minus 1 must be an exact multiple + of lags. If this condition is not satistifed, some values at the end of + y will be discarded. 
+ robust : bool, optional + Indicates whether to use heteroskedasticity robust inference. Default is + True. + debiased : bool, optional + Indicates whether to use a debiased version of the test. Default is + True. Only applicable if overlap is True. + + Attributes + ---------- + stat + pvalues + critical_values + test_type + h0 + hA + summary + valid_trends + y + trend + lags + overlap + robust + debiased + + Notes + ----- + The null hypothesis of a VR is that the process is a random walk, possibly + plus drift. Rejection of the null with a positive test statistic indicates + the presence of positive serial correlation in the time series. + + Examples + -------- + see example script + + References + ---------- + Campbell, John Y., Lo, Andrew W. and MacKinlay, A. Craig. (1997) The + Econometrics of Financial Markets. Princeton, NJ: Princeton University + Press. + + """ + + def __init__(self, y, lags=2, trend='c', debiased=True, + robust=True, overlap=True): + if lags < 2: + raise ValueError('lags must be an integer larger than 2') + super(VarianceRatio, self).__init__(y, lags, trend) + self._test_name = 'Variance-Ratio test' + self._h0 = 'The process is a random walk.' + self._h1 = 'The process is not a random walk.' + self._robust = robust + self._debiased = debiased + self._overlap = overlap + self._vr = None + self._stat_variance = None + quantiles = array([.01, .05, .1, .9, .95, .99]) + self._critical_values = {} + for q, cv in zip(quantiles, norm.ppf(quantiles)): + self._critical_values[str(int(100 * q)) + '%'] = cv + + @property + def vr(self): + """The ratio of the long block lags-period variance + to the 1-period variance""" + self._compute_if_needed() + return self._vr + + @property + def overlap(self): + """Sets of gets the indicator to use overlaping returns in the + long-period vairance estimator""" + return self._overlap + + @overlap.setter + def overlap(self, value): + self._reset() + self._overlap = bool(value) + + @property + def robust(self): + """Sets of gets the indicator to use a heteroskedasticity robust + variance estimator """ + return self._robust + + @robust.setter + def robust(self, value): + self._reset() + self._robust = bool(value) + + @property + def debiased(self): + """Sets of gets the indicator to use debiased variances in the ratio""" + return self._debiased + + @debiased.setter + def debiased(self, value): + self._reset() + self._debiased = bool(value) + + def _compute_statistic(self): + overlap, debiased, robust = self._overlap, self._debiased, self._robust + y, nobs, q, trend = self._y, self._nobs, self._lags, self._trend + + nq = nobs - 1 + if not overlap: + # Check length of y + if nq % q != 0: + extra = nq % q + y = y[:-extra] + warnings.warn(invalid_length_doc.format(var='y', + block=q, + drop=extra), + InvalidLengthWarning) + + nobs = y.shape[0] + if trend == 'nc': + mu = 0 + else: + mu = (y[-1] - y[0]) / (nobs - 1) + + delta_y = diff(y) + nq = delta_y.shape[0] + sigma2_1 = sum((delta_y - mu) ** 2.0) / nq + + if not overlap: + delta_y_q = y[q::q] - y[0:-q:q] + sigma2_q = sum((delta_y_q - q * mu) ** 2.0) / nq + else: + delta_y_q = y[q:] - y[:-q] + sigma2_q = sum((delta_y_q - q * mu) ** 2.0) / (nq * q) + + if debiased and overlap: + sigma2_1 *= nq / (nq - 1) + m = q * (nq - q + 1) * (1 - (q / nq)) + sigma2_q *= (nq * q) / m + + if not overlap: + self._stat_variance = 2.0 * (q - 1) + elif not robust: + self._stat_variance = (2 * (2 * q - 1) * (q - 1)) / (2 * q) + else: + z2 = (delta_y - mu) ** 2.0 + scale = sum(z2) ** 2.0 + theta = 0.0 + for k in 
arange(1.0, q): + delta = nq * z2[k:].dot(z2[:-k]) / scale + theta += (1 - k / q) ** 2.0 * delta + self._stat_variance = theta + self._vr = sigma2_q / sigma2_1 + self._stat = sqrt(nq) * (self._vr - 1) / sqrt(self._stat_variance) + self._pvalue = 2 - 2 * norm.cdf(abs(self._stat)) + + @property + def valid_trends(self): + """Returns the list of value trend terms""" + return 'nc', 'c' + + +# Wrapper before deprication +# See: +# Ng and Perron(2001), Lag length selection and the construction of unit root +# tests with good size and power, Econometrica, Vol 69 (6) pp 1519-1554 +# TODO: include drift keyword, only valid with regression == "c" which will +# TODO: change the distribution of the test statistic to a t distribution +def adfuller(x, maxlag=None, regression="c", autolag='AIC', + store=False, regresults=False): + """ + Augmented Dickey-Fuller unit root test + + The Augmented Dickey-Fuller test can be used to test for a unit root in a + univariate process in the presence of serial correlation. + + Parameters + ---------- + x : array_like, 1d + data series + maxlag : int + Maximum lag which is included in test, default 12*(nobs/100)^{1/4} + regression : str {'c','ct','ctt','nc'} + Constant and trend order to include in regression + * 'c' : constant only (default) + * 'ct' : constant and trend + * 'ctt' : constant, and linear and quadratic trend + * 'nc' : no constant, no trend + autolag : {'AIC', 'BIC', 't-stat', None} + * if None, then maxlag lags are used + * if 'AIC' (default) or 'BIC', then the number of lags is chosen + to minimize the corresponding information criterium + * 't-stat' based choice of maxlag. Starts with maxlag and drops a + lag until the t-statistic on the last lag length is significant at + the 95 % level. + store : bool + If True, then an ADF instance is returned additionally to + the adf statistic (default is False) + regresults : bool + If True, the full regression results are returned (default is False) + + Returns + ------- + adf : float + Test statistic + pvalue : float + MacKinnon's approximate p-value based on MacKinnon (1994) + usedlag : int + Number of lags used. + nobs : int + Number of observations used for the ADF regression and calculation of + the critical values. + critical values : dict + Critical values for the test statistic at the 1 %, 5 %, and 10 % + levels. Based on MacKinnon (2010) + icbest : float + The maximized information criterion if autolag is not None. + regresults : RegressionResults instance + The + resstore : (optional) instance of ResultStore + an instance of a dummy class with results attached as attributes + + Notes + ----- + This function has been depricated. Please use ADF instead. + + The null hypothesis of the Augmented Dickey-Fuller is that there is a unit + root, with the alternative that there is no unit root. If the pvalue is + above a critical size, then we cannot reject that there is a unit root. + + The p-values are obtained through regression surface approximation from + MacKinnon 1994, but using the updated 2010 tables. + If the p-value is close to significant, then the critical values should be + used to judge whether to accept or reject the null. + + The autolag option and maxlag for it are described in Greene. + + Examples + -------- + see example script + + References + ---------- + Greene + Hamilton + + + P-Values (regression surface approximation) + MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for + unit-root and cointegration tests. 
`Journal of Business and Economic + Statistics` 12, 167-76. + + Critical values + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's + University, Dept of Economics, Working Papers. Available at + http://ideas.repec.org/p/qed/wpaper/1227.html + + """ + warnings.warn(deprication_doc.format(old_func='adfuller', new_func='ADF'), + DeprecationWarning) + lags = None + if autolag is None: + lags = maxlag + adf = ADF(x, lags=lags, trend=regression, max_lags=maxlag, method=autolag) + + adfstat = adf.stat + pvalue = adf.pvalue + critvalues = adf.critical_values + usedlag = adf.lags + nobs = adf.nobs + icbest = adf._ic_best + resstore = adf + if regresults: + # Work arounds for missing properties + setattr(resstore, 'autolag_results', resstore._autolag_results) + setattr(resstore, 'usedlag', resstore.lags) + return adfstat, pvalue, critvalues, resstore + elif autolag: + return adfstat, pvalue, usedlag, nobs, critvalues, icbest + else: + return adfstat, pvalue, usedlag, nobs, critvalues + + +def mackinnonp(stat, regression="c", num_unit_roots=1, dist_type='ADF-t'): + """ + Returns MacKinnon's approximate p-value for test stat. + + Parameters + ---------- + stat : float + "T-value" from an Augmented Dickey-Fuller or DFGLS regression. + regression : str {"c", "nc", "ct", "ctt"} + This is the method of regression that was used. Following MacKinnon's + notation, this can be "c" for constant, "nc" for no constant, "ct" for + constant and trend, and "ctt" for constant, trend, and trend-squared. + num_unit_roots : int + The number of series believed to be I(1). For (Augmented) Dickey- + Fuller N = 1. + dist_type: str, {'ADF-t', 'ADF-z', 'DFGLS'} + The test type to use when computing p-values. Options include + 'ADF-t' - ADF t-stat based tests + 'ADF-z' - ADF z tests + 'DFGLS' - GLS detrended Dickey Fuller + + Returns + ------- + p-value : float + The p-value for the ADF statistic estimated using MacKinnon 1994. + + References + ---------- + MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for + Unit-Root and Cointegration Tests." Journal of Business & Economics + Statistics, 12.2, 167-76. + + Notes + ----- + Most values are from MacKinnon (1994). Values for DFGLS test statistics + and the 'nc' version of the ADF z test statistic were computed following + the methodology of MacKinnon (1994). 
+ """ + dist_type = dist_type.lower() + if num_unit_roots > 1 and dist_type.lower() != 'adf-t': + raise ValueError('Cointegration results (num_unit_roots > 1) are' + + 'only available for ADF-t values') + if dist_type == 'adf-t': + maxstat = tau_max[regression][num_unit_roots - 1] + minstat = tau_min[regression][num_unit_roots - 1] + starstat = tau_star[regression][num_unit_roots - 1] + small_p = tau_small_p[regression][num_unit_roots - 1] + large_p = tau_large_p[regression][num_unit_roots - 1] + elif dist_type == 'adf-z': + maxstat = adf_z_max[regression] + minstat = adf_z_min[regression] + starstat = adf_z_star[regression] + small_p = adf_z_small_p[regression] + large_p = adf_z_large_p[regression] + elif dist_type == 'dfgls': + maxstat = dfgls_tau_max[regression] + minstat = dfgls_tau_min[regression] + starstat = dfgls_tau_star[regression] + small_p = dfgls_small_p[regression] + large_p = dfgls_large_p[regression] + else: + raise ValueError('Unknown test type {0}'.format(dist_type)) + + if stat > maxstat: + return 1.0 + elif stat < minstat: + return 0.0 + if stat <= starstat: + poly_coef = small_p + if dist_type == 'adf-z': + stat = log(abs(stat)) # Transform stat for small p ADF-z + else: + poly_coef = large_p + return norm.cdf(polyval(poly_coef[::-1], stat)) + + +def mackinnoncrit(num_unit_roots=1, regression="c", nobs=inf, + dist_type='ADF-t'): + """ + Returns the critical values for cointegrating and the ADF test. + + In 2010 MacKinnon updated the values of his 1994 paper with critical values + for the augmented Dickey-Fuller tests. These new values are to be + preferred and are used here. + + Parameters + ---------- + num_unit_roots : int + The number of series of I(1) series for which the null of + non-cointegration is being tested. For N > 12, the critical values + are linearly interpolated (not yet implemented). For the ADF test, + N = 1. + reg : str {'c', 'tc', 'ctt', 'nc'} + Following MacKinnon (1996), these stand for the type of regression run. + 'c' for constant and no trend, 'tc' for constant with a linear trend, + 'ctt' for constant with a linear and quadratic trend, and 'nc' for + no constant. The values for the no constant case are taken from the + 1996 paper, as they were not updated for 2010 due to the unrealistic + assumptions that would underlie such a case. + nobs : int or np.inf + This is the sample size. If the sample size is numpy.inf, then the + asymptotic critical values are returned. + + Returns + ------- + crit_vals : array + Three critical values corresponding to 1%, 5% and 10% cut-offs. + + Notes + ----- + Results for ADF t-stats from MacKinnon (1994,2010). Results for DFGLS and + ADF z-tests use the same methodology as MacKinnon. + + References + ---------- + MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for + Unit-Root and Cointegration Tests." Journal of Business & Economics + Statistics, 12.2, 167-76. + MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." + Queen's University, Dept of Economics Working Papers 1227. 
+    http://ideas.repec.org/p/qed/wpaper/1227.html
+    """
+    dist_type = dist_type.lower()
+    valid_regression = ['c', 'ct', 'nc', 'ctt']
+    if dist_type == 'dfgls':
+        valid_regression = ['c', 'ct']
+    if regression not in valid_regression:
+        raise ValueError(
+            "regression keyword {0} not understood".format(regression))
+
+    if dist_type == 'adf-t':
+        asymptotic_cv = tau_2010[regression][num_unit_roots - 1, :, 0]
+        poly_coef = tau_2010[regression][num_unit_roots - 1, :, :].T
+    elif dist_type == 'adf-z':
+        poly_coef = adf_z_cv_approx[regression].T
+        asymptotic_cv = adf_z_cv_approx[regression][:, 0]
+    elif dist_type == 'dfgls':
+        poly_coef = dfgls_cv_approx[regression].T
+        asymptotic_cv = dfgls_cv_approx[regression][:, 0]
+    else:
+        raise ValueError('Unknown test type {0}'.format(dist_type))
+
+    if nobs is inf:
+        return asymptotic_cv
+    else:
+        # Flip so the polynomial runs from the highest power to the lowest
+        return polyval(poly_coef[::-1], 1. / nobs)
+
+
+def kpss_crit(stat, trend='c'):
+    """
+    Linear interpolation for KPSS p-values and critical values
+
+    Parameters
+    ----------
+    stat : float
+        The KPSS test statistic.
+    trend : str, {'c', 'ct'}
+        The trend used when computing the KPSS statistic
+
+    Returns
+    -------
+    pvalue : float
+        The interpolated p-value
+    crit_val : array
+        Three element array containing the 1%, 5% and 10% critical values,
+        in order
+
+    Notes
+    -----
+    The p-values are linearly interpolated from the quantiles of the
+    simulated KPSS test statistic distribution using 100,000,000
+    replications and series with 2,000 observations.
+    """
+    table = kpss_critical_values[trend]
+    y = table[:, 0]
+    x = table[:, 1]
+    pvalue = interp(stat, x, y)
+    cv = [1.0, 5.0, 10.0]
+    crit_value = interp(cv, y[::-1], x[::-1])
+
+    return pvalue, crit_value
+
+
+if __name__ == '__main__':
+    pass
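A minimal usage sketch of the new tests follows. It is not part of the patch: the simulated random walk and the variable names are purely illustrative, and it assumes that the few numpy names unitroot.py uses but does not import at the top (asarray, array, inf) are added to its import block.

    import numpy as np
    from statsmodels.tsa.unitroot import (ADF, DFGLS, KPSS, PhillipsPerron,
                                          VarianceRatio, mackinnoncrit)

    np.random.seed(0)
    y = np.cumsum(np.random.standard_normal(500))  # a random walk

    adf = ADF(y, trend='ct', method='AIC')   # lag length selected by AIC
    print(adf.stat, adf.pvalue, adf.lags)
    print(adf.summary())                     # statistic, p-value, critical values
    adf.trend = 'c'                          # changing the trend resets and recomputes

    dfgls = DFGLS(y, trend='ct')             # GLS-detrended Dickey-Fuller
    kpss = KPSS(y, trend='c')                # null here is stationarity
    pp = PhillipsPerron(y, test_type='rho')  # Z-rho variant of the PP test
    vr = VarianceRatio(y, lags=4)            # overlapping blocks by default
    print(dfgls.stat, kpss.stat, pp.stat, vr.vr)

    # Asymptotic and finite-sample 1%/5%/10% ADF critical values
    print(mackinnoncrit(regression='ct'))
    print(mackinnoncrit(regression='ct', nobs=250))

As the base class shows, the statistics are computed lazily: accessing stat, pvalue or critical_values triggers _compute_if_needed, and setting lags or trend resets the cached result.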