diff --git a/docs/source/release/version0.13.3.rst b/docs/source/release/version0.13.3.rst
index e02d5303b8d..1a6ad35cd10 100644
--- a/docs/source/release/version0.13.3.rst
+++ b/docs/source/release/version0.13.3.rst
@@ -30,45 +30,16 @@ Stats
 The Highlights
 ==============
-
-
-What's new - an overview
-========================
-
-The following lists the main new features of statsmodels 0.13.3. In addition,
-release 0.13.3 includes bug fixes, refactorings and improvements in many areas.
+This is a Python 3.11 compatibility release only. There are no significant
+new features or bug fixes.
 
 Submodules
 ----------
-
 ``maintenance``
 ~~~~~~~~~~~~~~~
 - Backport Python 3.11 to 0.13.x branch (:pr:`8484`)
-
-
-
-
-bug-wrong
----------
-
-A new issue label `type-bug-wrong` indicates bugs that cause that incorrect
-numbers are returned without warnings.
-(Regular bugs are mostly usability bugs or bugs that raise an exception for
-unsupported use cases.)
-`see tagged issues `_
-
-
-Major Bugs Fixed
-================
-
-See github issues for a list of bug fixes included in this release
-
-- `Closed bugs `_
-- `Closed bugs (wrong result) `_
-
-
 Development summary and credits
 ===============================
diff --git a/requirements-dev.txt b/requirements-dev.txt
index e0845a518fe..96425f5b7e4 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,6 +7,7 @@ oldest-supported-numpy>=2022.4.18
 matplotlib>=3
 colorama
 joblib
+Jinja2
 
 # Remove due to failures on azure CI
 # cvxopt; os_name != "win32"
diff --git a/statsmodels/sandbox/distributions/__init__.py b/statsmodels/sandbox/distributions/__init__.py
index 49b1d3002f3..dd786862bd2 100644
--- a/statsmodels/sandbox/distributions/__init__.py
+++ b/statsmodels/sandbox/distributions/__init__.py
@@ -1,4 +1,4 @@
-'''temporary location for enhancements to scipy.stats
+"""temporary location for enhancements to scipy.stats
 
 includes
 ^^^^^^^^
@@ -20,4 +20,4 @@
 mixed status : from not-working to well-tested
 
-'''
+"""
diff --git a/statsmodels/sandbox/distributions/estimators.py b/statsmodels/sandbox/distributions/estimators.py
index 1dcb173bdf9..0a0384eb4b1 100644
--- a/statsmodels/sandbox/distributions/estimators.py
+++ b/statsmodels/sandbox/distributions/estimators.py
@@ -1,4 +1,4 @@
-'''estimate distribution parameters by various methods
+"""estimate distribution parameters by various methods
 
 method of moments or matching quantiles, and Maximum Likelihood estimation
 based on binned data and Maximum Product-of-Spacings
@@ -85,17 +85,17 @@
 changes: added Maximum Product-of-Spacings
 2010-05-12
 
-'''
+"""
 
 import numpy as np
-from scipy import stats, optimize, special
+from scipy import optimize, special, stats
 
-cache = {}   #module global storage for temp results, not used
+cache = {}  # module global storage for temp results, not used
 
 # the next two use distfn from module scope - not anymore
 def gammamomentcond(distfn, params, mom2, quantile=None):
-    '''estimate distribution parameters based method of moments (mean,
+    """estimate distribution parameters based on the method of moments (mean,
     variance) for distributions with 1 shape parameter and fixed loc=0.
 
     Returns
@@ -106,16 +106,19 @@ def gammamomentcond(distfn, params, mom2, quantile=None):
     -----
     first test version, quantile argument not used
 
-    '''
+    """
+
     def cond(params):
         alpha, scale = params
-        mom2s = distfn.stats(alpha, 0.,scale)
-        #quantil
-        return np.array(mom2)-mom2s
+        mom2s = distfn.stats(alpha, 0.0, scale)
+        # quantile
+        return np.array(mom2) - mom2s
+
     return cond
 
+
 def gammamomentcond2(distfn, params, mom2, quantile=None):
-    '''estimate distribution parameters based method of moments (mean,
+    """estimate distribution parameters based on the method of moments (mean,
     variance) for distributions with 1 shape parameter and fixed loc=0.
 
     Returns
@@ -129,16 +132,15 @@ def gammamomentcond2(distfn, params, mom2, quantile=None):
     The only difference to previous function is return type.
 
-    '''
+    """
     alpha, scale = params
-    mom2s = distfn.stats(alpha, 0.,scale)
-    return np.array(mom2)-mom2s
-
+    mom2s = distfn.stats(alpha, 0.0, scale)
+    return np.array(mom2) - mom2s
 
 ######### fsolve does not move in small samples, fmin not very accurate
 def momentcondunbound(distfn, params, mom2, quantile=None):
-    '''moment conditions for estimating distribution parameters using method
+    """moment conditions for estimating distribution parameters using the method
     of moments, uses mean, variance and one quantile for distributions
     with 1 shape parameter.
 
     Returns
@@ -147,12 +149,12 @@ def momentcondunbound(distfn, params, mom2, quantile=None):
     difference : ndarray
         difference between theoretical and empirical moments and quantiles
 
-    '''
+    """
     shape, loc, scale = params
-    mom2diff = np.array(distfn.stats(shape, loc,scale)) - mom2
+    mom2diff = np.array(distfn.stats(shape, loc, scale)) - mom2
     if quantile is not None:
         pq, xq = quantile
-        #ppfdiff = distfn.ppf(pq, alpha)
+        # ppfdiff = distfn.ppf(pq, alpha)
         cdfdiff = distfn.cdf(xq, shape, loc, scale) - pq
         return np.concatenate([mom2diff, cdfdiff[:1]])
     return mom2diff
 
@@ -160,7 +162,7 @@
 
 ###### loc scale only
 def momentcondunboundls(distfn, params, mom2, quantile=None, shape=None):
-    '''moment conditions for estimating loc and scale of a distribution
+    """moment conditions for estimating loc and scale of a distribution
     with method of moments using either 2 quantiles or 2 moments (not both).
 
     Returns
@@ -168,24 +170,24 @@
     difference : ndarray
         difference between theoretical and empirical moments or quantiles
 
-    '''
+    """
     loc, scale = params
     mom2diff = np.array(distfn.stats(shape, loc, scale)) - mom2
     if quantile is not None:
         pq, xq = quantile
-        #ppfdiff = distfn.ppf(pq, alpha)
+        # ppfdiff = distfn.ppf(pq, alpha)
         cdfdiff = distfn.cdf(xq, shape, loc, scale) - pq
-        #return np.concatenate([mom2diff, cdfdiff[:1]])
+        # return np.concatenate([mom2diff, cdfdiff[:1]])
         return cdfdiff
     return mom2diff
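
A minimal usage sketch of the moment-condition helpers above (not part of the
patch itself): they return residuals meant to be passed to a root finder.
This mirrors the gamma example in this file's __main__ block further down;
the sample size and start values here are arbitrary.

    import numpy as np
    from scipy import optimize, stats

    from statsmodels.sandbox.distributions.estimators import gammamomentcond2

    grvs = stats.gamma.rvs(2, 0.0, 2.0, size=1000)   # simulated gamma data
    mom2 = np.array([grvs.mean(), grvs.var()])       # sample mean and variance
    # solve theoretical moments(shape, scale) - sample moments = 0
    parest = optimize.fsolve(
        lambda params: gammamomentcond2(stats.gamma, params, mom2), [1.0, 3.0]
    )
    print(parest)  # estimated (shape, scale)
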
 
-
 ######### try quantile GMM with identity weight matrix
-#(just a guess that's what it is
+# (just a guess that's what it is
+
 
 def momentcondquant(distfn, params, mom2, quantile=None, shape=None):
-    '''moment conditions for estimating distribution parameters by matching
+    """moment conditions for estimating distribution parameters by matching
     quantiles, defines as many moment conditions as quantiles.
 
     Returns
@@ -198,44 +200,49 @@ def momentcondquant(distfn, params, mom2, quantile=None, shape=None):
     This can be used for method of moments or for generalized method of
     moments.
 
-    '''
-    #this check looks redundant/unused know
+    """
+    # this check looks redundant/unused now
    if len(params) == 2:
         loc, scale = params
     elif len(params) == 3:
         shape, loc, scale = params
     else:
-        #raise NotImplementedError
-        pass #see whether this might work, seems to work for beta with 2 shape args
+        # raise NotImplementedError
+        pass  # see whether this might work, seems to work for beta with 2 shape args
 
-    #mom2diff = np.array(distfn.stats(*params)) - mom2
-    #if not quantile is None:
+    # mom2diff = np.array(distfn.stats(*params)) - mom2
+    # if not quantile is None:
     pq, xq = quantile
-    #ppfdiff = distfn.ppf(pq, alpha)
+    # ppfdiff = distfn.ppf(pq, alpha)
     cdfdiff = distfn.cdf(xq, *params) - pq
-    #return np.concatenate([mom2diff, cdfdiff[:1]])
+    # return np.concatenate([mom2diff, cdfdiff[:1]])
     return cdfdiff
-    #return mom2diff
+    # return mom2diff
+
+
 def fitquantilesgmm(distfn, x, start=None, pquant=None, frozen=None):
     if pquant is None:
-        pquant = np.array([0.01, 0.05,0.1,0.4,0.6,0.9,0.95,0.99])
+        pquant = np.array([0.01, 0.05, 0.1, 0.4, 0.6, 0.9, 0.95, 0.99])
     if start is None:
-        if hasattr(distfn, '_fitstart'):
+        if hasattr(distfn, "_fitstart"):
             start = distfn._fitstart(x)
         else:
-            start = [1]*distfn.numargs + [0.,1.]
-    #TODO: vectorize this:
-    xqs = [stats.scoreatpercentile(x, p) for p in pquant*100]
+            start = [1] * distfn.numargs + [0.0, 1.0]
+    # TODO: vectorize this:
+    xqs = [stats.scoreatpercentile(x, p) for p in pquant * 100]
     mom2s = None
-    parest = optimize.fmin(lambda params:np.sum(
-        momentcondquant(distfn, params, mom2s,(pquant,xqs), shape=None)**2), start)
+    parest = optimize.fmin(
+        lambda params: np.sum(
+            momentcondquant(distfn, params, mom2s, (pquant, xqs), shape=None)
+            ** 2
+        ),
+        start,
+    )
     return parest
 
-
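
Again illustrative only, not part of the change: fitquantilesgmm as defined
above matches a default grid of sample quantiles by minimizing the summed
squared cdf differences from momentcondquant. This mirrors the t-distribution
call in the __main__ section further down; the start values are arbitrary.

    from scipy import stats

    from statsmodels.sandbox.distributions.estimators import fitquantilesgmm

    trvs = stats.t.rvs(5, 0, 1, size=1000)            # t(5) sample
    parest = fitquantilesgmm(stats.t, trvs, start=[10, 1.0, 2.0])
    print(parest)                                     # roughly (df, loc, scale)
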
 
 def fitbinned(distfn, freq, binedges, start, fixed=None):
-    '''estimate parameters of distribution function for binned data using MLE
+    """estimate parameters of distribution function for binned data using MLE
 
     Parameters
     ----------
@@ -259,24 +266,30 @@
     added factorial
 
-    '''
+    """
     if fixed is not None:
         raise NotImplementedError
     nobs = np.sum(freq)
-    lnnobsfact = special.gammaln(nobs+1)
+    lnnobsfact = special.gammaln(nobs + 1)
 
     def nloglike(params):
-        '''negative loglikelihood function of binned data
+        """negative loglikelihood function of binned data
 
         corresponds to multinomial
-        '''
+        """
         prob = np.diff(distfn.cdf(binedges, *params))
-        return -(lnnobsfact + np.sum(freq*np.log(prob)- special.gammaln(freq+1)))
+        return -(
+            lnnobsfact
+            + np.sum(freq * np.log(prob) - special.gammaln(freq + 1))
+        )
+
     return optimize.fmin(nloglike, start)
 
 
-def fitbinnedgmm(distfn, freq, binedges, start, fixed=None, weightsoptimal=True):
-    '''estimate parameters of distribution function for binned data using GMM
+def fitbinnedgmm(
+    distfn, freq, binedges, start, fixed=None, weightsoptimal=True
+):
+    """estimate parameters of distribution function for binned data using GMM
 
     Parameters
     ----------
@@ -305,28 +318,30 @@
     added factorial
 
-    '''
+    """
     if fixed is not None:
         raise NotImplementedError
     nobs = np.sum(freq)
     if weightsoptimal:
-        weights = freq/float(nobs)
+        weights = freq / float(nobs)
     else:
         weights = np.ones(len(freq))
-    freqnormed = freq/float(nobs)
+    freqnormed = freq / float(nobs)
     # skip turning weights into matrix diag(freq/float(nobs))
 
     def gmmobjective(params):
-        '''negative loglikelihood function of binned data
+        """negative loglikelihood function of binned data
 
         corresponds to multinomial
-        '''
+        """
         prob = np.diff(distfn.cdf(binedges, *params))
         momcond = freqnormed - prob
-        return np.dot(momcond*weights, momcond)
+        return np.dot(momcond * weights, momcond)
+
     return optimize.fmin(gmmobjective, start)
 
-#Addition from try_maxproductspacings:
+
+# Addition from try_maxproductspacings:
 """Estimating Parameters of Log-Normal Distribution with Maximum Likelihood
 and Maximum Product-of-Spacings
 
@@ -337,16 +352,19 @@ def gmmobjective(params):
 License: BSD
 """
 
+
 def hess_ndt(fun, pars, args, options):
     import numdifftools as ndt
-    if not ('stepMax' in options or 'stepFix' in options):
-        options['stepMax'] = 1e-5
+
+    if not ("stepMax" in options or "stepFix" in options):
+        options["stepMax"] = 1e-5
     f = lambda params: fun(params, *args)
     h = ndt.Hessian(f, **options)
     return h(pars), h
 
+
 def logmps(params, xsorted, dist):
-    '''calculate negative log of Product-of-Spacings
+    """calculate negative log of Product-of-Spacings
 
     Parameters
     ----------
@@ -366,13 +384,14 @@
     Notes
     -----
     MPS definition from JKB page 233
-    '''
-    xcdf = np.r_[0., dist.cdf(xsorted, *params), 1.]
+    """
+    xcdf = np.r_[0.0, dist.cdf(xsorted, *params), 1.0]
     D = np.diff(xcdf)
     return -np.log(D).mean()
 
+
 def getstartparams(dist, data):
-    '''get starting values for estimation of distribution parameters
+    """get starting values for estimation of distribution parameters
 
     Parameters
     ----------
@@ -389,19 +408,20 @@
     preliminary estimate or starting value for the parameters of the
     distribution given the data, including loc and scale
 
-    '''
-    if hasattr(dist, 'fitstart'):
-        #x0 = getattr(dist, 'fitstart')(data)
+    """
+    if hasattr(dist, "fitstart"):
+        # x0 = getattr(dist, 'fitstart')(data)
         x0 = dist.fitstart(data)
     else:
         if np.isfinite(dist.a):
-            x0 = np.r_[[1.]*dist.numargs, (data.min()-1), 1.]
+            x0 = np.r_[[1.0] * dist.numargs, (data.min() - 1), 1.0]
         else:
-            x0 = np.r_[[1.]*dist.numargs, (data.mean()-1), 1.]
+ x0 = np.r_[[1.0] * dist.numargs, (data.mean() - 1), 1.0] return x0 + def fit_mps(dist, data, x0=None): - '''Estimate distribution parameters with Maximum Product-of-Spacings + """Estimate distribution parameters with Maximum Product-of-Spacings Parameters ---------- @@ -419,93 +439,123 @@ def fit_mps(dist, data, x0=None): including loc and scale - ''' + """ xsorted = np.sort(data) if x0 is None: x0 = getstartparams(dist, xsorted) args = (xsorted, dist) print(x0) - #print(args) + # print(args) return optimize.fmin(logmps, x0, args=args) +if __name__ == "__main__": -if __name__ == '__main__': + # Example: gamma - distribution + # ----------------------------- - #Example: gamma - distribution - #----------------------------- - - print('\n\nExample: gamma Distribution') - print( '---------------------------') + print("\n\nExample: gamma Distribution") + print("---------------------------") alpha = 2 xq = [0.5, 4] pq = [0.1, 0.9] print(stats.gamma.ppf(pq, alpha)) xq = stats.gamma.ppf(pq, alpha) - print(np.diff((stats.gamma.ppf(pq, np.linspace(0.01,4,10)[:,None])*xq[::-1]))) - #optimize.bisect(lambda alpha: np.diff((stats.gamma.ppf(pq, alpha)*xq[::-1]))) - print(optimize.fsolve(lambda alpha: np.diff((stats.gamma.ppf(pq, alpha)*xq[::-1])), 3.)) + print( + np.diff( + (stats.gamma.ppf(pq, np.linspace(0.01, 4, 10)[:, None]) * xq[::-1]) + ) + ) + # optimize.bisect(lambda alpha: np.diff((stats.gamma.ppf(pq, alpha)*xq[::-1]))) + print( + optimize.fsolve( + lambda alpha: np.diff((stats.gamma.ppf(pq, alpha) * xq[::-1])), 3.0 + ) + ) distfn = stats.gamma - mcond = gammamomentcond(distfn, [5.,10], mom2=stats.gamma.stats(alpha, 0.,1.), quantile=None) - print(optimize.fsolve(mcond, [1.,2.])) - mom2 = stats.gamma.stats(alpha, 0.,1.) - print(optimize.fsolve(lambda params:gammamomentcond2(distfn, params, mom2), [1.,2.])) - - grvs = stats.gamma.rvs(alpha, 0.,2., size=1000) + mcond = gammamomentcond( + distfn, + [5.0, 10], + mom2=stats.gamma.stats(alpha, 0.0, 1.0), + quantile=None, + ) + print(optimize.fsolve(mcond, [1.0, 2.0])) + mom2 = stats.gamma.stats(alpha, 0.0, 1.0) + print( + optimize.fsolve( + lambda params: gammamomentcond2(distfn, params, mom2), [1.0, 2.0] + ) + ) + + grvs = stats.gamma.rvs(alpha, 0.0, 2.0, size=1000) mom2 = np.array([grvs.mean(), grvs.var()]) - alphaestq = optimize.fsolve(lambda params:gammamomentcond2(distfn, params, mom2), [1.,3.]) + alphaestq = optimize.fsolve( + lambda params: gammamomentcond2(distfn, params, mom2), [1.0, 3.0] + ) print(alphaestq) - print('scale = ', xq/stats.gamma.ppf(pq, alphaestq)) - + print("scale = ", xq / stats.gamma.ppf(pq, alphaestq)) - #Example beta - distribution - #--------------------------- + # Example beta - distribution + # --------------------------- - #Warning: this example had cut-and-paste errors + # Warning: this example had cut-and-paste errors - print('\n\nExample: beta Distribution') - print( '--------------------------') + print("\n\nExample: beta Distribution") + print("--------------------------") - #monkey patching : -## if hasattr(stats.beta, '_fitstart'): -## del stats.beta._fitstart #bug in _fitstart #raises AttributeError: _fitstart - #stats.distributions.beta_gen._fitstart = lambda self, data : np.array([1,1,0,1]) - #_fitstart seems to require a tuple - stats.distributions.beta_gen._fitstart = lambda self, data : (5,5,0,1) + # monkey patching : + ## if hasattr(stats.beta, '_fitstart'): + ## del stats.beta._fitstart #bug in _fitstart #raises AttributeError: _fitstart + # stats.distributions.beta_gen._fitstart = lambda self, data : 
np.array([1,1,0,1]) + # _fitstart seems to require a tuple + stats.distributions.beta_gen._fitstart = lambda self, data: (5, 5, 0, 1) - pq = np.array([0.01, 0.05,0.1,0.4,0.6,0.9,0.95,0.99]) - #rvsb = stats.beta.rvs(0.5,0.15,size=200) - rvsb = stats.beta.rvs(10,15,size=2000) - print('true params', 10, 15, 0, 1) + pq = np.array([0.01, 0.05, 0.1, 0.4, 0.6, 0.9, 0.95, 0.99]) + # rvsb = stats.beta.rvs(0.5,0.15,size=200) + rvsb = stats.beta.rvs(10, 15, size=2000) + print("true params", 10, 15, 0, 1) print(stats.beta.fit(rvsb)) - xqsb = [stats.scoreatpercentile(rvsb, p) for p in pq*100] + xqsb = [stats.scoreatpercentile(rvsb, p) for p in pq * 100] mom2s = np.array([rvsb.mean(), rvsb.var()]) - betaparest_gmmquantile = optimize.fmin(lambda params:np.sum(momentcondquant(stats.beta, params, mom2s,(pq,xqsb), shape=None)**2), - [10,10, 0., 1.], maxiter=2000) - print('betaparest_gmmquantile', betaparest_gmmquantile) - #result sensitive to initial condition - - - #Example t - distribution - #------------------------ - - print('\n\nExample: t Distribution') - print( '-----------------------') + betaparest_gmmquantile = optimize.fmin( + lambda params: np.sum( + momentcondquant(stats.beta, params, mom2s, (pq, xqsb), shape=None) + ** 2 + ), + [10, 10, 0.0, 1.0], + maxiter=2000, + ) + print("betaparest_gmmquantile", betaparest_gmmquantile) + # result sensitive to initial condition + + # Example t - distribution + # ------------------------ + + print("\n\nExample: t Distribution") + print("-----------------------") nobs = 1000 distfn = stats.t - pq = np.array([0.1,0.9]) + pq = np.array([0.1, 0.9]) paramsdgp = (5, 0, 1) trvs = distfn.rvs(5, 0, 1, size=nobs) - xqs = [stats.scoreatpercentile(trvs, p) for p in pq*100] + xqs = [stats.scoreatpercentile(trvs, p) for p in pq * 100] mom2th = distfn.stats(*paramsdgp) mom2s = np.array([trvs.mean(), trvs.var()]) - tparest_gmm3quantilefsolve = optimize.fsolve(lambda params:momentcondunbound(distfn,params, mom2s,(pq,xqs)), [10,1.,2.]) - print('tparest_gmm3quantilefsolve', tparest_gmm3quantilefsolve) - tparest_gmm3quantile = optimize.fmin(lambda params:np.sum(momentcondunbound(distfn,params, mom2s,(pq,xqs))**2), [10,1.,2.]) - print('tparest_gmm3quantile', tparest_gmm3quantile) + tparest_gmm3quantilefsolve = optimize.fsolve( + lambda params: momentcondunbound(distfn, params, mom2s, (pq, xqs)), + [10, 1.0, 2.0], + ) + print("tparest_gmm3quantilefsolve", tparest_gmm3quantilefsolve) + tparest_gmm3quantile = optimize.fmin( + lambda params: np.sum( + momentcondunbound(distfn, params, mom2s, (pq, xqs)) ** 2 + ), + [10, 1.0, 2.0], + ) + print("tparest_gmm3quantile", tparest_gmm3quantile) print(distfn.fit(trvs)) ## @@ -517,65 +567,93 @@ def fit_mps(dist, data, x0=None): ##xqs = [stats.scoreatpercentile(trvs, p) for p in pq*100] ##mom2th = distfn.stats(*paramsdgp) ##mom2s = np.array([trvs.mean(), trvs.var()]) - print(optimize.fsolve(lambda params:momentcondunboundls(distfn, params, mom2s,shape=5), [1.,2.])) - print(optimize.fmin(lambda params:np.sum(momentcondunboundls(distfn, params, mom2s,shape=5)**2), [1.,2.])) + print( + optimize.fsolve( + lambda params: momentcondunboundls(distfn, params, mom2s, shape=5), + [1.0, 2.0], + ) + ) + print( + optimize.fmin( + lambda params: np.sum( + momentcondunboundls(distfn, params, mom2s, shape=5) ** 2 + ), + [1.0, 2.0], + ) + ) print(distfn.fit(trvs)) - #loc, scale, based on quantiles - print(optimize.fsolve(lambda params:momentcondunboundls(distfn, params, mom2s,(pq,xqs),shape=5), [1.,2.])) + # loc, scale, based on quantiles + print( + 
optimize.fsolve( + lambda params: momentcondunboundls( + distfn, params, mom2s, (pq, xqs), shape=5 + ), + [1.0, 2.0], + ) + ) ## - pq = np.array([0.01, 0.05,0.1,0.4,0.6,0.9,0.95,0.99]) - #paramsdgp = (5, 0, 1) - xqs = [stats.scoreatpercentile(trvs, p) for p in pq*100] - tparest_gmmquantile = optimize.fmin(lambda params:np.sum(momentcondquant(distfn, params, mom2s,(pq,xqs), shape=None)**2), [10, 1.,2.]) - print('tparest_gmmquantile', tparest_gmmquantile) - tparest_gmmquantile2 = fitquantilesgmm(distfn, trvs, start=[10, 1.,2.], pquant=None, frozen=None) - print('tparest_gmmquantile2', tparest_gmmquantile2) - + pq = np.array([0.01, 0.05, 0.1, 0.4, 0.6, 0.9, 0.95, 0.99]) + # paramsdgp = (5, 0, 1) + xqs = [stats.scoreatpercentile(trvs, p) for p in pq * 100] + tparest_gmmquantile = optimize.fmin( + lambda params: np.sum( + momentcondquant(distfn, params, mom2s, (pq, xqs), shape=None) ** 2 + ), + [10, 1.0, 2.0], + ) + print("tparest_gmmquantile", tparest_gmmquantile) + tparest_gmmquantile2 = fitquantilesgmm( + distfn, trvs, start=[10, 1.0, 2.0], pquant=None, frozen=None + ) + print("tparest_gmmquantile2", tparest_gmmquantile2) ## - - #use trvs from before - bt = stats.t.ppf(np.linspace(0,1,21),5) - ft,bt = np.histogram(trvs,bins=bt) - print('fitbinned t-distribution') + # use trvs from before + bt = stats.t.ppf(np.linspace(0, 1, 21), 5) + ft, bt = np.histogram(trvs, bins=bt) + print("fitbinned t-distribution") tparest_mlebinew = fitbinned(stats.t, ft, bt, [10, 0, 1]) tparest_gmmbinewidentity = fitbinnedgmm(stats.t, ft, bt, [10, 0, 1]) - tparest_gmmbinewoptimal = fitbinnedgmm(stats.t, ft, bt, [10, 0, 1], weightsoptimal=False) + tparest_gmmbinewoptimal = fitbinnedgmm( + stats.t, ft, bt, [10, 0, 1], weightsoptimal=False + ) print(paramsdgp) - #Note: this can be used for chisquare test and then has correct asymptotic + # Note: this can be used for chisquare test and then has correct asymptotic # distribution for a distribution with estimated parameters, find ref again - #TODO combine into test with binning included, check rule for number of bins + # TODO combine into test with binning included, check rule for number of bins - #bt2 = stats.t.ppf(np.linspace(trvs.,1,21),5) - ft2,bt2 = np.histogram(trvs,bins=50) - 'fitbinned t-distribution' + # bt2 = stats.t.ppf(np.linspace(trvs.,1,21),5) + ft2, bt2 = np.histogram(trvs, bins=50) + "fitbinned t-distribution" tparest_mlebinel = fitbinned(stats.t, ft2, bt2, [10, 0, 1]) tparest_gmmbinelidentity = fitbinnedgmm(stats.t, ft2, bt2, [10, 0, 1]) - tparest_gmmbineloptimal = fitbinnedgmm(stats.t, ft2, bt2, [10, 0, 1], weightsoptimal=False) + tparest_gmmbineloptimal = fitbinnedgmm( + stats.t, ft2, bt2, [10, 0, 1], weightsoptimal=False + ) tparest_mle = stats.t.fit(trvs) np.set_printoptions(precision=6) - print('sample size', nobs) - print('true (df, loc, scale) ', paramsdgp) - print('parest_mle ', tparest_mle) + print("sample size", nobs) + print("true (df, loc, scale) ", paramsdgp) + print("parest_mle ", tparest_mle) print - print('tparest_mlebinel ', tparest_mlebinel) - print('tparest_gmmbinelidentity ', tparest_gmmbinelidentity) - print('tparest_gmmbineloptimal ', tparest_gmmbineloptimal) + print("tparest_mlebinel ", tparest_mlebinel) + print("tparest_gmmbinelidentity ", tparest_gmmbinelidentity) + print("tparest_gmmbineloptimal ", tparest_gmmbineloptimal) print - print('tparest_mlebinew ', tparest_mlebinew) - print('tparest_gmmbinewidentity ', tparest_gmmbinewidentity) - print('tparest_gmmbinewoptimal ', tparest_gmmbinewoptimal) + print("tparest_mlebinew ", 
tparest_mlebinew)
+    print("tparest_gmmbinewidentity ", tparest_gmmbinewidentity)
+    print("tparest_gmmbinewoptimal ", tparest_gmmbinewoptimal)
     print
-    print('tparest_gmmquantileidentity', tparest_gmmquantile)
-    print('tparest_gmm3quantilefsolve ', tparest_gmm3quantilefsolve)
-    print('tparest_gmm3quantile ', tparest_gmm3quantile)
+    print("tparest_gmmquantileidentity", tparest_gmmquantile)
+    print("tparest_gmm3quantilefsolve ", tparest_gmm3quantilefsolve)
+    print("tparest_gmm3quantile ", tparest_gmm3quantile)
 
-    ''' example results:
+    """ example results:
     standard error for df estimate looks large
     note: I do not impose that df is an integer, (b/c not necessary)
     need Monte Carlo to check variance of estimators
@@ -596,82 +674,89 @@ def fit_mps(dist, data, x0=None):
     tparest_gmmquantileidentity [ 3.940797 -0.046469  1.002001]
     tparest_gmm3quantilefsolve  [ 10.  1.  2.]
     tparest_gmm3quantile        [ 6.376101 -0.029322  1.112403]
-    '''
+    """
 
-    #Example with Maximum Product of Spacings Estimation
-    #===================================================
+    # Example with Maximum Product of Spacings Estimation
+    # ===================================================
 
-    #Example: Lognormal Distribution
-    #-------------------------------
+    # Example: Lognormal Distribution
+    # -------------------------------
 
-    #tough problem for MLE according to JKB
-    #but not sure for which parameters
+    # tough problem for MLE according to JKB
+    # but not sure for which parameters
 
-    print('\n\nExample: Lognormal Distribution')
-    print( '-------------------------------')
+    print("\n\nExample: Lognormal Distribution")
+    print("-------------------------------")
 
     sh = np.exp(10)
     sh = 0.01
     print(sh)
-    x = stats.lognorm.rvs(sh,loc=100, scale=10,size=200)
+    x = stats.lognorm.rvs(sh, loc=100, scale=10, size=200)
     print(x.min())
-    print(stats.lognorm.fit(x, 1.,loc=x.min()-1,scale=1))
+    print(stats.lognorm.fit(x, 1.0, loc=x.min() - 1, scale=1))
 
     xsorted = np.sort(x)
-    x0 = [1., x.min()-1, 1]
+    x0 = [1.0, x.min() - 1, 1]
     args = (xsorted, stats.lognorm)
-    print(optimize.fmin(logmps,x0,args=args))
-
+    print(optimize.fmin(logmps, x0, args=args))
 
-    #Example: Lomax, Pareto, Generalized Pareto Distributions
-    #--------------------------------------------------------
+    # Example: Lomax, Pareto, Generalized Pareto Distributions
+    # --------------------------------------------------------
 
-    #partially a follow-up to the discussion about numpy.random.pareto
-    #Reference: JKB
-    #example Maximum Product of Spacings Estimation
+    # partially a follow-up to the discussion about numpy.random.pareto
+    # Reference: JKB
+    # example Maximum Product of Spacings Estimation
 
     # current results:
     # does not look very good yet sensitivity to starting values
     # Pareto and Generalized Pareto look like a tough estimation problem
 
-    print('\n\nExample: Lomax, Pareto, Generalized Pareto Distributions')
-    print( '--------------------------------------------------------')
+    print("\n\nExample: Lomax, Pareto, Generalized Pareto Distributions")
+    print("--------------------------------------------------------")
 
     p2rvs = stats.genpareto.rvs(2, size=500)
-    #Note: is Lomax without +1; and classical Pareto with +1
+    # Note: is Lomax without +1; and classical Pareto with +1
     p2rvssorted = np.sort(p2rvs)
     argsp = (p2rvssorted, stats.pareto)
-    x0p = [1., p2rvs.min()-5, 1]
-    print(optimize.fmin(logmps,x0p,args=argsp))
+    x0p = [1.0, p2rvs.min() - 5, 1]
+    print(optimize.fmin(logmps, x0p, args=argsp))
     print(stats.pareto.fit(p2rvs, 0.5, loc=-20, scale=0.5))
-
print('gpdparest_ mle', stats.genpareto.fit(p2rvs)) + print("gpdparest_ mle", stats.genpareto.fit(p2rvs)) parsgpd = fit_mps(stats.genpareto, p2rvs) - print('gpdparest_ mps', parsgpd) + print("gpdparest_ mps", parsgpd) argsgpd = (p2rvssorted, stats.genpareto) options = dict(stepFix=1e-7) - #hess_ndt(fun, pars, argsgdp, options) - #the results for the following look strange, maybe refactoring error + # hess_ndt(fun, pars, argsgdp, options) + # the results for the following look strange, maybe refactoring error he, h = hess_ndt(logmps, parsgpd, argsgpd, options) print(np.linalg.eigh(he)[0]) f = lambda params: logmps(params, *argsgpd) print(f(parsgpd)) - #add binned + # add binned fp2, bp2 = np.histogram(p2rvs, bins=50) - 'fitbinned t-distribution' + "fitbinned t-distribution" gpdparest_mlebinel = fitbinned(stats.genpareto, fp2, bp2, x0p) gpdparest_gmmbinelidentity = fitbinnedgmm(stats.genpareto, fp2, bp2, x0p) - print('gpdparest_mlebinel', gpdparest_mlebinel) - print('gpdparest_gmmbinelidentity', gpdparest_gmmbinelidentity) + print("gpdparest_mlebinel", gpdparest_mlebinel) + print("gpdparest_gmmbinelidentity", gpdparest_gmmbinelidentity) gpdparest_gmmquantile2 = fitquantilesgmm( - stats.genpareto, p2rvs, start=x0p, pquant=None, frozen=None) - print('gpdparest_gmmquantile2', gpdparest_gmmquantile2) - - print(fitquantilesgmm(stats.genpareto, p2rvs, start=x0p, - pquant=np.linspace(0.01,0.99,10), frozen=None)) + stats.genpareto, p2rvs, start=x0p, pquant=None, frozen=None + ) + print("gpdparest_gmmquantile2", gpdparest_gmmquantile2) + + print( + fitquantilesgmm( + stats.genpareto, + p2rvs, + start=x0p, + pquant=np.linspace(0.01, 0.99, 10), + frozen=None, + ) + ) fp2, bp2 = np.histogram( - p2rvs, - bins=stats.genpareto(2).ppf(np.linspace(0,0.99,10))) - print('fitbinnedgmm equal weight bins') + p2rvs, bins=stats.genpareto(2).ppf(np.linspace(0, 0.99, 10)) + ) + print("fitbinnedgmm equal weight bins") print(fitbinnedgmm(stats.genpareto, fp2, bp2, x0p)) diff --git a/statsmodels/sandbox/distributions/examples/ex_extras.py b/statsmodels/sandbox/distributions/examples/ex_extras.py index 9ab22c84783..6783fd1d0df 100644 --- a/statsmodels/sandbox/distributions/examples/ex_extras.py +++ b/statsmodels/sandbox/distributions/examples/ex_extras.py @@ -9,72 +9,99 @@ import numpy as np from scipy import stats -from statsmodels.sandbox.distributions.extras import (SkewNorm_gen, skewnorm, - ACSkewT_gen, - NormExpan_gen, pdf_moments, - ExpTransf_gen, LogTransf_gen) +from statsmodels.sandbox.distributions.extras import ( + ACSkewT_gen, + ExpTransf_gen, + LogTransf_gen, + NormExpan_gen, + SkewNorm_gen, + pdf_moments, + skewnorm, +) from statsmodels.stats.moment_helpers import mc2mvsk, mnc2mc, mvsk2mnc def example_n(): - print(skewnorm.pdf(1,0), stats.norm.pdf(1), skewnorm.pdf(1,0) - stats.norm.pdf(1)) - print(skewnorm.pdf(1,1000), stats.chi.pdf(1,1), skewnorm.pdf(1,1000) - stats.chi.pdf(1,1)) - print(skewnorm.pdf(-1,-1000), stats.chi.pdf(1,1), skewnorm.pdf(-1,-1000) - stats.chi.pdf(1,1)) - rvs = skewnorm.rvs(0,size=500) - print('sample mean var: ', rvs.mean(), rvs.var()) - print('theoretical mean var', skewnorm.stats(0)) - rvs = skewnorm.rvs(5,size=500) - print('sample mean var: ', rvs.mean(), rvs.var()) - print('theoretical mean var', skewnorm.stats(5)) - print(skewnorm.cdf(1,0), stats.norm.cdf(1), skewnorm.cdf(1,0) - stats.norm.cdf(1)) - print(skewnorm.cdf(1,1000), stats.chi.cdf(1,1), skewnorm.cdf(1,1000) - stats.chi.cdf(1,1)) - print(skewnorm.sf(0.05,1000), stats.chi.sf(0.05,1), skewnorm.sf(0.05,1000) - 
stats.chi.sf(0.05,1)) + print( + skewnorm.pdf(1, 0), + stats.norm.pdf(1), + skewnorm.pdf(1, 0) - stats.norm.pdf(1), + ) + print( + skewnorm.pdf(1, 1000), + stats.chi.pdf(1, 1), + skewnorm.pdf(1, 1000) - stats.chi.pdf(1, 1), + ) + print( + skewnorm.pdf(-1, -1000), + stats.chi.pdf(1, 1), + skewnorm.pdf(-1, -1000) - stats.chi.pdf(1, 1), + ) + rvs = skewnorm.rvs(0, size=500) + print("sample mean var: ", rvs.mean(), rvs.var()) + print("theoretical mean var", skewnorm.stats(0)) + rvs = skewnorm.rvs(5, size=500) + print("sample mean var: ", rvs.mean(), rvs.var()) + print("theoretical mean var", skewnorm.stats(5)) + print( + skewnorm.cdf(1, 0), + stats.norm.cdf(1), + skewnorm.cdf(1, 0) - stats.norm.cdf(1), + ) + print( + skewnorm.cdf(1, 1000), + stats.chi.cdf(1, 1), + skewnorm.cdf(1, 1000) - stats.chi.cdf(1, 1), + ) + print( + skewnorm.sf(0.05, 1000), + stats.chi.sf(0.05, 1), + skewnorm.sf(0.05, 1000) - stats.chi.sf(0.05, 1), + ) def example_T(): skewt = ACSkewT_gen() - rvs = skewt.rvs(10,0,size=500) - print('sample mean var: ', rvs.mean(), rvs.var()) - print('theoretical mean var', skewt.stats(10,0)) - print('t mean var', stats.t.stats(10)) - print(skewt.stats(10,1000)) # -> folded t distribution, as alpha -> inf - rvs = np.abs(stats.t.rvs(10,size=1000)) + rvs = skewt.rvs(10, 0, size=500) + print("sample mean var: ", rvs.mean(), rvs.var()) + print("theoretical mean var", skewt.stats(10, 0)) + print("t mean var", stats.t.stats(10)) + print(skewt.stats(10, 1000)) # -> folded t distribution, as alpha -> inf + rvs = np.abs(stats.t.rvs(10, size=1000)) print(rvs.mean(), rvs.var()) - def examples_normexpand(): skewnorm = SkewNorm_gen() - rvs = skewnorm.rvs(5,size=100) - normexpan = NormExpan_gen(rvs, mode='sample') + rvs = skewnorm.rvs(5, size=100) + normexpan = NormExpan_gen(rvs, mode="sample") smvsk = stats.describe(rvs)[2:] - print('sample: mu,sig,sk,kur') + print("sample: mu,sig,sk,kur") print(smvsk) - dmvsk = normexpan.stats(moments='mvsk') - print('normexpan: mu,sig,sk,kur') + dmvsk = normexpan.stats(moments="mvsk") + print("normexpan: mu,sig,sk,kur") print(dmvsk) - print('mvsk diff distribution - sample') + print("mvsk diff distribution - sample") print(np.array(dmvsk) - np.array(smvsk)) - print('normexpan attributes mvsk') + print("normexpan attributes mvsk") print(mc2mvsk(normexpan.cnt)) print(normexpan.mvsk) mnc = mvsk2mnc(dmvsk) mc = mnc2mc(mnc) - print('central moments') + print("central moments") print(mc) - print('non-central moments') + print("non-central moments") print(mnc) - pdffn = pdf_moments(mc) - print('\npdf approximation from moments') - print('pdf at', mc[0]-1,mc[0]+1) - print(pdffn([mc[0]-1,mc[0]+1])) - print(normexpan.pdf([mc[0]-1,mc[0]+1])) + print("\npdf approximation from moments") + print("pdf at", mc[0] - 1, mc[0] + 1) + print(pdffn([mc[0] - 1, mc[0] + 1])) + print(normexpan.pdf([mc[0] - 1, mc[0] + 1])) def examples_transf(): @@ -85,10 +112,12 @@ def examples_transf(): ##print(stats.lognorm.stats(1)) ##print(lognormal.rvs(size=10)) - print('Results for lognormal') - lognormalg = ExpTransf_gen(stats.norm, a=0, name = 'Log transformed normal general') + print("Results for lognormal") + lognormalg = ExpTransf_gen( + stats.norm, a=0, name="Log transformed normal general" + ) print(lognormalg.cdf(1)) - print(stats.lognorm.cdf(1,1)) + print(stats.lognorm.cdf(1, 1)) print(lognormalg.stats()) print(stats.lognorm.stats(1)) print(lognormalg.rvs(size=5)) @@ -98,29 +127,29 @@ def examples_transf(): ##print(loggammag._cdf(1,10)) ##print(stats.loggamma.cdf(1,10)) - print('Results for 
expgamma') + print("Results for expgamma") loggammaexpg = LogTransf_gen(stats.gamma) - print(loggammaexpg._cdf(1,10)) - print(stats.loggamma.cdf(1,10)) - print(loggammaexpg._cdf(2,15)) - print(stats.loggamma.cdf(2,15)) - + print(loggammaexpg._cdf(1, 10)) + print(stats.loggamma.cdf(1, 10)) + print(loggammaexpg._cdf(2, 15)) + print(stats.loggamma.cdf(2, 15)) # this requires change in scipy.stats.distribution - #print(loggammaexpg.cdf(1,10)) + # print(loggammaexpg.cdf(1,10)) - print('Results for loglaplace') + print("Results for loglaplace") loglaplaceg = LogTransf_gen(stats.laplace) print(loglaplaceg._cdf(2)) - print(stats.loglaplace.cdf(2,1)) + print(stats.loglaplace.cdf(2, 1)) loglaplaceexpg = ExpTransf_gen(stats.laplace) print(loglaplaceexpg._cdf(2)) - stats.loglaplace.cdf(3,3) - #0.98148148148148151 - loglaplaceexpg._cdf(3,0,1./3) - #0.98148148148148151 + stats.loglaplace.cdf(3, 3) + # 0.98148148148148151 + loglaplaceexpg._cdf(3, 0, 1.0 / 3) + # 0.98148148148148151 + -if __name__ == '__main__': +if __name__ == "__main__": example_n() example_T() examples_normexpand() diff --git a/statsmodels/sandbox/distributions/examples/ex_fitfr.py b/statsmodels/sandbox/distributions/examples/ex_fitfr.py index 5f00a125110..745301362b6 100644 --- a/statsmodels/sandbox/distributions/examples/ex_fitfr.py +++ b/statsmodels/sandbox/distributions/examples/ex_fitfr.py @@ -1,14 +1,15 @@ -'''Example for estimating distribution parameters when some are fixed. +"""Example for estimating distribution parameters when some are fixed. This uses currently a patched version of the distributions, two methods are added to the continuous distributions. This has no side effects. It also adds bounds to vonmises, which changes the behavior of it for some methods. -''' +""" import numpy as np from scipy import stats + # Note the following import attaches methods to scipy.stats.distributions # and adds bounds to stats.vonmises # from statsmodels.sandbox.distributions import sppatch @@ -17,12 +18,12 @@ np.random.seed(12345) x = stats.gamma.rvs(2.5, loc=0, scale=1.2, size=200) -#estimate all parameters +# estimate all parameters print(stats.gamma.fit(x)) print(stats.gamma.fit_fr(x, frozen=[np.nan, np.nan, np.nan])) -#estimate shape parameter only -print(stats.gamma.fit_fr(x, frozen=[np.nan, 0., 1.2])) +# estimate shape parameter only +print(stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, 1.2])) np.random.seed(12345) x = stats.lognorm.rvs(2, loc=0, scale=2, size=200) -print(stats.lognorm.fit_fr(x, frozen=[np.nan, 0., np.nan])) +print(stats.lognorm.fit_fr(x, frozen=[np.nan, 0.0, np.nan])) diff --git a/statsmodels/sandbox/distributions/examples/ex_gof.py b/statsmodels/sandbox/distributions/examples/ex_gof.py index 13345fb7a69..6a15943e094 100644 --- a/statsmodels/sandbox/distributions/examples/ex_gof.py +++ b/statsmodels/sandbox/distributions/examples/ex_gof.py @@ -1,11 +1,17 @@ from scipy import stats + from statsmodels.stats import gof -poissrvs = stats.poisson.rvs(0.6, size = 200) +poissrvs = stats.poisson.rvs(0.6, size=200) -freq, expfreq, histsupp = gof.gof_binning_discrete(poissrvs, stats.poisson, (0.6,), nsupp=20) +freq, expfreq, histsupp = gof.gof_binning_discrete( + poissrvs, stats.poisson, (0.6,), nsupp=20 +) (chi2val, pval) = stats.chisquare(freq, expfreq) print(chi2val, pval) -print(gof.gof_chisquare_discrete(stats.poisson, (0.6,), poissrvs, 0.05, - 'Poisson')) +print( + gof.gof_chisquare_discrete( + stats.poisson, (0.6,), poissrvs, 0.05, "Poisson" + ) +) diff --git 
a/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py b/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py index 55801491e04..cf0710e4182 100644 --- a/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py +++ b/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py @@ -10,50 +10,44 @@ for comparison I used R mvtnorm version 0.9-96 """ +import matplotlib.pyplot as plt import numpy as np from numpy.testing import assert_array_almost_equal -import matplotlib.pyplot as plt import statsmodels.api as sm import statsmodels.distributions.mixture_rvs as mix import statsmodels.sandbox.distributions.mv_normal as mvd - -cov3 = np.array([[ 1. , 0.5 , 0.75], - [ 0.5 , 1.5 , 0.6 ], - [ 0.75, 0.6 , 2. ]]) +cov3 = np.array([[1.0, 0.5, 0.75], [0.5, 1.5, 0.6], [0.75, 0.6, 2.0]]) mu = np.array([-1, 0.0, 2.0]) -#************** multivariate normal distribution *************** +# ************** multivariate normal distribution *************** mvn3 = mvd.MVNormal(mu, cov3) -#compare with random sample +# compare with random sample x = mvn3.rvs(size=1000000) -xli = [[2., 1., 1.5], - [0., 2., 1.5], - [1.5, 1., 2.5], - [0., 1., 1.5]] +xli = [[2.0, 1.0, 1.5], [0.0, 2.0, 1.5], [1.5, 1.0, 2.5], [0.0, 1.0, 1.5]] -xliarr = np.asarray(xli).T[None,:, :] +xliarr = np.asarray(xli).T[None, :, :] -#from R session -#pmvnorm(lower=-Inf,upper=(x[0,.]-mu)/sqrt(diag(cov3)),mean=rep(0,3),corr3) +# from R session +# pmvnorm(lower=-Inf,upper=(x[0,.]-mu)/sqrt(diag(cov3)),mean=rep(0,3),corr3) r_cdf = [0.3222292, 0.3414643, 0.5450594, 0.3116296] r_cdf_errors = [1.715116e-05, 1.590284e-05, 5.356471e-05, 3.567548e-05] n_cdf = [mvn3.cdf(a) for a in xli] assert_array_almost_equal(r_cdf, n_cdf, decimal=4) print(n_cdf) -print('') -print((x>> np.random.seed(464239857) >>> rvstsq = squaretg.rvs(10,size=100000) >>> squaretg.moment(4,10) @@ -271,21 +388,22 @@ def test_squared_normal_chi2(): (array(1.2500000000000022), array(4.6874999999630909), array(5.7735026919777912), array(106.00000000170148)) >>> stats.describe(rvstsq) (100000, (3.2953470738423724e-009, 92.649615690914473), 1.2534924690963247, 4.7741427958594098, 6.1562177957041895, 100.99331166052181) - ''' + """ # checking the distribution # fraction of observations in each decile - dec = squaretg.ppf(np.linspace(0.,1,11),10) - freq,edges = np.histogram(rvstsq, bins=dec) - print(freq/float(len(rvstsq))) + dec = squaretg.ppf(np.linspace(0.0, 1, 11), 10) + freq, edges = np.histogram(rvstsq, bins=dec) + print(freq / float(len(rvstsq))) import matplotlib.pyplot as plt - freq,edges,_ = plt.hist(rvstsq, bins=50, range=(0,4),normed=True) - edges += (edges[1]-edges[0])/2.0 - plt.plot(edges[:-1], squaretg.pdf(edges[:-1], 10), 'r') - #plt.show() - #plt.close() - ''' + freq, edges, _ = plt.hist(rvstsq, bins=50, range=(0, 4), normed=True) + edges += (edges[1] - edges[0]) / 2.0 + plt.plot(edges[:-1], squaretg.pdf(edges[:-1], 10), "r") + # plt.show() + # plt.close() + + """ >>> plt.plot(edges[:-1], squaretg.pdf(edges[:-1], 10), 'r') [] >>> plt.fill(edges[4:8], squaretg.pdf(edges[4:8], 10), 'r') @@ -309,7 +427,8 @@ def test_squared_normal_chi2(): AttributeError: 'AxesSubplot' object has no attribute 'fill_between' >>> ax1.fill(edges[4:8], squaretg.pdf(edges[4:8], 10), 0, 'r') Traceback (most recent call last): - ''' + """ import pytest - pytest.main([__file__, '-vvs', '-x', '--pdb']) + + pytest.main([__file__, "-vvs", "-x", "--pdb"]) diff --git a/statsmodels/sandbox/distributions/examples/matchdist.py b/statsmodels/sandbox/distributions/examples/matchdist.py index 
a2d4a7bf8f5..c97e541cdd5 100644 --- a/statsmodels/sandbox/distributions/examples/matchdist.py +++ b/statsmodels/sandbox/distributions/examples/matchdist.py @@ -1,4 +1,4 @@ -'''given a 1D sample of observation, find a matching distribution +"""given a 1D sample of observation, find a matching distribution * estimate maximum likelihood parameter for each distribution * rank estimated distribution by Kolmogorov-Smirnov and Anderson-Darling @@ -14,69 +14,153 @@ * split estimation by support, add option and choose automatically * -''' -from scipy import stats -import numpy as np +""" import matplotlib.pyplot as plt +import numpy as np +from scipy import stats + +# stats.distributions.beta_gen._fitstart = lambda self, data : (5,5,0,1) -#stats.distributions.beta_gen._fitstart = lambda self, data : (5,5,0,1) -def plothist(x,distfn, args, loc, scale, right=1): +def plothist(x, distfn, args, loc, scale, right=1): plt.figure() # the histogram of the data - n, bins, patches = plt.hist(x, 25, normed=1, facecolor='green', alpha=0.75) + n, bins, patches = plt.hist(x, 25, normed=1, facecolor="green", alpha=0.75) maxheight = max([p.get_height() for p in patches]) print(maxheight) axlim = list(plt.axis()) - #print(axlim) - axlim[-1] = maxheight*1.05 - #plt.axis(tuple(axlim)) -## print(bins) -## print('args in plothist', args) + # print(axlim) + axlim[-1] = maxheight * 1.05 + # plt.axis(tuple(axlim)) + ## print(bins) + ## print('args in plothist', args) # add a 'best fit' line - #yt = stats.norm.pdf( bins, loc=loc, scale=scale) - yt = distfn.pdf( bins, loc=loc, scale=scale, *args) - yt[yt>maxheight]=maxheight - lt = plt.plot(bins, yt, 'r--', linewidth=1) - ys = stats.t.pdf( bins, 10,scale=10,)*right - ls = plt.plot(bins, ys, 'b-', linewidth=1) - - plt.xlabel('Smarts') - plt.ylabel('Probability') - plt.title(r'$\mathrm{Testing: %s :}\ \mu=%f,\ \sigma=%f$' % (distfn.name,loc,scale)) - - #plt.axis([bins[0], bins[-1], 0, 0.134+0.05]) + # yt = stats.norm.pdf( bins, loc=loc, scale=scale) + yt = distfn.pdf(bins, loc=loc, scale=scale, *args) + yt[yt > maxheight] = maxheight + lt = plt.plot(bins, yt, "r--", linewidth=1) + ys = ( + stats.t.pdf( + bins, + 10, + scale=10, + ) + * right + ) + ls = plt.plot(bins, ys, "b-", linewidth=1) + + plt.xlabel("Smarts") + plt.ylabel("Probability") + plt.title( + r"$\mathrm{Testing: %s :}\ \mu=%f,\ \sigma=%f$" + % (distfn.name, loc, scale) + ) + + # plt.axis([bins[0], bins[-1], 0, 0.134+0.05]) plt.grid(True) plt.draw() - #plt.show() - #plt.close() - - - - - -#targetdist = ['norm','t','truncnorm','johnsonsu','johnsonsb', -targetdist = ['norm','alpha', 'anglit', 'arcsine', - 'beta', 'betaprime', 'bradford', 'burr', 'fisk', 'cauchy', - 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', - 'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', - 'f', 'foldnorm', 'frechet_r', 'weibull_min', 'frechet_l', - 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', - 'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'gumbel_r', - 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', - 'gausshyper', 'invgamma', 'invnorm', 'invweibull', 'johnsonsb', - 'johnsonsu', 'laplace', 'levy', 'levy_l', - 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'gilbrat', - 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 't', - 'nct', 'pareto', 'lomax', 'powerlaw', 'powerlognorm', 'powernorm', - 'rdist', 'rayleigh', 'reciprocal', 'rice', 'recipinvgauss', - 'semicircular', 'triang', 'truncexpon', 'truncnorm', - 'tukeylambda', 'uniform', 'vonmises', 'wald', 'wrapcauchy', - - 
'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom', 'logser', - 'poisson', 'planck', 'boltzmann', 'randint', 'zipf', 'dlaplace'] + # plt.show() + # plt.close() + + +# targetdist = ['norm','t','truncnorm','johnsonsu','johnsonsb', +targetdist = [ + "norm", + "alpha", + "anglit", + "arcsine", + "beta", + "betaprime", + "bradford", + "burr", + "fisk", + "cauchy", + "chi", + "chi2", + "cosine", + "dgamma", + "dweibull", + "erlang", + "expon", + "exponweib", + "exponpow", + "fatiguelife", + "foldcauchy", + "f", + "foldnorm", + "frechet_r", + "weibull_min", + "frechet_l", + "weibull_max", + "genlogistic", + "genpareto", + "genexpon", + "genextreme", + "gamma", + "gengamma", + "genhalflogistic", + "gompertz", + "gumbel_r", + "gumbel_l", + "halfcauchy", + "halflogistic", + "halfnorm", + "hypsecant", + "gausshyper", + "invgamma", + "invnorm", + "invweibull", + "johnsonsb", + "johnsonsu", + "laplace", + "levy", + "levy_l", + "logistic", + "loggamma", + "loglaplace", + "lognorm", + "gilbrat", + "maxwell", + "mielke", + "nakagami", + "ncx2", + "ncf", + "t", + "nct", + "pareto", + "lomax", + "powerlaw", + "powerlognorm", + "powernorm", + "rdist", + "rayleigh", + "reciprocal", + "rice", + "recipinvgauss", + "semicircular", + "triang", + "truncexpon", + "truncnorm", + "tukeylambda", + "uniform", + "vonmises", + "wald", + "wrapcauchy", + "binom", + "bernoulli", + "nbinom", + "geom", + "hypergeom", + "logser", + "poisson", + "planck", + "boltzmann", + "randint", + "zipf", + "dlaplace", +] left = [] right = [] @@ -86,89 +170,180 @@ def plothist(x,distfn, args, loc, scale, right=1): contdist = [] discrete = [] -categ = {('open','open'):'unbound', ('0','open'):'right',('open','0',):'left', - ('finite','finite'):'finite',('oth','oth'):'other'} -categ = {('open','open'):unbound, ('0','open'):right,('open','0',):left, - ('finite','finite'):finite,('oth','oth'):other} +categ = { + ("open", "open"): "unbound", + ("0", "open"): "right", + ( + "open", + "0", + ): "left", + ("finite", "finite"): "finite", + ("oth", "oth"): "other", +} +categ = { + ("open", "open"): unbound, + ("0", "open"): right, + ( + "open", + "0", + ): left, + ("finite", "finite"): finite, + ("oth", "oth"): other, +} categ2 = { - ('open', '0') : ['frechet_l', 'weibull_max', 'levy_l'], - ('finite', 'finite') : ['anglit', 'cosine', 'rdist', 'semicircular'], - ('0', 'open') : ['alpha', 'burr', 'fisk', 'chi', 'chi2', 'erlang', - 'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', 'f', - 'foldnorm', 'frechet_r', 'weibull_min', 'genpareto', 'genexpon', - 'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'halfcauchy', - 'halflogistic', 'halfnorm', 'invgamma', 'invnorm', 'invweibull', - 'levy', 'loglaplace', 'lognorm', 'gilbrat', 'maxwell', 'mielke', - 'nakagami', 'ncx2', 'ncf', 'lomax', 'powerlognorm', 'rayleigh', - 'rice', 'recipinvgauss', 'truncexpon', 'wald'], - ('open', 'open') : ['cauchy', 'dgamma', 'dweibull', 'genlogistic', 'genextreme', - 'gumbel_r', 'gumbel_l', 'hypsecant', 'johnsonsu', 'laplace', - 'logistic', 'loggamma', 't', 'nct', 'powernorm', 'reciprocal', - 'truncnorm', 'tukeylambda', 'vonmises'], - ('0', 'finite') : ['arcsine', 'beta', 'betaprime', 'bradford', 'gausshyper', - 'johnsonsb', 'powerlaw', 'triang', 'uniform', 'wrapcauchy'], - ('finite', 'open') : ['pareto'] - } - -#Note: weibull_max == frechet_l - -right_incorrect = ['genextreme'] - -right_all = categ2[('0', 'open')] + categ2[('0', 'finite')] + categ2[('finite', 'open')]\ - + right_incorrect + ("open", "0"): ["frechet_l", "weibull_max", "levy_l"], + ("finite", 
"finite"): ["anglit", "cosine", "rdist", "semicircular"], + ("0", "open"): [ + "alpha", + "burr", + "fisk", + "chi", + "chi2", + "erlang", + "expon", + "exponweib", + "exponpow", + "fatiguelife", + "foldcauchy", + "f", + "foldnorm", + "frechet_r", + "weibull_min", + "genpareto", + "genexpon", + "gamma", + "gengamma", + "genhalflogistic", + "gompertz", + "halfcauchy", + "halflogistic", + "halfnorm", + "invgamma", + "invnorm", + "invweibull", + "levy", + "loglaplace", + "lognorm", + "gilbrat", + "maxwell", + "mielke", + "nakagami", + "ncx2", + "ncf", + "lomax", + "powerlognorm", + "rayleigh", + "rice", + "recipinvgauss", + "truncexpon", + "wald", + ], + ("open", "open"): [ + "cauchy", + "dgamma", + "dweibull", + "genlogistic", + "genextreme", + "gumbel_r", + "gumbel_l", + "hypsecant", + "johnsonsu", + "laplace", + "logistic", + "loggamma", + "t", + "nct", + "powernorm", + "reciprocal", + "truncnorm", + "tukeylambda", + "vonmises", + ], + ("0", "finite"): [ + "arcsine", + "beta", + "betaprime", + "bradford", + "gausshyper", + "johnsonsb", + "powerlaw", + "triang", + "uniform", + "wrapcauchy", + ], + ("finite", "open"): ["pareto"], +} + +# Note: weibull_max == frechet_l + +right_incorrect = ["genextreme"] + +right_all = ( + categ2[("0", "open")] + + categ2[("0", "finite")] + + categ2[("finite", "open")] + + right_incorrect +) for distname in targetdist: - distfn = getattr(stats,distname) - if hasattr(distfn,'_pdf'): + distfn = getattr(stats, distname) + if hasattr(distfn, "_pdf"): if np.isinf(distfn.a): - low = 'open' + low = "open" elif distfn.a == 0: - low = '0' + low = "0" else: - low = 'finite' + low = "finite" if np.isinf(distfn.b): - high = 'open' + high = "open" elif distfn.b == 0: - high = '0' + high = "0" else: - high = 'finite' + high = "finite" contdist.append(distname) - categ.setdefault((low,high),[]).append(distname) + categ.setdefault((low, high), []).append(distname) -not_good = ['genextreme', 'reciprocal', 'vonmises'] +not_good = ["genextreme", "reciprocal", "vonmises"] # 'genextreme' is right (or left?), 'reciprocal' requires 00] + rvs_pos = rvs_orig[rvs_orig > 0] rightfactor = 1 rvs_right = rvs_pos - print('='*50) - print('samplesize = ', n) + print("=" * 50) + print("samplesize = ", n) for distname in targetdist: - distfn = getattr(stats,distname) + distfn = getattr(stats, distname) if distname in right_all: rvs = rvs_right rind = rightfactor @@ -176,85 +351,112 @@ def plothist(x,distfn, args, loc, scale, right=1): else: rvs = rvs_orig rind = 1 - print('-'*30) - print('target = %s' % distname) + print("-" * 30) + print("target = %s" % distname) sm = rvs.mean() sstd = np.sqrt(rvs.var()) ssupp = (rvs.min(), rvs.max()) - if distname in ['truncnorm','betaprime','reciprocal']: - - par0 = (sm-2*sstd,sm+2*sstd) - par_est = tuple(distfn.fit(rvs,loc=sm,scale=sstd,*par0)) - elif distname == 'norm': - par_est = tuple(distfn.fit(rvs,loc=sm,scale=sstd)) - elif distname == 'genextreme': - par_est = tuple(distfn.fit(rvs,-5,loc=sm,scale=sstd)) - elif distname == 'wrapcauchy': - par_est = tuple(distfn.fit(rvs,0.5,loc=0,scale=sstd)) - elif distname == 'f': - par_est = tuple(distfn.fit(rvs,10,15,loc=0,scale=1)) + if distname in ["truncnorm", "betaprime", "reciprocal"]: + + par0 = (sm - 2 * sstd, sm + 2 * sstd) + par_est = tuple(distfn.fit(rvs, loc=sm, scale=sstd, *par0)) + elif distname == "norm": + par_est = tuple(distfn.fit(rvs, loc=sm, scale=sstd)) + elif distname == "genextreme": + par_est = tuple(distfn.fit(rvs, -5, loc=sm, scale=sstd)) + elif distname == "wrapcauchy": + par_est = 
tuple(distfn.fit(rvs, 0.5, loc=0, scale=sstd)) + elif distname == "f": + par_est = tuple(distfn.fit(rvs, 10, 15, loc=0, scale=1)) elif distname in right: sm = rvs.mean() sstd = np.sqrt(rvs.var()) - par_est = tuple(distfn.fit(rvs,loc=0,scale=1)) + par_est = tuple(distfn.fit(rvs, loc=0, scale=1)) else: sm = rvs.mean() sstd = np.sqrt(rvs.var()) - par_est = tuple(distfn.fit(rvs,loc=sm,scale=sstd)) + par_est = tuple(distfn.fit(rvs, loc=sm, scale=sstd)) - - print('fit', par_est) + print("fit", par_est) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] - rvs_normed = (rvs-loc_est)/scale_est - ks_stat, ks_pval = stats.kstest(rvs_normed,distname, arg_est) - print('kstest', ks_stat, ks_pval) + rvs_normed = (rvs - loc_est) / scale_est + ks_stat, ks_pval = stats.kstest(rvs_normed, distname, arg_est) + print("kstest", ks_stat, ks_pval) quant = 0.1 - crit = distfn.ppf(1-quant*float(rind), loc=loc_est, scale=scale_est,*par_est) - tail_prob = stats.t.sf(crit,dgp_arg,scale=dgp_scale) - print('crit, prob', quant, crit, tail_prob) - #if distname == 'norm': - #plothist(rvs,loc_est,scale_est) - #args = tuple() - results.append([distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ]) - #plothist(rvs,distfn,arg_est,loc_est,scale_est) - - #plothist(rvs,distfn,arg_est,loc_est,scale_est) - #plt.show() - #plt.close() - #TODO: collect results and compare tail quantiles - + crit = distfn.ppf( + 1 - quant * float(rind), loc=loc_est, scale=scale_est, *par_est + ) + tail_prob = stats.t.sf(crit, dgp_arg, scale=dgp_scale) + print("crit, prob", quant, crit, tail_prob) + # if distname == 'norm': + # plothist(rvs,loc_est,scale_est) + # args = tuple() + results.append( + [ + distname, + ks_stat, + ks_pval, + arg_est, + loc_est, + scale_est, + crit, + tail_prob, + ] + ) + # plothist(rvs,distfn,arg_est,loc_est,scale_est) + + # plothist(rvs,distfn,arg_est,loc_est,scale_est) + # plt.show() + # plt.close() + # TODO: collect results and compare tail quantiles from operator import itemgetter - res_sort = sorted(results, key = itemgetter(2)) + res_sort = sorted(results, key=itemgetter(2)) - res_sort.reverse() #kstest statistic: smaller is better, pval larger is better + res_sort.reverse() # kstest statistic: smaller is better, pval larger is better - print('number of distributions', len(res_sort)) - imagedir = 'matchresults' + print("number of distributions", len(res_sort)) + imagedir = "matchresults" import os + if not os.path.exists(imagedir): os.makedirs(imagedir) - for ii,di in enumerate(res_sort): - distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob = di[:] - distfn = getattr(stats,distname) + for ii, di in enumerate(res_sort): + ( + distname, + ks_stat, + ks_pval, + arg_est, + loc_est, + scale_est, + crit, + tail_prob, + ) = di[:] + distfn = getattr(stats, distname) if distname in right_all: rvs = rvs_right rind = rightfactor - ri = 'r' + ri = "r" else: rvs = rvs_orig - ri = '' + ri = "" rind = 1 - print('%s ks-stat = %f, ks-pval = %f tail_prob = %f)' % \ - (distname, ks_stat, ks_pval, tail_prob)) - ## print('arg_est = %s, loc_est = %f scale_est = %f)' % \ - ## (repr(arg_est),loc_est,scale_est)) - plothist(rvs,distfn,arg_est,loc_est,scale_est,right = rind) - plt.savefig(os.path.join(imagedir,'%s%s%02d_%s.png'% (prefix, ri,ii, distname))) + print( + "%s ks-stat = %f, ks-pval = %f tail_prob = %f)" + % (distname, ks_stat, ks_pval, tail_prob) + ) + ## print('arg_est = %s, loc_est = %f scale_est = %f)' % \ + ## (repr(arg_est),loc_est,scale_est)) + plothist(rvs, distfn, arg_est, 
loc_est, scale_est, right=rind) + plt.savefig( + os.path.join( + imagedir, "%s%s%02d_%s.png" % (prefix, ri, ii, distname) + ) + ) ##plt.show() ##plt.close() diff --git a/statsmodels/sandbox/distributions/extras.py b/statsmodels/sandbox/distributions/extras.py index da6e12b80b8..6241e16c004 100644 --- a/statsmodels/sandbox/distributions/extras.py +++ b/statsmodels/sandbox/distributions/extras.py @@ -1,4 +1,5 @@ -"""Various extensions to distributions +""" +Various extensions to distributions * skew normal and skew t distribution by Azzalini, A. & Capitanio, A. * Gram-Charlier expansion distribution (using 4 moments), @@ -30,7 +31,7 @@ >>> logtg = Transf_gen(stats.t, np.exp, np.log, numargs = 1, a=0, name = 'lnnorm', longname = 'Exp transformed normal', - extradoc = '\ndistribution of y = exp(x), with x standard normal' + # extradoc = '\ndistribution of y = exp(x), with x standard normal' 'precision for moment andstats is not very high, 2-3 decimals') >>> logtg.cdf(5, 6) 0.92067704211191848 @@ -51,13 +52,12 @@ """ import numpy as np -from numpy import poly1d, sqrt, exp - +from numpy import exp, poly1d, sqrt import scipy -from scipy import stats, special +from scipy import special, stats from scipy.stats import distributions -from statsmodels.stats.moment_helpers import mvsk2mc, mc2mvsk +from statsmodels.stats.moment_helpers import mc2mvsk, mvsk2mc try: from scipy.stats._mvn import mvndst @@ -71,7 +71,8 @@ class SkewNorm_gen(distributions.rv_continuous): - """univariate Skew-Normal distribution of Azzalini + """ + Univariate Skew-Normal distribution of Azzalini class follows scipy.stats.distributions pattern but with __init__ @@ -85,7 +86,7 @@ def __init__(self): self, name="Skew Normal distribution", shapes="alpha", - extradoc=""" """, + # extradoc=""" """, ) def _argcheck(self, alpha): @@ -124,10 +125,14 @@ def _stats_skip(self, x, alpha, moments="mvsk"): # generated the same way as distributions in stats.distributions class SkewNorm2_gen(distributions.rv_continuous): - """univariate Skew-Normal distribution of Azzalini + """ + Univariate Skew-Normal distribution of Azzalini class follows scipy.stats.distributions pattern + Notes + ----- + -inf < alpha < inf """ def _argcheck(self, alpha): @@ -146,15 +151,34 @@ def _pdf(self, x, alpha): skewnorm2 = SkewNorm2_gen( name="Skew Normal distribution", shapes="alpha", - extradoc=""" -inf < alpha < inf""", + # extradoc=""" -inf < alpha < inf""", ) class ACSkewT_gen(distributions.rv_continuous): - """univariate Skew-T distribution of Azzalini + """ + Univariate Skew-T distribution of Azzalini class follows scipy.stats.distributions pattern but with __init__ + + Notes + ----- + Skewed T distribution by Azzalini, A. & Capitanio, A. (2003)_ + + the pdf is given by: + + pdf(x) = 2.0 * t.pdf(x, df) * t.cdf(df+1, alpha*x*np.sqrt((1+df)/(x**2+df))) + + with alpha >=0 + + Note: different from skewed t distribution by Hansen 1999 + + .._ + Azzalini, A. & Capitanio, A. (2003), Distributions generated by + perturbation of symmetry with emphasis on a multivariate skew-t + distribution, appears in J.Roy.Statist.Soc, series B, vol.65, + pp.367-389 """ def __init__(self): @@ -163,21 +187,18 @@ def __init__(self): self, name="Skew T distribution", shapes="df, alpha", - extradoc=""" -Skewed T distribution by Azzalini, A. & Capitanio, A. (2003)_ - -the pdf is given by: - pdf(x) = 2.0 * t.pdf(x, df) * t.cdf(df+1, alpha*x*np.sqrt((1+df)/(x**2+df))) - -with alpha >=0 - -Note: different from skewed t distribution by Hansen 1999 -.._ -Azzalini, A. & Capitanio, A. 
(2003), Distributions generated by perturbation of -symmetry with emphasis on a multivariate skew-t distribution, -appears in J.Roy.Statist.Soc, series B, vol.65, pp.367-389 - -""", + # extradoc=""" + # Skewed T distribution by Azzalini, A. & Capitanio, A. (2003)_ + # + # the pdf is given by: + # pdf(x) = 2.0 * t.pdf(x, df) * t.cdf(df+1, alpha*x*np.sqrt((1+df)/(x**2+df))) + # with alpha >=0 + # Note: different from skewed t distribution by Hansen 1999 + # .._ + # Azzalini, A. & Capitanio, A. (2003), Distributions generated by perturbation of + # symmetry with emphasis on a multivariate skew-t distribution, + # appears in J.Roy.Statist.Soc, series B, vol.65, pp.367-389 + # """, ) def _argcheck(self, df, alpha): @@ -443,6 +464,23 @@ class NormExpan_gen(distributions.rv_continuous): class follows scipy.stats.distributions pattern but with __init__ + Notes + ----- + The distribution is defined as the Gram-Charlier expansion of + the normal distribution using the first four moments. The pdf + is given by + + pdf(x) = (1+ skew/6.0 * H(xc,3) + kurt/24.0 * H(xc,4))*normpdf(xc) + + where xc = (x-mu)/sig is the standardized value of the random variable + and H(xc,3) and H(xc,4) are Hermite polynomials + + Note: This distribution has to be parametrized during + initialization and instantiation, and does not have a shape + parameter after instantiation (similar to frozen distribution + except for location and scale.) Location and scale can be used + as with other distributions, however note, that they are relative + to the initialized distribution. """ def __init__(self, args, **kwds): @@ -451,23 +489,20 @@ def __init__(self, args, **kwds): self, name="Normal Expansion distribution", shapes=" ", - extradoc=""" - The distribution is defined as the Gram-Charlier expansion of - the normal distribution using the first four moments. The pdf - is given by - - pdf(x) = (1+ skew/6.0 * H(xc,3) + kurt/24.0 * H(xc,4))*normpdf(xc) - - where xc = (x-mu)/sig is the standardized value of the random variable - and H(xc,3) and H(xc,4) are Hermite polynomials - - Note: This distribution has to be parametrized during - initialization and instantiation, and does not have a shape - parameter after instantiation (similar to frozen distribution - except for location and scale.) Location and scale can be used - as with other distributions, however note, that they are relative - to the initialized distribution. - """, + # extradoc=""" + # The distribution is defined as the Gram-Charlier expansion of + # the normal distribution using the first four moments. The pdf + # is given by + # pdf(x) = (1+ skew/6.0 * H(xc,3) + kurt/24.0 * H(xc,4))*normpdf(xc) + # where xc = (x-mu)/sig is the standardized value of the random variable + # and H(xc,3) and H(xc,4) are Hermite polynomials + # Note: This distribution has to be parametrized during + # initialization and instantiation, and does not have a shape + # parameter after instantiation (similar to frozen distribution + # except for location and scale.) Location and scale can be used + # as with other distributions, however note, that they are relative + # to the initialized distribution. 
+ # """, ) # print args, kwds mode = kwds.get("mode", "sample") @@ -568,7 +603,6 @@ def __init__(self, kls, func, funcinv, *args, **kwargs): longname = kwargs.pop( "longname", "Non-linear transformed distribution" ) - extradoc = kwargs.pop("extradoc", None) a = kwargs.pop("a", -np.inf) b = kwargs.pop("b", np.inf) self.decr = kwargs.pop("decr", False) @@ -580,7 +614,7 @@ def __init__(self, kls, func, funcinv, *args, **kwargs): # possible to freeze the underlying distribution super(Transf_gen, self).__init__( - a=a, b=b, name=name, longname=longname, extradoc=extradoc + a=a, b=b, name=name, longname=longname ) def _rvs(self, *args, **kwargs): @@ -630,7 +664,7 @@ def identit(x): numargs=0, name="discf", longname="normal-based discount factor", - extradoc="\ndistribution of discount factor y=1/(1+x)) with x N(0.05,0.1**2)", + # extradoc="\ndistribution of discount factor y=1/(1+x)) with x N(0.05,0.1**2)", ) lognormalg = Transf_gen( @@ -641,8 +675,8 @@ def identit(x): a=0, name="lnnorm", longname="Exp transformed normal", - extradoc="\ndistribution of y = exp(x), with x standard normal" - "precision for moment andstats is not very high, 2-3 decimals", + # extradoc="\ndistribution of y = exp(x), with x standard normal" + # "precision for moment andstats is not very high, 2-3 decimals", ) @@ -809,7 +843,6 @@ def __init__( longname = kwargs.pop( "longname", "Non-linear transformed distribution" ) - extradoc = kwargs.pop("extradoc", None) a = kwargs.pop("a", -np.inf) # attached to self in super b = kwargs.pop("b", np.inf) # self.a, self.b would be overwritten self.shape = kwargs.pop("shape", False) @@ -826,7 +859,6 @@ def __init__( name=name, shapes=kls.shapes, longname=longname, - extradoc=extradoc, ) # add enough info for self.freeze() to be able to reconstruct the instance @@ -938,8 +970,8 @@ def squarefunc(self, x): numargs=0, name="squarenorm", longname="squared normal distribution", - extradoc="\ndistribution of the square of a normal random variable" - + " y=x**2 with x N(0.0,1)", + # extradoc="\ndistribution of the square of a normal random variable" + # + " y=x**2 with x N(0.0,1)", ) # u_loc=l, u_scale=s) squaretg = TransfTwo_gen( @@ -955,8 +987,8 @@ def squarefunc(self, x): numargs=1, name="squarenorm", longname="squared t distribution", - extradoc="\ndistribution of the square of a t random variable" - + " y=x**2 with x t(dof,0.0,1)", + # extradoc="\ndistribution of the square of a t random variable" + # + " y=x**2 with x t(dof,0.0,1)", ) @@ -993,8 +1025,8 @@ def negsquarefunc(x): numargs=0, name="negsquarenorm", longname="negative squared normal distribution", - extradoc="\ndistribution of the negative square of a normal random variable" - + " y=-x**2 with x N(0.0,1)", + # extradoc="\ndistribution of the negative square of a normal random variable" + # + " y=-x**2 with x N(0.0,1)", ) # u_loc=l, u_scale=s) @@ -1032,8 +1064,8 @@ def absfunc(x): numargs=0, name="absnorm", longname="absolute of normal distribution", - extradoc="\ndistribution of the absolute value of a normal random variable" - + " y=abs(x) with x N(0,1)", + # extradoc="\ndistribution of the absolute value of a normal random variable" + # + " y=abs(x) with x N(0,1)", ) @@ -1244,7 +1276,8 @@ def mvstdnormcdf(lower, upper, corrcoef, **kwds): def mvnormcdf(upper, mu, cov, lower=None, **kwds): - """multivariate normal cumulative distribution function + """ + Multivariate normal cumulative distribution function This is a wrapper for scipy.stats._mvn.mvndst which calculates a rectangular integral over a multivariate normal 
distribution. diff --git a/statsmodels/sandbox/distributions/genpareto.py b/statsmodels/sandbox/distributions/genpareto.py index fa0c9cdb445..67f93017c46 100644 --- a/statsmodels/sandbox/distributions/genpareto.py +++ b/statsmodels/sandbox/distributions/genpareto.py @@ -6,114 +6,137 @@ Author: josef-pktd """ +import matplotlib.pyplot as plt import numpy as np +from numpy import abs as np_abs, inf, where from scipy import stats from scipy.special import comb from scipy.stats.distributions import rv_continuous -import matplotlib.pyplot as plt -from numpy import where, inf -from numpy import abs as np_abs ## Generalized Pareto with reversed sign of c as in literature class genpareto2_gen(rv_continuous): def _argcheck(self, c): c = np.asarray(c) - self.b = where(c > 0, 1.0/np_abs(c), inf) - return where(c==0, 0, 1) + self.b = where(c > 0, 1.0 / np_abs(c), inf) + return where(c == 0, 0, 1) + def _pdf(self, x, c): - Px = np.power(1-c*x,-1.0+1.0/c) + Px = np.power(1 - c * x, -1.0 + 1.0 / c) return Px + def _logpdf(self, x, c): - return (-1.0+1.0/c) * np.log1p(-c*x) + return (-1.0 + 1.0 / c) * np.log1p(-c * x) + def _cdf(self, x, c): - return 1.0 - np.power(1-c*x,1.0/c) + return 1.0 - np.power(1 - c * x, 1.0 / c) + def _ppf(self, q, c): - vals = -1.0/c * (np.power(1-q, c)-1) + vals = -1.0 / c * (np.power(1 - q, c) - 1) return vals + def _munp(self, n, c): - k = np.arange(0,n+1) - val = (1.0/c)**n * np.sum(comb(n,k)*(-1)**k / (1.0+c*k),axis=0) - return where(c*n > -1, val, inf) + k = np.arange(0, n + 1) + val = (1.0 / c) ** n * np.sum( + comb(n, k) * (-1) ** k / (1.0 + c * k), axis=0 + ) + return where(c * n > -1, val, inf) + def _entropy(self, c): - if (c < 0): - return 1-c + if c < 0: + return 1 - c else: self.b = 1.0 / c return rv_continuous._entropy(self, c) -genpareto2 = genpareto2_gen(a=0.0,name='genpareto', - longname="A generalized Pareto", - shapes='c',extradoc=""" - -Generalized Pareto distribution -genpareto2.pdf(x,c) = (1+c*x)**(-1-1/c) -for c != 0, and for x >= 0 for all c, and x < 1/abs(c) for c < 0. -""") +genpareto2 = genpareto2_gen( + a=0.0, + name="genpareto", + longname="A generalized Pareto", + shapes="c", # extradoc=""" + # + # Generalized Pareto distribution + # + # genpareto2.pdf(x,c) = (1+c*x)**(-1-1/c) + # for c != 0, and for x >= 0 for all c, and x < 1/abs(c) for c < 0. 
+ # """ +) shape, loc, scale = 0.5, 0, 1 rv = np.arange(5) quant = [0.01, 0.1, 0.5, 0.9, 0.99] -for method, x in [('pdf', rv), - ('cdf', rv), - ('sf', rv), - ('ppf', quant), - ('isf', quant)]: +for method, x in [ + ("pdf", rv), + ("cdf", rv), + ("sf", rv), + ("ppf", quant), + ("isf", quant), +]: print(getattr(genpareto2, method)(x, shape, loc, scale)) print(getattr(stats.genpareto, method)(x, -shape, loc, scale)) -print(genpareto2.stats(shape, loc, scale, moments='mvsk')) -print(stats.genpareto.stats(-shape, loc, scale, moments='mvsk')) +print(genpareto2.stats(shape, loc, scale, moments="mvsk")) +print(stats.genpareto.stats(-shape, loc, scale, moments="mvsk")) print(genpareto2.entropy(shape, loc, scale)) print(stats.genpareto.entropy(-shape, loc, scale)) def paramstopot(thresh, shape, scale): - '''transform shape scale for peak over threshold + """transform shape scale for peak over threshold y = x-u|x>u ~ GPD(k, sigma-k*u) if x ~ GPD(k, sigma) notation of de Zea Bermudez, Kotz k, sigma is shape, scale - ''' - return shape, scale - shape*thresh + """ + return shape, scale - shape * thresh + def paramsfrompot(thresh, shape, scalepot): - return shape, scalepot + shape*thresh + return shape, scalepot + shape * thresh + def warnif(cond, msg): if not cond: - print(msg, 'does not hold') + print(msg, "does not hold") + def meanexcess(thresh, shape, scale): - '''mean excess function of genpareto + """mean excess function of genpareto assert are inequality conditions in de Zea Bermudez, Kotz - ''' - warnif(shape > -1, 'shape > -1') - warnif(thresh >= 0, 'thresh >= 0') #make it weak inequality - warnif((scale - shape*thresh) > 0, '(scale - shape*thresh) > 0') - return (scale - shape*thresh) / (1 + shape) + """ + warnif(shape > -1, "shape > -1") + warnif(thresh >= 0, "thresh >= 0") # make it weak inequality + warnif((scale - shape * thresh) > 0, "(scale - shape*thresh) > 0") + return (scale - shape * thresh) / (1 + shape) -def meanexcess_plot(data, params=None, lidx=100, uidx=10, method='emp', plot=0): - if method == 'est': - #does not make much sense yet, - #estimate the parameters and use theoretical meanexcess +def meanexcess_plot( + data, params=None, lidx=100, uidx=10, method="emp", plot=0 +): + if method == "est": + # does not make much sense yet, + # estimate the parameters and use theoretical meanexcess if params is None: raise NotImplementedError else: - pass #estimate parames - elif method == 'emp': - #calculate meanexcess from data + pass # estimate parames + elif method == "emp": + # calculate meanexcess from data datasorted = np.sort(data) - meanexcess = (datasorted[::-1].cumsum())/np.arange(1,len(data)+1) - datasorted[::-1] + meanexcess = (datasorted[::-1].cumsum()) / np.arange( + 1, len(data) + 1 + ) - datasorted[::-1] meanexcess = meanexcess[::-1] if plot: plt.plot(datasorted[:-uidx], meanexcess[:-uidx]) if params is not None: shape, scale = params - plt.plot(datasorted[:-uidx], (scale - datasorted[:-uidx] * shape) / (1. 
+ shape)) + plt.plot( + datasorted[:-uidx], + (scale - datasorted[:-uidx] * shape) / (1.0 + shape), + ) return datasorted, meanexcess @@ -121,31 +144,35 @@ def meanexcess_plot(data, params=None, lidx=100, uidx=10, method='emp', plot=0): print(meanexcess(5, -2, 10)) data = genpareto2.rvs(-0.75, scale=5, size=1000) -#data = np.random.uniform(50, size=1000) -#data = stats.norm.rvs(0, np.sqrt(50), size=1000) -#data = stats.pareto.rvs(1.5, np.sqrt(50), size=1000) +# data = np.random.uniform(50, size=1000) +# data = stats.norm.rvs(0, np.sqrt(50), size=1000) +# data = stats.pareto.rvs(1.5, np.sqrt(50), size=1000) tmp = meanexcess_plot(data, params=(-0.75, 5), plot=1) print(tmp[1][-20:]) print(tmp[0][-20:]) -#plt.show() +# plt.show() + def meanexcess_emp(data): datasorted = np.sort(data).astype(float) - meanexcess = (datasorted[::-1].cumsum())/np.arange(1,len(data)+1) - datasorted[::-1] - meancont = (datasorted[::-1].cumsum())/np.arange(1,len(data)+1) + meanexcess = (datasorted[::-1].cumsum()) / np.arange( + 1, len(data) + 1 + ) - datasorted[::-1] + meancont = (datasorted[::-1].cumsum()) / np.arange(1, len(data) + 1) meanexcess = meanexcess[::-1] return datasorted, meanexcess, meancont[::-1] + def meanexcess_dist(self, lb, *args, **kwds): - #default function in expect is identity + # default function in expect is identity # need args in call if np.ndim(lb) == 0: return self.expect(lb=lb, conditional=True) else: - return np.array([self.expect(lb=lbb, conditional=True) for - lbb in lb]) + return np.array([self.expect(lb=lbb, conditional=True) for lbb in lb]) -ds, me, mc = meanexcess_emp(1.*np.arange(1,10)) + +ds, me, mc = meanexcess_emp(1.0 * np.arange(1, 10)) print(ds) print(me) print(mc) @@ -154,11 +181,15 @@ def meanexcess_dist(self, lb, *args, **kwds): print(meanexcess_dist(stats.norm, lb=[-np.inf, -0.5, 0, 0.5])) rvs = stats.norm.rvs(size=100000) rvs = rvs - rvs.mean() -print(rvs.mean(), rvs[rvs>-0.5].mean(), rvs[rvs>0].mean(), rvs[rvs>0.5].mean()) - +print( + rvs.mean(), + rvs[rvs > -0.5].mean(), + rvs[rvs > 0].mean(), + rvs[rvs > 0.5].mean(), +) -''' +""" [ 1. 0.5 0. 0. 0. ] [ 1. 0.5 0. 0. 0. ] [ 0. 0.75 1. 1. 1. ] @@ -231,4 +262,4 @@ def meanexcess_dist(self, lb, *args, **kwds): >>> datasorted[::-1] array([ 9., 8., 7., 6., 5., 4., 3., 2., 1.]) >>> -''' +""" diff --git a/statsmodels/sandbox/distributions/gof_new.py b/statsmodels/sandbox/distributions/gof_new.py index ffb9588b9df..0a804858163 100644 --- a/statsmodels/sandbox/distributions/gof_new.py +++ b/statsmodels/sandbox/distributions/gof_new.py @@ -1,4 +1,4 @@ -'''More Goodness of fit tests +"""More Goodness of fit tests contains @@ -16,17 +16,17 @@ References ---------- -''' +""" from statsmodels.compat.python import lmap -import numpy as np +import numpy as np +from scipy.special import kolmogorov as ksprob from scipy.stats import distributions from statsmodels.tools.decorators import cache_readonly -from scipy.special import kolmogorov as ksprob -#from scipy.stats unchanged +# from scipy.stats unchanged def ks_2samp(data1, data2): """ Computes the Kolmogorov-Smirnof statistic on 2 samples. 
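For orientation on the ks_2samp hunk that follows: the statistic is the largest absolute distance between the two empirical CDFs, evaluated on the pooled sample via searchsorted. A minimal standalone sketch of that computation (illustrative only; the helper name ks_2samp_stat is not part of the codebase):

import numpy as np

def ks_2samp_stat(data1, data2):
    # sort each sample; searchsorted below requires sorted arrays
    data1, data2 = np.sort(data1), np.sort(data2)
    n1, n2 = len(data1), len(data2)
    # evaluate both empirical CDFs on the pooled observation points
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side="right") / n1
    cdf2 = np.searchsorted(data2, data_all, side="right") / n2
    # KS statistic: the largest absolute ECDF difference
    return np.max(np.abs(cdf1 - cdf2))

For example, ks_2samp_stat(np.random.randn(100), np.random.randn(200)) should be small, since both samples come from the same parent distribution.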
@@ -106,23 +106,24 @@ def ks_2samp(data1, data2): n2 = len(data2) data1 = np.sort(data1) data2 = np.sort(data2) - data_all = np.concatenate([data1,data2]) - #reminder: searchsorted inserts 2nd into 1st array - cdf1 = np.searchsorted(data1,data_all,side='right')/(1.0*n1) - cdf2 = (np.searchsorted(data2,data_all,side='right'))/(1.0*n2) - d = np.max(np.absolute(cdf1-cdf2)) - #Note: d absolute not signed distance - en = np.sqrt(n1*n2/float(n1+n2)) + data_all = np.concatenate([data1, data2]) + # reminder: searchsorted inserts 2nd into 1st array + cdf1 = np.searchsorted(data1, data_all, side="right") / (1.0 * n1) + cdf2 = (np.searchsorted(data2, data_all, side="right")) / (1.0 * n2) + d = np.max(np.absolute(cdf1 - cdf2)) + # Note: d absolute not signed distance + en = np.sqrt(n1 * n2 / float(n1 + n2)) try: - prob = ksprob((en+0.12+0.11/en)*d) + prob = ksprob((en + 0.12 + 0.11 / en) * d) except: prob = 1.0 return d, prob - -#from scipy.stats unchanged -def kstest(rvs, cdf, args=(), N=20, alternative = 'two_sided', mode='approx',**kwds): +# from scipy.stats unchanged +def kstest( + rvs, cdf, args=(), N=20, alternative="two_sided", mode="approx", **kwds +): """ Perform the Kolmogorov-Smirnov test for goodness of fit @@ -238,56 +239,60 @@ def kstest(rvs, cdf, args=(), N=20, alternative = 'two_sided', mode='approx',**k (0.131016895759829, 0.058826222555312224) """ if isinstance(rvs, str): - #cdf = getattr(stats, rvs).cdf + # cdf = getattr(stats, rvs).cdf if (not cdf) or (cdf == rvs): cdf = getattr(distributions, rvs).cdf rvs = getattr(distributions, rvs).rvs else: - raise AttributeError('if rvs is string, cdf has to be the same distribution') - + raise AttributeError( + "if rvs is string, cdf has to be the same distribution" + ) if isinstance(cdf, str): cdf = getattr(distributions, cdf).cdf if callable(rvs): - kwds = {'size':N} - vals = np.sort(rvs(*args,**kwds)) + kwds = {"size": N} + vals = np.sort(rvs(*args, **kwds)) else: vals = np.sort(rvs) N = len(vals) cdfvals = cdf(vals, *args) - if alternative in ['two_sided', 'greater']: - Dplus = (np.arange(1.0, N+1)/N - cdfvals).max() - if alternative == 'greater': - return Dplus, distributions.ksone.sf(Dplus,N) - - if alternative in ['two_sided', 'less']: - Dmin = (cdfvals - np.arange(0.0, N)/N).max() - if alternative == 'less': - return Dmin, distributions.ksone.sf(Dmin,N) - - if alternative == 'two_sided': - D = np.max([Dplus,Dmin]) - if mode == 'asymp': - return D, distributions.kstwobign.sf(D*np.sqrt(N)) - if mode == 'approx': - pval_two = distributions.kstwobign.sf(D*np.sqrt(N)) - if N > 2666 or pval_two > 0.80 - N*0.3/1000.0 : - return D, distributions.kstwobign.sf(D*np.sqrt(N)) + if alternative in ["two_sided", "greater"]: + Dplus = (np.arange(1.0, N + 1) / N - cdfvals).max() + if alternative == "greater": + return Dplus, distributions.ksone.sf(Dplus, N) + + if alternative in ["two_sided", "less"]: + Dmin = (cdfvals - np.arange(0.0, N) / N).max() + if alternative == "less": + return Dmin, distributions.ksone.sf(Dmin, N) + + if alternative == "two_sided": + D = np.max([Dplus, Dmin]) + if mode == "asymp": + return D, distributions.kstwobign.sf(D * np.sqrt(N)) + if mode == "approx": + pval_two = distributions.kstwobign.sf(D * np.sqrt(N)) + if N > 2666 or pval_two > 0.80 - N * 0.3 / 1000.0: + return D, distributions.kstwobign.sf(D * np.sqrt(N)) else: - return D, distributions.ksone.sf(D,N)*2 + return D, distributions.ksone.sf(D, N) * 2 -#TODO: split into modification and pvalue functions separately ? 
+ +# TODO: split into modification and pvalue functions separately ? # for separate testing and combining different pieces + def dplus_st70_upp(stat, nobs): mod_factor = np.sqrt(nobs) + 0.12 + 0.11 / np.sqrt(nobs) stat_modified = stat * mod_factor pval = np.exp(-2 * stat_modified**2) digits = np.sum(stat > np.array([0.82, 0.82, 1.00])) - #repeat low to get {0,2,3} + # repeat low to get {0,2,3} return stat_modified, pval, digits + dminus_st70_upp = dplus_st70_upp @@ -296,75 +301,85 @@ def d_st70_upp(stat, nobs): stat_modified = stat * mod_factor pval = 2 * np.exp(-2 * stat_modified**2) digits = np.sum(stat > np.array([0.91, 0.91, 1.08])) - #repeat low to get {0,2,3} + # repeat low to get {0,2,3} return stat_modified, pval, digits + def v_st70_upp(stat, nobs): mod_factor = np.sqrt(nobs) + 0.155 + 0.24 / np.sqrt(nobs) - #repeat low to get {0,2,3} + # repeat low to get {0,2,3} stat_modified = stat * mod_factor zsqu = stat_modified**2 pval = (8 * zsqu - 2) * np.exp(-2 * zsqu) digits = np.sum(stat > np.array([1.06, 1.06, 1.26])) return stat_modified, pval, digits + def wsqu_st70_upp(stat, nobs): - nobsinv = 1. / nobs + nobsinv = 1.0 / nobs stat_modified = (stat - 0.4 * nobsinv + 0.6 * nobsinv**2) * (1 + nobsinv) pval = 0.05 * np.exp(2.79 - 6 * stat_modified) digits = np.nan # some explanation in txt - #repeat low to get {0,2,3} + # repeat low to get {0,2,3} return stat_modified, pval, digits + def usqu_st70_upp(stat, nobs): - nobsinv = 1. / nobs - stat_modified = (stat - 0.1 * nobsinv + 0.1 * nobsinv**2) - stat_modified *= (1 + 0.8 * nobsinv) - pval = 2 * np.exp(- 2 * stat_modified * np.pi**2) + nobsinv = 1.0 / nobs + stat_modified = stat - 0.1 * nobsinv + 0.1 * nobsinv**2 + stat_modified *= 1 + 0.8 * nobsinv + pval = 2 * np.exp(-2 * stat_modified * np.pi**2) digits = np.sum(stat > np.array([0.29, 0.29, 0.34])) - #repeat low to get {0,2,3} + # repeat low to get {0,2,3} return stat_modified, pval, digits + def a_st70_upp(stat, nobs): - nobsinv = 1. / nobs - stat_modified = (stat - 0.7 * nobsinv + 0.9 * nobsinv**2) - stat_modified *= (1 + 1.23 * nobsinv) - pval = 1.273 * np.exp(- 2 * stat_modified / 2. 
* np.pi**2) + nobsinv = 1.0 / nobs + stat_modified = stat - 0.7 * nobsinv + 0.9 * nobsinv**2 + stat_modified *= 1 + 1.23 * nobsinv + pval = 1.273 * np.exp(-2 * stat_modified / 2.0 * np.pi**2) digits = np.sum(stat > np.array([0.11, 0.11, 0.452])) - #repeat low to get {0,2,3} + # repeat low to get {0,2,3} return stat_modified, pval, digits - gof_pvals = {} -gof_pvals['stephens70upp'] = { - 'd_plus' : dplus_st70_upp, - 'd_minus' : dplus_st70_upp, - 'd' : d_st70_upp, - 'v' : v_st70_upp, - 'wsqu' : wsqu_st70_upp, - 'usqu' : usqu_st70_upp, - 'a' : a_st70_upp } +gof_pvals["stephens70upp"] = { + "d_plus": dplus_st70_upp, + "d_minus": dplus_st70_upp, + "d": d_st70_upp, + "v": v_st70_upp, + "wsqu": wsqu_st70_upp, + "usqu": usqu_st70_upp, + "a": a_st70_upp, +} + def pval_kstest_approx(D, N): - pval_two = distributions.kstwobign.sf(D*np.sqrt(N)) - if N > 2666 or pval_two > 0.80 - N*0.3/1000.0 : - return D, distributions.kstwobign.sf(D*np.sqrt(N)), np.nan + pval_two = distributions.kstwobign.sf(D * np.sqrt(N)) + if N > 2666 or pval_two > 0.80 - N * 0.3 / 1000.0: + return D, distributions.kstwobign.sf(D * np.sqrt(N)), np.nan else: - return D, distributions.ksone.sf(D,N)*2, np.nan + return D, distributions.ksone.sf(D, N) * 2, np.nan -gof_pvals['scipy'] = { - 'd_plus' : lambda Dplus, N: (Dplus, distributions.ksone.sf(Dplus, N), np.nan), - 'd_minus' : lambda Dmin, N: (Dmin, distributions.ksone.sf(Dmin,N), np.nan), - 'd' : lambda D, N: (D, distributions.kstwobign.sf(D*np.sqrt(N)), np.nan) - } -gof_pvals['scipy_approx'] = { - 'd' : pval_kstest_approx } +gof_pvals["scipy"] = { + "d_plus": lambda Dplus, N: ( + Dplus, + distributions.ksone.sf(Dplus, N), + np.nan, + ), + "d_minus": lambda Dmin, N: (Dmin, distributions.ksone.sf(Dmin, N), np.nan), + "d": lambda D, N: (D, distributions.kstwobign.sf(D * np.sqrt(N)), np.nan), +} + +gof_pvals["scipy_approx"] = {"d": pval_kstest_approx} + class GOF(object): - '''One Sample Goodness of Fit tests + """One Sample Goodness of Fit tests includes Kolmogorov-Smirnov D, D+, D-, Kuiper V, Cramer-von Mises W^2, U^2 and Anderson-Darling A, A^2. 
The p-values for all tests except for A^2 are based on @@ -382,26 +397,24 @@ class GOF(object): - ''' - - - + """ def __init__(self, rvs, cdf, args=(), N=20): if isinstance(rvs, str): - #cdf = getattr(stats, rvs).cdf + # cdf = getattr(stats, rvs).cdf if (not cdf) or (cdf == rvs): cdf = getattr(distributions, rvs).cdf rvs = getattr(distributions, rvs).rvs else: - raise AttributeError('if rvs is string, cdf has to be the same distribution') - + raise AttributeError( + "if rvs is string, cdf has to be the same distribution" + ) if isinstance(cdf, str): cdf = getattr(distributions, cdf).cdf if callable(rvs): - kwds = {'size':N} - vals = np.sort(rvs(*args,**kwds)) + kwds = {"size": N} + vals = np.sort(rvs(*args, **kwds)) else: vals = np.sort(rvs) N = len(vals) @@ -411,19 +424,17 @@ def __init__(self, rvs, cdf, args=(), N=20): self.vals_sorted = vals self.cdfvals = cdfvals - - @cache_readonly def d_plus(self): nobs = self.nobs cdfvals = self.cdfvals - return (np.arange(1.0, nobs+1)/nobs - cdfvals).max() + return (np.arange(1.0, nobs + 1) / nobs - cdfvals).max() @cache_readonly def d_minus(self): nobs = self.nobs cdfvals = self.cdfvals - return (cdfvals - np.arange(0.0, nobs)/nobs).max() + return (cdfvals - np.arange(0.0, nobs) / nobs).max() @cache_readonly def d(self): @@ -431,25 +442,26 @@ def d(self): @cache_readonly def v(self): - '''Kuiper''' + """Kuiper""" return self.d_plus + self.d_minus @cache_readonly def wsqu(self): - '''Cramer von Mises''' + """Cramer von Mises""" nobs = self.nobs cdfvals = self.cdfvals - #use literal formula, TODO: simplify with arange(,,2) - wsqu = ((cdfvals - (2. * np.arange(1., nobs+1) - 1)/nobs/2.)**2).sum() \ - + 1./nobs/12. + # use literal formula, TODO: simplify with arange(,,2) + wsqu = ( + (cdfvals - (2.0 * np.arange(1.0, nobs + 1) - 1) / nobs / 2.0) ** 2 + ).sum() + 1.0 / nobs / 12.0 return wsqu @cache_readonly def usqu(self): nobs = self.nobs cdfvals = self.cdfvals - #use literal formula, TODO: simplify with arange(,,2) - usqu = self.wsqu - nobs * (cdfvals.mean() - 0.5)**2 + # use literal formula, TODO: simplify with arange(,,2) + usqu = self.wsqu - nobs * (cdfvals.mean() - 0.5) ** 2 return usqu @cache_readonly @@ -457,49 +469,46 @@ def a(self): nobs = self.nobs cdfvals = self.cdfvals - #one loop instead of large array + # one loop instead of large array msum = 0 - for j in range(1,nobs): + for j in range(1, nobs): mj = cdfvals[j] - cdfvals[:j] - mask = (mj > 0.5) + mask = mj > 0.5 mj[mask] = 1 - mj[mask] msum += mj.sum() - a = nobs / 4. - 2. / nobs * msum + a = nobs / 4.0 - 2.0 / nobs * msum return a @cache_readonly def asqu(self): - '''Stephens 1974, does not have p-value formula for A^2''' + """Stephens 1974, does not have p-value formula for A^2""" nobs = self.nobs cdfvals = self.cdfvals - asqu = -((2. * np.arange(1., nobs+1) - 1) * - (np.log(cdfvals) + np.log(1-cdfvals[::-1]) )).sum()/nobs - nobs + asqu = ( + -( + (2.0 * np.arange(1.0, nobs + 1) - 1) + * (np.log(cdfvals) + np.log(1 - cdfvals[::-1])) + ).sum() + / nobs + - nobs + ) return asqu - - def get_test(self, testid='d', pvals='stephens70upp'): - ''' - - ''' - #print gof_pvals[pvals][testid] + def get_test(self, testid="d", pvals="stephens70upp"): + """ """ + # print gof_pvals[pvals][testid] stat = getattr(self, testid) - if pvals == 'stephens70upp': + if pvals == "stephens70upp": return gof_pvals[pvals][testid](stat, self.nobs), stat else: return gof_pvals[pvals][testid](stat, self.nobs) - - - - - - def gof_mc(randfn, distr, nobs=100): - #print '\nIs it correctly sized?' 
+ # print '\nIs it correctly sized?' from collections import defaultdict results = defaultdict(list) @@ -507,39 +516,47 @@ def gof_mc(randfn, distr, nobs=100): rvs = randfn(nobs) goft = GOF(rvs, distr) for ti in all_gofs: - results[ti].append(goft.get_test(ti, 'stephens70upp')[0][1]) + results[ti].append(goft.get_test(ti, "stephens70upp")[0][1]) resarr = np.array([results[ti] for ti in all_gofs]) - print(' ', ' '.join(all_gofs)) - print('at 0.01:', (resarr < 0.01).mean(1)) - print('at 0.05:', (resarr < 0.05).mean(1)) - print('at 0.10:', (resarr < 0.1).mean(1)) + print(" ", " ".join(all_gofs)) + print("at 0.01:", (resarr < 0.01).mean(1)) + print("at 0.05:", (resarr < 0.05).mean(1)) + print("at 0.10:", (resarr < 0.1).mean(1)) + def asquare(cdfvals, axis=0): - '''vectorized Anderson Darling A^2, Stephens 1974''' + """vectorized Anderson Darling A^2, Stephens 1974""" ndim = len(cdfvals.shape) nobs = cdfvals.shape[axis] - slice_reverse = [slice(None)] * ndim #might make copy if not specific axis??? + slice_reverse = [ + slice(None) + ] * ndim # might make copy if not specific axis??? islice = [None] * ndim islice[axis] = slice(None) slice_reverse[axis] = slice(None, None, -1) - asqu = -((2. * np.arange(1., nobs+1)[tuple(islice)] - 1) * - (np.log(cdfvals) + np.log(1-cdfvals[tuple(slice_reverse)]))/nobs).sum(axis) \ - - nobs + asqu = ( + -( + (2.0 * np.arange(1.0, nobs + 1)[tuple(islice)] - 1) + * (np.log(cdfvals) + np.log(1 - cdfvals[tuple(slice_reverse)])) + / nobs + ).sum(axis) + - nobs + ) return asqu -#class OneSGOFFittedVec(object): +# class OneSGOFFittedVec(object): # '''for vectorized fitting''' - # currently I use the bootstrap as function instead of full class +# currently I use the bootstrap as function instead of full class - #note: kwds loc and scale are a pain - # I would need to overwrite rvs, fit and cdf depending on fixed parameters +# note: kwds loc and scale are a pain +# I would need to overwrite rvs, fit and cdf depending on fixed parameters - #def bootstrap(self, distr, args=(), kwds={}, nobs=200, nrep=1000, +# def bootstrap(self, distr, args=(), kwds={}, nobs=200, nrep=1000, def bootstrap(distr, args=(), nobs=200, nrep=100, value=None, batch_size=None): - '''Monte Carlo (or parametric bootstrap) p-values for gof + """Monte Carlo (or parametric bootstrap) p-values for gof currently hardcoded for A^2 only @@ -550,23 +567,22 @@ def bootstrap(distr, args=(), nobs=200, nrep=100, value=None, batch_size=None): this works also with nrep=1 - ''' - #signature similar to kstest ? - #delegate to fn ? - - #rvs_kwds = {'size':(nobs, nrep)} - #rvs_kwds.update(kwds) + """ + # signature similar to kstest ? + # delegate to fn ? 
+ # rvs_kwds = {'size':(nobs, nrep)} + # rvs_kwds.update(kwds) - #it will be better to build a separate batch function that calls bootstrap - #keep batch if value is true, but batch iterate from outside if stat is returned + # it will be better to build a separate batch function that calls bootstrap + # keep batch if value is true, but batch iterate from outside if stat is returned if batch_size is not None: if value is None: - raise ValueError('using batching requires a value') - n_batch = int(np.ceil(nrep/float(batch_size))) + raise ValueError("using batching requires a value") + n_batch = int(np.ceil(nrep / float(batch_size))) count = 0 for irep in range(n_batch): - rvs = distr.rvs(args, **{'size':(batch_size, nobs)}) + rvs = distr.rvs(args, **{"size": (batch_size, nobs)}) params = distr.fit_vec(rvs, axis=1) params = lmap(lambda x: np.expand_dims(x, 1), params) cdfvals = np.sort(distr.cdf(rvs, params), axis=1) @@ -574,22 +590,21 @@ def bootstrap(distr, args=(), nobs=200, nrep=100, value=None, batch_size=None): count += (stat >= value).sum() return count / float(n_batch * batch_size) else: - #rvs = distr.rvs(args, **kwds) #extension to distribution kwds ? - rvs = distr.rvs(args, **{'size':(nrep, nobs)}) + # rvs = distr.rvs(args, **kwds) #extension to distribution kwds ? + rvs = distr.rvs(args, **{"size": (nrep, nobs)}) params = distr.fit_vec(rvs, axis=1) params = lmap(lambda x: np.expand_dims(x, 1), params) cdfvals = np.sort(distr.cdf(rvs, params), axis=1) stat = asquare(cdfvals, axis=1) - if value is None: #return all bootstrap results + if value is None: # return all bootstrap results stat_sorted = np.sort(stat) return stat_sorted - else: #calculate and return specific p-value + else: # calculate and return specific p-value return (stat >= value).mean() - def bootstrap2(value, distr, args=(), nobs=200, nrep=100): - '''Monte Carlo (or parametric bootstrap) p-values for gof + """Monte Carlo (or parametric bootstrap) p-values for gof currently hardcoded for A^2 only @@ -598,28 +613,26 @@ def bootstrap2(value, distr, args=(), nobs=200, nrep=100): rename function to less generic - ''' - #signature similar to kstest ? - #delegate to fn ? - - #rvs_kwds = {'size':(nobs, nrep)} - #rvs_kwds.update(kwds) + """ + # signature similar to kstest ? + # delegate to fn ? + # rvs_kwds = {'size':(nobs, nrep)} + # rvs_kwds.update(kwds) count = 0 for irep in range(nrep): - #rvs = distr.rvs(args, **kwds) #extension to distribution kwds ? - rvs = distr.rvs(args, **{'size':nobs}) + # rvs = distr.rvs(args, **kwds) #extension to distribution kwds ? + rvs = distr.rvs(args, **{"size": nobs}) params = distr.fit_vec(rvs) cdfvals = np.sort(distr.cdf(rvs, params)) stat = asquare(cdfvals, axis=0) - count += (stat >= value) - return count * 1. 
/ nrep + count += stat >= value + return count * 1.0 / nrep class NewNorm(object): - '''just a holder for modified distributions - ''' + """just a holder for modified distributions""" def fit_vec(self, x, axis=0): return x.mean(axis), x.std(axis) @@ -628,59 +641,57 @@ def cdf(self, x, args): return distributions.norm.cdf(x, loc=args[0], scale=args[1]) def rvs(self, args, size): - loc=args[0] - scale=args[1] + loc = args[0] + scale = args[1] return loc + scale * distributions.norm.rvs(size=size) - - - -if __name__ == '__main__': +if __name__ == "__main__": from scipy import stats - #rvs = np.random.randn(1000) + + # rvs = np.random.randn(1000) rvs = stats.t.rvs(3, size=200) - print('scipy kstest') - print(kstest(rvs, 'norm')) - goft = GOF(rvs, 'norm') + print("scipy kstest") + print(kstest(rvs, "norm")) + goft = GOF(rvs, "norm") print(goft.get_test()) - all_gofs = ['d', 'd_plus', 'd_minus', 'v', 'wsqu', 'usqu', 'a'] + all_gofs = ["d", "d_plus", "d_minus", "v", "wsqu", "usqu", "a"] for ti in all_gofs: - print(ti, goft.get_test(ti, 'stephens70upp')) + print(ti, goft.get_test(ti, "stephens70upp")) - print('\nIs it correctly sized?') + print("\nIs it correctly sized?") from collections import defaultdict results = defaultdict(list) nobs = 200 for i in range(100): rvs = np.random.randn(nobs) - goft = GOF(rvs, 'norm') + goft = GOF(rvs, "norm") for ti in all_gofs: - results[ti].append(goft.get_test(ti, 'stephens70upp')[0][1]) + results[ti].append(goft.get_test(ti, "stephens70upp")[0][1]) resarr = np.array([results[ti] for ti in all_gofs]) - print(' ', ' '.join(all_gofs)) - print('at 0.01:', (resarr < 0.01).mean(1)) - print('at 0.05:', (resarr < 0.05).mean(1)) - print('at 0.10:', (resarr < 0.1).mean(1)) + print(" ", " ".join(all_gofs)) + print("at 0.01:", (resarr < 0.01).mean(1)) + print("at 0.05:", (resarr < 0.05).mean(1)) + print("at 0.10:", (resarr < 0.1).mean(1)) - gof_mc(lambda nobs: stats.t.rvs(3, size=nobs), 'norm', nobs=200) + gof_mc(lambda nobs: stats.t.rvs(3, size=nobs), "norm", nobs=200) nobs = 200 nrep = 100 - bt = bootstrap(NewNorm(), args=(0,1), nobs=nobs, nrep=nrep, value=None) + bt = bootstrap(NewNorm(), args=(0, 1), nobs=nobs, nrep=nrep, value=None) quantindex = np.floor(nrep * np.array([0.99, 0.95, 0.9])).astype(int) print(bt[quantindex]) - #the bootstrap results match Stephens pretty well for nobs=100, but not so well for - #large (1000) or small (20) nobs - ''' + # the bootstrap results match Stephens pretty well for nobs=100, but not so well for + # large (1000) or small (20) nobs + """ >>> np.array([15.0, 10.0, 5.0, 2.5, 1.0])/100. 
#Stephens array([ 0.15 , 0.1 , 0.05 , 0.025, 0.01 ]) >>> nobs = 100 >>> [bootstrap(NewNorm(), args=(0,1), nobs=nobs, nrep=10000, value=c/ (1 + 4./nobs - 25./nobs**2)) for c in [0.576, 0.656, 0.787, 0.918, 1.092]] [0.1545, 0.10009999999999999, 0.049000000000000002, 0.023, 0.0104] >>> - ''' + """ diff --git a/statsmodels/sandbox/distributions/multivariate.py b/statsmodels/sandbox/distributions/multivariate.py index fce61204629..b6c232804ae 100644 --- a/statsmodels/sandbox/distributions/multivariate.py +++ b/statsmodels/sandbox/distributions/multivariate.py @@ -1,4 +1,4 @@ -'''Multivariate Distribution +"""Multivariate Distribution Probability of a multivariate t distribution @@ -13,54 +13,72 @@ Reference: Genz and Bretz for formula -''' +""" import numpy as np -from scipy import integrate, stats, special +from numpy import exp as np_exp, log as np_log +from scipy import integrate, special, stats +from scipy.special import gamma as sps_gamma, gammaln as sps_gammaln from scipy.stats import chi from .extras import mvstdnormcdf -from numpy import exp as np_exp -from numpy import log as np_log -from scipy.special import gamma as sps_gamma -from scipy.special import gammaln as sps_gammaln def chi2_pdf(self, x, df): - '''pdf of chi-square distribution''' - #from scipy.stats.distributions - Px = x**(df/2.0-1)*np.exp(-x/2.0) - Px /= special.gamma(df/2.0)* 2**(df/2.0) + """pdf of chi-square distribution""" + # from scipy.stats.distributions + Px = x ** (df / 2.0 - 1) * np.exp(-x / 2.0) + Px /= special.gamma(df / 2.0) * 2 ** (df / 2.0) return Px + def chi_pdf(x, df): - tmp = (df-1.)*np_log(x) + (-x*x*0.5) - (df*0.5-1)*np_log(2.0) \ - - sps_gammaln(df*0.5) + tmp = ( + (df - 1.0) * np_log(x) + + (-x * x * 0.5) + - (df * 0.5 - 1) * np_log(2.0) + - sps_gammaln(df * 0.5) + ) return np_exp(tmp) - #return x**(df-1.)*np_exp(-x*x*0.5)/(2.0)**(df*0.5-1)/sps_gamma(df*0.5) + # return x**(df-1.)*np_exp(-x*x*0.5)/(2.0)**(df*0.5-1)/sps_gamma(df*0.5) + def chi_logpdf(x, df): - tmp = (df-1.)*np_log(x) + (-x*x*0.5) - (df*0.5-1)*np_log(2.0) \ - - sps_gammaln(df*0.5) + tmp = ( + (df - 1.0) * np_log(x) + + (-x * x * 0.5) + - (df * 0.5 - 1) * np_log(2.0) + - sps_gammaln(df * 0.5) + ) return tmp + def funbgh(s, a, b, R, df): - sqrt_df = np.sqrt(df+0.5) - ret = chi_logpdf(s,df) - ret += np_log(mvstdnormcdf(s*a/sqrt_df, s*b/sqrt_df, R, - maxpts=1000000, abseps=1e-6)) + sqrt_df = np.sqrt(df + 0.5) + ret = chi_logpdf(s, df) + ret += np_log( + mvstdnormcdf( + s * a / sqrt_df, s * b / sqrt_df, R, maxpts=1000000, abseps=1e-6 + ) + ) ret = np_exp(ret) return ret + def funbgh2(s, a, b, R, df): n = len(a) sqrt_df = np.sqrt(df) - #np.power(s, df-1) * np_exp(-s*s*0.5) - return np_exp((df-1)*np_log(s)-s*s*0.5) \ - * mvstdnormcdf(s*a/sqrt_df, s*b/sqrt_df, R[np.tril_indices(n, -1)], - maxpts=1000000, abseps=1e-4) + # np.power(s, df-1) * np_exp(-s*s*0.5) + return np_exp((df - 1) * np_log(s) - s * s * 0.5) * mvstdnormcdf( + s * a / sqrt_df, + s * b / sqrt_df, + R[np.tril_indices(n, -1)], + maxpts=1000000, + abseps=1e-4, + ) + def bghfactor(df): - return np.power(2.0, 1-df*0.5) / sps_gamma(df*0.5) + return np.power(2.0, 1 - df * 0.5) / sps_gamma(df * 0.5) def mvstdtprob(a, b, R, df, ieps=1e-5, quadkwds=None, mvstkwds=None): @@ -83,10 +101,11 @@ def mvstdtprob(a, b, R, df, ieps=1e-5, quadkwds=None, mvstkwds=None): prob = res * bghfactor(df) return prob -#written by Enzo Michelangeli, style changes by josef-pktd + +# written by Enzo Michelangeli, style changes by josef-pktd # Student's T random variable def multivariate_t_rvs(m, S, df=np.inf, 
n=1): - '''generate random variables of multivariate t distribution + """generate random variables of multivariate t distribution Parameters ---------- @@ -106,54 +125,52 @@ def multivariate_t_rvs(m, S, df=np.inf, n=1): random variable - ''' + """ m = np.asarray(m) d = len(m) if df == np.inf: x = np.ones(n) else: - x = np.random.chisquare(df, n)/df - z = np.random.multivariate_normal(np.zeros(d),S,(n,)) - return m + z/np.sqrt(x)[:,None] # same output format as random.multivariate_normal + x = np.random.chisquare(df, n) / df + z = np.random.multivariate_normal(np.zeros(d), S, (n,)) + return ( + m + z / np.sqrt(x)[:, None] + ) # same output format as random.multivariate_normal - - -if __name__ == '__main__': - corr = np.asarray([[1.0, 0, 0.5],[0,1,0],[0.5,0,1]]) - corr_indep = np.asarray([[1.0, 0, 0],[0,1,0],[0,0,1]]) - corr_equal = np.asarray([[1.0, 0.5, 0.5],[0.5,1,0.5],[0.5,0.5,1]]) +if __name__ == "__main__": + corr = np.asarray([[1.0, 0, 0.5], [0, 1, 0], [0.5, 0, 1]]) + corr_indep = np.asarray([[1.0, 0, 0], [0, 1, 0], [0, 0, 1]]) + corr_equal = np.asarray([[1.0, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]) R = corr_equal - a = np.array([-np.inf,-np.inf,-100.0]) - a = np.array([-0.96,-0.96,-0.96]) - b = np.array([0.0,0.0,0.0]) - b = np.array([0.96,0.96, 0.96]) + a = np.array([-np.inf, -np.inf, -100.0]) + a = np.array([-0.96, -0.96, -0.96]) + b = np.array([0.0, 0.0, 0.0]) + b = np.array([0.96, 0.96, 0.96]) a[:] = -1 b[:] = 3 - df = 10. + df = 10.0 sqrt_df = np.sqrt(df) print(mvstdnormcdf(a, b, corr, abseps=1e-6)) - #print integrate.quad(funbgh, 0, np.inf, args=(a,b,R,df)) - print((stats.t.cdf(b[0], df) - stats.t.cdf(a[0], df))**3) + # print integrate.quad(funbgh, 0, np.inf, args=(a,b,R,df)) + print((stats.t.cdf(b[0], df) - stats.t.cdf(a[0], df)) ** 3) s = 1 - print(mvstdnormcdf(s*a/sqrt_df, s*b/sqrt_df, R)) - + print(mvstdnormcdf(s * a / sqrt_df, s * b / sqrt_df, R)) - df=4 + df = 4 print(mvstdtprob(a, b, R, df)) - S = np.array([[1.,.5],[.5,1.]]) - print(multivariate_t_rvs([10.,20.], S, 2, 5)) + S = np.array([[1.0, 0.5], [0.5, 1.0]]) + print(multivariate_t_rvs([10.0, 20.0], S, 2, 5)) nobs = 10000 - rvst = multivariate_t_rvs([10.,20.], S, 2, nobs) - print(np.sum((rvst<[10.,20.]).all(1),0) * 1. / nobs) - print(mvstdtprob(-np.inf*np.ones(2), np.zeros(2), R[:2,:2], 2)) + rvst = multivariate_t_rvs([10.0, 20.0], S, 2, nobs) + print(np.sum((rvst < [10.0, 20.0]).all(1), 0) * 1.0 / nobs) + print(mvstdtprob(-np.inf * np.ones(2), np.zeros(2), R[:2, :2], 2)) - - ''' + """ > lower <- -1 > upper <- 3 > df <- 4 @@ -168,4 +185,4 @@ def multivariate_t_rvs(m, S, df=np.inf, n=1): > (pt(upper, df) - pt(lower, df))**3 [1] 0.4988254 - ''' + """ diff --git a/statsmodels/sandbox/distributions/mv_measures.py b/statsmodels/sandbox/distributions/mv_measures.py index 7074da810f9..3b4f8eba523 100644 --- a/statsmodels/sandbox/distributions/mv_measures.py +++ b/statsmodels/sandbox/distributions/mv_measures.py @@ -1,4 +1,4 @@ -'''using multivariate dependence and divergence measures +"""using multivariate dependence and divergence measures The standard correlation coefficient measures only linear dependence between random variables. 
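The mv_measures hunks below reformat KDE- and histogram-based mutual information estimators. The KDE variant computes MI as the sample mean of log f(y, x) - log f(x) - log f(y) over Gaussian kernel density estimates, optionally normalized to sqrt(1 - exp(-2 * MI)) so it lies on a [0, 1) scale comparable to a correlation coefficient. A minimal sketch of that estimator (sample data and sizes are illustrative, not from the codebase):

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
x = rng.standard_normal(500)
y = x**2 + 0.5 * rng.standard_normal(500)  # nonlinear dependence on x

yx = np.vstack((y, x))
kde_x = gaussian_kde(x)(x)     # marginal density of x at the sample points
kde_y = gaussian_kde(y)(y)     # marginal density of y at the sample points
kde_yx = gaussian_kde(yx)(yx)  # joint density at the sample points

mi = np.mean(np.log(kde_yx) - np.log(kde_x) - np.log(kde_y))
mi_normed = np.sqrt(1.0 - np.exp(-2 * mi))  # map onto a [0, 1) scale
print(mi, mi_normed)

A plain correlation coefficient on these data would be near zero, while the mutual information estimate picks up the quadratic dependence.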
@@ -18,7 +18,7 @@ http://pre.aps.org/abstract/PRE/v76/i2/e026209 -''' +""" import numpy as np from scipy import stats @@ -28,15 +28,13 @@ def mutualinfo_kde(y, x, normed=True): - '''mutual information of two random variables estimated with kde - - ''' + """mutual information of two random variables estimated with kde""" nobs = len(x) if not len(y) == nobs: - raise ValueError('both data arrays need to have the same size') + raise ValueError("both data arrays need to have the same size") x = np.asarray(x, float) y = np.asarray(y, float) - yx = np.vstack((y,x)) + yx = np.vstack((y, x)) kde_x = gaussian_kde(x)(x) kde_y = gaussian_kde(y)(y) kde_yx = gaussian_kde(yx)(yx) @@ -44,35 +42,35 @@ def mutualinfo_kde(y, x, normed=True): mi_obs = np.log(kde_yx) - np.log(kde_x) - np.log(kde_y) mi = mi_obs.sum() / nobs if normed: - mi_normed = np.sqrt(1. - np.exp(-2 * mi)) + mi_normed = np.sqrt(1.0 - np.exp(-2 * mi)) return mi_normed else: return mi -def mutualinfo_kde_2sample(y, x, normed=True): - '''mutual information of two random variables estimated with kde - ''' +def mutualinfo_kde_2sample(y, x, normed=True): + """mutual information of two random variables estimated with kde""" nobs = len(x) x = np.asarray(x, float) y = np.asarray(y, float) - #yx = np.vstack((y,x)) + # yx = np.vstack((y,x)) kde_x = gaussian_kde(x.T)(x.T) kde_y = gaussian_kde(y.T)(x.T) - #kde_yx = gaussian_kde(yx)(yx) + # kde_yx = gaussian_kde(yx)(yx) mi_obs = np.log(kde_x) - np.log(kde_y) if len(mi_obs) != nobs: raise ValueError("Wrong number of observations") mi = mi_obs.mean() if normed: - mi_normed = np.sqrt(1. - np.exp(-2 * mi)) + mi_normed = np.sqrt(1.0 - np.exp(-2 * mi)) return mi_normed else: return mi + def mutualinfo_binned(y, x, bins, normed=True): - '''mutual information of two random variables estimated with kde + """mutual information of two random variables estimated with kde @@ -82,114 +80,123 @@ def mutualinfo_binned(y, x, bins, normed=True): are expected to be in each bin under the assumption of independence. This follows roughly the description in Kahn et al. 
2007 - ''' + """ nobs = len(x) if not len(y) == nobs: - raise ValueError('both data arrays need to have the same size') + raise ValueError("both data arrays need to have the same size") x = np.asarray(x, float) y = np.asarray(y, float) - #yx = np.vstack((y,x)) + # yx = np.vstack((y,x)) + ## fyx, binsy, binsx = np.histogram2d(y, x, bins=bins) + ## fx, binsx_ = np.histogram(x, bins=binsx) + ## fy, binsy_ = np.histogram(y, bins=binsy) -## fyx, binsy, binsx = np.histogram2d(y, x, bins=bins) -## fx, binsx_ = np.histogram(x, bins=binsx) -## fy, binsy_ = np.histogram(y, bins=binsy) - - if bins == 'auto': + if bins == "auto": ys = np.sort(y) xs = np.sort(x) - #quantiles = np.array([0,0.25, 0.4, 0.6, 0.75, 1]) - qbin_sqr = np.sqrt(5./nobs) - quantiles = np.linspace(0, 1, 1./qbin_sqr) - quantile_index = ((nobs-1)*quantiles).astype(int) - #move edges so that they do not coincide with an observation + # quantiles = np.array([0,0.25, 0.4, 0.6, 0.75, 1]) + qbin_sqr = np.sqrt(5.0 / nobs) + quantiles = np.linspace(0, 1, 1.0 / qbin_sqr) + quantile_index = ((nobs - 1) * quantiles).astype(int) + # move edges so that they do not coincide with an observation shift = 1e-6 + np.ones(quantiles.shape) - shift[0] -= 2*1e-6 + shift[0] -= 2 * 1e-6 binsy = ys[quantile_index] + shift binsx = xs[quantile_index] + shift elif np.size(bins) == 1: binsy = bins binsx = bins - elif (len(bins) == 2): + elif len(bins) == 2: binsy, binsx = bins -## if np.size(bins[0]) == 1: -## binsx = bins[0] -## if np.size(bins[1]) == 1: -## binsx = bins[1] + ## if np.size(bins[0]) == 1: + ## binsx = bins[0] + ## if np.size(bins[1]) == 1: + ## binsx = bins[1] fx, binsx = np.histogram(x, bins=binsx) fy, binsy = np.histogram(y, bins=binsy) fyx, binsy, binsx = np.histogram2d(y, x, bins=(binsy, binsx)) - pyx = fyx * 1. / nobs - px = fx * 1. / nobs - py = fy * 1. / nobs - + pyx = fyx * 1.0 / nobs + px = fx * 1.0 / nobs + py = fy * 1.0 / nobs - mi_obs = pyx * (np.log(pyx+1e-10) - np.log(py)[:,None] - np.log(px)) + mi_obs = pyx * (np.log(pyx + 1e-10) - np.log(py)[:, None] - np.log(px)) mi = mi_obs.sum() if normed: - mi_normed = np.sqrt(1. - np.exp(-2 * mi)) + mi_normed = np.sqrt(1.0 - np.exp(-2 * mi)) return mi_normed, (pyx, py, px, binsy, binsx), mi_obs else: return mi -if __name__ == '__main__': +if __name__ == "__main__": import statsmodels.api as sm - funtype = ['linear', 'quadratic'][1] + funtype = ["linear", "quadratic"][1] nobs = 200 - sig = 2#5. - #x = np.linspace(-3, 3, nobs) + np.random.randn(nobs) - x = np.sort(3*np.random.randn(nobs)) + sig = 2 # 5. 
+ # x = np.linspace(-3, 3, nobs) + np.random.randn(nobs) + x = np.sort(3 * np.random.randn(nobs)) exog = sm.add_constant(x, prepend=True) - #y = 0 + np.log(1+x**2) + sig * np.random.randn(nobs) - if funtype == 'quadratic': + # y = 0 + np.log(1+x**2) + sig * np.random.randn(nobs) + if funtype == "quadratic": y = 0 + x**2 + sig * np.random.randn(nobs) - if funtype == 'linear': + if funtype == "linear": y = 0 + x + sig * np.random.randn(nobs) - print('correlation') - print(np.corrcoef(y,x)[0, 1]) - print('pearsonr', stats.pearsonr(y,x)) - print('spearmanr', stats.spearmanr(y,x)) - print('kendalltau', stats.kendalltau(y,x)) + print("correlation") + print(np.corrcoef(y, x)[0, 1]) + print("pearsonr", stats.pearsonr(y, x)) + print("spearmanr", stats.spearmanr(y, x)) + print("kendalltau", stats.kendalltau(y, x)) - pxy, binsx, binsy = np.histogram2d(x,y, bins=5) + pxy, binsx, binsy = np.histogram2d(x, y, bins=5) px, binsx_ = np.histogram(x, bins=binsx) py, binsy_ = np.histogram(y, bins=binsy) - print('mutualinfo', infotheo.mutualinfo(px*1./nobs, py*1./nobs, - 1e-15+pxy*1./nobs, logbase=np.e)) - - print('mutualinfo_kde normed', mutualinfo_kde(y,x)) - print('mutualinfo_kde ', mutualinfo_kde(y,x, normed=False)) - mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \ - mutualinfo_binned(y, x, 5, normed=True) - print('mutualinfo_binned normed', mi_normed) - print('mutualinfo_binned ', mi_obs.sum()) - - mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \ - mutualinfo_binned(y, x, 'auto', normed=True) - print('auto') - print('mutualinfo_binned normed', mi_normed) - print('mutualinfo_binned ', mi_obs.sum()) + print( + "mutualinfo", + infotheo.mutualinfo( + px * 1.0 / nobs, + py * 1.0 / nobs, + 1e-15 + pxy * 1.0 / nobs, + logbase=np.e, + ), + ) + + print("mutualinfo_kde normed", mutualinfo_kde(y, x)) + print("mutualinfo_kde ", mutualinfo_kde(y, x, normed=False)) + mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = mutualinfo_binned( + y, x, 5, normed=True + ) + print("mutualinfo_binned normed", mi_normed) + print("mutualinfo_binned ", mi_obs.sum()) + + mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = mutualinfo_binned( + y, x, "auto", normed=True + ) + print("auto") + print("mutualinfo_binned normed", mi_normed) + print("mutualinfo_binned ", mi_obs.sum()) ys = np.sort(y) xs = np.sort(x) - by = ys[((nobs-1)*np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)] - bx = xs[((nobs-1)*np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)] - mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \ - mutualinfo_binned(y, x, (by,bx), normed=True) - print('quantiles') - print('mutualinfo_binned normed', mi_normed) - print('mutualinfo_binned ', mi_obs.sum()) - - doplot = 1#False + by = ys[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)] + bx = xs[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)] + mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = mutualinfo_binned( + y, x, (by, bx), normed=True + ) + print("quantiles") + print("mutualinfo_binned normed", mi_normed) + print("mutualinfo_binned ", mi_obs.sum()) + + doplot = 1 # False if doplot: import matplotlib.pyplot as plt - plt.plot(x, y, 'o') + + plt.plot(x, y, "o") olsres = sm.OLS(y, exog).fit() plt.plot(x, olsres.fittedvalues) diff --git a/statsmodels/sandbox/distributions/mv_normal.py b/statsmodels/sandbox/distributions/mv_normal.py index 56510e54ce5..91d4ddc9a0e 100644 --- a/statsmodels/sandbox/distributions/mv_normal.py +++ b/statsmodels/sandbox/distributions/mv_normal.py @@ -148,11 +148,12 @@ from scipy import 
special from statsmodels.sandbox.distributions.multivariate import mvstdtprob + from .extras import mvnormcdf def expect_mc(dist, func=lambda x: 1, size=50000): - '''calculate expected value of function by Monte Carlo integration + """calculate expected value of function by Monte Carlo integration Parameters ---------- @@ -196,15 +197,25 @@ def expect_mc(dist, func=lambda x: 1, size=50000): array([ 0.09937, 0.10075]) - ''' + """ + def fun(x): - return func(x) # * dist.pdf(x) + return func(x) # * dist.pdf(x) + rvs = dist.rvs(size=size) return fun(rvs).mean(0) -def expect_mc_bounds(dist, func=lambda x: 1, size=50000, lower=None, upper=None, - conditional=False, overfact=1.2): - '''calculate expected value of function by Monte Carlo integration + +def expect_mc_bounds( + dist, + func=lambda x: 1, + size=50000, + lower=None, + upper=None, + conditional=False, + overfact=1.2, +): + """calculate expected value of function by Monte Carlo integration Parameters ---------- @@ -260,8 +271,8 @@ def expect_mc_bounds(dist, func=lambda x: 1, size=50000, lower=None, upper=None, [0.0, 1.0, 0.0, 3.0] - ''' - #call rvs once to find length of random vector + """ + # call rvs once to find length of random vector rvsdim = dist.rvs(size=1).shape[-1] if lower is None: lower = -np.inf * np.ones(rvsdim) @@ -273,33 +284,33 @@ def expect_mc_bounds(dist, func=lambda x: 1, size=50000, lower=None, upper=None, upper = np.asarray(upper) def fun(x): - return func(x) # * dist.pdf(x) + return func(x) # * dist.pdf(x) rvsli = [] - used = 0 #remain = size #inplace changes size + used = 0 # remain = size #inplace changes size total = 0 while True: - remain = size - used #just a temp variable + remain = size - used # just a temp variable rvs = dist.rvs(size=int(remain * overfact)) total += int(size * overfact) rvsok = rvs[((rvs >= lower) & (rvs <= upper)).all(-1)] - #if rvsok.ndim == 1: #possible shape problems if only 1 random vector + # if rvsok.ndim == 1: #possible shape problems if only 1 random vector rvsok = np.atleast_2d(rvsok) used += rvsok.shape[0] - rvsli.append(rvsok) #[:remain]) use extras instead + rvsli.append(rvsok) # [:remain]) use extras instead print(used) if used >= size: break rvs = np.vstack(rvsli) print(rvs.shape) - assert used == rvs.shape[0] #saftey check + assert used == rvs.shape[0] # saftey check mean_conditional = fun(rvs).mean(0) if conditional: return mean_conditional else: - return mean_conditional * (used * 1. 
/ total) + return mean_conditional * (used * 1.0 / total) def bivariate_normal(x, mu, cov): @@ -314,20 +325,22 @@ def bivariate_normal(x, mu, cov): mux, muy = mu sigmax, sigmaxy, tmp, sigmay = np.ravel(cov) sigmax, sigmay = np.sqrt(sigmax), np.sqrt(sigmay) - Xmu = X-mux - Ymu = Y-muy - - rho = sigmaxy/(sigmax*sigmay) - z = Xmu**2/sigmax**2 + Ymu**2/sigmay**2 - 2*rho*Xmu*Ymu/(sigmax*sigmay) - denom = 2*np.pi*sigmax*sigmay*np.sqrt(1-rho**2) - return np.exp( -z/(2*(1-rho**2))) / denom + Xmu = X - mux + Ymu = Y - muy + rho = sigmaxy / (sigmax * sigmay) + z = ( + Xmu**2 / sigmax**2 + + Ymu**2 / sigmay**2 + - 2 * rho * Xmu * Ymu / (sigmax * sigmay) + ) + denom = 2 * np.pi * sigmax * sigmay * np.sqrt(1 - rho**2) + return np.exp(-z / (2 * (1 - rho**2))) / denom class BivariateNormal(object): - - #TODO: make integration limits more flexible + # TODO: make integration limits more flexible # or normalize before integration def __init__(self, mean, cov): @@ -343,22 +356,25 @@ def pdf(self, x): return bivariate_normal(x, self.mean, self.cov) def logpdf(self, x): - #TODO: replace this + # TODO: replace this return np.log(self.pdf(x)) def cdf(self, x): return self.expect(upper=x) - def expect(self, func=lambda x: 1, lower=(-10,-10), upper=(10,10)): + def expect(self, func=lambda x: 1, lower=(-10, -10), upper=(10, 10)): def fun(x, y): - x = np.column_stack((x,y)) + x = np.column_stack((x, y)) return func(x) * self.pdf(x) + from scipy.integrate import dblquad - return dblquad(fun, lower[0], upper[0], lambda y: lower[1], - lambda y: upper[1]) + + return dblquad( + fun, lower[0], upper[0], lambda y: lower[1], lambda y: upper[1] + ) def kl(self, other): - '''Kullback-Leibler divergence between this and another distribution + """Kullback-Leibler divergence between this and another distribution int f(x) (log f(x) - log g(x)) dx @@ -368,27 +384,28 @@ def kl(self, other): limits currently hardcoded - ''' - fun = lambda x : self.logpdf(x) - other.logpdf(x) + """ + fun = lambda x: self.logpdf(x) - other.logpdf(x) return self.expect(fun) def kl_mc(self, other, size=500000): - fun = lambda x : self.logpdf(x) - other.logpdf(x) + fun = lambda x: self.logpdf(x) - other.logpdf(x) rvs = self.rvs(size=size) return fun(rvs).mean() + class MVElliptical(object): - '''Base Class for multivariate elliptical distributions, normal and t + """Base Class for multivariate elliptical distributions, normal and t contains common initialization, and some common methods subclass needs to implement at least rvs and logpdf methods - ''' - #getting common things between normal and t distribution + """ + # getting common things between normal and t distribution def __init__(self, mean, sigma, *args, **kwds): - '''initialize instance + """initialize instance Parameters ---------- @@ -403,39 +420,38 @@ def __init__(self, mean, sigma, *args, **kwds): kwds : dict currently not used - ''' + """ self.extra_args = [] self.mean = np.asarray(mean) self.sigma = sigma = np.asarray(sigma) sigma = np.squeeze(sigma) self.nvars = nvars = len(mean) - #self.covchol = np.linalg.cholesky(sigma) - + # self.covchol = np.linalg.cholesky(sigma) - #in the following sigma is original, self.sigma is full matrix + # in the following sigma is original, self.sigma is full matrix if sigma.shape == (): - #iid + # iid self.sigma = np.eye(nvars) * sigma self.sigmainv = np.eye(nvars) / sigma self.cholsigmainv = np.eye(nvars) / np.sqrt(sigma) elif (sigma.ndim == 1) and (len(sigma) == nvars): - #independent heteroskedastic + # independent heteroskedastic self.sigma = 
np.diag(sigma) - self.sigmainv = np.diag(1. / sigma) - self.cholsigmainv = np.diag( 1. / np.sqrt(sigma)) - elif sigma.shape == (nvars, nvars): #python tuple comparison - #general + self.sigmainv = np.diag(1.0 / sigma) + self.cholsigmainv = np.diag(1.0 / np.sqrt(sigma)) + elif sigma.shape == (nvars, nvars): # python tuple comparison + # general self.sigmainv = np.linalg.pinv(sigma) self.cholsigmainv = np.linalg.cholesky(self.sigmainv).T else: - raise ValueError('sigma has invalid shape') + raise ValueError("sigma has invalid shape") - #store logdetsigma for logpdf + # store logdetsigma for logpdf self.logdetsigma = np.log(np.linalg.det(self.sigma)) def rvs(self, size=1): - '''random variable + """random variable Parameters ---------- @@ -450,11 +466,11 @@ def rvs(self, size=1): dimension - ''' + """ raise NotImplementedError def logpdf(self, x): - '''logarithm of probability density function + """logarithm of probability density function Parameters ---------- @@ -472,13 +488,12 @@ def logpdf(self, x): with multivariate normal vector in each row and iid across rows does not work now because of dot in whiten - ''' - + """ raise NotImplementedError def cdf(self, x, **kwds): - '''cumulative distribution function + """cumulative distribution function Parameters ---------- @@ -493,14 +508,13 @@ def cdf(self, x, **kwds): cdf : float or array probability density value of each random vector - ''' + """ raise NotImplementedError - def affine_transformed(self, shift, scale_matrix): - '''affine transformation define in subclass because of distribution - specific restrictions''' - #implemented in subclass at least for now + """affine transformation define in subclass because of distribution + specific restrictions""" + # implemented in subclass at least for now raise NotImplementedError def whiten(self, x): @@ -530,7 +544,7 @@ def whiten(self, x): return np.dot(x, self.cholsigmainv.T) def pdf(self, x): - '''probability density function + """probability density function Parameters ---------- @@ -543,11 +557,11 @@ def pdf(self, x): pdf : float or array probability density value of each random vector - ''' + """ return np.exp(self.logpdf(x)) def standardize(self, x): - '''standardize the random variable, i.e. subtract mean and whiten + """standardize the random variable, i.e. subtract mean and whiten Parameters ---------- @@ -568,17 +582,15 @@ def standardize(self, x): whiten : rescale random variable, standardize without subtracting mean. - ''' + """ return self.whiten(x - self.mean) def standardized(self): - '''return new standardized MVNormal instance - ''' + """return new standardized MVNormal instance""" return self.affine_transformed(-self.mean, self.cholsigmainv) - def normalize(self, x): - '''normalize the random variable, i.e. subtract mean and rescale + """normalize the random variable, i.e. subtract mean and rescale The distribution will have zero mean and sigma equal to correlation @@ -601,16 +613,16 @@ def normalize(self, x): whiten : rescale random variable, standardize without subtracting mean. 
- ''' + """ std_ = np.atleast_2d(self.std_sigma) - return (x - self.mean)/std_ #/std_.T + return (x - self.mean) / std_ # /std_.T def normalized(self, demeaned=True): - '''return a normalized distribution where sigma=corr + """return a normalized distribution where sigma=corr if demeaned is True, then mean will be set to zero - ''' + """ if demeaned: mean_new = np.zeros_like(self.mean) else: @@ -620,44 +632,39 @@ def normalized(self, demeaned=True): return self.__class__(mean_new, sigma_new, *args) def normalized2(self, demeaned=True): - '''return a normalized distribution where sigma=corr + """return a normalized distribution where sigma=corr second implementation for testing affine transformation - ''' + """ if demeaned: shift = -self.mean else: - shift = self.mean * (1. / self.std_sigma - 1.) - return self.affine_transformed(shift, np.diag(1. / self.std_sigma)) - #the following "standardizes" cov instead - #return self.affine_transformed(shift, self.cholsigmainv) - - + shift = self.mean * (1.0 / self.std_sigma - 1.0) + return self.affine_transformed(shift, np.diag(1.0 / self.std_sigma)) + # the following "standardizes" cov instead + # return self.affine_transformed(shift, self.cholsigmainv) @property def std(self): - '''standard deviation, square root of diagonal elements of cov - ''' + """standard deviation, square root of diagonal elements of cov""" return np.sqrt(np.diag(self.cov)) @property def std_sigma(self): - '''standard deviation, square root of diagonal elements of sigma - ''' + """standard deviation, square root of diagonal elements of sigma""" return np.sqrt(np.diag(self.sigma)) - @property def corr(self): - '''correlation matrix''' + """correlation matrix""" return self.cov / np.outer(self.std, self.std) expect_mc = expect_mc def marginal(self, indices): - '''return marginal distribution for variables given by indices + """return marginal distribution for variables given by indices this should be correct for normal and t distribution @@ -673,17 +680,17 @@ def marginal(self, indices): contains the marginal distribution of the variables given in indices - ''' + """ indices = np.asarray(indices) mean_new = self.mean[indices] - sigma_new = self.sigma[indices[:,None], indices] + sigma_new = self.sigma[indices[:, None], indices] args = [getattr(self, ea) for ea in self.extra_args] return self.__class__(mean_new, sigma_new, *args) -#parts taken from linear_model, but heavy adjustments +# parts taken from linear_model, but heavy adjustments class MVNormal0(object): - '''Class for Multivariate Normal Distribution + """Class for Multivariate Normal Distribution original full version, kept for testing, new version inherits from MVElliptical @@ -691,8 +698,7 @@ class MVNormal0(object): uses Cholesky decomposition of covariance matrix for the transformation of the data - ''' - + """ def __init__(self, mean, cov): self.mean = mean @@ -700,26 +706,25 @@ def __init__(self, mean, cov): cov = np.squeeze(cov) self.nvars = nvars = len(mean) - - #in the following cov is original, self.cov is full matrix + # in the following cov is original, self.cov is full matrix if cov.shape == (): - #iid + # iid self.cov = np.eye(nvars) * cov self.covinv = np.eye(nvars) / cov self.cholcovinv = np.eye(nvars) / np.sqrt(cov) elif (cov.ndim == 1) and (len(cov) == nvars): - #independent heteroskedastic + # independent heteroskedastic self.cov = np.diag(cov) - self.covinv = np.diag(1. / cov) - self.cholcovinv = np.diag( 1. 
/ np.sqrt(cov)) - elif cov.shape == (nvars, nvars): #python tuple comparison - #general + self.covinv = np.diag(1.0 / cov) + self.cholcovinv = np.diag(1.0 / np.sqrt(cov)) + elif cov.shape == (nvars, nvars): # python tuple comparison + # general self.covinv = np.linalg.pinv(cov) self.cholcovinv = np.linalg.cholesky(self.covinv).T else: - raise ValueError('cov has invalid shape') + raise ValueError("cov has invalid shape") - #store logdetcov for logpdf + # store logdetcov for logpdf self.logdetcov = np.log(np.linalg.det(self.cov)) def whiten(self, x): @@ -747,13 +752,13 @@ def whiten(self, x): """ x = np.asarray(x) if np.any(self.cov): - #return np.dot(self.cholcovinv, x) + # return np.dot(self.cholcovinv, x) return np.dot(x, self.cholcovinv.T) else: return x def rvs(self, size=1): - '''random variable + """random variable Parameters ---------- @@ -771,11 +776,11 @@ def rvs(self, size=1): ----- uses numpy.random.multivariate_normal directly - ''' + """ return np.random.multivariate_normal(self.mean, self.cov, size=size) def pdf(self, x): - '''probability density function + """probability density function Parameters ---------- @@ -788,12 +793,12 @@ def pdf(self, x): pdf : float or array probability density value of each random vector - ''' + """ return np.exp(self.logpdf(x)) def logpdf(self, x): - '''logarithm of probability density function + """logarithm of probability density function Parameters ---------- @@ -811,12 +816,12 @@ def logpdf(self, x): with multivariate normal vector in each row and iid across rows does not work now because of dot in whiten - ''' + """ x = np.asarray(x) x_whitened = self.whiten(x - self.mean) SSR = np.sum(x_whitened**2, -1) llf = -SSR - llf -= self.nvars * np.log(2. * np.pi) + llf -= self.nvars * np.log(2.0 * np.pi) llf -= self.logdetcov llf *= 0.5 return llf @@ -825,17 +830,17 @@ def logpdf(self, x): class MVNormal(MVElliptical): - '''Class for Multivariate Normal Distribution + """Class for Multivariate Normal Distribution uses Cholesky decomposition of covariance matrix for the transformation of the data - ''' - __name__ == 'Multivariate Normal Distribution' + """ + __name__ == "Multivariate Normal Distribution" def rvs(self, size=1): - '''random variable + """random variable Parameters ---------- @@ -853,11 +858,11 @@ def rvs(self, size=1): ----- uses numpy.random.multivariate_normal directly - ''' + """ return np.random.multivariate_normal(self.mean, self.sigma, size=size) def logpdf(self, x): - '''logarithm of probability density function + """logarithm of probability density function Parameters ---------- @@ -875,18 +880,18 @@ def logpdf(self, x): with multivariate normal vector in each row and iid across rows does not work now because of dot in whiten - ''' + """ x = np.asarray(x) x_whitened = self.whiten(x - self.mean) SSR = np.sum(x_whitened**2, -1) llf = -SSR - llf -= self.nvars * np.log(2. 
* np.pi) + llf -= self.nvars * np.log(2.0 * np.pi) llf -= self.logdetsigma llf *= 0.5 return llf def cdf(self, x, **kwds): - '''cumulative distribution function + """cumulative distribution function Parameters ---------- @@ -901,18 +906,18 @@ def cdf(self, x, **kwds): cdf : float or array probability density value of each random vector - ''' - #lower = -np.inf * np.ones_like(x) - #return mvstdnormcdf(lower, self.standardize(x), self.corr, **kwds) + """ + # lower = -np.inf * np.ones_like(x) + # return mvstdnormcdf(lower, self.standardize(x), self.corr, **kwds) return mvnormcdf(x, self.mean, self.cov, **kwds) @property def cov(self): - '''covariance matrix''' + """covariance matrix""" return self.sigma def affine_transformed(self, shift, scale_matrix): - '''return distribution of an affine transform + """return distribution of an affine transform for full rank scale_matrix only @@ -944,14 +949,14 @@ def affine_transformed(self, shift, scale_matrix): currently only tested because it's called by standardized - ''' - B = scale_matrix #tmp variable + """ + B = scale_matrix # tmp variable mean_new = np.dot(B, self.mean) + shift sigma_new = np.dot(np.dot(B, self.sigma), B.T) return MVNormal(mean_new, sigma_new) def conditional(self, indices, values): - r'''return conditional distribution + r"""return conditional distribution indices are the variables to keep, the complement is the conditioning set @@ -978,8 +983,8 @@ def conditional(self, indices, values): values of the excluded variables. - ''' - #indices need to be nd arrays for broadcasting + """ + # indices need to be nd arrays for broadcasting keep = np.asarray(indices) given = np.asarray([i for i in range(self.nvars) if i not in keep]) sigmakk = self.sigma[keep[:, None], keep] @@ -987,29 +992,32 @@ def conditional(self, indices, values): sigmakg = self.sigma[keep[:, None], given] sigmagk = self.sigma[given[:, None], keep] - - sigma_new = sigmakk - np.dot(sigmakg, np.linalg.solve(sigmagg, sigmagk)) - mean_new = self.mean[keep] + \ - np.dot(sigmakg, np.linalg.solve(sigmagg, values-self.mean[given])) - -# #or -# sig = np.linalg.solve(sigmagg, sigmagk).T -# mean_new = self.mean[keep] + np.dot(sigmakg, values-self.mean[given]) -# sigma_new = sigmakk - np.dot(sigmakg, sig) + sigma_new = sigmakk - np.dot( + sigmakg, np.linalg.solve(sigmagg, sigmagk) + ) + mean_new = self.mean[keep] + np.dot( + sigmakg, np.linalg.solve(sigmagg, values - self.mean[given]) + ) + + # #or + # sig = np.linalg.solve(sigmagg, sigmagk).T + # mean_new = self.mean[keep] + np.dot(sigmakg, values-self.mean[given]) + # sigma_new = sigmakk - np.dot(sigmakg, sig) return MVNormal(mean_new, sigma_new) -#redefine some shortcuts +# redefine some shortcuts np_log = np.log np_pi = np.pi sps_gamln = special.gammaln + class MVT(MVElliptical): - __name__ == 'Multivariate Student T Distribution' + __name__ == "Multivariate Student T Distribution" def __init__(self, mean, sigma, df): - '''initialize instance + """initialize instance Parameters ---------- @@ -1024,13 +1032,13 @@ def __init__(self, mean, sigma, df): kwds : dict currently not used - ''' + """ super(MVT, self).__init__(mean, sigma) - self.extra_args = ['df'] #overwrites extra_args of super + self.extra_args = ["df"] # overwrites extra_args of super self.df = df def rvs(self, size=1): - '''random variables with Student T distribution + """random variables with Student T distribution Parameters ---------- @@ -1052,13 +1060,13 @@ def rvs(self, size=1): does this require df>2 ? 
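        Examples
        --------
        a minimal sketch, not part of the patch: drawing random variables
        should only need ``df > 0``; ``df > 2`` matters only when the sample
        covariance is compared to ``cov``, which is infinite for smaller df

        >>> mvt = MVT([0.0, 0.0], np.eye(2), 3)  # hypothetical values
        >>> mvt.rvs(size=5).shape
        (5, 2)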
- ''' + """ from .multivariate import multivariate_t_rvs - return multivariate_t_rvs(self.mean, self.sigma, df=self.df, n=size) + return multivariate_t_rvs(self.mean, self.sigma, df=self.df, n=size) def logpdf(self, x): - '''logarithm of probability density function + """logarithm of probability density function Parameters ---------- @@ -1071,25 +1079,25 @@ def logpdf(self, x): logpdf : float or array probability density value of each random vector - ''' + """ x = np.asarray(x) df = self.df nvars = self.nvars - x_whitened = self.whiten(x - self.mean) #should be float + x_whitened = self.whiten(x - self.mean) # should be float - llf = - nvars * np_log(df * np_pi) + llf = -nvars * np_log(df * np_pi) llf -= self.logdetsigma - llf -= (df + nvars) * np_log(1 + np.sum(x_whitened**2,-1) / df) + llf -= (df + nvars) * np_log(1 + np.sum(x_whitened**2, -1) / df) llf *= 0.5 - llf += sps_gamln((df + nvars) / 2.) - sps_gamln(df / 2.) + llf += sps_gamln((df + nvars) / 2.0) - sps_gamln(df / 2.0) return llf def cdf(self, x, **kwds): - '''cumulative distribution function + """cumulative distribution function Parameters ---------- @@ -1104,29 +1112,29 @@ def cdf(self, x, **kwds): cdf : float or array probability density value of each random vector - ''' + """ lower = -np.inf * np.ones_like(x) - #std_sigma = np.sqrt(np.diag(self.sigma)) - upper = (x - self.mean)/self.std_sigma + # std_sigma = np.sqrt(np.diag(self.sigma)) + upper = (x - self.mean) / self.std_sigma return mvstdtprob(lower, upper, self.corr, self.df, **kwds) - #mvstdtcdf does not exist yet - #return mvstdtcdf(lower, x, self.corr, df, **kwds) + # mvstdtcdf does not exist yet + # return mvstdtcdf(lower, x, self.corr, df, **kwds) @property def cov(self): - '''covariance matrix + """covariance matrix The covariance matrix for the t distribution does not exist for df<=2, and is equal to sigma * df/(df-2) for df>2 - ''' + """ if self.df <= 2: return np.nan * np.ones_like(self.sigma) else: - return self.df / (self.df - 2.) 
* self.sigma
+            return self.df / (self.df - 2.0) * self.sigma

     def affine_transformed(self, shift, scale_matrix):
-        '''return distribution of a full rank affine transform
+        """return distribution of a full rank affine transform

         for full rank scale_matrix only

@@ -1160,115 +1168,123 @@
         where a is shift, B is full rank scale matrix with same dimension
         as sigma

-        '''
-        #full rank method could also be in elliptical and called with super
-        #after the rank check
-        B = scale_matrix #tmp variable as shorthand
-        if not B.shape == (self.nvars, self.nvars):
-            if (np.linalg.eigvals(B) <= 0).any():
-                raise ValueError('affine transform has to be full rank')
+        """
+        # full rank method could also be in elliptical and called with super
+        # after the rank check
+        B = scale_matrix  # tmp variable as shorthand
+        # the old check only ran the rank test when the shape was already
+        # wrong; require a square scale matrix, then verify it is full rank
+        if B.shape != (self.nvars, self.nvars) or (
+            np.linalg.matrix_rank(B) < self.nvars
+        ):
+            raise ValueError("affine transform has to be full rank")

         mean_new = np.dot(B, self.mean) + shift
         sigma_new = np.dot(np.dot(B, self.sigma), B.T)
         return MVT(mean_new, sigma_new, self.df)


-def quad2d(func=lambda x: 1, lower=(-10,-10), upper=(10,10)):
+def quad2d(func=lambda x: 1, lower=(-10, -10), upper=(10, 10)):
     def fun(x, y):
-        x = np.column_stack((x,y))
+        x = np.column_stack((x, y))
         return func(x)
+
     from scipy.integrate import dblquad
-    return dblquad(fun, lower[0], upper[0], lambda y: lower[1],
-                   lambda y: upper[1])

-if __name__ == '__main__':
+    return dblquad(
+        fun, lower[0], upper[0], lambda y: lower[1], lambda y: upper[1]
+    )
+
+
+if __name__ == "__main__":

     from numpy.testing import assert_almost_equal, assert_array_almost_equal

-    examples = ['mvn']
+    examples = ["mvn"]

-    mu = (0,0)
+    mu = (0, 0)
     covx = np.array([[1.0, 0.5], [0.5, 1.0]])
-    mu3 = [-1, 0., 2.]
-    cov3 = np.array([[ 1.  , 0.5 , 0.75],
-                     [ 0.5 , 1.5 , 0.6 ],
-                     [ 0.75, 0.6 , 2.  ]])
+    mu3 = [-1, 0.0, 2.0]
+    cov3 = np.array([[1.0, 0.5, 0.75], [0.5, 1.5, 0.6], [0.75, 0.6, 2.0]])

-
-    if 'mvn' in examples:
+    if "mvn" in examples:
         bvn = BivariateNormal(mu, covx)
         rvs = bvn.rvs(size=1000)
         print(rvs.mean(0))
         print(np.cov(rvs, rowvar=0))
         print(bvn.expect())
-        print(bvn.cdf([0,0]))
+        print(bvn.cdf([0, 0]))
         bvn1 = BivariateNormal(mu, np.eye(2))
-        bvn2 = BivariateNormal(mu, 4*np.eye(2))
-        fun = lambda x : np.log(bvn1.pdf(x)) - np.log(bvn.pdf(x))
+        bvn2 = BivariateNormal(mu, 4 * np.eye(2))
+        fun = lambda x: np.log(bvn1.pdf(x)) - np.log(bvn.pdf(x))
         print(bvn1.expect(fun))
         print(bvn1.kl(bvn2), bvn1.kl_mc(bvn2))
         print(bvn2.kl(bvn1), bvn2.kl_mc(bvn1))
         print(bvn1.kl(bvn), bvn1.kl_mc(bvn))
         mvn = MVNormal(mu, covx)
-        mvn.pdf([0,0])
-        mvn.pdf(np.zeros((2,2)))
-        #np.dot(mvn.cholcovinv.T, mvn.cholcovinv) - mvn.covinv
-
-        cov3 = np.array([[ 1.  , 0.5 , 0.75],
-                         [ 0.5 , 1.5 , 0.6 ],
-                         [ 0.75, 0.6 , 2.  ]])
-        mu3 = [-1, 0., 2.]
+ mvn.pdf([0, 0]) + mvn.pdf(np.zeros((2, 2))) + # np.dot(mvn.cholcovinv.T, mvn.cholcovinv) - mvn.covinv + + cov3 = np.array([[1.0, 0.5, 0.75], [0.5, 1.5, 0.6], [0.75, 0.6, 2.0]]) + mu3 = [-1, 0.0, 2.0] mvn3 = MVNormal(mu3, cov3) - mvn3.pdf((0., 2., 3.)) - mvn3.logpdf((0., 2., 3.)) - #comparisons with R mvtnorm::dmvnorm - #decimal=14 -# mvn3.logpdf(cov3) - [-7.667977543898155, -6.917977543898155, -5.167977543898155] -# #decimal 18 -# mvn3.pdf(cov3) - [0.000467562492721686, 0.000989829804859273, 0.005696077243833402] -# #cheating new mean, same cov -# mvn3.mean = np.array([0,0,0]) -# #decimal= 16 -# mvn3.pdf(cov3) - [0.02914269740502042, 0.02269635555984291, 0.01767593948287269] - - #as asserts + mvn3.pdf((0.0, 2.0, 3.0)) + mvn3.logpdf((0.0, 2.0, 3.0)) + # comparisons with R mvtnorm::dmvnorm + # decimal=14 + # mvn3.logpdf(cov3) - [-7.667977543898155, -6.917977543898155, -5.167977543898155] + # #decimal 18 + # mvn3.pdf(cov3) - [0.000467562492721686, 0.000989829804859273, 0.005696077243833402] + # #cheating new mean, same cov + # mvn3.mean = np.array([0,0,0]) + # #decimal= 16 + # mvn3.pdf(cov3) - [0.02914269740502042, 0.02269635555984291, 0.01767593948287269] + + # as asserts r_val = [-7.667977543898155, -6.917977543898155, -5.167977543898155] - assert_array_almost_equal( mvn3.logpdf(cov3), r_val, decimal = 14) - #decimal 18 - r_val = [0.000467562492721686, 0.000989829804859273, 0.005696077243833402] - assert_array_almost_equal( mvn3.pdf(cov3), r_val, decimal = 17) - #cheating new mean, same cov, too dangerous, got wrong instance in tests - #mvn3.mean = np.array([0,0,0]) - mvn3c = MVNormal(np.array([0,0,0]), cov3) + assert_array_almost_equal(mvn3.logpdf(cov3), r_val, decimal=14) + # decimal 18 + r_val = [ + 0.000467562492721686, + 0.000989829804859273, + 0.005696077243833402, + ] + assert_array_almost_equal(mvn3.pdf(cov3), r_val, decimal=17) + # cheating new mean, same cov, too dangerous, got wrong instance in tests + # mvn3.mean = np.array([0,0,0]) + mvn3c = MVNormal(np.array([0, 0, 0]), cov3) r_val = [0.02914269740502042, 0.02269635555984291, 0.01767593948287269] - assert_array_almost_equal( mvn3c.pdf(cov3), r_val, decimal = 16) + assert_array_almost_equal(mvn3c.pdf(cov3), r_val, decimal=16) - mvn3b = MVNormal((0,0,0), 1) - fun = lambda x : np.log(mvn3.pdf(x)) - np.log(mvn3b.pdf(x)) + mvn3b = MVNormal((0, 0, 0), 1) + fun = lambda x: np.log(mvn3.pdf(x)) - np.log(mvn3b.pdf(x)) print(mvn3.expect_mc(fun)) print(mvn3.expect_mc(fun, size=200000)) + mvt = MVT((0, 0), 1, 5) + assert_almost_equal( + mvt.logpdf(np.array([0.0, 0.0])), -1.837877066409345, decimal=15 + ) + assert_almost_equal( + mvt.pdf(np.array([0.0, 0.0])), 0.1591549430918953, decimal=15 + ) - mvt = MVT((0,0), 1, 5) - assert_almost_equal(mvt.logpdf(np.array([0.,0.])), -1.837877066409345, - decimal=15) - assert_almost_equal(mvt.pdf(np.array([0.,0.])), 0.1591549430918953, - decimal=15) - - mvt.logpdf(np.array([1.,1.]))-(-3.01552989458359) + mvt.logpdf(np.array([1.0, 1.0])) - (-3.01552989458359) - mvt1 = MVT((0,0), 1, 1) - mvt1.logpdf(np.array([1.,1.]))-(-3.48579549941151) #decimal=16 + mvt1 = MVT((0, 0), 1, 1) + mvt1.logpdf(np.array([1.0, 1.0])) - (-3.48579549941151) # decimal=16 rvs = mvt.rvs(100000) assert_almost_equal(np.cov(rvs, rowvar=0), mvt.cov, decimal=1) mvt31 = MVT(mu3, cov3, 1) - assert_almost_equal(mvt31.pdf(cov3), + assert_almost_equal( + mvt31.pdf(cov3), [0.0007276818698165781, 0.0009980625182293658, 0.0027661422056214652], - decimal=18) + decimal=18, + ) mvt = MVT(mu3, cov3, 3) - assert_almost_equal(mvt.pdf(cov3), + 
assert_almost_equal( + mvt.pdf(cov3), [0.000863777424247410, 0.001277510788307594, 0.004156314279452241], - decimal=17) + decimal=17, + ) diff --git a/statsmodels/sandbox/distributions/otherdist.py b/statsmodels/sandbox/distributions/otherdist.py index bc249a9b0d1..335e50f0621 100644 --- a/statsmodels/sandbox/distributions/otherdist.py +++ b/statsmodels/sandbox/distributions/otherdist.py @@ -1,4 +1,4 @@ -'''Parametric Mixture Distributions +"""Parametric Mixture Distributions Created on Sat Jun 04 2011 @@ -18,14 +18,15 @@ Question: Metaclasses and class factories for generating new distributions from existing distributions by transformation, mixing, compounding -''' +""" import numpy as np from scipy import stats + class ParametricMixtureD(object): - '''mixtures with a discrete distribution + """mixtures with a discrete distribution The mixing distribution is a discrete distribution like scipy.stats.poisson. All distribution in the mixture of the same type and parametrized @@ -42,10 +43,12 @@ class ParametricMixtureD(object): initialization looks fragile for all possible cases of lower and upper bounds of the distributions. - ''' - def __init__(self, mixing_dist, base_dist, bd_args_func, bd_kwds_func, - cutoff=1e-3): - '''create a mixture distribution + """ + + def __init__( + self, mixing_dist, base_dist, bd_args_func, bd_kwds_func, cutoff=1e-3 + ): + """create a mixture distribution Parameters ---------- @@ -70,10 +73,10 @@ def __init__(self, mixing_dist, base_dist, bd_args_func, bd_kwds_func, draws that are outside the truncated range are clipped, that is assigned to the highest or lowest value in the truncated support. - ''' + """ self.mixing_dist = mixing_dist self.base_dist = base_dist - #self.bd_args = bd_args + # self.bd_args = bd_args if not np.isneginf(mixing_dist.dist.a): lower = mixing_dist.dist.a else: @@ -84,7 +87,7 @@ def __init__(self, mixing_dist, base_dist, bd_args_func, bd_kwds_func, upper = mixing_dist.isf(1e-4) self.ma = lower self.mb = upper - mixing_support = np.arange(lower, upper+1) + mixing_support = np.arange(lower, upper + 1) self.mixing_probs = mixing_dist.pmf(mixing_support) self.bd_args = bd_args_func(mixing_support) @@ -92,24 +95,20 @@ def __init__(self, mixing_dist, base_dist, bd_args_func, bd_kwds_func, def rvs(self, size=1): mrvs = self.mixing_dist.rvs(size) - #TODO: check strange cases ? this assumes continous integers + # TODO: check strange cases ? this assumes continous integers mrvs_idx = (np.clip(mrvs, self.ma, self.mb) - self.ma).astype(int) bd_args = tuple(md[mrvs_idx] for md in self.bd_args) bd_kwds = dict((k, self.bd_kwds[k][mrvs_idx]) for k in self.bd_kwds) - kwds = {'size':size} + kwds = {"size": size} kwds.update(bd_kwds) rvs = self.base_dist.rvs(*self.bd_args, **kwds) return rvs, mrvs_idx - - - - def pdf(self, x): x = np.asarray(x) if np.size(x) > 1: - x = x[...,None] #[None, ...] + x = x[..., None] # [None, ...] bd_probs = self.base_dist.pdf(x, *self.bd_args, **self.bd_kwds) prob = (bd_probs * self.mixing_probs).sum(-1) return prob, bd_probs @@ -117,16 +116,17 @@ def pdf(self, x): def cdf(self, x): x = np.asarray(x) if np.size(x) > 1: - x = x[...,None] #[None, ...] + x = x[..., None] # [None, ...] 
bd_probs = self.base_dist.cdf(x, *self.bd_args, **self.bd_kwds) prob = (bd_probs * self.mixing_probs).sum(-1) return prob, bd_probs -#try: +# try: + class ClippedContinuous(object): - '''clipped continuous distribution with a masspoint at clip_lower + """clipped continuous distribution with a masspoint at clip_lower Notes @@ -149,82 +149,77 @@ class ClippedContinuous(object): We could add a check whether the values are in a small neighborhood, but it would be expensive (need to search and check all values). - ''' + """ def __init__(self, base_dist, clip_lower): self.base_dist = base_dist self.clip_lower = clip_lower def _get_clip_lower(self, kwds): - '''helper method to get clip_lower from kwds or attribute - - ''' - if 'clip_lower' not in kwds: + """helper method to get clip_lower from kwds or attribute""" + if "clip_lower" not in kwds: clip_lower = self.clip_lower else: - clip_lower = kwds.pop('clip_lower') + clip_lower = kwds.pop("clip_lower") return clip_lower, kwds def rvs(self, *args, **kwds): clip_lower, kwds = self._get_clip_lower(kwds) rvs_ = self.base_dist.rvs(*args, **kwds) - #same as numpy.clip ? + # same as numpy.clip ? rvs_[rvs_ < clip_lower] = clip_lower return rvs_ - - def pdf(self, x, *args, **kwds): x = np.atleast_1d(x) - if 'clip_lower' not in kwds: + if "clip_lower" not in kwds: clip_lower = self.clip_lower else: - #allow clip_lower to be a possible parameter - clip_lower = kwds.pop('clip_lower') + # allow clip_lower to be a possible parameter + clip_lower = kwds.pop("clip_lower") pdf_raw = np.atleast_1d(self.base_dist.pdf(x, *args, **kwds)) - clip_mask = (x == self.clip_lower) + clip_mask = x == self.clip_lower if np.any(clip_mask): clip_prob = self.base_dist.cdf(clip_lower, *args, **kwds) pdf_raw[clip_mask] = clip_prob - #the following will be handled by sub-classing rv_continuous + # the following will be handled by sub-classing rv_continuous pdf_raw[x < clip_lower] = 0 return pdf_raw def cdf(self, x, *args, **kwds): - if 'clip_lower' not in kwds: + if "clip_lower" not in kwds: clip_lower = self.clip_lower else: - #allow clip_lower to be a possible parameter - clip_lower = kwds.pop('clip_lower') + # allow clip_lower to be a possible parameter + clip_lower = kwds.pop("clip_lower") cdf_raw = self.base_dist.cdf(x, *args, **kwds) - #not needed if equality test is used -## clip_mask = (x == self.clip_lower) -## if np.any(clip_mask): -## clip_prob = self.base_dist.cdf(clip_lower, *args, **kwds) -## pdf_raw[clip_mask] = clip_prob + # not needed if equality test is used + ## clip_mask = (x == self.clip_lower) + ## if np.any(clip_mask): + ## clip_prob = self.base_dist.cdf(clip_lower, *args, **kwds) + ## pdf_raw[clip_mask] = clip_prob - #the following will be handled by sub-classing rv_continuous - #if self.a is defined + # the following will be handled by sub-classing rv_continuous + # if self.a is defined cdf_raw[x < clip_lower] = 0 return cdf_raw def sf(self, x, *args, **kwds): - if 'clip_lower' not in kwds: + if "clip_lower" not in kwds: clip_lower = self.clip_lower else: - #allow clip_lower to be a possible parameter - clip_lower = kwds.pop('clip_lower') + # allow clip_lower to be a possible parameter + clip_lower = kwds.pop("clip_lower") sf_raw = self.base_dist.sf(x, *args, **kwds) sf_raw[x <= clip_lower] = 1 return sf_raw - def ppf(self, x, *args, **kwds): raise NotImplementedError @@ -232,76 +227,76 @@ def plot(self, x, *args, **kwds): clip_lower, kwds = self._get_clip_lower(kwds) mass = self.pdf(clip_lower, *args, **kwds) - xr = np.concatenate(([clip_lower+1e-6], 
x[x>clip_lower])) + xr = np.concatenate(([clip_lower + 1e-6], x[x > clip_lower])) import matplotlib.pyplot as plt - #x = np.linspace(-4, 4, 21) - #plt.figure() - plt.xlim(clip_lower-0.1, x.max()) - #remove duplicate calculation + + # x = np.linspace(-4, 4, 21) + # plt.figure() + plt.xlim(clip_lower - 0.1, x.max()) + # remove duplicate calculation xpdf = self.pdf(x, *args, **kwds) - plt.ylim(0, max(mass, xpdf.max())*1.1) + plt.ylim(0, max(mass, xpdf.max()) * 1.1) plt.plot(xr, self.pdf(xr, *args, **kwds)) - #plt.vline(clip_lower, self.pdf(clip_lower, *args, **kwds)) - plt.stem([clip_lower], [mass], - linefmt='b-', markerfmt='bo', basefmt='r-') + # plt.vline(clip_lower, self.pdf(clip_lower, *args, **kwds)) + plt.stem( + [clip_lower], [mass], linefmt="b-", markerfmt="bo", basefmt="r-" + ) return - - -if __name__ == '__main__': +if __name__ == "__main__": doplots = 1 - #*********** Poisson-Normal Mixture - mdist = stats.poisson(2.) + # *********** Poisson-Normal Mixture + mdist = stats.poisson(2.0) bdist = stats.norm bd_args_fn = lambda x: () - #bd_kwds_fn = lambda x: {'loc': np.atleast_2d(10./(1+x))} - bd_kwds_fn = lambda x: {'loc': x, 'scale': 0.1*np.ones_like(x)} #10./(1+x)} - + # bd_kwds_fn = lambda x: {'loc': np.atleast_2d(10./(1+x))} + bd_kwds_fn = lambda x: { + "loc": x, + "scale": 0.1 * np.ones_like(x), + } # 10./(1+x)} pd = ParametricMixtureD(mdist, bdist, bd_args_fn, bd_kwds_fn) print(pd.pdf(1)) - p, bp = pd.pdf(np.linspace(0,20,21)) - pc, bpc = pd.cdf(np.linspace(0,20,21)) + p, bp = pd.pdf(np.linspace(0, 20, 21)) + pc, bpc = pd.cdf(np.linspace(0, 20, 21)) print(pd.rvs()) rvs, m = pd.rvs(size=1000) - if doplots: import matplotlib.pyplot as plt - plt.hist(rvs, bins = 100) - plt.title('poisson mixture of normal distributions') - #********** clipped normal distribution (Tobit) + plt.hist(rvs, bins=100) + plt.title("poisson mixture of normal distributions") + + # ********** clipped normal distribution (Tobit) bdist = stats.norm - clip_lower_ = 0. 
#-0.5 + clip_lower_ = 0.0 # -0.5 cnorm = ClippedContinuous(bdist, clip_lower_) x = np.linspace(1e-8, 4, 11) print(cnorm.pdf(x)) print(cnorm.cdf(x)) if doplots: - #plt.figure() - #cnorm.plot(x) + # plt.figure() + # cnorm.plot(x) plt.figure() - cnorm.plot(x = np.linspace(-1, 4, 51), loc=0.5, scale=np.sqrt(2)) - plt.title('clipped normal distribution') + cnorm.plot(x=np.linspace(-1, 4, 51), loc=0.5, scale=np.sqrt(2)) + plt.title("clipped normal distribution") fig = plt.figure() - for i, loc in enumerate([0., 0.5, 1.,2.]): - fig.add_subplot(2,2,i+1) - cnorm.plot(x = np.linspace(-1, 4, 51), loc=loc, scale=np.sqrt(2)) - plt.title('clipped normal, loc = %3.2f' % loc) - + for i, loc in enumerate([0.0, 0.5, 1.0, 2.0]): + fig.add_subplot(2, 2, i + 1) + cnorm.plot(x=np.linspace(-1, 4, 51), loc=loc, scale=np.sqrt(2)) + plt.title("clipped normal, loc = %3.2f" % loc) loc = 1.5 rvs = cnorm.rvs(loc=loc, size=2000) plt.figure() plt.hist(rvs, bins=50) - plt.title('clipped normal rvs, loc = %3.2f' % loc) - + plt.title("clipped normal rvs, loc = %3.2f" % loc) - #plt.show() + # plt.show() diff --git a/statsmodels/sandbox/distributions/quantize.py b/statsmodels/sandbox/distributions/quantize.py index 1d1c06a452a..2a537e1a84e 100644 --- a/statsmodels/sandbox/distributions/quantize.py +++ b/statsmodels/sandbox/distributions/quantize.py @@ -1,12 +1,14 @@ -'''Quantizing a continuous distribution in 2d +"""Quantizing a continuous distribution in 2d Author: josef-pktd -''' +""" from statsmodels.compat.python import lmap + import numpy as np + def prob_bv_rectangle(lower, upper, cdf): - '''helper function for probability of a rectangle in a bivariate distribution + """helper function for probability of a rectangle in a bivariate distribution Parameters ---------- @@ -19,15 +21,16 @@ def prob_bv_rectangle(lower, upper, cdf): how does this generalize to more than 2 variates ? - ''' + """ probuu = cdf(*upper) probul = cdf(upper[0], lower[1]) problu = cdf(lower[0], upper[1]) probll = cdf(*lower) return probuu - probul - problu + probll + def prob_mv_grid(bins, cdf, axis=-1): - '''helper function for probability of a rectangle grid in a multivariate distribution + """helper function for probability of a rectangle grid in a multivariate distribution how does this generalize to more than 2 variates ? 
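The inclusion-exclusion identity used by prob_bv_rectangle above is easy to
sanity check by hand. A minimal sketch, not part of the patch; norm2_cdf is a
hypothetical helper for two independent standard normals, for which the
rectangle probability must factor into a product of one-dimensional terms:

    from scipy import stats

    def norm2_cdf(x, y):
        # cdf of two independent standard normals factorizes
        return stats.norm.cdf(x) * stats.norm.cdf(y)

    p = prob_bv_rectangle((-1.0, -1.0), (1.0, 1.0), norm2_cdf)
    # p equals (stats.norm.cdf(1.0) - stats.norm.cdf(-1.0))**2, about 0.4661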
@@ -35,18 +38,18 @@ def prob_mv_grid(bins, cdf, axis=-1): tuple of bin edges, currently it is assumed that they broadcast correctly - ''' + """ if not isinstance(bins, np.ndarray): bins = lmap(np.asarray, bins) n_dim = len(bins) bins_ = [] - #broadcast if binedges are 1d + # broadcast if binedges are 1d if all(lmap(np.ndim, bins) == np.ones(n_dim)): for d in range(n_dim): - sl = [None]*n_dim + sl = [None] * n_dim sl[d] = slice(None) bins_.append(bins[d][sl]) - else: #assume it is already correctly broadcasted + else: # assume it is already correctly broadcasted n_dim = bins.shape[0] bins_ = bins @@ -60,33 +63,36 @@ def prob_mv_grid(bins, cdf, axis=-1): def prob_quantize_cdf(binsx, binsy, cdf): - '''quantize a continuous distribution given by a cdf + """quantize a continuous distribution given by a cdf Parameters ---------- binsx : array_like, 1d binedges - ''' + """ binsx = np.asarray(binsx) binsy = np.asarray(binsy) nx = len(binsx) - 1 ny = len(binsy) - 1 - probs = np.nan * np.ones((nx, ny)) #np.empty(nx,ny) - cdf_values = cdf(binsx[:,None], binsy) - cdf_func = lambda x, y: cdf_values[x,y] - for xind in range(1, nx+1): - for yind in range(1, ny+1): + probs = np.nan * np.ones((nx, ny)) # np.empty(nx,ny) + cdf_values = cdf(binsx[:, None], binsy) + cdf_func = lambda x, y: cdf_values[x, y] + for xind in range(1, nx + 1): + for yind in range(1, ny + 1): upper = (xind, yind) - lower = (xind-1, yind-1) - #print upper,lower, - probs[xind-1,yind-1] = prob_bv_rectangle(lower, upper, cdf_func) + lower = (xind - 1, yind - 1) + # print upper,lower, + probs[xind - 1, yind - 1] = prob_bv_rectangle( + lower, upper, cdf_func + ) assert not np.isnan(probs).any() return probs + def prob_quantize_cdf_old(binsx, binsy, cdf): - '''quantize a continuous distribution given by a cdf + """quantize a continuous distribution given by a cdf old version without precomputing cdf values @@ -95,47 +101,55 @@ def prob_quantize_cdf_old(binsx, binsy, cdf): binsx : array_like, 1d binedges - ''' + """ binsx = np.asarray(binsx) binsy = np.asarray(binsy) nx = len(binsx) - 1 ny = len(binsy) - 1 - probs = np.nan * np.ones((nx, ny)) #np.empty(nx,ny) - for xind in range(1, nx+1): - for yind in range(1, ny+1): + probs = np.nan * np.ones((nx, ny)) # np.empty(nx,ny) + for xind in range(1, nx + 1): + for yind in range(1, ny + 1): upper = (binsx[xind], binsy[yind]) - lower = (binsx[xind-1], binsy[yind-1]) - #print upper,lower, - probs[xind-1,yind-1] = prob_bv_rectangle(lower, upper, cdf) + lower = (binsx[xind - 1], binsy[yind - 1]) + # print upper,lower, + probs[xind - 1, yind - 1] = prob_bv_rectangle(lower, upper, cdf) assert not np.isnan(probs).any() return probs - - -if __name__ == '__main__': +if __name__ == "__main__": from numpy.testing import assert_almost_equal - unif_2d = lambda x,y: x*y - assert_almost_equal(prob_bv_rectangle([0,0], [1,0.5], unif_2d), 0.5, 14) - assert_almost_equal(prob_bv_rectangle([0,0], [0.5,0.5], unif_2d), 0.25, 14) - - arr1b = np.array([[ 0.05, 0.05, 0.05, 0.05], - [ 0.05, 0.05, 0.05, 0.05], - [ 0.05, 0.05, 0.05, 0.05], - [ 0.05, 0.05, 0.05, 0.05], - [ 0.05, 0.05, 0.05, 0.05]]) - arr1a = prob_quantize_cdf(np.linspace(0,1,6), np.linspace(0,1,5), unif_2d) + unif_2d = lambda x, y: x * y + assert_almost_equal(prob_bv_rectangle([0, 0], [1, 0.5], unif_2d), 0.5, 14) + assert_almost_equal( + prob_bv_rectangle([0, 0], [0.5, 0.5], unif_2d), 0.25, 14 + ) + + arr1b = np.array( + [ + [0.05, 0.05, 0.05, 0.05], + [0.05, 0.05, 0.05, 0.05], + [0.05, 0.05, 0.05, 0.05], + [0.05, 0.05, 0.05, 0.05], + [0.05, 0.05, 0.05, 
0.05], + ] + ) + + arr1a = prob_quantize_cdf( + np.linspace(0, 1, 6), np.linspace(0, 1, 5), unif_2d + ) assert_almost_equal(arr1a, arr1b, 14) - arr2b = np.array([[ 0.25], - [ 0.25], - [ 0.25], - [ 0.25]]) - arr2a = prob_quantize_cdf(np.linspace(0,1,5), np.linspace(0,1,2), unif_2d) + arr2b = np.array([[0.25], [0.25], [0.25], [0.25]]) + arr2a = prob_quantize_cdf( + np.linspace(0, 1, 5), np.linspace(0, 1, 2), unif_2d + ) assert_almost_equal(arr2a, arr2b, 14) - arr3b = np.array([[ 0.25, 0.25, 0.25, 0.25]]) - arr3a = prob_quantize_cdf(np.linspace(0,1,2), np.linspace(0,1,5), unif_2d) + arr3b = np.array([[0.25, 0.25, 0.25, 0.25]]) + arr3a = prob_quantize_cdf( + np.linspace(0, 1, 2), np.linspace(0, 1, 5), unif_2d + ) assert_almost_equal(arr3a, arr3b, 14) diff --git a/statsmodels/sandbox/distributions/sppatch.py b/statsmodels/sandbox/distributions/sppatch.py index cd62f4a50f3..ca2f5a7e428 100644 --- a/statsmodels/sandbox/distributions/sppatch.py +++ b/statsmodels/sandbox/distributions/sppatch.py @@ -1,4 +1,4 @@ -'''patching scipy to fit distributions and expect method +"""patching scipy to fit distributions and expect method This adds new methods to estimate continuous distribution parameters with some fixed/frozen parameters. It also contains functions that calculate the expected @@ -9,25 +9,26 @@ Author: josef-pktd License: Simplified BSD -''' +""" from statsmodels.compat.python import lmap -import numpy as np -from scipy import stats, optimize, integrate +import numpy as np +from scipy import integrate, optimize, stats ########## patching scipy -#vonmises does not define finite bounds, because it is intended for circular -#support which does not define a proper pdf on the real line +# vonmises does not define finite bounds, because it is intended for circular +# support which does not define a proper pdf on the real line stats.distributions.vonmises.a = -np.pi stats.distributions.vonmises.b = np.pi -#the next 3 functions are for fit with some fixed parameters -#As they are written, they do not work as functions, only as methods +# the next 3 functions are for fit with some fixed parameters +# As they are written, they do not work as functions, only as methods + def _fitstart(self, x): - '''example method, method of moment estimator as starting values + """example method, method of moment estimator as starting values Parameters ---------- @@ -47,14 +48,15 @@ def _fitstart(self, x): This example was written for the gamma distribution, but not verified with literature - ''' - loc = np.min([x.min(),0]) - a = 4/stats.skew(x)**2 + """ + loc = np.min([x.min(), 0]) + a = 4 / stats.skew(x) ** 2 scale = np.std(x) / np.sqrt(a) return (a, loc, scale) + def _fitstart_beta(self, x, fixed=None): - '''method of moment estimator as starting values for beta distribution + """method of moment estimator as starting values for beta distribution Parameters ---------- @@ -82,41 +84,47 @@ def _fitstart_beta(self, x, fixed=None): NIST reference also includes reference to MLE in Johnson, Kotz, and Balakrishan, Volume II, pages 221-235 - ''' - #todo: separate out this part to be used for other compact support distributions + """ + # todo: separate out this part to be used for other compact support distributions # e.g. 
rdist, vonmises, and truncnorm # but this might not work because it might still be distribution specific a, b = x.min(), x.max() - eps = (a-b)*0.01 + eps = (a - b) * 0.01 if fixed is None: - #this part not checked with books + # this part not checked with books loc = a - eps - scale = (a - b) * (1 + 2*eps) + scale = (a - b) * (1 + 2 * eps) else: if np.isnan(fixed[-2]): - #estimate loc + # estimate loc loc = a - eps else: loc = fixed[-2] if np.isnan(fixed[-1]): - #estimate scale + # estimate scale scale = (b + eps) - loc else: scale = fixed[-1] - #method of moment for known loc scale: + # method of moment for known loc scale: scale = float(scale) - xtrans = (x - loc)/scale + xtrans = (x - loc) / scale xm = xtrans.mean() xv = xtrans.var() - tmp = (xm*(1-xm)/xv - 1) + tmp = xm * (1 - xm) / xv - 1 p = xm * tmp q = (1 - xm) * tmp - return (p, q, loc, scale) #check return type and should fixed be returned ? + return ( + p, + q, + loc, + scale, + ) # check return type and should fixed be returned ? + def _fitstart_poisson(self, x, fixed=None): - '''maximum likelihood estimator as starting values for Poisson distribution + """maximum likelihood estimator as starting values for Poisson distribution Parameters ---------- @@ -141,30 +149,30 @@ def _fitstart_poisson(self, x, fixed=None): MLE : https://en.wikipedia.org/wiki/Poisson_distribution#Maximum_likelihood - ''' - #todo: separate out this part to be used for other compact support distributions + """ + # todo: separate out this part to be used for other compact support distributions # e.g. rdist, vonmises, and truncnorm # but this might not work because it might still be distribution specific a = x.min() - eps = 0 # is this robust ? + eps = 0 # is this robust ? if fixed is None: - #this part not checked with books + # this part not checked with books loc = a - eps else: if np.isnan(fixed[-1]): - #estimate loc + # estimate loc loc = a - eps else: loc = fixed[-1] - #MLE for standard (unshifted, if loc=0) Poisson distribution + # MLE for standard (unshifted, if loc=0) Poisson distribution - xtrans = (x - loc) + xtrans = x - loc lambd = xtrans.mean() - #second derivative d loglike/ dlambd Not used - #dlldlambd = 1/lambd # check + # second derivative d loglike/ dlambd Not used + # dlldlambd = 1/lambd # check - return (lambd, loc) #check return type and should fixed be returned ? + return (lambd, loc) # check return type and should fixed be returned ? def nnlf_fr(self, thetash, x, frmask): @@ -185,17 +193,18 @@ def nnlf_fr(self, thetash, x, frmask): raise ValueError("Not enough input arguments.") if not self._argcheck(*args) or scale <= 0: return np.inf - x = np.array((x-loc) / scale) + x = np.array((x - loc) / scale) cond0 = (x <= self.a) | (x >= self.b) - if (np.any(cond0)): + if np.any(cond0): return np.inf else: N = len(x) - #raise ValueError - return self._nnlf(x, *args) + N*np.log(scale) + # raise ValueError + return self._nnlf(x, *args) + N * np.log(scale) + def fit_fr(self, data, *args, **kwds): - '''estimate distribution parameters by MLE taking some parameters as fixed + """estimate distribution parameters by MLE taking some parameters as fixed Parameters ---------- @@ -251,22 +260,22 @@ def fit_fr(self, data, *args, **kwds): * check if docstring is correct * more input checking, args is list ? 
might also apply to current fit method

-    '''
-    loc0, scale0 = lmap(kwds.get, ['loc', 'scale'],[0.0, 1.0])
+    """
+    loc0, scale0 = lmap(kwds.get, ["loc", "scale"], [0.0, 1.0])
     Narg = len(args)
-    if Narg == 0 and hasattr(self, '_fitstart'):
+    if Narg == 0 and hasattr(self, "_fitstart"):
         x0 = self._fitstart(data)
     elif Narg > self.numargs:
         raise ValueError("Too many input arguments.")
     else:
-        args += (1.0,)*(self.numargs-Narg)
+        args += (1.0,) * (self.numargs - Narg)
         # location and scale are at the end
         x0 = args + (loc0, scale0)
-    if 'frozen' in kwds:
-        frmask = np.array(kwds['frozen'])
-        if len(frmask) != self.numargs+2:
+    if "frozen" in kwds:
+        frmask = np.array(kwds["frozen"])
+        if len(frmask) != self.numargs + 2:
             raise ValueError("Incorrect number of frozen arguments.")
         else:
             # keep starting values for not frozen parameters
@@ -278,25 +287,29 @@
             # If there were array elements, then frmask will be object-dtype,
             # in which case np.isnan will raise TypeError
             frmask = frmask.astype(np.float64)
-            x0  = np.array(x0)[np.isnan(frmask)]
+            x0 = np.array(x0)[np.isnan(frmask)]
     else:
         frmask = None
-    #print(x0
-    #print(frmask
-    return optimize.fmin(self.nnlf_fr, x0,
-                         args=(np.ravel(data), frmask), disp=0)
+    # print(x0
+    # print(frmask
+    return optimize.fmin(
+        self.nnlf_fr, x0, args=(np.ravel(data), frmask), disp=0
+    )


-#The next two functions/methods calculate expected value of an arbitrary
-#function, however for the continuous functions intquad is use, which might
-#require continuouity or smoothness in the function.
+# The next two functions/methods calculate expected value of an arbitrary
+# function; however, for the continuous functions integrate.quad is used,
+# which might require continuity or smoothness in the function.

-#TODO: add option for Monte Carlo integration
+# TODO: add option for Monte Carlo integration

-def expect(self, fn=None, args=(), loc=0, scale=1, lb=None, ub=None, conditional=False):
-    '''calculate expected value of a function with respect to the distribution
+
+def expect(
+    self, fn=None, args=(), loc=0, scale=1, lb=None, ub=None, conditional=False
+):
+    """calculate expected value of a function with respect to the distribution

     location and scale only tested on a few examples

@@ -324,28 +337,34 @@
    This function has not been checked for its behavior when the integral is
    not finite. The integration behavior is inherited from scipy.integrate.quad.
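    Examples
    --------
    a minimal sketch, assuming the patch has been applied so that the method
    is attached to scipy.stats distributions; the value is the conditional
    truncated mean E[X | X > 0] of a standard normal, sqrt(2/pi)

    >>> from scipy import stats
    >>> round(stats.norm.expect(lambda x: x, lb=0.0, conditional=True), 4)
    0.7979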
- ''' + """ if fn is None: + def fun(x, *args): - return x*self.pdf(x, loc=loc, scale=scale, *args) + return x * self.pdf(x, loc=loc, scale=scale, *args) + else: + def fun(x, *args): - return fn(x)*self.pdf(x, loc=loc, scale=scale, *args) + return fn(x) * self.pdf(x, loc=loc, scale=scale, *args) + if lb is None: - lb = loc + self.a * scale #(self.a - loc)/(1.0*scale) + lb = loc + self.a * scale # (self.a - loc)/(1.0*scale) if ub is None: - ub = loc + self.b * scale #(self.b - loc)/(1.0*scale) + ub = loc + self.b * scale # (self.b - loc)/(1.0*scale) if conditional: - invfac = (self.sf(lb, loc=loc, scale=scale, *args) - - self.sf(ub, loc=loc, scale=scale, *args)) + invfac = self.sf(lb, loc=loc, scale=scale, *args) - self.sf( + ub, loc=loc, scale=scale, *args + ) else: invfac = 1.0 - return integrate.quad(fun, lb, ub, - args=args)[0]/invfac + return integrate.quad(fun, lb, ub, args=args)[0] / invfac -def expect_v2(self, fn=None, args=(), loc=0, scale=1, lb=None, ub=None, conditional=False): - '''calculate expected value of a function with respect to the distribution +def expect_v2( + self, fn=None, args=(), loc=0, scale=1, lb=None, ub=None, conditional=False +): + """calculate expected value of a function with respect to the distribution location and scale only tested on a few examples @@ -385,50 +404,57 @@ def expect_v2(self, fn=None, args=(), loc=0, scale=1, lb=None, ub=None, conditio for example if the distribution is very concentrated and the default limits are too large. - ''' - #changes: 20100809 - #correction and refactoring how loc and scale are handled - #uses now _pdf - #needs more testing for distribution with bound support, e.g. genpareto + """ + # changes: 20100809 + # correction and refactoring how loc and scale are handled + # uses now _pdf + # needs more testing for distribution with bound support, e.g. 
genpareto if fn is None: + def fun(x, *args): - return (loc + x*scale)*self._pdf(x, *args) + return (loc + x * scale) * self._pdf(x, *args) + else: + def fun(x, *args): - return fn(loc + x*scale)*self._pdf(x, *args) + return fn(loc + x * scale) * self._pdf(x, *args) + if lb is None: - #lb = self.a + # lb = self.a try: - lb = self.ppf(1e-9, *args) #1e-14 quad fails for pareto + lb = self.ppf(1e-9, *args) # 1e-14 quad fails for pareto except ValueError: lb = self.a else: - lb = max(self.a, (lb - loc)/(1.0*scale)) #transform to standardized + lb = max( + self.a, (lb - loc) / (1.0 * scale) + ) # transform to standardized if ub is None: - #ub = self.b + # ub = self.b try: - ub = self.ppf(1-1e-9, *args) + ub = self.ppf(1 - 1e-9, *args) except ValueError: ub = self.b else: - ub = min(self.b, (ub - loc)/(1.0*scale)) + ub = min(self.b, (ub - loc) / (1.0 * scale)) if conditional: - invfac = self._sf(lb,*args) - self._sf(ub,*args) + invfac = self._sf(lb, *args) - self._sf(ub, *args) else: invfac = 1.0 - return integrate.quad(fun, lb, ub, - args=args, limit=500)[0]/invfac + return integrate.quad(fun, lb, ub, args=args, limit=500)[0] / invfac + ### for discrete distributions -#TODO: check that for a distribution with finite support the calculations are +# TODO: check that for a distribution with finite support the calculations are # done with one array summation (np.dot) -#based on _drv2_moment(self, n, *args), but streamlined -def expect_discrete(self, fn=None, args=(), loc=0, lb=None, ub=None, - conditional=False): - '''calculate expected value of a function with respect to the distribution +# based on _drv2_moment(self, n, *args), but streamlined +def expect_discrete( + self, fn=None, args=(), loc=0, lb=None, ub=None, conditional=False +): + """calculate expected value of a function with respect to the distribution for discrete distribution Parameters @@ -468,39 +494,43 @@ def expect_discrete(self, fn=None, args=(), loc=0, lb=None, ub=None, are evaluated) - ''' + """ - #moment_tol = 1e-12 # increase compared to self.moment_tol, + # moment_tol = 1e-12 # increase compared to self.moment_tol, # too slow for only small gain in precision for zipf - #avoid endless loop with unbound integral, eg. var of zipf(2) + # avoid endless loop with unbound integral, eg. var of zipf(2) maxcount = 1000 - suppnmin = 100 #minimum number of points to evaluate (+ and -) + suppnmin = 100 # minimum number of points to evaluate (+ and -) if fn is None: + def fun(x): - #loc and args from outer scope - return (x+loc)*self._pmf(x, *args) + # loc and args from outer scope + return (x + loc) * self._pmf(x, *args) + else: + def fun(x): - #loc and args from outer scope - return fn(x+loc)*self._pmf(x, *args) + # loc and args from outer scope + return fn(x + loc) * self._pmf(x, *args) + # used pmf because _pmf does not check support in randint # and there might be problems(?) 
with correct self.a, self.b at this stage # maybe not anymore, seems to work now with _pmf - self._argcheck(*args) # (re)generate scalar self.a and self.b + self._argcheck(*args) # (re)generate scalar self.a and self.b if lb is None: - lb = (self.a) + lb = self.a else: lb = lb - loc if ub is None: - ub = (self.b) + ub = self.b else: ub = ub - loc if conditional: - invfac = self.sf(lb,*args) - self.sf(ub+1,*args) + invfac = self.sf(lb, *args) - self.sf(ub + 1, *args) else: invfac = 1.0 @@ -508,14 +538,14 @@ def fun(x): low, upp = self._ppf(0.001, *args), self._ppf(0.999, *args) low = max(min(-suppnmin, low), lb) upp = min(max(suppnmin, upp), ub) - supp = np.arange(low, upp+1, self.inc) #check limits - #print('low, upp', low, upp + supp = np.arange(low, upp + 1, self.inc) # check limits + # print('low, upp', low, upp tot = np.sum(fun(supp)) diff = 1e100 pos = upp + self.inc count = 0 - #handle cases with infinite support + # handle cases with infinite support while (pos <= ub) and (diff > self.moment_tol) and count <= maxcount: diff = fun(pos) @@ -523,7 +553,7 @@ def fun(x): pos += self.inc count += 1 - if self.a < 0: #handle case when self.a = -inf + if self.a < 0: # handle case when self.a = -inf diff = 1e100 pos = low - self.inc while (pos >= lb) and (diff > self.moment_tol) and count <= maxcount: @@ -533,21 +563,24 @@ def fun(x): count += 1 if count > maxcount: # replace with proper warning - print('sum did not converge') - return tot/invfac + print("sum did not converge") + return tot / invfac + stats.distributions.rv_continuous.fit_fr = fit_fr stats.distributions.rv_continuous.nnlf_fr = nnlf_fr stats.distributions.rv_continuous.expect = expect stats.distributions.rv_discrete.expect = expect_discrete -stats.distributions.beta_gen._fitstart = _fitstart_beta #not tried out yet -stats.distributions.poisson_gen._fitstart = _fitstart_poisson #not tried out yet +stats.distributions.beta_gen._fitstart = _fitstart_beta # not tried out yet +stats.distributions.poisson_gen._fitstart = ( + _fitstart_poisson # not tried out yet +) ########## end patching scipy def distfitbootstrap(sample, distr, nrepl=100): - '''run bootstrap for estimation of distribution parameters + """run bootstrap for estimation of distribution parameters hard coded: only one shape parameter is allowed and estimated, loc=0 and scale=1 are fixed in the estimation @@ -565,7 +598,7 @@ def distfitbootstrap(sample, distr, nrepl=100): res : array (nrepl,) parameter estimates for all bootstrap replications - ''' + """ nobs = len(sample) res = np.zeros(nrepl) for ii in range(nrepl): @@ -574,8 +607,9 @@ def distfitbootstrap(sample, distr, nrepl=100): res[ii] = distr.fit_fr(x, frozen=[np.nan, 0.0, 1.0]) return res + def distfitmc(sample, distr, nrepl=100, distkwds={}): - '''run Monte Carlo for estimation of distribution parameters + """run Monte Carlo for estimation of distribution parameters hard coded: only one shape parameter is allowed and estimated, loc=0 and scale=1 are fixed in the estimation @@ -593,8 +627,8 @@ def distfitmc(sample, distr, nrepl=100, distkwds={}): res : array (nrepl,) parameter estimates for all Monte Carlo replications - ''' - arg = distkwds.pop('arg') + """ + arg = distkwds.pop("arg") nobs = len(sample) res = np.zeros(nrepl) for ii in range(nrepl): @@ -603,8 +637,8 @@ def distfitmc(sample, distr, nrepl=100, distkwds={}): return res -def printresults(sample, arg, bres, kind='bootstrap'): - '''calculate and print(Bootstrap or Monte Carlo result +def printresults(sample, arg, bres, kind="bootstrap"): + 
"""calculate and print(Bootstrap or Monte Carlo result Parameters ---------- @@ -633,95 +667,102 @@ def printresults(sample, arg, bres, kind='bootstrap'): todo: return results and string instead of printing - ''' - print('true parameter value') + """ + print("true parameter value") print(arg) - print('MLE estimate of parameters using sample (nobs=%d)'% (nobs)) + print("MLE estimate of parameters using sample (nobs=%d)" % (nobs)) argest = distr.fit_fr(sample, frozen=[np.nan, 0.0, 1.0]) print(argest) - if kind == 'bootstrap': - #bootstrap compares to estimate from sample + if kind == "bootstrap": + # bootstrap compares to estimate from sample argorig = arg arg = argest - print('%s distribution of parameter estimate (nrepl=%d)'% (kind, nrepl)) - print('mean = %f, bias=%f' % (bres.mean(0), bres.mean(0)-arg)) - print('median', np.median(bres, axis=0)) - print('var and std', bres.var(0), np.sqrt(bres.var(0))) - bmse = ((bres - arg)**2).mean(0) - print('mse, rmse', bmse, np.sqrt(bmse)) + print("%s distribution of parameter estimate (nrepl=%d)" % (kind, nrepl)) + print("mean = %f, bias=%f" % (bres.mean(0), bres.mean(0) - arg)) + print("median", np.median(bres, axis=0)) + print("var and std", bres.var(0), np.sqrt(bres.var(0))) + bmse = ((bres - arg) ** 2).mean(0) + print("mse, rmse", bmse, np.sqrt(bmse)) bressorted = np.sort(bres) - print('%s confidence interval (90%% coverage)' % kind) - print(bressorted[np.floor(nrepl*0.05)], bressorted[np.floor(nrepl*0.95)]) - print('%s confidence interval (90%% coverage) normal approximation' % kind) - print(stats.norm.ppf(0.05, loc=bres.mean(), scale=bres.std()),) + print("%s confidence interval (90%% coverage)" % kind) + print( + bressorted[np.floor(nrepl * 0.05)], bressorted[np.floor(nrepl * 0.95)] + ) + print("%s confidence interval (90%% coverage) normal approximation" % kind) + print( + stats.norm.ppf(0.05, loc=bres.mean(), scale=bres.std()), + ) print(stats.norm.isf(0.05, loc=bres.mean(), scale=bres.std())) - print('Kolmogorov-Smirnov test for normality of %s distribution' % kind) - print(' - estimated parameters, p-values not really correct') - print(stats.kstest(bres, 'norm', (bres.mean(), bres.std()))) + print("Kolmogorov-Smirnov test for normality of %s distribution" % kind) + print(" - estimated parameters, p-values not really correct") + print(stats.kstest(bres, "norm", (bres.mean(), bres.std()))) -if __name__ == '__main__': +if __name__ == "__main__": - examplecases = ['largenumber', 'bootstrap', 'montecarlo'][:] + examplecases = ["largenumber", "bootstrap", "montecarlo"][:] - if 'largenumber' in examplecases: + if "largenumber" in examplecases: - print('\nDistribution: vonmises') + print("\nDistribution: vonmises") - for nobs in [200]:#[20000, 1000, 100]: + for nobs in [200]: # [20000, 1000, 100]: x = stats.vonmises.rvs(1.23, loc=0, scale=1, size=nobs) - print('\nnobs:', nobs) - print('true parameter') - print('1.23, loc=0, scale=1') - print('unconstrained') + print("\nnobs:", nobs) + print("true parameter") + print("1.23, loc=0, scale=1") + print("unconstrained") print(stats.vonmises.fit(x)) print(stats.vonmises.fit_fr(x, frozen=[np.nan, np.nan, np.nan])) - print('with fixed loc and scale') + print("with fixed loc and scale") print(stats.vonmises.fit_fr(x, frozen=[np.nan, 0.0, 1.0])) - print('\nDistribution: gamma') + print("\nDistribution: gamma") distr = stats.gamma - arg, loc, scale = 2.5, 0., 20. 
+ arg, loc, scale = 2.5, 0.0, 20.0 - for nobs in [200]:#[20000, 1000, 100]: + for nobs in [200]: # [20000, 1000, 100]: x = distr.rvs(arg, loc=loc, scale=scale, size=nobs) - print('\nnobs:', nobs) - print('true parameter') - print('%f, loc=%f, scale=%f' % (arg, loc, scale)) - print('unconstrained') + print("\nnobs:", nobs) + print("true parameter") + print("%f, loc=%f, scale=%f" % (arg, loc, scale)) + print("unconstrained") print(distr.fit(x)) print(distr.fit_fr(x, frozen=[np.nan, np.nan, np.nan])) - print('with fixed loc and scale') + print("with fixed loc and scale") print(distr.fit_fr(x, frozen=[np.nan, 0.0, 1.0])) - print('with fixed loc') + print("with fixed loc") print(distr.fit_fr(x, frozen=[np.nan, 0.0, np.nan])) + ex = ["gamma", "vonmises"][0] - ex = ['gamma', 'vonmises'][0] - - if ex == 'gamma': + if ex == "gamma": distr = stats.gamma - arg, loc, scale = 2.5, 0., 1 - elif ex == 'vonmises': + arg, loc, scale = 2.5, 0.0, 1 + elif ex == "vonmises": distr = stats.vonmises - arg, loc, scale = 1.5, 0., 1 + arg, loc, scale = 1.5, 0.0, 1 else: - raise ValueError('wrong example') + raise ValueError("wrong example") nobs = 100 nrepl = 1000 sample = distr.rvs(arg, loc=loc, scale=scale, size=nobs) - print('\nDistribution:', distr) - if 'bootstrap' in examplecases: - print('\nBootstrap') - bres = distfitbootstrap(sample, distr, nrepl=nrepl ) + print("\nDistribution:", distr) + if "bootstrap" in examplecases: + print("\nBootstrap") + bres = distfitbootstrap(sample, distr, nrepl=nrepl) printresults(sample, arg, bres) - if 'montecarlo' in examplecases: - print('\nMonteCarlo') - mcres = distfitmc(sample, distr, nrepl=nrepl, - distkwds=dict(arg=arg, loc=loc, scale=scale)) - printresults(sample, arg, mcres, kind='montecarlo') + if "montecarlo" in examplecases: + print("\nMonteCarlo") + mcres = distfitmc( + sample, + distr, + nrepl=nrepl, + distkwds=dict(arg=arg, loc=loc, scale=scale), + ) + printresults(sample, arg, mcres, kind="montecarlo") diff --git a/statsmodels/sandbox/distributions/tests/_est_fit.py b/statsmodels/sandbox/distributions/tests/_est_fit.py index 0a5facd0229..f9ce6cdad90 100644 --- a/statsmodels/sandbox/distributions/tests/_est_fit.py +++ b/statsmodels/sandbox/distributions/tests/_est_fit.py @@ -8,19 +8,19 @@ import numpy as np - from scipy import stats from .distparams import distcont # this is not a proper statistical test for convergence, but only # verifies that the estimate and true values do not differ by too much -n_repl1 = 1000 # sample size for first run -n_repl2 = 5000 # sample size for second run, if first run fails -thresh_percent = 0.25 # percent of true parameters for fail cut-off +n_repl1 = 1000 # sample size for first run +n_repl2 = 5000 # sample size for second run, if first run fails +thresh_percent = 0.25 # percent of true parameters for fail cut-off thresh_min = 0.75 # minimum difference estimate - true to fail test -#distcont = [['genextreme', (3.3184017469423535,)]] +# distcont = [['genextreme', (3.3184017469423535,)]] + def _est_cont_fit(): # this tests the closeness of the estimated parameters to the true @@ -28,41 +28,54 @@ def _est_cont_fit(): # Note: is slow, some distributions do not converge with sample size <= 10000 for distname, arg in distcont: - yield check_cont_fit, distname,arg + yield check_cont_fit, distname, arg -def check_cont_fit(distname,arg): +def check_cont_fit(distname, arg): distfn = getattr(stats, distname) - rvs = distfn.rvs(size=n_repl1,*arg) - est = distfn.fit(rvs) #,*arg) # start with default values - - truearg = 
np.hstack([arg,[0.0,1.0]]) - diff = est-truearg - - txt = '' - diffthreshold = np.max(np.vstack([truearg*thresh_percent, - np.ones(distfn.numargs+2)*thresh_min]),0) + rvs = distfn.rvs(size=n_repl1, *arg) + est = distfn.fit(rvs) # ,*arg) # start with default values + + truearg = np.hstack([arg, [0.0, 1.0]]) + diff = est - truearg + + txt = "" + diffthreshold = np.max( + np.vstack( + [ + truearg * thresh_percent, + np.ones(distfn.numargs + 2) * thresh_min, + ] + ), + 0, + ) # threshold for location - diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min]) + diffthreshold[-2] = np.max( + [np.abs(rvs.mean()) * thresh_percent, thresh_min] + ) if np.any(np.isnan(est)): - raise AssertionError('nan returned in fit') + raise AssertionError("nan returned in fit") else: if np.any((np.abs(diff) - diffthreshold) > 0.0): -## txt = 'WARNING - diff too large with small sample' -## print 'parameter diff =', diff - diffthreshold, txt - rvs = np.concatenate([rvs,distfn.rvs(size=n_repl2-n_repl1,*arg)]) - est = distfn.fit(rvs) #,*arg) - truearg = np.hstack([arg,[0.0,1.0]]) - diff = est-truearg + ## txt = 'WARNING - diff too large with small sample' + ## print 'parameter diff =', diff - diffthreshold, txt + rvs = np.concatenate( + [rvs, distfn.rvs(size=n_repl2 - n_repl1, *arg)] + ) + est = distfn.fit(rvs) # ,*arg) + truearg = np.hstack([arg, [0.0, 1.0]]) + diff = est - truearg if np.any((np.abs(diff) - diffthreshold) > 0.0): - txt = 'parameter: %s\n' % str(truearg) - txt += 'estimated: %s\n' % str(est) - txt += 'diff : %s\n' % str(diff) - raise AssertionError('fit not very good in %s\n' % distfn.name + txt) - + txt = "parameter: %s\n" % str(truearg) + txt += "estimated: %s\n" % str(est) + txt += "diff : %s\n" % str(diff) + raise AssertionError( + "fit not very good in %s\n" % distfn.name + txt + ) if __name__ == "__main__": import pytest - pytest.main([__file__, '-vvs', '-x', '--pdb']) + + pytest.main([__file__, "-vvs", "-x", "--pdb"]) diff --git a/statsmodels/sandbox/distributions/tests/check_moments.py b/statsmodels/sandbox/distributions/tests/check_moments.py index 7014e4b323e..d91d7dc676f 100644 --- a/statsmodels/sandbox/distributions/tests/check_moments.py +++ b/statsmodels/sandbox/distributions/tests/check_moments.py @@ -1,54 +1,60 @@ -'''script to test expect and moments in distributions.stats method +"""script to test expect and moments in distributions.stats method not written as a test, prints results, renamed to prevent test runner from running it -''' +""" import numpy as np from scipy import stats -#from statsmodels.stats.moment_helpers import mnc2mvsk + +# from statsmodels.stats.moment_helpers import mnc2mvsk from statsmodels.sandbox.distributions.sppatch import expect_v2 from .distparams import distcont +specialcases = { + "ncf": {"ub": 1000} # diverges if it's too large, checked for mean +} -specialcases = {'ncf':{'ub':1000} # diverges if it's too large, checked for mean - } - -#next functions are copies from sm.stats.moment_helpers +# next functions are copies from sm.stats.moment_helpers def mc2mvsk(args): - '''convert central moments to mean, variance, skew, kurtosis - ''' + """convert central moments to mean, variance, skew, kurtosis""" mc, mc2, mc3, mc4 = args skew = np.divide(mc3, mc2**1.5) kurt = np.divide(mc4, mc2**2.0) - 3.0 return (mc, mc2, skew, kurt) + def mnc2mvsk(args): - '''convert central moments to mean, variance, skew, kurtosis - ''' - #convert four non-central moments to central moments + """convert central moments to mean, variance, skew, kurtosis""" + # 
convert four non-central moments to central moments mnc, mnc2, mnc3, mnc4 = args mc = mnc - mc2 = mnc2 - mnc*mnc - mc3 = mnc3 - (3*mc*mc2+mc**3) # 3rd central moment - mc4 = mnc4 - (4*mc*mc3+6*mc*mc*mc2+mc**4) + mc2 = mnc2 - mnc * mnc + mc3 = mnc3 - (3 * mc * mc2 + mc**3) # 3rd central moment + mc4 = mnc4 - (4 * mc * mc3 + 6 * mc * mc * mc2 + mc**4) return mc2mvsk((mc, mc2, mc3, mc4)) + def mom_nc0(x): - return 1. + return 1.0 + def mom_nc1(x): return x + def mom_nc2(x): - return x*x + return x * x + def mom_nc3(x): - return x*x*x + return x * x * x + def mom_nc4(x): - return np.power(x,4) + return np.power(x, 4) + res = {} distex = [] @@ -56,28 +62,29 @@ def mom_nc4(x): distok = [] distnonfinite = [] + def check_cont_basic(): - #results saved in module global variable + # results saved in module global variable for distname, distargs in distcont[:]: - #if distname not in distex_0: continue + # if distname not in distex_0: continue distfn = getattr(stats, distname) -## np.random.seed(765456) -## sn = 1000 -## rvs = distfn.rvs(size=sn,*arg) -## sm = rvs.mean() -## sv = rvs.var() -## skurt = stats.kurtosis(rvs) -## sskew = stats.skew(rvs) - m,v,s,k = distfn.stats(*distargs, **dict(moments='mvsk')) - st = np.array([m,v,s,k]) + ## np.random.seed(765456) + ## sn = 1000 + ## rvs = distfn.rvs(size=sn,*arg) + ## sm = rvs.mean() + ## sv = rvs.var() + ## skurt = stats.kurtosis(rvs) + ## sskew = stats.skew(rvs) + m, v, s, k = distfn.stats(*distargs, **dict(moments="mvsk")) + st = np.array([m, v, s, k]) mask = np.isfinite(st) if mask.sum() < 4: distnonfinite.append(distname) print(distname) - #print 'stats ', m,v,s,k + # print 'stats ', m,v,s,k expect = distfn.expect - expect = lambda *args, **kwds : expect_v2(distfn, *args, **kwds) + expect = lambda *args, **kwds: expect_v2(distfn, *args, **kwds) special_kwds = specialcases.get(distname, {}) mnc0 = expect(mom_nc0, args=distargs, **special_kwds) @@ -87,72 +94,90 @@ def check_cont_basic(): mnc4 = expect(mom_nc4, args=distargs, **special_kwds) mnc1_lc = expect(args=distargs, loc=1, scale=2, **special_kwds) - #print mnc1, mnc2, mnc3, mnc4 + # print mnc1, mnc2, mnc3, mnc4 try: me, ve, se, ke = mnc2mvsk((mnc1, mnc2, mnc3, mnc4)) except: - print('exception', mnc1, mnc2, mnc3, mnc4, st) - me, ve, se, ke = [np.nan]*4 + print("exception", mnc1, mnc2, mnc3, mnc4, st) + me, ve, se, ke = [np.nan] * 4 if mask.size > 0: distex.append(distname) - #print 'expect', me, ve, se, ke, - #print mnc1, mnc2, mnc3, mnc4 + # print 'expect', me, ve, se, ke, + # print mnc1, mnc2, mnc3, mnc4 em = np.array([me, ve, se, ke]) diff = st[mask] - em[mask] - print(diff, mnc1_lc - (1 + 2*mnc1)) - if np.size(diff)>0 and np.max(np.abs(diff)) > 1e-3: + print(diff, mnc1_lc - (1 + 2 * mnc1)) + if np.size(diff) > 0 and np.max(np.abs(diff)) > 1e-3: distlow.append(distname) else: distok.append(distname) res[distname] = [mnc0, st, em, diff, mnc1_lc] + def nct_kurt_bug(): - '''test for incorrect kurtosis of nct + """test for incorrect kurtosis of nct D. Hogben, R. S. Pinkham, M. B. Wilk: The Moments of the Non-Central t-DistributionAuthor(s): Biometrika, Vol. 48, No. 3/4 (Dec., 1961), pp. 
465-468 - ''' + """ from numpy.testing import assert_almost_equal + mvsk_10_1 = (1.08372, 1.325546, 0.39993, 1.2499424941142943) - assert_almost_equal(stats.nct.stats(10, 1, moments='mvsk'), mvsk_10_1, decimal=6) - c1=np.array([1.08372]) - c2=np.array([.0755460, 1.25000]) - c3 = np.array([.0297802, .580566]) + assert_almost_equal( + stats.nct.stats(10, 1, moments="mvsk"), mvsk_10_1, decimal=6 + ) + c1 = np.array([1.08372]) + c2 = np.array([0.0755460, 1.25000]) + c3 = np.array([0.0297802, 0.580566]) c4 = np.array([0.0425458, 1.17491, 6.25]) - #calculation for df=10, for arbitrary nc + # calculation for df=10, for arbitrary nc nc = 1 mc1 = c1.item() - mc2 = (c2*nc**np.array([2,0])).sum() - mc3 = (c3*nc**np.array([3,1])).sum() - mc4 = c4=np.array([0.0425458, 1.17491, 6.25]) - mvsk_nc = mc2mvsk((mc1,mc2,mc3,mc4)) + mc2 = (c2 * nc ** np.array([2, 0])).sum() + mc3 = (c3 * nc ** np.array([3, 1])).sum() + mc4 = c4 = np.array([0.0425458, 1.17491, 6.25]) + mvsk_nc = mc2mvsk((mc1, mc2, mc3, mc4)) + -if __name__ == '__main__': +if __name__ == "__main__": check_cont_basic() - #print [(k, v[0]) for k,v in res.items() if np.abs(v[0]-1)>1e-3] - #print [(k, v[2][0], 1+2*v[2][0]) for k,v in res.items() if np.abs(v[-1]-(1+2*v[2][0]))>1e-3] - mean_ = [(k, v[1][0], v[2][0]) for k,v in res.items() - if np.abs(v[1][0] - v[2][0])>1e-6 and np.isfinite(v[1][0])] - var_ = [(k, v[1][1], v[2][1]) for k,v in res.items() - if np.abs(v[1][1] - v[2][1])>1e-2 and np.isfinite(v[1][1])] - skew = [(k, v[1][2], v[2][2]) for k,v in res.items() - if np.abs(v[1][2] - v[2][2])>1e-2 and np.isfinite(v[1][1])] - kurt = [(k, v[1][3], v[2][3]) for k,v in res.items() - if np.abs(v[1][3] - v[2][3])>1e-2 and np.isfinite(v[1][1])] + # print [(k, v[0]) for k,v in res.items() if np.abs(v[0]-1)>1e-3] + # print [(k, v[2][0], 1+2*v[2][0]) for k,v in res.items() if np.abs(v[-1]-(1+2*v[2][0]))>1e-3] + mean_ = [ + (k, v[1][0], v[2][0]) + for k, v in res.items() + if np.abs(v[1][0] - v[2][0]) > 1e-6 and np.isfinite(v[1][0]) + ] + var_ = [ + (k, v[1][1], v[2][1]) + for k, v in res.items() + if np.abs(v[1][1] - v[2][1]) > 1e-2 and np.isfinite(v[1][1]) + ] + skew = [ + (k, v[1][2], v[2][2]) + for k, v in res.items() + if np.abs(v[1][2] - v[2][2]) > 1e-2 and np.isfinite(v[1][1]) + ] + kurt = [ + (k, v[1][3], v[2][3]) + for k, v in res.items() + if np.abs(v[1][3] - v[2][3]) > 1e-2 and np.isfinite(v[1][1]) + ] from statsmodels.iolib import SimpleTable + if len(mean_) > 0: - print('\nMean difference at least 1e-6') - print(SimpleTable(mean_, headers=['distname', 'diststats', 'expect'])) - print('\nVariance difference at least 1e-2') - print(SimpleTable(var_, headers=['distname', 'diststats', 'expect'])) - print('\nSkew difference at least 1e-2') - print(SimpleTable(skew, headers=['distname', 'diststats', 'expect'])) - print('\nKurtosis difference at least 1e-2') - print(SimpleTable(kurt, headers=['distname', 'diststats', 'expect'])) + print("\nMean difference at least 1e-6") + print(SimpleTable(mean_, headers=["distname", "diststats", "expect"])) + print("\nVariance difference at least 1e-2") + print(SimpleTable(var_, headers=["distname", "diststats", "expect"])) + print("\nSkew difference at least 1e-2") + print(SimpleTable(skew, headers=["distname", "diststats", "expect"])) + print("\nKurtosis difference at least 1e-2") + print(SimpleTable(kurt, headers=["distname", "diststats", "expect"])) diff --git a/statsmodels/sandbox/distributions/tests/distparams.py b/statsmodels/sandbox/distributions/tests/distparams.py index 19b11ae1f3b..d1d799ae2d5 100644 --- 
a/statsmodels/sandbox/distributions/tests/distparams.py +++ b/statsmodels/sandbox/distributions/tests/distparams.py @@ -1,121 +1,153 @@ - - distcont = [ - ['alpha', (3.5704770516650459,)], - ['anglit', ()], - ['arcsine', ()], - ['beta', (2.3098496451481823, 0.62687954300963677)], - ['betaprime', (5, 6)], # avoid unbound error in entropy with (100, 86)], - ['bradford', (0.29891359763170633,)], - ['burr', (10.5, 4.3)], #incorrect mean and var for(0.94839838075366045, 4.3820284068855795)], - ['cauchy', ()], - ['chi', (78,)], - ['chi2', (55,)], - ['cosine', ()], - ['dgamma', (1.1023326088288166,)], - ['dweibull', (2.0685080649914673,)], - ['erlang', (20,)], #correction numargs = 1 - ['expon', ()], - ['exponpow', (2.697119160358469,)], - ['exponweib', (2.8923945291034436, 1.9505288745913174)], - ['f', (29, 18)], - #['fatiguelife', (29,)], #correction numargs = 1, variance very large - ['fatiguelife', (2,)], - ['fisk', (3.0857548622253179,)], - ['foldcauchy', (4.7164673455831894,)], - ['foldnorm', (1.9521253373555869,)], - ['frechet_l', (3.6279911255583239,)], - ['frechet_r', (1.8928171603534227,)], - ['gamma', (1.9932305483800778,)], - ['gausshyper', (13.763771604130699, 3.1189636648681431, - 2.5145980350183019, 5.1811649903971615)], #veryslow - ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)], - ['genextreme', (-0.1,)], # sample mean test fails for (3.3184017469423535,)], - ['gengamma', (4.4162385429431925, 3.1193091679242761)], - ['genhalflogistic', (0.77274727809929322,)], - ['genlogistic', (0.41192440799679475,)], - ['genpareto', (0.1,)], # use case with finite moments - ['gilbrat', ()], - ['gompertz', (0.94743713075105251,)], - ['gumbel_l', ()], - ['gumbel_r', ()], - ['halfcauchy', ()], - ['halflogistic', ()], - ['halfnorm', ()], - ['hypsecant', ()], - #['invgamma', (2.0668996136993067,)], #convergence problem with expect - #['invgamma', (3.0,)], - ['invgamma', (5.0,)], #kurtosis requires alpha > 4 - ['invnorm', (0.14546264555347513,)], - ['invweibull', (10.58,)], # sample mean test fails at(0.58847112119264788,)] - ['johnsonsb', (4.3172675099141058, 3.1837781130785063)], - ['johnsonsu', (2.554395574161155, 2.2482281679651965)], - ['ksone', (1000,)], #replace 22 by 100 to avoid failing range, ticket 956 - ['kstwobign', ()], - ['laplace', ()], - ['levy', ()], - ['levy_l', ()], -# ['levy_stable', (0.35667405469844993, -# -0.67450531578494011)], #NotImplementedError + ["alpha", (3.5704770516650459,)], + ["anglit", ()], + ["arcsine", ()], + ["beta", (2.3098496451481823, 0.62687954300963677)], + ["betaprime", (5, 6)], # avoid unbound error in entropy with (100, 86)], + ["bradford", (0.29891359763170633,)], + [ + "burr", + (10.5, 4.3), + ], # incorrect mean and var for(0.94839838075366045, 4.3820284068855795)], + ["cauchy", ()], + ["chi", (78,)], + ["chi2", (55,)], + ["cosine", ()], + ["dgamma", (1.1023326088288166,)], + ["dweibull", (2.0685080649914673,)], + ["erlang", (20,)], # correction numargs = 1 + ["expon", ()], + ["exponpow", (2.697119160358469,)], + ["exponweib", (2.8923945291034436, 1.9505288745913174)], + ["f", (29, 18)], + # ['fatiguelife', (29,)], #correction numargs = 1, variance very large + ["fatiguelife", (2,)], + ["fisk", (3.0857548622253179,)], + ["foldcauchy", (4.7164673455831894,)], + ["foldnorm", (1.9521253373555869,)], + ["frechet_l", (3.6279911255583239,)], + ["frechet_r", (1.8928171603534227,)], + ["gamma", (1.9932305483800778,)], + [ + "gausshyper", + ( + 13.763771604130699, + 3.1189636648681431, + 2.5145980350183019, + 5.1811649903971615, + ), 
+ ], # veryslow + ["genexpon", (9.1325976465418908, 16.231956600590632, 3.2819552690843983)], + [ + "genextreme", + (-0.1,), + ], # sample mean test fails for (3.3184017469423535,)], + ["gengamma", (4.4162385429431925, 3.1193091679242761)], + ["genhalflogistic", (0.77274727809929322,)], + ["genlogistic", (0.41192440799679475,)], + ["genpareto", (0.1,)], # use case with finite moments + ["gilbrat", ()], + ["gompertz", (0.94743713075105251,)], + ["gumbel_l", ()], + ["gumbel_r", ()], + ["halfcauchy", ()], + ["halflogistic", ()], + ["halfnorm", ()], + ["hypsecant", ()], + # ['invgamma', (2.0668996136993067,)], #convergence problem with expect + # ['invgamma', (3.0,)], + ["invgamma", (5.0,)], # kurtosis requires alpha > 4 + ["invnorm", (0.14546264555347513,)], + [ + "invweibull", + (10.58,), + ], # sample mean test fails at(0.58847112119264788,)] + ["johnsonsb", (4.3172675099141058, 3.1837781130785063)], + ["johnsonsu", (2.554395574161155, 2.2482281679651965)], + ["ksone", (1000,)], # replace 22 by 100 to avoid failing range, ticket 956 + ["kstwobign", ()], + ["laplace", ()], + ["levy", ()], + ["levy_l", ()], + # ['levy_stable', (0.35667405469844993, + # -0.67450531578494011)], #NotImplementedError # rvs not tested - ['loggamma', (0.41411931826052117,)], - ['logistic', ()], - ['loglaplace', (3.2505926592051435,)], - ['lognorm', (0.95368226960575331,)], - ['lomax', (1.8771398388773268,)], #this has infinite variance - ['lomax', (10,)], #first 4 moments are finite - ['maxwell', ()], - ['mielke', (10.4, 3.6)], # sample mean test fails for (4.6420495492121487, 0.59707419545516938)], - # mielke: good results if 2nd parameter >2, weird mean or var below - ['nakagami', (4.9673794866666237,)], - ['ncf', (27, 27, 0.41578441799226107)], - ['nct', (14, 0.24045031331198066)], - ['ncx2', (21, 1.0560465975116415)], - ['norm', ()], - ['pareto', (2.621716532144454,)], - ['powerlaw', (1.6591133289905851,)], - ['powerlognorm', (2.1413923530064087, 0.44639540782048337)], - ['powernorm', (4.4453652254590779,)], - ['rayleigh', ()], - ['rdist', (0.9,)], # feels also slow -# ['rdist', (3.8266985793976525,)], #veryslow, especially rvs - #['rdist', (541.0,)], # from ticket #758 #veryslow - ['recipinvgauss', (0.63004267809369119,)], - ['reciprocal', (0.0062309367010521255, 1.0062309367010522)], - ['rice', (0.7749725210111873,)], - ['semicircular', ()], - ['t', (2.7433514990818093,)], - ['triang', (0.15785029824528218,)], - ['truncexpon', (4.6907725456810478,)], - ['truncnorm', (-1.0978730080013919, 2.7306754109031979)], - ['tukeylambda', (3.1321477856738267,)], - ['uniform', ()], - ['vonmises', (3.9939042581071398,)], - ['wald', ()], - ['weibull_max', (2.8687961709100187,)], - ['weibull_min', (1.7866166930421596,)], - ['wrapcauchy', (0.031071279018614728,)]] + ["loggamma", (0.41411931826052117,)], + ["logistic", ()], + ["loglaplace", (3.2505926592051435,)], + ["lognorm", (0.95368226960575331,)], + ["lomax", (1.8771398388773268,)], # this has infinite variance + ["lomax", (10,)], # first 4 moments are finite + ["maxwell", ()], + [ + "mielke", + (10.4, 3.6), + ], # sample mean test fails for (4.6420495492121487, 0.59707419545516938)], + # mielke: good results if 2nd parameter >2, weird mean or var below + ["nakagami", (4.9673794866666237,)], + ["ncf", (27, 27, 0.41578441799226107)], + ["nct", (14, 0.24045031331198066)], + ["ncx2", (21, 1.0560465975116415)], + ["norm", ()], + ["pareto", (2.621716532144454,)], + ["powerlaw", (1.6591133289905851,)], + ["powerlognorm", (2.1413923530064087, 0.44639540782048337)], + 
["powernorm", (4.4453652254590779,)], + ["rayleigh", ()], + ["rdist", (0.9,)], # feels also slow + # ['rdist', (3.8266985793976525,)], #veryslow, especially rvs + # ['rdist', (541.0,)], # from ticket #758 #veryslow + ["recipinvgauss", (0.63004267809369119,)], + ["reciprocal", (0.0062309367010521255, 1.0062309367010522)], + ["rice", (0.7749725210111873,)], + ["semicircular", ()], + ["t", (2.7433514990818093,)], + ["triang", (0.15785029824528218,)], + ["truncexpon", (4.6907725456810478,)], + ["truncnorm", (-1.0978730080013919, 2.7306754109031979)], + ["tukeylambda", (3.1321477856738267,)], + ["uniform", ()], + ["vonmises", (3.9939042581071398,)], + ["wald", ()], + ["weibull_max", (2.8687961709100187,)], + ["weibull_min", (1.7866166930421596,)], + ["wrapcauchy", (0.031071279018614728,)], +] distdiscrete = [ - ['bernoulli',(0.3,)], - ['binom', (5, 0.4)], - ['boltzmann',(1.4, 19)], - ['dlaplace', (0.8,)], #0.5 - ['geom', (0.5,)], - ['hypergeom',(30, 12, 6)], - ['hypergeom',(21,3,12)], #numpy.random (3,18,12) numpy ticket:921 - ['hypergeom',(21,18,11)], #numpy.random (18,3,11) numpy ticket:921 - ['logser', (0.6,)], # reenabled, numpy ticket:921 - ['nbinom', (5, 0.5)], - ['nbinom', (0.4, 0.4)], #from tickets: 583 - ['planck', (0.51,)], #4.1 - ['poisson', (0.6,)], - ['randint', (7, 31)], - ['skellam', (15, 8)], - ['zipf', (4,)] ] # arg=4 is ok, - # Zipf broken for arg = 2, e.g. weird .stats - # looking closer, mean, var should be inf for arg=2 + ["bernoulli", (0.3,)], + ["binom", (5, 0.4)], + ["boltzmann", (1.4, 19)], + ["dlaplace", (0.8,)], # 0.5 + ["geom", (0.5,)], + ["hypergeom", (30, 12, 6)], + ["hypergeom", (21, 3, 12)], # numpy.random (3,18,12) numpy ticket:921 + ["hypergeom", (21, 18, 11)], # numpy.random (18,3,11) numpy ticket:921 + ["logser", (0.6,)], # reenabled, numpy ticket:921 + ["nbinom", (5, 0.5)], + ["nbinom", (0.4, 0.4)], # from tickets: 583 + ["planck", (0.51,)], # 4.1 + ["poisson", (0.6,)], + ["randint", (7, 31)], + ["skellam", (15, 8)], + ["zipf", (4,)], +] # arg=4 is ok, +# Zipf broken for arg = 2, e.g. 
weird .stats +# looking closer, mean, var should be inf for arg=2 -distslow = ['rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon', - 'vonmises', 'rice', 'mielke', 'semicircular', 'cosine', 'invweibull', - 'powerlognorm', 'johnsonsu', 'kstwobign'] +distslow = [ + "rdist", + "gausshyper", + "recipinvgauss", + "ksone", + "genexpon", + "vonmises", + "rice", + "mielke", + "semicircular", + "cosine", + "invweibull", + "powerlognorm", + "johnsonsu", + "kstwobign", +] diff --git a/statsmodels/sandbox/distributions/tests/test_extras.py b/statsmodels/sandbox/distributions/tests/test_extras.py index 0f5cd625f2d..a4d9eff12d5 100644 --- a/statsmodels/sandbox/distributions/tests/test_extras.py +++ b/statsmodels/sandbox/distributions/tests/test_extras.py @@ -8,111 +8,168 @@ import numpy as np from numpy.testing import assert_, assert_almost_equal -from statsmodels.sandbox.distributions.extras import (skewnorm, - skewnorm2, ACSkewT_gen) +from statsmodels.sandbox.distributions.extras import ( + ACSkewT_gen, + skewnorm, + skewnorm2, +) def test_skewnorm(): - #library("sn") - #dsn(c(-2,-1,0,1,2), shape=10) - #psn(c(-2,-1,0,1,2), shape=10) - #noquote(sprintf("%.15e,", snp)) - pdf_r = np.array([2.973416551551523e-90, 3.687562713971017e-24, - 3.989422804014327e-01, 4.839414490382867e-01, - 1.079819330263761e-01]) - pdf_sn = skewnorm.pdf([-2,-1,0,1,2], 10) - - #res = (snp-snp_r)/snp - assert_(np.allclose(pdf_sn, pdf_r,rtol=1e-13, atol=0)) - - pdf_sn2 = skewnorm2.pdf([-2,-1,0,1,2], 10) + # library("sn") + # dsn(c(-2,-1,0,1,2), shape=10) + # psn(c(-2,-1,0,1,2), shape=10) + # noquote(sprintf("%.15e,", snp)) + pdf_r = np.array( + [ + 2.973416551551523e-90, + 3.687562713971017e-24, + 3.989422804014327e-01, + 4.839414490382867e-01, + 1.079819330263761e-01, + ] + ) + pdf_sn = skewnorm.pdf([-2, -1, 0, 1, 2], 10) + + # res = (snp-snp_r)/snp + assert_(np.allclose(pdf_sn, pdf_r, rtol=1e-13, atol=0)) + + pdf_sn2 = skewnorm2.pdf([-2, -1, 0, 1, 2], 10) assert_(np.allclose(pdf_sn2, pdf_r, rtol=1e-13, atol=0)) - - cdf_r = np.array([0.000000000000000e+00, 0.000000000000000e+00, - 3.172551743055357e-02, 6.826894921370859e-01, - 9.544997361036416e-01]) - cdf_sn = skewnorm.cdf([-2,-1,0,1,2], 10) + cdf_r = np.array( + [ + 0.000000000000000e00, + 0.000000000000000e00, + 3.172551743055357e-02, + 6.826894921370859e-01, + 9.544997361036416e-01, + ] + ) + cdf_sn = skewnorm.cdf([-2, -1, 0, 1, 2], 10) maxabs = np.max(np.abs(cdf_sn - cdf_r)) - maxrel = np.max(np.abs(cdf_sn - cdf_r)/(cdf_r+1e-50)) - msg = "maxabs=%15.13g, maxrel=%15.13g\n%r\n%r" % (maxabs, maxrel, cdf_sn, - cdf_r) - #assert_(np.allclose(cdf_sn, cdf_r, rtol=1e-13, atol=1e-25), msg=msg) + maxrel = np.max(np.abs(cdf_sn - cdf_r) / (cdf_r + 1e-50)) + msg = "maxabs=%15.13g, maxrel=%15.13g\n%r\n%r" % ( + maxabs, + maxrel, + cdf_sn, + cdf_r, + ) + # assert_(np.allclose(cdf_sn, cdf_r, rtol=1e-13, atol=1e-25), msg=msg) assert_almost_equal(cdf_sn, cdf_r, decimal=10) - cdf_sn2 = skewnorm2.cdf([-2,-1,0,1,2], 10) + cdf_sn2 = skewnorm2.cdf([-2, -1, 0, 1, 2], 10) maxabs = np.max(np.abs(cdf_sn2 - cdf_r)) - maxrel = np.max(np.abs(cdf_sn2 - cdf_r)/(cdf_r+1e-50)) + maxrel = np.max(np.abs(cdf_sn2 - cdf_r) / (cdf_r + 1e-50)) msg = "maxabs=%15.13g, maxrel=%15.13g" % (maxabs, maxrel) - #assert_(np.allclose(cdf_sn2, cdf_r, rtol=1e-13, atol=1e-25), msg=msg) + # assert_(np.allclose(cdf_sn2, cdf_r, rtol=1e-13, atol=1e-25), msg=msg) assert_almost_equal(cdf_sn2, cdf_r, decimal=10, err_msg=msg) def test_skewt(): skewt = ACSkewT_gen() x = [-2, -1, -0.5, 0, 1, 2] - #noquote(sprintf("%.15e,", 
dst(c(-2,-1, -0.5,0,1,2), shape=10))) - #default in R:sn is df=inf - pdf_r = np.array([2.973416551551523e-90, 3.687562713971017e-24, - 2.018401586422970e-07, 3.989422804014327e-01, - 4.839414490382867e-01, 1.079819330263761e-01]) + # noquote(sprintf("%.15e,", dst(c(-2,-1, -0.5,0,1,2), shape=10))) + # default in R:sn is df=inf + pdf_r = np.array( + [ + 2.973416551551523e-90, + 3.687562713971017e-24, + 2.018401586422970e-07, + 3.989422804014327e-01, + 4.839414490382867e-01, + 1.079819330263761e-01, + ] + ) pdf_st = skewt.pdf(x, 1000000, 10) pass np.allclose(pdf_st, pdf_r, rtol=0, atol=1e-6) np.allclose(pdf_st, pdf_r, rtol=1e-1, atol=0) - - #noquote(sprintf("%.15e,", pst(c(-2,-1, -0.5,0,1,2), shape=10))) - cdf_r = np.array([0.000000000000000e+00, 0.000000000000000e+00, - 3.729478836866917e-09, 3.172551743055357e-02, - 6.826894921370859e-01, 9.544997361036416e-01]) + # noquote(sprintf("%.15e,", pst(c(-2,-1, -0.5,0,1,2), shape=10))) + cdf_r = np.array( + [ + 0.000000000000000e00, + 0.000000000000000e00, + 3.729478836866917e-09, + 3.172551743055357e-02, + 6.826894921370859e-01, + 9.544997361036416e-01, + ] + ) cdf_st = skewt.cdf(x, 1000000, 10) np.allclose(cdf_st, cdf_r, rtol=0, atol=1e-6) np.allclose(cdf_st, cdf_r, rtol=1e-1, atol=0) - #assert_(np.allclose(cdf_st, cdf_r, rtol=1e-13, atol=1e-15)) - - - #noquote(sprintf("%.15e,", dst(c(-2,-1, -0.5,0,1,2), shape=10, df=5))) - pdf_r = np.array([2.185448836190663e-07, 1.272381597868587e-05, - 5.746937644959992e-04, 3.796066898224945e-01, - 4.393468708859825e-01, 1.301804021075493e-01]) - pdf_st = skewt.pdf(x, 5, 10) #args = (df, alpha) + # assert_(np.allclose(cdf_st, cdf_r, rtol=1e-13, atol=1e-15)) + + # noquote(sprintf("%.15e,", dst(c(-2,-1, -0.5,0,1,2), shape=10, df=5))) + pdf_r = np.array( + [ + 2.185448836190663e-07, + 1.272381597868587e-05, + 5.746937644959992e-04, + 3.796066898224945e-01, + 4.393468708859825e-01, + 1.301804021075493e-01, + ] + ) + pdf_st = skewt.pdf(x, 5, 10) # args = (df, alpha) assert_(np.allclose(pdf_st, pdf_r, rtol=1e-13, atol=1e-25)) - #noquote(sprintf("%.15e,", pst(c(-2,-1, -0.5,0,1,2), shape=10, df=5))) - cdf_r = np.array([8.822783669199699e-08, 2.638467463775795e-06, - 6.573106017198583e-05, 3.172551743055352e-02, - 6.367851708183412e-01, 8.980606093979784e-01]) - cdf_st = skewt.cdf(x, 5, 10) #args = (df, alpha) + # noquote(sprintf("%.15e,", pst(c(-2,-1, -0.5,0,1,2), shape=10, df=5))) + cdf_r = np.array( + [ + 8.822783669199699e-08, + 2.638467463775795e-06, + 6.573106017198583e-05, + 3.172551743055352e-02, + 6.367851708183412e-01, + 8.980606093979784e-01, + ] + ) + cdf_st = skewt.cdf(x, 5, 10) # args = (df, alpha) assert_(np.allclose(cdf_st, cdf_r, rtol=1e-10, atol=0)) - - #noquote(sprintf("%.15e,", dst(c(-2,-1, -0.5,0,1,2), shape=10, df=1))) - pdf_r = np.array([3.941955996757291e-04, 1.568067236862745e-03, - 6.136996029432048e-03, 3.183098861837907e-01, - 3.167418189469279e-01, 1.269297588738406e-01]) - pdf_st = skewt.pdf(x, 1, 10) #args = (df, alpha) = (1, 10)) + # noquote(sprintf("%.15e,", dst(c(-2,-1, -0.5,0,1,2), shape=10, df=1))) + pdf_r = np.array( + [ + 3.941955996757291e-04, + 1.568067236862745e-03, + 6.136996029432048e-03, + 3.183098861837907e-01, + 3.167418189469279e-01, + 1.269297588738406e-01, + ] + ) + pdf_st = skewt.pdf(x, 1, 10) # args = (df, alpha) = (1, 10)) assert_(np.allclose(pdf_st, pdf_r, rtol=1e-13, atol=1e-25)) - #noquote(sprintf("%.15e,", pst(c(-2,-1, -0.5,0,1,2), shape=10, df=1))) - cdf_r = np.array([7.893671370544414e-04, 1.575817262600422e-03, - 3.128720749105560e-03, 3.172551743055351e-02, 
- 5.015758172626005e-01, 7.056221318361879e-01]) - cdf_st = skewt.cdf(x, 1, 10) #args = (df, alpha) = (1, 10) + # noquote(sprintf("%.15e,", pst(c(-2,-1, -0.5,0,1,2), shape=10, df=1))) + cdf_r = np.array( + [ + 7.893671370544414e-04, + 1.575817262600422e-03, + 3.128720749105560e-03, + 3.172551743055351e-02, + 5.015758172626005e-01, + 7.056221318361879e-01, + ] + ) + cdf_st = skewt.cdf(x, 1, 10) # args = (df, alpha) = (1, 10) assert_(np.allclose(cdf_st, cdf_r, rtol=1e-13, atol=1e-25)) - -if __name__ == '__main__': +if __name__ == "__main__": import pytest - pytest.main([__file__, '-vvs', '-x', '--pdb']) - print('Done') + + pytest.main([__file__, "-vvs", "-x", "--pdb"]) + print("Done") -''' +""" >>> skewt.pdf([-2,-1,0,1,2], 10000000, 10) array([ 2.98557345e-90, 3.68850289e-24, 3.98942271e-01, 4.83941426e-01, 1.07981952e-01]) >>> skewt.pdf([-2,-1,0,1,2], np.inf, 10) array([ nan, nan, nan, nan, nan]) -''' +""" diff --git a/statsmodels/sandbox/distributions/tests/test_gof_new.py b/statsmodels/sandbox/distributions/tests/test_gof_new.py index 8fc3b2bf809..4156c2e284e 100644 --- a/statsmodels/sandbox/distributions/tests/test_gof_new.py +++ b/statsmodels/sandbox/distributions/tests/test_gof_new.py @@ -1,7 +1,7 @@ import numpy as np from numpy.testing import assert_array_almost_equal -from statsmodels.sandbox.distributions.gof_new import bootstrap, NewNorm +from statsmodels.sandbox.distributions.gof_new import NewNorm, bootstrap def test_loop_vectorized_batch_equivalence(): @@ -9,18 +9,35 @@ def test_loop_vectorized_batch_equivalence(): nobs = 200 np.random.seed(8765679) - resu1 = bootstrap(NewNorm(), args=(0, 1), nobs=nobs, nrep=100, - value=0.576/(1 + 4./nobs - 25./nobs**2)) + resu1 = bootstrap( + NewNorm(), + args=(0, 1), + nobs=nobs, + nrep=100, + value=0.576 / (1 + 4.0 / nobs - 25.0 / nobs**2), + ) np.random.seed(8765679) - tmp = [bootstrap(NewNorm(), args=(0, 1), nobs=nobs, nrep=1) - for _ in range(100)] - resu2 = (np.array(tmp) > 0.576/(1 + 4./nobs - 25./nobs**2)).mean() + tmp = [ + bootstrap(NewNorm(), args=(0, 1), nobs=nobs, nrep=1) + for _ in range(100) + ] + resu2 = ( + np.array(tmp) > 0.576 / (1 + 4.0 / nobs - 25.0 / nobs**2) + ).mean() np.random.seed(8765679) - tmp = [bootstrap(NewNorm(), args=(0, 1), nobs=nobs, nrep=1, - value=0.576/(1 + 4./nobs - 25./nobs**2), - batch_size=10) for _ in range(10)] + tmp = [ + bootstrap( + NewNorm(), + args=(0, 1), + nobs=nobs, + nrep=1, + value=0.576 / (1 + 4.0 / nobs - 25.0 / nobs**2), + batch_size=10, + ) + for _ in range(10) + ] resu3 = np.array(tmp).mean() assert_array_almost_equal(resu1, resu2, 15) diff --git a/statsmodels/sandbox/distributions/tests/test_multivariate.py b/statsmodels/sandbox/distributions/tests/test_multivariate.py index 7cc6d4a2c6c..2a859b28607 100644 --- a/statsmodels/sandbox/distributions/tests/test_multivariate.py +++ b/statsmodels/sandbox/distributions/tests/test_multivariate.py @@ -4,58 +4,65 @@ @author: Josef Perktold """ import numpy as np -from numpy.testing import assert_almost_equal, assert_allclose +from numpy.testing import assert_allclose, assert_almost_equal from statsmodels.sandbox.distributions.multivariate import ( - mvstdtprob, mvstdnormcdf) + mvstdnormcdf, + mvstdtprob, +) from statsmodels.sandbox.distributions.mv_normal import MVT, MVNormal class Test_MVN_MVT_prob(object): - #test for block integratal, cdf, of multivariate t and normal - #comparison results from R + # test for block integratal, cdf, of multivariate t and normal + # comparison results from R @classmethod def setup_class(cls): - cls.corr_equal 
= np.asarray([[1.0, 0.5, 0.5],[0.5,1,0.5],[0.5,0.5,1]]) + cls.corr_equal = np.asarray( + [[1.0, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]] + ) cls.a = -1 * np.ones(3) cls.b = 3 * np.ones(3) cls.df = 4 corr2 = cls.corr_equal.copy() - corr2[2,1] = -0.5 + corr2[2, 1] = -0.5 cls.corr2 = corr2 def test_mvn_mvt_1(self): a, b = self.a, self.b df = self.df corr_equal = self.corr_equal - #result from R, mvtnorm with option - #algorithm = GenzBretz(maxpts = 100000, abseps = 0.000001, releps = 0) + # result from R, mvtnorm with option + # algorithm = GenzBretz(maxpts = 100000, abseps = 0.000001, releps = 0) # or higher - probmvt_R = 0.60414 #report, ed error approx. 7.5e-06 - probmvn_R = 0.673970 #reported error approx. 6.4e-07 + probmvt_R = 0.60414 # report, ed error approx. 7.5e-06 + probmvn_R = 0.673970 # reported error approx. 6.4e-07 assert_almost_equal(probmvt_R, mvstdtprob(a, b, corr_equal, df), 4) - assert_almost_equal(probmvn_R, - mvstdnormcdf(a, b, corr_equal, abseps=1e-5), 4) + assert_almost_equal( + probmvn_R, mvstdnormcdf(a, b, corr_equal, abseps=1e-5), 4 + ) mvn_high = mvstdnormcdf(a, b, corr_equal, abseps=1e-8, maxpts=10000000) assert_almost_equal(probmvn_R, mvn_high, 5) - #this still barely fails sometimes at 6 why?? error is -7.2627419411830374e-007 - #>>> 0.67396999999999996 - 0.67397072627419408 - #-7.2627419411830374e-007 - #>>> assert_almost_equal(0.67396999999999996, 0.67397072627419408, 6) - #Fail + # this still barely fails sometimes at 6 why?? error is -7.2627419411830374e-007 + # >>> 0.67396999999999996 - 0.67397072627419408 + # -7.2627419411830374e-007 + # >>> assert_almost_equal(0.67396999999999996, 0.67397072627419408, 6) + # Fail def test_mvn_mvt_2(self): a, b = self.a, self.b df = self.df corr2 = self.corr2 - probmvn_R = 0.6472497 #reported error approx. 7.7e-08 - probmvt_R = 0.5881863 #highest reported error up to approx. 1.99e-06 + probmvn_R = 0.6472497 # reported error approx. 7.7e-08 + probmvt_R = 0.5881863 # highest reported error up to approx. 1.99e-06 assert_almost_equal(probmvt_R, mvstdtprob(a, b, corr2, df), 4) - assert_almost_equal(probmvn_R, mvstdnormcdf(a, b, corr2, abseps=1e-5), 4) + assert_almost_equal( + probmvn_R, mvstdnormcdf(a, b, corr2, abseps=1e-5), 4 + ) def test_mvn_mvt_3(self): a, b = self.a, self.b @@ -68,7 +75,7 @@ def test_mvn_mvt_3(self): probmvn_R = 0.9961141 # using higher precision in R, error approx. 1.6e-07 probmvt_R = 0.9522146 - quadkwds = {'epsabs': 1e-08} + quadkwds = {"epsabs": 1e-08} probmvt = mvstdtprob(a2, b, corr2, df, quadkwds=quadkwds) assert_allclose(probmvt_R, probmvt, atol=5e-4) probmvn = mvstdnormcdf(a2, b, corr2, maxpts=100000, abseps=1e-5) @@ -79,47 +86,56 @@ def test_mvn_mvt_4(self): df = self.df corr2 = self.corr2 - #from 0 to inf - #print '0 inf' + # from 0 to inf + # print '0 inf' a2 = a.copy() a2[:] = -np.inf - probmvn_R = 0.1666667 #error approx. 6.1e-08 - probmvt_R = 0.1666667 #error approx. 8.2e-08 - assert_almost_equal(probmvt_R, mvstdtprob(np.zeros(3), -a2, corr2, df), 4) - assert_almost_equal(probmvn_R, - mvstdnormcdf(np.zeros(3), -a2, corr2, - maxpts=100000, abseps=1e-5), 4) + probmvn_R = 0.1666667 # error approx. 6.1e-08 + probmvt_R = 0.1666667 # error approx. 
8.2e-08 + assert_almost_equal( + probmvt_R, mvstdtprob(np.zeros(3), -a2, corr2, df), 4 + ) + assert_almost_equal( + probmvn_R, + mvstdnormcdf(np.zeros(3), -a2, corr2, maxpts=100000, abseps=1e-5), + 4, + ) def test_mvn_mvt_5(self): a, bl = self.a, self.b df = self.df corr2 = self.corr2 - #unequal integration bounds - #print "ue" + # unequal integration bounds + # print "ue" a3 = np.array([0.5, -0.5, 0.5]) - probmvn_R = 0.06910487 #using higher precision in R, error approx. 3.5e-08 - probmvt_R = 0.05797867 #using higher precision in R, error approx. 5.8e-08 - assert_almost_equal(mvstdtprob(a3, a3+1, corr2, df), probmvt_R, 4) - assert_almost_equal(probmvn_R, mvstdnormcdf(a3, a3+1, corr2, - maxpts=100000, abseps=1e-5), 4) + probmvn_R = ( + 0.06910487 # using higher precision in R, error approx. 3.5e-08 + ) + probmvt_R = ( + 0.05797867 # using higher precision in R, error approx. 5.8e-08 + ) + assert_almost_equal(mvstdtprob(a3, a3 + 1, corr2, df), probmvt_R, 4) + assert_almost_equal( + probmvn_R, + mvstdnormcdf(a3, a3 + 1, corr2, maxpts=100000, abseps=1e-5), + 4, + ) class TestMVDistributions(object): - #this is not well organized + # this is not well organized @classmethod def setup_class(cls): covx = np.array([[1.0, 0.5], [0.5, 1.0]]) - mu3 = [-1, 0., 2.] - cov3 = np.array([[ 1. , 0.5 , 0.75], - [ 0.5 , 1.5 , 0.6 ], - [ 0.75, 0.6 , 2. ]]) + mu3 = [-1, 0.0, 2.0] + cov3 = np.array([[1.0, 0.5, 0.75], [0.5, 1.5, 0.6], [0.75, 0.6, 2.0]]) cls.mu3 = mu3 cls.cov3 = cov3 mvn3 = MVNormal(mu3, cov3) - mvn3c = MVNormal(np.array([0,0,0]), cov3) + mvn3c = MVNormal(np.array([0, 0, 0]), cov3) cls.mvn3 = mvn3 cls.mvn3c = mvn3c @@ -127,20 +143,18 @@ def test_mvn_pdf(self): cov3 = self.cov3 mvn3 = self.mvn3 - r_val = [ - -7.667977543898155, -6.917977543898155, -5.167977543898155 - ] + r_val = [-7.667977543898155, -6.917977543898155, -5.167977543898155] assert_allclose(mvn3.logpdf(cov3), r_val, rtol=1e-13) r_val = [ - 0.000467562492721686, 0.000989829804859273, 0.005696077243833402 + 0.000467562492721686, + 0.000989829804859273, + 0.005696077243833402, ] assert_allclose(mvn3.pdf(cov3), r_val, rtol=1e-13) mvn3b = MVNormal(np.array([0, 0, 0]), cov3) - r_val = [ - 0.02914269740502042, 0.02269635555984291, 0.01767593948287269 - ] + r_val = [0.02914269740502042, 0.02269635555984291, 0.01767593948287269] assert_allclose(mvn3b.pdf(cov3), r_val, rtol=1e-13) def test_mvt_pdf(self, reset_randomstate): @@ -148,33 +162,41 @@ def test_mvt_pdf(self, reset_randomstate): mu3 = self.mu3 mvt = MVT((0, 0), 1, 5) - assert_almost_equal(mvt.logpdf(np.array([0., 0.])), -1.837877066409345, - decimal=15) - assert_almost_equal(mvt.pdf(np.array([0., 0.])), 0.1591549430918953, - decimal=15) + assert_almost_equal( + mvt.logpdf(np.array([0.0, 0.0])), -1.837877066409345, decimal=15 + ) + assert_almost_equal( + mvt.pdf(np.array([0.0, 0.0])), 0.1591549430918953, decimal=15 + ) - mvt.logpdf(np.array([1., 1.])) - (-3.01552989458359) + mvt.logpdf(np.array([1.0, 1.0])) - (-3.01552989458359) mvt1 = MVT((0, 0), 1, 1) - mvt1.logpdf(np.array([1., 1.])) - (-3.48579549941151) # decimal=16 + mvt1.logpdf(np.array([1.0, 1.0])) - (-3.48579549941151) # decimal=16 rvs = mvt.rvs(100000) assert_almost_equal(np.cov(rvs, rowvar=False), mvt.cov, decimal=1) mvt31 = MVT(mu3, cov3, 1) - assert_almost_equal(mvt31.pdf(cov3), - [0.0007276818698165781, 0.0009980625182293658, - 0.0027661422056214652], - decimal=17) + assert_almost_equal( + mvt31.pdf(cov3), + [ + 0.0007276818698165781, + 0.0009980625182293658, + 0.0027661422056214652, + ], + decimal=17, + ) mvt = MVT(mu3, 
cov3, 3) - assert_almost_equal(mvt.pdf(cov3), - [0.000863777424247410, 0.001277510788307594, - 0.004156314279452241], - decimal=17) + assert_almost_equal( + mvt.pdf(cov3), + [0.000863777424247410, 0.001277510788307594, 0.004156314279452241], + decimal=17, + ) -if __name__ == '__main__': +if __name__ == "__main__": import pytest - pytest.main([__file__, '-vvs', '-x', '--pdb']) + pytest.main([__file__, "-vvs", "-x", "--pdb"]) diff --git a/statsmodels/sandbox/distributions/tests/test_norm_expan.py b/statsmodels/sandbox/distributions/tests/test_norm_expan.py index ad4e7010c3c..5f24804e52c 100644 --- a/statsmodels/sandbox/distributions/tests/test_norm_expan.py +++ b/statsmodels/sandbox/distributions/tests/test_norm_expan.py @@ -8,17 +8,15 @@ Author: Josef Perktold """ -import pytest import numpy as np -from scipy import stats - from numpy.testing import assert_allclose, assert_array_less +import pytest +from scipy import stats from statsmodels.sandbox.distributions.extras import NormExpan_gen class CheckDistribution(object): - @pytest.mark.smoke def test_dist1(self): self.dist1.rvs(size=10) @@ -35,27 +33,27 @@ def test_cdf_ppf_roundtrip(self): class CheckExpandNorm(CheckDistribution): - def test_pdf(self): - scale = getattr(self, 'scale', 1) + scale = getattr(self, "scale", 1) x = np.linspace(-4, 4, 11) * scale pdf2 = self.dist2.pdf(x) pdf1 = self.dist1.pdf(x) - atol_pdf = getattr(self, 'atol_pdf', 0) - assert_allclose(((pdf2 - pdf1)**2).mean(), 0, rtol=1e-6, atol=atol_pdf) + atol_pdf = getattr(self, "atol_pdf", 0) + assert_allclose( + ((pdf2 - pdf1) ** 2).mean(), 0, rtol=1e-6, atol=atol_pdf + ) assert_allclose(pdf2, pdf1, rtol=1e-6, atol=atol_pdf) def test_mvsk(self): - #compare defining mvsk with numerical integration, generic stats + # compare defining mvsk with numerical integration, generic stats mvsk2 = self.dist2.mvsk - mvsk1 = self.dist2.stats(moments='mvsk') + mvsk1 = self.dist2.stats(moments="mvsk") assert_allclose(mvsk2, mvsk1, rtol=1e-6, atol=1e-13) # check mvsk that was used to generate distribution assert_allclose(self.dist2.mvsk, self.mvsk, rtol=1e-12) - class TestExpandNormMom(CheckExpandNorm): # compare with normal, skew=0, excess_kurtosis=0 @@ -63,8 +61,8 @@ class TestExpandNormMom(CheckExpandNorm): def setup_class(kls): kls.scale = 2 kls.dist1 = stats.norm(1, 2) - kls.mvsk = [1., 2**2, 0, 0] - kls.dist2 = NormExpan_gen(kls.mvsk, mode='mvsk') + kls.mvsk = [1.0, 2**2, 0, 0] + kls.dist2 = NormExpan_gen(kls.mvsk, mode="mvsk") class TestExpandNormSample(object): @@ -76,10 +74,10 @@ def setup_class(kls): kls.dist1 = dist1 = stats.norm(1, 2) np.random.seed(5999) kls.rvs = dist1.rvs(size=200) - #rvs = np.concatenate([rvs, -rvs]) + # rvs = np.concatenate([rvs, -rvs]) # fix mean and std of sample - #rvs = (rvs - rvs.mean())/rvs.std(ddof=1) * np.sqrt(2) + 1 - kls.dist2 = NormExpan_gen(kls.rvs, mode='sample') + # rvs = (rvs - rvs.mean())/rvs.std(ddof=1) * np.sqrt(2) + 1 + kls.dist2 = NormExpan_gen(kls.rvs, mode="sample") kls.scale = 2 kls.atol_pdf = 1e-3 diff --git a/statsmodels/sandbox/distributions/tests/test_transf.py b/statsmodels/sandbox/distributions/tests/test_transf.py index 60348f3bd98..7b8bdaf29b9 100644 --- a/statsmodels/sandbox/distributions/tests/test_transf.py +++ b/statsmodels/sandbox/distributions/tests/test_transf.py @@ -15,165 +15,207 @@ the best which can be obtained. array(2981.0032380193438) """ -import warnings # for silencing, see above... +import warnings # for silencing, see above... 
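# A standalone restatement of the TestExpandNormMom check above, assuming the
# sandbox NormExpan_gen API used in this patch (mode="mvsk"): with skew = 0
# and excess kurtosis = 0 the Gram-Charlier expansion must collapse to the
# plain normal density.
import numpy as np
from scipy import stats
from statsmodels.sandbox.distributions.extras import NormExpan_gen

mvsk = [1.0, 4.0, 0.0, 0.0]  # mean, variance, skew, excess kurtosis
dist2 = NormExpan_gen(mvsk, mode="mvsk")
x = np.linspace(-8.0, 8.0, 11)
np.testing.assert_allclose(
    dist2.pdf(x), stats.norm.pdf(x, loc=1.0, scale=2.0), rtol=1e-6, atol=1e-12
)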
+ import numpy as np from numpy.testing import assert_almost_equal -from scipy import stats, special -from statsmodels.sandbox.distributions.extras import ( - squarenormalg, absnormalg, negsquarenormalg, squaretg) +from scipy import special, stats +from statsmodels.sandbox.distributions.extras import ( + absnormalg, + negsquarenormalg, + squarenormalg, + squaretg, +) # some patches to scipy.stats.distributions so tests work and pass # this should be necessary only for older scipy -#patch frozen distributions with a name +# patch frozen distributions with a name stats.distributions.rv_frozen.name = property(lambda self: self.dist.name) -#patch f distribution, correct skew and maybe kurtosis +# patch f distribution, correct skew and maybe kurtosis def f_stats(self, dfn, dfd): arr, where, inf, sqrt, nan = np.array, np.where, np.inf, np.sqrt, np.nan - v2 = arr(dfd*1.0) - v1 = arr(dfn*1.0) + v2 = arr(dfd * 1.0) + v1 = arr(dfn * 1.0) mu = where(v2 > 2, v2 / arr(v2 - 2), inf) - mu2 = 2*v2*v2*(v2+v1-2)/(v1*(v2-2)**2 * (v2-4)) + mu2 = 2 * v2 * v2 * (v2 + v1 - 2) / (v1 * (v2 - 2) ** 2 * (v2 - 4)) mu2 = where(v2 > 4, mu2, inf) - #g1 = 2*(v2+2*v1-2)/(v2-6)*sqrt((2*v2-4)/(v1*(v2+v1-2))) - g1 = 2*(v2+2*v1-2.)/(v2-6.)*np.sqrt(2*(v2-4.)/(v1*(v2+v1-2.))) + # g1 = 2*(v2+2*v1-2)/(v2-6)*sqrt((2*v2-4)/(v1*(v2+v1-2))) + g1 = ( + 2 + * (v2 + 2 * v1 - 2.0) + / (v2 - 6.0) + * np.sqrt(2 * (v2 - 4.0) / (v1 * (v2 + v1 - 2.0))) + ) g1 = where(v2 > 6, g1, nan) - #g2 = 3/(2*v2-16)*(8+g1*g1*(v2-6)) - g2 = 3/(2.*v2-16)*(8+g1*g1*(v2-6.)) + # g2 = 3/(2*v2-16)*(8+g1*g1*(v2-6)) + g2 = 3 / (2.0 * v2 - 16) * (8 + g1 * g1 * (v2 - 6.0)) g2 = where(v2 > 8, g2, nan) return mu, mu2, g1, g2 -#stats.distributions.f_gen._stats = f_stats + +# stats.distributions.f_gen._stats = f_stats stats.f.__class__._stats = f_stats -#correct kurtosis by subtracting 3 (Fisher) -#after this it matches halfnorm for arg close to zero +# correct kurtosis by subtracting 3 (Fisher) +# after this it matches halfnorm for arg close to zero def foldnorm_stats(self, c): arr, where, inf, sqrt, nan = np.array, np.where, np.inf, np.sqrt, np.nan exp = np.exp pi = np.pi - fac = special.erf(c/sqrt(2)) - mu = sqrt(2.0/pi)*exp(-0.5*c*c)+c*fac - mu2 = c*c + 1 - mu*mu - c2 = c*c - g1 = sqrt(2/pi)*exp(-1.5*c2)*(4-pi*exp(c2)*(2*c2+1.0)) - g1 += 2*c*fac*(6*exp(-c2) + 3*sqrt(2*pi)*c*exp(-c2/2.0)*fac + \ - pi*c*(fac*fac-1)) - g1 /= pi*mu2**1.5 - - g2 = c2*c2+6*c2+3+6*(c2+1)*mu*mu - 3*mu**4 - g2 -= 4*exp(-c2/2.0)*mu*(sqrt(2.0/pi)*(c2+2)+c*(c2+3)*exp(c2/2.0)*fac) + fac = special.erf(c / sqrt(2)) + mu = sqrt(2.0 / pi) * exp(-0.5 * c * c) + c * fac + mu2 = c * c + 1 - mu * mu + c2 = c * c + g1 = sqrt(2 / pi) * exp(-1.5 * c2) * (4 - pi * exp(c2) * (2 * c2 + 1.0)) + g1 += ( + 2 + * c + * fac + * ( + 6 * exp(-c2) + + 3 * sqrt(2 * pi) * c * exp(-c2 / 2.0) * fac + + pi * c * (fac * fac - 1) + ) + ) + g1 /= pi * mu2**1.5 + + g2 = c2 * c2 + 6 * c2 + 3 + 6 * (c2 + 1) * mu * mu - 3 * mu**4 + g2 -= ( + 4 + * exp(-c2 / 2.0) + * mu + * (sqrt(2.0 / pi) * (c2 + 2) + c * (c2 + 3) * exp(c2 / 2.0) * fac) + ) g2 /= mu2**2.0 - g2 -= 3. 
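# A quick numerical check of the closed-form F-distribution moments used in
# the f_stats patch above (the mean requires dfd > 2, the variance dfd > 4);
# scipy's built-in stats should agree to machine precision.
from scipy import stats

dfn, dfd = 5.0, 12.0
mu = dfd / (dfd - 2.0)
mu2 = 2.0 * dfd**2 * (dfn + dfd - 2.0) / (dfn * (dfd - 2.0) ** 2 * (dfd - 4.0))
m, v = stats.f.stats(dfn, dfd, moments="mv")
assert abs(m - mu) < 1e-10 and abs(v - mu2) < 1e-10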
+ g2 -= 3.0 return mu, mu2, g1, g2 -#stats.distributions.foldnorm_gen._stats = foldnorm_stats + +# stats.distributions.foldnorm_gen._stats = foldnorm_stats stats.foldnorm.__class__._stats = foldnorm_stats -#----------------------------- +# ----------------------------- DECIMAL = 5 -class Test_Transf2(object): +class Test_Transf2(object): @classmethod def setup_class(cls): cls.dist_equivalents = [ - #transf, stats.lognorm(1)) - #The below fails on the SPARC box with scipy 10.1 - #(lognormalg, stats.lognorm(1)), - #transf2 + # transf, stats.lognorm(1)) + # The below fails on the SPARC box with scipy 10.1 + # (lognormalg, stats.lognorm(1)), + # transf2 (squarenormalg, stats.chi2(1)), (absnormalg, stats.halfnorm), - (absnormalg, stats.foldnorm(1e-5)), #try frozen - #(negsquarenormalg, 1-stats.chi2), # will not work as distribution - (squaretg(10), stats.f(1, 10)) - ] #try both frozen + (absnormalg, stats.foldnorm(1e-5)), # try frozen + # (negsquarenormalg, 1-stats.chi2), # will not work as distribution + (squaretg(10), stats.f(1, 10)), + ] # try both frozen - l,s = 0.0, 1.0 - cls.ppfq = [0.1,0.5,0.9] - cls.xx = [0.95,1.0,1.1] - cls.nxx = [-0.95,-1.0,-1.1] + l, s = 0.0, 1.0 + cls.ppfq = [0.1, 0.5, 0.9] + cls.xx = [0.95, 1.0, 1.1] + cls.nxx = [-0.95, -1.0, -1.1] def test_equivalent(self): xx, ppfq = self.xx, self.ppfq - for d1,d2 in self.dist_equivalents: -## print d1.name - assert_almost_equal(d1.cdf(xx), d2.cdf(xx), err_msg='cdf'+d1.name) - assert_almost_equal(d1.pdf(xx), d2.pdf(xx), - err_msg='pdf '+d1.name+d2.name) - assert_almost_equal(d1.sf(xx), d2.sf(xx), - err_msg='sf '+d1.name+d2.name) - assert_almost_equal(d1.ppf(ppfq), d2.ppf(ppfq), - err_msg='ppq '+d1.name+d2.name) - assert_almost_equal(d1.isf(ppfq), d2.isf(ppfq), - err_msg='isf '+d1.name+d2.name) + for d1, d2 in self.dist_equivalents: + ## print d1.name + assert_almost_equal( + d1.cdf(xx), d2.cdf(xx), err_msg="cdf" + d1.name + ) + assert_almost_equal( + d1.pdf(xx), d2.pdf(xx), err_msg="pdf " + d1.name + d2.name + ) + assert_almost_equal( + d1.sf(xx), d2.sf(xx), err_msg="sf " + d1.name + d2.name + ) + assert_almost_equal( + d1.ppf(ppfq), d2.ppf(ppfq), err_msg="ppq " + d1.name + d2.name + ) + assert_almost_equal( + d1.isf(ppfq), d2.isf(ppfq), err_msg="isf " + d1.name + d2.name + ) self.d1 = d1 self.d2 = d2 -## print d1, d2 -## print d1.moment(3) -## print d2.moment(3) - #work around bug#1293 - if hasattr(d2, 'dist'): + ## print d1, d2 + ## print d1.moment(3) + ## print d2.moment(3) + # work around bug#1293 + if hasattr(d2, "dist"): d2mom = d2.dist.moment(3, *d2.args) else: d2mom = d2.moment(3) - assert_almost_equal(d1.moment(3), d2mom, - DECIMAL, - err_msg='moment '+d1.name+d2.name) + assert_almost_equal( + d1.moment(3), + d2mom, + DECIMAL, + err_msg="moment " + d1.name + d2.name, + ) # silence warnings in scipy, works for versions # after print changed to warning in scipy orig_filter = warnings.filters[:] - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") try: - s1 = d1.stats(moments='mvsk') - s2 = d2.stats(moments='mvsk') + s1 = d1.stats(moments="mvsk") + s2 = d2.stats(moments="mvsk") finally: warnings.filters = orig_filter - #stats(moments='k') prints warning for lognormalg - assert_almost_equal(s1[:2], s2[:2], - err_msg='stats '+d1.name+d2.name) - assert_almost_equal(s1[2:], s2[2:], - decimal=2, #lognorm for kurtosis - err_msg='stats '+d1.name+d2.name) - - + # stats(moments='k') prints warning for lognormalg + assert_almost_equal( + s1[:2], s2[:2], err_msg="stats " + d1.name + d2.name + ) + assert_almost_equal( + 
s1[2:], + s2[2:], + decimal=2, # lognorm for kurtosis + err_msg="stats " + d1.name + d2.name, + ) def test_equivalent_negsq(self): - #special case negsquarenormalg - #negsquarenormalg.cdf(x) == stats.chi2(1).cdf(-x), for x<=0 + # special case negsquarenormalg + # negsquarenormalg.cdf(x) == stats.chi2(1).cdf(-x), for x<=0 xx, nxx, ppfq = self.xx, self.nxx, self.ppfq - d1,d2 = (negsquarenormalg, stats.chi2(1)) - #print d1.name - assert_almost_equal(d1.cdf(nxx), 1-d2.cdf(xx), err_msg='cdf'+d1.name) + d1, d2 = (negsquarenormalg, stats.chi2(1)) + # print d1.name + assert_almost_equal( + d1.cdf(nxx), 1 - d2.cdf(xx), err_msg="cdf" + d1.name + ) assert_almost_equal(d1.pdf(nxx), d2.pdf(xx)) - assert_almost_equal(d1.sf(nxx), 1-d2.sf(xx)) + assert_almost_equal(d1.sf(nxx), 1 - d2.sf(xx)) assert_almost_equal(d1.ppf(ppfq), -d2.ppf(ppfq)[::-1]) assert_almost_equal(d1.isf(ppfq), -d2.isf(ppfq)[::-1]) assert_almost_equal(d1.moment(3), -d2.moment(3)) - ch2oddneg = [v*(-1)**(i+1) for i,v in - enumerate(d2.stats(moments='mvsk'))] - assert_almost_equal(d1.stats(moments='mvsk'), ch2oddneg, - err_msg='stats '+d1.name+d2.name) + ch2oddneg = [ + v * (-1) ** (i + 1) for i, v in enumerate(d2.stats(moments="mvsk")) + ] + assert_almost_equal( + d1.stats(moments="mvsk"), + ch2oddneg, + err_msg="stats " + d1.name + d2.name, + ) -if __name__ == '__main__': +if __name__ == "__main__": tt = Test_Transf2() tt.test_equivalent() tt.test_equivalent_negsq() debug = 0 if debug: - print(negsquarenormalg.ppf([0.1,0.5,0.9])) - print(stats.chi2.ppf([0.1,0.5,0.9],1)) + print(negsquarenormalg.ppf([0.1, 0.5, 0.9])) + print(stats.chi2.ppf([0.1, 0.5, 0.9], 1)) print(negsquarenormalg.a) print(negsquarenormalg.b) - print(absnormalg.stats( moments='mvsk')) - print(stats.foldnorm(1e-10).stats( moments='mvsk')) - print(stats.halfnorm.stats( moments='mvsk')) + print(absnormalg.stats(moments="mvsk")) + print(stats.foldnorm(1e-10).stats(moments="mvsk")) + print(stats.halfnorm.stats(moments="mvsk")) diff --git a/statsmodels/sandbox/distributions/transform_functions.py b/statsmodels/sandbox/distributions/transform_functions.py index f8bad5e83f6..4b9dab86ce0 100644 --- a/statsmodels/sandbox/distributions/transform_functions.py +++ b/statsmodels/sandbox/distributions/transform_functions.py @@ -11,24 +11,22 @@ class TransformFunction(object): - def __call__(self, x): self.func(x) - ## Hump and U-shaped functions class SquareFunc(TransformFunction): - '''class to hold quadratic function with inverse function and derivative + """class to hold quadratic function with inverse function and derivative using instance methods instead of class methods, if we want extension to parametrized function - ''' + """ def func(self, x): - return np.power(x, 2.) 
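# A small consistency check for the two-branch square transform defined here.
# Note that TransformFunction.__call__ above does not return self.func(x), so
# the methods are exercised directly; the derivative of the plus branch is
# d/dy sqrt(y) = 1 / (2 sqrt(y)).
import numpy as np
from statsmodels.sandbox.distributions.transform_functions import SquareFunc

sq = SquareFunc()
y = np.array([0.25, 1.0, 4.0])
np.testing.assert_allclose(sq.func(sq.inverseplus(y)), y)   # g(g_+^{-1}(y)) = y
np.testing.assert_allclose(sq.func(sq.inverseminus(y)), y)  # g(g_-^{-1}(y)) = y
np.testing.assert_allclose(sq.derivplus(y), 0.5 / np.sqrt(y))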
+ return np.power(x, 2.0) def inverseplus(self, x): return np.sqrt(x) @@ -37,20 +35,17 @@ def inverseminus(self, x): return 0.0 - np.sqrt(x) def derivplus(self, x): - return 0.5/np.sqrt(x) + return 0.5 / np.sqrt(x) def derivminus(self, x): - return 0.0 - 0.5/np.sqrt(x) - - + return 0.0 - 0.5 / np.sqrt(x) class NegSquareFunc(TransformFunction): - '''negative quadratic function + """negative quadratic function""" - ''' def func(self, x): - return -np.power(x,2) + return -np.power(x, 2) def inverseplus(self, x): return np.sqrt(-x) @@ -59,15 +54,14 @@ def inverseminus(self, x): return 0.0 - np.sqrt(-x) def derivplus(self, x): - return 0.0 - 0.5/np.sqrt(-x) + return 0.0 - 0.5 / np.sqrt(-x) def derivminus(self, x): - return 0.5/np.sqrt(-x) + return 0.5 / np.sqrt(-x) class AbsFunc(TransformFunction): - '''class for absolute value transformation - ''' + """class for absolute value transformation""" def func(self, x): return np.abs(x) @@ -90,7 +84,6 @@ def derivminus(self, x): class LogFunc(TransformFunction): - def func(self, x): return np.log(x) @@ -98,11 +91,10 @@ def inverse(self, y): return np.exp(y) def deriv(self, x): - return 1./x - -class ExpFunc(TransformFunction): + return 1.0 / x +class ExpFunc(TransformFunction): def func(self, x): return np.exp(x) @@ -114,22 +106,20 @@ def deriv(self, x): class BoxCoxNonzeroFunc(TransformFunction): - def __init__(self, lamda): self.lamda = lamda def func(self, x): - return (np.power(x, self.lamda) - 1)/self.lamda + return (np.power(x, self.lamda) - 1) / self.lamda def inverse(self, y): - return (self.lamda * y + 1)/self.lamda + return (self.lamda * y + 1) / self.lamda def deriv(self, x): return np.power(x, self.lamda - 1) class AffineFunc(TransformFunction): - def __init__(self, constant, slope): self.constant = constant self.slope = slope @@ -145,7 +135,6 @@ def deriv(self, x): class ChainFunc(TransformFunction): - def __init__(self, finn, fout): self.finn = finn self.fout = fout @@ -161,28 +150,28 @@ def deriv(self, x): return self.fout.deriv(z) * self.finn.deriv(x) -#def inverse(x): +# def inverse(x): # return np.divide(1.0,x) # -#mux, stdx = 0.05, 0.1 -#mux, stdx = 9.0, 1.0 -#def inversew(x): +# mux, stdx = 0.05, 0.1 +# mux, stdx = 9.0, 1.0 +# def inversew(x): # return 1.0/(1+mux+x*stdx) -#def inversew_inv(x): +# def inversew_inv(x): # return (1.0/x - 1.0 - mux)/stdx #.np.divide(1.0,x)-10 # -#def identit(x): +# def identit(x): # return x -if __name__ == '__main__': +if __name__ == "__main__": absf = AbsFunc() absf.func(5) == 5 absf.func(-5) == 5 absf.inverseplus(5) == 5 absf.inverseminus(5) == -5 - chainf = ChainFunc(AffineFunc(1,2), BoxCoxNonzeroFunc(2)) - print(chainf.func(3.)) - chainf2 = ChainFunc(BoxCoxNonzeroFunc(2), AffineFunc(1,2)) - print(chainf.func(3.)) + chainf = ChainFunc(AffineFunc(1, 2), BoxCoxNonzeroFunc(2)) + print(chainf.func(3.0)) + chainf2 = ChainFunc(BoxCoxNonzeroFunc(2), AffineFunc(1, 2)) + print(chainf.func(3.0)) diff --git a/statsmodels/sandbox/distributions/transformed.py b/statsmodels/sandbox/distributions/transformed.py index 6a5ae2f16be..e192f3855a2 100644 --- a/statsmodels/sandbox/distributions/transformed.py +++ b/statsmodels/sandbox/distributions/transformed.py @@ -1,8 +1,6 @@ - - ## copied from nonlinear_transform_gen.py -''' A class for the distribution of a non-linear monotonic transformation of a continuous random variable +""" A class for the distribution of a non-linear monotonic transformation of a continuous random variable simplest usage: example: create log-gamma distribution, i.e. 
y = log(x), @@ -37,160 +35,197 @@ Author: josef-pktd License: BSD -''' +""" +import numpy as np from scipy import stats from scipy.stats import distributions -import numpy as np def get_u_argskwargs(**kwargs): - #Todo: What's this? wrong spacing, used in Transf_gen TransfTwo_gen - u_kwargs = dict((k.replace('u_','',1),v) for k,v in kwargs.items() - if k.startswith('u_')) - u_args = u_kwargs.pop('u_args',None) + # Todo: What's this? wrong spacing, used in Transf_gen TransfTwo_gen + u_kwargs = dict( + (k.replace("u_", "", 1), v) + for k, v in kwargs.items() + if k.startswith("u_") + ) + u_args = u_kwargs.pop("u_args", None) return u_args, u_kwargs + class Transf_gen(distributions.rv_continuous): - '''a class for non-linear monotonic transformation of a continuous random variable + """a class for non-linear monotonic transformation of a continuous random variable""" - ''' def __init__(self, kls, func, funcinv, *args, **kwargs): - #print(args - #print(kwargs + # print(args + # print(kwargs self.func = func self.funcinv = funcinv - #explicit for self.__dict__.update(kwargs) - #need to set numargs because inspection does not work - self.numargs = kwargs.pop('numargs', 0) - #print(self.numargs - name = kwargs.pop('name','transfdist') - longname = kwargs.pop('longname','Non-linear transformed distribution') - extradoc = kwargs.pop('extradoc',None) - a = kwargs.pop('a', -np.inf) - b = kwargs.pop('b', np.inf) - self.decr = kwargs.pop('decr', False) - #defines whether it is a decreasing (True) - # or increasing (False) monotonic transformation - + # explicit for self.__dict__.update(kwargs) + # need to set numargs because inspection does not work + self.numargs = kwargs.pop("numargs", 0) + # print(self.numargs + name = kwargs.pop("name", "transfdist") + longname = kwargs.pop( + "longname", "Non-linear transformed distribution" + ) + # extradoc = kwargs.pop('extradoc',None) + a = kwargs.pop("a", -np.inf) + b = kwargs.pop("b", np.inf) + self.decr = kwargs.pop("decr", False) + # defines whether it is a decreasing (True) + # or increasing (False) monotonic transformation self.u_args, self.u_kwargs = get_u_argskwargs(**kwargs) - self.kls = kls #(self.u_args, self.u_kwargs) - # possible to freeze the underlying distribution - - super(Transf_gen,self).__init__(a=a, b=b, name = name, - shapes=kls.shapes, - longname = longname, - extradoc = extradoc) - - def _cdf(self,x,*args, **kwargs): - #print(args + self.kls = kls # (self.u_args, self.u_kwargs) + # possible to freeze the underlying distribution + + super(Transf_gen, self).__init__( + a=a, + b=b, + name=name, + shapes=kls.shapes, + longname=longname, + # extradoc = extradoc + ) + + def _cdf(self, x, *args, **kwargs): + # print(args if not self.decr: - return self.kls._cdf(self.funcinv(x),*args, **kwargs) - #note scipy _cdf only take *args not *kwargs + return self.kls._cdf(self.funcinv(x), *args, **kwargs) + # note scipy _cdf only take *args not *kwargs else: - return 1.0 - self.kls._cdf(self.funcinv(x),*args, **kwargs) + return 1.0 - self.kls._cdf(self.funcinv(x), *args, **kwargs) + def _ppf(self, q, *args, **kwargs): if not self.decr: - return self.func(self.kls._ppf(q,*args, **kwargs)) + return self.func(self.kls._ppf(q, *args, **kwargs)) else: - return self.func(self.kls._ppf(1-q,*args, **kwargs)) + return self.func(self.kls._ppf(1 - q, *args, **kwargs)) def inverse(x): - return np.divide(1.0,x) + return np.divide(1.0, x) + mux, stdx = 0.05, 0.1 mux, stdx = 9.0, 1.0 + + def inversew(x): - return 1.0/(1+mux+x*stdx) + return 1.0 / (1 + mux + x * stdx) + 
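# The identity that Transf_gen._cdf encodes, stated directly: for an
# increasing transform y = g(x) of a random variable X, F_Y(y) =
# F_X(g^{-1}(y)), and for a decreasing transform F_Y(y) = 1 - F_X(g^{-1}(y)).
# For g = exp and X standard normal this is scipy's lognorm with s = 1.
import numpy as np
from scipy import stats

y = np.array([0.5, 1.0, 2.0])
np.testing.assert_allclose(
    stats.norm.cdf(np.log(y)),      # F_X(g^{-1}(y))
    stats.lognorm.cdf(y, 1.0),      # scipy reference, shape s = 1
)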
+ def inversew_inv(x): - return (1.0/x - 1.0 - mux)/stdx #.np.divide(1.0,x)-10 + return (1.0 / x - 1.0 - mux) / stdx # .np.divide(1.0,x)-10 + def identit(x): return x -invdnormalg = Transf_gen(stats.norm, inversew, inversew_inv, decr=True, #a=-np.inf, - numargs = 0, name = 'discf', longname = 'normal-based discount factor', - extradoc = '\ndistribution of discount factor y=1/(1+x)) with x N(0.05,0.1**2)') -lognormalg = Transf_gen(stats.norm, np.exp, np.log, - numargs = 2, a=0, name = 'lnnorm', - longname = 'Exp transformed normal', - extradoc = '\ndistribution of y = exp(x), with x standard normal' - 'precision for moment andstats is not very high, 2-3 decimals') +invdnormalg = Transf_gen( + stats.norm, + inversew, + inversew_inv, + decr=True, # a=-np.inf, + numargs=0, + name="discf", + longname="normal-based discount factor", + # extradoc = '\ndistribution of discount factor y=1/(1+x)) with x N(0.05,0.1**2)' +) + +lognormalg = Transf_gen( + stats.norm, + np.exp, + np.log, + numargs=2, + a=0, + name="lnnorm", + longname="Exp transformed normal", + # extradoc = '\ndistribution of y = exp(x), with x standard normal' + # 'precision for moment andstats is not very high, 2-3 decimals' +) loggammaexpg = Transf_gen(stats.gamma, np.log, np.exp, numargs=1) ## copied form nonlinear_transform_short.py -'''univariate distribution of a non-linear monotonic transformation of a +"""univariate distribution of a non-linear monotonic transformation of a random variable -''' +""" + class ExpTransf_gen(distributions.rv_continuous): - '''Distribution based on log/exp transformation + """Distribution based on log/exp transformation the constructor can be called with a distribution class and generates the distribution of the transformed random variable - ''' + """ + def __init__(self, kls, *args, **kwargs): - #print(args - #print(kwargs - #explicit for self.__dict__.update(kwargs) - if 'numargs' in kwargs: - self.numargs = kwargs['numargs'] + # print(args + # print(kwargs + # explicit for self.__dict__.update(kwargs) + if "numargs" in kwargs: + self.numargs = kwargs["numargs"] else: self.numargs = 1 - if 'name' in kwargs: - name = kwargs['name'] + if "name" in kwargs: + name = kwargs["name"] else: - name = 'Log transformed distribution' - if 'a' in kwargs: - a = kwargs['a'] + name = "Log transformed distribution" + if "a" in kwargs: + a = kwargs["a"] else: a = 0 - super(ExpTransf_gen,self).__init__(a=a, name=name) + super(ExpTransf_gen, self).__init__(a=a, name=name) self.kls = kls - def _cdf(self,x,*args): - #print(args - return self.kls._cdf(np.log(x),*args) + + def _cdf(self, x, *args): + # print(args + return self.kls._cdf(np.log(x), *args) + def _ppf(self, q, *args): - return np.exp(self.kls._ppf(q,*args)) + return np.exp(self.kls._ppf(q, *args)) + class LogTransf_gen(distributions.rv_continuous): - '''Distribution based on log/exp transformation + """Distribution based on log/exp transformation the constructor can be called with a distribution class and generates the distribution of the transformed random variable - ''' + """ + def __init__(self, kls, *args, **kwargs): - #explicit for self.__dict__.update(kwargs) - if 'numargs' in kwargs: - self.numargs = kwargs['numargs'] + # explicit for self.__dict__.update(kwargs) + if "numargs" in kwargs: + self.numargs = kwargs["numargs"] else: self.numargs = 1 - if 'name' in kwargs: - name = kwargs['name'] + if "name" in kwargs: + name = kwargs["name"] else: - name = 'Log transformed distribution' - if 'a' in kwargs: - a = kwargs['a'] + name = "Log transformed 
distribution" + if "a" in kwargs: + a = kwargs["a"] else: a = 0 - super(LogTransf_gen,self).__init__(a=a, name = name) + super(LogTransf_gen, self).__init__(a=a, name=name) self.kls = kls - def _cdf(self,x, *args): - #print(args - return self.kls._cdf(np.exp(x),*args) + def _cdf(self, x, *args): + # print(args + return self.kls._cdf(np.exp(x), *args) + def _ppf(self, q, *args): - return np.log(self.kls._ppf(q,*args)) + return np.log(self.kls._ppf(q, *args)) + def examples_transf(): ##lognormal = ExpTransf(a=0.0, xa=-10.0, name = 'Log transformed normal') @@ -200,10 +235,12 @@ def examples_transf(): ##print(stats.lognorm.stats(1) ##print(lognormal.rvs(size=10) - print('Results for lognormal') - lognormalg = ExpTransf_gen(stats.norm, a=0, name = 'Log transformed normal general') + print("Results for lognormal") + lognormalg = ExpTransf_gen( + stats.norm, a=0, name="Log transformed normal general" + ) print(lognormalg.cdf(1)) - print(stats.lognorm.cdf(1,1)) + print(stats.lognorm.cdf(1, 1)) print(lognormalg.stats()) print(stats.lognorm.stats(1)) print(lognormalg.rvs(size=5)) @@ -213,36 +250,33 @@ def examples_transf(): ##print(loggammag._cdf(1,10) ##print(stats.loggamma.cdf(1,10) - print('Results for expgamma') + print("Results for expgamma") loggammaexpg = LogTransf_gen(stats.gamma) - print(loggammaexpg._cdf(1,10)) - print(stats.loggamma.cdf(1,10)) - print(loggammaexpg._cdf(2,15)) - print(stats.loggamma.cdf(2,15)) - + print(loggammaexpg._cdf(1, 10)) + print(stats.loggamma.cdf(1, 10)) + print(loggammaexpg._cdf(2, 15)) + print(stats.loggamma.cdf(2, 15)) # this requires change in scipy.stats.distribution - #print(loggammaexpg.cdf(1,10) + # print(loggammaexpg.cdf(1,10) - print('Results for loglaplace') + print("Results for loglaplace") loglaplaceg = LogTransf_gen(stats.laplace) - print(loglaplaceg._cdf(2,10)) - print(stats.loglaplace.cdf(2,10)) + print(loglaplaceg._cdf(2, 10)) + print(stats.loglaplace.cdf(2, 10)) loglaplaceexpg = ExpTransf_gen(stats.laplace) - print(loglaplaceexpg._cdf(2,10)) - - + print(loglaplaceexpg._cdf(2, 10)) ## copied from transformtwo.py -''' +""" Created on Apr 28, 2009 @author: Josef Perktold -''' +""" -''' A class for the distribution of a non-linear u-shaped or hump shaped transformation of a +""" A class for the distribution of a non-linear u-shaped or hump shaped transformation of a continuous random variable This is a companion to the distributions of non-linear monotonic transformation to the case @@ -270,11 +304,11 @@ def examples_transf(): * add _rvs as method, will be faster in many cases -''' +""" class TransfTwo_gen(distributions.rv_continuous): - '''Distribution based on a non-monotonic (u- or hump-shaped transformation) + """Distribution based on a non-monotonic (u- or hump-shaped transformation) the constructor can be called with a distribution class, and functions that define the non-linear transformation. @@ -287,79 +321,103 @@ class TransfTwo_gen(distributions.rv_continuous): This can be used to generate distribution instances similar to the distributions in scipy.stats. 
- ''' - #a class for non-linear non-monotonic transformation of a continuous random variable - def __init__(self, kls, func, funcinvplus, funcinvminus, derivplus, - derivminus, *args, **kwargs): - #print(args - #print(kwargs + """ + + # a class for non-linear non-monotonic transformation of a continuous random variable + def __init__( + self, + kls, + func, + funcinvplus, + funcinvminus, + derivplus, + derivminus, + *args, + **kwargs + ): + # print(args + # print(kwargs self.func = func self.funcinvplus = funcinvplus self.funcinvminus = funcinvminus self.derivplus = derivplus self.derivminus = derivminus - #explicit for self.__dict__.update(kwargs) - #need to set numargs because inspection does not work - self.numargs = kwargs.pop('numargs', 0) - #print(self.numargs - name = kwargs.pop('name','transfdist') - longname = kwargs.pop('longname','Non-linear transformed distribution') - extradoc = kwargs.pop('extradoc',None) - a = kwargs.pop('a', -np.inf) # attached to self in super - b = kwargs.pop('b', np.inf) # self.a, self.b would be overwritten - self.shape = kwargs.pop('shape', False) - #defines whether it is a `u` shaped or `hump' shaped - # transformation - + # explicit for self.__dict__.update(kwargs) + # need to set numargs because inspection does not work + self.numargs = kwargs.pop("numargs", 0) + # print(self.numargs + name = kwargs.pop("name", "transfdist") + longname = kwargs.pop( + "longname", "Non-linear transformed distribution" + ) + # extradoc = kwargs.pop('extradoc',None) + a = kwargs.pop("a", -np.inf) # attached to self in super + b = kwargs.pop("b", np.inf) # self.a, self.b would be overwritten + self.shape = kwargs.pop("shape", False) + # defines whether it is a `u` shaped or `hump' shaped + # transformation self.u_args, self.u_kwargs = get_u_argskwargs(**kwargs) - self.kls = kls #(self.u_args, self.u_kwargs) - # possible to freeze the underlying distribution - - super(TransfTwo_gen,self).__init__(a=a, b=b, - name = name, - shapes=kls.shapes, - longname = longname, - extradoc = extradoc) + self.kls = kls # (self.u_args, self.u_kwargs) + # possible to freeze the underlying distribution + + super(TransfTwo_gen, self).__init__( + a=a, + b=b, + name=name, + shapes=kls.shapes, + longname=longname, + # extradoc = extradoc + ) def _rvs(self, *args): - self.kls._size = self._size #size attached to self, not function argument + self.kls._size = ( + self._size + ) # size attached to self, not function argument return self.func(self.kls._rvs(*args)) - def _pdf(self,x,*args, **kwargs): - #print(args - if self.shape == 'u': + def _pdf(self, x, *args, **kwargs): + # print(args + if self.shape == "u": signpdf = 1 - elif self.shape == 'hump': + elif self.shape == "hump": signpdf = -1 else: - raise ValueError('shape can only be `u` or `hump`') - - return signpdf * (self.derivplus(x)*self.kls._pdf(self.funcinvplus(x),*args, **kwargs) - - self.derivminus(x)*self.kls._pdf(self.funcinvminus(x),*args, **kwargs)) - #note scipy _cdf only take *args not *kwargs - - def _cdf(self,x,*args, **kwargs): - #print(args - if self.shape == 'u': - return self.kls._cdf(self.funcinvplus(x),*args, **kwargs) - \ - self.kls._cdf(self.funcinvminus(x),*args, **kwargs) - #note scipy _cdf only take *args not *kwargs + raise ValueError("shape can only be `u` or `hump`") + + return signpdf * ( + self.derivplus(x) + * self.kls._pdf(self.funcinvplus(x), *args, **kwargs) + - self.derivminus(x) + * self.kls._pdf(self.funcinvminus(x), *args, **kwargs) + ) + # note scipy _cdf only take *args not *kwargs + + def _cdf(self, 
x, *args, **kwargs): + # print(args + if self.shape == "u": + return self.kls._cdf( + self.funcinvplus(x), *args, **kwargs + ) - self.kls._cdf(self.funcinvminus(x), *args, **kwargs) + # note scipy _cdf only take *args not *kwargs else: - return 1.0 - self._sf(x,*args, **kwargs) - - def _sf(self,x,*args, **kwargs): - #print(args - if self.shape == 'hump': - return self.kls._cdf(self.funcinvplus(x),*args, **kwargs) - \ - self.kls._cdf(self.funcinvminus(x),*args, **kwargs) - #note scipy _cdf only take *args not *kwargs + return 1.0 - self._sf(x, *args, **kwargs) + + def _sf(self, x, *args, **kwargs): + # print(args + if self.shape == "hump": + return self.kls._cdf( + self.funcinvplus(x), *args, **kwargs + ) - self.kls._cdf(self.funcinvminus(x), *args, **kwargs) + # note scipy _cdf only take *args not *kwargs else: return 1.0 - self._cdf(x, *args, **kwargs) - def _munp(self, n,*args, **kwargs): - return self._mom0_sc(n,*args) + def _munp(self, n, *args, **kwargs): + return self._mom0_sc(n, *args) + + # ppf might not be possible in general case? # should be possible in symmetric case # def _ppf(self, q, *args, **kwargs): @@ -368,14 +426,16 @@ def _munp(self, n,*args, **kwargs): # elif self.shape == 'hump': # return self.func(self.kls._ppf(1-q,*args, **kwargs)) -#TODO: rename these functions to have unique names +# TODO: rename these functions to have unique names + class SquareFunc(object): - '''class to hold quadratic function with inverse function and derivative + """class to hold quadratic function with inverse function and derivative using instance methods instead of class methods, if we want extension to parametrized function - ''' + """ + def inverseplus(self, x): return np.sqrt(x) @@ -383,71 +443,124 @@ def inverseminus(self, x): return 0.0 - np.sqrt(x) def derivplus(self, x): - return 0.5/np.sqrt(x) + return 0.5 / np.sqrt(x) def derivminus(self, x): - return 0.0 - 0.5/np.sqrt(x) + return 0.0 - 0.5 / np.sqrt(x) def squarefunc(self, x): - return np.power(x,2) + return np.power(x, 2) + sqfunc = SquareFunc() -squarenormalg = TransfTwo_gen(stats.norm, sqfunc.squarefunc, sqfunc.inverseplus, - sqfunc.inverseminus, sqfunc.derivplus, sqfunc.derivminus, - shape='u', a=0.0, b=np.inf, - numargs = 0, name = 'squarenorm', longname = 'squared normal distribution', - extradoc = '\ndistribution of the square of a normal random variable' +\ - ' y=x**2 with x N(0.0,1)') - #u_loc=l, u_scale=s) -squaretg = TransfTwo_gen(stats.t, sqfunc.squarefunc, sqfunc.inverseplus, - sqfunc.inverseminus, sqfunc.derivplus, sqfunc.derivminus, - shape='u', a=0.0, b=np.inf, - numargs = 1, name = 'squarenorm', longname = 'squared t distribution', - extradoc = '\ndistribution of the square of a t random variable' +\ - ' y=x**2 with x t(dof,0.0,1)') +squarenormalg = TransfTwo_gen( + stats.norm, + sqfunc.squarefunc, + sqfunc.inverseplus, + sqfunc.inverseminus, + sqfunc.derivplus, + sqfunc.derivminus, + shape="u", + a=0.0, + b=np.inf, + numargs=0, + name="squarenorm", + longname="squared normal distribution", + # extradoc = '\ndistribution of the square of a normal random variable' +\ + # ' y=x**2 with x N(0.0,1)' +) +# u_loc=l, u_scale=s) +squaretg = TransfTwo_gen( + stats.t, + sqfunc.squarefunc, + sqfunc.inverseplus, + sqfunc.inverseminus, + sqfunc.derivplus, + sqfunc.derivminus, + shape="u", + a=0.0, + b=np.inf, + numargs=1, + name="squarenorm", + longname="squared t distribution", + # extradoc = '\ndistribution of the square of a t random variable' +\ + # ' y=x**2 with x t(dof,0.0,1)' +) + def inverseplus(x): return 
np.sqrt(-x) + def inverseminus(x): return 0.0 - np.sqrt(-x) + def derivplus(x): - return 0.0 - 0.5/np.sqrt(-x) + return 0.0 - 0.5 / np.sqrt(-x) + def derivminus(x): - return 0.5/np.sqrt(-x) + return 0.5 / np.sqrt(-x) -def negsquarefunc(x): - return -np.power(x,2) +def negsquarefunc(x): + return -np.power(x, 2) + + +negsquarenormalg = TransfTwo_gen( + stats.norm, + negsquarefunc, + inverseplus, + inverseminus, + derivplus, + derivminus, + shape="hump", + a=-np.inf, + b=0.0, + numargs=0, + name="negsquarenorm", + longname="negative squared normal distribution", + # extradoc = '\ndistribution of the negative square of a normal random variable' +\ + # ' y=-x**2 with x N(0.0,1)' +) +# u_loc=l, u_scale=s) -negsquarenormalg = TransfTwo_gen(stats.norm, negsquarefunc, inverseplus, inverseminus, - derivplus, derivminus, shape='hump', a=-np.inf, b=0.0, - numargs = 0, name = 'negsquarenorm', longname = 'negative squared normal distribution', - extradoc = '\ndistribution of the negative square of a normal random variable' +\ - ' y=-x**2 with x N(0.0,1)') - #u_loc=l, u_scale=s) def inverseplus(x): return x + def inverseminus(x): return 0.0 - x + def derivplus(x): return 1.0 + def derivminus(x): return 0.0 - 1.0 + def absfunc(x): return np.abs(x) -absnormalg = TransfTwo_gen(stats.norm, np.abs, inverseplus, inverseminus, - derivplus, derivminus, shape='u', a=0.0, b=np.inf, - numargs = 0, name = 'absnorm', longname = 'absolute of normal distribution', - extradoc = '\ndistribution of the absolute value of a normal random variable' +\ - ' y=abs(x) with x N(0,1)') +absnormalg = TransfTwo_gen( + stats.norm, + np.abs, + inverseplus, + inverseminus, + derivplus, + derivminus, + shape="u", + a=0.0, + b=np.inf, + numargs=0, + name="absnorm", + longname="absolute of normal distribution", + # extradoc = '\ndistribution of the absolute value of a normal random variable' +\ + # ' y=abs(x) with x N(0,1)' +) diff --git a/statsmodels/sandbox/distributions/try_max.py b/statsmodels/sandbox/distributions/try_max.py index 5457325ff7a..c287927adc4 100644 --- a/statsmodels/sandbox/distributions/try_max.py +++ b/statsmodels/sandbox/distributions/try_max.py @@ -1,40 +1,51 @@ -''' +""" adjusted from Denis on pystatsmodels mailing list there might still be problems with loc and scale, -''' +""" from scipy import stats + __date__ = "2010-12-29 dec" + class MaxDist(stats.rv_continuous): - """ max of n of scipy.stats normal expon ... - Example: - maxnormal10 = RVmax( scipy.stats.norm, 10 ) - sample = maxnormal10( size=1000 ) - sample.cdf = cdf ^ n, ppf ^ (1/n) + """max of n of scipy.stats normal expon ... + Example: + maxnormal10 = RVmax( scipy.stats.norm, 10 ) + sample = maxnormal10( size=1000 ) + sample.cdf = cdf ^ n, ppf ^ (1/n) """ - def __init__( self, dist, n ): + + def __init__(self, dist, n): self.dist = dist self.n = n - extradoc = 'maximumdistribution is the distribution of the '\ - + 'maximum of n i.i.d. random variable' - super(MaxDist, self).__init__(name='maxdist', a=dist.a, b=dist.b, - longname = 'A maximumdistribution', extradoc = extradoc) + # extradoc = 'maximumdistribution is the distribution of the '\ + # + 'maximum of n i.i.d. 
random variables' + super(MaxDist, self).__init__( + name="maxdist", + a=dist.a, + b=dist.b, + longname="A maximum distribution", # extradoc = extradoc + ) def _pdf(self, x, *args, **kw): - return self.n * self.dist.pdf(x, *args, **kw) \ - * self.dist.cdf(x, *args, **kw )**(self.n-1) + return ( + self.n + * self.dist.pdf(x, *args, **kw) + * self.dist.cdf(x, *args, **kw) ** (self.n - 1) + ) def _cdf(self, x, *args, **kw): - return self.dist.cdf(x, *args, **kw)**self.n + return self.dist.cdf(x, *args, **kw) ** self.n def _ppf(self, q, *args, **kw): # y = F(x) ^ n <=> x = F-1( y ^ 1/n) - return self.dist.ppf(q**(1./self.n), *args, **kw) + return self.dist.ppf(q ** (1.0 / self.n), *args, **kw) + ## def rvs( self, *args, **kw ): ## size = kw.pop( "size", 1 ) @@ -45,9 +56,9 @@ def _ppf(self, q, *args, **kw): maxdistr = MaxDist(stats.norm, 10) print(maxdistr.rvs(size=10)) -print(maxdistr.stats(moments = 'mvsk')) +print(maxdistr.stats(moments="mvsk")) -''' +""" >>> print maxdistr.stats(moments = 'mvsk') (array(1.5387527308351818), array(0.34434382328492852), array(0.40990510188513779), array(0.33139861783918922)) >>> rvs = np.random.randn(1000,10) @@ -70,4 +81,4 @@ def _ppf(self, q, *args, **kw): 0.99999999999999956 -''' +""" diff --git a/statsmodels/sandbox/distributions/try_pot.py b/statsmodels/sandbox/distributions/try_pot.py index 6a088423b26..dd879f11b9e 100644 --- a/statsmodels/sandbox/distributions/try_pot.py +++ b/statsmodels/sandbox/distributions/try_pot.py @@ -8,7 +8,7 @@ def mean_residual_life(x, frac=None, alpha=0.05): - '''empirical mean residual life or expected shortfall + """empirical mean residual life or expected shortfall Parameters ---------- @@ -24,7 +24,7 @@ def mean_residual_life(x, frac=None, alpha=0.05): last observations std is zero vectorize loop using cumsum frac does not work yet - ''' + """ axis = 0 # searchsorted is 1d only x = np.asarray(x) @@ -35,16 +35,18 @@ def mean_residual_life(x, frac=None, alpha=0.05): else: xthreshold = xsorted[np.floor(nobs * frac).astype(int)] # use searchsorted instead of simple index in case of ties - xlargerindex = np.searchsorted(xsorted, xthreshold, side='right') + xlargerindex = np.searchsorted(xsorted, xthreshold, side="right") # TODO:replace loop with cumsum ?
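    # A hedged sketch of the cumsum idea in the TODO above (untested; it
    # assumes the tail statistics are meant to come from xsorted, while
    # the loop below slices the unsorted x):
    #     tail_sums = np.cumsum(xsorted[::-1])[::-1]
    #     # tail_sums[k] == xsorted[k:].sum(), hence all tail means at once:
    #     rmeans = tail_sums[xlargerindex] / (nobs - xlargerindex)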
result = [] - for i in range(len(xthreshold)-1): + for i in range(len(xthreshold) - 1): k_ind = xlargerindex[i] rmean = x[k_ind:].mean() # this does not work for last observations, nans rstd = x[k_ind:].std() - rmstd = rstd/np.sqrt(nobs-k_ind) # std error of mean, check formula + rmstd = rstd / np.sqrt( + nobs - k_ind + ) # std error of mean, check formula result.append((k_ind, xthreshold[i], rmean, rmstd)) res = np.array(result) diff --git a/statsmodels/sandbox/examples/bayesprior.py b/statsmodels/sandbox/examples/bayesprior.py index c0e48959d58..a192006f2ae 100644 --- a/statsmodels/sandbox/examples/bayesprior.py +++ b/statsmodels/sandbox/examples/bayesprior.py @@ -5,40 +5,47 @@ try: import pymc + pymc_installed = 1 except: print("pymc not imported") pymc_installed = 0 -import numpy as np from matplotlib import pyplot as plt -from scipy import stats, integrate +import numpy as np +from numpy import exp, log +from scipy import integrate, stats +from scipy.special import gammainc, gammaincinv, gammaln from scipy.stats import rv_continuous -from scipy.special import gammaln, gammaincinv, gammainc -from numpy import log,exp -#np.random.seed(12345) +# np.random.seed(12345) + class igamma_gen(rv_continuous): def _pdf(self, x, a, b): - return exp(self._logpdf(x,a,b)) + return exp(self._logpdf(x, a, b)) + def _logpdf(self, x, a, b): - return a*log(b) - gammaln(a) -(a+1)*log(x) - b/x + return a * log(b) - gammaln(a) - (a + 1) * log(x) - b / x + def _cdf(self, x, a, b): - return 1.0-gammainc(a,b/x) # why is this different than the wiki? + return 1.0 - gammainc(a, b / x) # why is this different than the wiki? + def _ppf(self, q, a, b): - return b/gammaincinv(a,1-q) -#NOTE: should be correct, work through invgamma example and 2 param inv gamma -#CDF + return b / gammaincinv(a, 1 - q) + + # NOTE: should be correct, work through invgamma example and 2 param inv gamma + # CDF def _munp(self, n, a, b): - args = (a,b) + args = (a, b) super(igamma_gen, self)._munp(self, n, *args) -#TODO: is this robust for differential entropy in this case? closed form or -#shortcuts in special? + + # TODO: is this robust for differential entropy in this case? closed form or + # shortcuts in special? def _entropy(self, *args): def integ(x): val = self._pdf(x, *args) - return val*log(val) + return val * log(val) entr = -integrate.quad(integ, self.a, self.b)[0] if not np.isnan(entr): @@ -46,31 +53,36 @@ def integ(x): else: raise ValueError("Problem with integration. Returned nan.") -igamma = igamma_gen(a=0.0, name='invgamma', longname="An inverted gamma", - shapes = 'a,b', extradoc=""" -Inverted gamma distribution +igamma = igamma_gen( + a=0.0, + name="invgamma", + longname="An inverted gamma", + shapes="a,b", # extradoc=""" + # + # Inverted gamma distribution + # + # invgamma.pdf(x,a,b) = b**a*x**(-a-1)/gamma(a) * exp(-b/x) + # for x > 0, a > 0, b>0. + # """ +) -invgamma.pdf(x,a,b) = b**a*x**(-a-1)/gamma(a) * exp(-b/x) -for x > 0, a > 0, b>0. -""") - -#NOTE: the above is unnecessary. B takes the same role as the scale parameter +# NOTE: the above is unnecessary. 
B takes the same role as the scale parameter # in inverted gamma -palpha = np.random.gamma(400.,.005, size=10000) -print("First moment: %s\nSecond moment: %s" % (palpha.mean(),palpha.std())) +palpha = np.random.gamma(400.0, 0.005, size=10000) +print("First moment: %s\nSecond moment: %s" % (palpha.mean(), palpha.std())) palpha = palpha[0] -prho = np.random.beta(49.5,49.5, size=1e5) +prho = np.random.beta(49.5, 49.5, size=1e5) print("Beta Distribution") -print("First moment: %s\nSecond moment: %s" % (prho.mean(),prho.std())) +print("First moment: %s\nSecond moment: %s" % (prho.mean(), prho.std())) prho = prho[0] -psigma = igamma.rvs(1.,4.**2/2, size=1e5) +psigma = igamma.rvs(1.0, 4.0**2 / 2, size=1e5) print("Inverse Gamma Distribution") -print("First moment: %s\nSecond moment: %s" % (psigma.mean(),psigma.std())) +print("First moment: %s\nSecond moment: %s" % (psigma.mean(), psigma.std())) # First do the univariate case # y_t = theta_t + epsilon_t @@ -90,38 +102,38 @@ def integ(x): draws = 400 # prior beliefs, from JME paper -mu_, lambda_ = 1.,2. +mu_, lambda_ = 1.0, 2.0 # Model 1 -y1y2 = np.zeros((draws,2)) +y1y2 = np.zeros((draws, 2)) for draw in range(draws): - theta = np.random.normal(mu_,lambda_**2) + theta = np.random.normal(mu_, lambda_**2) y1 = theta + np.random.normal() y2 = theta + np.random.normal() - y1y2[draw] = y1,y2 + y1y2[draw] = y1, y2 # log marginal distribution -lnp1p2_mod1 = stats.norm.pdf(y1,loc=mu_, scale=lambda_**2+1)*\ - stats.norm.pdf(y2,mu_,scale=lambda_**2+1) +lnp1p2_mod1 = stats.norm.pdf( + y1, loc=mu_, scale=lambda_**2 + 1 +) * stats.norm.pdf(y2, mu_, scale=lambda_**2 + 1) # Model 2 -pmu_pairsp1 = np.zeros((draws,2)) -y1y2pairsp1 = np.zeros((draws,2)) +pmu_pairsp1 = np.zeros((draws, 2)) +y1y2pairsp1 = np.zeros((draws, 2)) # prior 1 for draw in range(draws): - theta1 = np.random.uniform(0,1) + theta1 = np.random.uniform(0, 1) theta2 = np.random.normal(mu_, lambda_**2) -# mu = theta2/(1-theta1) -#do not do this to maintain independence theta2 is the _location_ -# y1 = np.random.normal(mu_, lambda_**2) + # mu = theta2/(1-theta1) + # do not do this to maintain independence theta2 is the _location_ + # y1 = np.random.normal(mu_, lambda_**2) y1 = theta2 -# pmu_pairsp1[draw] = mu, theta1 - pmu_pairsp1[draw] = theta2, theta1 # mean, autocorr + # pmu_pairsp1[draw] = mu, theta1 + pmu_pairsp1[draw] = theta2, theta1 # mean, autocorr y2 = theta2 + theta1 * y1 + np.random.normal() - y1y2pairsp1[draw] = y1,y2 - + y1y2pairsp1[draw] = y1, y2 # for a = 0, b = 1 - epsilon = .99999 @@ -129,65 +141,70 @@ def integ(x): # variance is 1./12 * .99999**2 # Model 2 -pmu_pairsp2 = np.zeros((draws,2)) -y1y2pairsp2 = np.zeros((draws,2)) +pmu_pairsp2 = np.zeros((draws, 2)) +y1y2pairsp2 = np.zeros((draws, 2)) # prior 2 theta12_2 = [] for draw in range(draws): -# y1 = np.random.uniform(-4,6) - theta1 = np.random.uniform(0,1) - theta2 = np.random.normal(mu_*(1-theta1), lambda_**2*(1-theta1)**2) - theta12_2.append([theta1,theta2]) - - mu = theta2/(1-theta1) - y1 = np.random.normal(mu_,lambda_**2) + # y1 = np.random.uniform(-4,6) + theta1 = np.random.uniform(0, 1) + theta2 = np.random.normal( + mu_ * (1 - theta1), lambda_**2 * (1 - theta1) ** 2 + ) + theta12_2.append([theta1, theta2]) + + mu = theta2 / (1 - theta1) + y1 = np.random.normal(mu_, lambda_**2) y2 = theta2 + theta1 * y1 + np.random.normal() pmu_pairsp2[draw] = mu, theta1 - y1y2pairsp2[draw] = y1,y2 + y1y2pairsp2[draw] = y1, y2 fig = plt.figure() fsp = fig.add_subplot(221) -fsp.scatter(pmu_pairsp1[:,0], pmu_pairsp1[:,1], color='b', 
facecolor='none') -fsp.set_ylabel('Autocorrelation (Y)') -fsp.set_xlabel('Mean (Y)') -fsp.set_title('Model 2 (P1)') -fsp.axis([-20,20,0,1]) +fsp.scatter(pmu_pairsp1[:, 0], pmu_pairsp1[:, 1], color="b", facecolor="none") +fsp.set_ylabel("Autocorrelation (Y)") +fsp.set_xlabel("Mean (Y)") +fsp.set_title("Model 2 (P1)") +fsp.axis([-20, 20, 0, 1]) fsp = fig.add_subplot(222) -fsp.scatter(pmu_pairsp2[:,0],pmu_pairsp2[:,1], color='b', facecolor='none') -fsp.set_title('Model 2 (P2)') -fsp.set_ylabel('Autocorrelation (Y)') -fsp.set_xlabel('Mean (Y)') -fsp.set_title('Model 2 (P2)') -fsp.axis([-20,20,0,1]) +fsp.scatter(pmu_pairsp2[:, 0], pmu_pairsp2[:, 1], color="b", facecolor="none") +fsp.set_title("Model 2 (P2)") +fsp.set_ylabel("Autocorrelation (Y)") +fsp.set_xlabel("Mean (Y)") +fsp.set_title("Model 2 (P2)") +fsp.axis([-20, 20, 0, 1]) fsp = fig.add_subplot(223) -fsp.scatter(y1y2pairsp1[:,0], y1y2pairsp1[:,1], color='b', marker='o', - facecolor='none') -fsp.scatter(y1y2[:,0], y1y2[:,1], color ='g', marker='+') -fsp.set_title('Model 1 vs. Model 2 (P1)') -fsp.set_ylabel('Y(2)') -fsp.set_xlabel('Y(1)') -fsp.axis([-20,20,-20,20]) +fsp.scatter( + y1y2pairsp1[:, 0], + y1y2pairsp1[:, 1], + color="b", + marker="o", + facecolor="none", +) +fsp.scatter(y1y2[:, 0], y1y2[:, 1], color="g", marker="+") +fsp.set_title("Model 1 vs. Model 2 (P1)") +fsp.set_ylabel("Y(2)") +fsp.set_xlabel("Y(1)") +fsp.axis([-20, 20, -20, 20]) fsp = fig.add_subplot(224) -fsp.scatter(y1y2pairsp2[:,0], y1y2pairsp2[:,1], color='b', marker='o') -fsp.scatter(y1y2[:,0], y1y2[:,1], color='g', marker='+') -fsp.set_title('Model 1 vs. Model 2 (P2)') -fsp.set_ylabel('Y(2)') -fsp.set_xlabel('Y(1)') -fsp.axis([-20,20,-20,20]) +fsp.scatter(y1y2pairsp2[:, 0], y1y2pairsp2[:, 1], color="b", marker="o") +fsp.scatter(y1y2[:, 0], y1y2[:, 1], color="g", marker="+") +fsp.set_title("Model 1 vs. Model 2 (P2)") +fsp.set_ylabel("Y(2)") +fsp.set_xlabel("Y(1)") +fsp.axis([-20, 20, -20, 20]) -#plt.show() +# plt.show() -#TODO: this does not look the same as the working paper? -#NOTE: but it matches the language? I think mine is right! +# TODO: this does not look the same as the working paper? +# NOTE: but it matches the language? I think mine is right! # Contour plots. # on the basis of observed data. ie., the mgrid -#np.mgrid[6:-4:10j,-4:6:10j] - - +# np.mgrid[6:-4:10j,-4:6:10j] # Example 2: @@ -211,30 +228,32 @@ def integ(x): # palpha ~ Gamma(2.00,.10) # mean = 2.00 # std = .1 which implies k = 400, theta = .005 -palpha = np.random.gamma(400,.005) +palpha = np.random.gamma(400, 0.005) # pi ~ Beta(.5,.05) pi = np.random.beta(49.5, 49.5) # psigma ~ InvGamma(1.00,4.00) -#def invgamma(a,b): +# def invgamma(a,b): # return np.sqrt(b*a**2/np.sum(np.random.random(b,1)**2, axis=1)) -#NOTE: Use inverse gamma distribution igamma -psigma = igamma.rvs(1.,4.0, size=1e6) #TODO: parameterization is not correct vs. +# NOTE: Use inverse gamma distribution igamma +psigma = igamma.rvs( + 1.0, 4.0, size=1e6 +) # TODO: parameterization is not correct vs. # Del Negro and Schorfheide if pymc_installed: - psigma2 = pymc.rinverse_gamma(1.,4.0, size=1e6) + psigma2 = pymc.rinverse_gamma(1.0, 4.0, size=1e6) else: - psigma2 = stats.invgamma.rvs(1., scale=4.0, size=1e6) + psigma2 = stats.invgamma.rvs(1.0, scale=4.0, size=1e6) nsims = 500 y = np.zeros((nsims)) -#for i in range(1,nsims): +# for i in range(1,nsims): # y[i] = .9*y[i-1] + 1/(1-p1/alpha) + np.random.normal() -#Are these supposed to be sampled jointly? +# Are these supposed to be sampled jointly? 
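# A hedged cross-check, not in the original script: the two-parameter
# igamma defined above has pdf(x; a, b) = b**a * x**(-a-1) * exp(-b/x)
# / gamma(a), which is scipy's invgamma with b playing the role of the
# scale, so the prior draw could equivalently be written as
#     psigma_alt = stats.invgamma.rvs(1.0, scale=4.0, size=1000)
# matching the fallback branch used above when pymc is absent.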
# InvGamma(sigma|v,s) propto sigma**(-v-1)*e**(-vs**2/2*sigma**2) -#igamma = +# igamma = # M2: y_t = 1/alpha * E_t[y_t+1] + p2*y_t-1 + mu_t # mu_t ~ epsilon_t diff --git a/statsmodels/stats/proportion.py b/statsmodels/stats/proportion.py index f8df2900f84..ee068b74bac 100644 --- a/statsmodels/stats/proportion.py +++ b/statsmodels/stats/proportion.py @@ -622,7 +622,11 @@ def binom_test(count, nobs, prop=0.5, alternative='two-sided'): if np.any(prop > 1.0) or np.any(prop < 0.0): raise ValueError("p must be in range [0,1]") if alternative in ['2s', 'two-sided']: - pval = stats.binom_test(count, n=nobs, p=prop) + try: + pval = stats.binomtest(int(count), n=int(nobs), p=prop).pvalue + except AttributeError: + # Remove after min SciPy >= 1.7 + pval = stats.binom_test(count, n=nobs, p=prop) elif alternative in ['l', 'larger']: pval = stats.binom.sf(count-1, nobs, prop) elif alternative in ['s', 'smaller']:
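A hedged sketch of the guarded call pattern the final hunk introduces
(illustrative counts; ``scipy.stats.binomtest`` exists from SciPy 1.7
onward, which is why the fallback is marked for removal once that is the
minimum supported version):

    from scipy import stats

    try:
        # SciPy >= 1.7: object-returning API
        pval = stats.binomtest(3, n=10, p=0.5).pvalue
    except AttributeError:
        # older SciPy: function returning the p-value directly
        pval = stats.binom_test(3, n=10, p=0.5)

    # either branch yields the exact two-sided p-value, here 0.34375

Catching AttributeError rather than pinning a SciPy version keeps the
shim free of any version-parsing logic at import time.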