From d1bef78911e5a325393fed66adb1beb5d1cdb85b Mon Sep 17 00:00:00 2001
From: Josef Perktold
Date: Mon, 24 Aug 2015 11:23:58 -0400
Subject: [PATCH 01/24] ENH: OrderedModel initial version

---
 statsmodels/miscmodels/ordinal_model.py | 174 ++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 statsmodels/miscmodels/ordinal_model.py

diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py
new file mode 100644
index 00000000000..934ea212f94
--- /dev/null
+++ b/statsmodels/miscmodels/ordinal_model.py
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Aug 22 20:24:42 2015
+
+Author: Josef Perktold
+License: BSD-3
+"""
+
+import numpy as np
+from scipy import stats
+from statsmodels.base.model import GenericLikelihoodModel, GenericLikelihoodModelResults
+
+
+class OrderedModel(GenericLikelihoodModel):
+    """Ordinal Model based on logistic or normal distribution
+
+    The parameterization corresponds to the proportional odds model.
+
+    The model assumes that the endogenous variable is ordered but that the
+    labels have no numeric interpretation besides the ordering.
+
+    The model is based on a latent linear variable, where we observe only
+
+    y_latent = X beta + u
+
+    The observed variable is defined by the interval
+
+    y = {0 if y_latent <= cut_0
+         1 if cut_0 < y_latent <= cut_1
+         ...
+         K if cut_K < y_latent
+
+    The probability of observing y=k conditional on the explanatory variables
+    X is given by
+
+    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)
+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta)
+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)
+
+    where F is the cumulative distribution of u, which is either the normal
+    or the logistic distribution, but can be set to any other continuous
+    distribution. We use standardized distributions to avoid identifiability
+    problems.
+
+
+    Parameters
+    ----------
+    endog : array_like
+        endogenous or dependent ordered categorical variable with k levels.
+        Labels or values of endog will be internally transformed to
+        consecutive integers, 0, 1, 2, ...
+    exog : array_like
+        exogenous explanatory variables. This should not include an intercept.
+        (TODO: verify)
+    distr : string 'probit' or 'logit', or a distribution instance
+        The default is currently 'probit' which uses the normal distribution
+        and corresponds to an ordered Probit model. The distribution is
+        assumed to have the main methods of scipy.stats distributions, mainly
+        cdf, pdf and ppf. The inverse cdf, ppf, is only used to calculate
+        starting values.
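+
+    Examples
+    --------
+    A minimal sketch on simulated data (integer-coded endog, no intercept
+    in exog); only the shape of the parameter vector is checked here, so
+    the snippet does not depend on the random draws:
+
+    >>> import numpy as np
+    >>> from statsmodels.miscmodels.ordinal_model import OrderedModel
+    >>> np.random.seed(123)
+    >>> y = np.repeat([0, 1, 2], 30)  # three ordered levels
+    >>> x = np.random.randn(90, 2)
+    >>> res = OrderedModel(y, x, distr='logit').fit(disp=False)
+    >>> res.params.shape  # 2 slopes plus k_levels - 1 = 2 thresholds
+    (4,)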
+
+    Status: initial version, subclasses `GenericLikelihoodModel`
+
+    """
+
+    def __init__(self, endog, exog, distr='probit', **kwds):
+        super(OrderedModel, self).__init__(endog, exog, **kwds)
+        unique, index = np.unique(self.endog, return_inverse=True)
+        self.k_levels = len(unique)
+        self.endog = index
+        self.labels = unique
+        if distr == 'probit':
+            self.distr = stats.norm
+        elif distr == 'logit':
+            self.distr = stats.logistic
+        else:
+            self.distr = distr
+
+        self.k_vars = self.exog.shape[1]
+        self.results_class = OrderedResults #TODO: doesn't work
+
+
+    def cdf(self, x):
+        """cdf evaluated at x
+        """
+        return self.distr.cdf(x)
+
+    def prob(self, low, upp):
+        """interval probability
+        """
+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)
+
+    def transform_threshold_params(self, params):
+        """transformation of the parameters in the optimization
+
+        Parameters
+        ----------
+        params : nd_array
+            contains (exog_coef, transformed_thresholds) where exog_coef are
+            the coefficients for the explanatory variables in the linear term
+            and transformed_thresholds are the transformed threshold or
+            cutoff points. The first, lowest threshold is unchanged; all
+            other thresholds are in terms of exponentiated increments.
+
+        Returns
+        -------
+        thresh : nd_array
+            thresh are the thresholds or cutoff constants for the intervals.
+
+        """
+        th_params = params[-(self.k_levels - 1):]
+        thresh = np.concatenate((th_params[:1], np.exp(th_params[1:]))).cumsum()
+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))
+        return thresh
+
+    def transform_reverse_threshold_params(self, params):
+        """obtain transformed thresholds from original thresholds, cutoff
+        constants.
+
+        """
+        start_ppf = params
+        thresh_params = np.concatenate((start_ppf[:1], np.log(np.diff(start_ppf[:-1]))))
+        return thresh_params
+
+
+    def predict(self, params, exog=None):
+        """predicted probabilities for each level of the ordinal endog.
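+
+        Returns
+        -------
+        prob : ndarray
+            2-dim array of interval probabilities with shape
+            (nobs, k_levels), one column per level. The `exog` keyword is
+            not used yet; probabilities are computed for `self.exog`.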
+ + + """ + #structure of params = [beta, constants_or_thresholds] + + # explicit in several steps to avoid bugs + th_params = params[-(self.k_levels - 1):] + thresh = np.concatenate((th_params[:1], np.exp(th_params[1:]))).cumsum() + thresh = np.concatenate(([-np.inf], thresh, [np.inf])) + xb = self.exog.dot(params[:-(self.k_levels - 1)])[:,None] + low = thresh[:-1] - xb + upp = thresh[1:] - xb + prob = self.prob(low, upp) + return prob + + + def loglike(self, params): + + #structure of params = [beta, constants_or_thresholds] + + thresh = np.concatenate(([-np.inf], params[-(self.k_levels - 1):], [np.inf])) + + # explicit in several steps to avoid bugs + th_params = params[-(self.k_levels - 1):] + thresh = np.concatenate((th_params[:1], np.exp(th_params[1:]))).cumsum() + thresh = np.concatenate(([-np.inf], thresh, [np.inf])) + thresh_i_low = thresh[self.endog] + thresh_i_upp = thresh[self.endog + 1] + xb = self.exog.dot(params[:-(self.k_levels - 1)]) + low = thresh_i_low - xb + upp = thresh_i_upp - xb + prob = self.prob(low, upp) + return np.log(prob + 1e-20).sum() + + @property + def start_params(self): + # start params based on model without exog + freq = np.bincount(self.endog) / len(self.endog) + start_ppf = self.distr.ppf(np.clip(freq.cumsum(), 0, 1)) + start_threshold = self.transform_reverse_threshold_params(start_ppf) + start_params = np.concatenate((np.zeros(self.k_vars), start_threshold)) + return start_params + + +class OrderedResults(GenericLikelihoodModelResults): + + pass From ac0abebbbe48c7ecc4d03968ef2bc80dcd9b0f04 Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Mon, 24 Aug 2015 11:24:38 -0400 Subject: [PATCH 02/24] Example script for OrderedModel --- statsmodels/examples/ex_ordered_model.py | 72 ++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 statsmodels/examples/ex_ordered_model.py diff --git a/statsmodels/examples/ex_ordered_model.py b/statsmodels/examples/ex_ordered_model.py new file mode 100644 index 00000000000..b6afcec34d9 --- /dev/null +++ b/statsmodels/examples/ex_ordered_model.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Aug 24 11:17:06 2015 + +Author: Josef Perktold +License: BSD-3 +""" + +import numpy as np +from scipy import stats + +from statsmodels.miscmodels.ordinal_model import OrderedModel + +nobs, k_vars = 1000, 3 +x = np.random.randn(nobs, k_vars) +#x = np.column_stack((np.ones(nobs), x)) #constant will be in integration limits +xb = x.dot(np.ones(k_vars)) +y_latent = xb + np.random.randn(nobs) +y = np.round(np.clip(y_latent, -2.4, 2.4)).astype(int) + 2 + +print(np.unique(y)) +print(np.bincount(y)) + +mod = OrderedModel(y, x) +start_params = np.ones(k_vars + 4) +start_params = np.concatenate((np.ones(k_vars), np.arange(4))) +start_ppf = stats.norm.ppf((np.bincount(y) / len(y)).cumsum()) +start_threshold = np.concatenate((start_ppf[:1], np.log(np.diff(start_ppf[:-1])))) +start_params = np.concatenate((np.zeros(k_vars), start_threshold)) +res = mod.fit(start_params=start_params, maxiter=5000, maxfun=5000) +print(res.params) +#res = mod.fit(start_params=res.params, method='bfgs') +res = mod.fit(start_params=start_params, method='bfgs') + +print(res.params) +print(np.exp(res.params[-(mod.k_levels - 1):]).cumsum()) +#print(res.summary()) + +predicted = res.model.predict(res.params) +pred_choice = predicted.argmax(1) +print('Fraction of correct choice predictions') +print((y == pred_choice).mean()) + +print('\ncomparing bincount') +print(np.bincount(res.model.predict(res.params).argmax(1))) 
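+print(np.bincount(res.model.endog))
+
+# as a rough check (sketch), recover the estimated cutpoints from the
+# transformed parameters; for this simulated y, which rounds y_latent,
+# they should lie near [-1.5, -0.5, 0.5, 1.5]
+print(res.model.transform_threshold_params(res.params)[1:-1])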
+
+res_log = OrderedModel(y, x, distr='logit').fit(method='bfgs')
+pred_choice_log = res_log.predict().argmax(1)
+print((y == pred_choice_log).mean())
+
+
+# example from UCLA Stats pages http://www.ats.ucla.edu/stat/stata/dae/ologit.htm
+# requires downloaded dataset ologit.dta
+
+import pandas
+dataf = pandas.read_stata(r"M:\josef_new\scripts\ologit_ucla.dta")
+
+# this works but sorts category levels alphabetically
+res_log2 = OrderedModel(np.asarray(dataf['apply']),
+                        np.asarray(dataf[['pared', 'public', 'gpa']], float),
+                        distr='logit').fit(method='bfgs')
+
+# this replicates the UCLA example except for different parameterization of par2
+res_log3 = OrderedModel(dataf['apply'].values.codes,
+                        np.asarray(dataf[['pared', 'public', 'gpa']], float),
+                        distr='logit').fit(method='bfgs')
+
+print(res_log3.summary())
+
+# with ordered probit - not on UCLA page
+print(OrderedModel(dataf['apply'].values.codes, np.asarray(dataf[['pared', 'public', 'gpa']], float), distr='probit').fit(method='bfgs').summary())

From ac0abebbbe48c7ecc4d03968ef2bc80dcd9b0f04 Mon Sep 17 00:00:00 2001
From: Bolzano-Weierstrass
Date: Sat, 22 Aug 2020 13:42:29 +0200
Subject: [PATCH 03/24] ENH: added Pandas support for exog & endog

---
 statsmodels/miscmodels/ordinal_model.py | 60 ++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py
index 934ea212f94..c48c030e191 100644
--- a/statsmodels/miscmodels/ordinal_model.py
+++ b/statsmodels/miscmodels/ordinal_model.py
@@ -7,6 +7,7 @@
 """
 
 import numpy as np
+import pandas as pd
 from scipy import stats
 from statsmodels.base.model import GenericLikelihoodModel, GenericLikelihoodModelResults
 
@@ -49,9 +50,12 @@ class OrderedModel(GenericLikelihoodModel):
         endogenous or dependent ordered categorical variable with k levels.
         Labels or values of endog will be internally transformed to
         consecutive integers, 0, 1, 2, ...
+        pd.Series with Categorical as dtype should be preferred as it gives
+        the order relation between the levels.
     exog : array_like
         exogenous explanatory variables. This should not include an intercept.
         (TODO: verify)
+        pd.DataFrame is also accepted.
     distr : string 'probit' or 'logit', or a distribution instance
         The default is currently 'probit' which uses the normal distribution
         and corresponds to an ordered Probit model. The distribution is
@@ -64,11 +68,7 @@ class OrderedModel(GenericLikelihoodModel):
     """
 
     def __init__(self, endog, exog, distr='probit', **kwds):
-        super(OrderedModel, self).__init__(endog, exog, **kwds)
-        unique, index = np.unique(self.endog, return_inverse=True)
-        self.k_levels = len(unique)
-        self.endog = index
-        self.labels = unique
+
         if distr == 'probit':
             self.distr = stats.norm
         elif distr == 'logit':
@@ -76,8 +76,56 @@ def __init__(self, endog, exog, distr='probit', **kwds):
         else:
             self.distr = distr
 
+        self.names, endog, exog = self._check_inputs(endog, exog)
+
+        super(OrderedModel, self).__init__(endog, exog, **kwds)
+
+        unique, index = np.unique(self.endog, return_inverse=True)
+        self.k_levels = len(unique)
+        self.endog = index
+        self.labels = unique
+
         self.k_vars = self.exog.shape[1]
-        self.results_class = OrderedResults #TODO: doesn't work
+        self.results_class = OrderedResults
+
+    def _check_inputs(self, endog, exog):
+        """
+        checks if self.distrib is legal and does the Pandas support for endog and exog.
+        Also retrieves columns & categories names for .summary() of the results class.
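+
+        Returns
+        -------
+        names : dict
+            may contain 'yname' and 'xname' entries for use in the
+            results.summary() method
+        endog : array_like
+            endog, replaced by the category codes if it is a categorical
+            pd.Series
+        exog : array_like
+            exog, converted to a ndarray if it is a pd.DataFrame or pd.Series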
+ """ + names = {} + if not isinstance(self.distr, stats.rv_continuous): + msg = ( + f"{self.distr.name} must be a scipy.stats distribution." + ) + raise ValueError(msg) + + # Pandas' support + if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)): + exog_name = (exog.name if isinstance(exog, pd.Series) else exog.columns.tolist()) + names['xname'] = exog_name + exog = np.asarray(exog) + + if isinstance(endog, pd.Series): + if isinstance(endog.dtypes, pd.CategoricalDtype): + if not endog.dtype.ordered: + import warnings + warnings.warn("the endog has ordered == False, risk of capturing a wrong order" + " for the categories. ordered == True preferred.", Warning) + endog_name = endog.name + thresholds_name = [str(x) + '/' + str(y) + for x, y in zip(endog.values.categories[:-1], endog.values.categories[1:])] + names['yname'] = endog_name + names['xname'] = names['xname'] + thresholds_name + endog = np.asarray(endog.values.codes) + else: + msg = ( + f"If the endog is a pandas.Serie it must be of categoricalDtype." + ) + raise ValueError(msg) + + return names, endog, exog + def cdf(self, x): From 774a12df4830288eadd9662092db0279b04d0cf3 Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 15:30:03 +0200 Subject: [PATCH 04/24] FIX: pd.CategoricalDtype changed for pandas < 0.24 --- statsmodels/miscmodels/ordinal_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index c48c030e191..cb4d3525ceb 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +from pandas.api.types import CategoricalDtype from scipy import stats from statsmodels.base.model import GenericLikelihoodModel, GenericLikelihoodModelResults @@ -107,7 +108,7 @@ def _check_inputs(self, endog, exog): exog = np.asarray(exog) if isinstance(endog, pd.Series): - if isinstance(endog.dtypes, pd.CategoricalDtype): + if isinstance(endog.dtypes, CategoricalDtype): if not endog.dtype.ordered: import warnings warnings.warn("the endog has ordered == False, risk of capturing a wrong order" From a02b691765e41ab8a6687d7dd61e2ac8809c3e45 Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 15:32:16 +0200 Subject: [PATCH 05/24] ENH: added OrderedResults --- statsmodels/miscmodels/ordinal_model.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index cb4d3525ceb..39aa240e1fb 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -11,7 +11,7 @@ from pandas.api.types import CategoricalDtype from scipy import stats from statsmodels.base.model import GenericLikelihoodModel, GenericLikelihoodModelResults - +from statsmodels.compat.pandas import Appender class OrderedModel(GenericLikelihoodModel): """Ordinal Model based on logistic or normal distribution @@ -217,7 +217,23 @@ def start_params(self): start_params = np.concatenate((np.zeros(self.k_vars), start_threshold)) return start_params + @Appender(GenericLikelihoodModel.fit.__doc__) + def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, + disp=1, callback=None, retall=0, **kwargs): -class OrderedResults(GenericLikelihoodModelResults): + fit_method = super(OrderedModel, self).fit + mlefit = fit_method(start_params=start_params, + method=method, maxiter=maxiter, 
+ full_output=full_output, + disp=disp, callback=callback, **kwargs) + # use the proper result class + ordmlefit = OrderedResults(self, mlefit) - pass + return ordmlefit + +class OrderedResults(GenericLikelihoodModelResults): + @Appender(GenericLikelihoodModelResults.summary.__doc__) + def summary(self, yname=None, xname=None, title=None, alpha=.05, **kwargs): + names = self.model.names + print(names) + return super(OrderedResults, self).summary(**names, **kwargs) From cf43203e61e46f839ce658e7010ebd5e8952b167 Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 15:41:10 +0200 Subject: [PATCH 06/24] TST: added raw data & R results Holders --- .../miscmodels/tests/results/ologit_ucla.csv | 401 ++++++++++++++++++ .../tests/results/results_ordinal_method.py | 101 +++++ 2 files changed, 502 insertions(+) create mode 100644 statsmodels/miscmodels/tests/results/ologit_ucla.csv create mode 100644 statsmodels/miscmodels/tests/results/results_ordinal_method.py diff --git a/statsmodels/miscmodels/tests/results/ologit_ucla.csv b/statsmodels/miscmodels/tests/results/ologit_ucla.csv new file mode 100644 index 00000000000..0c77e0ea8b5 --- /dev/null +++ b/statsmodels/miscmodels/tests/results/ologit_ucla.csv @@ -0,0 +1,401 @@ +apply,pared,public,gpa +very likely,0,0,3.26 +somewhat likely,1,0,3.21 +unlikely,1,1,3.94 +somewhat likely,0,0,2.81 +somewhat likely,0,0,2.53 +unlikely,0,1,2.59 +somewhat likely,0,0,2.56 +somewhat likely,0,0,2.73 +unlikely,0,0,3.0 +somewhat likely,1,0,3.5 +unlikely,1,1,3.65 +somewhat likely,0,0,2.84 +very likely,0,1,3.9 +somewhat likely,0,0,2.68 +unlikely,1,0,3.57 +unlikely,0,0,3.09 +unlikely,0,1,3.5 +unlikely,0,0,2.17 +very likely,0,1,3.36 +somewhat likely,0,0,3.4 +very likely,0,0,2.75 +somewhat likely,1,0,3.2 +unlikely,0,0,2.44 +unlikely,0,0,2.83 +unlikely,0,1,3.0 +somewhat likely,0,1,3.27 +somewhat likely,1,0,3.14 +somewhat likely,0,0,3.37 +very likely,0,1,2.79 +unlikely,0,0,2.9 +somewhat likely,0,0,3.38 +unlikely,0,1,2.95 +unlikely,0,0,2.98 +unlikely,1,1,3.81 +unlikely,0,0,2.74 +unlikely,0,0,2.62 +unlikely,0,0,2.85 +somewhat likely,0,0,2.5 +somewhat likely,0,0,2.75 +unlikely,0,0,2.26 +unlikely,0,0,2.03 +somewhat likely,1,0,2.85 +unlikely,0,0,2.72 +somewhat likely,0,0,2.89 +somewhat likely,1,0,2.47 +unlikely,0,0,3.04 +unlikely,0,0,3.1 +unlikely,0,0,2.57 +unlikely,0,0,2.09 +somewhat likely,0,0,2.94 +unlikely,0,0,3.45 +unlikely,0,0,2.76 +very likely,0,1,2.96 +unlikely,0,0,2.89 +somewhat likely,0,0,2.97 +somewhat likely,0,1,3.91 +somewhat likely,0,0,2.77 +unlikely,0,0,2.51 +unlikely,0,0,3.24 +unlikely,0,0,2.44 +somewhat likely,0,0,2.78 +unlikely,0,0,2.94 +unlikely,1,0,3.22 +unlikely,0,0,3.5 +very likely,1,0,3.57 +somewhat likely,0,0,3.17 +unlikely,0,1,3.23 +unlikely,0,0,2.91 +unlikely,0,0,3.28 +unlikely,0,1,3.32 +unlikely,0,0,3.62 +unlikely,0,0,2.55 +unlikely,0,0,2.97 +very likely,1,0,3.63 +unlikely,0,1,3.02 +unlikely,0,1,3.6 +unlikely,0,0,2.95 +very likely,1,1,3.81 +somewhat likely,1,0,2.68 +unlikely,1,0,3.72 +unlikely,0,0,2.49 +unlikely,0,0,2.72 +unlikely,1,0,2.25 +unlikely,0,0,2.53 +unlikely,0,0,3.26 +very likely,0,0,2.57 +unlikely,0,0,2.9 +unlikely,0,0,3.05 +somewhat likely,0,0,2.85 +somewhat likely,0,0,4.0 +very likely,1,0,3.4 +somewhat likely,0,1,3.24 +unlikely,0,1,3.51 +very likely,0,0,2.55 +unlikely,0,0,3.06 +very likely,0,0,3.0 +somewhat likely,0,0,3.13 +unlikely,0,0,3.16 +somewhat likely,0,0,2.47 +somewhat likely,0,0,2.89 +unlikely,0,0,2.74 +unlikely,0,0,3.16 +unlikely,0,0,2.92 +somewhat likely,0,0,3.57 +somewhat likely,1,0,3.49 +somewhat 
likely,0,1,2.56 +somewhat likely,1,0,3.12 +somewhat likely,1,0,2.65 +somewhat likely,0,0,2.51 +somewhat likely,0,0,3.08 +somewhat likely,0,0,2.65 +unlikely,0,0,2.99 +unlikely,0,0,2.63 +somewhat likely,0,0,2.8 +somewhat likely,0,1,3.46 +unlikely,0,0,3.61 +unlikely,0,0,3.08 +unlikely,1,0,3.28 +unlikely,0,1,3.31 +unlikely,0,0,2.57 +somewhat likely,0,0,3.39 +unlikely,0,0,2.26 +unlikely,0,0,2.42 +unlikely,0,0,2.6 +somewhat likely,1,1,3.44 +somewhat likely,0,0,2.98 +somewhat likely,1,0,3.09 +unlikely,0,1,3.34 +unlikely,0,0,2.84 +unlikely,0,0,2.81 +unlikely,0,0,2.79 +unlikely,0,0,2.4 +unlikely,0,0,3.02 +somewhat likely,0,0,2.65 +somewhat likely,0,0,2.55 +somewhat likely,1,0,3.12 +unlikely,0,0,2.98 +very likely,1,0,3.61 +unlikely,0,0,2.98 +unlikely,0,0,3.19 +unlikely,0,0,3.51 +unlikely,0,0,3.1 +unlikely,0,0,3.55 +unlikely,0,0,2.98 +very likely,0,0,2.99 +unlikely,0,0,3.05 +unlikely,0,0,2.99 +unlikely,0,0,2.32 +unlikely,0,0,2.5 +unlikely,0,1,2.9 +very likely,1,1,3.28 +somewhat likely,0,0,2.95 +somewhat likely,0,0,3.54 +somewhat likely,0,0,3.11 +unlikely,1,0,3.25 +unlikely,0,0,2.44 +unlikely,0,0,2.13 +somewhat likely,0,0,3.22 +somewhat likely,0,0,3.16 +unlikely,0,0,3.39 +somewhat likely,0,0,2.7 +somewhat likely,0,0,3.09 +somewhat likely,0,0,3.16 +unlikely,0,0,2.28 +unlikely,0,0,2.91 +unlikely,0,0,3.65 +unlikely,0,0,2.86 +unlikely,0,1,3.39 +unlikely,0,0,3.71 +unlikely,0,0,3.25 +unlikely,0,0,3.14 +unlikely,0,0,2.41 +somewhat likely,0,0,3.08 +unlikely,0,1,3.02 +somewhat likely,0,0,3.15 +very likely,0,0,2.95 +unlikely,0,0,2.22 +somewhat likely,0,0,2.86 +somewhat likely,1,0,2.88 +very likely,0,0,2.62 +unlikely,0,0,3.37 +somewhat likely,0,0,3.51 +somewhat likely,0,0,3.65 +very likely,1,0,3.42 +very likely,0,0,2.41 +very likely,0,1,3.21 +unlikely,0,0,3.22 +unlikely,0,0,2.53 +somewhat likely,0,0,2.64 +unlikely,0,0,2.94 +somewhat likely,0,0,2.56 +unlikely,0,1,3.12 +somewhat likely,0,0,3.34 +unlikely,0,0,3.22 +somewhat likely,0,0,3.05 +somewhat likely,0,0,3.29 +somewhat likely,0,0,2.71 +unlikely,0,0,2.87 +unlikely,0,0,3.29 +unlikely,0,0,3.36 +unlikely,1,1,2.85 +unlikely,0,0,2.79 +somewhat likely,0,0,3.69 +very likely,0,0,3.56 +unlikely,0,0,3.52 +unlikely,1,1,3.38 +very likely,0,1,3.11 +somewhat likely,1,0,3.2 +unlikely,0,0,2.83 +unlikely,0,0,3.08 +very likely,0,1,2.97 +unlikely,0,0,2.64 +unlikely,0,0,2.47 +very likely,0,0,3.07 +somewhat likely,0,0,3.2 +unlikely,0,0,2.55 +unlikely,0,0,2.48 +unlikely,0,0,3.29 +somewhat likely,0,0,2.64 +somewhat likely,0,0,3.22 +somewhat likely,1,0,2.8 +somewhat likely,0,0,3.62 +unlikely,0,0,2.62 +somewhat likely,0,1,3.04 +unlikely,0,1,2.49 +unlikely,0,1,3.1 +very likely,1,0,3.15 +somewhat likely,0,0,2.65 +unlikely,0,0,3.04 +unlikely,0,1,3.05 +unlikely,0,0,2.88 +somewhat likely,0,0,2.86 +somewhat likely,0,0,3.0 +unlikely,0,0,2.23 +unlikely,0,1,3.14 +unlikely,0,0,2.67 +unlikely,0,0,3.11 +somewhat likely,0,0,3.73 +somewhat likely,0,0,2.45 +unlikely,0,0,3.04 +unlikely,0,0,2.4 +somewhat likely,0,0,3.43 +somewhat likely,0,0,2.57 +unlikely,0,0,2.84 +unlikely,0,0,2.67 +very likely,1,1,3.45 +somewhat likely,1,0,2.88 +somewhat likely,0,0,2.78 +unlikely,0,0,3.22 +unlikely,1,0,3.3 +somewhat likely,1,0,2.86 +somewhat likely,0,0,2.83 +somewhat likely,0,0,3.7 +somewhat likely,0,0,3.15 +somewhat likely,1,1,3.08 +unlikely,0,0,2.92 +unlikely,0,0,2.89 +very likely,1,0,3.53 +unlikely,0,0,3.1 +somewhat likely,1,0,3.36 +somewhat likely,0,0,2.74 +somewhat likely,0,0,2.73 +unlikely,1,0,2.72 +unlikely,0,0,3.53 +unlikely,1,0,3.45 +unlikely,0,0,2.98 +somewhat likely,0,0,2.88 +very likely,1,0,3.21 
+unlikely,0,0,3.06 +unlikely,0,0,2.4 +somewhat likely,0,0,3.57 +unlikely,1,0,2.88 +unlikely,0,0,3.27 +unlikely,0,0,2.92 +very likely,0,0,2.7 +somewhat likely,1,0,2.53 +somewhat likely,0,0,3.38 +unlikely,0,0,3.16 +somewhat likely,0,0,2.6 +unlikely,0,0,2.76 +unlikely,0,0,3.56 +unlikely,0,0,2.87 +somewhat likely,1,0,2.99 +unlikely,0,0,2.68 +unlikely,0,0,2.99 +somewhat likely,0,0,2.96 +unlikely,1,1,2.77 +somewhat likely,0,0,3.28 +unlikely,0,1,2.74 +unlikely,0,0,1.9 +unlikely,0,0,3.05 +somewhat likely,0,1,2.44 +somewhat likely,0,0,3.33 +somewhat likely,0,1,3.56 +unlikely,0,0,2.15 +unlikely,0,0,2.83 +somewhat likely,0,0,2.79 +somewhat likely,0,0,2.72 +very likely,1,0,3.04 +unlikely,0,1,3.03 +unlikely,0,0,2.88 +somewhat likely,0,0,3.21 +unlikely,0,0,2.74 +unlikely,0,0,3.57 +somewhat likely,0,0,2.43 +unlikely,0,0,3.11 +unlikely,0,0,2.76 +somewhat likely,0,0,3.13 +unlikely,0,0,2.75 +unlikely,0,0,2.6 +unlikely,0,0,3.67 +unlikely,0,0,3.26 +unlikely,0,0,3.41 +very likely,0,0,3.24 +unlikely,0,0,3.05 +somewhat likely,1,0,2.38 +somewhat likely,0,0,2.61 +somewhat likely,1,1,3.18 +unlikely,0,0,3.34 +somewhat likely,0,0,2.84 +unlikely,0,0,2.8 +unlikely,0,0,3.21 +unlikely,0,0,2.63 +unlikely,0,0,2.31 +unlikely,0,0,2.29 +unlikely,0,0,3.51 +somewhat likely,0,0,2.87 +unlikely,0,0,3.09 +somewhat likely,1,0,2.88 +unlikely,1,0,2.47 +somewhat likely,0,1,3.72 +unlikely,0,0,3.2 +somewhat likely,0,0,2.57 +unlikely,0,0,3.05 +unlikely,0,0,3.27 +unlikely,0,0,3.3 +unlikely,0,0,2.73 +unlikely,1,0,2.63 +unlikely,0,0,2.77 +unlikely,0,0,3.15 +unlikely,0,0,2.85 +unlikely,0,0,2.25 +somewhat likely,0,0,3.56 +unlikely,0,0,2.7 +very likely,0,0,2.94 +somewhat likely,1,0,2.58 +somewhat likely,0,0,2.97 +very likely,0,0,3.38 +very likely,0,0,2.96 +unlikely,0,0,2.44 +somewhat likely,0,1,3.75 +somewhat likely,0,0,3.07 +somewhat likely,0,0,2.72 +unlikely,0,0,3.13 +unlikely,0,0,3.39 +unlikely,0,1,2.38 +somewhat likely,0,0,2.87 +unlikely,0,1,2.91 +very likely,1,0,3.25 +somewhat likely,1,1,3.51 +unlikely,1,0,3.65 +somewhat likely,0,0,3.11 +very likely,0,0,2.71 +unlikely,0,0,3.05 +unlikely,0,0,2.93 +somewhat likely,0,0,2.35 +somewhat likely,0,0,2.49 +unlikely,0,0,3.26 +unlikely,0,0,3.77 +unlikely,0,1,3.49 +unlikely,0,0,3.39 +unlikely,0,0,3.37 +unlikely,0,0,1.98 +unlikely,1,0,2.93 +unlikely,0,0,3.36 +unlikely,0,0,3.12 +somewhat likely,0,0,3.23 +somewhat likely,0,1,3.9 +unlikely,0,0,2.7 +unlikely,0,0,2.34 +unlikely,0,0,3.3 +somewhat likely,0,0,3.11 +unlikely,0,0,3.14 +somewhat likely,1,0,3.68 +somewhat likely,0,0,2.22 +unlikely,0,0,2.61 +somewhat likely,0,1,3.48 +very likely,0,0,3.08 +unlikely,0,0,2.79 +unlikely,0,0,3.12 +unlikely,0,1,2.67 +somewhat likely,0,0,3.54 +very likely,0,0,2.98 +somewhat likely,1,0,3.32 +somewhat likely,1,0,3.54 +unlikely,0,0,3.7 +unlikely,0,0,2.63 +somewhat likely,0,0,2.25 +somewhat likely,0,0,3.26 +very likely,0,0,3.52 diff --git a/statsmodels/miscmodels/tests/results/results_ordinal_method.py b/statsmodels/miscmodels/tests/results/results_ordinal_method.py new file mode 100644 index 00000000000..25497b0c2d8 --- /dev/null +++ b/statsmodels/miscmodels/tests/results/results_ordinal_method.py @@ -0,0 +1,101 @@ +""" +Test Results for ordinal models from R MASS lib +""" + +import numpy as np +import os +import pandas as pd +from statsmodels.tools.testing import Holder + +# R (v3.4.4) code inspired from https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/ +# library(readr) # to open the file +# library(MASS) # to perform ordinal regression +# +# ## load the data, 400 rows with 3 exogs(2 binaries, 1 float) and 
target 3-ordinal variable +# ologit_ucla <- read_csv("ologit_ucla.csv") +# ologit_ucla$apply <- as.factor(ologit_ucla$apply) +# ologit_ucla$apply <- factor(ologit_ucla$apply, levels=c("unlikely", "somewhat likely", "very likely")) +# +# ## fit ordered logit model +# r_logit <- polr(apply ~ pared + public + gpa, +# data = ologit_ucla, +# method = 'logit', # or 'probit' +# Hess=TRUE) +# +# ## fit ordered probit model +# r_probit <- polr(apply ~ pared + public + gpa, +# data = ologit_ucla, +# method = 'probit', +# Hess=TRUE) +# +# ## fit ordered cloglog model +# r_cloglog <- polr(apply ~ pared + public + gpa, +# data = ologit_ucla, +# method = 'cloglog', +# Hess=TRUE) +# +# ## with r = r_logit or r_probit or r_cloglog +# ## we add p-values +# (ctable <- coef(summary(r))) +# p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2 +# (ctable <- cbind(ctable, "p value" = p)) +# ## show 7 first predictions +# head(predict(r, subset(ologit_ucla, select=c("pared", "public","gpa")), type='prob'),7) + +data_store = Holder() +cur_dir = os.path.dirname(os.path.abspath(__file__)) +df = pd.read_csv(os.path.join(cur_dir, "ologit_ucla.csv")) +# df_unordered['apply'] is pd.Categorical with ordered = False +df_unordered = df.copy() +df_unordered['apply'] = pd.Categorical(df['apply'], ordered=False) +# but categories are set in order +df_unordered['apply'].cat.set_categories(['unlikely', 'somewhat likely', 'very likely'], inplace=True) +# df['apply'] is pd.Categorical with ordered = True +df['apply'] = pd.Categorical(df['apply'], ordered=True) +df['apply'].cat.set_categories(['unlikely', 'somewhat likely', 'very likely'], inplace=True) +data_store.df_unordered = df_unordered +data_store.df = df +data_store.nobs = 400 +data_store.n_ordinal_cat = 3 + +res_ord_logit = Holder() +res_ord_logit.coefficients_val = np.array([1.04769011, -0.05878572, 0.61594057]) +res_ord_logit.coefficients_stdE = np.array([0.2658, 0.2979, 0.2606]) +res_ord_logit.coefficients_tval = np.array([3.9418, -0.1974, 2.3632]) +res_ord_logit.coefficients_pval = np.array([8.087070e-05, 8.435464e-01, 1.811594e-02]) +res_ord_logit.thresholds = np.array([2.203915, 4.299363]) +res_ord_logit.prob_pred = np.array([[0.5488310, 0.3593310, 0.09183798], + [0.3055632, 0.4759496, 0.21848725], + [0.2293835, 0.4781951, 0.29242138], + [0.6161224, 0.3126888, 0.07118879], + [0.6560149, 0.2833901, 0.06059505], + [0.6609240, 0.2797117, 0.05936431], + [0.6518332, 0.2865114, 0.06165547]]) + +res_ord_probit = Holder() +res_ord_probit.coefficients_val = np.array([0.59811, 0.01016, 0.35815]) +res_ord_probit.coefficients_stdE = np.array([0.1579, 0.1728, 0.1568]) +res_ord_probit.coefficients_tval = np.array([3.78881, 0.05878, 2.28479]) +res_ord_probit.coefficients_pval = np.array([1.513681e-04, 9.531256e-01, 2.232519e-02]) +res_ord_probit.thresholds = np.array([1.2968, 2.5028]) +res_ord_probit.prob_pred = np.array([[0.5514181, 0.3576848, 0.09089707], + [0.3260107, 0.4488799, 0.22510933], + [0.2349733, 0.4506351, 0.31439162], + [0.6142501, 0.3184778, 0.06727214], + [0.6519891, 0.2928449, 0.05516602], + [0.6402204, 0.3009945, 0.05878509], + [0.6480094, 0.2956162, 0.05637442]]) + +res_ord_cloglog = Holder() +res_ord_cloglog.coefficients_val = np.array([0.5166455, 0.1081131, 0.3343895]) +res_ord_cloglog.coefficients_stdE = np.array([0.1613525, 0.1680675, 0.1542065]) +res_ord_cloglog.coefficients_tval = np.array([3.2019668, 0.6432721, 2.1684534]) +res_ord_cloglog.coefficients_pval = np.array([1.364927e-03, 5.200475e-01, 3.012421e-02]) +res_ord_cloglog.thresholds = 
np.array([0.8705304, 1.9744660]) +res_ord_cloglog.prob_pred = np.array([[0.5519526, 0.3592524, 0.08879500], + [0.3855287, 0.3842645, 0.23020682], + [0.2899487, 0.3540202, 0.35603111], + [0.6067184, 0.3333548, 0.05992678], + [0.6411418, 0.3133969, 0.04546127], + [0.5940557, 0.3400072, 0.06593710], + [0.6374521, 0.3156622, 0.04688570]]) From 35aa720cadd95a9d803652771bf6b1899fed10c3 Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 15:45:36 +0200 Subject: [PATCH 07/24] TST: added logit & probit tests --- .../miscmodels/tests/test_ordinal_model.py | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 statsmodels/miscmodels/tests/test_ordinal_model.py diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py new file mode 100644 index 00000000000..b4c85fdbb00 --- /dev/null +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -0,0 +1,138 @@ +""" +Test for ordinal models +""" + +import numpy as np + +from numpy.testing import assert_allclose +from .results.results_ordinal_model import data_store as ds +from statsmodels.miscmodels.ordinal_model import OrderedModel + + +class CheckOrdinalModelMixin(object): + + def test_basic(self): + n_cat = ds.n_ordinal_cat + res1 = self.res1 + res2 = self.res2 + # coefficients values, standard errors, t & p values + assert_allclose(res1.params[:-n_cat + 1], res2.coefficients_val, atol=2e-4) + assert_allclose(res1.bse[:-n_cat + 1], res2.coefficients_stdE, rtol=0.003, atol=1e-5) + assert_allclose(res1.tvalues[:-n_cat + 1], res2.coefficients_tval, rtol=0.003, atol=7e-4) + assert_allclose(res1.pvalues[:-n_cat + 1], res2.coefficients_pval, rtol=0.009, atol=1e-5) + # thresholds are given with exponentiated increments from the first threshold + assert_allclose(res1.model.transform_threshold_params(res1.params)[1:-1], res2.thresholds, atol=4e-4) + + # probabilities + assert_allclose(res1.predict()[:7, :], + res2.prob_pred, atol=5e-5) + + def test_Pandas(self): + # makes sure that the Pandas ecosystem is supported + res1 = self.res1 + resp = self.resp + # converges slightly differently why? + assert_allclose(res1.params, resp.params, atol=1e-10) + assert_allclose(res1.bse, resp.bse, atol=1e-10) + + assert_allclose(res1.model.endog, resp.model.endog, rtol=1e-10) + assert_allclose(res1.model.exog, resp.model.exog, rtol=1e-10) + + def test_formula(self): + res1 = self.res1 + resf = self.resf + # converges slightly differently why? yet e-5 is ok + assert_allclose(res1.params, resf.params, atol=5e-5) + assert_allclose(res1.bse, resf.bse, atol=5e-5) + + assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10) + assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10) + + def test_unordered(self): + # makes sure that ordered = True is optional for the endog Serie + # et categories have to be set in the right order + res1 = self.res1 + resf = self.resu + # converges slightly differently why? 
+ assert_allclose(res1.params, resf.params, atol=1e-10) + assert_allclose(res1.bse, resf.bse, atol=1e-10) + + assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10) + assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10) + + +class TestLogitModel(CheckOrdinalModelMixin): + + @classmethod + def setup_class(cls): + data = ds.df + data_unordered = ds.df_unordered + + # standard fit + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[['pared', 'public', 'gpa']], float), + distr='logit') + res = mod.fit(method='bfgs', disp=False) + # standard fit with pandas input + modp = OrderedModel(data['apply'], + data[['pared', 'public', 'gpa']], + distr='logit') + resp = modp.fit(method='bfgs', disp=False) + # fit with formula + modf = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='logit') + resf = modf.fit(method='bfgs', disp=False) + # fit on data with ordered=False + modu = OrderedModel(data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr='logit') + resu = modu.fit(method='bfgs', disp=False) + + from .results.results_ordinal_model import res_ord_logit as res2 + cls.res2 = res2 + cls.res1 = res + cls.resp = resp + cls.resf = resf + cls.resu = resu + + +class TestProbitModel(CheckOrdinalModelMixin): + + @classmethod + def setup_class(cls): + data = ds.df + data_unordered = ds.df_unordered + + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[['pared', 'public', 'gpa']], float), + distr='probit') + res = mod.fit(method='bfgs', disp=False) + + modp = OrderedModel(data['apply'], + data[['pared', 'public', 'gpa']], + distr='probit') + resp = modp.fit(method='bfgs', disp=False) + + modf = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') + resf = modf.fit(method='bfgs', disp=False) + + modu = OrderedModel(data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr='probit') + resu = modu.fit(method='bfgs', disp=False) + + from .results.results_ordinal_model import res_ord_probit as res2 + cls.res2 = res2 + cls.res1 = res + cls.resp = resp + cls.resf = resf + cls.resu = resu From 4b2034177e18254383a3b080731863f68374d44b Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 15:49:43 +0200 Subject: [PATCH 08/24] TST: added custom distribution tests (with cLogLog) --- ...nal_method.py => results_ordinal_model.py} | 3 ++ .../miscmodels/tests/test_ordinal_model.py | 48 +++++++++++++++++++ 2 files changed, 51 insertions(+) rename statsmodels/miscmodels/tests/results/{results_ordinal_method.py => results_ordinal_model.py} (99%) diff --git a/statsmodels/miscmodels/tests/results/results_ordinal_method.py b/statsmodels/miscmodels/tests/results/results_ordinal_model.py similarity index 99% rename from statsmodels/miscmodels/tests/results/results_ordinal_method.py rename to statsmodels/miscmodels/tests/results/results_ordinal_model.py index 25497b0c2d8..ab6bb277800 100644 --- a/statsmodels/miscmodels/tests/results/results_ordinal_method.py +++ b/statsmodels/miscmodels/tests/results/results_ordinal_model.py @@ -45,14 +45,17 @@ data_store = Holder() cur_dir = os.path.dirname(os.path.abspath(__file__)) df = pd.read_csv(os.path.join(cur_dir, 
"ologit_ucla.csv")) + # df_unordered['apply'] is pd.Categorical with ordered = False df_unordered = df.copy() df_unordered['apply'] = pd.Categorical(df['apply'], ordered=False) # but categories are set in order df_unordered['apply'].cat.set_categories(['unlikely', 'somewhat likely', 'very likely'], inplace=True) + # df['apply'] is pd.Categorical with ordered = True df['apply'] = pd.Categorical(df['apply'], ordered=True) df['apply'].cat.set_categories(['unlikely', 'somewhat likely', 'very likely'], inplace=True) + data_store.df_unordered = df_unordered data_store.df = df data_store.nobs = 400 diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index b4c85fdbb00..e7e7efd3967 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -3,6 +3,7 @@ """ import numpy as np +import scipy.stats as stats from numpy.testing import assert_allclose from .results.results_ordinal_model import data_store as ds @@ -136,3 +137,50 @@ def setup_class(cls): cls.resp = resp cls.resf = resf cls.resu = resu + +class TestCLogLogModel(CheckOrdinalModelMixin): + + @classmethod + def setup_class(cls): + data = ds.df + data_unordered = ds.df_unordered + + # a Scipy distribution defined minimally + class CLogLog(stats.rv_continuous): + def _ppf(self, q): + return np.log(-np.log(1 - q)) + + def _cdf(self, x): + return 1 - np.exp(-np.exp(x)) + + cloglog = CLogLog() + + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[['pared', 'public', 'gpa']], float), + distr=cloglog) + res = mod.fit(method='bfgs', disp=False) + + modp = OrderedModel(data['apply'], + data[['pared', 'public', 'gpa']], + distr=cloglog) + resp = modp.fit(method='bfgs', disp=False) + + modf = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr=cloglog) + resf = modf.fit(method='bfgs', disp=False) + + modu = OrderedModel(data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr=cloglog) + resu = modu.fit(method='bfgs', disp=False) + + from .results.results_ordinal_model import res_ord_cloglog as res2 + cls.res2 = res2 + cls.res1 = res + cls.resp = resp + cls.resf = resf + cls.resu = resu From 23edfd8cf813898205c8bd62e705f9a62b00b71e Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 16:03:11 +0200 Subject: [PATCH 09/24] ENH: add custom distribution ex with Pandas inputs --- statsmodels/examples/ex_ordered_model.py | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/statsmodels/examples/ex_ordered_model.py b/statsmodels/examples/ex_ordered_model.py index b6afcec34d9..3edd270eabe 100644 --- a/statsmodels/examples/ex_ordered_model.py +++ b/statsmodels/examples/ex_ordered_model.py @@ -8,6 +8,7 @@ import numpy as np from scipy import stats +import pandas from statsmodels.miscmodels.ordinal_model import OrderedModel @@ -48,12 +49,11 @@ res_log = OrderedModel(y, x, distr='logit').fit(method='bfgs') pred_choice_log = res_log.predict().argmax(1) print((y == pred_choice_log).mean()) - +print(res_log.summary()) # example form UCLA Stats pages http://www.ats.ucla.edu/stat/stata/dae/ologit.htm # requires downloaded dataset ologit.dta -import pandas dataf = pandas.read_stata(r"M:\josef_new\scripts\ologit_ucla.dta") # this works but sorts category levels alphabetically @@ -69,4 
+69,24 @@ print(res_log3.summary()) # with ordered probit - not on UCLA page -print(OrderedModel(dataf['apply'].values.codes, np.asarray(dataf[['pared', 'public', 'gpa']], float), distr='probit').fit(method='bfgs').summary()) +print( + OrderedModel(dataf['apply'].values.codes, np.asarray(dataf[['pared', 'public', 'gpa']], float), distr='probit').fit( + method='bfgs').summary()) + + +# example with a custom distribution - not on UCLA page +# definition of the SciPy dist +class CLogLog(stats.rv_continuous): + def _ppf(self, q): + return np.log(-np.log(1 - q)) + + def _cdf(self, x): + return 1 - np.exp(-np.exp(x)) + + +cloglog = CLogLog() + +res_cloglog = OrderedModel(dataf['apply'], + dataf[['pared', 'public', 'gpa']], + distr=cloglog).fit(method='bfgs', disp=False) +print(res_cloglog.summary()) From b88f2be5f513c3d31f65e3a3139b85fa3bf2704a Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sat, 22 Aug 2020 16:44:36 +0200 Subject: [PATCH 10/24] ENH: small typos --- statsmodels/miscmodels/tests/test_ordinal_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index e7e7efd3967..8fbba09973f 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -13,6 +13,7 @@ class CheckOrdinalModelMixin(object): def test_basic(self): + # checks basic results againt R MASS package n_cat = ds.n_ordinal_cat res1 = self.res1 res2 = self.res2 @@ -28,7 +29,7 @@ def test_basic(self): assert_allclose(res1.predict()[:7, :], res2.prob_pred, atol=5e-5) - def test_Pandas(self): + def test_pandas(self): # makes sure that the Pandas ecosystem is supported res1 = self.res1 resp = self.resp @@ -40,6 +41,7 @@ def test_Pandas(self): assert_allclose(res1.model.exog, resp.model.exog, rtol=1e-10) def test_formula(self): + # makes sure the "R-way" of writing models is supported res1 = self.res1 resf = self.resf # converges slightly differently why? 
yet e-5 is ok @@ -138,6 +140,7 @@ def setup_class(cls): cls.resf = resf cls.resu = resu + class TestCLogLogModel(CheckOrdinalModelMixin): @classmethod From cb25187b840a5c51fedb1327467b4595a6212b96 Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Sun, 23 Aug 2020 14:25:58 +0200 Subject: [PATCH 11/24] STY: flake8 friendly --- statsmodels/examples/ex_ordered_model.py | 21 +++-- statsmodels/miscmodels/ordinal_model.py | 54 +++++++------ .../tests/results/results_ordinal_model.py | 30 ++++--- .../miscmodels/tests/test_ordinal_model.py | 79 +++++++++++-------- 4 files changed, 111 insertions(+), 73 deletions(-) diff --git a/statsmodels/examples/ex_ordered_model.py b/statsmodels/examples/ex_ordered_model.py index 3edd270eabe..b06c3c569ae 100644 --- a/statsmodels/examples/ex_ordered_model.py +++ b/statsmodels/examples/ex_ordered_model.py @@ -14,7 +14,8 @@ nobs, k_vars = 1000, 3 x = np.random.randn(nobs, k_vars) -#x = np.column_stack((np.ones(nobs), x)) #constant will be in integration limits +# x = np.column_stack((np.ones(nobs), x)) +# #constant will be in integration limits xb = x.dot(np.ones(k_vars)) y_latent = xb + np.random.randn(nobs) y = np.round(np.clip(y_latent, -2.4, 2.4)).astype(int) + 2 @@ -26,16 +27,17 @@ start_params = np.ones(k_vars + 4) start_params = np.concatenate((np.ones(k_vars), np.arange(4))) start_ppf = stats.norm.ppf((np.bincount(y) / len(y)).cumsum()) -start_threshold = np.concatenate((start_ppf[:1], np.log(np.diff(start_ppf[:-1])))) +start_threshold = np.concatenate((start_ppf[:1], + np.log(np.diff(start_ppf[:-1])))) start_params = np.concatenate((np.zeros(k_vars), start_threshold)) res = mod.fit(start_params=start_params, maxiter=5000, maxfun=5000) print(res.params) -#res = mod.fit(start_params=res.params, method='bfgs') +# res = mod.fit(start_params=res.params, method='bfgs') res = mod.fit(start_params=start_params, method='bfgs') print(res.params) print(np.exp(res.params[-(mod.k_levels - 1):]).cumsum()) -#print(res.summary()) +# print(res.summary()) predicted = res.model.predict(res.params) pred_choice = predicted.argmax(1) @@ -51,7 +53,8 @@ print((y == pred_choice_log).mean()) print(res_log.summary()) -# example form UCLA Stats pages http://www.ats.ucla.edu/stat/stata/dae/ologit.htm +# example form UCLA Stats pages +# http://www.ats.ucla.edu/stat/stata/dae/ologit.htm # requires downloaded dataset ologit.dta dataf = pandas.read_stata(r"M:\josef_new\scripts\ologit_ucla.dta") @@ -61,7 +64,8 @@ np.asarray(dataf[['pared', 'public', 'gpa']], float), distr='logit').fit(method='bfgs') -# this replicates the UCLA example except for different parameterization of par2 +# this replicates the UCLA example except +# for different parameterization of par2 res_log3 = OrderedModel(dataf['apply'].values.codes, np.asarray(dataf[['pared', 'public', 'gpa']], float), distr='logit').fit(method='bfgs') @@ -70,8 +74,9 @@ # with ordered probit - not on UCLA page print( - OrderedModel(dataf['apply'].values.codes, np.asarray(dataf[['pared', 'public', 'gpa']], float), distr='probit').fit( - method='bfgs').summary()) + OrderedModel(dataf['apply'].values.codes, + np.asarray(dataf[['pared', 'public', 'gpa']], float), + distr='probit').fit(method='bfgs').summary()) # example with a custom distribution - not on UCLA page diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 39aa240e1fb..679a60e53a1 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -10,9 +10,11 @@ import pandas as pd from 
pandas.api.types import CategoricalDtype from scipy import stats -from statsmodels.base.model import GenericLikelihoodModel, GenericLikelihoodModelResults +from statsmodels.base.model import GenericLikelihoodModel, \ + GenericLikelihoodModelResults from statsmodels.compat.pandas import Appender + class OrderedModel(GenericLikelihoodModel): """Ordinal Model based on logistic or normal distribution @@ -91,8 +93,9 @@ def __init__(self, endog, exog, distr='probit', **kwds): def _check_inputs(self, endog, exog): """ - checks if self.distrib is legal and does the Pandas support for endog and exog. - Also retrieves columns & categories names for .summary() of the results class. + checks if self.distrib is legal and does the Pandas + support for endog and exog. Also retrieves columns & categories + names for .summary() of the results class. """ names = {} if not isinstance(self.distr, stats.rv_continuous): @@ -103,7 +106,8 @@ def _check_inputs(self, endog, exog): # Pandas' support if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)): - exog_name = (exog.name if isinstance(exog, pd.Series) else exog.columns.tolist()) + exog_name = (exog.name if isinstance(exog, pd.Series) + else exog.columns.tolist()) names['xname'] = exog_name exog = np.asarray(exog) @@ -111,24 +115,26 @@ def _check_inputs(self, endog, exog): if isinstance(endog.dtypes, CategoricalDtype): if not endog.dtype.ordered: import warnings - warnings.warn("the endog has ordered == False, risk of capturing a wrong order" - " for the categories. ordered == True preferred.", Warning) + warnings.warn("the endog has ordered == False, " + "risk of capturing a wrong order for the " + "categories. ordered == True preferred.", + Warning) endog_name = endog.name - thresholds_name = [str(x) + '/' + str(y) - for x, y in zip(endog.values.categories[:-1], endog.values.categories[1:])] + threshold_name = [str(x) + '/' + str(y) + for x, y in zip(endog.values.categories[:-1], + endog.values.categories[1:])] names['yname'] = endog_name - names['xname'] = names['xname'] + thresholds_name + names['xname'] = names['xname'] + threshold_name endog = np.asarray(endog.values.codes) else: msg = ( - f"If the endog is a pandas.Serie it must be of categoricalDtype." + "If the endog is a pandas.Serie " + "it must be of categoricalDtype." ) raise ValueError(msg) return names, endog, exog - - def cdf(self, x): """cdf evaluated at x """ @@ -158,7 +164,8 @@ def transform_threshold_params(self, params): """ th_params = params[-(self.k_levels - 1):] - thresh = np.concatenate((th_params[:1], np.exp(th_params[1:]))).cumsum() + thresh = np.concatenate((th_params[:1], + np.exp(th_params[1:]))).cumsum() thresh = np.concatenate(([-np.inf], thresh, [np.inf])) return thresh @@ -168,37 +175,39 @@ def transform_reverse_threshold_params(self, params): """ start_ppf = params - thresh_params = np.concatenate((start_ppf[:1], np.log(np.diff(start_ppf[:-1])))) + thresh_params = np.concatenate((start_ppf[:1], + np.log(np.diff(start_ppf[:-1])))) return thresh_params - def predict(self, params, exog=None): """predicted probabilities for each level of the ordinal endog. 
""" - #structure of params = [beta, constants_or_thresholds] + # structure of params = [beta, constants_or_thresholds] # explicit in several steps to avoid bugs th_params = params[-(self.k_levels - 1):] - thresh = np.concatenate((th_params[:1], np.exp(th_params[1:]))).cumsum() + thresh = np.concatenate((th_params[:1], + np.exp(th_params[1:]))).cumsum() thresh = np.concatenate(([-np.inf], thresh, [np.inf])) - xb = self.exog.dot(params[:-(self.k_levels - 1)])[:,None] + xb = self.exog.dot(params[:-(self.k_levels - 1)])[:, None] low = thresh[:-1] - xb upp = thresh[1:] - xb prob = self.prob(low, upp) return prob - def loglike(self, params): - #structure of params = [beta, constants_or_thresholds] + # structure of params = [beta, constants_or_thresholds] - thresh = np.concatenate(([-np.inf], params[-(self.k_levels - 1):], [np.inf])) + thresh = np.concatenate(([-np.inf], + params[-(self.k_levels - 1):], [np.inf])) # explicit in several steps to avoid bugs th_params = params[-(self.k_levels - 1):] - thresh = np.concatenate((th_params[:1], np.exp(th_params[1:]))).cumsum() + thresh = np.concatenate((th_params[:1], + np.exp(th_params[1:]))).cumsum() thresh = np.concatenate(([-np.inf], thresh, [np.inf])) thresh_i_low = thresh[self.endog] thresh_i_upp = thresh[self.endog + 1] @@ -231,6 +240,7 @@ def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, return ordmlefit + class OrderedResults(GenericLikelihoodModelResults): @Appender(GenericLikelihoodModelResults.summary.__doc__) def summary(self, yname=None, xname=None, title=None, alpha=.05, **kwargs): diff --git a/statsmodels/miscmodels/tests/results/results_ordinal_model.py b/statsmodels/miscmodels/tests/results/results_ordinal_model.py index ab6bb277800..d4fb122796c 100644 --- a/statsmodels/miscmodels/tests/results/results_ordinal_model.py +++ b/statsmodels/miscmodels/tests/results/results_ordinal_model.py @@ -7,14 +7,17 @@ import pandas as pd from statsmodels.tools.testing import Holder -# R (v3.4.4) code inspired from https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/ +# R (v3.4.4) code inspired from +# https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/ # library(readr) # to open the file # library(MASS) # to perform ordinal regression # -# ## load the data, 400 rows with 3 exogs(2 binaries, 1 float) and target 3-ordinal variable +# ## load the data, 400 rows with 3 exogs(2 binaries, 1 float) +# ##and target 3-ordinal variable # ologit_ucla <- read_csv("ologit_ucla.csv") # ologit_ucla$apply <- as.factor(ologit_ucla$apply) -# ologit_ucla$apply <- factor(ologit_ucla$apply, levels=c("unlikely", "somewhat likely", "very likely")) +# ologit_ucla$apply <- factor(ologit_ucla$apply, +# levels=c("unlikely", "somewhat likely", "very likely")) # # ## fit ordered logit model # r_logit <- polr(apply ~ pared + public + gpa, @@ -40,7 +43,8 @@ # p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2 # (ctable <- cbind(ctable, "p value" = p)) # ## show 7 first predictions -# head(predict(r, subset(ologit_ucla, select=c("pared", "public","gpa")), type='prob'),7) +# head(predict(r, subset(ologit_ucla, +# select=c("pared", "public","gpa")), type='prob'),7) data_store = Holder() cur_dir = os.path.dirname(os.path.abspath(__file__)) @@ -50,11 +54,13 @@ df_unordered = df.copy() df_unordered['apply'] = pd.Categorical(df['apply'], ordered=False) # but categories are set in order -df_unordered['apply'].cat.set_categories(['unlikely', 'somewhat likely', 'very likely'], inplace=True) +df_unordered['apply'].cat.set_categories( + 
['unlikely', 'somewhat likely', 'very likely'], inplace=True) # df['apply'] is pd.Categorical with ordered = True df['apply'] = pd.Categorical(df['apply'], ordered=True) -df['apply'].cat.set_categories(['unlikely', 'somewhat likely', 'very likely'], inplace=True) +df['apply'].cat.set_categories( + ['unlikely', 'somewhat likely', 'very likely'], inplace=True) data_store.df_unordered = df_unordered data_store.df = df @@ -62,10 +68,12 @@ data_store.n_ordinal_cat = 3 res_ord_logit = Holder() -res_ord_logit.coefficients_val = np.array([1.04769011, -0.05878572, 0.61594057]) +res_ord_logit.coefficients_val = \ + np.array([1.04769011, -0.05878572, 0.61594057]) res_ord_logit.coefficients_stdE = np.array([0.2658, 0.2979, 0.2606]) res_ord_logit.coefficients_tval = np.array([3.9418, -0.1974, 2.3632]) -res_ord_logit.coefficients_pval = np.array([8.087070e-05, 8.435464e-01, 1.811594e-02]) +res_ord_logit.coefficients_pval = \ + np.array([8.087070e-05, 8.435464e-01, 1.811594e-02]) res_ord_logit.thresholds = np.array([2.203915, 4.299363]) res_ord_logit.prob_pred = np.array([[0.5488310, 0.3593310, 0.09183798], [0.3055632, 0.4759496, 0.21848725], @@ -79,7 +87,8 @@ res_ord_probit.coefficients_val = np.array([0.59811, 0.01016, 0.35815]) res_ord_probit.coefficients_stdE = np.array([0.1579, 0.1728, 0.1568]) res_ord_probit.coefficients_tval = np.array([3.78881, 0.05878, 2.28479]) -res_ord_probit.coefficients_pval = np.array([1.513681e-04, 9.531256e-01, 2.232519e-02]) +res_ord_probit.coefficients_pval = \ + np.array([1.513681e-04, 9.531256e-01, 2.232519e-02]) res_ord_probit.thresholds = np.array([1.2968, 2.5028]) res_ord_probit.prob_pred = np.array([[0.5514181, 0.3576848, 0.09089707], [0.3260107, 0.4488799, 0.22510933], @@ -93,7 +102,8 @@ res_ord_cloglog.coefficients_val = np.array([0.5166455, 0.1081131, 0.3343895]) res_ord_cloglog.coefficients_stdE = np.array([0.1613525, 0.1680675, 0.1542065]) res_ord_cloglog.coefficients_tval = np.array([3.2019668, 0.6432721, 2.1684534]) -res_ord_cloglog.coefficients_pval = np.array([1.364927e-03, 5.200475e-01, 3.012421e-02]) +res_ord_cloglog.coefficients_pval = \ + np.array([1.364927e-03, 5.200475e-01, 3.012421e-02]) res_ord_cloglog.thresholds = np.array([0.8705304, 1.9744660]) res_ord_cloglog.prob_pred = np.array([[0.5519526, 0.3592524, 0.08879500], [0.3855287, 0.3842645, 0.23020682], diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 8fbba09973f..4bb86392afa 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -18,12 +18,19 @@ def test_basic(self): res1 = self.res1 res2 = self.res2 # coefficients values, standard errors, t & p values - assert_allclose(res1.params[:-n_cat + 1], res2.coefficients_val, atol=2e-4) - assert_allclose(res1.bse[:-n_cat + 1], res2.coefficients_stdE, rtol=0.003, atol=1e-5) - assert_allclose(res1.tvalues[:-n_cat + 1], res2.coefficients_tval, rtol=0.003, atol=7e-4) - assert_allclose(res1.pvalues[:-n_cat + 1], res2.coefficients_pval, rtol=0.009, atol=1e-5) - # thresholds are given with exponentiated increments from the first threshold - assert_allclose(res1.model.transform_threshold_params(res1.params)[1:-1], res2.thresholds, atol=4e-4) + assert_allclose(res1.params[:-n_cat + 1], + res2.coefficients_val, atol=2e-4) + assert_allclose(res1.bse[:-n_cat + 1], + res2.coefficients_stdE, rtol=0.003, atol=1e-5) + assert_allclose(res1.tvalues[:-n_cat + 1], + res2.coefficients_tval, rtol=0.003, atol=7e-4) + 
assert_allclose(res1.pvalues[:-n_cat + 1], + res2.coefficients_pval, rtol=0.009, atol=1e-5) + # thresholds are given with exponentiated increments + # from the first threshold + assert_allclose( + res1.model.transform_threshold_params(res1.params)[1:-1], + res2.thresholds, atol=4e-4) # probabilities assert_allclose(res1.predict()[:7, :], @@ -82,17 +89,19 @@ def setup_class(cls): distr='logit') resp = modp.fit(method='bfgs', disp=False) # fit with formula - modf = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='logit') + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='logit') resf = modf.fit(method='bfgs', disp=False) # fit on data with ordered=False - modu = OrderedModel(data_unordered['apply'].values.codes, - np.asarray(data_unordered[['pared', 'public', 'gpa']], float), - distr='logit') + modu = OrderedModel( + data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr='logit') resu = modu.fit(method='bfgs', disp=False) from .results.results_ordinal_model import res_ord_logit as res2 @@ -120,17 +129,19 @@ def setup_class(cls): distr='probit') resp = modp.fit(method='bfgs', disp=False) - modf = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='probit') + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') resf = modf.fit(method='bfgs', disp=False) - modu = OrderedModel(data_unordered['apply'].values.codes, - np.asarray(data_unordered[['pared', 'public', 'gpa']], float), - distr='probit') + modu = OrderedModel( + data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr='probit') resu = modu.fit(method='bfgs', disp=False) from .results.results_ordinal_model import res_ord_probit as res2 @@ -168,17 +179,19 @@ def _cdf(self, x): distr=cloglog) resp = modp.fit(method='bfgs', disp=False) - modf = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr=cloglog) + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr=cloglog) resf = modf.fit(method='bfgs', disp=False) - modu = OrderedModel(data_unordered['apply'].values.codes, - np.asarray(data_unordered[['pared', 'public', 'gpa']], float), - distr=cloglog) + modu = OrderedModel( + data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr=cloglog) resu = modu.fit(method='bfgs', disp=False) from .results.results_ordinal_model import res_ord_cloglog as res2 From e00c2d583d095a917f7f28bfaf04a48cf867fdf1 Mon Sep 17 00:00:00 2001 From: Bolzano-Weierstrass Date: Tue, 25 Aug 2020 12:19:35 +0200 Subject: [PATCH 12/24] ENH: small fixes + example notebook --- examples/notebooks/ordinal_regression.ipynb | 247 ++++++++++++++++++++ 
statsmodels/miscmodels/ordinal_model.py     |   7 +-
 2 files changed, 250 insertions(+), 4 deletions(-)
 create mode 100644 examples/notebooks/ordinal_regression.ipynb

diff --git a/examples/notebooks/ordinal_regression.ipynb b/examples/notebooks/ordinal_regression.ipynb
new file mode 100644
index 00000000000..818b93080aa
--- /dev/null
+++ b/examples/notebooks/ordinal_regression.ipynb
@@ -0,0 +1,247 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Ordinal Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy.stats as stats\n",
+    "\n",
+    "from statsmodels.miscmodels.ordinal_model import OrderedModel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading a Stata data file from the UCLA website. This notebook is inspired by https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/, which is an R notebook from UCLA."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://stats.idre.ucla.edu/stat/data/ologit.dta\"\n",
+    "data_student = pd.read_stata(url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_student.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_student.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_student['apply'].dtype"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This dataset is about the probability for undergraduate students to apply to graduate school given three exogenous variables:\n",
+    "- their grade point average (`gpa`), a float between 0 and 4.\n",
+    "- `pared`, a binary that indicates if at least one parent went to graduate school.\n",
+    "- `public`, a binary that indicates if the student's current undergraduate institution is public or private.\n",
+    "\n",
+    "`apply`, the target variable, is categorical with ordered categories: `unlikely` < `somewhat likely` < `very likely`. It is a `pd.Series` of categorical type; this is preferred over NumPy arrays."
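For readers preparing their own data, a minimal sketch (with hypothetical labels and values) of encoding string responses as an ordered `pd.Categorical`, the endog type the model prefers:

```python
import pandas as pd

# hypothetical responses; the category order must be declared explicitly
ser = pd.Series(["unlikely", "very likely", "somewhat likely", "unlikely"])
cat_type = pd.CategoricalDtype(
    categories=["unlikely", "somewhat likely", "very likely"], ordered=True)
ser = ser.astype(cat_type)
# integer codes follow the declared order: [0, 2, 1, 0]
print(ser.cat.codes.tolist())
```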
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The model is based on a numerical latent variable $y_{latent}$ that we cannot observe but that we can model through the exogenous variables.\n",
+    "Moreover, we can use this $y_{latent}$ to define the $y$ that we do observe.\n",
+    "\n",
+    "For more details see the documentation of OrderedModel, [the UCLA webpage](https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/) or this [book](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470594001).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Probit ordinal regression:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod_prob = OrderedModel(data_student['apply'],\n",
+    "                        data_student[['pared', 'public', 'gpa']],\n",
+    "                        distr='probit')\n",
+    "\n",
+    "res_prob = mod_prob.fit(method='bfgs')\n",
+    "res_prob.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In our model, we have 3 exogenous variables (the $\\beta$s if we keep the documentation's notation) so we have 3 coefficients that need to be estimated.\n",
+    "\n",
+    "These 3 estimates and their standard errors can be retrieved in the summary table.\n",
+    "\n",
+    "Since there are 3 categories in the target variable (`unlikely`, `somewhat likely`, `very likely`), we have two thresholds to estimate. \n",
+    "As explained in the documentation of the method `OrderedModel.transform_threshold_params`, the first estimated threshold is the actual value and all the other thresholds are in terms of cumulative exponentiated increments. Actual threshold values can be computed as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_of_thresholds = 2\n",
+    "mod_prob.transform_threshold_params(res_prob.params[-num_of_thresholds:])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Logit ordinal regression:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod_log = OrderedModel(data_student['apply'],\n",
+    "                        data_student[['pared', 'public', 'gpa']],\n",
+    "                        distr='logit')\n",
+    "\n",
+    "res_log = mod_log.fit(method='bfgs', disp=False)\n",
+    "res_log.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predicted = res_log.model.predict(res_log.params, exog=data_student[['pared', 'public', 'gpa']])\n",
+    "predicted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pred_choice = predicted.argmax(1)\n",
+    "print('Fraction of correct choice predictions')\n",
+    "print((np.asarray(data_student['apply'].values.codes) == pred_choice).mean())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Ordinal regression with a custom cumulative cLogLog distribution:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In addition to `logit` and `probit` regression, any continuous distribution from the `scipy.stats` package can be used for the `distr` argument. Alternatively, one can define one's own distribution simply by creating a subclass of `rv_continuous` and implementing a few methods."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# using a SciPy distribution\n", + "res_exp = OrderedModel(data_student['apply'],\n", + " data_student[['pared', 'public', 'gpa']],\n", + " distr=stats.expon).fit(method='bfgs', disp=False)\n", + "res_exp.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# minimal definition of a custom scipy distribution.\n", + "class CLogLog(stats.rv_continuous):\n", + " def _ppf(self, q):\n", + " return np.log(-np.log(1 - q))\n", + "\n", + " def _cdf(self, x):\n", + " return 1 - np.exp(-np.exp(x))\n", + "\n", + "\n", + "cloglog = CLogLog()\n", + "\n", + "# definition of the model and fitting\n", + "res_cloglog = OrderedModel(data_student['apply'],\n", + " data_student[['pared', 'public', 'gpa']],\n", + " distr=cloglog).fit(method='bfgs', disp=False)\n", + "res_cloglog.summary()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 679a60e53a1..e6a5a1d19fc 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -106,7 +106,7 @@ def _check_inputs(self, endog, exog): # Pandas' support if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)): - exog_name = (exog.name if isinstance(exog, pd.Series) + exog_name = ([exog.name] if isinstance(exog, pd.Series) else exog.columns.tolist()) names['xname'] = exog_name exog = np.asarray(exog) @@ -243,7 +243,6 @@ def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, class OrderedResults(GenericLikelihoodModelResults): @Appender(GenericLikelihoodModelResults.summary.__doc__) - def summary(self, yname=None, xname=None, title=None, alpha=.05, **kwargs): + def summary(self, yname=None, xname=None, title=None, alpha=.05): names = self.model.names - print(names) - return super(OrderedResults, self).summary(**names, **kwargs) + return super(OrderedResults, self).summary(**names) From 7a5f16ada5abe3a18d4b6c087b24ade85bde44cf Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Thu, 3 Sep 2020 19:22:21 -0400 Subject: [PATCH 13/24] ENH/REF: add offset, scoreobs, DRY loglike and related --- statsmodels/miscmodels/ordinal_model.py | 99 ++++++++++++++++--- .../miscmodels/tests/test_ordinal_model.py | 21 ++++ 2 files changed, 107 insertions(+), 13 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index e6a5a1d19fc..94ea42da8f4 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -70,7 +70,7 @@ class OrderedModel(GenericLikelihoodModel): """ - def __init__(self, endog, exog, distr='probit', **kwds): + def __init__(self, endog, exog, offset=None, distr='probit', **kwds): if distr == 'probit': self.distr = stats.norm @@ -79,6 +79,12 @@ def __init__(self, endog, exog, distr='probit', **kwds): else: self.distr = distr + if offset is not None: + offset = np.asarray(offset) + + # TODO: check if super can handle offset + self.offset = offset + self.names, 
endog, exog = self._check_inputs(endog, exog)
 
         super(OrderedModel, self).__init__(endog, exog, **kwds)
 
@@ -88,7 +94,10 @@ def __init__(self, endog, exog, distr='probit', **kwds):
         self.endog = index
         self.labels = unique
 
-        self.k_vars = self.exog.shape[1]
+        if self.exog is not None:
+            self.nobs, self.k_vars = self.exog.shape
+        else:  # no exog in model
+            self.nobs, self.k_vars = self.endog.shape[0], 0
         self.results_class = OrderedResults
 
     def _check_inputs(self, endog, exog):
@@ -140,6 +149,11 @@ def cdf(self, x):
         """
         return self.distr.cdf(x)
 
+    def pdf(self, x):
+        """pdf evaluated at x
+        """
+        return self.distr.pdf(x)
+
     def prob(self, low, upp):
         """interval probability
         """
@@ -184,6 +198,8 @@ def predict(self, params, exog=None):
 
         """
 
+        if exog is None:
+            exog = self.exog
         # structure of params = [beta, constants_or_thresholds]
 
         # explicit in several steps to avoid bugs
@@ -191,32 +207,89 @@ def predict(self, params, exog=None):
         thresh = np.concatenate((th_params[:1],
                                  np.exp(th_params[1:]))).cumsum()
         thresh = np.concatenate(([-np.inf], thresh, [np.inf]))
-        xb = self.exog.dot(params[:-(self.k_levels - 1)])[:, None]
+        xb = exog.dot(params[:-(self.k_levels - 1)])[:, None]
         low = thresh[:-1] - xb
         upp = thresh[1:] - xb
         prob = self.prob(low, upp)
         return prob
 
-    def loglike(self, params):
+    def _linpred(self, params, exog=None, offset=None):
+        """linear prediction of latent variable `x b`
 
-        # structure of params = [beta, constants_or_thresholds]
+        currently only for exog from estimation sample (in-sample)
+        """
+        if exog is None:
+            exog = self.exog
+        if offset is None:
+            offset = self.offset
+        if exog is not None:
+            linpred = self.exog.dot(params[:-(self.k_levels - 1)])
+        else:  # means self.exog is also None
+            linpred = np.zeros(self.nobs)
+        if offset is not None:
+            linpred += offset
+        return linpred
+
+    def _bounds(self, params):
+        thresh = self.transform_threshold_params(params)
 
-        thresh = np.concatenate(([-np.inf],
-                                 params[-(self.k_levels - 1):], [np.inf]))
+        thresh_i_low = thresh[self.endog]
+        thresh_i_upp = thresh[self.endog + 1]
+        xb = self._linpred(params)
+        low = thresh_i_low - xb
+        upp = thresh_i_upp - xb
+        return low, upp
+
+    def loglike(self, params):
+
+        thresh = self.transform_threshold_params(params)
 
-        # explicit in several steps to avoid bugs
-        th_params = params[-(self.k_levels - 1):]
-        thresh = np.concatenate((th_params[:1],
-                                 np.exp(th_params[1:]))).cumsum()
-        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))
         thresh_i_low = thresh[self.endog]
         thresh_i_upp = thresh[self.endog + 1]
-        xb = self.exog.dot(params[:-(self.k_levels - 1)])
+        xb = self._linpred(params)
         low = thresh_i_low - xb
         upp = thresh_i_upp - xb
         prob = self.prob(low, upp)
         return np.log(prob + 1e-20).sum()
 
+    def loglikeobs(self, params):
+
+        low, upp = self._bounds(params)
+        prob = self.prob(low, upp)
+        return np.log(prob + 1e-20)
+
+    def score_obs_(self, params):
+        """score, first derivative of loglike for each observations
+
+        This currently only implements the derivative with respect to the
+        exog parameters, but not with respect to threshold-constant parameters.
+
+        """
+        low, upp = self._bounds(params)
+
+        prob = self.prob(low, upp)
+        pdf_upp = self.pdf(upp)
+        pdf_low = self.pdf(low)
+
+        # TODO the following doesn't work yet because of the incremental exp
+        # parameterization. The following was written based on Greene for the
+        # simple non-incremental parameterization. 
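As a cross-check of the `transform_threshold_params`/`_bounds`/`loglike` chain above, here is a minimal self-contained sketch (toy numbers, probit case) of the same computation: cutpoints are the first threshold plus cumulated exponentiated increments, and each observation contributes the log of its interval probability F(upp) - F(low).

```python
import numpy as np
from scipy import stats

# toy data: 4 observations, 1 regressor, 3 ordered levels coded 0, 1, 2
endog = np.array([0, 1, 2, 1])
exog = np.array([[0.1], [0.3], [2.0], [0.7]])
# params = [beta, first cutoff, log of the increment to the second cutoff]
params = np.array([0.5, -0.2, 0.8])

th = params[-2:]
thresh = np.concatenate(([-np.inf],
                         np.concatenate((th[:1], np.exp(th[1:]))).cumsum(),
                         [np.inf]))
xb = exog @ params[:-2]
low = thresh[endog] - xb       # lower cutoff of each observed interval
upp = thresh[endog + 1] - xb   # upper cutoff
loglike = np.log(stats.norm.cdf(upp) - stats.norm.cdf(low) + 1e-20).sum()
print(loglike)
```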
+ # k = self.k_levels - 1 + # idx = self.endog + # score_factor = np.zeros((self.nobs, k + 1 + 2)) #+2 avoids idx bounds + # + # rows = np.arange(self.nobs) + # shift = 1 + # score_factor[rows, shift + idx-1] = -pdf_low + # score_factor[rows, shift + idx] = pdf_upp + # score_factor[:, 0] = pdf_upp - pdf_low + score_factor = (pdf_upp - pdf_low)[:, None] + score_factor /= prob[:, None] + + so = np.column_stack((-score_factor[:, :1] * self.exog, + score_factor[:, 1:])) + return so + @property def start_params(self): # start params based on model without exog diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 4bb86392afa..7e4ecf4a027 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -151,6 +151,27 @@ def setup_class(cls): cls.resf = resf cls.resu = resu + def test_loglikerelated(self): + + res1 = self.res1 + # res2 = self.res2 + + mod = res1.model + fact = 1.1 # evaluate away from optimum + score1 = mod.score(res1.params * fact) + score_obs_numdiff = mod.score_obs(res1.params * fact) + score_obs_exog = mod.score_obs_(res1.params * fact) + assert_allclose(score_obs_numdiff.sum(0), score1, rtol=1e-8) + assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], rtol=1e-7) + + # null model + mod_null = OrderedModel(mod.endog, None, + offset=np.zeros(mod.nobs), + distr='probit') + null_params = mod.start_params + res_null = mod_null.fit(method='bfgs', disp=False) + assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8) + class TestCLogLogModel(CheckOrdinalModelMixin): From 3e232ba57d194308e17b68b946e3384e115dcffb Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Fri, 4 Sep 2020 22:29:57 -0400 Subject: [PATCH 14/24] ENH/TST: add pred_table, adjust and add unit tests --- statsmodels/examples/ex_ordered_model.py | 4 +-- statsmodels/miscmodels/ordinal_model.py | 18 +++++++++- .../miscmodels/tests/results/__init__.py | 0 .../miscmodels/tests/test_ordinal_model.py | 33 +++++++++++++++++-- 4 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 statsmodels/miscmodels/tests/results/__init__.py diff --git a/statsmodels/examples/ex_ordered_model.py b/statsmodels/examples/ex_ordered_model.py index b06c3c569ae..a03f25a48c8 100644 --- a/statsmodels/examples/ex_ordered_model.py +++ b/statsmodels/examples/ex_ordered_model.py @@ -24,8 +24,8 @@ print(np.bincount(y)) mod = OrderedModel(y, x) -start_params = np.ones(k_vars + 4) -start_params = np.concatenate((np.ones(k_vars), np.arange(4))) +# start_params = np.ones(k_vars + 4) +# start_params = np.concatenate((np.ones(k_vars), np.arange(4))) start_ppf = stats.norm.ppf((np.bincount(y) / len(y)).cumsum()) start_threshold = np.concatenate((start_ppf[:1], np.log(np.diff(start_ppf[:-1])))) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 94ea42da8f4..c0922dad0bd 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -118,7 +118,7 @@ def _check_inputs(self, endog, exog): exog_name = ([exog.name] if isinstance(exog, pd.Series) else exog.columns.tolist()) names['xname'] = exog_name - exog = np.asarray(exog) + # exog = np.asarray(exog) if isinstance(endog, pd.Series): if isinstance(endog.dtypes, CategoricalDtype): @@ -315,6 +315,22 @@ def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, class OrderedResults(GenericLikelihoodModelResults): + + def pred_table(self): + """prediction 
table + + returns pandas DataFrame + + """ + # todo: add category labels + categories = np.arange(self.model.k_levels) + observed = pd.Categorical(self.model.endog, + categories=categories, ordered=True) + predicted = pd.Categorical(self.predict().argmax(1), + categories=categories, ordered=True) + table = pd.crosstab(observed, predicted, margins=True, dropna=False) + return table + @Appender(GenericLikelihoodModelResults.summary.__doc__) def summary(self, yname=None, xname=None, title=None, alpha=.05): names = self.model.names diff --git a/statsmodels/miscmodels/tests/results/__init__.py b/statsmodels/miscmodels/tests/results/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 7e4ecf4a027..28e14403660 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -5,7 +5,7 @@ import numpy as np import scipy.stats as stats -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_equal from .results.results_ordinal_model import data_store as ds from statsmodels.miscmodels.ordinal_model import OrderedModel @@ -70,6 +70,27 @@ def test_unordered(self): assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10) assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10) + def test_results_other(self): + + res1 = self.res1 + + # results + if hasattr(self, "pred_table"): + table = res1.pred_table() + assert_equal(table.values, self.pred_table) + + # smoke test + res1.summary() + + # inherited + tt = res1.t_test(np.eye(len(res1.params))) + assert_allclose(tt.pvalue, res1.pvalues, rtol=1e-13) + # TODO: test using string definition of constraints + + pred = res1.predict(exog=res1.model.exog[-5:]) + fitted = res1.predict() + assert_allclose(pred, fitted[-5:], rtol=1e-13) + class TestLogitModel(CheckOrdinalModelMixin): @@ -151,6 +172,12 @@ def setup_class(cls): cls.resf = resf cls.resu = resu + # regression numbers + cls.pred_table = np.array([[202, 18, 0, 220], + [112, 28, 0, 140], + [ 27, 13, 0, 40], # noqa + [341, 59, 0, 400]], dtype=np.int64) + def test_loglikerelated(self): res1 = self.res1 @@ -161,8 +188,8 @@ def test_loglikerelated(self): score1 = mod.score(res1.params * fact) score_obs_numdiff = mod.score_obs(res1.params * fact) score_obs_exog = mod.score_obs_(res1.params * fact) - assert_allclose(score_obs_numdiff.sum(0), score1, rtol=1e-8) - assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], rtol=1e-7) + assert_allclose(score_obs_numdiff.sum(0), score1, atol=1e-7) + assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], atol=1e-7) # null model mod_null = OrderedModel(mod.endog, None, From 7152d11f6242fb0c5863f60620c1adca8e316370 Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Sat, 5 Sep 2020 17:01:48 -0400 Subject: [PATCH 15/24] ENH: properly connect param_names, give pandas codes endog to super --- statsmodels/miscmodels/ordinal_model.py | 59 ++++++++++--------- .../miscmodels/tests/test_ordinal_model.py | 17 +++++- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index c0922dad0bd..590c1000216 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -85,19 +85,32 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): # TODO: check if super can handle offset self.offset = offset - 
self.names, endog, exog = self._check_inputs(endog, exog) + endog, labels, is_pandas = self._check_inputs(endog, exog) super(OrderedModel, self).__init__(endog, exog, **kwds) - unique, index = np.unique(self.endog, return_inverse=True) - self.k_levels = len(unique) - self.endog = index - self.labels = unique + if not is_pandas: + unique, index = np.unique(self.endog, return_inverse=True) + self.endog = index + labels = unique + + self.labels = labels + self.k_levels = len(labels) if self.exog is not None: self.nobs, self.k_vars = self.exog.shape else: # no exog in model self.nobs, self.k_vars = self.endog.shape[0], 0 + + threshold_names = [str(x) + '/' + str(y) + for x, y in zip(labels[:-1], labels[1:])] + + # from GenericLikelihoodModel.fit + if self.exog is not None: + self.exog_names.extend(threshold_names) + else: + self.data.xnames = threshold_names + self.results_class = OrderedResults def _check_inputs(self, endog, exog): @@ -106,20 +119,14 @@ def _check_inputs(self, endog, exog): support for endog and exog. Also retrieves columns & categories names for .summary() of the results class. """ - names = {} if not isinstance(self.distr, stats.rv_continuous): msg = ( f"{self.distr.name} must be a scipy.stats distribution." ) raise ValueError(msg) - # Pandas' support - if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)): - exog_name = ([exog.name] if isinstance(exog, pd.Series) - else exog.columns.tolist()) - names['xname'] = exog_name - # exog = np.asarray(exog) - + labels = None + is_pandas = False if isinstance(endog, pd.Series): if isinstance(endog.dtypes, CategoricalDtype): if not endog.dtype.ordered: @@ -129,20 +136,19 @@ def _check_inputs(self, endog, exog): "categories. ordered == True preferred.", Warning) endog_name = endog.name - threshold_name = [str(x) + '/' + str(y) - for x, y in zip(endog.values.categories[:-1], - endog.values.categories[1:])] - names['yname'] = endog_name - names['xname'] = names['xname'] + threshold_name - endog = np.asarray(endog.values.codes) + labels = endog.values.categories + endog = endog.cat.codes + if endog.min() == -1: # means there is a missing value + raise ValueError("missing values in categorical endog are " + "not supported") + endog.name = endog_name + is_pandas = True else: - msg = ( - "If the endog is a pandas.Serie " - "it must be of categoricalDtype." 
- ) + msg = ("If endog is a pandas.Series, " + "it must be of CategoricalDtype.") raise ValueError(msg) - return names, endog, exog + return endog, labels, is_pandas def cdf(self, x): """cdf evaluated at x @@ -330,8 +336,3 @@ def pred_table(self): categories=categories, ordered=True) table = pd.crosstab(observed, predicted, margins=True, dropna=False) return table - - @Appender(GenericLikelihoodModelResults.summary.__doc__) - def summary(self, yname=None, xname=None, title=None, alpha=.05): - names = self.model.names - return super(OrderedResults, self).summary(**names) diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 28e14403660..9fe9612b86d 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -72,7 +72,16 @@ def test_unordered(self): def test_results_other(self): - res1 = self.res1 + res1 = self.res1 # numpy + resp = self.resp # pandas + + param_names_np = ['x1', 'x2', 'x3', '0/1', '1/2'] + param_names_pd = ['pared', 'public', 'gpa', 'unlikely/somewhat likely', + 'somewhat likely/very likely'] + + assert res1.model.data.param_names == param_names_np + assert self.resp.model.data.param_names == param_names_pd + assert self.resp.model.endog_names == "apply" # results if hasattr(self, "pred_table"): @@ -85,7 +94,9 @@ def test_results_other(self): # inherited tt = res1.t_test(np.eye(len(res1.params))) assert_allclose(tt.pvalue, res1.pvalues, rtol=1e-13) - # TODO: test using string definition of constraints + + tt = resp.t_test(['pared', 'public', 'gpa']) # pandas names + assert_allclose(tt.pvalue, res1.pvalues[:3], rtol=1e-13) pred = res1.predict(exog=res1.model.exog[-5:]) fitted = res1.predict() @@ -194,7 +205,7 @@ def test_loglikerelated(self): # null model mod_null = OrderedModel(mod.endog, None, offset=np.zeros(mod.nobs), - distr='probit') + distr=mod.distr) null_params = mod.start_params res_null = mod_null.fit(method='bfgs', disp=False) assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8) From dcff9416cad85ea50314c744f7108908a3e90280 Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Tue, 15 Sep 2020 11:14:51 -0400 Subject: [PATCH 16/24] ENH/REF: support pd ordered Categorical in formula --- statsmodels/miscmodels/ordinal_model.py | 87 +++++++++++++++---- .../miscmodels/tests/test_ordinal_model.py | 25 ++++++ 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 590c1000216..10f61d986f9 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -10,8 +10,8 @@ import pandas as pd from pandas.api.types import CategoricalDtype from scipy import stats -from statsmodels.base.model import GenericLikelihoodModel, \ - GenericLikelihoodModelResults +from statsmodels.base.model import ( + GenericLikelihoodModel, GenericLikelihoodModelResults, LikelihoodModel) from statsmodels.compat.pandas import Appender @@ -23,7 +23,8 @@ class OrderedModel(GenericLikelihoodModel): The mode assumes that the endogenous variable is ordered but that the labels have no numeric interpretation besides the ordering. - The model is based on a latent linear variable, where we observe only + The model is based on a latent linear variable, where we observe only a + discretization. y_latent = X beta + u @@ -46,18 +47,18 @@ class OrderedModel(GenericLikelihoodModel): distribution. 
We use standardized distributions to avoid identifiability problems. - Parameters ---------- endog : array_like - endogenous or dependent ordered categorical variable with k levels. + Endogenous or dependent ordered categorical variable with k levels. Labels or values of endog will internally transformed to consecutive integers, 0, 1, 2, ... pd.Series with Categorical as dtype should be preferred as it gives the order relation between the levels. + If endog is not a pandas Categorical, then categories are + sorted in lexicographic order (by numpy.unique). exog : array_like - exogenous explanatory variables. This should not include an intercept. - (TODO: verify) + Exogenous, explanatory variables. This should not include an intercept. pd.DataFrame are also accepted. distr : string 'probit' or 'logit', or a distribution instance The default is currently 'probit' which uses the normal distribution @@ -69,6 +70,7 @@ class OrderedModel(GenericLikelihoodModel): Status: initial version, subclasses `GenericLikelihoodModel` """ + _formula_max_endog = np.inf def __init__(self, endog, exog, offset=None, distr='probit', **kwds): @@ -87,12 +89,24 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): endog, labels, is_pandas = self._check_inputs(endog, exog) + frame = kwds.pop("frame", None) super(OrderedModel, self).__init__(endog, exog, **kwds) + if frame is not None: + self.data.frame = frame + if not is_pandas: - unique, index = np.unique(self.endog, return_inverse=True) - self.endog = index - labels = unique + # TODO: maybe handle 2-dim endog obtained from formula + if self.endog.ndim == 1: + unique, index = np.unique(self.endog, return_inverse=True) + self.endog = index + labels = unique + elif self.endog.ndim == 2: + endog_, labels, ynames = self._handle_formula_categorical() + # replace endog with categorical + self.endog = endog_ + # fix yname + self.data.ynames = ynames self.labels = labels self.k_levels = len(labels) @@ -114,11 +128,13 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): self.results_class = OrderedResults def _check_inputs(self, endog, exog): + """handle endog that is pandas Categorical + + checks if self.distrib is legal and does the Pandas Categorical + support for endog. """ - checks if self.distrib is legal and does the Pandas - support for endog and exog. Also retrieves columns & categories - names for .summary() of the results class. - """ + + # TOCO: maybe remove this if we want to have duck distributions if not isinstance(self.distr, stats.rv_continuous): msg = ( f"{self.distr.name} must be a scipy.stats distribution." @@ -135,6 +151,7 @@ def _check_inputs(self, endog, exog): "risk of capturing a wrong order for the " "categories. 
ordered == True preferred.", Warning) + endog_name = endog.name labels = endog.values.categories endog = endog.cat.codes @@ -143,13 +160,45 @@ def _check_inputs(self, endog, exog): "not supported") endog.name = endog_name is_pandas = True - else: - msg = ("If endog is a pandas.Series, " - "it must be of CategoricalDtype.") - raise ValueError(msg) +# else: +# msg = ("If endog is a pandas.Series, " +# "it must be of CategoricalDtype.") +# raise ValueError(msg) return endog, labels, is_pandas + def _handle_formula_categorical(self): + """handle 2dim endog, + + raise if not from formula with pandas ordered Categorical endog + + """ + # get info about formula and original data + if not hasattr(self.data.orig_endog, "design_info"): + msg = "2-dim endog are not supported" + raise ValueError(msg) + + di_endog = self.data.orig_endog.design_info + if len(di_endog.terms) > 1: + raise ValueError("more than one term in endog") + + factor = list(di_endog.factor_infos.values())[0] + labels = factor.categories + name = factor.state["eval_code"] + original_endog = self.data.frame[name] + if not (isinstance(original_endog.dtype, CategoricalDtype) + and original_endog.dtype.ordered): + msg = ("Only ordered pandas Categorical are supported as endog " + "in formulas") + raise ValueError(msg) + + # Now we should only have an ordered pandas Categorical + + endog = self.endog.argmax(1) + # fix yname + ynames = name + return endog, labels, ynames + def cdf(self, x): """cdf evaluated at x """ @@ -268,7 +317,7 @@ def score_obs_(self, params): """score, first derivative of loglike for each observations This currently only implements the derivative with respect to the - exog parameters, but not with respect to threshold-constant parameters. + exog parameters, but not with respect to threshold parameters. 
""" low, upp = self._bounds(params) diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 9fe9612b86d..180f6834058 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -4,6 +4,7 @@ import numpy as np import scipy.stats as stats +import pytest from numpy.testing import assert_allclose, assert_equal from .results.results_ordinal_model import data_store as ds @@ -210,6 +211,30 @@ def test_loglikerelated(self): res_null = mod_null.fit(method='bfgs', disp=False) assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8) + def test_formula_categorical(self): + + resp = self.resp + data = ds.df + + "apply ~ pared + public + gpa - 1" + modf2 = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", + data, distr='probit') + resf2 = modf2.fit(method='bfgs', disp=False) + assert_allclose(resf2.params, resp.params, atol=1e-8) + assert modf2.exog_names == resp.model.exog_names + assert modf2.data.ynames == resp.model.data.ynames + assert hasattr(modf2.data, "frame") + assert not hasattr(modf2, "frame") + + with pytest.raises(ValueError): + OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": np.asarray(data['apply']), + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') + class TestCLogLogModel(CheckOrdinalModelMixin): From 0241fd52a74d68ef9a818c5ef475f7ae9f49c8fb Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Tue, 15 Sep 2020 12:54:52 -0400 Subject: [PATCH 17/24] fix formula for pandas categorical --- statsmodels/miscmodels/ordinal_model.py | 84 +++++++++++++++++-------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 10f61d986f9..fa72c02665e 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -89,12 +89,8 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): endog, labels, is_pandas = self._check_inputs(endog, exog) - frame = kwds.pop("frame", None) super(OrderedModel, self).__init__(endog, exog, **kwds) - if frame is not None: - self.data.frame = frame - if not is_pandas: # TODO: maybe handle 2-dim endog obtained from formula if self.endog.ndim == 1: @@ -102,28 +98,17 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): self.endog = index labels = unique elif self.endog.ndim == 2: - endog_, labels, ynames = self._handle_formula_categorical() - # replace endog with categorical - self.endog = endog_ - # fix yname - self.data.ynames = ynames - - self.labels = labels - self.k_levels = len(labels) - - if self.exog is not None: - self.nobs, self.k_vars = self.exog.shape - else: # no exog in model - self.nobs, self.k_vars = self.endog.shape[0], 0 - - threshold_names = [str(x) + '/' + str(y) - for x, y in zip(labels[:-1], labels[1:])] - - # from GenericLikelihoodModel.fit - if self.exog is not None: - self.exog_names.extend(threshold_names) - else: - self.data.xnames = threshold_names + labels = [str(i) for i in range(self.endog.shape[1])] + labels = [] + #self.endog = self.endog.argmax(1) + # endog_ = self.endog.argmax(1) +# endog_, labels, ynames = self._handle_formula_categorical() +# # replace endog with categorical +# self.endog = endog_ +# # fix yname +# self.data.ynames = ynames + + self._initialize_labels(labels) self.results_class = OrderedResults @@ -167,6 +152,25 @@ def 
_check_inputs(self, endog, exog): return endog, labels, is_pandas + def _initialize_labels(self, labels): + self.labels = labels + self.k_levels = len(labels) + + if self.exog is not None: + self.nobs, self.k_vars = self.exog.shape + else: # no exog in model + self.nobs, self.k_vars = self.endog.shape[0], 0 + + threshold_names = [str(x) + '/' + str(y) + for x, y in zip(labels[:-1], labels[1:])] + + # from GenericLikelihoodModel.fit + if self.exog is not None: + # avoid extending several times + self.exog_names.extend(threshold_names) + else: + self.data.xnames = threshold_names + def _handle_formula_categorical(self): """handle 2dim endog, @@ -199,6 +203,34 @@ def _handle_formula_categorical(self): ynames = name return endog, labels, ynames + @classmethod + def from_formula(cls, formula, data, subset=None, drop_cols=None, + *args, **kwargs): + + if "0+" not in formula.replace(" ", ""): + import warnings + warnings.warn("Ordinal models should not include an intercept") + + endog_name = formula.split("~")[0].strip() + original_endog = data[endog_name] + + model = super(OrderedModel, cls).from_formula( + formula, data=data, *args, **kwargs) + + if model.endog.ndim == 2: + if not (isinstance(original_endog.dtype, CategoricalDtype) + and original_endog.dtype.ordered): + msg = ("Only ordered pandas Categorical are supported as endog" + " in formulas") + raise ValueError(msg) + + labels = original_endog.values.categories + model._initialize_labels(labels) + model.endog = model.endog.argmax(1) + model.data.ynames = endog_name + + return model + def cdf(self, x): """cdf evaluated at x """ From c1664d989c4eea864180e69b200bef687494a234 Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Wed, 16 Sep 2020 11:06:50 -0400 Subject: [PATCH 18/24] CLN: cleanup, remove now redundant code and comments for formulas --- statsmodels/miscmodels/ordinal_model.py | 66 ++++++------------------- 1 file changed, 14 insertions(+), 52 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index fa72c02665e..188b121a12e 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -11,7 +11,7 @@ from pandas.api.types import CategoricalDtype from scipy import stats from statsmodels.base.model import ( - GenericLikelihoodModel, GenericLikelihoodModelResults, LikelihoodModel) + GenericLikelihoodModel, GenericLikelihoodModelResults) from statsmodels.compat.pandas import Appender @@ -84,7 +84,6 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): if offset is not None: offset = np.asarray(offset) - # TODO: check if super can handle offset self.offset = offset endog, labels, is_pandas = self._check_inputs(endog, exog) @@ -92,21 +91,19 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): super(OrderedModel, self).__init__(endog, exog, **kwds) if not is_pandas: - # TODO: maybe handle 2-dim endog obtained from formula if self.endog.ndim == 1: unique, index = np.unique(self.endog, return_inverse=True) self.endog = index labels = unique elif self.endog.ndim == 2: + if not hasattr(self, "design_info"): + raise ValueError("2-dim endog not supported") + # this branch is currently only in support of from_formula + # labels here are only needed to choose k_levels in initialize labels = [str(i) for i in range(self.endog.shape[1])] labels = [] - #self.endog = self.endog.argmax(1) - # endog_ = self.endog.argmax(1) -# endog_, labels, ynames = self._handle_formula_categorical() -# # replace endog with 
categorical -# self.endog = endog_ -# # fix yname -# self.data.ynames = ynames + # Note: Doing the following here would break from_formula + # self.endog = self.endog.argmax(1) self._initialize_labels(labels) @@ -119,19 +116,18 @@ def _check_inputs(self, endog, exog): support for endog. """ - # TOCO: maybe remove this if we want to have duck distributions if not isinstance(self.distr, stats.rv_continuous): + import warnings msg = ( - f"{self.distr.name} must be a scipy.stats distribution." + f"{self.distr.name} is not a scipy.stats distribution." ) - raise ValueError(msg) + raise warnings.warn(msg) labels = None is_pandas = False if isinstance(endog, pd.Series): if isinstance(endog.dtypes, CategoricalDtype): if not endog.dtype.ordered: - import warnings warnings.warn("the endog has ordered == False, " "risk of capturing a wrong order for the " "categories. ordered == True preferred.", @@ -145,10 +141,6 @@ def _check_inputs(self, endog, exog): "not supported") endog.name = endog_name is_pandas = True -# else: -# msg = ("If endog is a pandas.Series, " -# "it must be of CategoricalDtype.") -# raise ValueError(msg) return endog, labels, is_pandas @@ -167,42 +159,12 @@ def _initialize_labels(self, labels): # from GenericLikelihoodModel.fit if self.exog is not None: # avoid extending several times + if len(self.exog_names) > self.k_vars: + raise RuntimeError("something wrong with exog_names, too long") self.exog_names.extend(threshold_names) else: self.data.xnames = threshold_names - def _handle_formula_categorical(self): - """handle 2dim endog, - - raise if not from formula with pandas ordered Categorical endog - - """ - # get info about formula and original data - if not hasattr(self.data.orig_endog, "design_info"): - msg = "2-dim endog are not supported" - raise ValueError(msg) - - di_endog = self.data.orig_endog.design_info - if len(di_endog.terms) > 1: - raise ValueError("more than one term in endog") - - factor = list(di_endog.factor_infos.values())[0] - labels = factor.categories - name = factor.state["eval_code"] - original_endog = self.data.frame[name] - if not (isinstance(original_endog.dtype, CategoricalDtype) - and original_endog.dtype.ordered): - msg = ("Only ordered pandas Categorical are supported as endog " - "in formulas") - raise ValueError(msg) - - # Now we should only have an ordered pandas Categorical - - endog = self.endog.argmax(1) - # fix yname - ynames = name - return endog, labels, ynames - @classmethod def from_formula(cls, formula, data, subset=None, drop_cols=None, *args, **kwargs): @@ -220,8 +182,8 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None, if model.endog.ndim == 2: if not (isinstance(original_endog.dtype, CategoricalDtype) and original_endog.dtype.ordered): - msg = ("Only ordered pandas Categorical are supported as endog" - " in formulas") + msg = ("Only ordered pandas Categorical are supported as " + "endog in formulas") raise ValueError(msg) labels = original_endog.values.categories From db637720f06b99c3f21721843a942cd90bda3be6 Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Sat, 19 Sep 2020 13:46:19 -0400 Subject: [PATCH 19/24] ENH: drop Intercept in `from_formula`, similar to MixedLM, fix predict design_info --- statsmodels/base/model.py | 4 +- statsmodels/miscmodels/ordinal_model.py | 17 +++- .../miscmodels/tests/test_ordinal_model.py | 82 +++++++++++++++++-- 3 files changed, 90 insertions(+), 13 deletions(-) diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py index 8d0328b471d..9ec656552c9 100644 --- 
a/statsmodels/base/model.py +++ b/statsmodels/base/model.py @@ -1060,7 +1060,9 @@ def predict(self, exog=None, transform=True, *args, **kwargs): exog_index = exog.index if is_pandas else None if transform and hasattr(self.model, 'formula') and (exog is not None): - design_info = self.model.data.design_info + # allow both location of design_info, see #7043 + design_info = (getattr(self.model, "design_info", None) or + self.model.data.design_info) from patsy import dmatrix if isinstance(exog, pd.Series): # we are guessing whether it should be column or row diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 188b121a12e..69734d9b494 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -169,15 +169,24 @@ def _initialize_labels(self, labels): def from_formula(cls, formula, data, subset=None, drop_cols=None, *args, **kwargs): - if "0+" not in formula.replace(" ", ""): - import warnings - warnings.warn("Ordinal models should not include an intercept") + # we want an explicit Intercept in the model that we can remove + # Removing constant with "0 +" or "- 1" does not work for categ. exog + # copied from PHReg + import re + terms = re.split(r"[+\-~]", formula) + for term in terms: + term = term.strip() + if term in ("0", "1"): + import warnings + msg = ("OrderedModel formulas should not include any '0' or " + "'1' terms if those create an implicit constant.") + warnings.warn(msg) endog_name = formula.split("~")[0].strip() original_endog = data[endog_name] model = super(OrderedModel, cls).from_formula( - formula, data=data, *args, **kwargs) + formula, data=data, drop_cols=["Intercept"], *args, **kwargs) if model.endog.ndim == 2: if not (isinstance(original_endog.dtype, CategoricalDtype) diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 180f6834058..0eabde972b8 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -5,6 +5,7 @@ import numpy as np import scipy.stats as stats import pytest +import pandas as pd from numpy.testing import assert_allclose, assert_equal from .results.results_ordinal_model import data_store as ds @@ -103,6 +104,16 @@ def test_results_other(self): fitted = res1.predict() assert_allclose(pred, fitted[-5:], rtol=1e-13) + pred = resp.predict(exog=resp.model.data.orig_exog.iloc[-5:]) + fitted = resp.predict() + assert_allclose(pred, fitted[-5:], rtol=1e-13) + + dataf = self.resf.model.data.frame # is a dict + dataf_df = pd.DataFrame.from_dict(dataf) + pred = self.resf.predict(exog=dataf_df.iloc[-5:]) + fitted = self.resf.predict() + assert_allclose(pred, fitted[-5:], rtol=1e-13) + class TestLogitModel(CheckOrdinalModelMixin): @@ -216,7 +227,6 @@ def test_formula_categorical(self): resp = self.resp data = ds.df - "apply ~ pared + public + gpa - 1" modf2 = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", data, distr='probit') resf2 = modf2.fit(method='bfgs', disp=False) @@ -236,6 +246,61 @@ def test_formula_categorical(self): distr='probit') +class TestLogitModelFormula(): + + @classmethod + def setup_class(cls): + data = ds.df + nobs = len(data) + data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float) + # alias to correspond to patsy name + data["C(dummy)[T.1.0]"] = data["dummy"] + cls.data = data + + columns = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa'] + # standard fit + mod = OrderedModel(data['apply'].values.codes, + 
np.asarray(data[columns], float), + distr='logit') + cls.res = mod.fit(method='bfgs', disp=False) + # standard fit with pandas input + modp = OrderedModel(data['apply'], + data[columns], + distr='logit') + cls.resp = modp.fit(method='bfgs', disp=False) + + def test_setup(self): + data = self.data + resp = self.resp + fittedvalues = resp.predict() + + formulas = ["apply ~ 1 + pared + public + gpa + C(dummy)", + "apply ~ pared + public + gpa + C(dummy)"] + for formula in formulas: + modf1 = OrderedModel.from_formula(formula, data, distr='logit') + resf1 = modf1.fit(method='bfgs') + summf1 = resf1.summary() + summf1_str = str(summf1) + assert resf1.model.exog_names == resp.model.exog_names + assert resf1.model.data.param_names == resp.model.exog_names + assert all(name in summf1_str for name in + resp.model.data.param_names) + assert_allclose(resf1.predict(data[:5]), fittedvalues[:5]) + + # test over parameterized model with implicit constant + # warns but doesn't raise + formula = "apply ~ 0 + pared + public + gpa + C(dummy)" + + with pytest.warns(UserWarning): + modf2 = OrderedModel.from_formula(formula, data, distr='logit') + + with pytest.warns(UserWarning): + resf2 = modf2.fit(method='bfgs') + assert np.isnan(resf2.bse).all() + + assert_allclose(resf2.predict(data[:5]), fittedvalues[:5], rtol=1e-4) + + class TestCLogLogModel(CheckOrdinalModelMixin): @classmethod @@ -263,13 +328,14 @@ def _cdf(self, x): distr=cloglog) resp = modp.fit(method='bfgs', disp=False) - modf = OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr=cloglog) + with pytest.warns(UserWarning): + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr=cloglog) resf = modf.fit(method='bfgs', disp=False) modu = OrderedModel( From a1fb8c2d8d5a648bde1574ce3575ee42bf2b2f2c Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Sat, 19 Sep 2020 19:42:02 -0400 Subject: [PATCH 20/24] REF: use SpecificationWarning for formula, --- statsmodels/miscmodels/ordinal_model.py | 4 +- .../miscmodels/tests/test_ordinal_model.py | 68 +++++++++++-------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index 69734d9b494..e2779f136ce 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -10,6 +10,8 @@ import pandas as pd from pandas.api.types import CategoricalDtype from scipy import stats +from statsmodels.tools.sm_exceptions import ( + SpecificationWarning) from statsmodels.base.model import ( GenericLikelihoodModel, GenericLikelihoodModelResults) from statsmodels.compat.pandas import Appender @@ -180,7 +182,7 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None, import warnings msg = ("OrderedModel formulas should not include any '0' or " "'1' terms if those create an implicit constant.") - warnings.warn(msg) + warnings.warn(msg, SpecificationWarning) endog_name = formula.split("~")[0].strip() original_endog = data[endog_name] diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 0eabde972b8..c4db0e37c7b 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -2,12 
+2,15 @@ Test for ordinal models """ +import warnings import numpy as np import scipy.stats as stats -import pytest import pandas as pd +import pytest from numpy.testing import assert_allclose, assert_equal +from statsmodels.tools.sm_exceptions import ( + HessianInversionWarning, SpecificationWarning) from .results.results_ordinal_model import data_store as ds from statsmodels.miscmodels.ordinal_model import OrderedModel @@ -133,13 +136,15 @@ def setup_class(cls): distr='logit') resp = modp.fit(method='bfgs', disp=False) # fit with formula - modf = OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='logit') + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SpecificationWarning) + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='logit') resf = modf.fit(method='bfgs', disp=False) # fit on data with ordered=False modu = OrderedModel( @@ -173,13 +178,15 @@ def setup_class(cls): distr='probit') resp = modp.fit(method='bfgs', disp=False) - modf = OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='probit') + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SpecificationWarning) + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') resf = modf.fit(method='bfgs', disp=False) modu = OrderedModel( @@ -227,8 +234,11 @@ def test_formula_categorical(self): resp = self.resp data = ds.df - modf2 = OrderedModel.from_formula("apply ~ pared + public + gpa - 1", - data, distr='probit') + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SpecificationWarning) + formula = "apply ~ pared + public + gpa - 1" + modf2 = OrderedModel.from_formula(formula, + data, distr='probit') resf2 = modf2.fit(method='bfgs', disp=False) assert_allclose(resf2.params, resp.params, atol=1e-8) assert modf2.exog_names == resp.model.exog_names @@ -237,19 +247,22 @@ def test_formula_categorical(self): assert not hasattr(modf2, "frame") with pytest.raises(ValueError): - OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": np.asarray(data['apply']), - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='probit') + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SpecificationWarning) + OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": np.asarray(data['apply']), + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') class TestLogitModelFormula(): @classmethod def setup_class(cls): + warnings.simplefilter("ignore", SpecificationWarning) data = ds.df nobs = len(data) data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float) @@ -291,12 +304,13 @@ def test_setup(self): # warns but doesn't raise formula = "apply ~ 0 + pared + public + gpa + C(dummy)" - with pytest.warns(UserWarning): + with pytest.warns(SpecificationWarning): modf2 = OrderedModel.from_formula(formula, data, distr='logit') - with pytest.warns(UserWarning): + with 
pytest.warns(HessianInversionWarning): resf2 = modf2.fit(method='bfgs') - assert np.isnan(resf2.bse).all() + assert resf2.converged is False + # assert np.isnan(resf2.bse).all() assert_allclose(resf2.predict(data[:5]), fittedvalues[:5], rtol=1e-4) From eeb1eb20044445fc7596ddbc9f192b34228b33df Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Mon, 21 Sep 2020 11:06:51 -0400 Subject: [PATCH 21/24] BUG: GenericLikelihoodModel propagate `hasconst` see #7049 --- statsmodels/base/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py index 9ec656552c9..03ae9b63349 100644 --- a/statsmodels/base/model.py +++ b/statsmodels/base/model.py @@ -789,6 +789,7 @@ def __init__(self, endog, exog=None, loglike=None, score=None, if hessian is not None: self.hessian = hessian + hasconst = kwds.pop("hasconst", None) self.__dict__.update(kwds) # TODO: data structures? @@ -797,7 +798,8 @@ def __init__(self, endog, exog=None, loglike=None, score=None, # self.df_model = 9999 # somewhere: CacheWriteWarning: 'df_model' cannot be overwritten super(GenericLikelihoodModel, self).__init__(endog, exog, - missing=missing) + missing=missing, + hasconst=hasconst) # this will not work for ru2nmnl, maybe np.ndim of a dict? if exog is not None: From 7f392b9d017d037cefb02794615c17ee8dea687f Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Mon, 21 Sep 2020 12:52:44 -0400 Subject: [PATCH 22/24] REF: raise instead of warn if k_constant > 0 --- statsmodels/miscmodels/ordinal_model.py | 16 +--- .../miscmodels/tests/test_ordinal_model.py | 90 +++++++++---------- 2 files changed, 45 insertions(+), 61 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index e2779f136ce..a92f75f069c 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -10,8 +10,7 @@ import pandas as pd from pandas.api.types import CategoricalDtype from scipy import stats -from statsmodels.tools.sm_exceptions import ( - SpecificationWarning) + from statsmodels.base.model import ( GenericLikelihoodModel, GenericLikelihoodModelResults) from statsmodels.compat.pandas import Appender @@ -107,6 +106,9 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds): # Note: Doing the following here would break from_formula # self.endog = self.endog.argmax(1) + if self.k_constant > 0: + raise ValueError("there should not be a constant in the model") + self._initialize_labels(labels) self.results_class = OrderedResults @@ -173,16 +175,6 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None, # we want an explicit Intercept in the model that we can remove # Removing constant with "0 +" or "- 1" does not work for categ. 
exog - # copied from PHReg - import re - terms = re.split(r"[+\-~]", formula) - for term in terms: - term = term.strip() - if term in ("0", "1"): - import warnings - msg = ("OrderedModel formulas should not include any '0' or " - "'1' terms if those create an implicit constant.") - warnings.warn(msg, SpecificationWarning) endog_name = formula.split("~")[0].strip() original_endog = data[endog_name] diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index c4db0e37c7b..350e3e24219 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -9,8 +9,7 @@ import pytest from numpy.testing import assert_allclose, assert_equal -from statsmodels.tools.sm_exceptions import ( - HessianInversionWarning, SpecificationWarning) +from statsmodels.tools.sm_exceptions import HessianInversionWarning from .results.results_ordinal_model import data_store as ds from statsmodels.miscmodels.ordinal_model import OrderedModel @@ -136,15 +135,13 @@ def setup_class(cls): distr='logit') resp = modp.fit(method='bfgs', disp=False) # fit with formula - with warnings.catch_warnings(): - warnings.simplefilter("ignore", SpecificationWarning) - modf = OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='logit') + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='logit') resf = modf.fit(method='bfgs', disp=False) # fit on data with ordered=False modu = OrderedModel( @@ -178,15 +175,13 @@ def setup_class(cls): distr='probit') resp = modp.fit(method='bfgs', disp=False) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", SpecificationWarning) - modf = OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='probit') + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') resf = modf.fit(method='bfgs', disp=False) modu = OrderedModel( @@ -234,11 +229,9 @@ def test_formula_categorical(self): resp = self.resp data = ds.df - with warnings.catch_warnings(): - warnings.simplefilter("ignore", SpecificationWarning) - formula = "apply ~ pared + public + gpa - 1" - modf2 = OrderedModel.from_formula(formula, - data, distr='probit') + formula = "apply ~ pared + public + gpa - 1" + modf2 = OrderedModel.from_formula(formula, + data, distr='probit') resf2 = modf2.fit(method='bfgs', disp=False) assert_allclose(resf2.params, resp.params, atol=1e-8) assert modf2.exog_names == resp.model.exog_names @@ -247,22 +240,19 @@ def test_formula_categorical(self): assert not hasattr(modf2, "frame") with pytest.raises(ValueError): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", SpecificationWarning) - OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": np.asarray(data['apply']), - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr='probit') + OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": np.asarray(data['apply']), + 
"pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') class TestLogitModelFormula(): @classmethod def setup_class(cls): - warnings.simplefilter("ignore", SpecificationWarning) data = ds.df nobs = len(data) data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float) @@ -301,16 +291,18 @@ def test_setup(self): assert_allclose(resf1.predict(data[:5]), fittedvalues[:5]) # test over parameterized model with implicit constant - # warns but doesn't raise formula = "apply ~ 0 + pared + public + gpa + C(dummy)" - with pytest.warns(SpecificationWarning): - modf2 = OrderedModel.from_formula(formula, data, distr='logit') + with pytest.raises(ValueError): + OrderedModel.from_formula(formula, data, distr='logit') - with pytest.warns(HessianInversionWarning): + # ignore constant, so we get results without exception + modf2 = OrderedModel.from_formula(formula, data, distr='logit', + hasconst=False) + # we get a warning in some environments + with warnings.catch_warnings(): + warnings.simplefilter("ignore", HessianInversionWarning) resf2 = modf2.fit(method='bfgs') - assert resf2.converged is False - # assert np.isnan(resf2.bse).all() assert_allclose(resf2.predict(data[:5]), fittedvalues[:5], rtol=1e-4) @@ -342,14 +334,14 @@ def _cdf(self, x): distr=cloglog) resp = modp.fit(method='bfgs', disp=False) - with pytest.warns(UserWarning): - modf = OrderedModel.from_formula( - "apply ~ pared + public + gpa - 1", - data={"apply": data['apply'].values.codes, - "pared": data['pared'], - "public": data['public'], - "gpa": data['gpa']}, - distr=cloglog) + # with pytest.warns(UserWarning): + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr=cloglog) resf = modf.fit(method='bfgs', disp=False) modu = OrderedModel( From 97101b395a24ef3febd705c7881a92a101ec64ce Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Mon, 21 Sep 2020 13:42:12 -0400 Subject: [PATCH 23/24] ENH/BUG: correct predict with offset --- statsmodels/miscmodels/ordinal_model.py | 27 +++++++------- .../miscmodels/tests/test_ordinal_model.py | 37 +++++++++++++++++++ 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py index a92f75f069c..df3d98b9546 100644 --- a/statsmodels/miscmodels/ordinal_model.py +++ b/statsmodels/miscmodels/ordinal_model.py @@ -245,21 +245,15 @@ def transform_reverse_threshold_params(self, params): np.log(np.diff(start_ppf[:-1])))) return thresh_params - def predict(self, params, exog=None): + def predict(self, params, exog=None, offset=None): """predicted probabilities for each level of the ordinal endog. 
""" - if exog is None: - exog = self.exog - # structure of params = [beta, constants_or_thresholds] + # note, exog and offset handling is in linpred - # explicit in several steps to avoid bugs - th_params = params[-(self.k_levels - 1):] - thresh = np.concatenate((th_params[:1], - np.exp(th_params[1:]))).cumsum() - thresh = np.concatenate(([-np.inf], thresh, [np.inf])) - xb = exog.dot(params[:-(self.k_levels - 1)])[:, None] + thresh = self.transform_threshold_params(params) + xb = self._linpred(params, exog=exog, offset=offset)[:, None] low = thresh[:-1] - xb upp = thresh[1:] - xb prob = self.prob(low, upp) @@ -272,10 +266,17 @@ def _linpred(self, params, exog=None, offset=None): """ if exog is None: exog = self.exog - if offset is None: - offset = self.offset + if offset is None: + offset = self.offset + else: + if offset is None: + offset = 0 + + if offset is not None: + offset = np.asarray(offset) + if exog is not None: - linpred = self.exog.dot(params[:-(self.k_levels - 1)]) + linpred = exog.dot(params[:-(self.k_levels - 1)]) else: # means self.exog is also None linpred = np.zeros(self.nobs) if offset is not None: diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 350e3e24219..22d4e280439 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -240,6 +240,8 @@ def test_formula_categorical(self): assert not hasattr(modf2, "frame") with pytest.raises(ValueError): + # only ordered categorical or numerical endog are allowed + # string endog raises ValueError OrderedModel.from_formula( "apply ~ pared + public + gpa - 1", data={"apply": np.asarray(data['apply']), @@ -248,6 +250,41 @@ def test_formula_categorical(self): "gpa": data['gpa']}, distr='probit') + def test_offset(self): + + resp = self.resp + data = ds.df + offset = np.ones(len(data)) + + formula = "apply ~ pared + public + gpa - 1" + modf2 = OrderedModel.from_formula(formula, data, offset=offset, + distr='probit') + resf2 = modf2.fit(method='bfgs', disp=False) + + assert_allclose(resf2.params[:3], resp.params[:3], atol=2e-4) + assert_allclose(resf2.params[3], resp.params[3] + 1, atol=2e-4) + + fitted = resp.predict() + fitted2 = resf2.predict() + assert_allclose(fitted2, fitted, atol=2e-4) + + pred_ones = resf2.predict(data[:6], offset=np.ones(6)) + assert_allclose(pred_ones, fitted[:6], atol=2e-4) + + # check default is 0. 
if exog is provided
+        pred_zero1 = resf2.predict(data[:6])
+        pred_zero2 = resf2.predict(data[:6], offset=0)
+        assert_allclose(pred_zero1, pred_zero2, atol=2e-4)
+
+        # compare with equivalent results from a no-offset model
+        pred_zero = resp.predict(data.iloc[:6, 1:], offset=-np.ones(6))
+        assert_allclose(pred_zero1, pred_zero, atol=2e-4)
+
+        params_adj = resp.params.copy()
+        params_adj[3] += 1
+        fitted_zero = resp.model.predict(params_adj)
+        assert_allclose(pred_zero1, fitted_zero[:6], atol=2e-4)
+
 
 class TestLogitModelFormula():
 

From ce0899b1984b77e56dfd51de2b75cf1c46449477 Mon Sep 17 00:00:00 2001
From: Josef Perktold
Date: Mon, 21 Sep 2020 16:17:27 -0400
Subject: [PATCH 24/24] BUG: fix df_resid, unit test compared to discrete
 Logit

---
 statsmodels/base/model.py                     |  7 +++
 statsmodels/miscmodels/ordinal_model.py       | 23 ++++++---
 .../miscmodels/tests/test_ordinal_model.py    | 49 +++++++++++++++++++
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py
index 03ae9b63349..c3a3f74b44e 100644
--- a/statsmodels/base/model.py
+++ b/statsmodels/base/model.py
@@ -2492,6 +2492,13 @@ def __init__(self, model, mlefit):
         self._cache = {}
         self.__dict__.update(mlefit.__dict__)
 
+        k_params = len(mlefit.params)
+        # checks mainly for adding new models or subclassing
+        if self.df_model + self.model.k_constant != k_params:
+            warnings.warn("df_model + k_constant differs from nparams")
+        if self.df_resid != self.nobs - k_params:
+            warnings.warn("df_resid differs from nobs - nparams")
+
     def summary(self, yname=None, xname=None, title=None, alpha=.05):
         """Summarize the Regression Results
 
diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py
index df3d98b9546..a48939c40c7 100644
--- a/statsmodels/miscmodels/ordinal_model.py
+++ b/statsmodels/miscmodels/ordinal_model.py
@@ -90,7 +90,7 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
         endog, labels, is_pandas = self._check_inputs(endog, exog)
 
         super(OrderedModel, self).__init__(endog, exog, **kwds)
-
+        k_levels = None  # initialize
         if not is_pandas:
             if self.endog.ndim == 1:
                 unique, index = np.unique(self.endog, return_inverse=True)
@@ -100,8 +100,8 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
                 if not hasattr(self, "design_info"):
                     raise ValueError("2-dim endog not supported")
                 # this branch is currently only in support of from_formula
-                # labels here are only needed to choose k_levels in initialize
-                labels = [str(i) for i in range(self.endog.shape[1])]
+                # we need to initialize k_levels correctly for df_resid
+                k_levels = self.endog.shape[1]
                 labels = []
                 # Note: Doing the following here would break from_formula
                 # self.endog = self.endog.argmax(1)
@@ -109,7 +109,12 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
         if self.k_constant > 0:
             raise ValueError("there should not be a constant in the model")
 
-        self._initialize_labels(labels)
+        self._initialize_labels(labels, k_levels=k_levels)
+
+        # adjust df
+        self.k_extra = self.k_levels - 1
+        self.df_model = self.k_vars + self.k_extra
+        self.df_resid = self.nobs - self.df_model
 
         self.results_class = OrderedResults
 
@@ -148,9 +153,12 @@ def _check_inputs(self, endog, exog):
 
         return endog, labels, is_pandas
 
-    def _initialize_labels(self, labels):
+    def _initialize_labels(self, labels, k_levels=None):
         self.labels = labels
-        self.k_levels = len(labels)
+        if k_levels is None:
+            self.k_levels = len(labels)
+        else:
+            self.k_levels = k_levels
 
         if self.exog is not None: 
self.nobs, self.k_vars = self.exog.shape @@ -364,6 +372,9 @@ def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, # use the proper result class ordmlefit = OrderedResults(self, mlefit) + # TODO: temporary, needs better fix, modelwc adds 1 by default + ordmlefit.hasconst = 0 + return ordmlefit diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py index 22d4e280439..582ba08d0ca 100644 --- a/statsmodels/miscmodels/tests/test_ordinal_model.py +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -13,6 +13,9 @@ from .results.results_ordinal_model import data_store as ds from statsmodels.miscmodels.ordinal_model import OrderedModel +from statsmodels.discrete.discrete_model import Logit +from statsmodels.tools.tools import add_constant + class CheckOrdinalModelMixin(object): @@ -116,6 +119,9 @@ def test_results_other(self): fitted = self.resf.predict() assert_allclose(pred, fitted[-5:], rtol=1e-13) + n, k = res1.model.exog.shape + assert_equal(self.resf.df_resid, n - (k + 2)) + class TestLogitModel(CheckOrdinalModelMixin): @@ -393,3 +399,46 @@ def _cdf(self, x): cls.resp = resp cls.resf = resf cls.resu = resu + + +class TestLogitBinary(): + # compare OrderedModel with discrete Logit for binary case + def test_attributes(self): + data = ds.df + + mask_drop = data['apply'] == "somewhat likely" + data2 = data.loc[~mask_drop, :] + # we need to remove the category also from the Categorical Index + data2['apply'].cat.remove_categories("somewhat likely", inplace=True) + + # standard fit with pandas input + modp = OrderedModel(data2['apply'], + data2[['pared', 'public', 'gpa']], + distr='logit') + resp = modp.fit(method='bfgs', disp=False) + + exog = add_constant(data2[['pared', 'public', 'gpa']], prepend=False) + mod_logit = Logit(data2['apply'].cat.codes, exog) + res_logit = mod_logit.fit() + + attributes = "bse df_resid llf aic bic".split() + assert_allclose(resp.params[:3], res_logit.params[:3], rtol=1e-5) + assert_allclose(resp.params[3], -res_logit.params[3], rtol=1e-5) + for attr in attributes: + assert_allclose(getattr(resp, attr), getattr(res_logit, attr), + rtol=1e-4) + + resp = modp.fit(method='bfgs', disp=False, + cov_type="hac", cov_kwds={"maxlags": 2}) + res_logit = mod_logit.fit(method='bfgs', disp=False, + cov_type="hac", cov_kwds={"maxlags": 2}) + for attr in attributes: + assert_allclose(getattr(resp, attr), getattr(res_logit, attr), + rtol=1e-4) + + resp = modp.fit(method='bfgs', disp=False, cov_type="hc1") + res_logit = mod_logit.fit(method='bfgs', disp=False, + cov_type="hc1") + for attr in attributes: + assert_allclose(getattr(resp, attr), getattr(res_logit, attr), + rtol=1e-4)
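
The example below is a minimal usage sketch, not part of the patch series above: the simulated dataset, seed, cutoff values, and coefficients are illustrative assumptions. It exercises only behavior introduced by these commits: df_model counting the k_levels - 1 threshold parameters (PATCH 24), and predict forwarding exog and offset to _linpred with offset defaulting to 0 when exog is passed (PATCH 23).

import numpy as np

from statsmodels.miscmodels.ordinal_model import OrderedModel

# illustrative simulated data: 2 regressors, 3 ordered levels
rng = np.random.default_rng(12345)
nobs = 500
exog = rng.normal(size=(nobs, 2))
y_latent = exog @ np.array([1.0, -0.5]) + rng.logistic(size=nobs)
endog = np.searchsorted([-0.5, 1.0], y_latent)  # level codes 0, 1, 2

mod = OrderedModel(endog, exog, distr='logit')
res = mod.fit(method='bfgs', disp=False)

# PATCH 24: df_model includes the k_levels - 1 threshold parameters,
# so df_resid = nobs - (k_vars + k_levels - 1)
assert res.df_resid == nobs - (mod.k_vars + mod.k_levels - 1)

# PATCH 23: an offset enters the linear prediction; predict forwards
# exog and offset to _linpred
mod_off = OrderedModel(endog, exog, offset=np.ones(nobs), distr='logit')
res_off = mod_off.fit(method='bfgs', disp=False)

# when exog is provided and offset is omitted, the offset defaults to 0
p_default = res_off.predict(exog[:5])
p_zero = res_off.predict(exog[:5], offset=np.zeros(5))
assert np.allclose(p_default, p_zero)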