diff --git a/examples/notebooks/ordinal_regression.ipynb b/examples/notebooks/ordinal_regression.ipynb new file mode 100644 index 00000000000..818b93080aa --- /dev/null +++ b/examples/notebooks/ordinal_regression.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ordinal Regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", + "\n", + "from statsmodels.miscmodels.ordinal_model import OrderedModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading a Stata data file from the UCLA website. This notebook is inspired by https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/, which is an R notebook from UCLA." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://stats.idre.ucla.edu/stat/data/ologit.dta\"\n", + "data_student = pd.read_stata(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_student.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_student.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_student['apply'].dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This dataset is about the probability for undergraduate students to apply to graduate school, given three exogenous variables:\n", + "- their grade point average (`gpa`), a float between 0 and 4.\n", + "- `pared`, a binary that indicates if at least one parent went to graduate school.\n", + "- and `public`, a binary that indicates if the current undergraduate institution of the student is public or private.\n", + "\n", + "`apply`, the target variable, is categorical with ordered categories: `unlikely` < `somewhat likely` < `very likely`. It is a `pd.Series` of categorical type; this is preferred over NumPy arrays."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The model is based on a numerical latent variable $y_{latent}$ that we cannot observe, but that we can model through the exogenous variables.\n", + "We then use this $y_{latent}$ to define the $y$ that we do observe.\n", + "\n", + "For more details see the documentation of OrderedModel, [the UCLA webpage](https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/) or this [book](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470594001).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Probit ordinal regression:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod_prob = OrderedModel(data_student['apply'],\n", + " data_student[['pared', 'public', 'gpa']],\n", + " distr='probit')\n", + "\n", + "res_prob = mod_prob.fit(method='bfgs')\n", + "res_prob.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our model, we have 3 exogenous variables (the $\\beta$s, if we keep the documentation's notation), so we have 3 coefficients that need to be estimated.\n", + "\n", + "Those 3 estimates and their standard errors can be retrieved in the summary table.\n", + "\n", + "Since there are 3 categories in the target variable (`unlikely`, `somewhat likely`, `very likely`), we have two thresholds to estimate. \n", + "As explained in the documentation of the method `OrderedModel.transform_threshold_params`, the first estimated threshold is the actual value, and all the other thresholds are expressed as cumulative exponentiated increments. The actual threshold values can be computed as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_of_thresholds = 2\n", + "mod_prob.transform_threshold_params(res_prob.params[-num_of_thresholds:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logit ordinal regression:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod_log = OrderedModel(data_student['apply'],\n", + " data_student[['pared', 'public', 'gpa']],\n", + " distr='logit')\n", + "\n", + "res_log = mod_log.fit(method='bfgs', disp=False)\n", + "res_log.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predicted = res_log.model.predict(res_log.params, exog=data_student[['pared', 'public', 'gpa']])\n", + "predicted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pred_choice = predicted.argmax(1)\n", + "print('Fraction of correct choice predictions')\n", + "print((np.asarray(data_student['apply'].values.codes) == pred_choice).mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ordinal regression with a custom cumulative cLogLog distribution:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to `logit` and `probit` regression, any continuous distribution from the `scipy.stats` package can be used for the `distr` argument. Alternatively, one can define one's own distribution simply by creating a subclass of `rv_continuous` and implementing a few methods."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# using a SciPy distribution\n", + "res_exp = OrderedModel(data_student['apply'],\n", + " data_student[['pared', 'public', 'gpa']],\n", + " distr=stats.expon).fit(method='bfgs', disp=False)\n", + "res_exp.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# minimal definition of a custom scipy distribution.\n", + "class CLogLog(stats.rv_continuous):\n", + " def _ppf(self, q):\n", + " return np.log(-np.log(1 - q))\n", + "\n", + " def _cdf(self, x):\n", + " return 1 - np.exp(-np.exp(x))\n", + "\n", + "\n", + "cloglog = CLogLog()\n", + "\n", + "# definition of the model and fitting\n", + "res_cloglog = OrderedModel(data_student['apply'],\n", + " data_student[['pared', 'public', 'gpa']],\n", + " distr=cloglog).fit(method='bfgs', disp=False)\n", + "res_cloglog.summary()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py index 8d0328b471d..c3a3f74b44e 100644 --- a/statsmodels/base/model.py +++ b/statsmodels/base/model.py @@ -789,6 +789,7 @@ def __init__(self, endog, exog=None, loglike=None, score=None, if hessian is not None: self.hessian = hessian + hasconst = kwds.pop("hasconst", None) self.__dict__.update(kwds) # TODO: data structures? @@ -797,7 +798,8 @@ def __init__(self, endog, exog=None, loglike=None, score=None, # self.df_model = 9999 # somewhere: CacheWriteWarning: 'df_model' cannot be overwritten super(GenericLikelihoodModel, self).__init__(endog, exog, - missing=missing) + missing=missing, + hasconst=hasconst) # this will not work for ru2nmnl, maybe np.ndim of a dict? 
if exog is not None: @@ -1060,7 +1062,9 @@ def predict(self, exog=None, transform=True, *args, **kwargs): exog_index = exog.index if is_pandas else None if transform and hasattr(self.model, 'formula') and (exog is not None): - design_info = self.model.data.design_info + # allow both location of design_info, see #7043 + design_info = (getattr(self.model, "design_info", None) or + self.model.data.design_info) from patsy import dmatrix if isinstance(exog, pd.Series): # we are guessing whether it should be column or row @@ -2488,6 +2492,13 @@ def __init__(self, model, mlefit): self._cache = {} self.__dict__.update(mlefit.__dict__) + k_params = len(mlefit.params) + # checks mainly for adding new models or subclassing + if self.df_model + self.model.k_constant != k_params: + warnings.warn("df_model + k_constant differs from nparams") + if self.df_resid != self.nobs - k_params: + warnings.warn("df_resid differs from nobs - nparams") + def summary(self, yname=None, xname=None, title=None, alpha=.05): """Summarize the Regression Results diff --git a/statsmodels/examples/ex_ordered_model.py b/statsmodels/examples/ex_ordered_model.py new file mode 100644 index 00000000000..a03f25a48c8 --- /dev/null +++ b/statsmodels/examples/ex_ordered_model.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Aug 24 11:17:06 2015 + +Author: Josef Perktold +License: BSD-3 +""" + +import numpy as np +from scipy import stats +import pandas + +from statsmodels.miscmodels.ordinal_model import OrderedModel + +nobs, k_vars = 1000, 3 +x = np.random.randn(nobs, k_vars) +# x = np.column_stack((np.ones(nobs), x)) +# #constant will be in integration limits +xb = x.dot(np.ones(k_vars)) +y_latent = xb + np.random.randn(nobs) +y = np.round(np.clip(y_latent, -2.4, 2.4)).astype(int) + 2 + +print(np.unique(y)) +print(np.bincount(y)) + +mod = OrderedModel(y, x) +# start_params = np.ones(k_vars + 4) +# start_params = np.concatenate((np.ones(k_vars), np.arange(4))) +start_ppf = stats.norm.ppf((np.bincount(y) / len(y)).cumsum()) +start_threshold = np.concatenate((start_ppf[:1], + np.log(np.diff(start_ppf[:-1])))) +start_params = np.concatenate((np.zeros(k_vars), start_threshold)) +res = mod.fit(start_params=start_params, maxiter=5000, maxfun=5000) +print(res.params) +# res = mod.fit(start_params=res.params, method='bfgs') +res = mod.fit(start_params=start_params, method='bfgs') + +print(res.params) +print(np.exp(res.params[-(mod.k_levels - 1):]).cumsum()) +# print(res.summary()) + +predicted = res.model.predict(res.params) +pred_choice = predicted.argmax(1) +print('Fraction of correct choice predictions') +print((y == pred_choice).mean()) + +print('\ncomparing bincount') +print(np.bincount(res.model.predict(res.params).argmax(1))) +print(np.bincount(res.model.endog)) + +res_log = OrderedModel(y, x, distr='logit').fit(method='bfgs') +pred_choice_log = res_log.predict().argmax(1) +print((y == pred_choice_log).mean()) +print(res_log.summary()) + +# example form UCLA Stats pages +# http://www.ats.ucla.edu/stat/stata/dae/ologit.htm +# requires downloaded dataset ologit.dta + +dataf = pandas.read_stata(r"M:\josef_new\scripts\ologit_ucla.dta") + +# this works but sorts category levels alphabetically +res_log2 = OrderedModel(np.asarray(dataf['apply']), + np.asarray(dataf[['pared', 'public', 'gpa']], float), + distr='logit').fit(method='bfgs') + +# this replicates the UCLA example except +# for different parameterization of par2 +res_log3 = OrderedModel(dataf['apply'].values.codes, + np.asarray(dataf[['pared', 'public', 'gpa']], 
float), + distr='logit').fit(method='bfgs') + +print(res_log3.summary()) + +# with ordered probit - not on UCLA page +print( + OrderedModel(dataf['apply'].values.codes, + np.asarray(dataf[['pared', 'public', 'gpa']], float), + distr='probit').fit(method='bfgs').summary()) + + +# example with a custom distribution - not on UCLA page +# definition of the SciPy dist +class CLogLog(stats.rv_continuous): + def _ppf(self, q): + return np.log(-np.log(1 - q)) + + def _cdf(self, x): + return 1 - np.exp(-np.exp(x)) + + +cloglog = CLogLog() + +res_cloglog = OrderedModel(dataf['apply'], + dataf[['pared', 'public', 'gpa']], + distr=cloglog).fit(method='bfgs', disp=False) +print(res_cloglog.summary()) diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py new file mode 100644 index 00000000000..a48939c40c7 --- /dev/null +++ b/statsmodels/miscmodels/ordinal_model.py @@ -0,0 +1,396 @@ +# -*- coding: utf-8 -*- +""" +Created on Sat Aug 22 20:24:42 2015 + +Author: Josef Perktold +License: BSD-3 +""" + +import warnings +import numpy as np +import pandas as pd +from pandas.api.types import CategoricalDtype +from scipy import stats + +from statsmodels.base.model import ( + GenericLikelihoodModel, GenericLikelihoodModelResults) +from statsmodels.compat.pandas import Appender + + +class OrderedModel(GenericLikelihoodModel): + """Ordinal Model based on logistic or normal distribution + + The parameterization corresponds to the proportional odds model. + + The model assumes that the endogenous variable is ordered but that the + labels have no numeric interpretation besides the ordering. + + The model is based on a latent linear variable, where we observe only a + discretization. + + y_latent = X beta + u + + The observed variable is defined by the interval + + y = {0 if y_latent <= cut_0 + 1 if cut_0 < y_latent <= cut_1 + ... + K if cut_K < y_latent + + The probability of observing y=k conditional on the explanatory variables + X is given by + + prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1) + = Prob(cut_k - x beta < u <= cut_k+1 - x beta) + = F(cut_k+1 - x beta) - F(cut_k - x beta) + + where F is the cumulative distribution of u, which is either the normal + or the logistic distribution, but can be set to any other continuous + distribution. We use standardized distributions to avoid identifiability + problems. + + Parameters + ---------- + endog : array_like + Endogenous or dependent ordered categorical variable with k levels. + Labels or values of endog will be internally transformed to + consecutive integers, 0, 1, 2, ... + pd.Series with Categorical as dtype should be preferred as it gives + the order relation between the levels. + If endog is not a pandas Categorical, then categories are + sorted in lexicographic order (by numpy.unique). + exog : array_like + Exogenous, explanatory variables. This should not include an + intercept. pd.DataFrame is also accepted. + distr : string 'probit' or 'logit', or a distribution instance + The default is currently 'probit' which uses the normal distribution + and corresponds to an ordered Probit model. The distribution is + assumed to have the main methods of scipy.stats distributions, mainly + cdf, pdf and ppf. The inverse cdf, ppf, is only used to calculate + starting values.
+ + Status: initial version, subclasses `GenericLikelihoodModel` + + """ + _formula_max_endog = np.inf + + def __init__(self, endog, exog, offset=None, distr='probit', **kwds): + + if distr == 'probit': + self.distr = stats.norm + elif distr == 'logit': + self.distr = stats.logistic + else: + self.distr = distr + + if offset is not None: + offset = np.asarray(offset) + + self.offset = offset + + endog, labels, is_pandas = self._check_inputs(endog, exog) + + super(OrderedModel, self).__init__(endog, exog, **kwds) + k_levels = None # initialize + if not is_pandas: + if self.endog.ndim == 1: + unique, index = np.unique(self.endog, return_inverse=True) + self.endog = index + labels = unique + elif self.endog.ndim == 2: + if not hasattr(self, "design_info"): + raise ValueError("2-dim endog not supported") + # this branch is currently only in support of from_formula + # we need to initialize k_levels correctly for df_resid + k_levels = self.endog.shape[1] + labels = [] + # Note: Doing the following here would break from_formula + # self.endog = self.endog.argmax(1) + + if self.k_constant > 0: + raise ValueError("there should not be a constant in the model") + + self._initialize_labels(labels, k_levels=k_levels) + + # adjust df + self.k_extra = self.k_levels - 1 + self.df_model = self.k_vars + self.k_extra + self.df_resid = self.nobs - self.df_model + + self.results_class = OrderedResults + + def _check_inputs(self, endog, exog): + """handle endog that is pandas Categorical + + checks if self.distr is legal and handles the pandas Categorical + support for endog. + """ + + if not isinstance(self.distr, stats.rv_continuous): + msg = ( + f"{self.distr.name} is not a scipy.stats distribution." + ) + warnings.warn(msg) + + labels = None + is_pandas = False + if isinstance(endog, pd.Series): + if isinstance(endog.dtypes, CategoricalDtype): + if not endog.dtype.ordered: + warnings.warn("the endog has ordered == False, " + "risk of capturing a wrong order for the " + "categories. ordered == True preferred.", + Warning) + + endog_name = endog.name + labels = endog.values.categories + endog = endog.cat.codes + if endog.min() == -1: # means there is a missing value + raise ValueError("missing values in categorical endog are " + "not supported") + endog.name = endog_name + is_pandas = True + + return endog, labels, is_pandas + + def _initialize_labels(self, labels, k_levels=None): + self.labels = labels + if k_levels is None: + self.k_levels = len(labels) + else: + self.k_levels = k_levels + + if self.exog is not None: + self.nobs, self.k_vars = self.exog.shape + else: # no exog in model + self.nobs, self.k_vars = self.endog.shape[0], 0 + + threshold_names = [str(x) + '/' + str(y) + for x, y in zip(labels[:-1], labels[1:])] + + # from GenericLikelihoodModel.fit + if self.exog is not None: + # avoid extending several times + if len(self.exog_names) > self.k_vars: + raise RuntimeError("something wrong with exog_names, too long") + self.exog_names.extend(threshold_names) + else: + self.data.xnames = threshold_names + + @classmethod + def from_formula(cls, formula, data, subset=None, drop_cols=None, + *args, **kwargs): + + # we want an explicit Intercept in the model that we can remove + # Removing constant with "0 +" or "- 1" does not work for categ.
exog + + endog_name = formula.split("~")[0].strip() + original_endog = data[endog_name] + + model = super(OrderedModel, cls).from_formula( + formula, data=data, drop_cols=["Intercept"], *args, **kwargs) + + if model.endog.ndim == 2: + if not (isinstance(original_endog.dtype, CategoricalDtype) + and original_endog.dtype.ordered): + msg = ("Only ordered pandas Categorical are supported as " + "endog in formulas") + raise ValueError(msg) + + labels = original_endog.values.categories + model._initialize_labels(labels) + model.endog = model.endog.argmax(1) + model.data.ynames = endog_name + + return model + + def cdf(self, x): + """cdf evaluated at x + """ + return self.distr.cdf(x) + + def pdf(self, x): + """pdf evaluated at x + """ + return self.distr.pdf(x) + + def prob(self, low, upp): + """interval probability + """ + return np.maximum(self.cdf(upp) - self.cdf(low), 0) + + def transform_threshold_params(self, params): + """transformation of the parameters in the optimization + + Parameters + ---------- + params : nd_array + contains (exog_coef, transformed_thresholds) where exog_coef are + the coefficient for the explanatory variables in the linear term, + transformed threshold or cutoff points. The first, lowest threshold + is unchanged, all other thresholds are in terms of exponentiated + increments + + Returns + ------- + thresh : nd_array + thresh are the thresholds or cutoff constants for the intervals. + + """ + th_params = params[-(self.k_levels - 1):] + thresh = np.concatenate((th_params[:1], + np.exp(th_params[1:]))).cumsum() + thresh = np.concatenate(([-np.inf], thresh, [np.inf])) + return thresh + + def transform_reverse_threshold_params(self, params): + """obtain transformed thresholds from original thresholds, cutoff + constants. + + """ + start_ppf = params + thresh_params = np.concatenate((start_ppf[:1], + np.log(np.diff(start_ppf[:-1])))) + return thresh_params + + def predict(self, params, exog=None, offset=None): + """predicted probabilities for each level of the ordinal endog. 
+ + + """ + # note, exog and offset handling is in linpred + + thresh = self.transform_threshold_params(params) + xb = self._linpred(params, exog=exog, offset=offset)[:, None] + low = thresh[:-1] - xb + upp = thresh[1:] - xb + prob = self.prob(low, upp) + return prob + + def _linpred(self, params, exog=None, offset=None): + """linear prediction of latent variable `x b` + + currently only for exog from estimation sample (in-sample) + """ + if exog is None: + exog = self.exog + if offset is None: + offset = self.offset + else: + if offset is None: + offset = 0 + + if offset is not None: + offset = np.asarray(offset) + + if exog is not None: + linpred = exog.dot(params[:-(self.k_levels - 1)]) + else: # means self.exog is also None + linpred = np.zeros(self.nobs) + if offset is not None: + linpred += offset + return linpred + + def _bounds(self, params): + thresh = self.transform_threshold_params(params) + + thresh_i_low = thresh[self.endog] + thresh_i_upp = thresh[self.endog + 1] + xb = self._linpred(params) + low = thresh_i_low - xb + upp = thresh_i_upp - xb + return low, upp + + def loglike(self, params): + + thresh = self.transform_threshold_params(params) + + thresh_i_low = thresh[self.endog] + thresh_i_upp = thresh[self.endog + 1] + xb = self._linpred(params) + low = thresh_i_low - xb + upp = thresh_i_upp - xb + prob = self.prob(low, upp) + return np.log(prob + 1e-20).sum() + + def loglikeobs(self, params): + + low, upp = self._bounds(params) + prob = self.prob(low, upp) + return np.log(prob + 1e-20) + + def score_obs_(self, params): + """score, first derivative of loglike for each observations + + This currently only implements the derivative with respect to the + exog parameters, but not with respect to threshold parameters. + + """ + low, upp = self._bounds(params) + + prob = self.prob(low, upp) + pdf_upp = self.pdf(upp) + pdf_low = self.pdf(low) + + # TODO the following doesn't work yet because of the incremental exp + # parameterization. The following was written base on Greene for the + # simple non-incremental parameterization. 
+ # k = self.k_levels - 1 + # idx = self.endog + # score_factor = np.zeros((self.nobs, k + 1 + 2)) #+2 avoids idx bounds + # + # rows = np.arange(self.nobs) + # shift = 1 + # score_factor[rows, shift + idx-1] = -pdf_low + # score_factor[rows, shift + idx] = pdf_upp + # score_factor[:, 0] = pdf_upp - pdf_low + score_factor = (pdf_upp - pdf_low)[:, None] + score_factor /= prob[:, None] + + so = np.column_stack((-score_factor[:, :1] * self.exog, + score_factor[:, 1:])) + return so + + @property + def start_params(self): + # start params based on model without exog + freq = np.bincount(self.endog) / len(self.endog) + start_ppf = self.distr.ppf(np.clip(freq.cumsum(), 0, 1)) + start_threshold = self.transform_reverse_threshold_params(start_ppf) + start_params = np.concatenate((np.zeros(self.k_vars), start_threshold)) + return start_params + + @Appender(GenericLikelihoodModel.fit.__doc__) + def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, + disp=1, callback=None, retall=0, **kwargs): + + fit_method = super(OrderedModel, self).fit + mlefit = fit_method(start_params=start_params, + method=method, maxiter=maxiter, + full_output=full_output, + disp=disp, callback=callback, **kwargs) + # use the proper result class + ordmlefit = OrderedResults(self, mlefit) + + # TODO: temporary, needs better fix, modelwc adds 1 by default + ordmlefit.hasconst = 0 + + return ordmlefit + + +class OrderedResults(GenericLikelihoodModelResults): + + def pred_table(self): + """prediction table + + returns pandas DataFrame + + """ + # todo: add category labels + categories = np.arange(self.model.k_levels) + observed = pd.Categorical(self.model.endog, + categories=categories, ordered=True) + predicted = pd.Categorical(self.predict().argmax(1), + categories=categories, ordered=True) + table = pd.crosstab(observed, predicted, margins=True, dropna=False) + return table diff --git a/statsmodels/miscmodels/tests/results/__init__.py b/statsmodels/miscmodels/tests/results/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/statsmodels/miscmodels/tests/results/ologit_ucla.csv b/statsmodels/miscmodels/tests/results/ologit_ucla.csv new file mode 100644 index 00000000000..0c77e0ea8b5 --- /dev/null +++ b/statsmodels/miscmodels/tests/results/ologit_ucla.csv @@ -0,0 +1,401 @@ +apply,pared,public,gpa +very likely,0,0,3.26 +somewhat likely,1,0,3.21 +unlikely,1,1,3.94 +somewhat likely,0,0,2.81 +somewhat likely,0,0,2.53 +unlikely,0,1,2.59 +somewhat likely,0,0,2.56 +somewhat likely,0,0,2.73 +unlikely,0,0,3.0 +somewhat likely,1,0,3.5 +unlikely,1,1,3.65 +somewhat likely,0,0,2.84 +very likely,0,1,3.9 +somewhat likely,0,0,2.68 +unlikely,1,0,3.57 +unlikely,0,0,3.09 +unlikely,0,1,3.5 +unlikely,0,0,2.17 +very likely,0,1,3.36 +somewhat likely,0,0,3.4 +very likely,0,0,2.75 +somewhat likely,1,0,3.2 +unlikely,0,0,2.44 +unlikely,0,0,2.83 +unlikely,0,1,3.0 +somewhat likely,0,1,3.27 +somewhat likely,1,0,3.14 +somewhat likely,0,0,3.37 +very likely,0,1,2.79 +unlikely,0,0,2.9 +somewhat likely,0,0,3.38 +unlikely,0,1,2.95 +unlikely,0,0,2.98 +unlikely,1,1,3.81 +unlikely,0,0,2.74 +unlikely,0,0,2.62 +unlikely,0,0,2.85 +somewhat likely,0,0,2.5 +somewhat likely,0,0,2.75 +unlikely,0,0,2.26 +unlikely,0,0,2.03 +somewhat likely,1,0,2.85 +unlikely,0,0,2.72 +somewhat likely,0,0,2.89 +somewhat likely,1,0,2.47 +unlikely,0,0,3.04 +unlikely,0,0,3.1 +unlikely,0,0,2.57 +unlikely,0,0,2.09 +somewhat likely,0,0,2.94 +unlikely,0,0,3.45 +unlikely,0,0,2.76 +very likely,0,1,2.96 +unlikely,0,0,2.89 +somewhat likely,0,0,2.97 +somewhat 
likely,0,1,3.91 +somewhat likely,0,0,2.77 +unlikely,0,0,2.51 +unlikely,0,0,3.24 +unlikely,0,0,2.44 +somewhat likely,0,0,2.78 +unlikely,0,0,2.94 +unlikely,1,0,3.22 +unlikely,0,0,3.5 +very likely,1,0,3.57 +somewhat likely,0,0,3.17 +unlikely,0,1,3.23 +unlikely,0,0,2.91 +unlikely,0,0,3.28 +unlikely,0,1,3.32 +unlikely,0,0,3.62 +unlikely,0,0,2.55 +unlikely,0,0,2.97 +very likely,1,0,3.63 +unlikely,0,1,3.02 +unlikely,0,1,3.6 +unlikely,0,0,2.95 +very likely,1,1,3.81 +somewhat likely,1,0,2.68 +unlikely,1,0,3.72 +unlikely,0,0,2.49 +unlikely,0,0,2.72 +unlikely,1,0,2.25 +unlikely,0,0,2.53 +unlikely,0,0,3.26 +very likely,0,0,2.57 +unlikely,0,0,2.9 +unlikely,0,0,3.05 +somewhat likely,0,0,2.85 +somewhat likely,0,0,4.0 +very likely,1,0,3.4 +somewhat likely,0,1,3.24 +unlikely,0,1,3.51 +very likely,0,0,2.55 +unlikely,0,0,3.06 +very likely,0,0,3.0 +somewhat likely,0,0,3.13 +unlikely,0,0,3.16 +somewhat likely,0,0,2.47 +somewhat likely,0,0,2.89 +unlikely,0,0,2.74 +unlikely,0,0,3.16 +unlikely,0,0,2.92 +somewhat likely,0,0,3.57 +somewhat likely,1,0,3.49 +somewhat likely,0,1,2.56 +somewhat likely,1,0,3.12 +somewhat likely,1,0,2.65 +somewhat likely,0,0,2.51 +somewhat likely,0,0,3.08 +somewhat likely,0,0,2.65 +unlikely,0,0,2.99 +unlikely,0,0,2.63 +somewhat likely,0,0,2.8 +somewhat likely,0,1,3.46 +unlikely,0,0,3.61 +unlikely,0,0,3.08 +unlikely,1,0,3.28 +unlikely,0,1,3.31 +unlikely,0,0,2.57 +somewhat likely,0,0,3.39 +unlikely,0,0,2.26 +unlikely,0,0,2.42 +unlikely,0,0,2.6 +somewhat likely,1,1,3.44 +somewhat likely,0,0,2.98 +somewhat likely,1,0,3.09 +unlikely,0,1,3.34 +unlikely,0,0,2.84 +unlikely,0,0,2.81 +unlikely,0,0,2.79 +unlikely,0,0,2.4 +unlikely,0,0,3.02 +somewhat likely,0,0,2.65 +somewhat likely,0,0,2.55 +somewhat likely,1,0,3.12 +unlikely,0,0,2.98 +very likely,1,0,3.61 +unlikely,0,0,2.98 +unlikely,0,0,3.19 +unlikely,0,0,3.51 +unlikely,0,0,3.1 +unlikely,0,0,3.55 +unlikely,0,0,2.98 +very likely,0,0,2.99 +unlikely,0,0,3.05 +unlikely,0,0,2.99 +unlikely,0,0,2.32 +unlikely,0,0,2.5 +unlikely,0,1,2.9 +very likely,1,1,3.28 +somewhat likely,0,0,2.95 +somewhat likely,0,0,3.54 +somewhat likely,0,0,3.11 +unlikely,1,0,3.25 +unlikely,0,0,2.44 +unlikely,0,0,2.13 +somewhat likely,0,0,3.22 +somewhat likely,0,0,3.16 +unlikely,0,0,3.39 +somewhat likely,0,0,2.7 +somewhat likely,0,0,3.09 +somewhat likely,0,0,3.16 +unlikely,0,0,2.28 +unlikely,0,0,2.91 +unlikely,0,0,3.65 +unlikely,0,0,2.86 +unlikely,0,1,3.39 +unlikely,0,0,3.71 +unlikely,0,0,3.25 +unlikely,0,0,3.14 +unlikely,0,0,2.41 +somewhat likely,0,0,3.08 +unlikely,0,1,3.02 +somewhat likely,0,0,3.15 +very likely,0,0,2.95 +unlikely,0,0,2.22 +somewhat likely,0,0,2.86 +somewhat likely,1,0,2.88 +very likely,0,0,2.62 +unlikely,0,0,3.37 +somewhat likely,0,0,3.51 +somewhat likely,0,0,3.65 +very likely,1,0,3.42 +very likely,0,0,2.41 +very likely,0,1,3.21 +unlikely,0,0,3.22 +unlikely,0,0,2.53 +somewhat likely,0,0,2.64 +unlikely,0,0,2.94 +somewhat likely,0,0,2.56 +unlikely,0,1,3.12 +somewhat likely,0,0,3.34 +unlikely,0,0,3.22 +somewhat likely,0,0,3.05 +somewhat likely,0,0,3.29 +somewhat likely,0,0,2.71 +unlikely,0,0,2.87 +unlikely,0,0,3.29 +unlikely,0,0,3.36 +unlikely,1,1,2.85 +unlikely,0,0,2.79 +somewhat likely,0,0,3.69 +very likely,0,0,3.56 +unlikely,0,0,3.52 +unlikely,1,1,3.38 +very likely,0,1,3.11 +somewhat likely,1,0,3.2 +unlikely,0,0,2.83 +unlikely,0,0,3.08 +very likely,0,1,2.97 +unlikely,0,0,2.64 +unlikely,0,0,2.47 +very likely,0,0,3.07 +somewhat likely,0,0,3.2 +unlikely,0,0,2.55 +unlikely,0,0,2.48 +unlikely,0,0,3.29 +somewhat likely,0,0,2.64 +somewhat likely,0,0,3.22 +somewhat 
likely,1,0,2.8 +somewhat likely,0,0,3.62 +unlikely,0,0,2.62 +somewhat likely,0,1,3.04 +unlikely,0,1,2.49 +unlikely,0,1,3.1 +very likely,1,0,3.15 +somewhat likely,0,0,2.65 +unlikely,0,0,3.04 +unlikely,0,1,3.05 +unlikely,0,0,2.88 +somewhat likely,0,0,2.86 +somewhat likely,0,0,3.0 +unlikely,0,0,2.23 +unlikely,0,1,3.14 +unlikely,0,0,2.67 +unlikely,0,0,3.11 +somewhat likely,0,0,3.73 +somewhat likely,0,0,2.45 +unlikely,0,0,3.04 +unlikely,0,0,2.4 +somewhat likely,0,0,3.43 +somewhat likely,0,0,2.57 +unlikely,0,0,2.84 +unlikely,0,0,2.67 +very likely,1,1,3.45 +somewhat likely,1,0,2.88 +somewhat likely,0,0,2.78 +unlikely,0,0,3.22 +unlikely,1,0,3.3 +somewhat likely,1,0,2.86 +somewhat likely,0,0,2.83 +somewhat likely,0,0,3.7 +somewhat likely,0,0,3.15 +somewhat likely,1,1,3.08 +unlikely,0,0,2.92 +unlikely,0,0,2.89 +very likely,1,0,3.53 +unlikely,0,0,3.1 +somewhat likely,1,0,3.36 +somewhat likely,0,0,2.74 +somewhat likely,0,0,2.73 +unlikely,1,0,2.72 +unlikely,0,0,3.53 +unlikely,1,0,3.45 +unlikely,0,0,2.98 +somewhat likely,0,0,2.88 +very likely,1,0,3.21 +unlikely,0,0,3.06 +unlikely,0,0,2.4 +somewhat likely,0,0,3.57 +unlikely,1,0,2.88 +unlikely,0,0,3.27 +unlikely,0,0,2.92 +very likely,0,0,2.7 +somewhat likely,1,0,2.53 +somewhat likely,0,0,3.38 +unlikely,0,0,3.16 +somewhat likely,0,0,2.6 +unlikely,0,0,2.76 +unlikely,0,0,3.56 +unlikely,0,0,2.87 +somewhat likely,1,0,2.99 +unlikely,0,0,2.68 +unlikely,0,0,2.99 +somewhat likely,0,0,2.96 +unlikely,1,1,2.77 +somewhat likely,0,0,3.28 +unlikely,0,1,2.74 +unlikely,0,0,1.9 +unlikely,0,0,3.05 +somewhat likely,0,1,2.44 +somewhat likely,0,0,3.33 +somewhat likely,0,1,3.56 +unlikely,0,0,2.15 +unlikely,0,0,2.83 +somewhat likely,0,0,2.79 +somewhat likely,0,0,2.72 +very likely,1,0,3.04 +unlikely,0,1,3.03 +unlikely,0,0,2.88 +somewhat likely,0,0,3.21 +unlikely,0,0,2.74 +unlikely,0,0,3.57 +somewhat likely,0,0,2.43 +unlikely,0,0,3.11 +unlikely,0,0,2.76 +somewhat likely,0,0,3.13 +unlikely,0,0,2.75 +unlikely,0,0,2.6 +unlikely,0,0,3.67 +unlikely,0,0,3.26 +unlikely,0,0,3.41 +very likely,0,0,3.24 +unlikely,0,0,3.05 +somewhat likely,1,0,2.38 +somewhat likely,0,0,2.61 +somewhat likely,1,1,3.18 +unlikely,0,0,3.34 +somewhat likely,0,0,2.84 +unlikely,0,0,2.8 +unlikely,0,0,3.21 +unlikely,0,0,2.63 +unlikely,0,0,2.31 +unlikely,0,0,2.29 +unlikely,0,0,3.51 +somewhat likely,0,0,2.87 +unlikely,0,0,3.09 +somewhat likely,1,0,2.88 +unlikely,1,0,2.47 +somewhat likely,0,1,3.72 +unlikely,0,0,3.2 +somewhat likely,0,0,2.57 +unlikely,0,0,3.05 +unlikely,0,0,3.27 +unlikely,0,0,3.3 +unlikely,0,0,2.73 +unlikely,1,0,2.63 +unlikely,0,0,2.77 +unlikely,0,0,3.15 +unlikely,0,0,2.85 +unlikely,0,0,2.25 +somewhat likely,0,0,3.56 +unlikely,0,0,2.7 +very likely,0,0,2.94 +somewhat likely,1,0,2.58 +somewhat likely,0,0,2.97 +very likely,0,0,3.38 +very likely,0,0,2.96 +unlikely,0,0,2.44 +somewhat likely,0,1,3.75 +somewhat likely,0,0,3.07 +somewhat likely,0,0,2.72 +unlikely,0,0,3.13 +unlikely,0,0,3.39 +unlikely,0,1,2.38 +somewhat likely,0,0,2.87 +unlikely,0,1,2.91 +very likely,1,0,3.25 +somewhat likely,1,1,3.51 +unlikely,1,0,3.65 +somewhat likely,0,0,3.11 +very likely,0,0,2.71 +unlikely,0,0,3.05 +unlikely,0,0,2.93 +somewhat likely,0,0,2.35 +somewhat likely,0,0,2.49 +unlikely,0,0,3.26 +unlikely,0,0,3.77 +unlikely,0,1,3.49 +unlikely,0,0,3.39 +unlikely,0,0,3.37 +unlikely,0,0,1.98 +unlikely,1,0,2.93 +unlikely,0,0,3.36 +unlikely,0,0,3.12 +somewhat likely,0,0,3.23 +somewhat likely,0,1,3.9 +unlikely,0,0,2.7 +unlikely,0,0,2.34 +unlikely,0,0,3.3 +somewhat likely,0,0,3.11 +unlikely,0,0,3.14 +somewhat likely,1,0,3.68 +somewhat 
likely,0,0,2.22 +unlikely,0,0,2.61 +somewhat likely,0,1,3.48 +very likely,0,0,3.08 +unlikely,0,0,2.79 +unlikely,0,0,3.12 +unlikely,0,1,2.67 +somewhat likely,0,0,3.54 +very likely,0,0,2.98 +somewhat likely,1,0,3.32 +somewhat likely,1,0,3.54 +unlikely,0,0,3.7 +unlikely,0,0,2.63 +somewhat likely,0,0,2.25 +somewhat likely,0,0,3.26 +very likely,0,0,3.52 diff --git a/statsmodels/miscmodels/tests/results/results_ordinal_model.py b/statsmodels/miscmodels/tests/results/results_ordinal_model.py new file mode 100644 index 00000000000..d4fb122796c --- /dev/null +++ b/statsmodels/miscmodels/tests/results/results_ordinal_model.py @@ -0,0 +1,114 @@ +""" +Test Results for ordinal models from R MASS lib +""" + +import numpy as np +import os +import pandas as pd +from statsmodels.tools.testing import Holder + +# R (v3.4.4) code inspired from +# https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/ +# library(readr) # to open the file +# library(MASS) # to perform ordinal regression +# +# ## load the data, 400 rows with 3 exogs(2 binaries, 1 float) +# ##and target 3-ordinal variable +# ologit_ucla <- read_csv("ologit_ucla.csv") +# ologit_ucla$apply <- as.factor(ologit_ucla$apply) +# ologit_ucla$apply <- factor(ologit_ucla$apply, +# levels=c("unlikely", "somewhat likely", "very likely")) +# +# ## fit ordered logit model +# r_logit <- polr(apply ~ pared + public + gpa, +# data = ologit_ucla, +# method = 'logit', # or 'probit' +# Hess=TRUE) +# +# ## fit ordered probit model +# r_probit <- polr(apply ~ pared + public + gpa, +# data = ologit_ucla, +# method = 'probit', +# Hess=TRUE) +# +# ## fit ordered cloglog model +# r_cloglog <- polr(apply ~ pared + public + gpa, +# data = ologit_ucla, +# method = 'cloglog', +# Hess=TRUE) +# +# ## with r = r_logit or r_probit or r_cloglog +# ## we add p-values +# (ctable <- coef(summary(r))) +# p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2 +# (ctable <- cbind(ctable, "p value" = p)) +# ## show 7 first predictions +# head(predict(r, subset(ologit_ucla, +# select=c("pared", "public","gpa")), type='prob'),7) + +data_store = Holder() +cur_dir = os.path.dirname(os.path.abspath(__file__)) +df = pd.read_csv(os.path.join(cur_dir, "ologit_ucla.csv")) + +# df_unordered['apply'] is pd.Categorical with ordered = False +df_unordered = df.copy() +df_unordered['apply'] = pd.Categorical(df['apply'], ordered=False) +# but categories are set in order +df_unordered['apply'].cat.set_categories( + ['unlikely', 'somewhat likely', 'very likely'], inplace=True) + +# df['apply'] is pd.Categorical with ordered = True +df['apply'] = pd.Categorical(df['apply'], ordered=True) +df['apply'].cat.set_categories( + ['unlikely', 'somewhat likely', 'very likely'], inplace=True) + +data_store.df_unordered = df_unordered +data_store.df = df +data_store.nobs = 400 +data_store.n_ordinal_cat = 3 + +res_ord_logit = Holder() +res_ord_logit.coefficients_val = \ + np.array([1.04769011, -0.05878572, 0.61594057]) +res_ord_logit.coefficients_stdE = np.array([0.2658, 0.2979, 0.2606]) +res_ord_logit.coefficients_tval = np.array([3.9418, -0.1974, 2.3632]) +res_ord_logit.coefficients_pval = \ + np.array([8.087070e-05, 8.435464e-01, 1.811594e-02]) +res_ord_logit.thresholds = np.array([2.203915, 4.299363]) +res_ord_logit.prob_pred = np.array([[0.5488310, 0.3593310, 0.09183798], + [0.3055632, 0.4759496, 0.21848725], + [0.2293835, 0.4781951, 0.29242138], + [0.6161224, 0.3126888, 0.07118879], + [0.6560149, 0.2833901, 0.06059505], + [0.6609240, 0.2797117, 0.05936431], + [0.6518332, 0.2865114, 0.06165547]]) + 
+res_ord_probit = Holder() +res_ord_probit.coefficients_val = np.array([0.59811, 0.01016, 0.35815]) +res_ord_probit.coefficients_stdE = np.array([0.1579, 0.1728, 0.1568]) +res_ord_probit.coefficients_tval = np.array([3.78881, 0.05878, 2.28479]) +res_ord_probit.coefficients_pval = \ + np.array([1.513681e-04, 9.531256e-01, 2.232519e-02]) +res_ord_probit.thresholds = np.array([1.2968, 2.5028]) +res_ord_probit.prob_pred = np.array([[0.5514181, 0.3576848, 0.09089707], + [0.3260107, 0.4488799, 0.22510933], + [0.2349733, 0.4506351, 0.31439162], + [0.6142501, 0.3184778, 0.06727214], + [0.6519891, 0.2928449, 0.05516602], + [0.6402204, 0.3009945, 0.05878509], + [0.6480094, 0.2956162, 0.05637442]]) + +res_ord_cloglog = Holder() +res_ord_cloglog.coefficients_val = np.array([0.5166455, 0.1081131, 0.3343895]) +res_ord_cloglog.coefficients_stdE = np.array([0.1613525, 0.1680675, 0.1542065]) +res_ord_cloglog.coefficients_tval = np.array([3.2019668, 0.6432721, 2.1684534]) +res_ord_cloglog.coefficients_pval = \ + np.array([1.364927e-03, 5.200475e-01, 3.012421e-02]) +res_ord_cloglog.thresholds = np.array([0.8705304, 1.9744660]) +res_ord_cloglog.prob_pred = np.array([[0.5519526, 0.3592524, 0.08879500], + [0.3855287, 0.3842645, 0.23020682], + [0.2899487, 0.3540202, 0.35603111], + [0.6067184, 0.3333548, 0.05992678], + [0.6411418, 0.3133969, 0.04546127], + [0.5940557, 0.3400072, 0.06593710], + [0.6374521, 0.3156622, 0.04688570]]) diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py new file mode 100644 index 00000000000..582ba08d0ca --- /dev/null +++ b/statsmodels/miscmodels/tests/test_ordinal_model.py @@ -0,0 +1,444 @@ +""" +Test for ordinal models +""" + +import warnings +import numpy as np +import scipy.stats as stats +import pandas as pd +import pytest + +from numpy.testing import assert_allclose, assert_equal +from statsmodels.tools.sm_exceptions import HessianInversionWarning +from .results.results_ordinal_model import data_store as ds +from statsmodels.miscmodels.ordinal_model import OrderedModel + +from statsmodels.discrete.discrete_model import Logit +from statsmodels.tools.tools import add_constant + + +class CheckOrdinalModelMixin(object): + + def test_basic(self): + # checks basic results againt R MASS package + n_cat = ds.n_ordinal_cat + res1 = self.res1 + res2 = self.res2 + # coefficients values, standard errors, t & p values + assert_allclose(res1.params[:-n_cat + 1], + res2.coefficients_val, atol=2e-4) + assert_allclose(res1.bse[:-n_cat + 1], + res2.coefficients_stdE, rtol=0.003, atol=1e-5) + assert_allclose(res1.tvalues[:-n_cat + 1], + res2.coefficients_tval, rtol=0.003, atol=7e-4) + assert_allclose(res1.pvalues[:-n_cat + 1], + res2.coefficients_pval, rtol=0.009, atol=1e-5) + # thresholds are given with exponentiated increments + # from the first threshold + assert_allclose( + res1.model.transform_threshold_params(res1.params)[1:-1], + res2.thresholds, atol=4e-4) + + # probabilities + assert_allclose(res1.predict()[:7, :], + res2.prob_pred, atol=5e-5) + + def test_pandas(self): + # makes sure that the Pandas ecosystem is supported + res1 = self.res1 + resp = self.resp + # converges slightly differently why? 
+ assert_allclose(res1.params, resp.params, atol=1e-10) + assert_allclose(res1.bse, resp.bse, atol=1e-10) + + assert_allclose(res1.model.endog, resp.model.endog, rtol=1e-10) + assert_allclose(res1.model.exog, resp.model.exog, rtol=1e-10) + + def test_formula(self): + # makes sure the "R-way" of writing models is supported + res1 = self.res1 + resf = self.resf + # converges slightly differently why? yet e-5 is ok + assert_allclose(res1.params, resf.params, atol=5e-5) + assert_allclose(res1.bse, resf.bse, atol=5e-5) + + assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10) + assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10) + + def test_unordered(self): + # makes sure that ordered = True is optional for the endog Serie + # et categories have to be set in the right order + res1 = self.res1 + resf = self.resu + # converges slightly differently why? + assert_allclose(res1.params, resf.params, atol=1e-10) + assert_allclose(res1.bse, resf.bse, atol=1e-10) + + assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10) + assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10) + + def test_results_other(self): + + res1 = self.res1 # numpy + resp = self.resp # pandas + + param_names_np = ['x1', 'x2', 'x3', '0/1', '1/2'] + param_names_pd = ['pared', 'public', 'gpa', 'unlikely/somewhat likely', + 'somewhat likely/very likely'] + + assert res1.model.data.param_names == param_names_np + assert self.resp.model.data.param_names == param_names_pd + assert self.resp.model.endog_names == "apply" + + # results + if hasattr(self, "pred_table"): + table = res1.pred_table() + assert_equal(table.values, self.pred_table) + + # smoke test + res1.summary() + + # inherited + tt = res1.t_test(np.eye(len(res1.params))) + assert_allclose(tt.pvalue, res1.pvalues, rtol=1e-13) + + tt = resp.t_test(['pared', 'public', 'gpa']) # pandas names + assert_allclose(tt.pvalue, res1.pvalues[:3], rtol=1e-13) + + pred = res1.predict(exog=res1.model.exog[-5:]) + fitted = res1.predict() + assert_allclose(pred, fitted[-5:], rtol=1e-13) + + pred = resp.predict(exog=resp.model.data.orig_exog.iloc[-5:]) + fitted = resp.predict() + assert_allclose(pred, fitted[-5:], rtol=1e-13) + + dataf = self.resf.model.data.frame # is a dict + dataf_df = pd.DataFrame.from_dict(dataf) + pred = self.resf.predict(exog=dataf_df.iloc[-5:]) + fitted = self.resf.predict() + assert_allclose(pred, fitted[-5:], rtol=1e-13) + + n, k = res1.model.exog.shape + assert_equal(self.resf.df_resid, n - (k + 2)) + + +class TestLogitModel(CheckOrdinalModelMixin): + + @classmethod + def setup_class(cls): + data = ds.df + data_unordered = ds.df_unordered + + # standard fit + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[['pared', 'public', 'gpa']], float), + distr='logit') + res = mod.fit(method='bfgs', disp=False) + # standard fit with pandas input + modp = OrderedModel(data['apply'], + data[['pared', 'public', 'gpa']], + distr='logit') + resp = modp.fit(method='bfgs', disp=False) + # fit with formula + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='logit') + resf = modf.fit(method='bfgs', disp=False) + # fit on data with ordered=False + modu = OrderedModel( + data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr='logit') + resu = modu.fit(method='bfgs', disp=False) + + from .results.results_ordinal_model import 
res_ord_logit as res2 + cls.res2 = res2 + cls.res1 = res + cls.resp = resp + cls.resf = resf + cls.resu = resu + + +class TestProbitModel(CheckOrdinalModelMixin): + + @classmethod + def setup_class(cls): + data = ds.df + data_unordered = ds.df_unordered + + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[['pared', 'public', 'gpa']], float), + distr='probit') + res = mod.fit(method='bfgs', disp=False) + + modp = OrderedModel(data['apply'], + data[['pared', 'public', 'gpa']], + distr='probit') + resp = modp.fit(method='bfgs', disp=False) + + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') + resf = modf.fit(method='bfgs', disp=False) + + modu = OrderedModel( + data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr='probit') + resu = modu.fit(method='bfgs', disp=False) + + from .results.results_ordinal_model import res_ord_probit as res2 + cls.res2 = res2 + cls.res1 = res + cls.resp = resp + cls.resf = resf + cls.resu = resu + + # regression numbers + cls.pred_table = np.array([[202, 18, 0, 220], + [112, 28, 0, 140], + [ 27, 13, 0, 40], # noqa + [341, 59, 0, 400]], dtype=np.int64) + + def test_loglikerelated(self): + + res1 = self.res1 + # res2 = self.res2 + + mod = res1.model + fact = 1.1 # evaluate away from optimum + score1 = mod.score(res1.params * fact) + score_obs_numdiff = mod.score_obs(res1.params * fact) + score_obs_exog = mod.score_obs_(res1.params * fact) + assert_allclose(score_obs_numdiff.sum(0), score1, atol=1e-7) + assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], atol=1e-7) + + # null model + mod_null = OrderedModel(mod.endog, None, + offset=np.zeros(mod.nobs), + distr=mod.distr) + null_params = mod.start_params + res_null = mod_null.fit(method='bfgs', disp=False) + assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8) + + def test_formula_categorical(self): + + resp = self.resp + data = ds.df + + formula = "apply ~ pared + public + gpa - 1" + modf2 = OrderedModel.from_formula(formula, + data, distr='probit') + resf2 = modf2.fit(method='bfgs', disp=False) + assert_allclose(resf2.params, resp.params, atol=1e-8) + assert modf2.exog_names == resp.model.exog_names + assert modf2.data.ynames == resp.model.data.ynames + assert hasattr(modf2.data, "frame") + assert not hasattr(modf2, "frame") + + with pytest.raises(ValueError): + # only ordered categorical or numerical endog are allowed + # string endog raises ValueError + OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": np.asarray(data['apply']), + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr='probit') + + def test_offset(self): + + resp = self.resp + data = ds.df + offset = np.ones(len(data)) + + formula = "apply ~ pared + public + gpa - 1" + modf2 = OrderedModel.from_formula(formula, data, offset=offset, + distr='probit') + resf2 = modf2.fit(method='bfgs', disp=False) + + assert_allclose(resf2.params[:3], resp.params[:3], atol=2e-4) + assert_allclose(resf2.params[3], resp.params[3] + 1, atol=2e-4) + + fitted = resp.predict() + fitted2 = resf2.predict() + assert_allclose(fitted2, fitted, atol=2e-4) + + pred_ones = resf2.predict(data[:6], offset=np.ones(6)) + assert_allclose(pred_ones, fitted[:6], atol=2e-4) + + # check default is 0. 
if exog provided + pred_zero1 = resf2.predict(data[:6]) + pred_zero2 = resf2.predict(data[:6], offset=0) + assert_allclose(pred_zero1, pred_zero2, atol=2e-4) + + # compare with equivalent results frp, no-offset model + pred_zero = resp.predict(data.iloc[:6, 1:], offset=-np.ones(6)) + assert_allclose(pred_zero1, pred_zero, atol=2e-4) + + params_adj = resp.params.copy() + params_adj[3] += 1 + fitted_zero = resp.model.predict(params_adj) + assert_allclose(pred_zero1, fitted_zero[:6], atol=2e-4) + + +class TestLogitModelFormula(): + + @classmethod + def setup_class(cls): + data = ds.df + nobs = len(data) + data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float) + # alias to correspond to patsy name + data["C(dummy)[T.1.0]"] = data["dummy"] + cls.data = data + + columns = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa'] + # standard fit + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[columns], float), + distr='logit') + cls.res = mod.fit(method='bfgs', disp=False) + # standard fit with pandas input + modp = OrderedModel(data['apply'], + data[columns], + distr='logit') + cls.resp = modp.fit(method='bfgs', disp=False) + + def test_setup(self): + data = self.data + resp = self.resp + fittedvalues = resp.predict() + + formulas = ["apply ~ 1 + pared + public + gpa + C(dummy)", + "apply ~ pared + public + gpa + C(dummy)"] + for formula in formulas: + modf1 = OrderedModel.from_formula(formula, data, distr='logit') + resf1 = modf1.fit(method='bfgs') + summf1 = resf1.summary() + summf1_str = str(summf1) + assert resf1.model.exog_names == resp.model.exog_names + assert resf1.model.data.param_names == resp.model.exog_names + assert all(name in summf1_str for name in + resp.model.data.param_names) + assert_allclose(resf1.predict(data[:5]), fittedvalues[:5]) + + # test over parameterized model with implicit constant + formula = "apply ~ 0 + pared + public + gpa + C(dummy)" + + with pytest.raises(ValueError): + OrderedModel.from_formula(formula, data, distr='logit') + + # ignore constant, so we get results without exception + modf2 = OrderedModel.from_formula(formula, data, distr='logit', + hasconst=False) + # we get a warning in some environments + with warnings.catch_warnings(): + warnings.simplefilter("ignore", HessianInversionWarning) + resf2 = modf2.fit(method='bfgs') + + assert_allclose(resf2.predict(data[:5]), fittedvalues[:5], rtol=1e-4) + + +class TestCLogLogModel(CheckOrdinalModelMixin): + + @classmethod + def setup_class(cls): + data = ds.df + data_unordered = ds.df_unordered + + # a Scipy distribution defined minimally + class CLogLog(stats.rv_continuous): + def _ppf(self, q): + return np.log(-np.log(1 - q)) + + def _cdf(self, x): + return 1 - np.exp(-np.exp(x)) + + cloglog = CLogLog() + + mod = OrderedModel(data['apply'].values.codes, + np.asarray(data[['pared', 'public', 'gpa']], float), + distr=cloglog) + res = mod.fit(method='bfgs', disp=False) + + modp = OrderedModel(data['apply'], + data[['pared', 'public', 'gpa']], + distr=cloglog) + resp = modp.fit(method='bfgs', disp=False) + + # with pytest.warns(UserWarning): + modf = OrderedModel.from_formula( + "apply ~ pared + public + gpa - 1", + data={"apply": data['apply'].values.codes, + "pared": data['pared'], + "public": data['public'], + "gpa": data['gpa']}, + distr=cloglog) + resf = modf.fit(method='bfgs', disp=False) + + modu = OrderedModel( + data_unordered['apply'].values.codes, + np.asarray(data_unordered[['pared', 'public', 'gpa']], float), + distr=cloglog) + resu = modu.fit(method='bfgs', disp=False) + + from 
.results.results_ordinal_model import res_ord_cloglog as res2 + cls.res2 = res2 + cls.res1 = res + cls.resp = resp + cls.resf = resf + cls.resu = resu + + +class TestLogitBinary(): + # compare OrderedModel with discrete Logit for binary case + def test_attributes(self): + data = ds.df + + mask_drop = data['apply'] == "somewhat likely" + data2 = data.loc[~mask_drop, :] + # we need to remove the category also from the Categorical Index + data2['apply'].cat.remove_categories("somewhat likely", inplace=True) + + # standard fit with pandas input + modp = OrderedModel(data2['apply'], + data2[['pared', 'public', 'gpa']], + distr='logit') + resp = modp.fit(method='bfgs', disp=False) + + exog = add_constant(data2[['pared', 'public', 'gpa']], prepend=False) + mod_logit = Logit(data2['apply'].cat.codes, exog) + res_logit = mod_logit.fit() + + attributes = "bse df_resid llf aic bic".split() + assert_allclose(resp.params[:3], res_logit.params[:3], rtol=1e-5) + assert_allclose(resp.params[3], -res_logit.params[3], rtol=1e-5) + for attr in attributes: + assert_allclose(getattr(resp, attr), getattr(res_logit, attr), + rtol=1e-4) + + resp = modp.fit(method='bfgs', disp=False, + cov_type="hac", cov_kwds={"maxlags": 2}) + res_logit = mod_logit.fit(method='bfgs', disp=False, + cov_type="hac", cov_kwds={"maxlags": 2}) + for attr in attributes: + assert_allclose(getattr(resp, attr), getattr(res_logit, attr), + rtol=1e-4) + + resp = modp.fit(method='bfgs', disp=False, cov_type="hc1") + res_logit = mod_logit.fit(method='bfgs', disp=False, + cov_type="hc1") + for attr in attributes: + assert_allclose(getattr(resp, attr), getattr(res_logit, attr), + rtol=1e-4)
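For quick reference, here is a minimal stand-alone sketch of the public API this patch adds. It is illustrative only and not part of the patch; it reuses the UCLA `ologit.dta` URL and the calls already shown in the notebook and tests above (`OrderedModel`, `fit(method='bfgs')`, `transform_threshold_params`, `predict`).

import numpy as np
import pandas as pd

from statsmodels.miscmodels.ordinal_model import OrderedModel

# ordered categorical endog (pandas Categorical), exog without a constant
url = "https://stats.idre.ucla.edu/stat/data/ologit.dta"
data_student = pd.read_stata(url)

mod = OrderedModel(data_student['apply'],
                   data_student[['pared', 'public', 'gpa']],
                   distr='logit')
res = mod.fit(method='bfgs', disp=False)
print(res.summary())

# thresholds are parameterized as (first cutoff, log increments);
# recover the actual cutoff points, padded with -inf/inf at the ends
num_of_thresholds = 2
print(mod.transform_threshold_params(res.params[-num_of_thresholds:]))

# predicted probabilities per level and fraction of correct modal predictions
predicted = res.model.predict(res.params,
                              exog=data_student[['pared', 'public', 'gpa']])
pred_choice = predicted.argmax(1)
print((np.asarray(data_student['apply'].values.codes) == pred_choice).mean())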