Merge 2ac71f3 into 9ce1605

statsmodels · Sep 6, 2014 · e1dd56b · e1dd56b
2 parents 9ce1605 + 2ac71f3
commit e1dd56b
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 23 deletions.
diff --git a/statsmodels/duration/hazard_regression.py b/statsmodels/duration/hazard_regression.py
@@ -282,6 +282,68 @@ def __init__(self, endog, exog, status=None, entry=None,
 
         self.ties = ties
 
+    @classmethod
+    def from_formula(cls, formula, data, status=None, entry=None,
+                     strata=None, offset=None, subset=None,
+                     ties='breslow', missing='drop', *args, **kwargs):
+        """
+        Create a proportional hazards regression model from a formula
+        and dataframe.
+
+        Parameters
+        ----------
+        formula : str or generic Formula object
+            The formula specifying the model
+        data : array-like
+            The data for the model. See Notes.
+        status : array-like
+            The censoring status values; status=1 indicates that an
+            event occured (e.g. failure or death), status=0 indicates
+            that the observation was right censored. If None, defaults
+            to status=1 for all cases.
+        entry : array-like
+            The entry times, if left truncation occurs
+        strata : array-like
+            Stratum labels.  If None, all observations are taken to be
+            in a single stratum.
+        offset : array-like
+            Array of offset values
+        subset : array-like
+            An array-like object of booleans, integers, or index
+            values that indicate the subset of df to use in the
+            model. Assumes df is a `pandas.DataFrame`
+        ties : string
+            The method used to handle tied times, must be either 'breslow'
+            or 'efron'.
+        missing : string
+            The method used to handle missing data
+        args : extra arguments
+            These are passed to the model
+        kwargs : extra keyword arguments
+            These are passed to the model.
+
+        Returns
+        -------
+        model : PHReg model instance
+        """
+
+        # Allow array arguments to be passed by column name.
+        if type(status) is str:
+            status = data[status]
+        if type(entry) is str:
+            entry = data[entry]
+        if type(strata) is str:
+            strata = data[strata]
+        if type(offset) is str:
+            offset = data[offset]
+
+        mod = super(PHReg, cls).from_formula(formula, data,
+                    status=status, entry=entry, strata=strata,
+                    offset=offset, subset=subset, ties=ties,
+                    missing=missing, *args, **kwargs)
+
+        return mod
+
     def fit(self, groups=None, **args):
         """
         Fit a proportional hazards regression model.

diff --git a/statsmodels/duration/tests/test_phreg.py b/statsmodels/duration/tests/test_phreg.py
@@ -1,7 +1,9 @@
 import os
 import numpy as np
 from statsmodels.duration.hazard_regression import PHReg
-from numpy.testing import assert_almost_equal
+from numpy.testing import (assert_allclose,
+                           assert_equal)
+import pandas as pd
 
 # TODO: Include some corner cases: data sets with empty strata, strata
 #      with no events, entry times after censoring times, etc.
@@ -76,30 +78,30 @@ def do1(self, fname, ties, entry_f, strata_f):
         mod = PHReg(time, exog, status, ties=ties)
         phrb = mod.fit(**args)
         coef_r, se_r, time_r, hazard_r = get_results(n, p, None, ties1)
-        assert_almost_equal(phrb.params, coef_r, decimal=4)
-        assert_almost_equal(phrb.bse, se_r, decimal=4)
+        assert_allclose(phrb.params, coef_r, rtol=1e-3)
+        assert_allclose(phrb.bse, se_r, rtol=1e-4)
         #time_h, cumhaz, surv = phrb.baseline_hazard[0]
 
         # Entry times but no stratification
         phrb = PHReg(time, exog, status, entry=entry,
                      ties=ties).fit(**args)
         coef, se, time_r, hazard_r = get_results(n, p, "et", ties1)
-        assert_almost_equal(phrb.params, coef, decimal=4)
-        assert_almost_equal(phrb.bse, se, decimal=4)
+        assert_allclose(phrb.params, coef, rtol=1e-3)
+        assert_allclose(phrb.bse, se, rtol=1e-3)
 
         # Stratification but no entry times
         phrb = PHReg(time, exog, status, strata=strata,
                       ties=ties).fit(**args)
         coef, se, time_r, hazard_r = get_results(n, p, "st", ties1)
-        assert_almost_equal(phrb.params, coef, decimal=4)
-        assert_almost_equal(phrb.bse, se, decimal=4)
+        assert_allclose(phrb.params, coef, rtol=1e-4)
+        assert_allclose(phrb.bse, se, rtol=1e-4)
 
         # Stratification and entry times
         phrb = PHReg(time, exog, status, entry=entry,
                      strata=strata, ties=ties).fit(**args)
         coef, se, time_r, hazard_r = get_results(n, p, "et_st", ties1)
-        assert_almost_equal(phrb.params, coef, decimal=4)
-        assert_almost_equal(phrb.bse, se, decimal=4)
+        assert_allclose(phrb.params, coef, rtol=1e-3)
+        assert_allclose(phrb.bse, se, rtol=1e-4)
 
 
     # Run all the tests
@@ -130,14 +132,40 @@ def test_missing(self):
         exog[10:15,:] = np.nan
 
         md = PHReg(time, exog, status, missing='drop')
-        assert(len(md.endog) == 185)
-        assert(len(md.status) == 185)
-        assert(all(md.exog.shape == np.r_[185,4]))
+        assert_allclose(len(md.endog), 185)
+        assert_allclose(len(md.status), 185)
+        assert_allclose(md.exog.shape, np.r_[185,4])
 
     def test_formula(self):
-        #Smoke test for import only
-        from statsmodels.api import PHReg
-        from statsmodels.formula.api import phreg
+
+        np.random.seed(34234)
+        time = 50 * np.random.uniform(size=200)
+        status = np.random.randint(0, 2, 200).astype(np.float64)
+        exog = np.random.normal(size=(200,4))
+        entry = np.zeros_like(time)
+        entry[0:10] = time[0:10] / 2
+
+        df = pd.DataFrame({"time": time, "status": status,
+                           "exog1": exog[:, 0], "exog2": exog[:, 1],
+                           "exog3": exog[:, 2], "exog4": exog[:, 3],
+                           "entry": entry})
+
+        mod1 = PHReg(time, exog, status, entry=entry)
+        rslt1 = mod1.fit()
+
+        fml = "time ~ 0 + exog1 + exog2 + exog3 + exog4"
+        mod2 = PHReg.from_formula(fml, df, status=status,
+                                  entry=entry)
+        rslt2 = mod2.fit()
+
+        mod3 = PHReg.from_formula(fml, df, status="status",
+                                  entry="entry")
+        rslt3 = mod3.fit()
+
+        assert_allclose(rslt1.params, rslt2.params)
+        assert_allclose(rslt1.params, rslt3.params)
+        assert_allclose(rslt1.bse, rslt2.bse)
+        assert_allclose(rslt1.bse, rslt3.bse)
 
     def test_offset(self):
 
@@ -154,7 +182,7 @@ def test_offset(self):
         mod2 = PHReg(time, exog, status, offset=offset)
         rslt2 = mod2.fit()
 
-        assert_almost_equal(rslt2.params, rslt1.params[1:])
+        assert_allclose(rslt2.params, rslt1.params[1:])
 
     def test_post_estimation(self):
         # All regression tests
@@ -166,36 +194,36 @@ def test_post_estimation(self):
         mod = PHReg(time, exog, status)
         rslt = mod.fit()
         mart_resid = rslt.martingale_residuals
-        assert_almost_equal(np.abs(mart_resid).sum(), 120.72475743348433)
+        assert_allclose(np.abs(mart_resid).sum(), 120.72475743348433)
 
         w_avg = rslt.weighted_covariate_averages
-        assert_almost_equal(np.abs(w_avg[0]).sum(0),
+        assert_allclose(np.abs(w_avg[0]).sum(0),
                np.r_[7.31008415, 9.77608674,10.89515885, 13.1106801])
 
         bc_haz = rslt.baseline_cumulative_hazard
         v = [np.mean(np.abs(x)) for x in bc_haz[0]]
         w = np.r_[23.482841556421608, 0.44149255358417017,
                   0.68660114081275281]
-        assert_almost_equal(v, w)
+        assert_allclose(v, w)
 
         score_resid = rslt.score_residuals
         v = np.r_[ 0.50924792, 0.4533952, 0.4876718, 0.5441128]
         w = np.abs(score_resid).mean(0)
-        assert_almost_equal(v, w)
+        assert_allclose(v, w)
 
         groups = np.random.randint(0, 3, 200)
         mod = PHReg(time, exog, status)
         rslt = mod.fit(groups=groups)
         robust_cov = rslt.cov_params()
         v = [0.00513432, 0.01278423, 0.00810427, 0.00293147]
         w = np.abs(robust_cov).mean(0)
-        assert_almost_equal(v, w)
+        assert_allclose(v, w, rtol=1e-6)
 
         s_resid = rslt.schoenfeld_residuals
         ii = np.flatnonzero(np.isfinite(s_resid).all(1))
         s_resid = s_resid[ii, :]
         v = np.r_[0.85154336, 0.72993748, 0.73758071, 0.78599333]
-        assert_almost_equal(np.abs(s_resid).mean(0), v)
+        assert_allclose(np.abs(s_resid).mean(0), v)
 
     def test_summary(self):
         # smoke test
@@ -269,7 +297,7 @@ def test_fit_regularized(self):
                 # The agreement isn't very high, the issue may be on
                 # their side.  They seem to use some approximations
                 # that we are not using.
-                assert_almost_equal(rslt.params, coef, decimal=1)
+                assert_allclose(rslt.params, coef, rtol=0.3)
 
                 # Smoke test for summary
                 smry = rslt.summary()