Skip to content

Commit

Permalink
Merge 2ac71f3 into 9ce1605
Browse files Browse the repository at this point in the history
  • Loading branch information
kshedden committed Sep 6, 2014
2 parents 9ce1605 + 2ac71f3 commit e1dd56b
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 23 deletions.
62 changes: 62 additions & 0 deletions statsmodels/duration/hazard_regression.py
Expand Up @@ -282,6 +282,68 @@ def __init__(self, endog, exog, status=None, entry=None,

self.ties = ties

@classmethod
def from_formula(cls, formula, data, status=None, entry=None,
strata=None, offset=None, subset=None,
ties='breslow', missing='drop', *args, **kwargs):
"""
Create a proportional hazards regression model from a formula
and dataframe.
Parameters
----------
formula : str or generic Formula object
The formula specifying the model
data : array-like
The data for the model. See Notes.
status : array-like
The censoring status values; status=1 indicates that an
event occured (e.g. failure or death), status=0 indicates
that the observation was right censored. If None, defaults
to status=1 for all cases.
entry : array-like
The entry times, if left truncation occurs
strata : array-like
Stratum labels. If None, all observations are taken to be
in a single stratum.
offset : array-like
Array of offset values
subset : array-like
An array-like object of booleans, integers, or index
values that indicate the subset of df to use in the
model. Assumes df is a `pandas.DataFrame`
ties : string
The method used to handle tied times, must be either 'breslow'
or 'efron'.
missing : string
The method used to handle missing data
args : extra arguments
These are passed to the model
kwargs : extra keyword arguments
These are passed to the model.
Returns
-------
model : PHReg model instance
"""

# Allow array arguments to be passed by column name.
if type(status) is str:
status = data[status]
if type(entry) is str:
entry = data[entry]
if type(strata) is str:
strata = data[strata]
if type(offset) is str:
offset = data[offset]

mod = super(PHReg, cls).from_formula(formula, data,
status=status, entry=entry, strata=strata,
offset=offset, subset=subset, ties=ties,
missing=missing, *args, **kwargs)

return mod

def fit(self, groups=None, **args):
"""
Fit a proportional hazards regression model.
Expand Down
74 changes: 51 additions & 23 deletions statsmodels/duration/tests/test_phreg.py
@@ -1,7 +1,9 @@
import os
import numpy as np
from statsmodels.duration.hazard_regression import PHReg
from numpy.testing import assert_almost_equal
from numpy.testing import (assert_allclose,
assert_equal)
import pandas as pd

# TODO: Include some corner cases: data sets with empty strata, strata
# with no events, entry times after censoring times, etc.
Expand Down Expand Up @@ -76,30 +78,30 @@ def do1(self, fname, ties, entry_f, strata_f):
mod = PHReg(time, exog, status, ties=ties)
phrb = mod.fit(**args)
coef_r, se_r, time_r, hazard_r = get_results(n, p, None, ties1)
assert_almost_equal(phrb.params, coef_r, decimal=4)
assert_almost_equal(phrb.bse, se_r, decimal=4)
assert_allclose(phrb.params, coef_r, rtol=1e-3)
assert_allclose(phrb.bse, se_r, rtol=1e-4)
#time_h, cumhaz, surv = phrb.baseline_hazard[0]

# Entry times but no stratification
phrb = PHReg(time, exog, status, entry=entry,
ties=ties).fit(**args)
coef, se, time_r, hazard_r = get_results(n, p, "et", ties1)
assert_almost_equal(phrb.params, coef, decimal=4)
assert_almost_equal(phrb.bse, se, decimal=4)
assert_allclose(phrb.params, coef, rtol=1e-3)
assert_allclose(phrb.bse, se, rtol=1e-3)

# Stratification but no entry times
phrb = PHReg(time, exog, status, strata=strata,
ties=ties).fit(**args)
coef, se, time_r, hazard_r = get_results(n, p, "st", ties1)
assert_almost_equal(phrb.params, coef, decimal=4)
assert_almost_equal(phrb.bse, se, decimal=4)
assert_allclose(phrb.params, coef, rtol=1e-4)
assert_allclose(phrb.bse, se, rtol=1e-4)

# Stratification and entry times
phrb = PHReg(time, exog, status, entry=entry,
strata=strata, ties=ties).fit(**args)
coef, se, time_r, hazard_r = get_results(n, p, "et_st", ties1)
assert_almost_equal(phrb.params, coef, decimal=4)
assert_almost_equal(phrb.bse, se, decimal=4)
assert_allclose(phrb.params, coef, rtol=1e-3)
assert_allclose(phrb.bse, se, rtol=1e-4)


# Run all the tests
Expand Down Expand Up @@ -130,14 +132,40 @@ def test_missing(self):
exog[10:15,:] = np.nan

md = PHReg(time, exog, status, missing='drop')
assert(len(md.endog) == 185)
assert(len(md.status) == 185)
assert(all(md.exog.shape == np.r_[185,4]))
assert_allclose(len(md.endog), 185)
assert_allclose(len(md.status), 185)
assert_allclose(md.exog.shape, np.r_[185,4])

def test_formula(self):
#Smoke test for import only
from statsmodels.api import PHReg
from statsmodels.formula.api import phreg

np.random.seed(34234)
time = 50 * np.random.uniform(size=200)
status = np.random.randint(0, 2, 200).astype(np.float64)
exog = np.random.normal(size=(200,4))
entry = np.zeros_like(time)
entry[0:10] = time[0:10] / 2

df = pd.DataFrame({"time": time, "status": status,
"exog1": exog[:, 0], "exog2": exog[:, 1],
"exog3": exog[:, 2], "exog4": exog[:, 3],
"entry": entry})

mod1 = PHReg(time, exog, status, entry=entry)
rslt1 = mod1.fit()

fml = "time ~ 0 + exog1 + exog2 + exog3 + exog4"
mod2 = PHReg.from_formula(fml, df, status=status,
entry=entry)
rslt2 = mod2.fit()

mod3 = PHReg.from_formula(fml, df, status="status",
entry="entry")
rslt3 = mod3.fit()

assert_allclose(rslt1.params, rslt2.params)
assert_allclose(rslt1.params, rslt3.params)
assert_allclose(rslt1.bse, rslt2.bse)
assert_allclose(rslt1.bse, rslt3.bse)

def test_offset(self):

Expand All @@ -154,7 +182,7 @@ def test_offset(self):
mod2 = PHReg(time, exog, status, offset=offset)
rslt2 = mod2.fit()

assert_almost_equal(rslt2.params, rslt1.params[1:])
assert_allclose(rslt2.params, rslt1.params[1:])

def test_post_estimation(self):
# All regression tests
Expand All @@ -166,36 +194,36 @@ def test_post_estimation(self):
mod = PHReg(time, exog, status)
rslt = mod.fit()
mart_resid = rslt.martingale_residuals
assert_almost_equal(np.abs(mart_resid).sum(), 120.72475743348433)
assert_allclose(np.abs(mart_resid).sum(), 120.72475743348433)

w_avg = rslt.weighted_covariate_averages
assert_almost_equal(np.abs(w_avg[0]).sum(0),
assert_allclose(np.abs(w_avg[0]).sum(0),
np.r_[7.31008415, 9.77608674,10.89515885, 13.1106801])

bc_haz = rslt.baseline_cumulative_hazard
v = [np.mean(np.abs(x)) for x in bc_haz[0]]
w = np.r_[23.482841556421608, 0.44149255358417017,
0.68660114081275281]
assert_almost_equal(v, w)
assert_allclose(v, w)

score_resid = rslt.score_residuals
v = np.r_[ 0.50924792, 0.4533952, 0.4876718, 0.5441128]
w = np.abs(score_resid).mean(0)
assert_almost_equal(v, w)
assert_allclose(v, w)

groups = np.random.randint(0, 3, 200)
mod = PHReg(time, exog, status)
rslt = mod.fit(groups=groups)
robust_cov = rslt.cov_params()
v = [0.00513432, 0.01278423, 0.00810427, 0.00293147]
w = np.abs(robust_cov).mean(0)
assert_almost_equal(v, w)
assert_allclose(v, w, rtol=1e-6)

s_resid = rslt.schoenfeld_residuals
ii = np.flatnonzero(np.isfinite(s_resid).all(1))
s_resid = s_resid[ii, :]
v = np.r_[0.85154336, 0.72993748, 0.73758071, 0.78599333]
assert_almost_equal(np.abs(s_resid).mean(0), v)
assert_allclose(np.abs(s_resid).mean(0), v)

def test_summary(self):
# smoke test
Expand Down Expand Up @@ -269,7 +297,7 @@ def test_fit_regularized(self):
# The agreement isn't very high, the issue may be on
# their side. They seem to use some approximations
# that we are not using.
assert_almost_equal(rslt.params, coef, decimal=1)
assert_allclose(rslt.params, coef, rtol=0.3)

# Smoke test for summary
smry = rslt.summary()
Expand Down

0 comments on commit e1dd56b

Please sign in to comment.