From 027ffd71f30ed5b81dc5447737d798de3000f2ba Mon Sep 17 00:00:00 2001
From: CamDavidsonPilon
Date: Mon, 4 May 2020 08:41:29 -0400
Subject: [PATCH] v0.24.6

---
 CHANGELOG.md                        |  9 ++++++
 lifelines/fitters/__init__.py       | 43 +++++++++++++++++------------
 lifelines/fitters/weibull_fitter.py |  2 +-
 lifelines/plotting.py               | 16 +++++++----
 lifelines/tests/test_estimation.py  | 12 ++++++++
 lifelines/tests/test_plotting.py    | 32 +++++++++++----------
 6 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5373d23ce..444906b23 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 ## Changelog

+#### 0.24.6 - unreleased
+
+##### New features
+ - At the cost of some performance, convergence is improved in many models.
+ - New `lifelines.plotting.plot_interval_censored_lifetimes` for plotting interval censored data.
+
+##### Bug fixes
+ - Fixed bug where `cdf_plot` and `qq_plot` were not factoring in the weights correctly.
+
 #### 0.24.5 - 2020-05-01

 ##### New features
diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py
index 902c4211c..045c6be44 100644
--- a/lifelines/fitters/__init__.py
+++ b/lifelines/fitters/__init__.py
@@ -487,26 +487,31 @@ def _fit_model(self, Ts, E, entry, weights, show_progress=True):
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")

-            results = minimize(
-                value_and_grad(negative_log_likelihood),  # pylint: disable=no-value-for-parameter
-                self._initial_values,
-                jac=True,
-                method=self._scipy_fit_method,
-                args=(Ts, E, entry, weights),
-                bounds=self._bounds,
-                options={**{"disp": show_progress}, **self._scipy_fit_options},
-            )
+            minimizing_results, minimizing_ll = None, np.inf
+            for method in [self._scipy_fit_method, "Powell"]:
+                results = minimize(
+                    value_and_grad(negative_log_likelihood),  # pylint: disable=no-value-for-parameter
+                    self._initial_values,
+                    jac=True,
+                    method=method,
+                    args=(Ts, E, entry, weights),
+                    bounds=self._bounds,
+                    options={**{"disp": show_progress}, **self._scipy_fit_options},
+                )
+                if results.fun < minimizing_ll:
+                    minimizing_ll = results.fun
+                    minimizing_results = results

             # convergence successful.
-            if results.success:
+            if minimizing_results and minimizing_results.success:
                 # pylint: disable=no-value-for-parameter
-                hessian_ = hessian(negative_log_likelihood)(results.x, Ts, E, entry, weights)
+                hessian_ = hessian(negative_log_likelihood)(minimizing_results.x, Ts, E, entry, weights)
                 # see issue https://github.com/CamDavidsonPilon/lifelines/issues/801
                 hessian_ = (hessian_ + hessian_.T) / 2
-                return results.x, -results.fun * weights.sum(), hessian_ * weights.sum()
+                return minimizing_results.x, -minimizing_results.fun * weights.sum(), hessian_ * weights.sum()

             # convergence failed.
-            print(results)
+            print(minimizing_results)
             if self._KNOWN_MODEL:
                 raise utils.ConvergenceError(
                     dedent(
@@ -1795,7 +1800,7 @@ def _fit_model(self, likelihood, Ts, Xs, E, weights, entries, show_progress=Fals
         if show_progress:
             print(minimum_results)

-        if minimum_results.success:
+        if minimum_results is not None and minimum_results.success:
             sum_weights = weights.sum()
             hessian_ = hessian(self._neg_likelihood_with_penalty_function)(minimum_results.x, Ts, E, weights, entries, Xs)
             # See issue https://github.com/CamDavidsonPilon/lifelines/issues/801
@@ -1807,16 +1812,18 @@ def _fit_model(self, likelihood, Ts, Xs, E, weights, entries, show_progress=Fals
             raise utils.ConvergenceError(
                 dedent(
                     """\
-                    Fitting did not converge. Try checking the following:
+                    Fitting did not converge. Try the following:

                    0. Are there any lifelines warnings outputted during the `fit`?
                    1. Inspect your DataFrame: does everything look as expected?
+                    2. Try scaling your duration vector down, i.e. `df["{duration_col}"] = df["{duration_col}"]/100`
                    2. Is there high-collinearity in the dataset? Try using the variance inflation factor (VIF) to find redundant variables.
                    3. Try using an alternate minimizer: ``fitter._scipy_fit_method = "SLSQP"``.
-                    4. Trying adding a small penalizer (or changing it, if already present). Example: `%s(penalizer=0.01).fit(...)`.
+                    4. Try adding a small penalizer (or changing it, if already present). Example: `{fitter_name}(penalizer=0.01).fit(...)`.
                    5. Are there any extreme outliers? Try modeling them or dropping them to see if it helps convergence.
-                    """
-                    % self._class_name
+                    """.format(
+                        duration_col=self.duration_col, fitter_name=self._class_name
+                    )
                )
            )

diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index 38dc0ce63..28060127a 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -96,7 +96,7 @@ class WeibullFitter(KnownModelParametricUnivariateFitter):
     _scipy_fit_options = {"ftol": 1e-14}

     def _create_initial_point(self, Ts, E, entry, weights):
-        return np.array([utils.coalesce(*Ts).std(), 1.0])
+        return np.array([utils.coalesce(*Ts).mean(), 1.5])

     def _cumulative_hazard(self, params, times):
         lambda_, rho_ = params
diff --git a/lifelines/plotting.py b/lifelines/plotting.py
index 8c55dfb31..5a459c1ce 100644
--- a/lifelines/plotting.py
+++ b/lifelines/plotting.py
@@ -88,11 +88,11 @@ def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):

     if CensoringType.is_left_censoring(model):
         empirical_kmf = KaplanMeierFitter().fit_left_censoring(
-            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
+            model.durations, model.event_observed, label=COL_EMP, timeline=timeline, weights=model.weights, entry=model.entry
         )
     elif CensoringType.is_right_censoring(model):
         empirical_kmf = KaplanMeierFitter().fit_right_censoring(
-            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
+            model.durations, model.event_observed, label=COL_EMP, timeline=timeline, weights=model.weights, entry=model.entry
         )
     elif CensoringType.is_interval_censoring(model):
         raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")
@@ -198,7 +198,7 @@ def rmst_plot(model, model2=None, t=np.inf, ax=None, text_position=None, **plot_
         )
         ax.text(
-            text_position[0], text_position[1], "RMST(%s) -\n RMST(%s)=%.3f" % (model._label, model2._label, rmst - rmst2),
+            text_position[0], text_position[1], "RMST(%s) -\n RMST(%s)=%.3f" % (model._label, model2._label, rmst - rmst2)
         )  # dynamically pick this.

    else:
        rmst = restricted_mean_survival_time(model, t=t)
@@ -255,14 +255,18 @@ def qq_plot(model, ax=None, **plot_kwargs):
     COL_THEO = "fitted %s quantiles" % dist

     if CensoringType.is_left_censoring(model):
-        kmf = KaplanMeierFitter().fit_left_censoring(model.durations, model.event_observed, label=COL_EMP)
+        kmf = KaplanMeierFitter().fit_left_censoring(
+            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
+        )
     elif CensoringType.is_right_censoring(model):
-        kmf = KaplanMeierFitter().fit_right_censoring(model.durations, model.event_observed, label=COL_EMP)
+        kmf = KaplanMeierFitter().fit_right_censoring(
+            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
+        )
     elif CensoringType.is_interval_censoring(model):
         raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

     q = np.unique(kmf.cumulative_density_.values[:, 0])
-    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
+
     quantiles = qth_survival_times(1 - q, kmf.survival_function_)
     quantiles[COL_THEO] = dist_object.ppf(q)
     quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()
diff --git a/lifelines/tests/test_estimation.py b/lifelines/tests/test_estimation.py
index 534412a34..189f0c20e 100644
--- a/lifelines/tests/test_estimation.py
+++ b/lifelines/tests/test_estimation.py
@@ -984,6 +984,18 @@ def test_interval_censoring_with_excepted_results(self):
         npt.assert_allclose(wf.rho_, 1.272946, rtol=1e-5)
         npt.assert_allclose(wf.lambda_, 7646.68135, rtol=1e-5)

+    def test_interval_censoring_against_reliasoft(self):
+        x_left = [0.1, 30, 60, 90, 120, 150, 180, 210, 240]
+        x_right = [30, 60, 90, 120, 150, 180, 210, 240, np.inf]
+        number = [2, 5, 1, 9, 6, 6, 5, 2, 2645]
+        censor = [0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+        wf = WeibullFitter()
+        wf.fit_interval_censoring(x_left, x_right, event_observed=censor, weights=number)
+
+        npt.assert_allclose(wf.rho_, 1.215, rtol=1e-2)
+        npt.assert_allclose(wf.lambda_, 8270.88, rtol=1e-2)
+

 class TestGeneralizedGammaFitter:
     def test_exponential_data_inference(self):
diff --git a/lifelines/tests/test_plotting.py b/lifelines/tests/test_plotting.py
index 0c3980f93..d3870a66a 100644
--- a/lifelines/tests/test_plotting.py
+++ b/lifelines/tests/test_plotting.py
@@ -53,9 +53,7 @@ def setup_method(self, method):

         self.plt = plt

-    def test_parametric_univariate_fitters_has_hazard_plotting_methods(
-        self, block, known_parametric_univariate_fitters
-    ):
+    def test_parametric_univariate_fitters_has_hazard_plotting_methods(self, block, known_parametric_univariate_fitters):
         positive_sample_lifetimes = np.arange(1, 100)
         for fitter in known_parametric_univariate_fitters:
             f = fitter().fit(positive_sample_lifetimes)
@@ -63,9 +61,7 @@ def test_parametric_univariate_fitters_has_hazard_plotting_methods(
         self.plt.title("test_parametric_univariate_fitters_has_hazard_plotting_methods")
         self.plt.show(block=block)

-    def test_parametric_univaraite_fitters_has_cumhazard_plotting_methods(
-        self, block, known_parametric_univariate_fitters
-    ):
+    def test_parametric_univaraite_fitters_has_cumhazard_plotting_methods(self, block, known_parametric_univariate_fitters):
         positive_sample_lifetimes = np.arange(1, 100)
         for fitter in known_parametric_univariate_fitters:
             f = fitter().fit(positive_sample_lifetimes)
@@ -74,9 +70,7 @@ def test_parametric_univaraite_fitters_has_cumhazard_plotting_methods(
         self.plt.title("test_parametric_univaraite_fitters_has_cumhazard_plotting_methods")
self.plt.show(block=block) - def test_parametric_univariate_fitters_has_survival_plotting_methods( - self, block, known_parametric_univariate_fitters - ): + def test_parametric_univariate_fitters_has_survival_plotting_methods(self, block, known_parametric_univariate_fitters): positive_sample_lifetimes = np.arange(1, 100) for fitter in known_parametric_univariate_fitters: f = fitter().fit(positive_sample_lifetimes) @@ -538,9 +532,7 @@ def test_aalen_additive_fit_no_censor(self, block): timeline = np.linspace(0, 70, 10000) hz, coef, X = generate_hazard_rates(n, d, timeline) X.columns = coef.columns - cumulative_hazards = pd.DataFrame( - cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns - ) + cumulative_hazards = pd.DataFrame(cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns) T = generate_random_lifetimes(hz, timeline) X["T"] = T X["E"] = np.random.binomial(1, 1, n) @@ -563,9 +555,7 @@ def test_aalen_additive_fit_with_censor(self, block): timeline = np.linspace(0, 70, 10000) hz, coef, X = generate_hazard_rates(n, d, timeline) X.columns = coef.columns - cumulative_hazards = pd.DataFrame( - cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns - ) + cumulative_hazards = pd.DataFrame(cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns) T = generate_random_lifetimes(hz, timeline) T[np.isinf(T)] = 10 X["T"] = T @@ -686,6 +676,18 @@ def test_qq_plot_left_censoring_with_known_distribution(self, block): self.plt.suptitle("test_qq_plot_left_censoring_with_known_distribution") self.plt.show(block=block) + def test_qq_plot_with_weights_and_entry(self, block): + from lifelines.utils import survival_events_from_table + + df = pd.DataFrame(index=[60, 171, 263, 427, 505, 639]) + df["death"] = [1, 1, 1, 0, 1, 0] + df["censored"] = [0, 0, 0, 3, 0, 330] + T, E, W = survival_events_from_table(df, observed_deaths_col="death", censored_col="censored") + wf = WeibullFitter().fit(T, E, weights=W, entry=0.0001 * np.ones_like(T)) + ax = qq_plot(wf) + self.plt.suptitle("test_qq_plot_with_weights_and_entry") + self.plt.show(block=block) + def test_qq_plot_right_censoring_with_known_distribution(self, block): N = 3000 T_actual = scipy.stats.fisk(8, 0, 1).rvs(N)
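
Usage sketch for the plotting changes above. The weighted `qq_plot` call mirrors the new `test_qq_plot_with_weights_and_entry` test; the `lower_bound`/`upper_bound` argument names for `lifelines.plotting.plot_interval_censored_lifetimes` are an assumption based on the `fit_interval_censoring` convention and may differ from the released signature.

    import numpy as np
    import pandas as pd

    from lifelines import WeibullFitter
    from lifelines.plotting import plot_interval_censored_lifetimes, qq_plot
    from lifelines.utils import survival_events_from_table

    # Interval censored observations: each event is only known to lie between a
    # lower and an upper bound.
    lower_bound = np.array([0.1, 30, 60, 90, 120])
    upper_bound = np.array([30, 60, 90, 120, 150])
    plot_interval_censored_lifetimes(lower_bound, upper_bound)  # assumed positional arguments

    # Weighted qq_plot: with the fix above, the weights flow into the
    # KaplanMeierFitter used for the empirical quantiles.
    df = pd.DataFrame(index=[60, 171, 263, 427, 505, 639])
    df["death"] = [1, 1, 1, 0, 1, 0]
    df["censored"] = [0, 0, 0, 3, 0, 330]
    T, E, W = survival_events_from_table(df, observed_deaths_col="death", censored_col="censored")
    wf = WeibullFitter().fit(T, E, weights=W)
    qq_plot(wf)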