v0.24.6

sean-reed · May 4, 2020 · 027ffd7 · 027ffd7
1 parent 290d583
commit 027ffd7
Show file tree

Hide file tree

Showing 6 changed files with 74 additions and 40 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,14 @@
 ## Changelog
 
+#### 0.24.6 - unreleased
+
+##### New features
+ - At the cost of some performance, convergence is improved in many models.
+ - New `lifelines.plotting.plot_interval_censored_lifetimes` for plotting interval censored data
+
+##### Bug fixes
+ - fixed bug where `cdf_plot` and `qq_plot` were not factoring in the weights correctly.
+
 #### 0.24.5 - 2020-05-01
 
 ##### New features

diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py
@@ -487,26 +487,31 @@ def _fit_model(self, Ts, E, entry, weights, show_progress=True):
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
 
-            results = minimize(
-                value_and_grad(negative_log_likelihood),  # pylint: disable=no-value-for-parameter
-                self._initial_values,
-                jac=True,
-                method=self._scipy_fit_method,
-                args=(Ts, E, entry, weights),
-                bounds=self._bounds,
-                options={**{"disp": show_progress}, **self._scipy_fit_options},
-            )
+            minimizing_results, minimizing_ll = None, np.inf
+            for method in [self._scipy_fit_method, "Powell"]:
+                results = minimize(
+                    value_and_grad(negative_log_likelihood),  # pylint: disable=no-value-for-parameter
+                    self._initial_values,
+                    jac=True,
+                    method=method,
+                    args=(Ts, E, entry, weights),
+                    bounds=self._bounds,
+                    options={**{"disp": show_progress}, **self._scipy_fit_options},
+                )
+                if results.fun < minimizing_ll:
+                    minimizing_ll = results.fun
+                    minimizing_results = results
 
             # convergence successful.
-            if results.success:
+            if minimizing_results and minimizing_results.success:
                 # pylint: disable=no-value-for-parameter
-                hessian_ = hessian(negative_log_likelihood)(results.x, Ts, E, entry, weights)
+                hessian_ = hessian(negative_log_likelihood)(minimizing_results.x, Ts, E, entry, weights)
                 # see issue https://github.com/CamDavidsonPilon/lifelines/issues/801
                 hessian_ = (hessian_ + hessian_.T) / 2
-                return results.x, -results.fun * weights.sum(), hessian_ * weights.sum()
+                return minimizing_results.x, -minimizing_results.fun * weights.sum(), hessian_ * weights.sum()
 
             # convergence failed.
-            print(results)
+            print(minimizing_results)
             if self._KNOWN_MODEL:
                 raise utils.ConvergenceError(
                     dedent(
@@ -1795,7 +1800,7 @@ def _fit_model(self, likelihood, Ts, Xs, E, weights, entries, show_progress=Fals
         if show_progress:
             print(minimum_results)
 
-        if minimum_results.success:
+        if minimum_results is not None and minimum_results.success:
             sum_weights = weights.sum()
             hessian_ = hessian(self._neg_likelihood_with_penalty_function)(minimum_results.x, Ts, E, weights, entries, Xs)
             # See issue https://github.com/CamDavidsonPilon/lifelines/issues/801
@@ -1807,16 +1812,18 @@ def _fit_model(self, likelihood, Ts, Xs, E, weights, entries, show_progress=Fals
             raise utils.ConvergenceError(
                 dedent(
                     """\
-                Fitting did not converge. Try checking the following:
+                Fitting did not converge. Try the following:
 
                 0. Are there any lifelines warnings outputted during the `fit`?
                 1. Inspect your DataFrame: does everything look as expected?
+                2. Try scaling your duration vector down, i.e. `df["{duration_col}"] = df["{duration_col}"]/100`
                 2. Is there high-collinearity in the dataset? Try using the variance inflation factor (VIF) to find redundant variables.
                 3. Try using an alternate minimizer: ``fitter._scipy_fit_method = "SLSQP"``.
-                4. Trying adding a small penalizer (or changing it, if already present). Example: `%s(penalizer=0.01).fit(...)`.
+                4. Trying adding a small penalizer (or changing it, if already present). Example: `{fitter_name}(penalizer=0.01).fit(...)`.
                 5. Are there any extreme outliers? Try modeling them or dropping them to see if it helps convergence.
-            """
-                    % self._class_name
+            """.format(
+                        duration_col=self.duration_col, fitter_name=self._class_name
+                    )
                 )
             )
 

diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
@@ -96,7 +96,7 @@ class WeibullFitter(KnownModelParametricUnivariateFitter):
     _scipy_fit_options = {"ftol": 1e-14}
 
     def _create_initial_point(self, Ts, E, entry, weights):
-        return np.array([utils.coalesce(*Ts).std(), 1.0])
+        return np.array([utils.coalesce(*Ts).mean(), 1.5])
 
     def _cumulative_hazard(self, params, times):
         lambda_, rho_ = params

diff --git a/lifelines/plotting.py b/lifelines/plotting.py
@@ -88,11 +88,11 @@ def cdf_plot(model, timeline=None, ax=None, **plot_kwargs):
 
     if CensoringType.is_left_censoring(model):
         empirical_kmf = KaplanMeierFitter().fit_left_censoring(
-            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
+            model.durations, model.event_observed, label=COL_EMP, timeline=timeline, weights=model.weights, entry=model.entry
         )
     elif CensoringType.is_right_censoring(model):
         empirical_kmf = KaplanMeierFitter().fit_right_censoring(
-            model.durations, model.event_observed, label=COL_EMP, timeline=timeline
+            model.durations, model.event_observed, label=COL_EMP, timeline=timeline, weights=model.weights, entry=model.entry
         )
     elif CensoringType.is_interval_censoring(model):
         raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")
@@ -198,7 +198,7 @@ def rmst_plot(model, model2=None, t=np.inf, ax=None, text_position=None, **plot_
         )
 
         ax.text(
-            text_position[0], text_position[1], "RMST(%s) -\n   RMST(%s)=%.3f" % (model._label, model2._label, rmst - rmst2),
+            text_position[0], text_position[1], "RMST(%s) -\n   RMST(%s)=%.3f" % (model._label, model2._label, rmst - rmst2)
         )  # dynamically pick this.
     else:
         rmst = restricted_mean_survival_time(model, t=t)
@@ -255,14 +255,18 @@ def qq_plot(model, ax=None, **plot_kwargs):
     COL_THEO = "fitted %s quantiles" % dist
 
     if CensoringType.is_left_censoring(model):
-        kmf = KaplanMeierFitter().fit_left_censoring(model.durations, model.event_observed, label=COL_EMP)
+        kmf = KaplanMeierFitter().fit_left_censoring(
+            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
+        )
     elif CensoringType.is_right_censoring(model):
-        kmf = KaplanMeierFitter().fit_right_censoring(model.durations, model.event_observed, label=COL_EMP)
+        kmf = KaplanMeierFitter().fit_right_censoring(
+            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
+        )
     elif CensoringType.is_interval_censoring(model):
         raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")
 
     q = np.unique(kmf.cumulative_density_.values[:, 0])
-    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
+
     quantiles = qth_survival_times(1 - q, kmf.survival_function_)
     quantiles[COL_THEO] = dist_object.ppf(q)
     quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

diff --git a/lifelines/tests/test_estimation.py b/lifelines/tests/test_estimation.py
@@ -984,6 +984,18 @@ def test_interval_censoring_with_excepted_results(self):
         npt.assert_allclose(wf.rho_, 1.272946, rtol=1e-5)
         npt.assert_allclose(wf.lambda_, 7646.68135, rtol=1e-5)
 
+    def test_interval_censoring_against_reliasoft(self):
+        x_left = [0.1, 30, 60, 90, 120, 150, 180, 210, 240]
+        x_right = [30, 60, 90, 120, 150, 180, 210, 240, np.inf]
+        number = [2, 5, 1, 9, 6, 6, 5, 2, 2645]
+        censor = [0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+        wf = WeibullFitter()
+        wf.fit_interval_censoring(x_left, x_right, event_observed=censor, weights=number)
+
+        npt.assert_allclose(wf.rho_, 1.215, rtol=1e-2)
+        npt.assert_allclose(wf.lambda_, 8270.88, rtol=1e-2)
+
 
 class TestGeneralizedGammaFitter:
     def test_exponential_data_inference(self):

diff --git a/lifelines/tests/test_plotting.py b/lifelines/tests/test_plotting.py
@@ -53,19 +53,15 @@ def setup_method(self, method):
 
         self.plt = plt
 
-    def test_parametric_univariate_fitters_has_hazard_plotting_methods(
-        self, block, known_parametric_univariate_fitters
-    ):
+    def test_parametric_univariate_fitters_has_hazard_plotting_methods(self, block, known_parametric_univariate_fitters):
         positive_sample_lifetimes = np.arange(1, 100)
         for fitter in known_parametric_univariate_fitters:
             f = fitter().fit(positive_sample_lifetimes)
             assert f.plot_hazard() is not None
         self.plt.title("test_parametric_univariate_fitters_has_hazard_plotting_methods")
         self.plt.show(block=block)
 
-    def test_parametric_univaraite_fitters_has_cumhazard_plotting_methods(
-        self, block, known_parametric_univariate_fitters
-    ):
+    def test_parametric_univaraite_fitters_has_cumhazard_plotting_methods(self, block, known_parametric_univariate_fitters):
         positive_sample_lifetimes = np.arange(1, 100)
         for fitter in known_parametric_univariate_fitters:
             f = fitter().fit(positive_sample_lifetimes)
@@ -74,9 +70,7 @@ def test_parametric_univaraite_fitters_has_cumhazard_plotting_methods(
         self.plt.title("test_parametric_univaraite_fitters_has_cumhazard_plotting_methods")
         self.plt.show(block=block)
 
-    def test_parametric_univariate_fitters_has_survival_plotting_methods(
-        self, block, known_parametric_univariate_fitters
-    ):
+    def test_parametric_univariate_fitters_has_survival_plotting_methods(self, block, known_parametric_univariate_fitters):
         positive_sample_lifetimes = np.arange(1, 100)
         for fitter in known_parametric_univariate_fitters:
             f = fitter().fit(positive_sample_lifetimes)
@@ -538,9 +532,7 @@ def test_aalen_additive_fit_no_censor(self, block):
         timeline = np.linspace(0, 70, 10000)
         hz, coef, X = generate_hazard_rates(n, d, timeline)
         X.columns = coef.columns
-        cumulative_hazards = pd.DataFrame(
-            cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns
-        )
+        cumulative_hazards = pd.DataFrame(cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns)
         T = generate_random_lifetimes(hz, timeline)
         X["T"] = T
         X["E"] = np.random.binomial(1, 1, n)
@@ -563,9 +555,7 @@ def test_aalen_additive_fit_with_censor(self, block):
         timeline = np.linspace(0, 70, 10000)
         hz, coef, X = generate_hazard_rates(n, d, timeline)
         X.columns = coef.columns
-        cumulative_hazards = pd.DataFrame(
-            cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns
-        )
+        cumulative_hazards = pd.DataFrame(cumulative_integral(coef.values, timeline), index=timeline, columns=coef.columns)
         T = generate_random_lifetimes(hz, timeline)
         T[np.isinf(T)] = 10
         X["T"] = T
@@ -686,6 +676,18 @@ def test_qq_plot_left_censoring_with_known_distribution(self, block):
         self.plt.suptitle("test_qq_plot_left_censoring_with_known_distribution")
         self.plt.show(block=block)
 
+    def test_qq_plot_with_weights_and_entry(self, block):
+        from lifelines.utils import survival_events_from_table
+
+        df = pd.DataFrame(index=[60, 171, 263, 427, 505, 639])
+        df["death"] = [1, 1, 1, 0, 1, 0]
+        df["censored"] = [0, 0, 0, 3, 0, 330]
+        T, E, W = survival_events_from_table(df, observed_deaths_col="death", censored_col="censored")
+        wf = WeibullFitter().fit(T, E, weights=W, entry=0.0001 * np.ones_like(T))
+        ax = qq_plot(wf)
+        self.plt.suptitle("test_qq_plot_with_weights_and_entry")
+        self.plt.show(block=block)
+
     def test_qq_plot_right_censoring_with_known_distribution(self, block):
         N = 3000
         T_actual = scipy.stats.fisk(8, 0, 1).rvs(N)