In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns

from matplotlib import pyplot as plt
from pymc3.distributions.timeseries import GaussianRandomWalk
from theano import tensor as T

In [None]:
# Load the mastectomy data set and normalise it in one chain:
#   * the "metastasized" column (if present) is renamed to "metastized", the
#     spelling used by the rest of this notebook;
#   * "event" is cast to int64;
#   * "metastized" is turned from "yes"/other into a 1/0 int64 indicator.
df = (
    pd.read_csv(pm.get_data("mastectomy.csv"))
    .rename(columns={"metastasized": "metastized"})
    .assign(
        event=lambda frame: frame["event"].astype(np.int64),
        metastized=lambda frame: (frame["metastized"] == "yes").astype(np.int64),
    )
)

# One row per patient; `patients` is the 0..n_patients-1 row index used for
# positional indexing into per-patient arrays later on.
n_patients = len(df)
patients = np.arange(n_patients)

In [None]:
# Display the number of rows (patients) in the loaded data frame.
n_patients

In [None]:
# Mean of the 0/1 `event` column, i.e. the fraction of rows with event == 1.
df["event"].mean()


In [None]:
# Preview the first rows of the cleaned data frame.
df.head()

In [None]:
# Timeline plot: one horizontal line per subject running from 0 to the
# subject's `time`, coloured by the `event` flag, with a black marker at the
# end of the line for subjects whose `metastized` indicator is 1.
fig, ax = plt.subplots(figsize=(8, 6))

blue, _, red = sns.color_palette()[:3]

# event == 0 rows are drawn first (blue), then event == 1 rows (red).
for event_flag, line_color, legend_label in ((0, blue, "Censored"), (1, red, "Uncensored")):
    in_group = df.event.values == event_flag
    ax.hlines(patients[in_group], 0, df[in_group].time, color=line_color, label=legend_label)

# zorder=10 keeps the markers on top of the lines.
has_metastasis = df.metastized.values == 1
ax.scatter(
    df[has_metastasis].time,
    patients[has_metastasis],
    color="k",
    zorder=10,
    label="Metastized",
)

# x axis: time, starting at zero.
ax.set_xlim(left=0)
ax.set_xlabel("Months since mastectomy")

# y axis: one slot per subject, so tick labels are suppressed.
ax.set_yticks([])
ax.set_ylabel("Subject")
ax.set_ylim(-0.25, n_patients + 0.25)

ax.legend(loc="center right");

In [None]:
# Width of each time interval, in the same units as `df.time`.
interval_length = 3

# Interval edges 0, interval_length, 2 * interval_length, ...  The stop value
# is padded by one interval (plus one) so the last edge lies beyond the
# largest observed time.
longest_time = df["time"].max()
interval_bounds = np.arange(0, longest_time + interval_length + 1, interval_length)

# k + 1 edges delimit k intervals; `intervals` is their 0-based index.
n_intervals = len(interval_bounds) - 1
intervals = np.arange(n_intervals)

In [None]:
# Overlaid histograms of `time`, binned on the interval edges, split by the
# `event` flag.  event == 1 is drawn first (red), then event == 0 (blue).
fig, ax = plt.subplots(figsize=(8, 6))

for event_flag, bar_color, legend_label in ((1, red, "Uncensored"), (0, blue, "Censored")):
    ax.hist(
        df.loc[df.event == event_flag, "time"].values,
        bins=interval_bounds,
        color=bar_color,
        alpha=0.5,
        lw=0,
        label=legend_label,
    )

# x axis spans the full interval grid.
ax.set_xlim(0, interval_bounds[-1])
ax.set_xlabel("Months since mastectomy")

# y axis shows fixed integer ticks for the per-bin counts.
ax.set_yticks([0, 1, 2, 3])
ax.set_ylabel("Number of observations")

ax.legend();

In [None]:
# Index of the interval in which each subject's follow-up ends.  Shifting the
# time down by 0.01 before dividing means a time lying exactly on an interval
# edge is assigned to the interval that ends at that edge, not the one that
# starts there.
shifted_time = df["time"] - 0.01
last_period = np.floor(shifted_time / interval_length).astype(int)

# death[i, j] holds subject i's `event` flag in the column of their last
# interval and 0 everywhere else.
death = np.zeros(shape=(n_patients, n_intervals))
death[patients, last_period] = df["event"]

In [None]:
# exposure[i, j] is the amount of time subject i was observed within interval j.
#
# Step 1: every interval whose lower edge is <= the subject's time is credited
# with a full `interval_length`; all later intervals get 0.  The boolean mask
# is cast to float64 explicitly: `bool * int` would yield an integer array,
# and the assignment in step 2 would then silently truncate any fractional
# exposure to a whole number.
exposure = (
    np.greater_equal.outer(df.time.values, interval_bounds[:-1]).astype(np.float64)
    * interval_length
)

# Step 2: in the subject's last interval only the time actually spent there
# (time minus that interval's lower edge) counts, so overwrite the full-width
# value written in step 1.
exposure[patients, last_period] = df.time - interval_bounds[last_period]

In [None]:
# Fixed random seed, passed to `pm.sample(random_seed=...)` below so that
# sampling is repeatable.
SEED = 644567  # from random.org


In [None]:
with pm.Model() as model:
    # One Gamma(alpha=0.01, beta=0.01) rate per time interval.
    lambda0 = pm.Gamma("lambda0", alpha=0.01, beta=0.01, shape=n_intervals)

    # Scalar coefficient on the 0/1 `metastized` indicator, with a very wide
    # zero-centred Normal prior.
    beta = pm.Normal("beta", mu=0, sigma=1000)

    # Outer product of the per-subject multiplier exp(beta * metastized) with
    # the per-interval rates gives one rate per (subject, interval) pair.
    subject_multiplier = T.exp(beta * df.metastized)
    lambda_ = pm.Deterministic("lambda_", T.outer(subject_multiplier, lambda0))

    # Poisson mean: rate scaled element-wise by the time each subject spent
    # in each interval.
    mu = pm.Deterministic("mu", exposure * lambda_)

    # Likelihood: the 0/1 `death` matrix is modelled as Poisson counts.
    obs = pm.Poisson("obs", mu=mu, observed=death)

In [None]:
# Sampler settings consumed by the `pm.sample` call below.
n_samples = 1000  # passed as the first positional argument of pm.sample
n_tune = 1000  # passed as pm.sample's `tune` argument

In [None]:
# Sample from the model defined above.  The seed is fixed for repeatability
# and the result is requested as an InferenceData object
# (return_inferencedata=True).
with model:
    trace = pm.sample(
        draws=n_samples,
        tune=n_tune,
        random_seed=SEED,
        return_inferencedata=True,
    )

In [None]:
# Display the sampling result returned by pm.sample.
trace