In [405]:
from lifelines.fitters import ParametericUnivariateFitter
from datetime import timedelta
import pandas as pd
import numpy
from autograd import numpy as np
from autograd import grad
from scipy.stats import expon, bernoulli
import datetime
from dateutil.relativedelta import relativedelta
import itertools
import sys
from autograd import numpy as anp
from autograd import value_and_grad, grad
from autograd import elementwise_grad as egrad

# Load the previously saved data

In [131]:
df = pd.read_csv('./seasonal_1000a.csv', parse_dates=['date'])
df.head(2)

Unnamed: 0,date,duration,event_occured
0,2019-04-18,0,1
1,2021-06-22,99,1


In [248]:
def get_life_intervals(duration, date):
    """
    This takes durations and dates.  It returns pandas intervals
    representing the interval over which each record lives
    """
    df = pd.DataFrame({
        'duration': list(duration),
        'end_date': list(date)
    })
    df['start_date'] = df.end_date - df.duration * pd.Timedelta(days=1)
    df['live_interval'] = [
        pd.Interval(start, end, closed='both') 
        for (start, end) in zip(df.start_date, df.end_date)
    ]
    return sorted(df.live_interval)

life_intervals = get_life_intervals(df.duration, df.date)
life_intervals[:3]

[Interval('2019-01-03', '2019-01-07', closed='both'),
 Interval('2019-01-03', '2019-04-07', closed='both'),
 Interval('2019-01-04', '2019-01-10', closed='both')]

In [249]:
def get_bin_intervals(duration, date, offset):
    """
    Takes durations dates, and a pandas time offset.
    Returns intervals of offset covering the whole dataset
    """
    df = pd.DataFrame({
        'duration': list(duration),
        'end_date': list(date)
    })
    df['start_date'] = df.end_date - df.duration * pd.Timedelta(days=1)
    
    min_date = df.start_date.min()
    max_date = df.end_date.max()
    
    min_bin_time = offset.rollback(min_date)
    max_bin_time = offset.rollforward(max_date)
    bin_dates = list(pd.date_range(min_bin_time, max_bin_time))
    ind = pd.interval_range(start=min_bin_time, end=max_bin_time, freq=offset, closed='left')
    return sorted(ind)

bin_intervals = get_bin_intervals(df.duration, df.date, pd.offsets.QuarterBegin(startingMonth=1))
bin_intervals[: 3]

[Interval('2019-01-01', '2019-04-01', closed='left'),
 Interval('2019-04-01', '2019-07-01', closed='left'),
 Interval('2019-07-01', '2019-10-01', closed='left')]

In [316]:
def compute_record(life_interval, bin_interval, row_index, col_index, time_step):
    """
    Computes the values needed for feeding into cumulative hazard functions
    """
    life_start, life_end = life_interval.left, life_interval.right
    bin_start, bin_end = bin_interval.left, bin_interval.right
    
    tau_bin_start = (bin_start - life_start) / time_step
    tau_bin_end = (bin_end - life_start) / time_step
    duration = life_interval.length / time_step
    
    
    lower_lim = np.maximum(0, tau_bin_start)
    upper_lim = np.minimum(duration, tau_bin_end)
    
    rec =  {
        'row_index': row_index,
        'col_index': col_index,
        'duration': duration,
        'tau_bin_start': tau_bin_start,
        'tau_bin_end': tau_bin_end,
        'lower_lim': lower_lim,
        'upper_lim': upper_lim
    }

    if duration == 399:
        print(rec)
        
    return rec
    
    


rec_list = []

for row_index, life_interval in enumerate(life_intervals):
    col_index = 0
    has_overlapped = False
    for bin_interval in bin_intervals:
        if life_interval.overlaps(bin_interval):
            rec = compute_record(
                life_interval, bin_interval, row_index, col_index, time_step=pd.Timedelta(days=1))
            rec_list.append(rec)
            col_index += 1
            has_overlapped = True
        else:
            if has_overlapped:
                break
            
            
dfx = pd.DataFrame(rec_list)
dfx.head()
    

{'row_index': 986, 'col_index': 0, 'duration': 399.0, 'tau_bin_start': -60.0, 'tau_bin_end': 32.0, 'lower_lim': 0.0, 'upper_lim': 32.0}
{'row_index': 986, 'col_index': 1, 'duration': 399.0, 'tau_bin_start': 32.0, 'tau_bin_end': 122.0, 'lower_lim': 32.0, 'upper_lim': 122.0}
{'row_index': 986, 'col_index': 2, 'duration': 399.0, 'tau_bin_start': 122.0, 'tau_bin_end': 213.0, 'lower_lim': 122.0, 'upper_lim': 213.0}
{'row_index': 986, 'col_index': 3, 'duration': 399.0, 'tau_bin_start': 213.0, 'tau_bin_end': 305.0, 'lower_lim': 213.0, 'upper_lim': 305.0}
{'row_index': 986, 'col_index': 4, 'duration': 399.0, 'tau_bin_start': 305.0, 'tau_bin_end': 397.0, 'lower_lim': 305.0, 'upper_lim': 397.0}
{'row_index': 986, 'col_index': 5, 'duration': 399.0, 'tau_bin_start': 397.0, 'tau_bin_end': 487.0, 'lower_lim': 397.0, 'upper_lim': 399.0}


Unnamed: 0,col_index,duration,lower_lim,row_index,tau_bin_end,tau_bin_start,upper_lim
0,0,4.0,0.0,0,88.0,-2.0,4.0
1,0,94.0,0.0,1,88.0,-2.0,88.0
2,1,94.0,88.0,1,179.0,88.0,94.0
3,0,6.0,0.0,2,87.0,-3.0,6.0
4,0,71.0,0.0,3,83.0,-7.0,71.0


In [447]:
"""
This populates upper/lower time limits needed for integrating each cumulative contribution
"""
num_rows = len(df)
num_cols = dfx.col_index.max() + 1
lower_lims = anp.zeros(shape=(num_rows, num_cols))
upper_lims = anp.zeros(shape=(num_rows, num_cols))

for rec in dfx.itertuples():
    lower_lims[rec.row_index, rec.col_index] = rec.lower_lim
    upper_lims[rec.row_index, rec.col_index] = rec.upper_lim


In [448]:
upper_lims

array([[ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [88., 94.,  0., ...,  0.,  0.,  0.],
       [ 6.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [16., 17.,  0., ...,  0.,  0.,  0.],
       [15., 30.,  0., ...,  0.,  0.,  0.],
       [ 4., 58.,  0., ...,  0.,  0.,  0.]])

In [483]:
def H(t2, t1, w):
    """
    This is the cumulative hazard function (exponential for now)
    """
    h_base = t2 - t1
    w = anp.array(w).reshape(len(w), 1)
    return h_base @ w


w = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=np.float64)
# H(upper_lims, lower_lims, w)
h = egrad(H, argnum=0)
h(upper_lims, lower_lims, w)


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [423]:
def H(t1, t2, w):
    M = anp.matrix(t2 - t1)
    w = anp.array(w)
    w = anp.matrix(w.reshape(len(w), 1))
    v = M * w
    v = np.array(v).reshape((v.shape[0] * v.shape[1],))
    print(v.shape)
    print('max', v.max())
#     return(v)
    return np.sum(v)
    



In [465]:
M = anp.array([[1, 2], [3, 4]], dtype=np.float64)
w = anp.array([[1], [2]], dtype=np.float64)
np.array(np.matrix(M) * np.matrix(w)).flatten()

array([ 5., 11.])

In [476]:
def f(M, w):
    return M @ w
    return np.array(np.matrix(M) * np.matrix(w)).flatten()

g = egrad(f, argnum=1)
g(M, w)

array([[4.],
       [6.]])

array([[1.],
       [2.]])

In [472]:
M

array([[1., 2.],
       [3., 4.]])

In [473]:
w

array([[1.],
       [2.]])

In [474]:
M*w

array([[1., 2.],
       [6., 8.]])

In [475]:
M@w

array([[ 5.],
       [11.]])