In [1]:
# from lifelines.fitters import ParametericUnivariateFitter
from datetime import timedelta
import pandas as pd
import numpy as np
# from autograd import numpy as np
# from autograd import grad, value_and_grad
from scipy.stats import bernoulli
# import fleming
import datetime
from scipy.optimize import fmin


In [8]:
class cached_property(object):
    """
    Descriptor that evaluates a method once per instance and then serves the
    stored result.

    The first access calls the wrapped function and writes its return value
    into the instance ``__dict__`` under the function's own name, so later
    attribute lookups find the plain value and never reach this descriptor.

    Based on Django's cached property from
    https://github.com/django/django/blob/2456ffa42c33d63b54579eae0f5b9cf2a8cd3714/django/utils/functional.py#L38-50
    """
    def __init__(self, func):
        # The undecorated method; its __name__ doubles as the cache key
        self.func = func

    def __get__(self, instance, type=None):
        if instance is not None:
            # Compute once, then park the value on the instance itself
            value = self.func(instance)
            instance.__dict__[self.func.__name__] = value
            return value

        # Looked up on the class rather than an instance: expose the descriptor
        return self


In [9]:
class Data:
    """
    This class provides a (really inefficient) way of generating seasonal
    survival data.

    Every simulated sample gets a random start date and is then stepped
    forward one day at a time. On each day it is first tested for censoring
    (probability 1/60) and then for the event (probability 1/tau, where tau is
    the time constant configured for that day's calendar quarter).
    """
    def __init__(self, q1_tau=10, q2_tau=20, q3_tau=30, q4_tau=40):
        """
        Instantiate with the exponential time constants (in days) for each
        quarter's hazard rate
        """
        # Save the exponential time constants for each quarter
        self.tau_for_quarter = {
            1: q1_tau,
            2: q2_tau,
            3: q3_tau,
            4: q4_tau,
        }

    @cached_property
    def _df_hazard(self):
        """
        This is just a lookup table that registers the exponential time constant
        for the hazard rate of each day.

        Returns a frame with one row per day, columns `dates` and `tau`,
        indexed by date.
        """
        # Create a dataframe with daily rows from 2019-01-01 through 2029-12-31
        # (11 calendar years)
        df = pd.DataFrame({'dates': pd.date_range('1/1/2019', '12/31/2029')})

        # Populate the time constants for each day from the lookup dict
        df['tau'] = df.dates.dt.quarter.map(self.tau_for_quarter)

        # Index on date so that accessing this dataframe as a lookup table is fast
        df.index = pd.Index(df.dates)
        df.index.name = None

        # Return the frame
        return df

    def initialize(self):
        """
        This initializes to empty data
        """
        self.records = []

    def _generate_events(self, num_samples):
        """
        Simulate `num_samples` records and store them in `self.records`.

        Any previously stored records are discarded first. Exactly one record
        is produced per sample: an event, a random censoring, or (if the
        sample outlives the hazard table) a censoring at the point where the
        simulation has to stop.
        """
        # Define Min/Max dates that will fall well within the time bounds of _df_hazard
        min_date = pd.Timestamp('1/1/2019')
        max_date = pd.Timestamp('1/1/2026')

        # Will use this for generating random dates
        max_days = (max_date - min_date).days

        # This is the latest date of _df_hazard which sets loop limits below
        hazard_table = self._df_hazard
        latest_date = hazard_table.dates.iloc[-1]

        # I will model censoring as an exponential process as well with this time constant
        censor_tau = 60
        # Float division on purpose, consistent with event_prob below
        censor_prob = 1. / censor_tau

        # The censoring probability never changes, so freeze its distribution
        # once instead of rebuilding it for every simulated day
        censor_draw = bernoulli(censor_prob)
        one_day = datetime.timedelta(days=1)

        # Make sure the data list is initialized to empty
        self.initialize()

        # Generate num_samples records.
        for _ in range(num_samples):
            # Randomly pull a number of days from the valid range
            days = np.random.randint(0, max_days)

            # Start observing this sample at the randomly generated start date
            start_date = pd.Timestamp(min_date + datetime.timedelta(days=days))
            date = start_date

            # Now just step forward in time for every valid day in the simulation
            while date < latest_date:
                # Lookup the exponential rate from the hazard table
                tau = hazard_table.loc[date, 'tau']

                # Use the rate to compute an event probability for this day
                event_prob = 1. / tau

                # Randomly draw to see if this event should be censored today
                if censor_draw.rvs():
                    # If the event is censored, handle that and move on to the next record
                    self.handle_event(start_date, date, 0)
                    break

                # Randomly draw to see if the event occurred today
                if bernoulli(event_prob).rvs():
                    # If the event occurred, handle that and move on to the next record
                    self.handle_event(start_date, date, 1)
                    break

                # If no censoring and no event, try again tomorrow.
                date += one_day
            else:
                # The loop ran out of hazard-table days without an event or a
                # censoring draw. Record the sample as censored where the
                # simulation stopped instead of silently dropping it, so the
                # caller always gets one record per requested sample.
                self.handle_event(start_date, date, 0)

    def handle_event(self, start_date, date, event_occured):
        """
        Append one record to `self.records`.

        `date` is the day observation ended, `event_occured` is 1 for an
        event and 0 for censoring; the stored duration is the whole number of
        days between `start_date` and `date`.
        """
        # Compute the survival time for this record
        duration = (date - start_date).days

        # Create a record
        rec = {
            'date': date,
            'duration': duration,
            'event_occured': event_occured
        }

        # Add the record to the record list
        self.records.append(rec)

    def get_frame(self, num_samples=10):
        """
        Simulate `num_samples` fresh records and return them as a dataframe
        with the columns `date`, `duration` and `event_occured`.
        """
        self._generate_events(num_samples)

        # Name the columns explicitly so that an empty simulation still yields
        # a frame with the expected schema
        return pd.DataFrame(self.records, columns=['date', 'duration', 'event_occured'])
                    

In [10]:
class Seasonal:
    """
    Maximum-likelihood fit of a hazard rate that is constant within each
    calendar quarter.

    The model has four parameters, one exponential time constant per quarter
    (q1..q4). The hazard on a given day is 1 / (time constant of that day's
    quarter).
    """
    _fitted_parameter_names = ['q1', 'q2', 'q3', 'q4']

    def __init__(self, durations, dates, observed):
        """
        :param durations: length of each record in days
        :param dates: the date on which each record ended
        :param observed: per-record event indicator (1 = event observed,
            0 = censored)
        """
        self.durations = durations
        self.dates = dates
        self.observed = observed

        # Lazily filled cache: the per-record day counts do not depend on the
        # parameters, so they are computed once and reused by every
        # likelihood evaluation
        self._days_in_quarter = None

    def _get_days_in_quarter(self, durations, dates):
        """
        For every record, count how many of its days fall into each quarter.

        The day range runs from (end date - duration) through the end date,
        both inclusive. Returns a list with one `{quarter: number_of_days}`
        dict per record; quarters a record never touches are absent.
        """
        # Normalise to timestamps so strings / date objects work as well
        dates = pd.to_datetime(pd.Series(dates))
        days_in_quarter = []
        for duration, end_date in zip(durations, dates):
            # pd.Timedelta (unlike datetime.timedelta) also accepts numpy integers
            start_date = end_date - pd.Timedelta(days=duration)
            quarters = pd.Series(pd.date_range(start_date, end_date).quarter)
            days_in_quarter.append(quarters.value_counts().to_dict())
        return days_in_quarter

    def _hazard(self, params, day):
        """
        Hazard rate on `day`: the reciprocal of the time constant belonging to
        that day's quarter (`params` is ordered q1, q2, q3, q4).
        """
        day = pd.Timestamp(day)
        return 1. / params[day.quarter - 1]

    def _cumulative_hazard(self, params):
        """
        Cumulative hazard of every record: the number of days spent in each
        quarter divided by that quarter's time constant, summed over quarters.
        Returns a float64 array with one entry per record.
        """
        cum_hazards = []
        if self._days_in_quarter is None:
            self._days_in_quarter = self._get_days_in_quarter(self.durations, self.dates)

        for diq in self._days_in_quarter:
            cum_haz = 0.
            for quarter, days_in_quarter in diq.items():
                cum_haz += days_in_quarter / params[quarter - 1]
            cum_hazards.append(cum_haz)

        return np.array(cum_hazards, dtype=np.float64)

    def _log_likelihood(self, params):
        """
        Objective for the optimiser. Despite the name this returns the
        NEGATIVE log-likelihood, so that minimising it maximises the
        likelihood.
        """
        cum_hazards = self._cumulative_hazard(params)
        hazards = np.array([self._hazard(params, day) for day in self.dates])

        # Observed events contribute log(hazard on the end date); every record
        # contributes minus its cumulative hazard
        d_log_likelihood = self.observed * np.log(hazards) - cum_hazards
        return - np.sum(d_log_likelihood)

    def fit(self):
        """
        Estimate the four quarterly time constants with scipy's `fmin`,
        starting from 1 for every quarter.

        Returns the array of fitted parameters in the order q1, q2, q3, q4.
        """
        params = np.array([1., 1., 1., 1.])
        # Optimise this instance's own likelihood (not whatever object happens
        # to be bound to a global variable)
        res = fmin(func=self._log_likelihood, x0=params, maxiter=1000)
        return res

        
    

# Leftover scratch line: the starting point that Seasonal.fit() builds internally.
# params = np.array([1., 1., 1., 1.])


# Simulate 800 survival records using the default quarterly time constants
# of Data (10, 20, 30 and 40).
# NOTE(review): no random seed is set in the visible cells, so each run draws
# a different sample and the fitted values below will differ from run to run.
data = Data()
df = data.get_frame(800)

# Fit one time constant per quarter to the simulated records and show the
# result; the printed array is ordered q1, q2, q3, q4.
seasonal = Seasonal(df.duration, df.date, df.event_occured)
res = seasonal.fit()
# seasonal._log_likelihood(params)
print(res)

# Leftover scratch code: the same fmin call that Seasonal.fit() performs.
# res = fmin(func=seasonal._log_likelihood, x0=params, maxiter=1000)
# res
    


Optimization terminated successfully.
         Current function value: 2321.023225
         Iterations: 272
         Function evaluations: 462
[ 9.85853048 23.7409976  31.61905272 40.03701424]
