# EpiRust Survival Analysis Demo

This notebook demonstrates EpiRust's survival analysis capabilities, including:

1. Kaplan-Meier estimation
2. Cox proportional hazards modeling
3. Time-dependent covariates
4. Competing risks analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from epirust.survival import KaplanMeier, CoxPH

# Set random seed and plotting style
np.random.seed(42)
# The bare 'seaborn' style sheet was deprecated in Matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed, so prefer the new name when this
# installation provides it and fall back to the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

## Generate Survival Data

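Let's create a dataset simulating a two-arm clinical trial with time-to-event data. Event times are exponential with a hazard that depends on treatment, age, and sex, and follow-up is subject to independent random censoring: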

In [None]:
def generate_survival_data(n_patients=500):
    """Simulate right-censored survival data for a two-arm trial.

    Covariates are drawn first (age ~ Normal(60, 10), sex and treatment
    ~ Bernoulli(0.5)). Event times are exponential with a log-linear hazard
    in treatment, age (centred at 60, per 10 years) and sex, and censoring
    times are exponential with mean 10.

    Uses NumPy's global random state, so results are reproducible only
    relative to the current seed.

    Args:
        n_patients: Number of simulated patients.

    Returns:
        A DataFrame with columns ``time`` (observed follow-up), ``event``
        (1 if the event was observed, 0 if censored), ``treatment``,
        ``age`` and ``sex``.
    """
    baseline_hazard = 0.1
    coef_treatment, coef_age, coef_sex = -0.5, 0.02, 0.3

    # The draw order (age, sex, treatment) fixes the random stream.
    age = np.random.normal(60, 10, n_patients)
    sex = np.random.binomial(1, 0.5, n_patients)
    treatment = np.random.binomial(1, 0.5, n_patients)

    # Log-hazard per patient; age enters centred at 60 and scaled per decade.
    linear_predictor = (
        np.log(baseline_hazard)
        + coef_treatment * treatment
        + coef_age * (age - 60) / 10
        + coef_sex * sex
    )

    # An exponential with rate h has mean 1 / h.
    mean_event_time = 1 / np.exp(linear_predictor)
    event_time = np.random.exponential(mean_event_time)
    censor_time = np.random.exponential(10, n_patients)

    # The event is observed only when it happens no later than censoring.
    event_observed = event_time <= censor_time

    columns = {
        'time': np.minimum(event_time, censor_time),
        'event': event_observed.astype(int),
        'treatment': treatment,
        'age': age,
        'sex': sex,
    }
    return pd.DataFrame(columns)

# Generate data: simulate the default cohort of 500 patients. `df` is the
# notebook-wide dataset used by the Kaplan-Meier and Cox cells.
df = generate_survival_data()
print("Data summary:")
# Per-column descriptive statistics
print(df.describe())

## Kaplan-Meier Analysis

Let's estimate and compare survival curves between treatment groups:

In [None]:
# A single estimator instance drives the fits, the plot and the log-rank test
km = KaplanMeier()

# Split the cohort by arm once, then fit the treatment arm followed by control
treated_df = df[df['treatment'] == 1]
control_df = df[df['treatment'] == 0]
# NOTE(review): both fits go through the same `km` object; this assumes each
# fit() call returns an independent result -- confirm against the EpiRust API.
km_treat = km.fit(treated_df['time'], treated_df['event'])
km_control = km.fit(control_df['time'], control_df['event'])

# Draw both curves, with confidence intervals, on one figure
plt.figure(figsize=(10, 6))
km.plot_survival_curves([km_treat, km_control],
                        labels=['Treatment', 'Control'],
                        show_ci=True)
plt.xlabel('Time')
plt.ylabel('Survival Probability')
plt.title('Kaplan-Meier Survival Curves by Treatment Group')
plt.grid(True, alpha=0.3)
plt.show()

# Compare the two arms with a log-rank test and report its p-value
logrank_p = km.logrank_test(km_treat, km_control)
print(f"Log-rank test p-value: {logrank_p:.4f}")

## Cox Proportional Hazards Model

Now let's fit a Cox model adjusting for covariates:

In [None]:
# Fit Cox model on the full cohort, adjusting for treatment, age and sex
cox = CoxPH()
covariates = ['treatment', 'age', 'sex']
model = cox.fit(df['time'], df['event'], df[covariates])

# Print model results
print("Cox Model Results:")
print(model.summary())

# Test proportional hazards assumption
# NOTE(review): the test is called on the estimator `cox`, not on the fitted
# `model`; presumably it uses the most recent fit -- confirm against the API.
ph_test = cox.test_proportional_hazards()
print("\nProportional Hazards Test:")
print(ph_test)

## Time-Dependent Covariates

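Let's analyze a scenario with a time-varying covariate: a biomarker measured every 0.5 time units for as long as each patient remains under observation: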

In [None]:
def generate_time_dependent_data(n_patients=500):
    """Simulate baseline survival data plus repeated biomarker measurements.

    Each patient is measured on a fixed grid (0, 0.5, ..., 9.5) at every
    grid point strictly before their observed time. Biomarker values follow
    a per-patient baseline (Normal(100, 20)) plus a linear trend (slope 0.1
    for treated patients, 0.2 otherwise) plus Normal(0, 5) noise. Patients
    with no grid point before their observed time contribute no rows.

    Uses NumPy's global random state.

    Args:
        n_patients: Number of simulated patients.

    Returns:
        A tuple ``(base_df, biomarker_df)``: the frame returned by
        ``generate_survival_data`` and a long-format frame with columns
        ``id`` (row index label of the patient in ``base_df``), ``time``
        and ``biomarker``.
    """
    base_df = generate_survival_data(n_patients)

    schedule = np.arange(0, 10, 0.5)
    records = []

    for patient in base_df.itertuples():
        # Keep only the visits that happen while the patient is under observation.
        visits = schedule[schedule < patient.time]
        if visits.size == 0:
            continue

        # The draw order (baseline, then noise) fixes the random stream.
        start_level = np.random.normal(100, 20)
        slope = 0.1 if patient.treatment == 1 else 0.2
        jitter = np.random.normal(0, 5, len(visits))
        readings = start_level + slope * visits + jitter

        records.extend(
            {'id': patient.Index, 'time': visit, 'biomarker': reading}
            for visit, reading in zip(visits, readings)
        )

    return base_df, pd.DataFrame(records)

# Generate time-dependent data: baseline frame plus long-format biomarker rows
base_df, biomarker_df = generate_time_dependent_data()

# Fit time-dependent Cox model
# NOTE(review): `biomarker_df` has an 'id' column (the row index label of each
# patient), but `base_df` has no 'id' column -- confirm that
# fit_time_dependent matches ids against the base frame's index.
# NOTE(review): 'time' means follow-up time in `base_df` but measurement time
# in `biomarker_df`; verify how `time_col` is interpreted for each frame.
td_cox = cox.fit_time_dependent(
    base_df,
    biomarker_df,
    id_col='id',
    time_col='time',
    event_col='event',
    time_varying_covariates=['biomarker']
)

print("Time-Dependent Cox Model Results:")
print(td_cox.summary())

## Competing Risks Analysis

Let's analyze data with competing events:

In [None]:
def generate_competing_risks_data(n_patients=500):
    """Simulate survival data with a competing event.

    Starts from ``generate_survival_data`` and adds an independent
    exponential competing-event time (mean 15). Each patient's final status
    is whichever of primary event, censoring or competing event comes first.

    Uses NumPy's global random state.

    Args:
        n_patients: Number of simulated patients.

    Returns:
        The base DataFrame with these columns added or updated:
        ``competing_time`` (latent competing-event time), ``event_type``
        (0 = censored, 1 = primary event, 2 = competing event), ``event``
        (1 only when the primary event was observed) and ``time`` (time of
        the first of the three outcomes).
    """
    df = generate_survival_data(n_patients)

    # Generate competing event times
    competing_times = np.random.exponential(15, n_patients)
    df['competing_time'] = competing_times

    # The base follow-up ended (event or censoring) before the competing event.
    base_first = df['time'] <= competing_times
    primary_observed = base_first & (df['event'] == 1)

    # A patient censored before the competing event is censored (0), not a
    # primary event. Labelling them 1 would inflate the primary-event
    # cumulative incidence.
    df['event_type'] = np.where(
        primary_observed,
        1,  # Primary event
        np.where(
            base_first,
            0,  # Censored before either event
            2   # Competing event
        )
    )
    # Keep the primary-event indicator consistent with the final status: it
    # no longer counts once a competing event has pre-empted it.
    df['event'] = primary_observed.astype(int)
    df['time'] = np.minimum(df['time'], competing_times)

    return df

# Generate competing risks data
cr_df = generate_competing_risks_data()

# Calculate cumulative incidence functions for event types 1 (primary) and
# 2 (competing), using 'time' and 'event_type' as the time and status columns
cif = cox.competing_risks_cif(cr_df, 'time', 'event_type', [1, 2])

# Plot cumulative incidence: `cif` is iterated as a mapping from event type to
# a (times, incidence) pair, drawn as one step curve per event type
plt.figure(figsize=(10, 6))
for event_type, (times, inc) in cif.items():
    plt.step(times, inc, label=f'Event {event_type}')

plt.xlabel('Time')
plt.ylabel('Cumulative Incidence')
plt.title('Cumulative Incidence Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Advanced Survival Metrics

Let's calculate some advanced survival metrics: restricted mean survival time, conditional survival, and time-dependent AUC:

In [None]:
# Calculate restricted mean survival time
# NOTE(review): `km` was last fitted on the control arm; if the estimator keeps
# state from its latest fit, the two KM metrics below describe the control
# group only -- confirm which fit these methods use.
rmst = km.restricted_mean_survival_time(tau=5.0)
print(f"Restricted Mean Survival Time (τ=5): {rmst:.2f}")

# Calculate conditional survival probabilities: survival past t=5 given
# survival past t=2
cond_surv = km.conditional_survival(2.0, 5.0)
print(f"Conditional Survival P(T>5|T>2): {cond_surv:.2f}")

# Calculate time-dependent AUC
# NOTE(review): the most recent call on `cox` was fit_time_dependent, so the
# AUC may refer to that model rather than the baseline Cox fit -- confirm.
td_auc = cox.time_dependent_auc(times=[1, 2, 3, 4, 5])
print("\nTime-dependent AUC:")
# `td_auc` is iterated as a mapping from evaluation time to AUC value
for t, auc in td_auc.items():
    print(f"t={t}: {auc:.3f}")

## Conclusion

This notebook demonstrated EpiRust's comprehensive survival analysis capabilities:

1. Kaplan-Meier estimation with confidence intervals
2. Cox proportional hazards modeling
3. Time-dependent covariate analysis
4. Competing risks analysis
5. Advanced survival metrics

These tools provide a robust framework for analyzing time-to-event data in epidemiological studies.