In [None]:
# Install necessary packages
!pip install lifelines pandas matplotlib seaborn

# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test

# Step 2: Generate Synthetic Cancer Data
# Seed the global NumPy RNG so every run produces the same synthetic cohort.
# The draws below share that global state, so their order must not change.
np.random.seed(42)

n = 200  # cohort size

# Durations in months, exponentially distributed with a mean of 12.
survival_times = np.random.exponential(scale=12, size=n)
# Event flag drawn as Bernoulli(0.7): 1 = death observed, 0 = not observed.
event_observed = np.random.binomial(1, 0.7, size=n)

# Covariates: integer age in [30, 80) and two categorical labels, each level
# picked with equal probability.
treatment_options = ['Chemotherapy', 'Radiation', 'Surgery']
stage_options = ['Stage I', 'Stage II', 'Stage III', 'Stage IV']
age = np.random.randint(30, 80, size=n)
treatment = np.random.choice(treatment_options, size=n)
cancer_stage = np.random.choice(stage_options, size=n)

# Assemble one row per patient.
data = pd.DataFrame(dict(
    survival_time=survival_times,
    event_observed=event_observed,
    age=age,
    treatment=treatment,
    cancer_stage=cancer_stage,
))

# Preview the first few rows of the dataset.
print("Sample Cancer Data:", data.head(), sep="\n")

# Step 3: Perform Kaplan-Meier Survival Analysis
kmf = KaplanMeierFitter()

# Fit a single curve on the whole cohort and draw it on a fresh figure.
plt.figure(figsize=(10, 6))
kmf.fit(
    data['survival_time'],
    event_observed=data['event_observed'],
    label='Overall Survival',
)
# lifelines returns the axes it drew on; label that axes object directly.
overall_ax = kmf.plot_survival_function()
overall_ax.set_title('Kaplan-Meier Survival Curve (Overall)')
overall_ax.set_xlabel('Time (Months)')
overall_ax.set_ylabel('Survival Probability')
plt.show()

# Step 4: Kaplan-Meier by Cancer Stage
# Draw one survival curve per cancer stage on a shared axes so the stages can
# be compared visually.
fig, ax = plt.subplots(figsize=(10, 6))
# groupby filters each stage once and iterates the keys in sorted order, which
# gives a stable legend order ("Stage I" .. "Stage IV") instead of the
# order-of-first-appearance that unique() returns.
for stage, stage_rows in data.groupby('cancer_stage'):
    kmf.fit(
        stage_rows['survival_time'],
        event_observed=stage_rows['event_observed'],
        # The column values already read "Stage I", "Stage II", ...; adding a
        # "Stage " prefix would produce legend entries like "Stage Stage I".
        label=stage,
    )
    # Pass the axes explicitly so every curve lands on this figure.
    kmf.plot_survival_function(ax=ax)
ax.set_title('Kaplan-Meier Survival Curves by Cancer Stage')
ax.set_xlabel('Time (Months)')
ax.set_ylabel('Survival Probability')
ax.legend()
plt.show()

# Step 5: Statistical Test - Log-rank Test between Stages
# Select the two groups once, then compare their survival experience.
stage_i = data[data['cancer_stage'] == 'Stage I']
stage_iv = data[data['cancer_stage'] == 'Stage IV']

result = logrank_test(
    stage_i['survival_time'],
    stage_iv['survival_time'],
    event_observed_A=stage_i['event_observed'],
    event_observed_B=stage_iv['event_observed'],
)
print(f"Log-rank Test p-value between Stage I and Stage IV: {result.p_value:.4f}")

# Step 6: Cox Proportional Hazards Model
# One-hot encode the categorical covariates. drop_first=True omits one level
# of each variable, so the remaining indicator columns are read relative to
# that omitted level.
categorical_columns = ['treatment', 'cancer_stage']
data_encoded = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)

# Fit the model: every column other than the duration and event columns is
# used as a covariate.
cph = CoxPHFitter()
cph.fit(
    data_encoded,
    duration_col='survival_time',
    event_col='event_observed',
)

# Report the fitted coefficients.
print("\nCox Proportional Hazards Model Summary:")
cph.print_summary()

# Step 7: Visualize the Hazard Ratios
# Plot the fitted covariate effects on the hazard-ratio scale and title the
# axes that the plot call returns.
plt.figure(figsize=(8, 6))
hazard_ax = cph.plot(hazard_ratios=True)
hazard_ax.set_title('Hazard Ratios from Cox Proportional Hazards Model')
plt.show()
