### We need to install the lifelines library 

In [None]:
# Install the library
!pip install lifelines


## Q1: Assume that the dataset contains information on survival times and a binary variable indicating whether an event of interest occurred (1) or not (0).

   Patient      Survival Time (months)      Event (1=Yes, 0=No)
  
        1                      12                     1
        2                       9                     1
        3                      15                     0
        4                       6                     1
        5                      18                     0
        6                       8                     1
        7                      10                     1
        8                      14                     0
        9                       7                     1
       10                      11                     1

### (i) Calculate the Kaplan-Meier estimate of the survival function:

In [19]:
from lifelines import KaplanMeierFitter
import pandas as pd

# Create a DataFrame with the provided data
data = pd.DataFrame({
    'Survival Time (months)': [12, 9, 15, 6, 18, 8, 10, 14, 7, 11],
    'Event': [1, 1, 0, 1, 0, 1, 1, 0, 1, 1]
})

# Fit Kaplan-Meier estimator
kmf = KaplanMeierFitter()
kmf.fit(data['Survival Time (months)'], event_observed=data['Event'])

# Print Kaplan-Meier estimate
print(kmf.survival_function_)


          KM_estimate
timeline             
0.0               1.0
6.0               0.9
7.0               0.8
8.0               0.7
9.0               0.6
10.0              0.5
11.0              0.4
12.0              0.3
14.0              0.3
15.0              0.3
18.0              0.3


### (ii) Determine the median survival time based on the Kaplan-Meier estimate:

In [20]:
# Calculate median survival time
median_survival_time = kmf.median_survival_time_
print("Median survival time:", median_survival_time)


Median survival time: 10.0


### (iii) Perform the log-rank test to compare survival curves for patients with and without events:

In [21]:
from lifelines.statistics import logrank_test

# Separate data based on Event
event_1 = data[data['Event'] == 1]['Survival Time (months)']
event_0 = data[data['Event'] == 0]['Survival Time (months)']

# Perform log-rank test
results = logrank_test(event_1, event_0)
print(results)


<lifelines.StatisticalResult: logrank_test>
               t_0 = -1
 null_distribution = chi squared
degrees_of_freedom = 1
         test_name = logrank_test

---
 test_statistic    p  -log2(p)
           6.80 0.01      6.78


### (iv) Fit a Cox Proportional-Hazards model with the covariate 'Event' as the only predictor:

In [22]:
from lifelines import CoxPHFitter

# Fit Cox Proportional-Hazards model
cph = CoxPHFitter()
cph.fit(data, duration_col='Survival Time (months)', event_col='Event')

# Print hazard ratios
print(cph.summary)


Empty DataFrame
Columns: [coef, exp(coef), se(coef), coef lower 95%, coef upper 95%, exp(coef) lower 95%, exp(coef) upper 95%, cmp to, z, p, -log2(p)]
Index: []


### (v) Calculate the 95% confidence interval for the hazard ratio:

In [23]:
# Get confidence intervals for hazard ratios
conf_intervals = cph.confidence_intervals_
print(conf_intervals)


Empty DataFrame
Columns: [95% lower-bound, 95% upper-bound]
Index: []


## Q2: Assume that the dataset contains information on survival times, a binary variable indicating whether an event of interest occurred (1) or not (0), and a categorical variable representing different treatment groups.

  

### (i). Identify the tied observations in the dataset. Explain how tied observations impact the calculation of the Kaplan-Meier estimate for the survival function.

In [24]:
import pandas as pd

# Create the dataset
data = {
    'Patient': list(range(1, 13)),
    'Survival Time': [12, 9, 15, 6, 18, 8, 10, 14, 7, 11, 10, 14],
    'Event': [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0],
    'Treatment Group': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']
}

df = pd.DataFrame(data)

# Identify tied observations
tied_obs = df[df.duplicated(subset=['Survival Time', 'Event'], keep=False)]
print("Tied Observations:")
print(tied_obs)


Tied Observations:
    Patient  Survival Time  Event Treatment Group
6         7             10      1               A
7         8             14      0               B
10       11             10      1               A
11       12             14      0               B


### (ii). Calculate the Kaplan-Meier estimate of the survival function, considering the tied observations. Show the step-by-step calculations and the product-limit formula for each time point.

In [25]:
from lifelines import KaplanMeierFitter

# Fit Kaplan-Meier estimator
kmf = KaplanMeierFitter()
kmf.fit(durations=df['Survival Time'], event_observed=df['Event'])

# Print Kaplan-Meier estimate
print("Kaplan-Meier Estimate:")
print(kmf.survival_function_)


Kaplan-Meier Estimate:
          KM_estimate
timeline             
0.0          1.000000
6.0          0.916667
7.0          0.833333
8.0          0.750000
9.0          0.666667
10.0         0.500000
11.0         0.416667
12.0         0.333333
14.0         0.333333
15.0         0.333333
18.0         0.333333


### (iii). Determine the median survival time based on the Kaplan-Meier estimate, considering the tied observations. Discuss any adjustments made for tied data.

In [26]:
# Calculate median survival time
median_survival_time = kmf.median_survival_time_
print("Median Survival Time:", median_survival_time)


Median Survival Time: 10.0


### (iv). Perform the log-rank test to compare the survival curves. Take into account the tied observations, and show the relevant calculations. Interpret the results of the test.

In [35]:
from lifelines.statistics import logrank_test

# Splitting the dataset by treatment group
treatment_A = (df['Treatment Group'] == 'A')
treatment_B = (df['Treatment Group'] == 'B')

# Perform log-rank test
results = logrank_test(df[treatment_A]['Survival Time'], df[treatment_B]['Survival Time'], 
                       df[treatment_A]['Event'], df[treatment_B]['Event'])

# Print summary
print("Log-Rank Test Summary:")
print(results)


Log-Rank Test Summary:
<lifelines.StatisticalResult: logrank_test>
               t_0 = -1
 null_distribution = chi squared
degrees_of_freedom = 1
         test_name = logrank_test

---
 test_statistic    p  -log2(p)
           0.11 0.74      0.43


### (v) Fitting a Cox Proportional-Hazards model:

In [37]:
from lifelines import CoxPHFitter

# Perform one-hot encoding for 'Treatment Group'
df_encoded = pd.get_dummies(df, columns=['Treatment Group'], drop_first=True)

# Fit Cox Proportional-Hazards model
cph = CoxPHFitter()
cph.fit(df_encoded, duration_col='Survival Time', event_col='Event')

# Print model summary
print("Cox Proportional-Hazards Model Summary:")
print(cph.summary)


Cox Proportional-Hazards Model Summary:
                       coef  exp(coef)  se(coef)  coef lower 95%  \
covariate                                                          
Patient           -0.052603   0.948757  0.113171       -0.274413   
Treatment Group_B  0.409812   1.506535  0.844276       -1.244939   

                   coef upper 95%  exp(coef) lower 95%  exp(coef) upper 95%  \
covariate                                                                     
Patient                  0.169208             0.760018             1.184367   
Treatment Group_B        2.064563             0.287959             7.881851   

                   cmp to         z         p  -log2(p)  
covariate                                                
Patient               0.0 -0.464806  0.642070  0.639197  
Treatment Group_B     0.0  0.485401  0.627392  0.672561  


### (vi) Applying the Efron approximation:

In [38]:
# Fit Kaplan-Meier estimator using Efron approximation
kmf_efron = KaplanMeierFitter()
kmf_efron.fit(durations=df['Survival Time'], event_observed=df['Event'], 
              weights=df['Treatment Group'].map({'A': 1, 'B': 2}))

# Print Kaplan-Meier estimate with Efron approximation
print("Kaplan-Meier Estimate (Efron Approximation):")
print(kmf_efron.survival_function_)


Kaplan-Meier Estimate (Efron Approximation):
          KM_estimate
timeline             
0.0          1.000000
6.0          0.888889
7.0          0.833333
8.0          0.722222
9.0          0.611111
10.0         0.500000
11.0         0.388889
12.0         0.333333
14.0         0.333333
15.0         0.333333
18.0         0.333333


### (vii) Comparing results from Breslow method and Efron approximation:

In [39]:
# Print Kaplan-Meier estimate with Breslow method for comparison
print("Kaplan-Meier Estimate (Breslow Method):")
print(kmf.survival_function_)


Kaplan-Meier Estimate (Breslow Method):
          KM_estimate
timeline             
0.0          1.000000
6.0          0.916667
7.0          0.833333
8.0          0.750000
9.0          0.666667
10.0         0.500000
11.0         0.416667
12.0         0.333333
14.0         0.333333
15.0         0.333333
18.0         0.333333
