# 11.8 Lab: Survival Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines import CoxPHFitter

%matplotlib inline

In [None]:
""" 
I am also new to this topic, so let us learn these concepts together. Feedback is welcome.
Survival analysis is a set of statistical methods used to estimate the survival function of a population.
These methods arise in the analysis of a unique kind of outcome variable: the time until an event occurs.
"""

## 11.8.1 Brain Cancer Data

In [None]:
# Read the brain-cancer data; the first row of the CSV supplies the column names.
BrainCancer = pd.read_csv('data/BrainCancer.csv', header=0)
# Restrict the summary to the string-valued (object dtype) columns.
BrainCancer.describe(include=['object'])

In [None]:
# Peek at the first five rows of the raw table.
BrainCancer.head(n=5)

In [None]:
# A quick web search turned up the `lifelines` package for survival analysis in Python.
# To install it, run: %pip install lifelines

In [None]:
# Instantiate the Kaplan-Meier estimator that is fitted on the whole data set below.
kmf = KaplanMeierFitter()

In [None]:
# Estimate the survival curve from the follow-up times and the event indicators.
kmf.fit(
    durations=BrainCancer['time'],
    event_observed=BrainCancer['status'],
    label='Kaplan Meier Estimate',
)


In [None]:
# Draw the estimated survival curve.
# ci_show=True adds the confidence interval, shown as the shaded band around the curve.
kmf.plot(ci_show=True)

In [None]:
## Overlay the Kaplan-Meier curves of the two sex cohorts on one set of axes.
kmf1 = KaplanMeierFitter()

## Cohort 1: Female; Cohort 2: Male
groups = BrainCancer['sex']
T = BrainCancer['time']
E = BrainCancer['status']
i1 = groups == 'Female'
i2 = groups == 'Male'

## fit on the 1st cohort and keep the axes so the 2nd curve can share them
a1 = kmf1.fit(T[i1], E[i1], label='Female').plot()

## refit the same estimator on the 2nd cohort and draw it on the same axes
kmf1.fit(T[i2], E[i2], label='Male').plot(ax=a1)

In [None]:
# Log-rank test of the null hypothesis that the Female (i1) and Male (i2) cohorts
# share the same survival curve.
results = logrank_test(
    durations_A=T[i1],
    durations_B=T[i2],
    event_observed_A=E[i1],
    event_observed_B=E[i2],
)
results.print_summary()
""" 
The resulting p-value is 0.23, indicating no evidence of a difference in survival between the two sexes.
This can also be seen from the overlapping confidence intervals.
"""

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level of each
# so that it acts as the reference category.
df_dummy = pd.get_dummies(data=BrainCancer, drop_first=True)
df_dummy.head(n=5)


In [None]:
# Cox proportional-hazards model with sex as the only covariate: the frame is sliced
# down to the duration column, the event column and the single predictor.
cph1 = CoxPHFitter()
cph1.fit(
    df_dummy[['status', 'sex_Male', 'time']],
    duration_col='time',
    event_col='status',
)
cph1.print_summary()
""" 
I was not able to use the optional 'formula' argument of fit() because of a compatibility issue with the newer version, so instead I sliced the dataframe to contain only 3 columns ('status', 'sex_Male', 'time').
Here the p-value is 0.233. Regardless of which test we use, we see that there is no clear evidence for a difference in survival between males and females.
"""

In [None]:
# Cox proportional-hazards model using every remaining column of df_dummy as a predictor.
cph2 = CoxPHFitter()
cph2.fit(df_dummy, duration_col='time', event_col='status')
cph2.print_summary()
""" 
After adjusting for the other predictors, larger values of the Karnofsky index, ki, are associated with lower risk,
i.e. longer survival.
"""

## 11.8.2 Publication Data

In [None]:
# Read the publication data; the first row of the CSV supplies the column names.
Publication = pd.read_csv('data/Publication.csv', header=0)
Publication.head(n=5)

In [None]:
kmf1 = KaplanMeierFitter()  ## fresh estimator, refitted once per cohort below

## Two cohorts are compared. Cohort 1: posres == 0; Cohort 2: posres == 1.
## In the Publication data posres == 1 marks a study with a positive result, so the
## posres == 0 cohort is the negative-result one and the legend labels follow that coding.
groups = Publication['posres']
T = Publication.time
E = Publication.status
i1 = (groups == 0)   # studies with a negative result
i2 = (groups == 1)   # studies with a positive result

## fit the model for 1st cohort
kmf1.fit(T[i1], E[i1], label='Negative Results')
a1 = kmf1.plot()

## fit the model for 2nd cohort, drawn on the same axes for comparison
kmf1.fit(T[i2], E[i2], label='Positive Results')
kmf1.plot(ax=a1)

In [None]:
# Log-rank test comparing time to publication between the posres == 0 cohort (i1)
# and the posres == 1 cohort (i2).
results = logrank_test(
    durations_A=T[i1],
    durations_B=T[i2],
    event_observed_A=E[i1],
    event_observed_B=E[i2],
)
results.print_summary()

In [None]:
# Dummy-encode any categorical columns of the publication data, dropping the first
# level of each as the reference category.
df_dummy = pd.get_dummies(data=Publication, drop_first=True)
df_dummy.head(n=5)

In [None]:
# Cox proportional-hazards model with posres as the only predictor.
cph3 = CoxPHFitter()
cph3.fit(
    df_dummy[['status', 'posres', 'time']],
    duration_col='time',
    event_col='status',
)
cph3.print_summary()

In [None]:
# Cox proportional-hazards model for posres, adjusted for the other study characteristics.
cph4 = CoxPHFitter()
cph4.fit(
    df_dummy[['status', 'posres', 'time', 'multi', 'clinend', 'sampsize', 'budget', 'impact']],
    duration_col='time',
    event_col='status',
)
cph4.print_summary()
""" 
After we control for the other features, posres becomes an important factor (well, at least a statistically significant one).
We see that there are a number of statistically significant variables,
including whether the trial focused on a clinical endpoint (clinend), the impact of the study (impact),
and whether the study had positive or negative results (posres).
"""

## 11.8.3 Call Center Data

In [None]:
# Draw the three covariates of the simulated call-center data. The global seed is fixed
# first and the draws happen in the order Operators, Center, Time, so the values are
# reproducible from a fresh kernel.
np.random.seed(1)
N = 2000
Operators = np.random.choice(np.arange(5, 16), size=N)             # integers 5..15
Center = np.random.choice(["A", "B", "C"], size=N)                 # call-center label
Time = np.random.choice(["Morn.", "After.", "Even."], size=N)      # time-of-day label

In [None]:
# we generate a similar random data set
# Collect the simulated covariates in one frame, then dummy-encode Center and Time;
# drop_first=True drops the first level of each so it acts as the reference category.
X_pre = pd.DataFrame(dict(Operators=Operators, Center=Center, Time=Time))
X = pd.get_dummies(data=X_pre, drop_first=True)
X.head(n=5)

In [None]:
# Ground-truth coefficients, one per column of the dummy-encoded covariate frame X.
# The high-level idea of the simulation is to generate a data set from these values
# and then show that a model fit can successfully recover them.
true_coeff = np.array([0.04, -0.30, 0.0, 0.20, -0.20])

In [None]:
# this simulation code is wrong. And I will come back and fix it. 
X['y'] = 350*np.exp(np.sum(-X*true_coeff,axis=1).tolist() + np.random.normal(0, 0.005, N))
X['answered'] = np.where( X['y'] < 300 , 1, 0)
X.head()

In [None]:
# use Cox Proportional Hazards model
cph5 = CoxPHFitter()   
cph5.fit(X, 'y', event_col='answered')   
cph5.print_summary()
""" 
Since the simulation is wrong, so the summary is not correct. 
But at least we can see the p-values for Operatator, Center = B, Time = Even. and Time = Morn are very small, 
and they are directly related to the ground truth coefficients.
"""

In [None]:
# End of Chapter 11