# Assignment 4 Survival Analysis
Applications of Kaplan-Meier survival curves, Cox Proportional Hazards regression, and Random Survival Forests (RSF).

Using a clinical dataset from The Cancer Imaging Archive, which contains survival time, event indicators, and covariates (age, treatment type, and tumor stage).

In [None]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from datetime import datetime
from sksurv.util import Surv

# Kaplan-Meier
from lifelines import KaplanMeierFitter

# Cox
from lifelines import CoxPHFitter
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored

# Random Survival Forest
from sksurv.datasets import load_whas500
from sksurv.ensemble import RandomSurvivalForest
from sksurv.preprocessing import OneHotEncoder
from sklearn.inspection import permutation_importance


In [None]:
# Load data
# Read a single sheet (the default, i.e. the first one) so the result is a
# DataFrame. Passing sheet_name=None would instead return a dict of DataFrames
# keyed by sheet name, which has no .head()/.info().
# NOTE(review): assumes the clinical table is the first sheet - confirm.
cancersurviv = pd.read_excel('RADCURE_Clinical_v04_20241219.xlsx')
# if it doesn't work add engine = 'openpyxl'

# Quick look at the raw table. head() is printed explicitly because a notebook
# only auto-displays the last expression of a cell; info() prints on its own.
print(cancersurviv.head())
cancersurviv.info()

# Preprocessing
# Work on a copy so the raw table loaded above stays untouched.
clinical_data = cancersurviv.copy()

# Convert date columns to datetime format
# NOTE(review): the column names used below are taken as-is from the original
# notebook - verify them against the info() output above.
for date_column in ("Date of Last Known Alive", "CT Date"):
    clinical_data[date_column] = pd.to_datetime(
        clinical_data[date_column], format="%m/%d/%Y"
    )

# Calculate follow-up time in days: days from the CT scan to the last date the
# patient was known alive, minus the CT-to-surgery gap.
clinical_data["time"] = (
    (clinical_data["Date of Last Known Alive"] - clinical_data["CT Date"]).dt.days
    - clinical_data['Days between CT and surgery']
)

# Boolean event indicator: True means the event (death) has been observed,
# False means the observation is censored (patient alive at last contact).
clinical_data["event"] = clinical_data["Survival Status"] == "Death"

# Convert to sksurv structured array format
survival_data = Surv.from_dataframe("event", "time", clinical_data)


### Kaplan-Meier Analysis 
- Generate Kaplan-Meier survival curves for at least two distinct groups (e.g., treatment type, age group, or tumor stage), ensuring each group has its own plot.
- For each plot, conduct a log-rank test to compare survival differences between the groups.

In [None]:
# Generate survival curves

# Overall Kaplan-Meier curve for the whole cohort, built from the "time"
# (follow-up in days) and "event" (True = death observed) columns that the
# preprocessing cell adds to clinical_data.
# Rows without a follow-up time are dropped first, since a missing duration
# cannot be placed on the survival curve.
km_data = clinical_data[["time", "event"]].dropna()

# Fit the Kaplan-Meier estimator
kmf = KaplanMeierFitter()
kmf.fit(km_data["time"], event_observed=km_data["event"])

# Plot the Kaplan-Meier curve
kmf.plot_survival_function()
plt.title('Kaplan-Meier Curve')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.show()

In [None]:
# Log-rank test

### Cox Proportional Hazards Regression
- Perform a Cox regression analysis, including at least three covariates
- Validate the proportional hazards assumption

In [None]:
# Cox regression analysis

# Prepare the data for sksurv
# X = data.drop(columns=['week', 'arrest']).astype(float)
# y = np.array([(bool(event), time) for event, time in zip(data['arrest'], data['week'])], dtype=[('event', bool), ('time', float)])

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the Cox proportional hazards model with L1 regularization
# cph_lasso = CoxnetSurvivalAnalysis(l1_ratio=1.0)
# cph_lasso.fit(X_train, y_train)

# Predict the risk scores on the test set
# risk_scores = cph_lasso.predict(X_test)




# Validate proportional hazards assumption

### Random Survival Forests (RSF)
- Build a Random Survival Forest model to predict survival.
- Perform variable importance analysis to identify the most predictive factors.
- Compare the model's concordance index (C-index) with that of Cox regression.

In [None]:
# Random Survival Forest

# Load the WHAS500 dataset (load_whas500 comes from sksurv.datasets, imported
# in the first cell).
data_x, data_y = load_whas500()

# Encode categorical variables
encoder = OneHotEncoder()
data_x = encoder.fit_transform(data_x)

# Train a Random Survival Forest model
rsf = RandomSurvivalForest(n_estimators=100, random_state=42)
rsf.fit(data_x, data_y)

# Variable importance analysis
# please refer to https://scikit-survival.readthedocs.io/en/stable/user_guide/random-survival-forest.html
# Permutation importance is computed on the same data the forest was fitted on,
# with 15 shuffles per feature.
result = permutation_importance(rsf, data_x, data_y, n_repeats=15, random_state=42)

# One row per feature with its mean importance and standard deviation, sorted
# once so the most important feature comes first.
feature_importance = pd.DataFrame(
    {
        "importances_mean": result["importances_mean"],
        "importances_std": result["importances_std"],
    },
    index=data_x.columns,
).sort_values(by="importances_mean", ascending=False)

# Horizontal bar chart with the standard deviation as error bars
plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.barh(
    feature_importance.index,
    feature_importance['importances_mean'],
    xerr=feature_importance['importances_std'],
    align='center',
)
plt.xlabel('Mean Importance')
plt.ylabel('Features')
# Flip the y-axis so the first (most important) row is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Concordance index with Cox regression
# Calculate the concordance index
# c_index = concordance_index_censored(y_test['event'], y_test['time'], risk_scores)[0]
# print(f'Concordance Index: {c_index:.4f}')