In [3]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from sklearn.ensemble import GradientBoostingRegressor
from sksurv.metrics import concordance_index_censored
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.util import Surv

# Example dataset
data = pd.read_csv('/home/joshua-siraj/Documents/CDI/AutoML/data/RADCURE_challenge_clinical.csv')
data.dropna(inplace=True)
data.drop(columns=["Study ID", "split"], inplace=True)
data = pd.get_dummies(data, columns=["Sex", "T Stage", "N Stage", "Stage", "Disease Site"], drop_first=True)

# Prepare data
data["event"] = data["death"] == 1
data = data.apply(pd.to_numeric, errors='coerce')
X = data.drop(columns=["survival_time", "death", "event"])
y = Surv.from_dataframe("event", "survival_time", data)

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
models = {
    "CoxPH": CoxPHFitter(),
    "GradientBoosting": GradientBoostingSurvivalAnalysis(),
    "RandomForest": RandomSurvivalForest(n_estimators=100, random_state=42),
}

# Fit CoxPH separately (it's incompatible with sksurv format directly)
cox_data = data.copy()
cox_data["survival_time"] += 1e-6  # Avoid zero survival times for Cox
coxph = CoxPHFitter(penalizer=0.0001)
coxph.fit(cox_data, duration_col="survival_time", event_col="death")


# Fit other models
fitted_models = {}
for name, model in models.items():
    if name == "CoxPH":
        fitted_models[name] = coxph
    else:
        model.fit(X_train, y_train)
        fitted_models[name] = model

# Evaluation
def evaluate_model(model, X_test, y_test, model_name):
    if model_name == "CoxPH":
        return coxph.concordance_index_
    else:
        predictions = model.predict(X_test)
        return concordance_index_censored(y_test["event"], y_test["survival_time"], predictions)[0]

cindex_scores = {}
for name, model in fitted_models.items():
    cindex_scores[name] = evaluate_model(model, X_test, y_test, name)

# Output C-index scores
for model_name, cindex in cindex_scores.items():
    print(f"C-index ({model_name}): {cindex:.4f}")


C-index (CoxPH): 0.9348
C-index (GradientBoosting): 0.7510
C-index (RandomForest): 0.7272
