In [None]:
# Standard library imports
import os
import random
import warnings

# Third-party library imports for data manipulation
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import ptitprince as pt
import umap

# Machine Learning and statistical testing libraries
from scipy.stats import mannwhitneyu, f_oneway
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import FitFailedWarning
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Survival analysis libraries
import sksurv
from sksurv.compare import compare_survival
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored, concordance_index_ipcw, integrated_brier_score
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.util import Surv

# Feature selection libraries
from boruta import BorutaPy

# Lifelines for additional survival analysis
from lifelines.statistics import logrank_test, multivariate_logrank_test


In [None]:
# --- Notebook-level settings read by the cells below ---
split_percentage = 0.5  # used as test_size for the holdout split and for every train/validation split
scaler_type = "minmax"  # "minmax" -> MinMaxScaler; any other value -> RobustScaler(unit_variance=True)
feature_Sel_type = "ElasticNet"  # not read in the visible cells; presumably a label for the selection method -- TODO confirm
cohort = "RTOG_0521_OS_349_feats_final"  # not read in the visible cells; presumably a run/cohort tag -- TODO confirm
l1_ratio = 0.7  # l1_ratio passed to every CoxnetSurvivalAnalysis model
max_days = 5  # not read in the visible cells -- TODO confirm meaning and units
risk_treshold_method = "median"  # (sic) not read in the visible cells; the cut-off used later is np.median of the training risk scores
num_features = 16  # how many top-ranked features (by feature_score) feed the final model

In [None]:
# Read the per-patient feature table; the first CSV column becomes the index.
rtog_features_df = pd.read_csv('/path/to/features_with_event_time.csv', index_col=0)

# Rename the outcome / identifier columns to the names used by the rest of the notebook.
column_aliases = {
    "survival": "event",
    "survival_years": "time",
    "cn_deidentified": "patient_id",
}
rtog_features_df = rtog_features_df.rename(columns=column_aliases)

# Remove the endpoint columns that this analysis does not use.
unused_endpoint_columns = [
    "disease_free_survival",
    "disease_free_survival_years",
    "biochemical_failure",
    "biochemical_failure_years",
    "any_distant_mets",
    "any_distant_mets_years",
    "local_failure",
    "local_failure_years",
]
rtog_features_df = rtog_features_df.drop(columns=unused_endpoint_columns)

In [None]:
# Median-impute the feature columns. Identifier, treatment and outcome columns
# are set aside first and re-attached unchanged afterwards.
imputer = SimpleImputer(strategy='median')
rtog_features = rtog_features_df.drop(['patient_id', 'RX', 'time', 'event'], axis=1)
# Treat +/-inf as missing so they are imputed as well.
rtog_features = rtog_features.replace([np.inf, -np.inf], np.nan)
num_df = rtog_features.values
names = rtog_features.columns.values

# NOTE(review): the medians are fitted on every patient before the train/holdout
# split, so holdout values contribute to the imputation -- confirm this is intended.
# NOTE(review): SimpleImputer discards columns that are entirely NaN by default,
# which would make the column count differ from `names` -- verify none exist.
rtog_features = pd.DataFrame(imputer.fit_transform(num_df), columns=names)

# The imputed frame has a fresh 0..n-1 index while rtog_features_df keeps the
# index read from the CSV. Re-attach the metadata by position (not by label) so
# rows cannot be misaligned or turned into NaN when the two indexes differ.
for meta_column in ['patient_id', 'RX', 'time', 'event']:
    rtog_features[meta_column] = rtog_features_df[meta_column].to_numpy()
rtog_features

In [None]:
# Tally of each distinct 'event' value in the imputed table, before any filtering on RX.
rtog_features['event'].value_counts()

In [None]:
# NOTE(review): exact repeat of the previous cell's tally -- candidate for removal.
rtog_features['event'].value_counts()

### Restrict the analysis to the RT + ADT treatment leg (arm) only

In [None]:
# Keep only the rows whose RX code equals 1 (per the heading above this is the
# RT + ADT leg -- TODO confirm the RX coding).
rtog_leg_1 = rtog_features.loc[rtog_features['RX'] == 1]

In [None]:
# Tally of each distinct 'event' value within the RX == 1 subset.
rtog_leg_1['event'].value_counts()

In [None]:
# Display the RX == 1 subset for a visual sanity check.
rtog_leg_1

In [None]:
def remove_correlated_features(df, threshold=0.95):
    """Drop every column that is highly correlated with an earlier column.

    The absolute Pearson correlation matrix of ``df`` is computed and only its
    strict upper triangle is inspected, so a column is compared solely with the
    columns to its left. A column is dropped when any of those correlations is
    strictly greater than ``threshold``; the left-most member of a correlated
    group is therefore the one that survives.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table (rows = samples, columns = features).
    threshold : float, default 0.95
        Absolute-correlation cut-off above which the later column is removed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained columns in their original order.
    """
    # With fewer than two columns there is nothing to compare; np.corrcoef
    # would also return a scalar instead of a matrix in the one-column case.
    if df.shape[1] < 2:
        return df.copy()

    # Absolute correlation between every pair of columns.
    corr = np.abs(np.corrcoef(df.values, rowvar=False))

    # Strict upper triangle: entry (i, j) with i < j pairs column j with an earlier column i.
    earlier_pairs = np.triu(np.ones(corr.shape, dtype=bool), k=1)

    # NaN correlations (e.g. constant columns) compare as False and never trigger a drop.
    too_correlated = (corr > threshold) & earlier_pairs
    to_drop = df.columns[too_correlated.any(axis=0)]

    return df.drop(columns=to_drop)

def boruta_selected_features(feature_df, y):
    """Return the columns of ``feature_df`` that Boruta confirms or marks as tentative.

    A depth-limited, class-balanced random forest is wrapped in ``BorutaPy``
    (perc=95, alpha=0.05, single-step test, random_state=42) and fitted on the
    feature values against ``y``. Columns flagged in ``support_`` or
    ``support_weak_`` are kept.
    """
    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    selector = BorutaPy(
        forest,
        n_estimators='auto',
        perc=95,
        alpha=0.05,
        two_step=False,
        verbose=0,
        random_state=42,
    )
    selector.fit(feature_df.values, y)

    # Union of confirmed and tentative features.
    keep_mask = selector.support_ | selector.support_weak_
    kept_columns = feature_df.columns[keep_mask].to_list()

    return feature_df[kept_columns]

def get_discriminative_features(view_features):
    """Select Boruta-confirmed features for separating the 'event' classes.

    Steps: drop the 'patient_id', 'event' and 'time' columns; scale with the
    scaler chosen by the notebook-level ``scaler_type``; remove columns
    correlated above 0.85 with an earlier column; remove columns whose variance
    does not exceed 0.01; run Boruta (perc=98) and keep only confirmed
    features. The surviving columns are re-scaled (the scaler is re-fitted on
    them) and returned as a new DataFrame with a fresh integer index.
    """
    labels = view_features['event']
    candidates = view_features.drop(['patient_id', 'event', 'time'], axis=1)

    # scaler_type is a module-level setting, not a parameter of this function.
    scaler = MinMaxScaler() if scaler_type == "minmax" else RobustScaler(unit_variance=True)
    scaled = pd.DataFrame(scaler.fit_transform(candidates.values), columns=candidates.columns)

    decorrelated = remove_correlated_features(scaled, threshold=0.85)

    # Low-variance filter (cut-off 0.01, so near-constant columns go too).
    variance_filter = VarianceThreshold(threshold=0.01)
    variance_filter.fit(decorrelated)
    surviving_columns = decorrelated.columns[variance_filter.get_support(indices=True)]
    filtered = decorrelated[surviving_columns]

    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    selector = BorutaPy(
        forest,
        n_estimators='auto',
        perc=98,
        alpha=0.05,
        two_step=False,
        verbose=0,
        random_state=42,
    )
    selector.fit(filtered.values, labels)

    # Confirmed features only; tentative ones (support_weak_) are left out.
    confirmed = filtered.columns[selector.support_].to_list()
    selected = filtered[confirmed]

    # The same scaler object is re-fitted on the selected columns.
    disc_features = pd.DataFrame(scaler.fit_transform(selected.values), columns=selected.columns)

    return disc_features

def plot_raincloudplots(df, stable_features, events_df, save_path):
    """Draw and save one raincloud plot per feature, split by event status.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must contain every name in ``stable_features``.
    stable_features : iterable of str
        Names of the columns to plot.
    events_df : pandas.Series
        Event indicator (0/1); it is stored as the 'event' column of a copy of
        ``df`` and also used as a boolean mask on that copy.
    save_path : str
        Path prefix; each figure is written to ``save_path + '_' + feature + '.png'``.

    The plot title reports the p-value of a one-way ANOVA (``f_oneway``)
    comparing the feature between the event == 1 and event == 0 groups.
    """
    rc_plot_df = df.copy()
    rc_plot_df['event'] = events_df
    for feature in stable_features:
        fig = plt.figure(figsize=(5,4))

        # One-way ANOVA between the two event groups (this is not a Wilcoxon /
        # rank-based test, despite what an earlier comment claimed).
        _, pvalue = f_oneway(rc_plot_df[events_df==1][feature].values, rc_plot_df[events_df==0][feature].values)

        pt.RainCloud(x='event', y=feature, data=rc_plot_df)
        plt.title('p_val: '+str(pvalue))

        # Save before showing so the written file never depends on how the
        # active backend treats a figure after display.
        fig.savefig(save_path+'_'+feature+'.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure; otherwise one open figure accumulates per feature.
        plt.close(fig)
        


## Bootstrap Survival Analysis

In [None]:
def get_hr_and_pval(threshold, val_risk_scores, y_val_survlabel):
    """Split patients at ``threshold`` and compare the two groups' survival.

    Parameters
    ----------
    threshold : float
        Risk-score cut-off; scores strictly greater than it form group 1,
        all others form group 0.
    val_risk_scores : array-like of float
        Predicted risk score per patient, in the same order as ``y_val_survlabel``.
    y_val_survlabel : structured array
        Survival labels as accepted by ``sksurv.compare.compare_survival``.

    Returns
    -------
    tstat : float
        Test statistic returned by ``compare_survival``. Despite this
        function's name it is NOT a hazard ratio.
    pval : float
        p-value returned by ``compare_survival``.

    The previous implementation also computed per-group Kaplan-Meier curves
    from module-level ``events``/``times`` and then discarded them. That made
    the function depend on hidden notebook state, so it has been removed.
    """
    risk_group_labels = (np.asarray(val_risk_scores) > threshold).astype(int)

    tstat, pval, _, _ = compare_survival(y_val_survlabel, risk_group_labels, return_stats=True)

    return tstat, pval


# Split the RX == 1 cohort (rtog_leg_1) once into a model-development set and a holdout set.
training_data, holdout_data = train_test_split(rtog_leg_1, test_size=split_percentage, random_state=42, stratify=rtog_leg_1['event'])

N_runs = 50
# Seed Python's RNG so the per-iteration split seeds below are reproducible.
random.seed(42)

# One train/validation split seed per iteration.
random_numbers = [random.randint(0, 1000) for _ in range(N_runs)]

# Silence the warnings raised while searching the alpha path.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FitFailedWarning)
warnings.simplefilter("ignore", RuntimeWarning)


print("Running {} iterations".format(N_runs))

# Single-row frames, one per successful iteration; concatenated once after the loop.
bootstrap_rows = []

for i in range(N_runs):

    random_seed = random_numbers[i]

    # Re-split the development set into training and validation parts with this iteration's seed.
    train_data, val_data = train_test_split(training_data, test_size=split_percentage, random_state=random_seed, stratify=training_data['event'])

    X_train = train_data.drop(['patient_id','event', 'time'], axis=1).reset_index(drop=True)
    y_train = train_data[['patient_id','event', 'time']].reset_index(drop=True)

    X_val = val_data.drop(['patient_id','event', 'time'], axis=1).reset_index(drop=True)
    y_val = val_data[['patient_id','event', 'time']].reset_index(drop=True)

    # Scale with statistics learned on the training part only.
    if scaler_type == "minmax":
        scaler = MinMaxScaler()
    else:
        scaler = RobustScaler(unit_variance=True)
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

    # Drop low-variance columns (decided on the training part, applied to both).
    varTh = VarianceThreshold(threshold=0.05)
    varTh.fit(X_train)
    kept_idx = varTh.get_support(indices=True)
    X_train = X_train[X_train.columns[kept_idx]]
    X_val = X_val[X_val.columns[kept_idx]]

    # Drop columns correlated above 0.95 with an earlier column.
    X_train_decorrelated = remove_correlated_features(X_train, 0.95)
    X_train_trimmed = X_train_decorrelated.copy()
    # Boruta pre-selection is currently disabled:
    #X_train_trimmed = boruta_selected_features(X_train_decorrelated, y_train['event'].values)

    X_val_trimmed = X_val[X_train_trimmed.columns]

    print(f"Training features input to ElasticNet model (l1={l1_ratio}) = {X_train_trimmed.shape}")

    y_train_survlabel = Surv.from_dataframe('event', 'time', y_train)
    y_val_survlabel = Surv.from_dataframe('event', 'time', y_val)

    try:
        # A first fit yields the alpha path that the grid search then evaluates.
        coxnet_pipe = make_pipeline(CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alpha_min_ratio=0.01, max_iter=20, fit_baseline_model=True))
        coxnet_pipe.fit(X_train_trimmed, y_train_survlabel)

        estimated_alphas = coxnet_pipe.named_steps["coxnetsurvivalanalysis"].alphas_
        cv = KFold(n_splits=5, shuffle=True, random_state=42)

        gcv = GridSearchCV(
            make_pipeline(CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, fit_baseline_model=True)),
            param_grid={"coxnetsurvivalanalysis__alphas": [[v] for v in estimated_alphas]},
            cv=cv,
            error_score=0.5,
            n_jobs=1,
        ).fit(X_train_trimmed, y_train_survlabel)

        best_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]
        best_coefs = pd.DataFrame(best_model.coef_, index=X_train_trimmed.columns, columns=["coefficient"])

        non_zero = np.sum(best_coefs.iloc[:, 0] != 0)
        non_zero_coefs = best_coefs.query("coefficient != 0")
        # Feature names ordered by increasing |coefficient| ...
        coef_order = non_zero_coefs.abs().sort_values("coefficient").index
        # ... and the coefficients in that SAME order, so position k in one list
        # matches position k in the other (they were previously stored in
        # different orders, which mis-attributed coefficients downstream).
        ordered_coefs = non_zero_coefs.loc[coef_order, "coefficient"]
        print("Number of non-zero features: {}".format(non_zero))

        coxnet_pred = gcv.best_estimator_
        C_test = coxnet_pred.score(X_val_trimmed, y_val_survlabel)
        print('C-index on test set: {:.3f}'.format(C_test))

        val_risk_scores = coxnet_pred.predict(X_val_trimmed)
        # Kept at module level because get_hr_and_pval has historically read these names.
        events, times = y_val['event'].values.astype(bool), y_val['time'].values

        # NOTE: the first value returned by get_hr_and_pval is the test statistic
        # of the group comparison, not a hazard ratio; the historical variable
        # and column names are kept so downstream cells keep working.
        hr_median, pval_median = get_hr_and_pval(np.median(val_risk_scores), val_risk_scores, y_val_survlabel)
        hr_mean, pval_mean = get_hr_and_pval(np.mean(val_risk_scores), val_risk_scores, y_val_survlabel)

        temp_dict = {'Seed': [random_seed], 'Test_Cindex': [C_test], 'Test_pval_mean': [pval_mean], 'Hazard_ratio_mean': [hr_mean], 'Test_pval_median': [pval_median], 'Hazard_ratio_median': [hr_median], 'Nonzero_features_count': [non_zero]}
        temp_df = pd.DataFrame.from_dict(temp_dict, orient='columns')
        temp_df['Nonzero_features']= [coef_order.values.tolist()]
        temp_df['Nonzero_features_coefs']= [ordered_coefs.values.tolist()]

        bootstrap_rows.append(temp_df)

        if (i+1)%5 == 0:
            print("Iteration {} completed".format(i+1))
    except Exception as err:
        # The split seed is fixed, so re-running the same iteration would fail
        # the same way; report the reason and move on instead of claiming a retry.
        print("Iteration {} failed: {!r}".format(i+1, err))
        continue

# Assemble the per-iteration rows in one pass.
if bootstrap_rows:
    bootstrap_metrics = pd.concat(bootstrap_rows, axis=0, ignore_index=True)
else:
    bootstrap_metrics = pd.DataFrame()
print("{} of {} iterations succeeded".format(len(bootstrap_rows), N_runs))
    

In [None]:
# Display the per-iteration metrics table built by the loop above.
bootstrap_metrics

In [None]:
# Aggregate over all iterations, per feature:
#   * how many iterations gave it a non-zero coefficient, and
#   * the sum of its absolute coefficients.
feature_frequency_dict = {}
feature_coefficient_strength_dict = {}

# Walk the two columns in lock-step. Looking the row up with list.index() would
# return the FIRST iteration having an identical feature list, so later
# iterations with the same list would reuse that iteration's coefficients.
for feature_list, coef_list in zip(bootstrap_metrics['Nonzero_features'], bootstrap_metrics['Nonzero_features_coefs']):
    for feature, coef in zip(feature_list, coef_list):
        feat_coef = np.abs(coef)
        feature_frequency_dict[feature] = feature_frequency_dict.get(feature, 0) + 1
        feature_coefficient_strength_dict[feature] = feature_coefficient_strength_dict.get(feature, 0) + feat_coef

print(feature_frequency_dict)
print(feature_coefficient_strength_dict)

feature_frequency = pd.DataFrame(feature_frequency_dict.items(), columns=['feature', 'count'])
feature_coefs = pd.DataFrame(feature_coefficient_strength_dict.items(), columns=['feature', 'coefficient'])

# Merge the two tables on the feature name.
feature_frequency = feature_frequency.merge(feature_coefs, on='feature', how='left')

feature_frequency = feature_frequency.sort_values(by='coefficient', ascending=False).reset_index(drop=True)

# feature_score = count * summed |coefficient| / N_runs
feature_frequency['feature_score'] = feature_frequency['count'] * feature_frequency['coefficient']/N_runs

feature_frequency = feature_frequency.sort_values(by='feature_score', ascending=False).reset_index(drop=True)

feature_frequency

In [None]:

# Names of the num_features best-scoring features from the ranking table.
top_features = feature_frequency['feature'][:num_features].values
outcome_columns = ['patient_id','event', 'time']

train_df = training_data[top_features]
holdout_df = holdout_data[top_features]

# Feature matrices and outcome tables, all re-indexed from 0.
X_training = train_df.reset_index(drop=True)
X_holdout = holdout_df.reset_index(drop=True)
y_training = training_data[outcome_columns].reset_index(drop=True)
y_holdout = holdout_data[outcome_columns].reset_index(drop=True)

# Fit the scaler on the development set and apply the same transform to the holdout set.
second_scaler = MinMaxScaler() if scaler_type == "minmax" else RobustScaler(unit_variance=True)
X_training = pd.DataFrame(second_scaler.fit_transform(X_training), columns=X_training.columns)
X_holdout = pd.DataFrame(second_scaler.transform(X_holdout), columns=X_holdout.columns)

# Variance filter decided on the development set, applied to both sets.
varTh = VarianceThreshold(threshold=0.01)
varTh.fit(X_training)
retained_idx = varTh.get_support(indices=True)
X_training = X_training[X_training.columns[retained_idx]]
X_holdout = X_holdout[X_holdout.columns[retained_idx]]

# Correlation filter (cut-off 0.8) decided on the development set; holdout follows.
X_training_trimmed = remove_correlated_features(X_training, 0.8)
X_holdout_trimmed = X_holdout[X_training_trimmed.columns]

print("Training data: {}".format(X_training_trimmed.shape))
print("Testing data: {}".format(X_holdout_trimmed.shape))

In [None]:
# Display the development-set outcome table (patient_id, event, time).
y_training

In [None]:
# Build scikit-survival label arrays from the 'event' and 'time' columns of each outcome table.
y_training_survlabel = Surv.from_dataframe('event', 'time', y_training)
y_holdout_survlabel = Surv.from_dataframe('event', 'time', y_holdout)

In [None]:
# Number of development-set patients with a survival label.
len(y_training_survlabel)

In [None]:
# Refit CoxPHSurvivalAnalysis for 50 log-spaced penalty values (1e-4 .. 1e4)
# and record the coefficient vector obtained at each one.
alphas = 10.0 ** np.linspace(-4, 4, 50)

cph = CoxPHSurvivalAnalysis()
coefficients = {}
for alpha in alphas:
    cph.set_params(alpha=alpha)
    cph.fit(X_training_trimmed, y_training_survlabel)
    # Column label: the penalty rounded to 5 decimals.
    key = round(alpha, 5)
    coefficients[key] = cph.coef_

# Rows = features, columns = alpha values.
coefficients = (
    pd.DataFrame.from_dict(coefficients)
    .rename_axis(index="feature", columns="alpha")
    .set_index(X_training_trimmed.columns)
)

In [None]:
def plot_coefficients(coefs, n_highlight):
    """Plot each feature's coefficient against alpha on a log-scaled x axis.

    ``coefs`` has one row per feature and one column per alpha value. The
    ``n_highlight`` features with the largest absolute coefficient at the
    smallest alpha are labelled with their name next to the curve start.
    """
    _, ax = plt.subplots(figsize=(9, 6))
    alpha_grid = coefs.columns

    # One curve per feature.
    for feature_name, path in zip(coefs.index, coefs.to_numpy()):
        ax.semilogx(alpha_grid, path, ".-", label=feature_name)

    # Label the strongest features at the left edge (smallest alpha).
    smallest_alpha = alpha_grid.min()
    strongest = coefs.loc[:, smallest_alpha].map(abs).sort_values().tail(n_highlight)
    for name in strongest.index:
        ax.text(
            smallest_alpha,
            coefs.loc[name, smallest_alpha],
            name + "   ",
            horizontalalignment="right",
            verticalalignment="center",
        )

    # Move the y axis to the right so the labels on the left stay readable.
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.grid(True)
    ax.set_xlabel("alpha")
    ax.set_ylabel("coefficient")

plot_coefficients(coefficients, n_highlight=6)

In [None]:
# Step 1: fit once on the development set to obtain Coxnet's own alpha path.
coxnet_pipe = make_pipeline(
    CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, alpha_min_ratio=0.01, max_iter=50, fit_baseline_model=True)
)
for ignored_category in (UserWarning, FitFailedWarning):
    warnings.simplefilter("ignore", ignored_category)
coxnet_pipe.fit(X_training_trimmed, y_training_survlabel)

estimated_alphas = coxnet_pipe.named_steps["coxnetsurvivalanalysis"].alphas_

# Step 2: 5-fold cross-validated search with one candidate per alpha on that path.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
alpha_candidates = {"coxnetsurvivalanalysis__alphas": [[v] for v in estimated_alphas]}
gcv = GridSearchCV(
    make_pipeline(CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, fit_baseline_model=True)),
    param_grid=alpha_candidates,
    cv=cv,
    error_score=0.5,
    n_jobs=1,
)
gcv.fit(X_training_trimmed, y_training_survlabel)

cv_results = pd.DataFrame(gcv.cv_results_)

In [None]:
# Cross-validated score per alpha, with a +/- one-standard-deviation band.
alphas = cv_results["param_coxnetsurvivalanalysis__alphas"].map(lambda grid: grid[0])
mean = cv_results["mean_test_score"]
std = cv_results["std_test_score"]
best_alpha = gcv.best_params_["coxnetsurvivalanalysis__alphas"][0]

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(alphas, mean)
ax.fill_between(alphas, mean - std, mean + std, alpha=0.15)
ax.set_xscale("log")
ax.set_ylabel("concordance index", fontsize=16)
ax.set_xlabel("alpha", fontsize=16)
# Relabel the current y ticks with values rounded to two decimals.
yticks = [np.round(tick, 2) for tick in ax.get_yticks()]
ax.set_yticklabels(yticks, fontsize=14)
# Vertical marker at the selected alpha, dashed line at 0.5.
ax.axvline(best_alpha, c="C1")
ax.axhline(0.5, color="grey", linestyle="--")
ax.grid(True)

In [None]:
# Display the filtered development-set feature matrix.
X_training_trimmed

In [None]:
# Coefficients of the Coxnet model refitted by the grid search.
best_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]
best_coefs = pd.DataFrame(best_model.coef_, index=X_training_trimmed.columns, columns=["coefficient"])

non_zero = np.sum(best_coefs.iloc[:, 0] != 0)
print(f"Number of non-zero coefficients: {non_zero}")

# Keep the non-zero coefficients and order them by increasing magnitude for the bar chart.
non_zero_coefs = best_coefs.query("coefficient != 0")
coef_order = non_zero_coefs.abs().sort_values("coefficient").index
ordered_for_plot = non_zero_coefs.loc[coef_order]

fig1, ax = plt.subplots(figsize=(8,10))
ordered_for_plot.plot.barh(ax=ax, legend=False)
ax.set_xlabel("Feature Coefficient", fontsize=20)
ax.tick_params(axis="x", labelsize=16)
ax.tick_params(axis="y", labelsize=20)
ax.grid(True)

In [None]:
# Names of the features with a non-zero coefficient.
non_zero_coefs.index

In [None]:
# NOTE(review): X_training_trimmed was already displayed in an earlier cell -- candidate for removal.
X_training_trimmed

In [None]:
# Display the holdout feature matrix restricted to the same columns as the development set.
X_holdout_trimmed

In [None]:
# Feature name -> coefficient mapping for the non-zero coefficients.
non_zero_coefs.to_dict()['coefficient']

In [None]:
# Feature names ordered by increasing absolute coefficient.
coef_order

In [None]:

print("Metrics for model")
coxnet_pred = gcv.best_estimator_

# Risk scores of the selected model on both sets.
training_prediction = coxnet_pred.predict(X_training_trimmed)
holdout_prediction = coxnet_pred.predict(X_holdout_trimmed)

# Predicted survival functions, evaluated on an integer-spaced grid between the
# (truncated) smallest and largest observed time of each set.
# NOTE(review): 'time' was renamed from 'survival_years', so this grid is in
# whole years and has few points; int() also truncates the lower bound, which
# can place the first grid point before the first observed time -- confirm the
# grid is what integrated_brier_score should be evaluated on.
training_survival = coxnet_pred.predict_survival_function(X_training_trimmed)
times_training = np.arange(int(np.min(y_training_survlabel['time'])), int(np.max(y_training_survlabel["time"])))
survival_prediction_training = None
try:
    survival_prediction_training = np.asarray([[fn(t) for t in times_training] for fn in training_survival])
except Exception as err:
    print("Error in calculating survival prediction for training set")
    print(repr(err))

holdout_survival = coxnet_pred.predict_survival_function(X_holdout_trimmed)
times_holdout = np.arange(int(np.min(y_holdout_survlabel['time'])), int(np.max(y_holdout_survlabel["time"])))
survival_prediction_holdout = None
try:
    survival_prediction_holdout = np.asarray([[fn(t) for t in times_holdout] for fn in holdout_survival])
except Exception as err:
    print("Error in calculating survival prediction for holdout set")
    print(repr(err))

# Concordance index on each set.
c_index_training = concordance_index_censored(y_training_survlabel["event"], y_training_survlabel["time"], training_prediction)
c_index_holdout = concordance_index_censored(y_holdout_survlabel["event"], y_holdout_survlabel["time"], holdout_prediction)
print('C-index on Training set: {:.3f}'.format(c_index_training[0]))
print('C-index on Holdout set: {:.3f}'.format(c_index_holdout[0]))

# IPCW concordance index; the development-set labels are passed as the first argument in both calls.
c_ipcw_training = concordance_index_ipcw(y_training_survlabel, y_training_survlabel, training_prediction)
c_ipcw_holdout = concordance_index_ipcw(y_training_survlabel, y_holdout_survlabel, holdout_prediction)
print('IPCW C-index on Training set: {:.3f}'.format(c_ipcw_training[0]))
print('IPCW C-index on Holdout set: {:.3f}'.format(c_ipcw_holdout[0]))

# Integrated Brier score. NaN marks "could not be computed"; a placeholder of 0
# would read as a perfect score in the metrics table.
ibs_training = np.nan
ibs_holdout = np.nan
try:
    ibs_training = integrated_brier_score(y_training_survlabel, y_training_survlabel, survival_prediction_training, times_training)
    ibs_holdout = integrated_brier_score(y_training_survlabel, y_holdout_survlabel, survival_prediction_holdout, times_holdout)
    print('IBS on training set: {:.3f}'.format(ibs_training))
    print('IBS on Holdout set: {:.3f}'.format(ibs_holdout))
except Exception as err:
    print("Error in calculating IBS")
    print(repr(err))

# One-row summary table of the model metrics.
model_metrics = pd.DataFrame()
# NOTE(review): the label is hard-coded although a `cohort` setting exists -- confirm which is intended.
model_metrics['cohort'] = ["Disparity"]
model_metrics['c_index_train'] = [c_index_training[0]]
model_metrics['c_index_holdout'] = [c_index_holdout[0]]
model_metrics['c_index_ipcw_train'] = [c_ipcw_training[0]]
model_metrics['c_index_ipcw_holdout'] = [c_ipcw_holdout[0]]
model_metrics['integrated_brier_score_train'] = [ibs_training]
model_metrics['integrated_brier_score_holdout'] = [ibs_holdout]



In [None]:
# Risk scores of the final model on the development set.
training_risk_scores = coxnet_pred.predict(X_training_trimmed)
# Module-level event flags / times of the development set; the threshold scan
# and the Kaplan-Meier cell further down read these two names.
events, times = y_training['event'].values.astype(bool),   y_training['time'].values
# Handle on the fitted Coxnet step (not used again in the visible cells).
coxnet_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]

In [None]:
# Range of the development-set risk scores.
min(training_risk_scores), max(training_risk_scores)

In [None]:
# Mean and standard deviation of the development-set risk scores.
# NOTE: this rebinds `mean` / `std`, which an earlier plotting cell used for the CV scores.
mean = np.mean(training_risk_scores)
std = np.std(training_risk_scores)

# Candidate thresholds are scanned within one standard deviation of the mean.
lower_bound = mean - std
upper_bound = mean + std

In [None]:
# Number of development-set risk scores (one per patient).
len(training_risk_scores)

In [None]:
# Scan candidate cut-offs between the bounds in steps of roughly 0.001 and
# record the log-rank p-value of the resulting two-group split for each one.
n_thresholds = int((upper_bound - (lower_bound)) / 0.001) + 1
threshold_values = np.linspace(lower_bound, upper_bound, num=n_thresholds)
p_values = []
thresholds = []

for threshold in threshold_values:
    # Group 1 = score strictly above the candidate threshold, group 0 = the rest.
    train_risk_group_labels = np.array([int(score > threshold) for score in training_risk_scores])
    low_group = train_risk_group_labels == 0
    high_group = train_risk_group_labels == 1

    split_test = logrank_test(times[low_group], times[high_group], events[low_group], events[high_group])
    thresholds.append(threshold)
    p_values.append(split_test.p_value)



In [None]:
# Cut-off used for all risk grouping below: the median development-set risk score.
risk_threshold = np.median(training_risk_scores)

In [None]:
# plot p values vs thresholds
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(thresholds, p_values)
ax.set_xlabel("Threshold value")
ax.set_ylabel("P-value")
ax.set_title("P-value vs Threshold")
#plot a red line in the minimum p-value
min_pval_idx = np.argmin(p_values)
ax.axvline(risk_threshold, color='blue', linestyle='--')
ax.axvline(thresholds[min_pval_idx], color='r', linestyle='--')
ax.axvline(np.median(training_risk_scores), color='g', linestyle='--')

In [None]:
# Risk groups on the development set: 1 = score strictly above risk_threshold, 0 = otherwise.
train_risk_group_labels = np.array([int(score > risk_threshold) for score in training_risk_scores])
train_survival_probs, train_survival_times = [], []

# Kaplan-Meier estimate per group; `events` / `times` are the module-level
# arrays that currently hold the development-set outcomes.
for group_label in np.unique(train_risk_group_labels):
    group_indices = train_risk_group_labels == group_label
    group_time, group_survival_prob = kaplan_meier_estimator(events[group_indices], times[group_indices])
    train_survival_times.append(group_time)
    train_survival_probs.append(group_survival_prob)


In [None]:

# Risk scores of the final model on the holdout set.
holdout_risk_scores = coxnet_pred.predict(X_holdout_trimmed)
# From here on the module-level `events` / `times` describe the HOLDOUT set.
events = y_holdout['event'].values.astype(bool)
times = y_holdout['time'].values

coxnet_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]

# Apply the development-set threshold unchanged to the holdout scores.
holdout_risk_group_labels = np.array([int(score > risk_threshold) for score in holdout_risk_scores])
holdout_survival_probs, holdout_survival_times = [], []

# Kaplan-Meier estimate per holdout risk group.
for group_label in np.unique(holdout_risk_group_labels):
    group_indices = holdout_risk_group_labels == group_label
    group_time, group_survival_prob = kaplan_meier_estimator(events[group_indices], times[group_indices])
    holdout_survival_times.append(group_time)
    holdout_survival_probs.append(group_survival_prob)



In [None]:

# Two-group log-rank test on the holdout set (group 0 vs group 1); relies on
# `times` / `events` currently holding the holdout outcomes.
results = logrank_test(times[holdout_risk_group_labels==0], times[holdout_risk_group_labels==1], events[holdout_risk_group_labels==0], events[holdout_risk_group_labels==1])
results.print_summary()

In [None]:

# Same comparison through lifelines' multivariate log-rank test; shows its p-value.
# NOTE: rebinds `results` from the previous cell.
results = multivariate_logrank_test(times, holdout_risk_group_labels, events)
results.p_value

In [None]:
# Attach risk score and risk group to the holdout outcome table, then sort by patient.
# The table is rebuilt from holdout_data (same construction as the earlier
# y_holdout) so this cell is safe to re-run: the scores are in holdout_data row
# order, and attaching them to an already-sorted y_holdout would pair them with
# the wrong patients.
y_holdout = (
    holdout_data[['patient_id','event', 'time']]
    .reset_index(drop=True)
    .assign(risk_score=holdout_risk_scores, risk_group=holdout_risk_group_labels)
    .sort_values(by=['patient_id'])
    .reset_index(drop=True)
)
y_holdout

In [None]:
# Attach risk score and risk group to the development-set outcome table, then sort by patient.
# The table is rebuilt from training_data (same construction as the earlier
# y_training) so this cell is safe to re-run: the scores are in training_data
# row order, and attaching them to an already-sorted y_training would pair them
# with the wrong patients.
y_training = (
    training_data[['patient_id','event', 'time']]
    .reset_index(drop=True)
    .assign(risk_score=training_risk_scores, risk_group=train_risk_group_labels)
    .sort_values(by=['patient_id'])
    .reset_index(drop=True)
)
y_training

In [None]:
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from lifelines import CoxPHFitter

# Frames for lifelines: after dropping the id and the continuous score, the
# remaining columns are 'event', 'time' and the binary 'risk_group' covariate.
LL_train = y_training.drop(['patient_id', 'risk_score'], axis=1)
LL_holdout = y_holdout.drop(['patient_id', 'risk_score'], axis=1)
# Cox model of survival on risk_group, development set.
cph_train = CoxPHFitter()
cph_train.fit(LL_train, duration_col='time', event_col='event', show_progress=False)

# Same model fitted separately on the holdout set.
cph_holdout = CoxPHFitter()
cph_holdout.fit(LL_holdout, duration_col='time', event_col='event', show_progress=False)


# --- Development-set statistics ---
training_results = cph_train.summary
# The p-value comes from the log-rank test between risk groups, not from the Cox summary.
training_p = multivariate_logrank_test(y_training['time'], y_training['risk_group'], y_training['event']).p_value# alternative: training_results['p'].values[0]
# exp(coef) of the first (only) covariate and its 95% confidence bounds.
training_hr = training_results['exp(coef)'].values[0]
training_ci_lower = training_results['exp(coef) lower 95%'].values[0]
training_ci_upper = training_results['exp(coef) upper 95%'].values[0]
training_log_likelihood = cph_train.log_likelihood_
model_metrics['training_p_value'] = [training_p]
model_metrics['training_hazard_ratio'] = [training_hr]
model_metrics['training_hr_ci_lower'] = [training_ci_lower]
model_metrics['training_hr_ci_upper'] = [training_ci_upper]
model_metrics['training_log_likelihood'] = [training_log_likelihood]
model_metrics['training_parameters'] = [cph_train.params_.shape[0]]

# (p-value, hazard ratio, CI lower, CI upper) bundle for the development set.
training_data_stats = (training_p, training_hr, training_ci_lower, training_ci_upper)


# --- Holdout statistics (same quantities as above) ---
holdout_results = cph_holdout.summary
holdout_p = multivariate_logrank_test(y_holdout['time'], y_holdout['risk_group'], y_holdout['event']).p_value # alternative: holdout_results['p'].values[0]
holdout_hr = holdout_results['exp(coef)'].values[0]
holdout_ci_lower = holdout_results['exp(coef) lower 95%'].values[0]
holdout_ci_upper = holdout_results['exp(coef) upper 95%'].values[0]
holdout_log_likelihood = cph_holdout.log_likelihood_
model_metrics['holdout_p_value'] = [holdout_p]
model_metrics['holdout_hazard_ratio'] = [holdout_hr]
model_metrics['holdout_hr_ci_lower'] = [holdout_ci_lower]
model_metrics['holdout_hr_ci_upper'] = [holdout_ci_upper]
model_metrics['holdout_log_likelihood'] = [holdout_log_likelihood]
model_metrics['holdout_parameters'] = [cph_holdout.params_.shape[0]]

# (p-value, hazard ratio, CI lower, CI upper) bundle for the holdout set.
holdout_data_stats = (holdout_p, holdout_hr, holdout_ci_lower, holdout_ci_upper)




In [None]:
# Log-likelihood score of the holdout Cox model evaluated on the holdout frame it was fitted on.
cph_holdout.score(LL_holdout, scoring_method='log_likelihood')

In [None]:
# Full lifelines summary of the development-set Cox model.
cph_train.print_summary()

In [None]:
# Full lifelines summary of the holdout Cox model.
cph_holdout.print_summary()

In [None]:
# Display the holdout frame passed to lifelines (event, time, risk_group).
LL_holdout

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.plotting import add_at_risk_counts
import seaborn as sns

def plot_km_curve_lifelines(data_df, data_stats, figure_save_path, title_str):
    """Plot Kaplan-Meier curves for the high- and low-risk groups and save the figure.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Needs 'time', 'event' and 'risk_group' columns. Rows with
        ``risk_group == 1`` are drawn as "High Risk", rows with
        ``risk_group == 0`` as "Low Risk".
    data_stats : tuple
        ``(p_value, hazard_ratio, ci_lower, ci_upper)`` written onto the plot.
    figure_save_path : str or path-like
        Destination passed to ``fig.savefig``.
    title_str : str
        Axes title.

    Returns
    -------
    None
        The figure is saved to ``figure_save_path`` and shown.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 12))

    data_high_risk = data_df[data_df['risk_group'] == 1]
    data_low_risk = data_df[data_df['risk_group'] == 0]

    kmf_hr = KaplanMeierFitter()
    kmf_hr.fit(data_high_risk['time'], event_observed=data_high_risk['event'], label='High Risk')
    kmf_hr.plot_survival_function(ax=ax, color='#f8766d', lw=2, show_censors=True)

    kmf_lr = KaplanMeierFitter()
    kmf_lr.fit(data_low_risk['time'], event_observed=data_low_risk['event'], label='Low Risk')
    kmf_lr.plot_survival_function(ax=ax, color='#03bfc4', lw=2, show_censors=True)

    print("Low risk median survival time: ", kmf_lr.median_survival_time_)
    print("High risk median survival time: ", kmf_hr.median_survival_time_)

    # Dashed guides marking the median survival times. A median that was never
    # reached is reported as infinity: the horizontal guide then spans the whole
    # axis and no vertical guide is drawn for that group.
    guide_style = dict(color='black', linestyle='--', lw=1)
    median_times = (kmf_hr.median_survival_time_, kmf_lr.median_survival_time_)
    longest_median = max(median_times)
    if np.isinf(longest_median):
        ax.axhline(y=0.5, **guide_style)
    else:
        ax.plot([0, longest_median], [0.5, 0.5], **guide_style)
    for median_time in median_times:
        if np.isfinite(median_time):
            ax.plot([median_time, median_time], [0, 0.5], **guide_style)

    ax.set_title(title_str, fontsize=26)

    # Relabel the current ticks: y rounded to one decimal, x truncated to integers.
    yticks = [np.round(x, 1) for x in ax.get_yticks()]
    ax.set_yticklabels(yticks, fontsize=20)
    ax.set_xticklabels(ax.get_xticks().astype(int), fontsize=20)

    ax.set_xlabel('Time (Years)', fontsize=28)
    ax.set_ylabel('Overall Survival Probability', fontsize=28)

    data_p, data_hr, data_ci_lower, data_ci_upper = data_stats
    # Scientific notation for very small p-values, four decimals otherwise.
    p_text = f"{data_p:.1e}" if data_p < 0.001 else f"{data_p:.4f}"
    data_stats_text = f'p: {p_text}\nHR: {data_hr:.2f} [95% CI: {data_ci_lower:.2f} - {data_ci_upper:.2f}]'
    ax.text(0.03, 0.1, data_stats_text, transform=ax.transAxes, fontsize=24, verticalalignment='bottom')

    sns.despine()
    # Number-at-risk table for both curves, attached to the same axes.
    add_at_risk_counts(kmf_hr, kmf_lr, ax=ax, fontsize=20)
    ax.legend(fontsize=24)
    plt.tight_layout()

    fig.savefig(figure_save_path, bbox_inches='tight')
    plt.show()





In [None]:
# Kaplan-Meier plot of the training cohort, split by risk group.
train_title_str = f'Training Set - RT+ADT treated patients (N={len(y_training):d})'
train_figure_save_path = 'KM_curve_training.png'
plot_km_curve_lifelines(
    data_df=y_training,
    data_stats=training_data_stats,
    figure_save_path=train_figure_save_path,
    title_str=train_title_str,
)

In [None]:
# Kaplan-Meier plot of the holdout cohort, split by risk group.
holdout_title_str = f'Holdout Set - RT+ADT treated patients (N={len(y_holdout):d})'
holdout_figure_save_path = 'KM_curve_holdout.png'
plot_km_curve_lifelines(
    data_df=y_holdout,
    data_stats=holdout_data_stats,
    figure_save_path=holdout_figure_save_path,
    title_str=holdout_title_str,
)

### Checking selected feature distributions

In [None]:
# Compare every selected feature between patients with and without an observed event.
# NOTE(review): assumes y_training_survlabel is row-aligned with X_training_trimmed
# (i.e. it is the label array the model was fitted with) — confirm.
training_event_observed = np.asarray(y_training_survlabel['event'], dtype=bool)
assert len(training_event_observed) == len(X_training_trimmed), \
    "labels and features must have the same number of rows"

for feature in X_training_trimmed:
    feature_values = X_training_trimmed[feature].to_numpy()
    # Mann-Whitney U between the two outcome groups (event vs. no event),
    # not between the feature values and the 0/1 event indicator itself.
    pvalue = mannwhitneyu(
        feature_values[training_event_observed],
        feature_values[~training_event_observed],
    ).pvalue
    print(feature)
    plt.figure(figsize=(5,4))
    # Feature and outcome come from the same rows, so the plot is row-aligned.
    raincloud_df = pd.DataFrame({'event': training_event_observed.astype(int), feature: feature_values})
    pt.RainCloud(x='event', y=feature, data=raincloud_df)
    plt.title(f'p: {pvalue:.2}')
    plt.show()

## Test on RT+ADT+CT group

In [None]:
# Keep only the rows with RX == 2 (the cohort evaluated in this section).
rtog_leg_2 = rtog_features[rtog_features['RX'] == 2]

In [None]:
# Count how many rows of the RX == 2 subset fall under each 'event' value.
rtog_leg_2['event'].value_counts()

In [None]:
# Display the RX == 2 subset.
rtog_leg_2

In [None]:
# Split the RX == 2 subset into labels and features (both re-indexed from 0 in
# the same row order), scale the features with the already-fitted scaler and
# keep only the columns present in X_training_trimmed.
y_holdout_doce = rtog_leg_2[['patient_id', 'event', 'time']].reset_index(drop=True)
X_holdout_doce = rtog_leg_2.drop(columns=['patient_id', 'event', 'time']).reset_index(drop=True)
X_holdout_doce = pd.DataFrame(
    scaler.transform(X_holdout_doce),
    columns=X_holdout_doce.columns,
)
X_holdout_doce_trimmed = X_holdout_doce[X_training_trimmed.columns]

In [None]:
# Structured (event, time) label array built from the label frame.
y_holdout_doce_survlabel = Surv.from_dataframe(event='event', time='time', data=y_holdout_doce)

In [None]:
# Display the scaled features restricted to the X_training_trimmed columns.
X_holdout_doce_trimmed

In [None]:
# Predict with coxnet_pred on the trimmed RX == 2 features; the output is used below as the risk score.
# NOTE(review): coxnet_pred must already exist here; a later cell reassigns it from gcv.best_estimator_.
holdout_doce_risk_scores = coxnet_pred.predict(X_holdout_doce_trimmed)

In [None]:
# Event indicators and follow-up times of the RX == 2 cohort as plain arrays.
doce_events = y_holdout_doce['event'].values.astype(bool)
doce_times = y_holdout_doce['time'].values
coxnet_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]

# 1 = risk score above risk_threshold, 0 = otherwise.
holdout_doce_risk_group_labels = (np.asarray(holdout_doce_risk_scores) > risk_threshold).astype(int)

# Kaplan-Meier estimate per risk group. Results go into the *doce* lists;
# appending to holdout_survival_probs / holdout_survival_times would leave
# these lists empty and mix the two cohorts.
holdout_doce_survival_probs = []
holdout_doce_survival_times = []
for group_label in np.unique(holdout_doce_risk_group_labels):
    in_group = holdout_doce_risk_group_labels == group_label
    group_time, group_survival_prob = kaplan_meier_estimator(doce_events[in_group], doce_times[in_group])
    holdout_doce_survival_probs.append(group_survival_prob)
    holdout_doce_survival_times.append(group_time)


In [None]:
# Two-sample log-rank test: risk group 0 versus risk group 1.
doce_group0_mask = holdout_doce_risk_group_labels == 0
doce_group1_mask = holdout_doce_risk_group_labels == 1
results = logrank_test(
    doce_times[doce_group0_mask],
    doce_times[doce_group1_mask],
    event_observed_A=doce_events[doce_group0_mask],
    event_observed_B=doce_events[doce_group1_mask],
)
results.print_summary()

In [None]:
# Same comparison through the multi-group log-rank API; the cell shows its p-value.
doce_results = multivariate_logrank_test(
    doce_times,
    holdout_doce_risk_group_labels,
    event_observed=doce_events,
)
doce_results.p_value

In [None]:
# Attach risk score and risk group to the labels, then order by patient_id.
# The frame is rebuilt from rtog_leg_2 (the same rows, in the same order, that
# produced the risk scores) instead of mutating y_holdout_doce in place.
# Re-running the cell after the sort would otherwise pair scores with the
# wrong patients.
y_holdout_doce = (
    rtog_leg_2[['patient_id', 'event', 'time']]
    .reset_index(drop=True)
    .assign(
        risk_score=holdout_doce_risk_scores,
        risk_group=holdout_doce_risk_group_labels,
    )
    .sort_values(by=['patient_id'])
    .reset_index(drop=True)
)
y_holdout_doce

In [None]:
# Discrimination metrics of the tuned pipeline on the RX == 2 cohort.
print("Metrics for treatment leg cohort")
coxnet_pred = gcv.best_estimator_

doce_holdout_prediction = coxnet_pred.predict(X_holdout_doce_trimmed)

# Predicted survival curves evaluated on an integer time grid spanning the cohort's follow-up.
doce_holdout_survival = coxnet_pred.predict_survival_function(X_holdout_doce_trimmed)
times_holdout_doce = np.arange(int(np.min(y_holdout_doce_survlabel['time'])), int(np.max(y_holdout_doce_survlabel["time"])))
survival_prediction_holdout_doce = np.asarray([[fn(t) for t in times_holdout_doce] for fn in doce_holdout_survival])

c_index_holdout = concordance_index_censored(y_holdout_doce_survlabel["event"], y_holdout_doce_survlabel["time"], doce_holdout_prediction)
print('C-index on Holdout set: {:.3f}'.format(c_index_holdout[0]))

# The IPCW estimate uses the training labels as its first argument.
c_ipcw_holdout = concordance_index_ipcw(y_training_survlabel, y_holdout_doce_survlabel, doce_holdout_prediction)
print('IPCW C-index on Holdout set: {:.3f}'.format(c_ipcw_holdout[0]))

# The integrated Brier score is not computed in this cell. Store NaN rather
# than 0, because a literal 0 reads as a perfect score in the metrics table.
model_metrics_doce = pd.DataFrame({
    'cohort': ["doce"],
    'c_index_holdout': [c_index_holdout[0]],
    'c_index_ipcw_holdout': [c_ipcw_holdout[0]],
    'integrated_brier_score_holdout': [np.nan],
})

In [None]:

# Cox model on the RX == 2 cohort with everything except patient_id and risk_score.
LL_holdout_doce = y_holdout_doce.drop(columns=['patient_id', 'risk_score'])
cph_holdout_doce = CoxPHFitter()
cph_holdout_doce.fit(LL_holdout_doce, duration_col='time', event_col='event', show_progress=False)
doce_holdout_results = cph_holdout_doce.summary

# p-value from the log-rank test between risk groups; hazard ratio and its
# 95% CI from the first row of the Cox summary table.
doce_holdout_p = multivariate_logrank_test(
    y_holdout_doce['time'],
    y_holdout_doce['risk_group'],
    y_holdout_doce['event'],
).p_value
doce_holdout_hr = doce_holdout_results['exp(coef)'].iloc[0]
doce_holdout_ci_lower = doce_holdout_results['exp(coef) lower 95%'].iloc[0]
doce_holdout_ci_upper = doce_holdout_results['exp(coef) upper 95%'].iloc[0]
doce_holdout_log_likelihood = cph_holdout_doce.log_likelihood_

# One single-row column per metric, added in this order.
for _metric_name, _metric_value in {
    'holdout_p_value': doce_holdout_p,
    'holdout_hazard_ratio': doce_holdout_hr,
    'holdout_hr_ci_lower': doce_holdout_ci_lower,
    'holdout_hr_ci_upper': doce_holdout_ci_upper,
    'holdout_log_likelihood': doce_holdout_log_likelihood,
    'holdout_parameters': cph_holdout_doce.params_.shape[0],
}.items():
    model_metrics_doce[_metric_name] = [_metric_value]

# (p, HR, CI lower, CI upper) in the order the KM plotting helper unpacks it.
doce_holdout_data_stats = (doce_holdout_p, doce_holdout_hr, doce_holdout_ci_lower, doce_holdout_ci_upper)

In [None]:
# Score the RX == 2 Cox model on the frame it was fitted on (LL_holdout_doce),
# mirroring the cph_holdout / LL_holdout cell; LL_holdout belongs to the other cohort.
cph_holdout_doce.score(LL_holdout_doce, scoring_method='log_likelihood')

In [None]:
# Print the fitted-model summary table for cph_holdout_doce.
cph_holdout_doce.print_summary()

In [None]:
# Display the frame the cph_holdout_doce model was fitted on.
LL_holdout_doce

In [None]:
# Kaplan-Meier plot of the RT+ADT+CT cohort, split by risk group.
doce_holdout_title_str = f'Test Set - RT+ADT+CT (Docetaxel) treated patients (N={len(y_holdout_doce):d})'
doce_holdout_figure_save_path = 'KM_curve_holdout_treatment.png'
plot_km_curve_lifelines(
    data_df=y_holdout_doce,
    data_stats=doce_holdout_data_stats,
    figure_save_path=doce_holdout_figure_save_path,
    title_str=doce_holdout_title_str,
)

In [None]:
# Compare every selected feature between patients with and without an observed event.
# y_holdout_doce_survlabel was built before y_holdout_doce was re-sorted by
# patient_id, so it is still row-aligned with X_holdout_doce_trimmed.
doce_event_observed = np.asarray(y_holdout_doce_survlabel['event'], dtype=bool)
assert len(doce_event_observed) == len(X_holdout_doce_trimmed), \
    "labels and features must have the same number of rows"

for feature in X_holdout_doce_trimmed:
    feature_values = X_holdout_doce_trimmed[feature].to_numpy()
    # Mann-Whitney U between the two outcome groups (event vs. no event),
    # not between the feature values and the 0/1 event indicator itself.
    pvalue = mannwhitneyu(
        feature_values[doce_event_observed],
        feature_values[~doce_event_observed],
    ).pvalue
    print(feature)
    plt.figure(figsize=(5,4))
    # Feature and outcome come from the same rows, so the plot is row-aligned.
    raincloud_df = pd.DataFrame({'event': doce_event_observed.astype(int), feature: feature_values})
    pt.RainCloud(x='event', y=feature, data=raincloud_df)
    plt.title(f'p: {pvalue:.2}')
    plt.show()

### APIC-low patients from both arms

In [None]:
# Risk-group-0 patients of the RT+ADT+CT cohort.
chemo_favorable_risk = y_holdout_doce.loc[y_holdout_doce['risk_group'] == 0]

In [None]:
# Display the risk-group-0 subset of y_holdout_doce.
chemo_favorable_risk

In [None]:
# Flag these patients as chemo-treated. `.assign` returns a new frame, which
# avoids writing into a filtered slice (SettingWithCopyWarning) and is safe to re-run.
chemo_favorable_risk = chemo_favorable_risk.assign(chemo=1)

In [None]:
# Risk-group-0 patients of the y_holdout cohort.
no_chemo_favorable_risk = y_holdout.loc[y_holdout['risk_group'] == 0]

In [None]:
# Flag these patients as not chemo-treated. `.assign` returns a new frame, which
# avoids writing into a filtered slice (SettingWithCopyWarning) and is safe to re-run.
no_chemo_favorable_risk = no_chemo_favorable_risk.assign(chemo=0)

In [None]:
# Display the risk-group-0 subset of y_holdout.
no_chemo_favorable_risk

In [None]:
# Stack the risk-group-0 patients of both cohorts row-wise (chemo first).
favorable_risk = pd.concat(
    [chemo_favorable_risk, no_chemo_favorable_risk],
    axis=0,
)

In [None]:
# Display the combined risk-group-0 frame.
favorable_risk

In [None]:
# Display LL_holdout_doce as it stands before the next cell reassigns it.
LL_holdout_doce

In [None]:
# Chemo effect among the APIC-low (risk_group == 0) patients of both arms:
# Cox model on the 'chemo' indicator plus a log-rank test between chemo groups.
# NOTE: LL_holdout_doce, cph_holdout_doce, doce_holdout_results and
# doce_holdout_data_stats are reused from the earlier cell and now describe this subgroup.
LL_holdout_doce = favorable_risk.drop(columns=['patient_id', 'risk_group', 'risk_score'])
cph_holdout_doce = CoxPHFitter()
cph_holdout_doce.fit(LL_holdout_doce, duration_col='time', event_col='event', show_progress=False)

doce_holdout_results = cph_holdout_doce.summary
doce_holdout_p = multivariate_logrank_test(favorable_risk['time'], favorable_risk['chemo'], favorable_risk['event']).p_value
# Read the hazard ratio from the 'chemo' row by name, not by position.
doce_holdout_hr = doce_holdout_results.loc['chemo', 'exp(coef)']
doce_holdout_ci_lower = doce_holdout_results.loc['chemo', 'exp(coef) lower 95%']
doce_holdout_ci_upper = doce_holdout_results.loc['chemo', 'exp(coef) upper 95%']
doce_holdout_log_likelihood = cph_holdout_doce.log_likelihood_

# Store under subgroup-specific columns so the holdout_* risk-group metrics
# already recorded in model_metrics_doce are not overwritten.
model_metrics_doce['apic_low_chemo_p_value'] = [doce_holdout_p]
model_metrics_doce['apic_low_chemo_hazard_ratio'] = [doce_holdout_hr]
model_metrics_doce['apic_low_chemo_hr_ci_lower'] = [doce_holdout_ci_lower]
model_metrics_doce['apic_low_chemo_hr_ci_upper'] = [doce_holdout_ci_upper]
model_metrics_doce['apic_low_chemo_log_likelihood'] = [doce_holdout_log_likelihood]
model_metrics_doce['apic_low_chemo_parameters'] = [cph_holdout_doce.params_.shape[0]]

# (p, HR, CI lower, CI upper) in the order the KM plotting helper unpacks it.
doce_holdout_data_stats = (doce_holdout_p, doce_holdout_hr, doce_holdout_ci_lower, doce_holdout_ci_upper)

In [None]:
# Show the (p, HR, CI lower, CI upper) tuple computed in the previous cell.
doce_holdout_data_stats

In [None]:
# Print the summary of the Cox model most recently assigned to cph_holdout_doce.
cph_holdout_doce.print_summary()

In [None]:
def plot_km_same_risk_group(data_df1, data_df2, data_stats, figure_save_path, title_str):
    """Plot Kaplan-Meier curves of two treatment arms and save the figure.

    Parameters
    ----------
    data_df1 : pandas.DataFrame
        Patients drawn as the "RT+ADT" curve; needs 'time' and 'event' columns.
    data_df2 : pandas.DataFrame
        Patients drawn as the "RT+ADT+CT" curve; needs 'time' and 'event' columns.
    data_stats : tuple
        ``(p_value, hazard_ratio, ci_lower, ci_upper)`` written onto the plot.
    figure_save_path : str or path-like
        Destination passed to ``fig.savefig``.
    title_str : str
        Axes title.

    Returns
    -------
    None
        The figure is saved to ``figure_save_path`` and shown.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 12))

    kmf_rt_adt = KaplanMeierFitter()
    kmf_rt_adt.fit(data_df1['time'], event_observed=data_df1['event'], label='RT+ADT')
    kmf_rt_adt.plot_survival_function(ax=ax, color='#f8766d', lw=2, show_censors=True)

    kmf_rt_adt_ct = KaplanMeierFitter()
    kmf_rt_adt_ct.fit(data_df2['time'], event_observed=data_df2['event'], label='RT+ADT+CT')
    kmf_rt_adt_ct.plot_survival_function(ax=ax, color='#03bfc4', lw=2, show_censors=True)

    # The curves are treatment arms, so the medians are reported per arm.
    print("RT+ADT+CT median survival time: ", kmf_rt_adt_ct.median_survival_time_)
    print("RT+ADT median survival time: ", kmf_rt_adt.median_survival_time_)

    # Dashed guides marking the median survival times. A median that was never
    # reached is reported as infinity: the horizontal guide then spans the whole
    # axis and no vertical guide is drawn for that arm.
    guide_style = dict(color='black', linestyle='--', lw=1)
    median_times = (kmf_rt_adt.median_survival_time_, kmf_rt_adt_ct.median_survival_time_)
    longest_median = max(median_times)
    if np.isinf(longest_median):
        ax.axhline(y=0.5, **guide_style)
    else:
        ax.plot([0, longest_median], [0.5, 0.5], **guide_style)
    for median_time in median_times:
        if np.isfinite(median_time):
            ax.plot([median_time, median_time], [0, 0.5], **guide_style)

    ax.set_title(title_str, fontsize=26)

    # Relabel the current ticks: y rounded to one decimal, x truncated to integers.
    yticks = [np.round(x, 1) for x in ax.get_yticks()]
    ax.set_yticklabels(yticks, fontsize=20)
    ax.set_xticklabels(ax.get_xticks().astype(int), fontsize=20)

    ax.set_xlabel('Time (Years)', fontsize=28)
    ax.set_ylabel('Overall Survival Probability', fontsize=28)

    data_p, data_hr, data_ci_lower, data_ci_upper = data_stats
    # Scientific notation for very small p-values, four decimals otherwise.
    p_text = f"{data_p:.1e}" if data_p < 0.001 else f"{data_p:.4f}"
    data_stats_text = f'p: {p_text}\nHR: {data_hr:.2f} [95% CI: {data_ci_lower:.2f} - {data_ci_upper:.2f}]'
    ax.text(0.03, 0.1, data_stats_text, transform=ax.transAxes, fontsize=24, verticalalignment='bottom')

    sns.despine()
    # Number-at-risk table for both curves, attached to the same axes.
    add_at_risk_counts(kmf_rt_adt, kmf_rt_adt_ct, ax=ax, fontsize=20)
    ax.legend(fontsize=24)
    plt.tight_layout()

    fig.savefig(figure_save_path, bbox_inches='tight')
    plt.show()

In [None]:
# Show the risk-score cut-off used above to assign risk groups.
risk_threshold

In [None]:
# KM comparison of the two treatment arms among risk-group-0 patients.
# Uses the doce_holdout_data_stats most recently computed for this subgroup.
plot_km_same_risk_group(
    data_df1=no_chemo_favorable_risk,
    data_df2=chemo_favorable_risk,
    data_stats=doce_holdout_data_stats,
    figure_save_path='APIC_low_both_legs_km.png',
    title_str="APIC-low RT+ADT vs RT+ADT+CT",
)

In [None]:
def calculate_survival_benefit(data_df1, data_df2, time_points=(1, 3, 5, 7, 10)):
    """Difference in Kaplan-Meier survival between two groups at given times.

    Parameters
    ----------
    data_df1 : pandas.DataFrame
        Reference group; needs 'time' and 'event' columns.
    data_df2 : pandas.DataFrame
        Comparison group; needs 'time' and 'event' columns.
    time_points : iterable of numbers, optional
        Times (same unit as the 'time' column) at which to compare the curves.

    Returns
    -------
    dict
        Maps each time point to ``(S2(t) - S1(t)) * 100``, i.e. the survival
        of ``data_df2`` minus that of ``data_df1`` in percentage points.
    """
    kmf_reference = KaplanMeierFitter()
    kmf_reference.fit(data_df1['time'], event_observed=data_df1['event'])

    kmf_comparison = KaplanMeierFitter()
    kmf_comparison.fit(data_df2['time'], event_observed=data_df2['event'])

    # predict(t) evaluates the fitted KM curve at t.
    return {
        t: (kmf_comparison.predict(t) - kmf_reference.predict(t)) * 100
        for t in time_points
    }

# Survival of the chemo arm minus the no-chemo arm (percentage points) among
# risk-group-0 patients, at the function's default time points.
benefits = calculate_survival_benefit(
    data_df1=no_chemo_favorable_risk,
    data_df2=chemo_favorable_risk,
)
print("Survival Benefits at specified time points:", benefits)


In [None]:
# Risk-group-1 patients of each cohort.
chemo_bad_risk = y_holdout_doce.loc[y_holdout_doce['risk_group'] == 1]
no_chemo_bad_risk = y_holdout.loc[y_holdout['risk_group'] == 1]

In [None]:
# Flag these patients as chemo-treated. `.assign` returns a new frame, which
# avoids writing into a filtered slice (SettingWithCopyWarning) and is safe to re-run.
chemo_bad_risk = chemo_bad_risk.assign(chemo=1)

In [None]:
# Flag these patients as not chemo-treated. `.assign` returns a new frame, which
# avoids writing into a filtered slice (SettingWithCopyWarning) and is safe to re-run.
no_chemo_bad_risk = no_chemo_bad_risk.assign(chemo=0)

In [None]:
# Stack the risk-group-1 patients of both cohorts row-wise (chemo first).
bad_risk = pd.concat(
    [chemo_bad_risk, no_chemo_bad_risk],
    axis=0,
)

In [None]:
# Chemo effect among the APIC-high (risk_group == 1) patients of both arms:
# Cox model on the 'chemo' indicator plus a log-rank test between chemo groups.
# NOTE: LL_holdout_doce, cph_holdout_doce, doce_holdout_results and
# doce_holdout_data_stats are reused from earlier cells and now describe this subgroup.
LL_holdout_doce = bad_risk.drop(columns=['patient_id', 'risk_group', 'risk_score'])
cph_holdout_doce = CoxPHFitter()
cph_holdout_doce.fit(LL_holdout_doce, duration_col='time', event_col='event', show_progress=False)

doce_holdout_results = cph_holdout_doce.summary
doce_holdout_p = multivariate_logrank_test(bad_risk['time'], bad_risk['chemo'], bad_risk['event']).p_value
# Read the hazard ratio from the 'chemo' row by name, not by position.
doce_holdout_hr = doce_holdout_results.loc['chemo', 'exp(coef)']
doce_holdout_ci_lower = doce_holdout_results.loc['chemo', 'exp(coef) lower 95%']
doce_holdout_ci_upper = doce_holdout_results.loc['chemo', 'exp(coef) upper 95%']
doce_holdout_log_likelihood = cph_holdout_doce.log_likelihood_

# Store under subgroup-specific columns so the holdout_* risk-group metrics
# already recorded in model_metrics_doce are not overwritten.
model_metrics_doce['apic_high_chemo_p_value'] = [doce_holdout_p]
model_metrics_doce['apic_high_chemo_hazard_ratio'] = [doce_holdout_hr]
model_metrics_doce['apic_high_chemo_hr_ci_lower'] = [doce_holdout_ci_lower]
model_metrics_doce['apic_high_chemo_hr_ci_upper'] = [doce_holdout_ci_upper]
model_metrics_doce['apic_high_chemo_log_likelihood'] = [doce_holdout_log_likelihood]
model_metrics_doce['apic_high_chemo_parameters'] = [cph_holdout_doce.params_.shape[0]]

# (p, HR, CI lower, CI upper) in the order the KM plotting helper unpacks it.
doce_holdout_data_stats = (doce_holdout_p, doce_holdout_hr, doce_holdout_ci_lower, doce_holdout_ci_upper)

In [None]:
# Display the Cox summary table assigned in the previous cell.
doce_holdout_results

In [None]:
# KM comparison of the two treatment arms among risk-group-1 patients.
# Uses the doce_holdout_data_stats most recently computed for this subgroup.
plot_km_same_risk_group(
    data_df1=no_chemo_bad_risk,
    data_df2=chemo_bad_risk,
    data_stats=doce_holdout_data_stats,
    figure_save_path='APIC_high_both_legs_km.png',
    title_str="APIC-high RT+ADT vs RT+ADT+CT",
)

In [None]:
# Survival of the chemo arm minus the no-chemo arm (percentage points) among
# risk-group-1 patients, at the function's default time points.
benefits = calculate_survival_benefit(
    data_df1=no_chemo_bad_risk,
    data_df2=chemo_bad_risk,
)
print("Survival Benefits at specified time points:", benefits)

In [None]:
# Print the summary of the Cox model most recently assigned to cph_holdout_doce.
cph_holdout_doce.print_summary()