In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from lifelines.utils import concordance_index
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
import matplotlib.ticker as mticker
from sksurv.tree import SurvivalTree
from sklearn.metrics import make_scorer
from sklearn.tree import plot_tree

In [3]:
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

In [4]:
# select features
X_surv_train = df_train.drop(columns=['TIMETOEVENT', 'MORTSTAT'])
X_surv_test = df_test.drop(columns=['TIMETOEVENT', 'MORTSTAT'])
# select target
y_surv_train = np.array([(event, time) for event, time in zip(df_train['MORTSTAT'], df_train['TIMETOEVENT'])], dtype=[('MORTSTAT', bool), ('TIMETOEVENT', float)])
y_surv_test = np.array([(event, time) for event, time in zip(df_test['MORTSTAT'], df_test['TIMETOEVENT'])], dtype=[('MORTSTAT', bool), ('TIMETOEVENT', float)])

In [5]:
def c_index_classification(y_true, y_pred_proba, times):
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    times = np.asarray(times)
    cc = dc = tp = 0

    n = len(y_true)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            if times[i] < times[j] and y_true[i] == 1:
                if y_pred_proba[i] < y_pred_proba[j]:
                    cc += 1
                elif y_pred_proba[i] > y_pred_proba[j]:
                    dc += 1
                else:
                    tp += 1

        
    return (cc + 0.5 * tp) / (cc + dc + tp)

In [6]:
# define scorer

In [7]:
def ll3y(estimator, X, y):
    event_indicators = np.where((y["MORTSTAT"] == 1) & (y["TIMETOEVENT"] <= 3), 0, 1)

    # Get predicted survival functions
    surv_funcs = estimator.predict_survival_function(X)

    # Get time points (assuming all survival functions have same time grid)
    time_points = surv_funcs[0].x

    # Extract survival probabilities at 3 years
    surv_prob_3y = np.array([fn(time_points[3]) for fn in surv_funcs])

    # Compute log-loss
    logloss = log_loss(event_indicators, surv_prob_3y)

    return -logloss  

In [8]:
def bs3y(estimator, X, y):
    event_indicators = np.where((y["MORTSTAT"] == 1) & (y["TIMETOEVENT"] <= 3), 0, 1)

    # Get predicted survival functions
    surv_funcs = estimator.predict_survival_function(X)

    # Get time points (assuming all survival functions have same time grid)
    time_points = surv_funcs[0].x

    # Extract survival probabilities at 3 years
    surv_prob_3y = np.array([fn(time_points[3]) for fn in surv_funcs])

    # Compute log-loss
    bs = brier_score_loss(event_indicators, surv_prob_3y)

    return -bs  

In [9]:
# SurvTree

In [10]:
# hyperparamter search 0 (scorer=log-loss)

In [11]:
param_grid_st0 = {'max_depth': [2, 3, 4, 5, 6]}

grid_search_st1 = GridSearchCV(estimator=SurvivalTree(), param_grid=param_grid_st0, scoring=ll3y, cv=4)

grid_search_st1.fit(X_surv_train, y_surv_train)
print(grid_search_st1.best_params_)

{'max_depth': 4}


In [12]:
results_df = pd.DataFrame(grid_search_st1.cv_results_)

# extract relevant columns
results_df = results_df[['rank_test_score', 'param_max_depth', 'mean_test_score']]

# convert negative log-loss to positive (since scoring='neg_log_loss')
results_df['ll3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(5)

Unnamed: 0,rank_test_score,param_max_depth,ll3y
2,1,4,0.1229
1,2,3,0.1244
3,3,5,0.1249
0,4,2,0.1278
4,5,6,0.132


In [13]:
# ST0L

In [14]:
st0l = SurvivalTree(max_depth=4)

start_time = time.time()
st0l.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.00 seconds


In [15]:
# predict survival function
surv_probs_train = st0l.predict_survival_function(X_surv_train)
surv_probs_test = st0l.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is = c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1213
Log Loss os: 0.1210
BS is: 0.0310
BS os: 0.0306
C is: 0.8500


In [16]:
#_______________________________________________________________________________________________________________________________

In [17]:
# hyperparamter search 0 (scorer=Brier-score)

In [18]:
grid_search_st2 = GridSearchCV(estimator=SurvivalTree(), param_grid=param_grid_st0, scoring=bs3y, cv=4)

grid_search_st2.fit(X_surv_train, y_surv_train)
print(grid_search_st2.best_params_)

{'max_depth': 5}


In [19]:
results_df = pd.DataFrame(grid_search_st2.cv_results_)

# extract relevant columns
results_df = results_df[['rank_test_score', 'param_max_depth', 'mean_test_score']]
results_df['bs3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(5)

Unnamed: 0,rank_test_score,param_max_depth,bs3y
3,1,5,0.0313
4,2,6,0.0313
2,3,4,0.0314
1,4,3,0.0316
0,5,2,0.0319


In [20]:
# ST0B

In [21]:
st0b = SurvivalTree(max_depth=5)

start_time = time.time()
st0b.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.32 seconds


In [22]:
# predict survival function
surv_probs_train = st0b.predict_survival_function(X_surv_train)
surv_probs_test = st0b.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1191
Log Loss os: 0.1222
BS is: 0.0306
BS os: 0.0304
C is: 0.8585


In [23]:
#_______________________________________________________________________________________________________________________________

In [24]:
# hyperparamter search (scorer=log-loss)

In [25]:
param_grid_st1 = {'min_samples_leaf': [200, 400, 500, 600],
                  'max_leaf_nodes': [15, 25, 35, 40, 50]}

grid_search_st3 = GridSearchCV(estimator=SurvivalTree(), param_grid=param_grid_st1, scoring=ll3y, cv=4, n_jobs=-1, verbose=1)

grid_search_st3.fit(X_surv_train, y_surv_train)
print(grid_search_st3.best_params_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
{'max_leaf_nodes': 35, 'min_samples_leaf': 200}


In [26]:
results_df = pd.DataFrame(grid_search_st3.cv_results_)

# extract relevant columns
results_df = results_df[['rank_test_score', 'param_min_samples_leaf', 'param_max_leaf_nodes', 'mean_test_score']]
results_df['ll3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(5)

Unnamed: 0,rank_test_score,param_min_samples_leaf,param_max_leaf_nodes,ll3y
8,1,200,35,0.1198
16,2,200,50,0.1198
13,3,400,40,0.1199
9,4,400,35,0.1199
5,5,400,25,0.12


In [27]:
# ST1L

In [28]:
st1l = SurvivalTree(max_leaf_nodes=35, min_samples_leaf=200)

start_time = time.time()
st1l.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.46 seconds


In [29]:
# predict survival function
surv_probs_train = st1l.predict_survival_function(X_surv_train)
surv_probs_test = st1l.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1174
Log Loss os: 0.1160
BS is: 0.0306
BS os: 0.0303
C is: 0.8705


In [30]:
# ST2L

In [31]:
st2l = SurvivalTree(max_leaf_nodes=50, min_samples_leaf=200)

start_time = time.time()
st2l.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 2.23 seconds


In [32]:
# predict survival function
surv_probs_train = st2l.predict_survival_function(X_surv_train)
surv_probs_test = st2l.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1163
Log Loss os: 0.1155
BS is: 0.0305
BS os: 0.0301
C is: 0.8751


In [33]:
# ST3L

In [34]:
st3l = SurvivalTree(max_leaf_nodes=40, min_samples_leaf=400)

start_time = time.time()
st3l.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.29 seconds


In [35]:
# predict survival function
surv_probs_train = st3l.predict_survival_function(X_surv_train)
surv_probs_test = st3l.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1173
Log Loss os: 0.1160
BS is: 0.0308
BS os: 0.0303
C is: 0.8727


In [36]:
#_______________________________________________________________________________________________________________________________

In [37]:
# hyperparamter search (scorer=Brier score)

In [38]:
grid_search_st4 = GridSearchCV(estimator=SurvivalTree(), param_grid=param_grid_st1, scoring=bs3y, cv=4, n_jobs=-1, verbose=1)

grid_search_st4.fit(X_surv_train, y_surv_train)
print(grid_search_st4.best_params_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
{'max_leaf_nodes': 50, 'min_samples_leaf': 200}


In [39]:
results_df = pd.DataFrame(grid_search_st4.cv_results_)

# extract relevant columns
results_df = results_df[['rank_test_score', 'param_min_samples_leaf', 'param_max_leaf_nodes', 'mean_test_score']]

results_df['bs3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(5)

Unnamed: 0,rank_test_score,param_min_samples_leaf,param_max_leaf_nodes,bs3y
16,1,200,50,0.031
12,2,200,40,0.031
8,3,200,35,0.0311
13,4,400,40,0.0311
9,5,400,35,0.0311


In [40]:
# ST1B

In [41]:
st1b = SurvivalTree(max_leaf_nodes=50, min_samples_leaf=200)

start_time = time.time()
st1b.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.65 seconds


In [42]:
# predict survival function
surv_probs_train = st1b.predict_survival_function(X_surv_train)
surv_probs_test = st1b.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1163
Log Loss os: 0.1155
BS is: 0.0305
BS os: 0.0301
C is: 0.8751


In [43]:
# ST2B

In [44]:
st2b = SurvivalTree(max_leaf_nodes=40, min_samples_leaf=200)

start_time = time.time()
st2b.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.58 seconds


In [45]:
# predict survival function
surv_probs_train = st2b.predict_survival_function(X_surv_train)
surv_probs_test = st2b.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1171
Log Loss os: 0.1160
BS is: 0.0306
BS os: 0.0303
C is: 0.8725


In [46]:
# ST3B

In [47]:
st3b = SurvivalTree(max_leaf_nodes=35, min_samples_leaf=200)

start_time = time.time()
st3b.fit(X_surv_train, y_surv_train)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Training time: 1.46 seconds


In [48]:
# predict survival function
surv_probs_train = st3b.predict_survival_function(X_surv_train)
surv_probs_test = st3b.predict_survival_function(X_surv_test)

# all time-to-event points
time_points_train = surv_probs_train[0].x
time_points_test = surv_probs_test[0].x 
 
# convert survival probabilities to df
df_surv_probs_train = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_train],  
    columns=time_points_train)
df_surv_probs_test = pd.DataFrame(
    data=[fn(time_points_train) for fn in surv_probs_test],  
    columns=time_points_train)

# surv probabilites after 1 and 3 years
suvr_prob_3y_train = df_surv_probs_train.iloc[:,3]
suvr_prob_3y_test = df_surv_probs_test.iloc[:,3]

# extract status
# get 3-year event status based on TIMETOEVENT and MORTSTAT
status_3y_train_surv = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 0, 1)
status_3y_test_surv = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 0, 1)

# log-loss
log_loss_3y_train = log_loss(status_3y_train_surv, suvr_prob_3y_train)
log_loss_3y_test = log_loss(status_3y_test_surv, suvr_prob_3y_test)

# compute Brier
bs_train_3y = brier_score_loss(status_3y_train_surv, suvr_prob_3y_train)
bs_test_3y = brier_score_loss(status_3y_test_surv, suvr_prob_3y_test)

# c-index
c_is =  c_index_classification(df_train["MORTSTAT"], suvr_prob_3y_train, df_train['TIMETOEVENT'])

print(f'Log Loss is: {log_loss_3y_train:.4f}')
print(f'Log Loss os: {log_loss_3y_test:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1174
Log Loss os: 0.1160
BS is: 0.0306
BS os: 0.0303
C is: 0.8705
