In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from lifelines.utils import concordance_index
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
import matplotlib.ticker as mticker

In [3]:
# Read the train and test tables from CSV files in the working directory
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('df_train.csv', 'df_test.csv'))

In [4]:
# Define three-year mortality status: 1 when MORTSTAT == 1 and TIMETOEVENT <= 3, otherwise 0.
# NOTE(review): rows with MORTSTAT != 1 and TIMETOEVENT <= 3 (presumably censored before
# three years) are labelled 0 as well; confirm this is intended.
for frame in (df_train, df_test):
    frame.loc[:,'MORTSTAT3Y'] = np.where((frame["MORTSTAT"] == 1) & (frame["TIMETOEVENT"] <= 3), 1, 0)

In [5]:
# Features: every column except the derived label and the two raw outcome columns
X_train_3y, X_test_3y = (
    frame.drop(columns=['MORTSTAT3Y', 'TIMETOEVENT', 'MORTSTAT'])
    for frame in (df_train, df_test)
)

# Target: the three-year mortality indicator
y_train_3y, y_test_3y = df_train['MORTSTAT3Y'], df_test['MORTSTAT3Y']

In [6]:
def c_index_classification(y_true, y_pred_proba, times):
    """Concordance index of predicted risk against observed event times.

    A pair ``(i, j)`` is comparable when subject ``i`` has an event
    (``y_true[i] == 1``) and a strictly shorter time than subject ``j``
    (``times[i] < times[j]``); pairs with equal times are never comparable.
    A comparable pair is concordant when ``i`` received the higher predicted
    probability, discordant when it received the lower one, and tied
    otherwise.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Event indicator; only entries equal to 1 are treated as events.
    y_pred_proba : array-like of shape (n,)
        Predicted score, where a higher value means higher risk.
    times : array-like of shape (n,)
        Observed time for each subject.

    Returns
    -------
    float
        ``(concordant + 0.5 * tied) / comparable``.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    ZeroDivisionError
        If there is no comparable pair (for example no events, or all
        times equal), so the index is undefined.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    times = np.asarray(times)

    if not (len(y_true) == len(y_pred_proba) == len(times)):
        raise ValueError(
            "y_true, y_pred_proba and times must have the same length, got "
            f"{len(y_true)}, {len(y_pred_proba)} and {len(times)}"
        )

    cc = dc = tp = 0

    # Only subjects with an event can be the "earlier" member of a pair, so
    # iterate over those and compare against all later subjects in one
    # vectorized step instead of a Python-level inner loop.
    for i in np.flatnonzero(y_true == 1):
        later_scores = y_pred_proba[times > times[i]]
        higher = int(np.count_nonzero(y_pred_proba[i] > later_scores))
        lower = int(np.count_nonzero(y_pred_proba[i] < later_scores))
        cc += higher
        dc += lower
        # Whatever is neither higher nor lower counts as a tie.
        tp += int(later_scores.size) - higher - lower

    comparable = cc + dc + tp
    if comparable == 0:
        raise ZeroDivisionError(
            "No comparable pairs: the concordance index is undefined"
        )
    return (cc + 0.5 * tp) / comparable

In [7]:
# XGBClassifier

In [8]:
# hyperparameter search (scorer=log-loss)

In [9]:
# Hyperparameter grid shared by the grid searches (3 * 4 * 3 * 3 * 3 = 324 combinations)
param_grid_xgbc = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05, 0.1],
    'subsample': [0.2, 0.5, 0.8],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [2, 3, 4],
}

# Exhaustive 4-fold cross-validated search scored by negative log loss
grid_search_gbc1 = GridSearchCV(
    estimator=XGBClassifier(random_state=42, objective='binary:logistic'),
    param_grid=param_grid_xgbc,
    scoring='neg_log_loss',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_search_gbc1.fit(X_train_3y, y_train_3y)

print(grid_search_gbc1.best_params_)

Fitting 4 folds for each of 324 candidates, totalling 1296 fits


  _data = np.array(data, dtype=dtype, copy=copy,


{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.2}


In [10]:
# Summarise the log-loss grid search: keep the rank, the hyperparameters and the mean CV
# score, turn the negated score back into a positive log loss ('ll3y'), and sort best-first
results_df = (
    pd.DataFrame(grid_search_gbc1.cv_results_)
    .loc[:, ['rank_test_score', 'param_n_estimators', 'param_learning_rate', 'param_subsample',
             'param_colsample_bytree', 'param_max_depth', 'mean_test_score']]
    .assign(ll3y=lambda cv: (-cv['mean_test_score']).round(4))
    .drop(columns=['mean_test_score'])
    .sort_values(by='rank_test_score')
)
results_df.head(3)

Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_subsample,param_colsample_bytree,param_max_depth,ll3y
285,1,200,0.05,0.2,1.0,3,0.112
178,2,200,0.05,0.5,0.8,3,0.112
286,3,200,0.05,0.5,1.0,3,0.112


In [11]:
# XGBC1L

In [12]:
# Classifier using the hyperparameters of the rank-1 candidate from the log-loss grid search
xgbc1l = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.2,
    colsample_bytree=1,
    max_depth=3,
)

# Fit on the training split and report the wall-clock duration
start_time = time.time()
xgbc1l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.2f} seconds")

Training Time: 265.64 seconds


In [13]:
# get predicted probabilities
y_pred_proba_train_3y = xgbc1l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = xgbc1l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1080
Log Loss os: 0.1091
BS is: 0.0286
BS os: 0.0291
C is: 0.8967


In [14]:
# XGBC2L

In [15]:
# Classifier using the hyperparameters of the rank-2 candidate from the log-loss grid search
xgbc2l = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.5,
    colsample_bytree=0.8,
    max_depth=3,
)

# Fit on the training split and report the wall-clock duration
start_time = time.time()
xgbc2l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.2f} seconds")

Training Time: 1.33 seconds


In [16]:
# get predicted probabilities
y_pred_proba_train_3y = xgbc2l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = xgbc2l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1079
Log Loss os: 0.1091
BS is: 0.0285
BS os: 0.0290
C is: 0.8968


In [17]:
# XGBC3L

In [18]:
# Classifier using the hyperparameters of the rank-3 candidate from the log-loss grid search
xgbc3l = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.5,
    colsample_bytree=1,
    max_depth=3,
)

# Fit on the training split and report the wall-clock duration
start_time = time.time()
xgbc3l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.2f} seconds")

Training Time: 1.35 seconds


In [19]:
# get predicted probabilities
y_pred_proba_train_3y = xgbc3l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = xgbc3l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1078
Log Loss os: 0.1091
BS is: 0.0286
BS os: 0.0290
C is: 0.8969


In [20]:
# hyperparameter search (scorer=Brier-score)

In [21]:
# Same grid and 4-fold cross-validation as the log-loss search, scored by negative Brier score
grid_search_gbc2 = GridSearchCV(
    estimator=XGBClassifier(random_state=42, objective='binary:logistic'),
    param_grid=param_grid_xgbc,
    scoring='neg_brier_score',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_search_gbc2.fit(X_train_3y, y_train_3y)

print(grid_search_gbc2.best_params_)

Fitting 4 folds for each of 324 candidates, totalling 1296 fits
{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.5}


In [22]:
# Summarise the Brier-score grid search: keep the rank, the hyperparameters and the mean CV
# score, turn the negated score back into a positive Brier score ('bs3y'), and sort best-first
results_df = (
    pd.DataFrame(grid_search_gbc2.cv_results_)
    .loc[:, ['rank_test_score', 'param_n_estimators', 'param_learning_rate', 'param_subsample',
             'param_colsample_bytree', 'param_max_depth', 'mean_test_score']]
    .assign(bs3y=lambda cv: (-cv['mean_test_score']).round(4))
    .drop(columns=['mean_test_score'])
    .sort_values(by='rank_test_score')
)
results_df.head(3)

Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_subsample,param_colsample_bytree,param_max_depth,bs3y
286,1,200,0.05,0.5,1.0,3,0.0295
70,2,200,0.05,0.5,0.6,3,0.0295
178,3,200,0.05,0.5,0.8,3,0.0295


In [23]:
# XGBC1B

In [24]:
# Classifier using the hyperparameters of the rank-1 candidate from the Brier-score grid search
xgbc1b = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.5,
    colsample_bytree=1,
    max_depth=3,
)

# Fit on the training split and report the wall-clock duration
start_time = time.time()
xgbc1b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.2f} seconds")

Training Time: 2.61 seconds


In [25]:
# get predicted probabilities
y_pred_proba_train_3y = xgbc1b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = xgbc1b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1078
Log Loss os: 0.1091
BS is: 0.0286
BS os: 0.0290
C is: 0.8969


In [26]:
# XGBC2B

In [27]:
# Classifier using the hyperparameters of the rank-2 candidate from the Brier-score grid search
xgbc2b = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.5,
    colsample_bytree=0.6,
    max_depth=3,
)

# Fit on the training split and report the wall-clock duration
start_time = time.time()
xgbc2b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.2f} seconds")

Training Time: 1.41 seconds


In [28]:
# get predicted probabilities
y_pred_proba_train_3y = xgbc2b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = xgbc2b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1082
Log Loss os: 0.1092
BS is: 0.0286
BS os: 0.0290
C is: 0.8962


In [29]:
# XGBC3B

In [30]:
# Classifier using the hyperparameters of the rank-3 candidate from the Brier-score grid search
xgbc3b = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.5,
    colsample_bytree=0.8,
    max_depth=3,
)

# Fit on the training split and report the wall-clock duration
start_time = time.time()
xgbc3b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.2f} seconds")

Training Time: 1.36 seconds


In [31]:
# get predicted probabilities
y_pred_proba_train_3y = xgbc3b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = xgbc3b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1079
Log Loss os: 0.1091
BS is: 0.0285
BS os: 0.0290
C is: 0.8968
