In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.colors as mcolors
from lifelines.utils import concordance_index
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
import matplotlib.ticker as mticker
from sklearn.ensemble import GradientBoostingClassifier  
from itertools import product

In [3]:
# load the train and test splits from CSV files in the working directory
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('df_train.csv', 'df_test.csv')
)

In [4]:
# define three-year mortality status: 1 when MORTSTAT == 1 and TIMETOEVENT <= 3, else 0
# NOTE(review): assumes TIMETOEVENT is measured in years — confirm against the data dictionary
df_train['MORTSTAT3Y'] = (
    (df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3)
).astype(int)
df_test['MORTSTAT3Y'] = (
    (df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3)
).astype(int)

In [5]:
# select features and target
# features: every column except the derived label and the two columns it was built from
# target: the binary MORTSTAT3Y label
X_train_3y, y_train_3y = (
    df_train.drop(columns=['MORTSTAT3Y', 'TIMETOEVENT', 'MORTSTAT']),
    df_train['MORTSTAT3Y'],
)

X_test_3y, y_test_3y = (
    df_test.drop(columns=['MORTSTAT3Y', 'TIMETOEVENT', 'MORTSTAT']),
    df_test['MORTSTAT3Y'],
)

In [6]:
def c_index_classification(y_true, y_pred_proba, times):
    """Concordance index of predicted probabilities against observed event times.

    A pair (i, j) is comparable when subject i has an event (``y_true[i] == 1``)
    and a strictly earlier time than subject j (``times[i] < times[j]``).
    A comparable pair is concordant when i received the higher predicted
    probability, discordant when it received the lower one, and tied otherwise.

    Parameters
    ----------
    y_true : array-like
        Event indicator per subject; entries equal to 1 mark an event.
    y_pred_proba : array-like
        Predicted score per subject (higher means higher predicted risk).
    times : array-like
        Observed time per subject.

    Returns
    -------
    float
        ``(concordant + 0.5 * tied) / comparable``.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    ZeroDivisionError
        If there is no comparable pair (e.g. no events, or no subject with a
        later time than any event).
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    times = np.asarray(times)

    if not (len(y_true) == len(y_pred_proba) == len(times)):
        raise ValueError(
            "y_true, y_pred_proba and times must have the same length "
            f"(got {len(y_true)}, {len(y_pred_proba)}, {len(times)})"
        )

    cc = dc = tp = 0

    # Only subjects with an event can anchor a comparable pair, so iterate over
    # those rows and compare each against all later subjects in one vectorised step.
    for i in np.flatnonzero(y_true == 1):
        # strict '>' also excludes the subject itself
        later_scores = y_pred_proba[times > times[i]]
        concordant = int(np.count_nonzero(later_scores < y_pred_proba[i]))
        discordant = int(np.count_nonzero(later_scores > y_pred_proba[i]))
        cc += concordant
        dc += discordant
        # everything that is neither higher nor lower counts as a tie
        tp += int(later_scores.size) - concordant - discordant

    comparable = cc + dc + tp
    if comparable == 0:
        raise ZeroDivisionError(
            "no comparable pairs: the c-index is undefined for this input"
        )

    return (cc + 0.5 * tp) / comparable

In [7]:
# GradientBoostingClassifier

In [8]:
# hyperparameter search (scorer=log-loss)

In [9]:
# candidate hyperparameters: 3 values each for 5 parameters -> 243 combinations
# NOTE(review): the recorded results tie max_leaf_nodes=10 and max_leaf_nodes=20 at rank 1;
# with the default max_depth these two settings may be equivalent — confirm before re-running
param_grid_gbc = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.5, 0.8, 1],
    'max_leaf_nodes': [5, 10, 20],
    'max_features': [20, 30, None],
}

# exhaustive 4-fold cross-validated search, ranked by negative log-loss, using all cores
grid_search_gbc1 = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42, loss='log_loss'),
    param_grid=param_grid_gbc,
    cv=4,
    scoring='neg_log_loss',
    n_jobs=-1,
    verbose=2,
)
grid_search_gbc1.fit(X_train_3y, y_train_3y)

print(grid_search_gbc1.best_params_)

Fitting 4 folds for each of 243 candidates, totalling 972 fits
{'learning_rate': 0.05, 'max_features': 20, 'max_leaf_nodes': 10, 'n_estimators': 200, 'subsample': 1}


In [10]:
# keep only the rank, the searched parameters and the mean CV score
results_df = pd.DataFrame(grid_search_gbc1.cv_results_)[[
    'rank_test_score',
    'param_n_estimators',
    'param_learning_rate',
    'param_subsample',
    'param_max_leaf_nodes',
    'param_max_features',
    'mean_test_score',
]]

results_df = (
    results_df
    # scoring='neg_log_loss' reports the negated loss; flip the sign to get the log-loss
    .assign(ll3y=(-results_df['mean_test_score']).round(4))
    .drop(columns=['mean_test_score'])
    # best-ranked candidates first
    .sort_values(by='rank_test_score')
)
results_df.head(3)

Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_subsample,param_max_leaf_nodes,param_max_features,ll3y
98,1,200,0.05,1.0,10,20,0.1125
107,1,200,0.05,1.0,20,20,0.1125
185,3,150,0.1,1.0,20,20,0.1127


In [11]:
# GBC1L

In [12]:
# GBC1L: hyperparameters match the first row of the log-loss grid-search table
gbc1l = GradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=1,
    max_leaf_nodes=10,
    max_features=20,
)

# train; perf_counter is a monotonic high-resolution clock, so the elapsed time
# is not affected by system clock adjustments (unlike time.time)
start_time = time.perf_counter()
gbc1l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.2f} seconds")

Training Time: 21.83 seconds


In [13]:
# predicted probability of the positive class (column 1 of predict_proba) for both splits
y_pred_proba_train_3y, y_pred_proba_test_3y = (
    gbc1l.predict_proba(features)[:, 1] for features in (X_train_3y, X_test_3y)
)

# training-split metrics (printed with the "is" suffix)
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)

# test-split metrics (printed with the "os" suffix)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index on the training split, using the original event flag and time columns
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

# one print call; sep='\n' puts each metric on its own line
print(
    f'Log Loss is: {logloss_train_3y:.4f}',
    f'Log Loss os: {logloss_test_3y:.4f}',
    f'BS is: {bs_train_3y:.4f}',
    f'BS os: {bs_test_3y:.4f}',
    f'C is: {c_is:.4f}',
    sep='\n',
)

Log Loss is: 0.1082
Log Loss os: 0.1099
BS is: 0.0285
BS os: 0.0292
C is: 0.8948


In [14]:
# GBC2L

In [15]:
# GBC2L: hyperparameters match the second row of the log-loss grid-search table
gbc2l = GradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=1,
    max_leaf_nodes=20,
    max_features=20,
)

# train; perf_counter is a monotonic high-resolution clock, so the elapsed time
# is not affected by system clock adjustments (unlike time.time)
start_time = time.perf_counter()
gbc2l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.2f} seconds")

Training Time: 17.83 seconds


In [16]:
# predicted probability of the positive class (column 1 of predict_proba) for both splits
y_pred_proba_train_3y, y_pred_proba_test_3y = (
    gbc2l.predict_proba(features)[:, 1] for features in (X_train_3y, X_test_3y)
)

# training-split metrics (printed with the "is" suffix)
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)

# test-split metrics (printed with the "os" suffix)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index on the training split, using the original event flag and time columns
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

# one print call; sep='\n' puts each metric on its own line
print(
    f'Log Loss is: {logloss_train_3y:.4f}',
    f'Log Loss os: {logloss_test_3y:.4f}',
    f'BS is: {bs_train_3y:.4f}',
    f'BS os: {bs_test_3y:.4f}',
    f'C is: {c_is:.4f}',
    sep='\n',
)

Log Loss is: 0.1082
Log Loss os: 0.1099
BS is: 0.0285
BS os: 0.0292
C is: 0.8948


In [17]:
# GBC3L

In [18]:
# GBC3L: hyperparameters match the third row of the log-loss grid-search table
gbc3l = GradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    n_estimators=150,
    learning_rate=0.10,
    subsample=1,
    max_leaf_nodes=20,
    max_features=20,
)

# train; perf_counter is a monotonic high-resolution clock, so the elapsed time
# is not affected by system clock adjustments (unlike time.time)
start_time = time.perf_counter()
gbc3l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.2f} seconds")

Training Time: 13.49 seconds


In [19]:
# predicted probability of the positive class (column 1 of predict_proba) for both splits
y_pred_proba_train_3y, y_pred_proba_test_3y = (
    gbc3l.predict_proba(features)[:, 1] for features in (X_train_3y, X_test_3y)
)

# training-split metrics (printed with the "is" suffix)
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)

# test-split metrics (printed with the "os" suffix)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index on the training split, using the original event flag and time columns
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

# one print call; sep='\n' puts each metric on its own line
print(
    f'Log Loss is: {logloss_train_3y:.4f}',
    f'Log Loss os: {logloss_test_3y:.4f}',
    f'BS is: {bs_train_3y:.4f}',
    f'BS os: {bs_test_3y:.4f}',
    f'C is: {c_is:.4f}',
    sep='\n',
)

Log Loss is: 0.1068
Log Loss os: 0.1097
BS is: 0.0281
BS os: 0.0292
C is: 0.8976


In [20]:
#_______________________________________________________________________________________________________________________________

In [21]:
# hyperparameter search (scorer=Brier score)

In [22]:
# same parameter grid and 4-fold CV as the log-loss search, but ranked by negative Brier score
grid_search_gbc2 = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42, loss='log_loss'),
    param_grid=param_grid_gbc,
    cv=4,
    scoring='neg_brier_score',
    n_jobs=-1,
    verbose=2,
)
grid_search_gbc2.fit(X_train_3y, y_train_3y)

print(grid_search_gbc2.best_params_)

Fitting 4 folds for each of 243 candidates, totalling 972 fits
{'learning_rate': 0.05, 'max_features': 20, 'max_leaf_nodes': 10, 'n_estimators': 200, 'subsample': 1}


In [23]:
# tabulate the cross-validation results of the Brier-score grid search
results_df = pd.DataFrame(grid_search_gbc2.cv_results_)

# extract relevant columns
results_df = results_df[['rank_test_score', 'param_n_estimators', 'param_learning_rate', 'param_subsample', 'param_max_leaf_nodes', 'param_max_features', 'mean_test_score']]

# convert negative Brier score to positive (since scoring='neg_brier_score');
# the column is named 'bs3y' so it is not mistaken for the log-loss column 'll3y'
results_df['bs3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

# best-ranked candidates first
results_df = results_df.sort_values(by='rank_test_score')
results_df.head(3)

Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_subsample,param_max_leaf_nodes,param_max_features,ll3y
107,1,200,0.05,1.0,20,20,0.0296
98,1,200,0.05,1.0,10,20,0.0296
95,3,150,0.05,1.0,10,20,0.0297


In [24]:
# GBC1B

In [25]:
# GBC1B: hyperparameters match the first row of the Brier-score grid-search table
gbc1b = GradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=1,
    max_leaf_nodes=20,
    max_features=20,
)

# train; perf_counter is a monotonic high-resolution clock, so the elapsed time
# is not affected by system clock adjustments (unlike time.time)
start_time = time.perf_counter()
gbc1b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.2f} seconds")

Training Time: 21.65 seconds


In [26]:
# predicted probability of the positive class (column 1 of predict_proba) for both splits
y_pred_proba_train_3y, y_pred_proba_test_3y = (
    gbc1b.predict_proba(features)[:, 1] for features in (X_train_3y, X_test_3y)
)

# training-split metrics (printed with the "is" suffix)
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)

# test-split metrics (printed with the "os" suffix)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index on the training split, using the original event flag and time columns
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

# one print call; sep='\n' puts each metric on its own line
print(
    f'Log Loss is: {logloss_train_3y:.4f}',
    f'Log Loss os: {logloss_test_3y:.4f}',
    f'BS is: {bs_train_3y:.4f}',
    f'BS os: {bs_test_3y:.4f}',
    f'C is: {c_is:.4f}',
    sep='\n',
)

Log Loss is: 0.1082
Log Loss os: 0.1099
BS is: 0.0285
BS os: 0.0292
C is: 0.8948


In [27]:
# GBC2B

In [28]:
# GBC2B: hyperparameters match the second row of the Brier-score grid-search table
gbc2b = GradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    subsample=1,
    max_leaf_nodes=10,
    max_features=20,
)

# train; perf_counter is a monotonic high-resolution clock, so the elapsed time
# is not affected by system clock adjustments (unlike time.time)
start_time = time.perf_counter()
gbc2b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.2f} seconds")

Training Time: 17.04 seconds


In [29]:
# predicted probability of the positive class (column 1 of predict_proba) for both splits
y_pred_proba_train_3y, y_pred_proba_test_3y = (
    gbc2b.predict_proba(features)[:, 1] for features in (X_train_3y, X_test_3y)
)

# training-split metrics (printed with the "is" suffix)
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)

# test-split metrics (printed with the "os" suffix)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index on the training split, using the original event flag and time columns
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

# one print call; sep='\n' puts each metric on its own line
print(
    f'Log Loss is: {logloss_train_3y:.4f}',
    f'Log Loss os: {logloss_test_3y:.4f}',
    f'BS is: {bs_train_3y:.4f}',
    f'BS os: {bs_test_3y:.4f}',
    f'C is: {c_is:.4f}',
    sep='\n',
)

Log Loss is: 0.1082
Log Loss os: 0.1099
BS is: 0.0285
BS os: 0.0292
C is: 0.8948


In [30]:
# GBC3B

In [31]:
# GBC3B: hyperparameters match the third row of the Brier-score grid-search table
gbc3b = GradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    n_estimators=150,
    learning_rate=0.05,
    subsample=1,
    max_leaf_nodes=10,
    max_features=20,
)

# train; perf_counter is a monotonic high-resolution clock, so the elapsed time
# is not affected by system clock adjustments (unlike time.time)
start_time = time.perf_counter()
gbc3b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.2f} seconds")

Training Time: 13.39 seconds


In [32]:
# predicted probability of the positive class (column 1 of predict_proba) for both splits
y_pred_proba_train_3y, y_pred_proba_test_3y = (
    gbc3b.predict_proba(features)[:, 1] for features in (X_train_3y, X_test_3y)
)

# training-split metrics (printed with the "is" suffix)
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)

# test-split metrics (printed with the "os" suffix)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index on the training split, using the original event flag and time columns
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

# one print call; sep='\n' puts each metric on its own line
print(
    f'Log Loss is: {logloss_train_3y:.4f}',
    f'Log Loss os: {logloss_test_3y:.4f}',
    f'BS is: {bs_train_3y:.4f}',
    f'BS os: {bs_test_3y:.4f}',
    f'C is: {c_is:.4f}',
    sep='\n',
)

Log Loss is: 0.1094
Log Loss os: 0.1104
BS is: 0.0287
BS os: 0.0292
C is: 0.8927
