In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from lifelines.utils import concordance_index
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
import matplotlib.ticker as mticker
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer
from sklearn.tree import plot_tree
from itertools import product
import matplotlib.pyplot as plt

In [3]:
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

In [4]:
# define three-year mortalitty status
df_train.loc[:,'MORTSTAT3Y'] = np.where((df_train["MORTSTAT"] == 1) & (df_train["TIMETOEVENT"] <= 3), 1, 0)
df_test.loc[:,'MORTSTAT3Y'] = np.where((df_test["MORTSTAT"] == 1) & (df_test["TIMETOEVENT"] <= 3), 1, 0)

In [5]:
# select features and target
X_train_3y = df_train.drop(columns=['MORTSTAT3Y', 'TIMETOEVENT', 'MORTSTAT'])
y_train_3y = df_train['MORTSTAT3Y']

X_test_3y = df_test.drop(columns=['MORTSTAT3Y', 'TIMETOEVENT', 'MORTSTAT'])
y_test_3y = df_test['MORTSTAT3Y']

In [6]:
# deifne c-index as in thesis

In [7]:
def c_index_classification(y_true, y_pred_proba, times):
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    times = np.asarray(times)
    cc = dc = tp = 0

    n = len(y_true)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            if times[i] < times[j] and y_true[i] == 1:
                if y_pred_proba[i] > y_pred_proba[j]:
                    cc += 1
                elif y_pred_proba[i] < y_pred_proba[j]:
                    dc += 1
                else:
                    tp += 1

        
    return (cc + 0.5 * tp) / (cc + dc + tp)

In [8]:
# DecisionTreeClassifier

In [9]:
# grid search 0 (log-loss)

In [10]:
param_grid0 = {'max_depth': [3, 4, 5, 6, 7, 8, 10, 12]}

grid_search1 = GridSearchCV(DecisionTreeClassifier(criterion='log_loss'), param_grid0, cv=4, scoring='neg_log_loss')
grid_search1.fit(X_train_3y, y_train_3y)

print(grid_search1.best_params_)

{'max_depth': 4}


In [11]:
# grid search 0 (brier-score)

In [12]:
grid_search2 = GridSearchCV(DecisionTreeClassifier(criterion='log_loss'), param_grid0, cv=4, scoring='neg_brier_score')
grid_search2.fit(X_train_3y, y_train_3y)

print(grid_search2.best_params_)

{'max_depth': 5}


In [13]:
# DTC0L

In [14]:
dtc0l = DecisionTreeClassifier(criterion='log_loss', max_depth=4)

# train
start_time = time.time()
dtc0l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 2.1047 seconds


In [15]:
# get predicted probabilities
y_pred_proba_train_3y = dtc0l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc0l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1215
Log Loss os: 0.1200
BS is: 0.0313
BS os: 0.0308
C is: 0.8533


In [16]:
# DTC0B

In [17]:
dtc0b = DecisionTreeClassifier(criterion='log_loss', max_depth=6)

# train
start_time = time.time()
dtc0b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.4530 seconds


In [18]:
# get predicted probabilities
y_pred_proba_train_3y = dtc0b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc0b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1162
Log Loss os: 0.1334
BS is: 0.0302
BS os: 0.0306
C is: 0.8700


In [19]:
#______________________________________________________________________________________________________________________________

In [20]:
# grid search 1 (log-loss)

In [21]:
param_grid1 = {'max_leaf_nodes': [10, 20, 30, 40, 50],
              'min_samples_leaf': [100, 200, 300, 400]}

grid_search3 = GridSearchCV(DecisionTreeClassifier(criterion='log_loss'), param_grid1, cv=4, scoring='neg_log_loss', n_jobs=-1, verbose=2)
grid_search3.fit(X_train_3y, y_train_3y)

print(grid_search3.best_params_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
{'max_leaf_nodes': 30, 'min_samples_leaf': 200}


In [22]:
results_df = pd.DataFrame(grid_search3.cv_results_)
# extract relevant columns
results_df = results_df[['rank_test_score', 'param_max_leaf_nodes', 'param_min_samples_leaf', 'mean_test_score']]

results_df['ll3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(3)

Unnamed: 0,rank_test_score,param_max_leaf_nodes,param_min_samples_leaf,ll3y
9,1,30,200,0.1217
4,2,20,100,0.1217
7,3,20,400,0.1218


In [23]:
# DTC1L

In [24]:
dtc1l = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes= 30, min_samples_leaf= 200)

# train
start_time = time.time()
dtc1l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.3823 seconds


In [25]:
# get predicted probabilities
y_pred_proba_train_3y = dtc1l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc1l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1184
Log Loss os: 0.1176
BS is: 0.0308
BS os: 0.0305
C is: 0.8659


In [26]:
# DTC2L

In [27]:
dtc2l = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes=20, min_samples_leaf=100)

# train
start_time = time.time()
dtc2l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.3111 seconds


In [28]:
# get predicted probabilities
y_pred_proba_train_3y = dtc2l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc2l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1198
Log Loss os: 0.1185
BS is: 0.0310
BS os: 0.0307
C is: 0.8600


In [29]:
# DTC3L

In [30]:
dtc3l = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes=20, min_samples_leaf=400)

# train
start_time = time.time()
dtc3l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.2932 seconds


In [31]:
# get predicted probabilities
y_pred_proba_train_3y = dtc3l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc3l.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1200
Log Loss os: 0.1183
BS is: 0.0311
BS os: 0.0306
C is: 0.8599


In [32]:
#_______________________________________________________________________________________________________________________________

In [33]:
# grid search 2 brier-score

In [34]:
grid_search4 = GridSearchCV(DecisionTreeClassifier(criterion='log_loss'), param_grid1, cv=4, scoring='neg_brier_score', n_jobs=-1)
grid_search4.fit(X_train_3y, y_train_3y)

print(grid_search4.best_params_)

{'max_leaf_nodes': 40, 'min_samples_leaf': 300}


In [35]:
results_df = pd.DataFrame(grid_search4.cv_results_)
# extract relevant columns
results_df = results_df[['rank_test_score', 'param_max_leaf_nodes', 'param_min_samples_leaf', 'mean_test_score']]

results_df['bs3y'] = (- results_df['mean_test_score']).round(4)
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(3)

Unnamed: 0,rank_test_score,param_max_leaf_nodes,param_min_samples_leaf,bs3y
14,1,40,300,0.0313
18,2,50,300,0.0313
16,3,50,100,0.0313


In [36]:
# DTC1B

In [37]:
dtc1b = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes=40, min_samples_leaf=300)

# train
start_time = time.time()
dtc1b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.3910 seconds


In [38]:
# get predicted probabilities
y_pred_proba_train_3y = dtc1b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc1b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1177
Log Loss os: 0.1173
BS is: 0.0307
BS os: 0.0305
C is: 0.8695


In [39]:
# DTC2B

In [40]:
dtc2b = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes=50, min_samples_leaf=300)

# train
start_time = time.time()
dtc2b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.4584 seconds


In [41]:
# get predicted probabilities
y_pred_proba_train_3y = dtc2b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc2b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1170
Log Loss os: 0.1169
BS is: 0.0307
BS os: 0.0304
C is: 0.8719


In [42]:
# DTC3B

In [43]:
dtc3b = DecisionTreeClassifier(criterion='log_loss', max_leaf_nodes=50, min_samples_leaf=100)

# train
start_time = time.time()
dtc3b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.time() - start_time:.4f} seconds")

Training Time: 0.3893 seconds


In [44]:
# get predicted probabilities
y_pred_proba_train_3y = dtc3b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = dtc3b.predict_proba(X_test_3y)[:, 1]

# compute Log Loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1162
Log Loss os: 0.1175
BS is: 0.0304
BS os: 0.0305
C is: 0.8727
