In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.colors as mcolors
from lifelines.utils import concordance_index
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
import matplotlib.ticker as mticker
from sklearn.ensemble import RandomForestClassifier  
from itertools import product
import matplotlib.pyplot as plt

In [3]:
# Read the training and test CSV files (paths are relative to the current working directory).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('df_train.csv', 'df_test.csv'))

In [4]:
# Define the three-year mortality status: 1 when MORTSTAT == 1 and
# TIMETOEVENT <= 3, otherwise 0. The same rule is applied to both frames.
# NOTE(review): rows with MORTSTAT == 0 and TIMETOEVENT < 3 are also labelled 0;
# confirm this is intended if such rows represent censored follow-up.
for mortality_frame in (df_train, df_test):
    died_within_3y = mortality_frame["MORTSTAT"].eq(1) & mortality_frame["TIMETOEVENT"].le(3)
    mortality_frame.loc[:, 'MORTSTAT3Y'] = np.where(died_within_3y, 1, 0)

In [5]:
# Columns kept out of the feature matrix: the three-year target itself and the
# two columns it is derived from.
outcome_columns = ['MORTSTAT3Y', 'TIMETOEVENT', 'MORTSTAT']

# Training split: feature matrix and three-year mortality target.
X_train_3y, y_train_3y = df_train.drop(columns=outcome_columns), df_train['MORTSTAT3Y']

# Test split, built the same way.
X_test_3y, y_test_3y = df_test.drop(columns=outcome_columns), df_test['MORTSTAT3Y']

In [6]:
def c_index_classification(y_true, y_pred_proba, times):
    """Concordance index of predicted scores against observed event times.

    An ordered pair ``(i, j)`` is comparable when subject ``i`` has an event
    (``y_true[i] == 1``) and a strictly shorter time than subject ``j``
    (``times[i] < times[j]``); pairs with equal times are never comparable.
    A comparable pair is concordant when ``y_pred_proba[i] > y_pred_proba[j]``,
    discordant when it is smaller, and counted as a tie (weight 0.5) otherwise.

    Parameters
    ----------
    y_true : array-like
        Event indicator per subject; only entries equal to 1 count as events.
    y_pred_proba : array-like
        Predicted score per subject (higher is expected to mean earlier event).
    times : array-like
        Observed time per subject.

    Returns
    -------
    float
        ``(concordant + 0.5 * tied) / comparable``.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    ZeroDivisionError
        If there is no comparable pair (e.g. empty input, no events, or no
        subject with a time later than any event).
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    times = np.asarray(times)

    if not (len(y_true) == len(y_pred_proba) == len(times)):
        raise ValueError(
            "y_true, y_pred_proba and times must have the same length, got "
            f"{len(y_true)}, {len(y_pred_proba)} and {len(times)}"
        )

    cc = dc = comparable = 0

    # Only subjects with an event can be the earlier member of a comparable
    # pair, so iterate over those and compare each one against all later
    # subjects at once instead of looping over every (i, j) pair in Python.
    for i in np.flatnonzero(y_true == 1):
        # Strict inequality: i itself and subjects tied in time are excluded.
        later = times > times[i]
        n_later = int(np.count_nonzero(later))
        if n_later == 0:
            continue
        later_scores = y_pred_proba[later]
        cc += int(np.count_nonzero(y_pred_proba[i] > later_scores))
        dc += int(np.count_nonzero(y_pred_proba[i] < later_scores))
        comparable += n_later

    if comparable == 0:
        raise ZeroDivisionError("No comparable pairs: the concordance index is undefined.")

    # Everything that is neither concordant nor discordant is a tie in score.
    tp = comparable - cc - dc
    return (cc + 0.5 * tp) / comparable

In [7]:
# RandomForestClassifier

In [8]:
# grid search 1: log-loss

In [9]:
# Hyper-parameter grid for the random forest: 3 * 4 * 4 * 3 = 144 candidates.
param_grid = dict(
    n_estimators=[100, 150, 200],
    max_leaf_nodes=[20, 30, 40, 50],
    max_features=[20, 30, 40, None],
    max_samples=[0.1, 0.2, 0.4],
)

# Exhaustive 4-fold cross-validated search, candidates ranked by negative log-loss.
grid_search1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, criterion='log_loss'),
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=4,
    n_jobs=-1,
    verbose=2,
)
grid_search1.fit(X_train_3y, y_train_3y)

print(grid_search1.best_params_)

Fitting 4 folds for each of 144 candidates, totalling 576 fits


  _data = np.array(data, dtype=dtype, copy=copy,


{'max_features': None, 'max_leaf_nodes': 50, 'max_samples': 0.1, 'n_estimators': 200}


In [10]:
# Summarise the log-loss grid search: keep the rank and the tuned parameters,
# and report the mean cross-validated score as a positive log-loss ('ll3y').
summary_columns = ['rank_test_score', 'param_n_estimators', 'param_max_leaf_nodes',
                   'param_max_features', 'param_max_samples', 'mean_test_score']

results_df = (
    pd.DataFrame(grid_search1.cv_results_)[summary_columns]
    # scoring='neg_log_loss' stores the negated loss, so flip the sign back.
    .assign(ll3y=lambda frame: (-frame['mean_test_score']).round(4))
    .drop(columns=['mean_test_score'])
    .sort_values(by='rank_test_score')
)
results_df.head(3)

Unnamed: 0,rank_test_score,param_n_estimators,param_max_leaf_nodes,param_max_features,param_max_samples,ll3y
137,1,200,50,,0.1,0.114
136,2,150,50,,0.1,0.114
101,3,200,50,40.0,0.1,0.114


In [11]:
# RFC1L

In [12]:
# RFC1L: the rank-1 configuration of the log-loss grid search.
rfc1l = RandomForestClassifier(criterion= 'log_loss', random_state=42,
                               n_estimators=200, max_leaf_nodes=50, max_features=None, max_samples=0.1)
# train; the elapsed time is measured with time.perf_counter(), a monotonic
# clock meant for timing intervals (time.time() can jump if the system clock changes)
start_time = time.perf_counter()
rfc1l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.4f} seconds")

Training Time: 40.8231 seconds


In [13]:
# get predicted probabilities
y_pred_proba_train_3y = rfc1l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = rfc1l.predict_proba(X_test_3y)[:, 1]

# compute log loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1077
Log Loss os: 0.1105
BS is: 0.0291
BS os: 0.0293
C is: 0.9028


In [14]:
# RFC2L

In [15]:
# RFC2L: the rank-2 configuration of the log-loss grid search.
rfc2l = RandomForestClassifier(criterion= 'log_loss', random_state=42,
                               n_estimators=150, max_leaf_nodes=50, max_features=None, max_samples=0.1)
# train; the elapsed time is measured with time.perf_counter(), a monotonic
# clock meant for timing intervals (time.time() can jump if the system clock changes)
start_time = time.perf_counter()
rfc2l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.4f} seconds")

Training Time: 7.7297 seconds


In [16]:
# get predicted probabilities
y_pred_proba_train_3y = rfc2l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = rfc2l.predict_proba(X_test_3y)[:, 1]

# compute log loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1077
Log Loss os: 0.1105
BS is: 0.0291
BS os: 0.0293
C is: 0.9023


In [17]:
# RFC3L

In [18]:
# RFC3L: the rank-3 configuration of the log-loss grid search.
rfc3l = RandomForestClassifier(criterion= 'log_loss', random_state=42,
                               n_estimators=200, max_leaf_nodes=50, max_features=40, max_samples=0.1)
# train; the elapsed time is measured with time.perf_counter(), a monotonic
# clock meant for timing intervals (time.time() can jump if the system clock changes)
start_time = time.perf_counter()
rfc3l.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.4f} seconds")

Training Time: 9.8758 seconds


In [19]:
# get predicted probabilities
y_pred_proba_train_3y = rfc3l.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = rfc3l.predict_proba(X_test_3y)[:, 1]

# compute log loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1080
Log Loss os: 0.1106
BS is: 0.0291
BS os: 0.0293
C is: 0.9024


In [20]:
# grid search 2: Brier score

In [21]:
# Same parameter grid and 4-fold cross-validation as the first search, but
# candidates are ranked by negative Brier score instead of negative log-loss.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, criterion='log_loss'),
    param_grid=param_grid,
    scoring='neg_brier_score',
    cv=4,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_3y, y_train_3y)

print(grid_search.best_params_)

Fitting 4 folds for each of 144 candidates, totalling 576 fits


  _data = np.array(data, dtype=dtype, copy=copy,


{'max_features': None, 'max_leaf_nodes': 50, 'max_samples': 0.1, 'n_estimators': 200}


In [22]:
# Tabulate the cross-validation results of the Brier-score grid search.
results_df = pd.DataFrame(grid_search.cv_results_)

# extract relevant columns
results_df = results_df[['rank_test_score', 'param_n_estimators', 'param_max_leaf_nodes', 'param_max_features', 'param_max_samples', 'mean_test_score']]

# convert the negative Brier score to a positive one (since scoring='neg_brier_score');
# the column is named 'bs3y' because this search does not measure log-loss
results_df = results_df.assign(bs3y=(- results_df['mean_test_score']).round(4))
results_df = results_df.drop(columns=['mean_test_score'])

results_df = results_df.sort_values(by='rank_test_score')
results_df.head(3)

Unnamed: 0,rank_test_score,param_n_estimators,param_max_leaf_nodes,param_max_features,param_max_samples,ll3y
137,1,200,50,,0.1,0.03
101,2,200,50,40.0,0.1,0.03
136,3,150,50,,0.1,0.03


In [23]:
# RFC1B

In [24]:
# RFC1B: the rank-1 configuration of the Brier-score grid search.
# NOTE: these are the same hyper-parameters and random_state as rfc1l.
rfc1b = RandomForestClassifier(criterion= 'log_loss', random_state=42,
                               n_estimators=200, max_leaf_nodes=50, max_features=None, max_samples=0.1)
# train; the elapsed time is measured with time.perf_counter(), a monotonic
# clock meant for timing intervals (time.time() can jump if the system clock changes)
start_time = time.perf_counter()
rfc1b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.4f} seconds")

Training Time: 17.6425 seconds


In [25]:
# get predicted probabilities
y_pred_proba_train_3y = rfc1b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = rfc1b.predict_proba(X_test_3y)[:, 1]

# compute log loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1077
Log Loss os: 0.1105
BS is: 0.0291
BS os: 0.0293
C is: 0.9028


In [26]:
# RFC2B

In [27]:
# RFC2B: the rank-2 configuration of the Brier-score grid search.
# NOTE: these are the same hyper-parameters and random_state as rfc3l.
rfc2b = RandomForestClassifier(criterion= 'log_loss', random_state=42,
                               n_estimators=200, max_leaf_nodes=50, max_features=40, max_samples=0.1)
# train; the elapsed time is measured with time.perf_counter(), a monotonic
# clock meant for timing intervals (time.time() can jump if the system clock changes)
start_time = time.perf_counter()
rfc2b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.4f} seconds")

Training Time: 10.1346 seconds


In [28]:
# get predicted probabilities
y_pred_proba_train_3y = rfc2b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = rfc2b.predict_proba(X_test_3y)[:, 1]

# compute log loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1080
Log Loss os: 0.1106
BS is: 0.0291
BS os: 0.0293
C is: 0.9024


In [29]:
# RFC3B

In [30]:
# RFC3B: the rank-3 configuration of the Brier-score grid search.
# NOTE: these are the same hyper-parameters and random_state as rfc2l.
rfc3b = RandomForestClassifier(criterion= 'log_loss', random_state=42,
                               n_estimators=150, max_leaf_nodes=50, max_features=None, max_samples=0.1)
# train; the elapsed time is measured with time.perf_counter(), a monotonic
# clock meant for timing intervals (time.time() can jump if the system clock changes)
start_time = time.perf_counter()
rfc3b.fit(X_train_3y, y_train_3y)
print(f"Training Time: {time.perf_counter() - start_time:.4f} seconds")

Training Time: 8.1248 seconds


In [31]:
# get predicted probabilities
y_pred_proba_train_3y = rfc3b.predict_proba(X_train_3y)[:, 1]
y_pred_proba_test_3y = rfc3b.predict_proba(X_test_3y)[:, 1]

# compute log loss
logloss_train_3y = log_loss(y_train_3y, y_pred_proba_train_3y)
logloss_test_3y = log_loss(y_test_3y, y_pred_proba_test_3y)

# compute Brier
bs_train_3y = brier_score_loss(y_train_3y, y_pred_proba_train_3y)
bs_test_3y = brier_score_loss(y_test_3y, y_pred_proba_test_3y)

# c-index
c_is = c_index_classification(df_train['MORTSTAT'], y_pred_proba_train_3y, df_train['TIMETOEVENT'])

print(f'Log Loss is: {logloss_train_3y:.4f}')
print(f'Log Loss os: {logloss_test_3y:.4f}')
print(f'BS is: {bs_train_3y:.4f}')
print(f'BS os: {bs_test_3y:.4f}')
print(f'C is: {c_is:.4f}')

Log Loss is: 0.1077
Log Loss os: 0.1105
BS is: 0.0291
BS os: 0.0293
C is: 0.9023
