In [2]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss

In [3]:
# Load the pre-split training and test sets from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("df_train.csv", "df_test.csv")
)

In [4]:
# CPH0: Cox proportional-hazards model fitted on all columns of df_train

In [5]:
# Work on independent copies so this stage never mutates the loaded frames.
df_train_cph0, df_test_cph0 = (frame.copy() for frame in (df_train, df_test))

In [6]:
# Fit a Cox proportional-hazards model on the training copy. No formula is
# given, so the columns other than the duration and event columns act as
# covariates.
cph = CoxPHFitter().fit(
    df_train_cph0,
    duration_col="TIMETOEVENT",
    event_col="MORTSTAT",
)

# Render the coefficient table and fit statistics.
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'TIMETOEVENT'
event col,'MORTSTAT'
baseline estimation,breslow
number of observations,82720
number of events observed,4619
partial log-likelihood,-45724.40
time fit was run,2025-03-08 22:04:53 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
REGION,0.02,1.02,0.01,-0.01,0.05,0.99,1.05,0.0,1.47,0.14,2.81
AGE,0.07,1.08,0.0,0.07,0.08,1.07,1.08,0.0,38.12,<0.005,inf
SEX,-0.39,0.68,0.03,-0.45,-0.33,0.63,0.72,0.0,-12.31,<0.005,113.27
MARST,0.01,1.01,0.0,0.01,0.01,1.01,1.01,0.0,6.89,<0.005,37.43
FAMSIZE,-0.02,0.98,0.02,-0.06,0.01,0.95,1.01,0.0,-1.27,0.20,2.29
RACEA,-0.04,0.96,0.03,-0.09,0.01,0.91,1.01,0.0,-1.68,0.09,3.43
USBORN,0.04,1.04,0.01,0.03,0.05,1.03,1.06,0.0,7.48,<0.005,43.58
EDUCREC1,0.01,1.01,0.01,-0.01,0.02,0.99,1.02,0.0,0.82,0.41,1.27
POVERTY,-0.1,0.9,0.01,-0.12,-0.08,0.88,0.92,0.0,-8.85,<0.005,59.98
FSRAWSCORE,0.0,1.0,0.01,-0.01,0.02,0.99,1.02,0.0,0.5,0.61,0.7

0,1
Concordance,0.88
Partial AIC,91542.81
log-likelihood ratio test,11042.70 on 47 df
-log2(p) of ll-ratio test,inf


In [7]:
# Report the average partial log-likelihood (APLL) and the concordance index (C)
# on the training frame ("is") and on the test frame ("os").
for metric_label, scoring in (("APLL", "log_likelihood"), ("C", "concordance_index")):
    for split_tag, frame in (("is", df_train_cph0), ("os", df_test_cph0)):
        value = cph.score(frame, scoring_method=scoring)
        print(f"{metric_label}_{split_tag}: {value:.4f}")

APLL_is: -0.5528
APLL_os: -0.4629
C_is: 0.8817
C_os: 0.8849


In [8]:
# Horizon, in the units of TIMETOEVENT, at which the binary metrics are evaluated.
HORIZON = 3

# Predict the full survival curve for each individual
# (rows are times on the fitted model's timeline, one column per individual).
survival_probs_16_train = cph.predict_survival_function(df_train_cph0)
survival_probs_16_test = cph.predict_survival_function(df_test_cph0)

# Extract the survival probability at the horizon by *time*, not by row position.
# The curve's index is the model's event timeline, so `.iloc[3]` is only the
# value at t=3 when that timeline happens to be exactly 0, 1, 2, 3, ...
# Asking the model for `times=[HORIZON]` yields a single row at that time.
survival_prob_3y_train = cph.predict_survival_function(
    df_train_cph0, times=[HORIZON]
).iloc[0]
survival_prob_3y_test = cph.predict_survival_function(
    df_test_cph0, times=[HORIZON]
).iloc[0]

# Binary status at the horizon: 1 when the event indicator is set and the
# event time is within the horizon, 0 otherwise.
# NOTE(review): rows censored before HORIZON are also labelled 0 although their
# status at the horizon is unknown -- confirm this is acceptable, or switch to
# a censoring-aware (e.g. IPCW) Brier score.
status_3y_train = np.where(
    (df_train_cph0["MORTSTAT"] == 1) & (df_train_cph0["TIMETOEVENT"] <= HORIZON), 1, 0
)
status_3y_test = np.where(
    (df_test_cph0["MORTSTAT"] == 1) & (df_test_cph0["TIMETOEVENT"] <= HORIZON), 1, 0
)

# Predicted probability of the event by the horizon is 1 - S(HORIZON).
event_prob_3y_train = 1 - survival_prob_3y_train
event_prob_3y_test = 1 - survival_prob_3y_test

print(f'log-loss is: {log_loss(status_3y_train, event_prob_3y_train):.4f}')
print(f'log-loss os: {log_loss(status_3y_test, event_prob_3y_test):.4f}')
print(f'BS is: {brier_score_loss(status_3y_train, event_prob_3y_train):.4f}')
print(f'BS os: {brier_score_loss(status_3y_test, event_prob_3y_test):.4f}')

log-loss is: 0.1137
log-loss os: 0.1118
BS is: 0.0299
BS os: 0.0296


In [9]:
# Persist both frames, without the index column, for the next model.
for frame, csv_path in (
    (df_train_cph0, "df_train_cph0.csv"),
    (df_test_cph0, "df_test_cph0.csv"),
):
    frame.to_csv(csv_path, index=False)