In [1]:
import numpy as np
import pandas as pd
import os

import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import pyampute
import pickle 
import time
import ast 
from scipy.stats import chi2
from scipy.stats import mstats
from scipy.stats.mstats import winsorize
from scipy import stats
from xgboost import XGBRegressor
from sklearn import tree
from pyampute.ampute import MultivariateAmputation
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from lifelines import CoxPHFitter, WeibullFitter, WeibullAFTFitter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tableone import TableOne 
from os.path import isfile, join
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from statsmodels.gam.tests.test_penalized import df_autos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
from scipy.spatial import distance
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from app_transition_dict import get_transition_dict, get_transition_code
from app_init import get_multi_state_covariates, get_multi_state_cov_quartiles
from app_init import replace_covariate_labels, replace_pvalue, get_variables_cox
import warnings 
warnings.filterwarnings('ignore')

# Drive letter under which the shared project folder is mounted; every
# location below is derived from it.
drive = 'G'

# Cohort extract plus the project's data / docs / save / result folders.
main_path = f'{drive}:/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
data_path = f'{drive}:/Shared drives/CKD_Progression/data/'
docs_path = f'{drive}:/Shared drives/CKD_Progression/docs/'
save_path = f'{drive}:/Shared drives/CKD_Progression/save/'
resu_path = f'{drive}:/Shared drives/CKD_Progression/result/'

# Lookup tables kept in the docs folder.
covariates_path = f'{docs_path}covariates.csv'
removecols_path = f'{docs_path}remove_columns.csv'

def generate_df_continuous(df, variables, q = 3):
    """Quantile-bin continuous columns and append one indicator column per bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the continuous columns. The first binned column is
        written into this object before it is replaced by a concatenated copy.
    variables : iterable of (variable, prefix, column_name)
        ``variable`` is the column to bin, ``column_name`` receives the integer
        bin code and ``prefix`` names the indicator columns.
    q : int, default 3
        Number of quantiles requested from ``pd.qcut`` (duplicate edges are dropped).

    Returns
    -------
    (pandas.DataFrame, dict)
        The frame with bin codes and indicator columns, and a mapping from each
        source variable to the bin edges returned by ``pd.qcut``.
    """
    cut_points = {}
    for source_col, dummy_prefix, binned_col in variables:
        bin_codes, edges = pd.qcut(df[source_col], q = q, labels = False,
                                   duplicates = 'drop', retbins = True)
        df[binned_col] = bin_codes
        cut_points[source_col] = edges
        # One indicator column per observed bin code, appended on the right.
        indicators = pd.get_dummies(df[binned_col], prefix = dummy_prefix)
        df = pd.concat([df, indicators], axis = 1)
    return df, cut_points

def generate_df_continuous_predefined(df, variables, get_columns = False):
    """Bin continuous columns with caller-supplied edges and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the continuous columns; the labelled column is written
        into it.
    variables : iterable of (variable, prefix, column_name, bins, labels)
        ``variable`` is cut with ``bins``/``labels`` using left-closed
        intervals (``right=False``) and stored under ``column_name``.
    get_columns : bool, default False
        When true, indicator columns named ``<prefix>_<label>`` are appended.

    Returns
    -------
    pandas.DataFrame
        The input frame (or a concatenated copy when indicators were added).
    """
    for source_col, dummy_prefix, binned_col, edges, names in variables:
        df[binned_col] = pd.cut(df[source_col], bins = edges, labels = names, right = False)
        if not get_columns:
            continue
        indicators = pd.get_dummies(df[binned_col], prefix = dummy_prefix)
        df = pd.concat([df, indicators], axis = 1)
    return df

def get_first_dates():
    """Load the distinct ``ENC_HN`` identifiers from the four first-date files.

    Reads the heart-failure and hypertension workbooks and the diabetes and
    atrial-fibrillation CSV files from ``docs_path``.

    Returns
    -------
    tuple of list
        ``(heart_failure, hypertension, diabetes, atrialfb)``, each the list of
        unique ``ENC_HN`` values found in the corresponding file.
    """
    def _unique_ids(frame):
        # Distinct identifiers in first-seen order, as a plain Python list.
        return frame['ENC_HN'].unique().tolist()

    hf_ids  = _unique_ids(pd.read_excel(docs_path + 'HF_FIRSTDATE_2010_2023.xlsx'))
    htn_ids = _unique_ids(pd.read_excel(docs_path + 'HTN_FIRSTDATE_2010_2023.xlsx'))
    dm_ids  = _unique_ids(pd.read_csv(docs_path + 'DM_FIRSTDATE_2010_2023.csv'))
    af_ids  = _unique_ids(pd.read_csv(docs_path + 'AF_FIRSTDATE_2010_2023.csv'))
    return hf_ids, htn_ids, dm_ids, af_ids

def merge_comorbidity(df, comorbidity, disease_code):
    """Add a 0/1 column flagging rows whose ``ENC_HN`` is in ``comorbidity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ENC_HN`` column; modified in place.
    comorbidity : list-like
        Identifiers that carry the condition.
    disease_code : str
        Short code for the condition; the new column is its upper-case form.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the flag column added.
    """
    flag_column = disease_code.upper()
    has_condition = df['ENC_HN'].isin(comorbidity)
    df[flag_column] = has_condition.astype(int)
    return df

In [2]:
def load_dataset(version = '13', get_columns = False):
    """Load the long-format multi-state table and derive the model columns.

    Parameters
    ----------
    version : str, default '13'
        Suffix of the ``multi_state_long_ver0<version>.csv`` file in ``save_path``.
    get_columns : bool, default False
        Forwarded to ``generate_df_continuous_predefined``; when true the
        binned variables also get indicator columns.

    Returns
    -------
    (covariates, order_covariates, long_df)
        The covariate list from ``get_multi_state_cov_quartiles()``, the
        contents of ``cox_covariates.csv``, and the prepared long table.

    Notes
    -----
    The hypertension identifier list is loaded but not merged here; only the
    HF, DM and AF flags are added.
    """
    covariates = get_multi_state_cov_quartiles()
    variables = get_variables_cox()
    order_covariates = pd.read_csv(docs_path + 'cox_covariates.csv')
    heart_failure, _hypertension, diabetes, atrialfb = get_first_dates()

    long_df = pd.read_csv(f'{save_path}multi_state_long_ver0{version}.csv')
    # Recode sex as 1 (male) / 0 (female) and label each row's transition.
    long_df['gender']  = long_df['gender'].replace('M', 1).replace('F', 0)
    long_df['pathway'] = long_df['fr'] + '_to_' + long_df['to']
    long_df = generate_df_continuous_predefined(long_df, variables, get_columns = get_columns)

    # Collapse the two statin columns and the ARB/ACEi columns into one flag each.
    long_df['statin'] = long_df[['statinhydro', 'statinlipo']].max(axis = 1)
    long_df['raas']   = long_df[['arb', 'acei']].max(axis = 1)
    long_df = long_df.drop(columns = ['statinhydro', 'statinlipo'])

    for patient_ids, code in ((heart_failure, 'hf'), (diabetes, 'dm'), (atrialfb, 'af')):
        long_df = merge_comorbidity(long_df, patient_ids, code)
    return covariates, order_covariates, long_df


covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = False)

# Covariates left out of the univariate screening.
for excluded_covariate in ('Gout', 'PHOS_BINDER', 'ANTI_PL'):
    covariates.remove(excluded_covariate)

In [1]:
def generate_coxph(long_df, covariate, pathway):
    """Fit a single-covariate Cox model on the rows of one pathway.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long table with ``pathway``, ``tstops`` and ``status`` columns.
    covariate : str
        Column entered in the formula as ``C(<covariate>)``.
    pathway : str
        Value of ``pathway`` selecting the transition to model.

    Returns
    -------
    The object returned by ``CoxPHFitter.fit``.
    """
    on_pathway = long_df['pathway'] == pathway
    transition_data = long_df[on_pathway].copy()
    fitter = CoxPHFitter()
    return fitter.fit(
        transition_data,
        duration_col = 'tstops',
        event_col    = 'status',
        formula = f'C({covariate})',
        step_size = 0.1,
        show_progress = False)

def coxph_statistics(result):
    """Summarise the first coefficient of a fitted Cox model.

    Parameters
    ----------
    result : fitted model
        Must expose ``hazard_ratios_``, ``standard_errors_``,
        ``confidence_intervals_`` (with ``95% lower-bound`` and
        ``95% upper-bound`` columns), ``_compute_p_values()`` and
        ``event_observed``.

    Returns
    -------
    tuple
        ``(hazard_ratio, ci_text, p_value, std_error, events_observed,
        n_rows)``; numeric values are rounded to two decimals and the interval
        is the exponentiated coefficient interval formatted as ``'(lo, hi)'``.
    """
    roundup = 2
    # The Series below are indexed by covariate name, so the first entry must
    # be taken by position; `series[0]` relies on a deprecated fallback.
    hazd = np.round(result.hazard_ratios_.iloc[0], roundup)
    ster = np.round(result.standard_errors_.iloc[0], roundup)

    bounds = result.confidence_intervals_.reset_index()
    coef_low = bounds.loc[0, '95% lower-bound']
    coef_upp = bounds.loc[0, '95% upper-bound']
    confidence_interval = f'({np.round(np.exp(coef_low), roundup)}, {np.round(np.exp(coef_upp), roundup)})'

    # NOTE(review): `_compute_p_values` is a private method of the model object.
    pvalue = np.round(np.asarray(result._compute_p_values()), roundup)[0]
    events_obs = result.event_observed.sum()
    events_tot = result.event_observed.shape[0]
    return hazd, confidence_interval, pvalue, ster, events_obs, events_tot

def get_log_likelihood(cph_model):
    """Compute the likelihood-ratio statistic of a fitted model against its null.

    Parameters
    ----------
    cph_model : fitted model
        Must expose ``params_``, ``_ll_null_`` and ``log_likelihood_``.

    Returns
    -------
    tuple
        ``(deg, LL0, LL1, LLR, p_value)``: number of parameters, null and
        fitted log-likelihoods, ``-2 * (LL0 - LL1)``, and the chi-square
        survival function of that statistic with ``deg`` degrees of freedom.
    """
    n_params = len(cph_model.params_)
    null_ll = cph_model._ll_null_
    fitted_ll = cph_model.log_likelihood_
    lr_statistic = -2 * (null_ll - fitted_ll)
    return n_params, null_ll, fitted_ll, lr_statistic, chi2.sf(lr_statistic, n_params)

def cox_table(cph_model, variable):
    """Build the univariate result table for one covariate.

    The table starts with a reference row for ``variable`` (coef 0, HR 1, no
    interval or p-value), followed by one row per model coefficient.

    Parameters
    ----------
    cph_model : fitted model
        Must expose ``summary``, ``params_`` and ``log_likelihood_ratio_test()``.
    variable : str
        Name of the covariate the model was fitted on.

    Returns
    -------
    pandas.DataFrame
        Columns ``covariate``, ``coef``, ``HR``, ``95CI HR``, ``pvalue``,
        ``LLT`` (likelihood-ratio statistic) and ``deg`` (parameter count).
        For the variables listed in ``categorical`` the reference row also
        carries the overall likelihood-ratio statistic, degrees of freedom and
        p-value from ``get_log_likelihood``.
    """
    categorical = ['bin_bmi', 'bin_glu', 'bin_hba']
    test_statistics = cph_model.log_likelihood_ratio_test().test_statistic

    summary = cph_model.summary.reset_index()
    summary['95CI HR'] = '(' + np.round(summary['exp(coef) lower 95%'], 2).astype(str) + ', ' +\
                               np.round(summary['exp(coef) upper 95%'], 2).astype(str) + ')'
    summary = summary.rename(columns = {'exp(coef)': 'HR', 'p': 'pvalue'})
    # Copy so the columns added below are not written onto a slice of `summary`.
    univariate_covariate = summary[['covariate', 'coef', 'HR', '95CI HR', 'pvalue']].copy()
    univariate_covariate['LLT'] = test_statistics
    univariate_covariate['deg'] = len(cph_model.params_)

    # `np.nan` rather than `np.NaN`: the capitalised alias was removed in NumPy 2.0.
    reference_row = pd.DataFrame({'covariate': [variable], 'coef': [0], 'HR': [1],
                                  '95CI HR': [np.nan], 'pvalue': [np.nan]})
    univariate_covariate = pd.concat([reference_row, univariate_covariate], ignore_index = True)

    if variable in categorical:
        deg, LL0, LL1, LLR, p_value = get_log_likelihood(cph_model)
        # After `ignore_index=True` the reference row has label 0. Indexing with
        # `variable` would append a new, mostly empty row instead of updating it.
        univariate_covariate.loc[0, 'LLT'] = LLR
        univariate_covariate.loc[0, 'deg'] = deg
        univariate_covariate.loc[0, 'pvalue'] = p_value
    return univariate_covariate


def univariate_coxph(df, pathway, save = False):
    """Run one single-covariate Cox model per covariate and tabulate the results.

    Reads the notebook-level ``covariates`` list and ``order_covariates``
    frame, so both must be defined before this is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table passed to ``generate_coxph``.
    pathway : str
        Transition to model.
    save : bool, default False
        Accepted for compatibility; not used by this function.

    Returns
    -------
    (pandas.DataFrame, fitted model)
        The ordered result table and the model fitted for the last covariate.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``covariates`` is empty.
    """
    total_patients = df['ENC_HN'].nunique()
    univariate_list = []
    for covariate in tqdm(covariates):
        cph_model = generate_coxph(df, covariate, pathway)
        univariate_covariate = cox_table(cph_model, covariate)
        # Share of all patients in `df` whose covariate flag equals 1.
        univariate_covariate['percentage'] = df[df[covariate] == 1]['ENC_HN'].nunique() / total_patients
        univariate_list.append(univariate_covariate)

    univariate_df = pd.concat(univariate_list, axis = 0)
    # Keep only covariates marked for inclusion, in the configured display order.
    univariate_df = pd.merge(univariate_df, order_covariates, on = 'covariate', how = 'inner')
    univariate_df = univariate_df[univariate_df['include'] == 1]
    univariate_df = univariate_df.sort_values(['order'], ascending = True)
    univariate_df['covariate'] = univariate_df['replace']
    univariate_df['patient_observed'] = cph_model.event_observed.sum()
    univariate_df['patient_risk'] = total_patients
    univariate_df = univariate_df[['variable', 'covariate', 'patient_observed', 'patient_risk', 'LLT', 'deg', 'percentage', 'coef', 'HR', '95CI HR', 'pvalue']]
    for var in ['coef', 'HR', 'pvalue', 'LLT', 'percentage']:
        univariate_df[var] = np.round(univariate_df[var], 2)
    univariate_df = replace_pvalue(univariate_df)
    # `np.nan` rather than `np.NaN`: the capitalised alias was removed in NumPy 2.0.
    univariate_df['included'] = np.nan
    # NOTE(review): this overwrites whatever `replace_pvalue` put in `pvalue`
    # with the likelihood-ratio p-value computed from the rounded `LLT`.
    univariate_df['pvalue'] = chi2.sf(univariate_df['LLT'], univariate_df['deg'])
    return univariate_df, cph_model

In [4]:
# Pathways present in the data, minus the transitions skipped in this analysis.
skipped_pathways = (
    'CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B',
    'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B')
pathways = long_df['pathway'].unique().tolist()
for path in skipped_pathways:
    pathways.remove(path)

# The filtered list is replaced here: only this pathway is processed below,
# and `addition` supplies the file name used for its results.
pathways = ['CKD5A_to_CKD5B']
addition = ['CKD4_to_CKD5B']


# Guarded run: set `execute = True` to regenerate the univariate tables.
execute = False 
if execute:
    # Pass 1: fit the univariate models per pathway and write one CSV each.
    # The dataset is reloaded on every iteration because `covariates` and
    # `long_df` are modified below and `univariate_coxph` reads the
    # notebook-level `covariates` / `order_covariates`.
    for idx, pathway in enumerate(pathways):
        covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)
        covariates.remove('HF')
        # `idx >= 0` is always true, so rows with HF == 1 are dropped for every pathway.
        if idx >= 0:
            long_df = long_df[long_df['HF'] == 0].reset_index(drop = True)
        univariate, cph_model = univariate_coxph(long_df, pathway, save = False)
        # Strip the trailing space from two variable names.
        univariate['variable'] = univariate['variable'].replace('hdl_low ', 'hdl_low').replace('rua_normal ', 'rua_normal')
        # NOTE(review): the file is named after `addition[idx]`, not `pathway`;
        # this raises IndexError if `pathways` is longer than `addition`.
        univariate.to_csv(resu_path + f'univariate/LR_test/{addition[idx]}.csv', index = False)

    # Pass 2: recompute `percentage` on the full (HF included) data, restricted
    # to each pathway, and rewrite the same CSV files.
    covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)
    for idx, pathway in enumerate(pathways):
        df = long_df[long_df['pathway'] == pathway]
        univariate = pd.read_csv(resu_path + f'univariate/LR_test/{addition[idx]}.csv')
        percentage_columns = univariate['variable'].tolist()  # not used afterwards
        # Percent of the pathway's patients whose flag for the row's variable equals 1.
        univariate['percentage'] = univariate.apply(
            lambda row: df[df[row['variable']] == 1]['ENC_HN'].nunique() / df['ENC_HN'].nunique() * 100, axis = 1)
        univariate.to_csv(resu_path + f'univariate/LR_test/{addition[idx]}.csv', index = False)


# Guarded run: set `execute = True` to merge the per-pathway CSV files of the
# LR_test folder into a single workbook, one sheet per file.
execute = False
if execute:
    lr_test_dir = os.path.join(resu_path, 'univariate/LR_test')
    # List and read from the same folder. Only CSV files are collected because
    # they are read with `pd.read_csv`; this also keeps the output workbook,
    # which lives in this folder, out of the list on a re-run.
    path_list = [
        os.path.join(lr_test_dir, filename)
        for filename in sorted(os.listdir(lr_test_dir))
        if filename.endswith('.csv')]

    output_excel = resu_path + 'univariate/LR_test/univariate_results.xlsx'
    with pd.ExcelWriter(output_excel, engine = 'openpyxl') as writer:
        for file_path in path_list:
            # Sheet name is the file name without its extension.
            sheet_name = os.path.splitext(os.path.basename(file_path))[0]
            df = pd.read_csv(file_path)
            df.to_excel(writer, sheet_name = sheet_name, index = False)

# Guarded run: set `execute = True` to refresh the `percentage` column of each
# pathway's LR_test CSV from the full dataset.
execute = False 
if execute:
    covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)
    for pathway in pathways:
        lr_test_csv = resu_path + f'univariate/LR_test/{pathway}.csv'
        df = long_df[long_df['pathway'] == pathway]
        univariate = pd.read_csv(lr_test_csv)
        percentage_columns = univariate['variable'].tolist()

        def _percent_flagged(row):
            # Percent of the pathway's patients whose flag for this variable equals 1.
            flagged = df[df[row['variable']] == 1]['ENC_HN'].nunique()
            return flagged / df['ENC_HN'].nunique() * 100

        univariate['percentage'] = univariate.apply(_percent_flagged, axis = 1)
        univariate.to_csv(lr_test_csv, index = False)

In [5]:
def generate_coxph_multivariate(long_df, pathway, multivariate_covariates):
    """Fit a multi-covariate Cox model on the rows of one pathway.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long table with ``pathway``, ``tstops`` and ``status`` columns.
    pathway : str
        Value of ``pathway`` selecting the transition to model.
    multivariate_covariates : str
        Formula passed unchanged to ``CoxPHFitter.fit``.

    Returns
    -------
    The object returned by ``CoxPHFitter.fit``.
    """
    on_pathway = long_df['pathway'] == pathway
    transition_data = long_df[on_pathway].copy()
    fitter = CoxPHFitter()
    return fitter.fit(
        transition_data,
        duration_col = 'tstops',
        event_col    = 'status',
        formula = multivariate_covariates,
        step_size = 0.5)

def validate(df):
    """Swap the ``bmi_over`` and ``bmi_under`` labels in the ``covariate`` column.

    The frame is modified in place and returned. Rows already labelled
    ``temp_placeholder`` end up as ``bmi_under``.
    """
    # Capture every mask before writing so the two relabels cannot interfere.
    labels = df['covariate']
    was_over = labels == 'bmi_over'
    was_under = labels == 'bmi_under'
    was_placeholder = labels == 'temp_placeholder'

    df.loc[was_under, 'covariate'] = 'bmi_over'
    df.loc[was_over | was_placeholder, 'covariate'] = 'bmi_under'
    return df

def calculate_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix whose columns are scored.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``variable`` (column name) and ``VIF``.
    """
    design = X.values
    scores = [variance_inflation_factor(design, position)
              for position, _ in enumerate(X.columns)]
    return pd.DataFrame({'variable': X.columns, 'VIF': scores})

def cox_multi_table(cph_model):
    """Reduce a fitted model's summary to coefficient, HR, interval and p-value.

    Parameters
    ----------
    cph_model : fitted model
        Must expose a ``summary`` frame indexed by ``covariate``.

    Returns
    -------
    pandas.DataFrame
        Columns ``covariate``, ``coef``, ``HR``, ``95CI HR`` (formatted as
        ``'(lo, hi)'`` with two decimals) and ``pvalue``.
    """
    table = cph_model.summary.reset_index()
    lower = np.round(table['exp(coef) lower 95%'], 2).astype(str)
    upper = np.round(table['exp(coef) upper 95%'], 2).astype(str)
    table['95CI HR'] = '(' + lower + ', ' + upper + ')'

    discarded = ['se(coef)', '-log2(p)', 'z',
                 'coef lower 95%', 'coef upper 95%',
                 'exp(coef) lower 95%', 'exp(coef) upper 95%']
    table = table.drop(columns = discarded)
    table = table.rename(columns = {'exp(coef)': 'HR', 'p': 'pvalue'})
    return table[['covariate', 'coef', 'HR', '95CI HR', 'pvalue']]

# Reload with get_columns = True so the binned variables also get indicator
# columns; this rebinds `covariates`, so the removals made earlier no longer apply.
covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)

In [35]:
# Fit the multivariate Cox model for one pathway. The names `pathway`,
# `multivariate_covariates` and `multivariate_cox` set here are read by later cells.
pathway = 'CKD3A_to_CVD'
univariate = pd.read_excel(resu_path + f'univariate/LR_test/{pathway}.xlsx')

# Variables whose `included` flag is 1 in the workbook, joined into a formula.
multivariate_covariates = univariate[univariate['included'] == 1]['variable'].tolist()
continuous_covariates = ['bin_bmi']  # not used in this cell
multivariate_covariates = ' + '.join(multivariate_covariates)

multivariate_cox = generate_coxph_multivariate(long_df, pathway, multivariate_covariates)
multivariate_transition = cox_multi_table(multivariate_cox)
multivariate_transition['pvalue'] = np.round(multivariate_transition['pvalue'], 5)
# Full summary sorted by covariate name, with the two BMI labels swapped by `validate`.
results = np.round(multivariate_cox.summary.reset_index().sort_values(['covariate'], ascending = True), 4)
results = validate(results)
multivariate_cox.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'tstops'
event col,'status'
baseline estimation,breslow
number of observations,23429
number of events observed,6268
partial log-likelihood,-54243.39
time fit was run,2025-01-04 06:40:35 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
AF,0.3,1.35,0.04,0.22,0.38,1.24,1.46,7.41,<0.005,42.8
DM,0.23,1.26,0.03,0.17,0.29,1.18,1.33,7.59,<0.005,44.83
HT,-0.06,0.95,0.03,-0.12,0.01,0.89,1.01,-1.68,0.09,3.44
hba_high,-0.76,0.47,0.05,-0.86,-0.66,0.42,0.52,-14.95,<0.005,165.52
hba_prediabetes,-0.58,0.56,0.05,-0.67,-0.49,0.51,0.61,-12.82,<0.005,122.56
rua_hyper,-0.1,0.9,0.03,-0.15,-0.05,0.86,0.95,-3.8,<0.005,12.76
statin,0.16,1.17,0.03,0.11,0.21,1.11,1.24,5.84,<0.005,27.51
tri_high,-0.1,0.91,0.03,-0.15,-0.04,0.86,0.96,-3.52,<0.005,11.18

0,1
Concordance,0.57
Partial AIC,108502.78
log-likelihood ratio test,381.73 on 8 df
-log2(p) of ll-ratio test,255.19


In [63]:
# Yearly transition probabilities from the fitted multivariate model.
# `multivariate_covariates` is the ' + '-joined formula; split it back into column names.
covariates_list = [cov.strip() for cov in multivariate_covariates.split('+')]

covariate_values = long_df[long_df['pathway'] == pathway][covariates_list].reset_index(drop = True)

survival_func = multivariate_cox.predict_survival_function(covariate_values)
# Rescale the timeline by 12 so the integer lookups below are in the rescaled unit.
# NOTE(review): presumably months -> years; confirm the unit of `tstops`.
survival_func.index = survival_func.index / 12


def _survival_at(curves, when):
    """Return the first column's survival value at `when`, or at the nearest time."""
    if when in curves.index:
        return curves.loc[when].values[0]
    # `Index.get_loc(..., method='nearest')` was removed in pandas 2.0;
    # `get_indexer` is the supported nearest-label lookup.
    nearest = curves.index.get_indexer([when], method = 'nearest')[0]
    return curves.iloc[nearest].values[0]


transition_probabilities = []
for year in range(1, 14):
    t1 = year
    t2 = year + 1
    # NOTE(review): only the first subject's curve is used (`.values[0]`);
    # confirm whether an average over subjects is intended.
    S_t1 = _survival_at(survival_func, t1)
    S_t2 = _survival_at(survival_func, t2)

    # Probability of the event in (t1, t2] given survival to t1. The earlier
    # `1 - S_t2` ignored S_t1 and was cumulative from time zero. Intervals
    # with no survivors at t1 are skipped to avoid dividing by zero.
    if S_t1 > 0:
        transition_prob = 1 - S_t2 / S_t1
        transition_probabilities.append({
            'start': t1,
            'end':   t2,
            'transition_prob': round(transition_prob, 4)
        })
transition_prob_table = pd.DataFrame(transition_probabilities)

In [110]:
# Pathways present in the data, minus the transitions skipped in this analysis.
# `list.remove` raises ValueError if one of them is absent.
pathways = long_df['pathway'].unique().tolist()
for path in ['CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B', 'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B']:
    pathways.remove(path)

# For each pathway: fit the multivariate model on the variables flagged
# `included == 1`, then write univariate and adjusted estimates side by side.
for pathway in tqdm(pathways):
    # Selection flags come from the combined workbook; estimates from the per-pathway CSV.
    univariate = pd.read_excel(resu_path + 'univariate/univariate_results.xlsx', sheet_name = pathway)
    univariate_transition = pd.read_csv(resu_path + f'univariate/{pathway}.csv')

    multivariate_covariates = univariate[univariate['included'] == 1]['variable'].tolist()
    multivariate_covariates = ' + '.join(multivariate_covariates)

    multivariate_cox = generate_coxph_multivariate(long_df, pathway, multivariate_covariates)
    multivariate_transition = cox_multi_table(multivariate_cox)
    for var in ['coef', 'HR', 'pvalue']:
            multivariate_transition[var] = np.round(multivariate_transition[var], 2)
    multivariate_transition = replace_pvalue(multivariate_transition)
    # Prefix the adjusted estimates so they do not collide with the univariate columns.
    multivariate_transition = multivariate_transition.rename(columns = {
        'covariate': 'variable',
        'coef':    'adj_coef',
        'HR':      'adj_HR',
        '95CI HR': 'adj_95CI_HR',
        'pvalue':  'adj_pvalue'})
    # Left join keeps every univariate row; variables not in the model get empty adj_* cells.
    overall = pd.merge(univariate_transition, multivariate_transition, 
                    on = ['variable'], how = 'left')
    overall.to_csv(resu_path + f'multivariate/{pathway}.csv', index = False)

# Merge the per-pathway CSV files of the multivariate folder into one workbook,
# one sheet per file. Only CSV files are collected because they are read with
# `pd.read_csv`; Excel files saved in this folder (including the output
# workbook itself) would otherwise break a re-run.
multivariate_dir = os.path.join(resu_path, 'multivariate/')
path_list = [
    os.path.join(multivariate_dir, filename)
    for filename in sorted(os.listdir(multivariate_dir))
    if filename.endswith('.csv')]

output_excel = resu_path + 'multivariate/multivariate_results.xlsx'
with pd.ExcelWriter(output_excel, engine = 'openpyxl') as writer:
    for file_path in path_list:
        # Sheet name is the file name without its extension.
        sheet_name = os.path.splitext(os.path.basename(file_path))[0]
        df = pd.read_csv(file_path)
        df.to_excel(writer, sheet_name = sheet_name, index = False)

100%|██████████| 19/19 [00:23<00:00,  1.23s/it]


In [6]:
from lifelines.utils import concordance_index
from sklearn.metrics import brier_score_loss

def calculate_c_index_and_brier_score(model, df, time_col, event_col, time_points):
    """Score a fitted survival model with a concordance index and an averaged Brier score.

    Parameters
    ----------
    model : fitted model
        Must provide ``predict_survival_function(df, times=...)`` and
        ``predict_partial_hazard(df)``.
    df : pandas.DataFrame
        Rows to score; must contain ``time_col``, ``event_col`` and the model covariates.
    time_col, event_col : str
        Names of the duration and event-indicator columns.
    time_points : iterable of float
        Times at which the Brier score is evaluated.

    Returns
    -------
    (float, float)
        The concordance index, and the mean Brier score over the usable time
        points divided by 30 (``nan`` when no time point is usable).

    Notes
    -----
    At each time ``t`` only rows with ``time <= t`` carry weight, and no
    censoring weights are applied, so this is not an inverse-probability-of-
    censoring-weighted Brier score.
    """
    predicted_survival = model.predict_survival_function(df, times=time_points)
    # Higher hazard means shorter survival, hence the sign flip for concordance.
    c_index = concordance_index(df[time_col], -model.predict_partial_hazard(df), df[event_col])

    actual_times = df[time_col]
    actual_events = df[event_col]

    integrated_brier_score = 0
    valid_time_points = 0

    for t in time_points:
        sample_weights = (actual_times <= t)

        if sample_weights.sum() > 0:
            # The target is the event indicator, so the prediction must be the
            # event probability 1 - S(t), not the survival probability S(t).
            event_probs = 1 - predicted_survival.loc[t]
            # Passed positionally: the keyword for the probabilities was
            # renamed from `y_prob` to `y_proba` in newer scikit-learn releases.
            brier_at_t = brier_score_loss(
                actual_events,
                event_probs,
                sample_weight=sample_weights
            )
            integrated_brier_score += brier_at_t
            valid_time_points += 1

    if valid_time_points > 0:
        integrated_brier_score /= valid_time_points
    else:
        integrated_brier_score = float('nan')

    # NOTE(review): the extra division by 30 is unexplained; it is kept so
    # previously reported values stay comparable — confirm or remove.
    return c_index, integrated_brier_score/30


In [None]:
from sksurv.util import Surv
# Build the feature matrix, structured survival target and class-balancing
# weights for one pathway, then split into train / calibration / test.
transition_df = long_df[long_df['pathway'] == pathway]

# `multivariate_covariates` may still be the ' + '-joined formula string from
# the model-fitting cells; column selection needs a list of names.
if isinstance(multivariate_covariates, str):
    covariate_columns = [name.strip() for name in multivariate_covariates.split('+')]
else:
    covariate_columns = list(multivariate_covariates)
independent_df = transition_df[covariate_columns]

# Copy before adding columns so the writes do not target a slice of `transition_df`.
groundtruth_df = transition_df[['status', 'time']].copy()
groundtruth_df['status'] = groundtruth_df['status'].astype(bool)
groundtruth_df['time'] = pd.to_numeric(groundtruth_df['time'])

# Count the classes explicitly: `value_counts()[0]` on a boolean index is a
# positional lookup and returns the most frequent class, not the censored one.
n_events = int(groundtruth_df['status'].sum())
n_censored = len(groundtruth_df) - n_events
censored_weight = len(groundtruth_df) / n_censored
event_weight    = len(groundtruth_df) / n_events

groundtruth_df['sample_weight'] = groundtruth_df['status'].apply(lambda x: event_weight if x else censored_weight)

X, y = independent_df, Surv.from_dataframe('status', 'time', groundtruth_df)
# 20% held out for testing; the remaining 80% is split 70/30 into train and calibration.
X_train, X_test, y_train, y_test, sample_weight_train,   sample_weight_test  = train_test_split(X, y, groundtruth_df['sample_weight'].to_numpy(), test_size = 0.20, random_state = 42)
X_train, X_calib, y_train, y_calib, sample_weight_train, sample_weight_calib = train_test_split(
    X_train, y_train, sample_weight_train, test_size = 0.30, random_state = 42)

In [11]:
from sksurv.util import Surv

In [None]:
# Fractions of the rows given to each split
train_ratio = 0.6
calibration_ratio = 0.2
test_ratio = 0.2

# Row positions at which the train and calibration segments end
n = len(transition_df)
train_end = int(n * train_ratio)
calibration_end = train_end + int(n * calibration_ratio)

# Shuffle with a fixed seed so the positional assignment below is random but repeatable
transition_df = transition_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Label the shuffled rows by position: train first, then calibration, then test
segment_sizes = (
    ("train", train_end),
    ("calibration", calibration_end - train_end),
    ("test", n - calibration_end))
conditions = [label for label, count in segment_sizes for _ in range(count)]
transition_df["dataset"] = conditions


In [32]:
# Evaluate the multivariate Cox model of every pathway on held-out data.
# `list.remove` raises ValueError if a skipped pathway is absent from the data.
pathways = long_df['pathway'].unique().tolist()
for path in ['CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B', 'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B']:
    pathways.remove(path)
CINDEX, IBS = [], []
for pathway in tqdm(pathways):
    univariate = pd.read_excel(resu_path + f'univariate/LR_test/{pathway}.xlsx')

    # Variables flagged `included == 1`, joined into the model formula.
    multivariate_covariates = univariate[univariate['included'] == 1]['variable'].tolist()
    multivariate_covariates = ' + '.join(multivariate_covariates)

    transition_df = long_df[long_df['pathway'] == pathway]

    # Shuffle with a fixed seed and label rows 60/20/20 by position.
    train_ratio = 0.6
    calibration_ratio = 0.2
    test_ratio = 0.2
    n = len(transition_df)
    train_end = int(n * train_ratio)
    calibration_end = train_end + int(n * calibration_ratio)
    transition_df = transition_df.sample(frac = 1, random_state = 42).reset_index(drop = True)
    conditions = ['train'] * train_end + ['calibration'] * (calibration_end - train_end) + ['test'] * (n - calibration_end)
    transition_df['dataset'] = conditions

    # Fit on the training rows and score on the test rows. Fitting and scoring
    # on the same test rows, as before, reports in-sample performance.
    # The calibration rows are not used in this cell.
    train_df = transition_df[transition_df['dataset'] == 'train']
    test_df  = transition_df[transition_df['dataset'] == 'test']

    multivariate_cox = generate_coxph_multivariate(train_df, pathway, multivariate_covariates)
    multivariate_transition = cox_multi_table(multivariate_cox)
    multivariate_transition['pvalue'] = np.round(multivariate_transition['pvalue'], 5)
    results = np.round(multivariate_cox.summary.reset_index().sort_values(['covariate'], ascending = True), 4)
    results = validate(results)

    time_col  = 'tstops'
    event_col = 'status'
    time_points = np.linspace(1, 1000, 1000)
    c_index, integrated_brier = calculate_c_index_and_brier_score(multivariate_cox, test_df, time_col, event_col, time_points)
    CINDEX.append(c_index)
    IBS.append(integrated_brier)

100%|██████████| 19/19 [00:50<00:00,  2.64s/it]


In [34]:
# One row per pathway with its concordance index and Brier score, saved to Excel.
results = pd.DataFrame(data = (pathways, CINDEX, IBS)).transpose()
results = results.set_axis(['pathways', 'cindex', 'brier'], axis = 1)
results.to_excel(resu_path + 'multivariate/COXPH_RESULTS_12January2025.xlsx')

In [64]:
# Per-pathway counts of patients at risk, events and censoring, plus incidence.
# `list.remove` raises ValueError if a skipped pathway is absent from the data.
pathways = long_df['pathway'].unique().tolist()
for path in ['CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B', 'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B']:
    pathways.remove(path)

RISK, EVENTS, CENSOR, RATE, PERSON_YEARS, INCIDENCE = [], [], [], [], [], []
for pathway in tqdm(pathways):
    transition_df = long_df[long_df['pathway'] == pathway].reset_index(drop = True)
    num_risk   = transition_df['ENC_HN'].nunique()
    num_events = transition_df['status'].sum()
    num_censor = num_risk - num_events
    rate_censor = (num_censor / num_risk) * 100
    # Person-time is the follow-up of everyone at risk, censored rows included.
    # Summing only rows with an event understates the denominator and inflates the rate.
    # NOTE(review): `tstops` is used as-is; another cell divides the survival
    # timeline by 12, so it may be in months — confirm the unit before reporting
    # this as person-years.
    person_years = transition_df['tstops'].sum()
    # Events per 1000 units of person-time.
    incidence_rate = (num_events / person_years) * 1000
    RISK.append(num_risk)
    EVENTS.append(num_events)
    CENSOR.append(num_censor)
    RATE.append(rate_censor)
    PERSON_YEARS.append(person_years)
    INCIDENCE.append(incidence_rate)

results = pd.DataFrame(data = (pathways, RISK, EVENTS, CENSOR, RATE, PERSON_YEARS, INCIDENCE)).T
results.columns = ['pathways', 'risk', 'events', 'num_censor', 'censor_rate', 'person_years', 'incidence']
base_results = resu_path + 'baseline_models/'
results.to_csv(base_results + 'combined_analysis.csv', index = False)

100%|██████████| 19/19 [00:01<00:00, 13.51it/s]


In [38]:
import os
import pandas as pd

# Collect the per-model result files of the baseline folder into one table,
# one row per (transition, model), and save it as a workbook.
base_results = resu_path + 'baseline_models/'
all_files = []
for entry in os.listdir(base_results):
    candidate = os.path.join(base_results, entry)
    if os.path.isfile(candidate):
        all_files.append(candidate)

# The CoxPH workbook defines the set of transitions to report.
transition_paths = pd.read_excel(base_results + 'CoxPH.xlsx')['name'].unique().tolist()
assert len(transition_paths) == 25

model_order = ['CoxPH', 'RSF', 'SVM', 'XGBoost', 'DeepSurv']
metric_columns = ['fit_predict_time', 'cindex', 'r_cindex', 'brier', 'r_brier', 'ace']

combined_data = []

for file_path in all_files:
    # The file name (without extension) identifies the model; other files are ignored.
    model_name = os.path.splitext(os.path.basename(file_path))[0]
    if model_name not in model_order:
        continue
    if file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path, sheet_name='Sheet1')
    elif file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        continue
    for transition in transition_paths:
        transition_df = df[df['name'] == transition]
        if transition_df.empty:
            continue
        # Take the first matching row's metrics for this transition.
        record = {'transition': transition, 'model': model_name}
        for metric in metric_columns:
            record[metric] = transition_df[metric].values[0]
        combined_data.append(record)

final_df = pd.DataFrame(combined_data)
# Ordered categorical so sorting follows `model_order` rather than the alphabet.
final_df['model'] = pd.Categorical(final_df['model'], categories=model_order, ordered=True)
final_df['fit_predict_time'] = np.round(final_df['fit_predict_time'], 2)
# Concordance values are reported as percentages.
for percent_column in ('cindex', 'r_cindex'):
    final_df[percent_column] = np.round(final_df[percent_column] * 100, 2)
for score_column in ('brier', 'r_brier', 'ace'):
    final_df[score_column] = np.round(final_df[score_column], 4)
final_df = final_df.sort_values(['transition', 'model'])


output_path = base_results + 'combined_results.xlsx'
final_df.to_excel(output_path, index=False)

print(f"Combined results saved to {output_path}")


Combined results saved to G:/Shared drives/CKD_Progression/result/baseline_models/combined_results.xlsx
