In [1]:
import numpy as np
import pandas as pd
import os

import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import pyampute
import pickle 
import time
import ast 
from scipy.stats import chi2
from scipy.stats import mstats
from scipy.stats.mstats import winsorize
from scipy import stats
from xgboost import XGBRegressor
from sklearn import tree
from pyampute.ampute import MultivariateAmputation
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from lifelines import CoxPHFitter, WeibullFitter, WeibullAFTFitter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tableone import TableOne 
from os.path import isfile, join
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from statsmodels.gam.tests.test_penalized import df_autos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
from scipy.spatial import distance
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from app_transition_dict import get_transition_dict, get_transition_code
from app_init import get_multi_state_covariates, get_multi_state_cov_quartiles
from app_init import replace_covariate_labels, replace_pvalue, get_variables_cox
import warnings 
warnings.filterwarnings('ignore')

# Drive letter under which the shared project folder is mounted.
drive = 'G'

# Project folders on the shared drive (data, documents, saved intermediates, results).
data_path = f'{drive}:/Shared drives/CKD_Progression/data/'
docs_path = f'{drive}:/Shared drives/CKD_Progression/docs/'
save_path = f'{drive}:/Shared drives/CKD_Progression/save/'
resu_path = f'{drive}:/Shared drives/CKD_Progression/result/'

# Individual input files.
main_path = f'{drive}:/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
covariates_path = f'{docs_path}covariates.csv'
removecols_path = f'{docs_path}remove_columns.csv'

def generate_df_continuous(df, variables, q = 3):
    """Quantile-bin continuous columns and append one indicator column per bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The bin-code columns are written onto this object as well
        as onto the returned frame.
    variables : iterable of (variable, prefix, column_name)
        `variable` is the column to bin, `column_name` receives the integer bin
        codes and `prefix` is used for the indicator column names.
    q : int, default 3
        Number of quantiles passed to `pd.qcut`.

    Returns
    -------
    (pandas.DataFrame, dict)
        The frame with bin-code and indicator columns, and a dict mapping each
        `variable` to the bin edges returned by `pd.qcut`.
    """
    edges_by_variable = {}
    for source_col, dummy_prefix, bin_col in variables:
        # duplicates = 'drop' merges repeated quantile edges, so fewer than q bins may result.
        codes, edges = pd.qcut(
            df[source_col], q = q, labels = False, duplicates = 'drop', retbins = True)
        df[bin_col] = codes
        edges_by_variable[source_col] = edges
        indicator_cols = pd.get_dummies(df[bin_col], prefix = dummy_prefix)
        df = pd.concat([df, indicator_cols], axis = 1)
    return df, edges_by_variable

def generate_df_continuous_predefined(df, variables, get_columns = False):
    """Bin continuous columns with caller-supplied edges and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; the binned columns are written onto it.
    variables : iterable of (variable, prefix, column_name, bins, labels)
        `variable` is cut with `pd.cut(..., bins = bins, labels = labels,
        right = False)` (left-closed intervals) and stored in `column_name`.
    get_columns : bool, default False
        When True, indicator columns named with `prefix` are appended for
        each binned column.

    Returns
    -------
    pandas.DataFrame
        The frame with the binned (and optionally indicator) columns.
    """
    for source_col, dummy_prefix, bin_col, edges, names in variables:
        df[bin_col] = pd.cut(df[source_col], bins = edges, labels = names, right = False)
        if not get_columns:
            continue
        indicator_cols = pd.get_dummies(df[bin_col], prefix = dummy_prefix)
        df = pd.concat([df, indicator_cols], axis = 1)
    return df

def get_first_dates():
    """Return the unique `ENC_HN` values found in each comorbidity file.

    Reads the HF and HTN Excel files and the DM and AF CSV files
    (`*_FIRSTDATE_2010_2023`) from `docs_path`.

    Returns
    -------
    tuple of list
        (heart_failure, hypertension, diabetes, atrialfb), each a list of the
        distinct `ENC_HN` values in the corresponding file.
    """
    def _unique_ids(reader, filename):
        # Only the identifier column is used from each file.
        return reader(docs_path + filename)['ENC_HN'].unique().tolist()

    heart_failure = _unique_ids(pd.read_excel, 'HF_FIRSTDATE_2010_2023.xlsx')
    hypertension = _unique_ids(pd.read_excel, 'HTN_FIRSTDATE_2010_2023.xlsx')
    diabetes = _unique_ids(pd.read_csv, 'DM_FIRSTDATE_2010_2023.csv')
    atrialfb = _unique_ids(pd.read_csv, 'AF_FIRSTDATE_2010_2023.csv')
    return heart_failure, hypertension, diabetes, atrialfb

def merge_comorbidity(df, comorbidity, disease_code):
    """Add a 0/1 column flagging rows whose `ENC_HN` is in `comorbidity`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an `ENC_HN` column; modified in place.
    comorbidity : list-like
        `ENC_HN` values that have the condition.
    disease_code : str
        Column name for the flag; it is upper-cased before use.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new flag column.
    """
    has_condition = df['ENC_HN'].isin(comorbidity)
    df[disease_code.upper()] = has_condition.astype(int)
    return df

In [2]:
def load_dataset(version = '13', get_columns = False):
    """Load the long-format multi-state table and derive the modelling columns.

    Parameters
    ----------
    version : str, default '13'
        Suffix of the `multi_state_long_ver0{version}.csv` file in `save_path`.
    get_columns : bool, default False
        Forwarded to `generate_df_continuous_predefined`; when True, indicator
        columns are appended for every binned variable.

    Returns
    -------
    (covariates, order_covariates, long_df)
        `covariates` is the value returned by `get_multi_state_cov_quartiles()`,
        `order_covariates` is `cox_covariates.csv` read from `docs_path`, and
        `long_df` is the prepared long-format frame.
    """
    covariates, variables = get_multi_state_cov_quartiles(), get_variables_cox()
    order_covariates = pd.read_csv(docs_path + 'cox_covariates.csv')
    # NOTE(review): the hypertension ID list is loaded but never merged below,
    # unlike HF / DM / AF -- confirm whether an HTN flag is intentionally omitted.
    heart_failure, _hypertension, diabetes, atrialfb = get_first_dates()

    long_df = pd.read_csv(save_path + f'multi_state_long_ver0{version}.csv')
    long_df['gender']  = long_df['gender'].replace({'M': 1, 'F': 0})
    long_df['pathway'] = long_df['fr'] + '_to_' + long_df['to']
    long_df = generate_df_continuous_predefined(long_df, variables, get_columns = get_columns)

    # Combined drug-class flags: 1 when either component column is 1.
    long_df['statin'] = long_df[['statinhydro', 'statinlipo']].max(axis = 1)
    long_df['raas']   = long_df[['arb', 'acei']].max(axis = 1)
    long_df = long_df.drop(columns = ['statinhydro', 'statinlipo'])

    for patient_ids, disease_code in ((heart_failure, 'hf'), (diabetes, 'dm'), (atrialfb, 'af')):
        long_df = merge_comorbidity(long_df, patient_ids, disease_code)
    return covariates, order_covariates, long_df


# Load the long-format data without indicator columns, then take three
# covariates out of the list that univariate_coxph iterates over.
covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = False)
for excluded_covariate in ('Gout', 'PHOS_BINDER', 'ANTI_PL'):
    covariates.remove(excluded_covariate)

In [3]:
def generate_coxph(long_df, covariate, pathway):
    """Fit a single-covariate Cox model on the rows of one pathway.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format table with `pathway`, `time`, `status` and the covariate.
    covariate : str
        Column name; it is wrapped as `C(<covariate>)` in the model formula,
        i.e. always treated as categorical.
    pathway : str
        Value of the `pathway` column selecting the rows to fit on.

    Returns
    -------
    The object returned by `CoxPHFitter.fit`.
    """
    on_pathway = long_df['pathway'] == pathway
    transition_data = long_df[on_pathway].copy()
    fitter = CoxPHFitter()
    return fitter.fit(
        transition_data,
        duration_col = 'time',
        event_col = 'status',
        formula = f'C({covariate})',
        step_size = 0.1,
        show_progress = False)

def coxph_statistics(result):
    """Summarise the first coefficient of a fitted Cox model.

    Parameters
    ----------
    result : fitted model
        Must expose `hazard_ratios_`, `standard_errors_`,
        `confidence_intervals_`, `_compute_p_values()` and `event_observed`.

    Returns
    -------
    tuple
        (hazard ratio, '(low, high)' hazard-ratio interval string, p-value,
        standard error, number of observed events, number of rows), with the
        numeric statistics rounded to 2 decimals.
    """
    roundup = 2
    # `.iloc[0]` is explicit positional access: these Series are indexed by
    # covariate label, where a bare `[0]` is deprecated in pandas 2.x.
    hazd = np.round(result.hazard_ratios_, roundup).iloc[0]
    ster = np.round(result.standard_errors_.iloc[0], roundup)

    # reset_index() yields a 0..n-1 index, so label 0 is the first coefficient.
    intervals = result.confidence_intervals_.reset_index()
    coef_low = intervals.loc[0, '95% lower-bound']
    coef_upp = intervals.loc[0, '95% upper-bound']
    # The bounds are on the coefficient scale; exponentiate to get hazard ratios.
    confidence_interval = f'({np.round(np.exp(coef_low), roundup)}, {np.round(np.exp(coef_upp), roundup)})'

    # NOTE(review): `_compute_p_values` is a private method of the fitter.
    pvalue = np.round(np.asarray(result._compute_p_values()), roundup)[0]
    events_obs = result.event_observed.sum()
    events_tot = result.event_observed.shape[0]
    return hazd, confidence_interval, pvalue, ster, events_obs, events_tot

def get_log_likelihood(cph_model):
    """Compute the likelihood-ratio statistic of a fitted model against its null.

    Parameters
    ----------
    cph_model : fitted model
        Must expose `params_`, `_ll_null_` and `log_likelihood_`.

    Returns
    -------
    tuple
        (degrees of freedom, null log-likelihood, fitted log-likelihood,
        LR statistic, chi-square survival-function p-value).
    """
    null_ll, fitted_ll = cph_model._ll_null_, cph_model.log_likelihood_
    # One degree of freedom per estimated coefficient.
    n_params = len(cph_model.params_)
    lr_statistic = -2 * (null_ll - fitted_ll)
    return n_params, null_ll, fitted_ll, lr_statistic, chi2.sf(lr_statistic, n_params)

def cox_table(cph_model, variable):
    """Build the per-covariate result table for one fitted univariate model.

    The table has one reference row for `variable` (coef 0, HR 1) followed by
    one row per coefficient in `cph_model.summary`.

    Parameters
    ----------
    cph_model : fitted model
        Must expose `summary`, `params_` and `log_likelihood_ratio_test()`.
    variable : str
        Name of the covariate the model was fitted on.

    Returns
    -------
    pandas.DataFrame
        Columns: covariate, coef, HR, 95CI HR, pvalue, LLT, deg. For the
        variables listed in `categorical`, the reference row additionally
        carries the overall likelihood-ratio statistic, its degrees of
        freedom and its p-value.
    """
    categorical = ['bin_bmi', 'bin_glu', 'bin_hba']
    test_statistics = cph_model.log_likelihood_ratio_test().test_statistic

    model_rows = cph_model.summary.reset_index()
    model_rows['95CI HR'] = ('(' + np.round(model_rows['exp(coef) lower 95%'], 2).astype(str) + ', '
                             + np.round(model_rows['exp(coef) upper 95%'], 2).astype(str) + ')')
    model_rows = model_rows.rename(columns = {'exp(coef)': 'HR', 'p': 'pvalue'})
    model_rows = model_rows[['covariate', 'coef', 'HR', '95CI HR', 'pvalue']].copy()
    model_rows['LLT'] = test_statistics
    model_rows['deg'] = len(cph_model.params_)

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    reference_row = pd.DataFrame({'covariate': [variable], 'coef': [0], 'HR': [1],
                                  '95CI HR': np.nan, 'pvalue': np.nan})
    univariate_covariate = pd.concat([reference_row, model_rows], ignore_index = True)

    if variable in categorical:
        deg, LL0, LL1, LLR, p_value = get_log_likelihood(cph_model)
        # After ignore_index the reference row sits at index 0. Addressing it by
        # the string `variable` would append a new, otherwise-empty row instead
        # of updating it.
        univariate_covariate.loc[0, 'LLT'] = LLR
        univariate_covariate.loc[0, 'deg'] = deg
        univariate_covariate.loc[0, 'pvalue'] = p_value
    return univariate_covariate


def univariate_coxph(df, pathway, save = False):
    """Fit one univariate Cox model per covariate for a pathway and tabulate them.

    Iterates over the module-level `covariates` list and joins the results with
    the module-level `order_covariates` table (columns `covariate`, `variable`,
    `replace`, `include`, `order` are used).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table passed to `generate_coxph`.
    pathway : str
        Pathway whose rows are used for fitting.
    save : bool, default False
        Accepted for compatibility; not used by this function.

    Returns
    -------
    (pandas.DataFrame, fitted model)
        The result table and the model fitted for the last covariate.

    Raises
    ------
    ValueError
        If `covariates` is empty.
    """
    # Loop-invariant denominator.
    # NOTE(review): this and `percentage` below are computed on the whole `df`
    # passed in, not on the rows of `pathway` -- confirm that is intended.
    n_patients = df['ENC_HN'].nunique()

    univariate_list = []
    cph_model = None
    for covariate in tqdm(covariates):
        cph_model = generate_coxph(df, covariate, pathway)
        univariate_covariate = cox_table(cph_model, covariate)
        univariate_covariate['percentage'] = df[df[covariate] == 1]['ENC_HN'].nunique() / n_patients
        univariate_list.append(univariate_covariate)
    if not univariate_list:
        raise ValueError('No covariates to fit')

    univariate_df = pd.concat(univariate_list, axis = 0)
    univariate_df = pd.merge(univariate_df, order_covariates, on = 'covariate', how = 'inner')
    univariate_df = univariate_df[univariate_df['include'] == 1]
    univariate_df = univariate_df.sort_values(['order'], ascending = True)
    univariate_df['covariate'] = univariate_df['replace']
    # Event count comes from the last fitted model in the loop.
    univariate_df['patient_observed'] = cph_model.event_observed.sum()
    univariate_df['patient_risk'] = n_patients
    univariate_df = univariate_df[['variable', 'covariate', 'patient_observed', 'patient_risk', 'LLT', 'deg', 'percentage', 'coef', 'HR', '95CI HR', 'pvalue']]

    # LLT is rounded only after the p-value below has been derived from it, so
    # the p-value is not computed from a 2-decimal statistic.
    for var in ['coef', 'HR', 'pvalue', 'percentage']:
        univariate_df[var] = np.round(univariate_df[var], 2)
    univariate_df = replace_pvalue(univariate_df)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    univariate_df['included'] = np.nan
    # NOTE(review): this overwrites whatever replace_pvalue left in `pvalue`
    # with the likelihood-ratio p-value -- confirm that is intended.
    univariate_df['pvalue'] = chi2.sf(univariate_df['LLT'], univariate_df['deg'])
    univariate_df['LLT'] = np.round(univariate_df['LLT'], 2)
    return univariate_df, cph_model

In [None]:
# Pathways that are removed from the list before fitting.
skipped_pathways = ('CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B',
                    'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B')
pathways = long_df['pathway'].unique().tolist()
for skipped in skipped_pathways:
    pathways.remove(skipped)

# Two variable labels carry a trailing space; normalise them before saving.
variable_fixes = {'hdl_low ': 'hdl_low', 'rua_normal ': 'rua_normal'}
for pathway in pathways:
    print(pathway)
    univariate, cph_model = univariate_coxph(long_df, pathway, save = False)
    univariate['variable'] = univariate['variable'].replace(variable_fixes)
    univariate.to_csv(resu_path + f'univariate/feature_selection/{pathway}.csv', index = False)

# Reload with indicator columns so every name in the saved `variable` column
# can be looked up as a 0/1 column, then rewrite `percentage` per pathway.
covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)
for pathway in pathways:
    df = long_df[long_df['pathway'] == pathway]
    # Denominator is the same for every row of this pathway; compute it once.
    n_patients = df['ENC_HN'].nunique()
    result_file = resu_path + f'univariate/feature_selection/{pathway}.csv'
    univariate = pd.read_csv(result_file)
    # Percent of the pathway's distinct patients with the indicator equal to 1.
    univariate['percentage'] = univariate['variable'].map(
        lambda column: df[df[column] == 1]['ENC_HN'].nunique() / n_patients * 100)
    univariate.to_csv(result_file, index = False)

# Collect the per-pathway CSVs into one workbook, one sheet per file.
feature_selection_dir = os.path.join(resu_path, 'univariate/feature_selection')
# Build each path from the directory that was actually listed, and take only
# CSVs: every file is read with read_csv, and the output workbook itself lives
# in this folder, so it must not be picked up on a re-run.
path_list = [
    os.path.join(feature_selection_dir, filename)
    for filename in sorted(os.listdir(feature_selection_dir))
    if filename.endswith('.csv')]

output_excel = resu_path + 'univariate/feature_selection/univariate_results.xlsx'
with pd.ExcelWriter(output_excel, engine = 'openpyxl') as writer:
    for file_path in path_list:
        # Sheet is named after the file (the pathway) without its extension.
        sheet_name = os.path.splitext(os.path.basename(file_path))[0]
        sheet_df = pd.read_csv(file_path)
        sheet_df.to_excel(writer, sheet_name = sheet_name, index = False)

CKD3A_to_CKD3B


100%|██████████| 25/25 [01:56<00:00,  4.66s/it]


CKD3A_to_CVD


100%|██████████| 25/25 [01:50<00:00,  4.40s/it]


CKD3A_to_DEAD


100%|██████████| 25/25 [01:04<00:00,  2.57s/it]


CVD_to_CKD3B


100%|██████████| 25/25 [00:32<00:00,  1.30s/it]


CVD_to_CKD4


100%|██████████| 25/25 [00:43<00:00,  1.73s/it]


CVD_to_CKD5A


100%|██████████| 25/25 [01:00<00:00,  2.43s/it]


CVD_to_CKD5B


100%|██████████| 25/25 [00:39<00:00,  1.57s/it]


CVD_to_DEAD


100%|██████████| 25/25 [00:37<00:00,  1.50s/it]


CKD3B_to_CKD4


100%|██████████| 25/25 [01:14<00:00,  2.97s/it]


CKD3B_to_CVD


100%|██████████| 25/25 [01:16<00:00,  3.05s/it]


CKD3B_to_DEAD


100%|██████████| 25/25 [00:38<00:00,  1.53s/it]


CKD4_to_CKD5A


100%|██████████| 25/25 [00:45<00:00,  1.83s/it]


CKD4_to_CVD


100%|██████████| 25/25 [00:32<00:00,  1.31s/it]


CKD4_to_DEAD


100%|██████████| 25/25 [00:23<00:00,  1.04it/s]


CKD5A_to_CKD5B


100%|██████████| 25/25 [00:36<00:00,  1.47s/it]


CKD5A_to_CVD


100%|██████████| 25/25 [00:19<00:00,  1.28it/s]


CKD5A_to_DEAD


100%|██████████| 25/25 [00:17<00:00,  1.40it/s]


CKD5B_to_CVD


100%|██████████| 25/25 [00:27<00:00,  1.08s/it]


CKD5B_to_DEAD


100%|██████████| 25/25 [00:16<00:00,  1.55it/s]


In [None]:
# Reload with indicator columns so every name in the saved `variable` column
# can be looked up as a 0/1 column, then rewrite `percentage` per pathway.
covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)
for pathway in pathways:
    df = long_df[long_df['pathway'] == pathway]
    # Denominator is the same for every row of this pathway; compute it once.
    n_patients = df['ENC_HN'].nunique()
    result_file = resu_path + f'univariate/LR_test/{pathway}.csv'
    univariate = pd.read_csv(result_file)
    # Percent of the pathway's distinct patients with the indicator equal to 1.
    univariate['percentage'] = univariate['variable'].map(
        lambda column: df[df[column] == 1]['ENC_HN'].nunique() / n_patients * 100)
    univariate.to_csv(result_file, index = False)