In [3]:
import numpy as np
import pandas as pd
import os

import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import pyampute
import pickle 
import time

from scipy.stats import mstats
from scipy.stats.mstats import winsorize
from scipy import stats
from xgboost import XGBRegressor
from sklearn import tree
from pyampute.ampute import MultivariateAmputation
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from lifelines import CoxPHFitter, WeibullFitter, WeibullAFTFitter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tableone import TableOne 
from os.path import isfile, join
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from statsmodels.gam.tests.test_penalized import df_autos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
from scipy.spatial import distance
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from app_transition_dict import get_transition_dict
import warnings 
warnings.filterwarnings('ignore')


# Shared-drive locations used throughout the notebook; every path hangs off `drive`.
drive = 'M'
main_path = f'{drive}:/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
data_path = f'{drive}:/Shared drives/CKD_Progression/data/'
docs_path = f'{drive}:/Shared drives/CKD_Progression/docs/'
save_path = f'{drive}:/Shared drives/CKD_Progression/save/'
resu_path = f'{drive}:/Shared drives/CKD_Progression/result/'
# Files that live in the docs folder.
covariates_path = f'{docs_path}covariates.csv'
removecols_path = f'{docs_path}remove_columns.csv'

In [5]:
def get_covariates(_path = 'M:/Shared drives/CKD_MS/multi_stage_CKD/data/covariates_080823.xlsx'):
    '''
        Read the covariate catalogue and return variable names grouped by type.

        _path : Excel workbook that must contain the columns 'variable',
                'ignore' and 'variable_type'.

        Returns a 4-tuple of lists:
          1. variables whose 'ignore' cell is not 'IGNORE'
          2. variables typed 'ordinal'
          3. variables typed 'continuous'
          4. variables typed 'dichotomous'
        Only the first list honours the 'ignore' flag; the three per-type
        lists are taken from the full catalogue.
    '''
    catalogue = pd.read_excel(_path)
    names = catalogue['variable']
    kinds = catalogue['variable_type']

    def names_of(kind):
        # Variable names whose declared type equals `kind`, in file order.
        return names[kinds == kind].tolist()

    kept = names[catalogue['ignore'] != 'IGNORE'].tolist()
    return kept, names_of('ordinal'), names_of('continuous'), names_of('dichotomous')
    
def univariate_df(patients, covariates, path = save_path + 'qoc_cohort_ver002.csv'):
    '''
        Load the cohort CSV and return the requested columns for the given patients.

        NOTE: a second `univariate_df` is defined further down in this notebook;
        once that cell has run it replaces this definition.

        patients   : iterable of ENC_HN identifiers to keep.
        covariates : list of column names to return.
        path       : CSV file to read.

        Returns a DataFrame restricted to rows whose ENC_HN is in `patients`
        and to the columns listed in `covariates`.
    '''
    cohort = pd.read_csv(path)
    # Filter on ENC_HN before selecting columns so the filter still works
    # when 'ENC_HN' itself is not part of `covariates`.
    cohort = cohort[cohort['ENC_HN'].isin(patients)]
    return cohort[covariates]

def generate_coxph(da, covariate, time, event):
    '''
        Fit a CoxPHFitter (robust = True) on the selected columns of `da`.

        da        : DataFrame holding the covariate, duration and event columns.
        covariate : list of covariate column names.
        time      : name of the duration column.
        event     : name of the event-indicator column.

        Returns whatever CoxPHFitter.fit returns (the fitted model).
    '''
    model_columns = covariate + [time, event]
    fitter = CoxPHFitter()
    return fitter.fit(da[model_columns], duration_col = time, event_col = event, robust = True)

def generate_weibull(da, covariate, time, event):
    '''
        Fit a univariate WeibullFitter on the rows of `da` whose duration is not 0.

        Reading the fitted shape parameter rho:
          rho > 1 : the rate of failure increases over time
          rho = 1 : exponential distribution
          rho = 2 : Rayleigh distribution

        The `covariate` columns are selected alongside `time` and `event`
        but only the duration and event columns are passed to the fitter.
    '''
    nonzero_rows = da.loc[da[time] != 0, covariate + [time, event]]
    fitter = WeibullFitter()
    return fitter.fit(
        durations = nonzero_rows[time],
        event_observed = nonzero_rows[event],
        show_progress = True,
    )

def generate_weibull_aft(da, covariate, time, event):
    '''
        Fit a WeibullAFTFitter on the rows of `da` whose duration is not 0.

        da        : DataFrame holding the covariate, duration and event columns.
        covariate : list of covariate column names.
        time      : name of the duration column.
        event     : name of the event-indicator column.

        Returns whatever WeibullAFTFitter.fit returns (the fitted model).
    '''
    nonzero_rows = da.loc[da[time] != 0, covariate + [time, event]]
    fitter = WeibullAFTFitter()
    return fitter.fit(nonzero_rows, duration_col = time, event_col = event)

def replace_covariate_labels(data):
    '''
        Swap raw covariate codes in data['covariate'] for display labels.

        The column is overwritten on `data` itself and the same object is
        returned. Values without an entry in the mapping are left untouched.
    '''
    display_labels = {
        'gender': 'Gender',
        'ANTI_PL': 'Anti-platelet drugs',
        'PHOS_BINDER': 'Phosphate binder drugs',
        'dpp4': 'DPP-4 inhibitor',
        # NOTE(review): GLP-1 drugs are usually described as receptor agonists;
        # label kept as-is, confirm the intended wording.
        'glp1': 'GLP-1 inhibitor',
        'sglt2': 'SGLT-2 inhibitor',
        'acei': 'ACEI',
        'arb': 'ARB',
        'bb': 'Beta Blockers',
        'statinhydro': 'Hydrophilic Statin drugs',
        'statinlipo': 'Lipophilic Statin drugs',
        'HT': 'Hypertension',
        'PVD': 'Peripheral Vascular Disease',
        'stroke': 'Stroke',
        'DLP': 'Dyslipidemia',
        'Gout': 'Gout',
        'T2DM': 'Type 2 DM',
    }
    # No label is itself a key of the mapping, so one dictionary replace gives
    # the same result as replacing the codes one at a time.
    data['covariate'] = data['covariate'].replace(display_labels)
    return data

def replace_pvalue(data):
    '''
        Rewrite data['pvalue'] so that every value <= 0.001 becomes the
        string '<0.001'; other values are kept as they are.

        The column is overwritten on `data` itself and the same object is returned.
    '''
    threshold = 0.001
    small_label = '<0.001'
    formatted = []
    for value in data['pvalue']:
        formatted.append(small_label if value <= threshold else value)
    data['pvalue'] = formatted
    return data

def coxph_statistics(result):
    '''
        Summarise the FIRST covariate of a fitted Cox model.

        result : fitted model exposing `hazard_ratios_`, `standard_errors_`,
                 `confidence_intervals_` (with '95% lower-bound' and
                 '95% upper-bound' columns) and `_compute_p_values()`.

        Returns (hazard ratio, confidence interval string, p-value, standard error),
        each numeric value rounded to 4 decimals. The interval bounds are
        exponentiated before formatting as '(low, high)'.
    '''
    roundup = 4
    # The fitted attributes are indexed by covariate name, so the first entry
    # is taken by position with .iloc; integer keys via [] on a label-indexed
    # Series are deprecated in pandas.
    hazd = np.round(result.hazard_ratios_, roundup).iloc[0]
    ster = np.round(result.standard_errors_.iloc[0], roundup)
    first_interval = result.confidence_intervals_.iloc[0]
    coef_low = first_interval['95% lower-bound']
    coef_upp = first_interval['95% upper-bound']
    confidence_interval = f'({np.round(np.exp(coef_low), roundup)}, {np.round(np.exp(coef_upp), roundup)})'
    # NOTE(review): _compute_p_values is a private method of the model object;
    # kept to preserve the original numbers.
    pvalue = np.round(np.asarray(result._compute_p_values()), roundup)[0]
    return hazd, confidence_interval, pvalue, ster

In [6]:
(covariates,
 covariates_ord,
 covariates_con,
 covariates_dic) = get_covariates()

# These three names are taken out of the dichotomous list; list.remove raises
# ValueError if any of them is missing from the catalogue.
for _dropped in ('dead', 'CVD', 'T2DM'):
    covariates_dic.remove(_dropped)

# Multistage table and the patient identifiers it contains.
multistage = pd.read_csv(f'{save_path}multistage_ver001.csv')
ms_columns = multistage.columns.tolist()
multistage = multistage[ms_columns]
patients = multistage['ENC_HN'].tolist()

In [7]:
def get_first_dates():
    '''
        Load the first-date tables for heart failure, hypertension and diabetes
        from `docs_path`, with their date columns parsed by pd.to_datetime.

        Returns (heart_failure, hypertension, diabetes):
          heart_failure : all columns of the workbook, 'First date' renamed to 'hf_date'
          hypertension  : only 'ENC_HN' and 'htn_date' (renamed from 'D001KEY')
          diabetes      : all columns of the CSV, 'T2DM_date' parsed as datetime
    '''
    heart_failure = (
        pd.read_excel(docs_path + 'HF_FIRSTDATE_2010_2023.xlsx')
        .rename(columns = {'First date': 'hf_date'})
        .assign(hf_date = lambda table: pd.to_datetime(table['hf_date']))
    )

    hypertension = (
        pd.read_excel(docs_path + 'HTN_FIRSTDATE_2010_2023.xlsx')
        .rename(columns = {'D001KEY': 'htn_date'})
        .assign(htn_date = lambda table: pd.to_datetime(table['htn_date']))
        [['ENC_HN', 'htn_date']]
    )

    # The date column already carries its final name, so no rename is needed here.
    diabetes = (
        pd.read_csv(docs_path + 'DM_FIRSTDATE_2010_2023.csv')
        .assign(T2DM_date = lambda table: pd.to_datetime(table['T2DM_date']))
    )
    return heart_failure, hypertension, diabetes

def merge_comorbidity(df, comorbidity, disease_code):
    '''
        Outer-merge a first-date table into the visit table and flag the
        rows that carry a first date.

        df           : visit table with 'ENC_HN' and 'visit_date' columns.
        comorbidity  : table with 'ENC_HN' and '<disease_code>_date' columns.
        disease_code : prefix of the date column; its upper-case form names
                       the 0/1 flag column added to the result.

        Rows that exist only in `comorbidity` get their 'visit_date' filled
        from the first date. Rows still lacking a 'visit_date' are dropped,
        and the result is sorted by 'ENC_HN' then 'visit_date'.
    '''
    date_column = f'{disease_code}_date'
    flag_column = disease_code.upper()
    combined = df.merge(
        comorbidity,
        how = 'outer',
        left_on = ['ENC_HN', 'visit_date'],
        right_on = ['ENC_HN', date_column],
    )
    combined['visit_date'] = combined['visit_date'].fillna(combined[date_column])
    ordered = (
        combined[combined['visit_date'].notna()]
        .sort_values(['ENC_HN', 'visit_date'], ascending = [True, True])
    )
    # 1 only on the row whose visit date matched (or was filled from) the first date.
    ordered[flag_column] = np.where(ordered[date_column].notna(), 1, 0)
    return ordered

# Load the three first-date tables once, as module-level frames.
heart_failure, hypertension, diabetes = get_first_dates()

In [8]:
def univariate_df(patients, covariates, path = save_path + 'qoc_cohort_ver002.csv'):
    '''
        Load the cohort CSV, derive the combined drug indicators and return the
        requested columns for the given patients.

        patients   : iterable of ENC_HN identifiers to keep.
        covariates : list of column names to return.
        path       : CSV file to read.

        Derived columns (row-wise maximum of their component columns):
          ANTI_PL     : asa, other_antipl, p2y12, pde
          PHOS_BINDER : phos_binder_alu, phos_binder_ca,
                        phos_binder_lanthanum, phos_binder_sevelamer
    '''
    cohort = pd.read_csv(path)
    cohort['ANTI_PL']     = cohort[['asa', 'other_antipl', 'p2y12', 'pde']].max(axis = 1)
    cohort['PHOS_BINDER'] = cohort[['phos_binder_alu', 'phos_binder_ca', 'phos_binder_lanthanum', 'phos_binder_sevelamer']].max(axis = 1)
    # NOTE(review): every missing value in every column is imputed as 1 here,
    # which for 0/1 indicators means "present" - confirm this is intended.
    cohort = cohort.fillna(1)
    # Filter on ENC_HN before selecting columns so the filter still works
    # when 'ENC_HN' itself is not part of `covariates`.
    cohort = cohort[cohort['ENC_HN'].isin(patients)]
    return cohort[covariates]

# Cohort covariates joined to the multistage table on ENC_HN; indicator = True
# adds a '_merge' column to the result.
df = (
    univariate_df(patients, ['ENC_HN'] + covariates_dic)
    .merge(multistage, on = 'ENC_HN', how = 'inner', indicator = True)
)

# Encode gender labels as integers.
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

# Columns of the merged frame whose name contains '_months'.
transition_period_columns = df.columns[
    df.columns.str.contains('_months')
]

In [9]:
def univariate_coxph(df, time, event='CKD3A_status', save=False, covariates=None):
    '''
        Fit one single-covariate Cox model per covariate and tabulate the results.

        df         : DataFrame holding the covariate, duration and event columns.
        time       : name of the duration column.
        event      : name of the event-indicator column.
        save       : when True, write the table to
                     resu_path + 'univariate/univariate_<time without "_months">.csv'.
        covariates : covariate names to screen; defaults to the module-level
                     `covariates_dic` list.

        Returns a DataFrame with columns covariate, hazard, 95CI, pvalue, SE.
        A covariate whose fit raises is reported on stdout and recorded as NaN
        so that one failing model does not abort the whole screen.
    '''
    if covariates is None:
        covariates = covariates_dic
    covariates = list(covariates)

    HAZARD, CONF_INT, PVAL, SERROR = [], [], [], []
    for covariate in covariates:
        try:
            cph_model = generate_coxph(df, [covariate], time, event)
            hazd, confidence_interval, pvalue, ster = coxph_statistics(cph_model)
        except Exception as e:
            # Deliberate best-effort: keep going and mark this covariate as missing.
            print(f"Error with covariate {covariate}: {e}")
            hazd = confidence_interval = pvalue = ster = np.nan
        HAZARD.append(hazd)
        CONF_INT.append(confidence_interval)
        PVAL.append(pvalue)
        SERROR.append(ster)

    data = {'covariate': covariates, 'hazard': HAZARD, '95CI': CONF_INT, 'pvalue': PVAL, 'SE': SERROR}
    univariate_covariate = pd.DataFrame(data)
    if save:
        path = resu_path + 'univariate/'
        # Create the output folder on first use instead of failing in to_csv.
        os.makedirs(path, exist_ok=True)
        # Strip the '_months' suffix only when it is really there, rather than
        # blindly cutting the last seven characters.
        label = str(time)
        if label.endswith('_months'):
            label = label[:-len('_months')]
        univariate_covariate.to_csv(path + 'univariate_' + label + '.csv', index=False)
    return univariate_covariate

In [8]:
# Columns of the multistage table whose name contains '_months'; they are
# addressed by position below, so their order in the file matters.
transition_period_columns = multistage.columns[multistage.columns.str.contains('_months')]
gold_standard = df.copy()
execute = True

# (position in transition_period_columns, event column) for every univariate
# screen to run, in execution order.
# NOTE(review): position 6 is not analysed - confirm that this is intentional.
transition_events = [
    (0,  'CKD3B_status'), (1,  'CKD04_status'), (2,  'CKD5A_status'),
    (3,  'CKD5B_status'), (4,  'CVD00_status'), (5,  'DEATH_status'),

    (7,  'CKD04_status'), (8,  'CKD5A_status'), (9,  'CKD5B_status'),
    (10, 'CVD00_status'), (11, 'DEATH_status'),

    (12, 'CKD5A_status'), (13, 'CKD5B_status'), (14, 'CVD00_status'),
    (15, 'DEATH_status'),

    (16, 'CKD5B_status'), (17, 'CVD00_status'), (18, 'DEATH_status'),

    (19, 'CVD00_status'), (20, 'DEATH_status'),

    (21, 'CKD3B_status'), (22, 'CKD04_status'), (23, 'CKD5A_status'),
    (24, 'CKD5B_status'), (25, 'DEATH_status'),
]

if execute:
    # Each call is made for its side effect (save = True writes one CSV per
    # transition); the returned tables are not kept.
    for column_position, event_column in transition_events:
        univariate_coxph(gold_standard,
                         time = transition_period_columns[column_position],
                         event = event_column,
                         save = True)

In [13]:
folder_path = resu_path + 'univariate/'

# Result files in the folder, in sorted order, and their full paths.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith(('.csv', '.xlsx')))
path_list = [folder_path + file for file in csv_files]

df_list = []
for file in path_list:
    file_name = os.path.basename(file)
    # 'univariate_ckd3a_to_ckd3b.csv' -> 'CKD3A to CKD3B'
    transition = file_name.split('.')[0]
    if transition.startswith('univariate_'):
        transition = transition[len('univariate_'):]
    transition = transition.upper().replace('_', ' ').replace('TO', 'to')
    # Workbooks are collected as well, so read each file with the matching reader.
    reader = pd.read_excel if file_name.endswith('.xlsx') else pd.read_csv
    transition_df = reader(file)
    transition_df['transition'] = transition
    df_list.append(transition_df)

# NOTE: this rebinds `df`, which earlier cells use for the cohort frame
# (e.g. `gold_standard = df.copy()`); re-run those cells from the top if needed.
df = pd.concat(df_list, ignore_index = True)
# univariate_coxph writes its files with index = False, so the first column is
# 'covariate' and must be kept for replace_covariate_labels. Only a stray saved
# index column ('Unnamed: ...') is dropped - presumably left by files written
# with an index; verify against the folder contents.
df = df.loc[:, ~df.columns.astype(str).str.startswith('Unnamed')]
df = replace_pvalue(df)
df = replace_covariate_labels(df)