In [135]:
import numpy as np
import pandas as pd
import os

import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import pyampute
import pickle 
import time
import ast 
from scipy.stats import chi2
from scipy.stats import mstats
from scipy.stats.mstats import winsorize
from scipy import stats
from xgboost import XGBRegressor
from sklearn import tree
from pyampute.ampute import MultivariateAmputation
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from lifelines import CoxPHFitter, WeibullFitter, WeibullAFTFitter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tableone import TableOne 
from os.path import isfile, join
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from statsmodels.gam.tests.test_penalized import df_autos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
from scipy.spatial import distance
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from app_transition_dict import get_transition_dict, get_transition_code
from app_init import get_multi_state_covariates, get_multi_state_cov_quartiles
from app_init import replace_covariate_labels, replace_pvalue, get_variables_cox
import warnings 
# NOTE(review): this installs an 'ignore' filter for every warning category, so
# deprecation and chained-assignment warnings raised by the cells below stay hidden.
warnings.filterwarnings('ignore')

# Locations on the shared project drive; change `drive` to match the local mount letter.
drive = 'G'
main_path = f'{drive}:/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
data_path = f'{drive}:/Shared drives/CKD_Progression/data/'
docs_path = f'{drive}:/Shared drives/CKD_Progression/docs/'
save_path = f'{drive}:/Shared drives/CKD_Progression/save/'
resu_path = f'{drive}:/Shared drives/CKD_Progression/result/'

# Lookup tables stored in the docs folder.
covariates_path = f'{docs_path}covariates.csv'
removecols_path = f'{docs_path}remove_columns.csv'

def generate_df_continuous(df, variables, q = 3):
    """Bin continuous columns into quantile groups and append indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to discretise. It is copied, so the caller's
        frame is never modified.
    variables : iterable of (variable, prefix, column_name)
        ``variable`` is the source column, ``column_name`` receives the integer
        quantile code, and ``prefix`` names the indicator columns.
    q : int, default 3
        Number of quantile groups requested from ``pd.qcut``.

    Returns
    -------
    tuple
        ``(frame, bins_dict)``: the augmented copy, and a dict mapping each
        ``variable`` to the bin edges returned by ``pd.qcut``.
    """
    # Work on a copy so the caller's frame is never left partially modified
    # (a column assignment followed by concat would otherwise touch the input
    # for the first variable only).
    df = df.copy()
    bins_dict = {}
    for variable, prefix, column_name in variables:
        # labels=False yields integer bin codes; duplicates='drop' collapses
        # repeated edges, so fewer than q groups may come back.
        df[column_name], bins = pd.qcut(
            df[variable],
            q = q,
            labels = False,
            duplicates = 'drop',
            retbins = True)
        bins_dict[variable] = bins
        dummies = pd.get_dummies(df[column_name], prefix = prefix)
        df = pd.concat([df, dummies], axis = 1)
    return df, bins_dict

def generate_df_continuous_predefined(df, variables, get_columns = False):
    """Bin continuous columns with caller-supplied edges and labels.

    Each entry of ``variables`` is ``(variable, prefix, column_name, bins, labels)``.
    The labelled bin is written to ``df[column_name]`` using left-closed
    intervals (``right=False``). When ``get_columns`` is true, indicator
    columns named with ``prefix`` are appended as well.

    Returns the resulting frame; the category column is assigned directly on
    the frame that was passed in.
    """
    for spec in variables:
        source_col, dummy_prefix, target_col, edges, names = spec
        df[target_col] = pd.cut(df[source_col], bins = edges, labels = names, right = False)

        if not get_columns:
            continue

        indicator_cols = pd.get_dummies(df[target_col], prefix = dummy_prefix)
        df = pd.concat([df, indicator_cols], axis = 1)
    return df

def get_first_dates():
    """Read the four first-date files and return the unique ``ENC_HN`` values of each.

    Returns a tuple of lists in the order
    ``(heart_failure, hypertension, diabetes, atrialfb)``; the files are read
    from ``docs_path``.
    """
    def unique_ids(reader, filename):
        # Only the ENC_HN column is kept from each file.
        return reader(docs_path + filename)['ENC_HN'].unique().tolist()

    heart_failure = unique_ids(pd.read_excel, 'HF_FIRSTDATE_2010_2023.xlsx')
    hypertension = unique_ids(pd.read_excel, 'HTN_FIRSTDATE_2010_2023.xlsx')
    diabetes = unique_ids(pd.read_csv, 'DM_FIRSTDATE_2010_2023.csv')
    atrialfb = unique_ids(pd.read_csv, 'AF_FIRSTDATE_2010_2023.csv')
    return heart_failure, hypertension, diabetes, atrialfb

def merge_comorbidity(df, comorbidity, disease_code):
    """Add a 0/1 column marking rows whose ``ENC_HN`` is in ``comorbidity``.

    The column is named ``disease_code.upper()`` and is written onto ``df``
    itself, which is also returned.
    """
    has_condition = df['ENC_HN'].isin(comorbidity)
    df[disease_code.upper()] = has_condition.astype(int)
    return df

def load_dataset(version = '13', get_columns = False):
    """Load the multi-state long-format table and derive the analysis columns.

    Parameters
    ----------
    version : str, default '13'
        Suffix of the file ``multi_state_long_ver0<version>.csv`` read from
        ``save_path``.
    get_columns : bool, default False
        Forwarded to ``generate_df_continuous_predefined``; when true,
        indicator columns are appended for every binned variable.

    Returns
    -------
    tuple
        ``(covariates, order_covariates, long_df)``: the covariate collection
        from ``get_multi_state_cov_quartiles()``, the table read from
        ``cox_covariates.csv``, and the prepared long-format frame.
    """
    covariates, variables = get_multi_state_cov_quartiles(), get_variables_cox()
    order_covariates = pd.read_csv(docs_path + 'cox_covariates.csv')
    # NOTE(review): the hypertension ID list is loaded but never merged below;
    # confirm whether a hypertension flag is meant to be derived here.
    heart_failure, hypertension, diabetes, atrialfb = get_first_dates()

    long_df = pd.read_csv(save_path + f'multi_state_long_ver0{version}.csv')
    # Encode gender numerically: 'M' -> 1, 'F' -> 0.
    long_df['gender']  = long_df['gender'].replace('M', 1).replace('F', 0)
    long_df['pathway'] = long_df['fr'] + '_to_' + long_df['to']
    long_df = generate_df_continuous_predefined(long_df, variables, get_columns = get_columns)
    # Combined drug flags: the row-wise maximum of the component columns.
    long_df['statin']  = long_df[['statinhydro', 'statinlipo']].max(axis = 1)
    long_df['raas']    = long_df[['arb', 'acei']].max(axis = 1)
    # Only the statin components are dropped; 'arb' and 'acei' stay in the frame.
    long_df = long_df.drop(columns = ['statinhydro', 'statinlipo'])
    long_df = merge_comorbidity(long_df, heart_failure, 'hf')
    long_df = merge_comorbidity(long_df, diabetes,      'dm')
    long_df = merge_comorbidity(long_df, atrialfb,      'af')
    return covariates, order_covariates, long_df

def calculate_incidence_confidence_interval(data, 
                                            events_col, 
                                            person_years_col, 
                                            confidence=0.95, 
                                            max_limit=100,
                                            person_years_multiplier=2,
                                            per=1000):
    """Format normal-approximation confidence intervals for incidence rates.

    For each row the denominator is
    ``data[person_years_col] * person_years_multiplier``, the rate is
    ``events / denominator`` and the margin is ``z * sqrt(events) / denominator``
    (capped at ``max_limit``). Both bounds are multiplied by ``per`` and the
    lower bound is floored at zero.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per group.
    events_col, person_years_col : str
        Column names holding the event count and the person-time value.
    confidence : float, default 0.95
        Two-sided confidence level used to obtain ``z``.
    max_limit : float, default 100
        Upper cap applied to the margin before it is scaled by ``per``.
    person_years_multiplier : float, default 2
        Factor applied to the person-time column before use. The default keeps
        the historical doubling; pass 1 when the column already holds the
        intended denominator.
    per : float, default 1000
        Scale of the reported bounds (per ``per`` units of person-time).

    Returns
    -------
    list of str
        One ``"lower-upper"`` string per row, two decimals each. Rows whose
        denominator is not greater than zero yield ``"0.00-0.00"``.
    """
    z = norm.ppf((1 + confidence) / 2) 
    ci_combined = []
    # Iterate over the two columns directly; only these values are needed.
    for events, raw_person_years in zip(data[events_col], data[person_years_col]):
        person_years = raw_person_years * person_years_multiplier

        if person_years > 0:
            rate = events / person_years
            margin_of_error = z * np.sqrt(events) / person_years
            margin_of_error = min(margin_of_error, max_limit)
            lower_bound = (rate - margin_of_error) * per
            upper_bound = (rate + margin_of_error) * per
        else:
            # No usable person-time: report a degenerate 0-0 interval.
            lower_bound = 0
            upper_bound = 0

        # A rate cannot be negative, so clip the lower bound at zero.
        lower_bound = max(lower_bound, 0)
        ci_combined.append(f"{lower_bound:.2f}-{upper_bound:.2f}")
    
    return ci_combined

In [89]:
covariates, order_covariates, long_df = load_dataset(version = '13', get_columns = True)

# Covariates left out of this analysis. `.remove` raises if a name is absent,
# so a change in the upstream covariate list surfaces here.
for dropped in ('ANTI_PL', 'PHOS_BINDER', 'dpp4', 'glp1', 'HF', 'Gout', 'bb'):
    covariates.remove(dropped)

# The ordering table returned by load_dataset is replaced by the univariate
# display order, keeping only the join key and the sort position.
order_covariates = pd.read_csv(resu_path + 'univariate/univariate_order.csv')[['code', 'order']]

In [86]:
# Keep only rows whose HF flag is 0 and renumber the index from zero.
long_df = long_df.loc[long_df['HF'] == 0].reset_index(drop = True)

In [138]:
# NOTE(review): `pathways` is assigned in the following cell and nowhere earlier
# in the notebook as visible here, so on a fresh top-to-bottom run this display
# would presumably fail with NameError. Consider moving it below that cell.
pathways

['CKD3A_to_CKD3B',
 'CKD3A_to_CVD',
 'CKD3A_to_DEAD',
 'CVD_to_CKD3B',
 'CVD_to_CKD4',
 'CVD_to_CKD5A',
 'CVD_to_CKD5B',
 'CVD_to_DEAD',
 'CKD3B_to_CKD4',
 'CKD3B_to_CVD',
 'CKD3B_to_DEAD',
 'CKD4_to_CKD5A',
 'CKD4_to_CVD',
 'CKD4_to_DEAD',
 'CKD5A_to_CKD5B',
 'CKD5A_to_CVD',
 'CKD5A_to_DEAD',
 'CKD5B_to_CVD',
 'CKD5B_to_DEAD']

In [139]:
def get_baseline(outcome, characteristics):
    """Build a TableOne summary of ``characteristics`` grouped by ``status``.

    A copy of ``outcome`` is handed to TableOne with p-values, test names and
    row percentages switched on. The resulting ``tableone`` frame is returned
    with its index reset into ordinary columns.
    """
    table = TableOne(
        outcome.copy(),
        columns=characteristics,
        groupby=['status'],
        pval=True,
        htest_name=True,
        row_percent=True,
    )
    return table.tableone.reset_index()

# Pathway labels present in the data, minus the transitions listed below.
# Filtering (rather than list.remove) keeps the cell from failing when one of
# the excluded labels does not occur in the data.
excluded_pathways = ['CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B',
                     'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B']
pathways = [label for label in long_df['pathway'].unique().tolist()
            if label not in excluded_pathways]

# NOTE(review): the loop runs over a fixed list of CVD_* transitions, not over
# `pathways`; confirm that the other pathways are produced elsewhere.
for pathway in ['CVD_to_CKD3B', 'CVD_to_CKD4', 'CVD_to_CKD5A', 'CVD_to_CKD5B', 'CVD_to_DEAD']:
    transition_df = long_df[long_df['pathway'] == pathway]

    # NOTE(review): `baseline` is built and reshaped here but is not joined into
    # `results` or written out in this cell. Confirm whether it is still needed.
    baseline = get_baseline(transition_df, covariates)
    baseline.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in baseline.columns]
    baseline.columns = ['covariate', 'category', 'missing', 'overall', 'status0', 'status1', 'pvalue', 'test']
    baseline = baseline.drop(columns = ['missing', 'overall', 'test'])

    # Rows where the transition occurred; computed once per pathway.
    # NOTE(review): person-time below is summed over these event rows only, not
    # over all rows at risk. Verify this is the intended denominator.
    event_rows = transition_df[transition_df['status'] == 1]

    records = []
    for covariate in covariates:
        if covariate not in transition_df.columns:
            continue
        for category in transition_df[covariate].unique():
            category_data = event_rows[event_rows[covariate] == category]
            num_events = category_data['ENC_HN'].nunique() 
            # NOTE(review): `time` is doubled here and `person_years` is doubled
            # again before saving, so the saved column is 4x the summed `time`.
            # The `/ 2` on the incidence keeps the rate consistent with that
            # saved denominator. The unit behind these factors is not visible
            # here; confirm.
            person_years = category_data['time'].sum() * 2

            if person_years > 0:
                incidence = ((num_events / person_years) * 1000) / 2
            else:
                incidence = 0  

            records.append({'covariate': covariate, 
                            'category': category, 
                            'events': num_events,
                            'person_years': person_years,
                            'incidence': incidence})

    results = pd.DataFrame(records)
    # Missing-category rows never match the equality filter above; drop them.
    results = results[results['category'].notna()]
    # `code` is the join key into the display-order table.
    results['code'] = results['covariate'] + ' ' + results['category'].astype(str)
    results['95CI IR'] = calculate_incidence_confidence_interval(results, 'events', 'person_years')
    results = pd.merge(results, order_covariates, on='code', how='inner')
    results = results.sort_values('order', ascending=True)
    results['incidence'] = np.round(results['incidence'], 2)
    results['incidence'] = results['incidence'].astype(str) + ' (' + results['95CI IR'].astype(str) + ')'
    results = results.drop(columns=['code', '95CI IR'])
    results['person_years'] = results['person_years'] * 2
    results.to_csv(resu_path + f'univariate/incidence/{pathway}.csv', index = False)

In [140]:
# `results` is reassigned on every iteration of the loop in the previous cell,
# so this displays the table for the last pathway processed.
results

Unnamed: 0,covariate,category,events,person_years,incidence,order
1,gender,0,122,7381.610959,16.53 (13.59-19.46),1
0,gender,1,116,9501.89589,12.21 (9.99-14.43),2
17,bin_age,less60,25,2060.054795,12.14 (7.38-16.89),3
16,bin_age,geq60,213,14823.452055,14.37 (12.44-16.30),4
19,bin_bmi,normal,17,1856.876712,9.16 (4.80-13.51),5
20,bin_bmi,under,6,474.345205,12.65 (2.53-22.77),6
18,bin_bmi,over,215,14552.284932,14.77 (12.80-16.75),7
7,HT,0.0,24,1337.424658,17.94 (10.77-25.12),8
6,HT,1.0,214,15546.082192,13.77 (11.92-15.61),9
8,PVD,0.0,232,16069.084932,14.44 (12.58-16.30),10


In [141]:
## RESULTS

# Excel workbooks in the LR_test result folder, built from the shared
# `resu_path` setting so the drive letter is configured in one place.
folder_path = resu_path + 'univariate/LR_test/'

# Keep .xls/.xlsx files whose name mentions a state (CKD, CVD or DEAD) and skip
# '~$' files, the prefix Excel gives to lock files of open workbooks.
excel_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith(('.xls', '.xlsx'))
    and ('CKD' in file or 'CVD' in file or 'DEAD' in file)
    and not file.startswith('~$')
)

In [142]:
# One column of "HR (95CI)" text per workbook, named after the file stem.
df_list = []
for file in excel_files:
    path = os.path.splitext(os.path.basename(file))[0]
    # Skip the first data row; copy so the column assignments below act on an
    # independent frame rather than on a slice.
    df = pd.read_excel(file, sheet_name='Sheet1').iloc[1:].copy()
    # Presumably column 12 holds the hazard ratio and column 13 its interval text.
    # TODO confirm against the workbook layout.
    df['Unnamed: 12'] = pd.to_numeric(df['Unnamed: 12'], errors = 'coerce').round(2)
    df['HR (95CI)'] = df['Unnamed: 12'].astype(str) + ' ' + df['Unnamed: 13'].fillna('')
    df = df[['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'HR (95CI)']].copy()
    df.columns = ['description', 'variable', 'category', path]
    # 'nan ' is what a missing ratio plus an empty interval concatenates to.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df[path] = df[path].replace('nan ', np.nan)
    df_list.append(df)

# Side-by-side pathway columns; the label columns are taken once, from the first
# workbook. Rows are aligned on the index, so the workbooks are assumed to share
# the same row layout.
results = pd.concat(df_list, axis = 1).drop(columns = ['description', 'variable', 'category'])

df = df_list[0][['description', 'variable', 'category']]
main_results = pd.concat([df, results], axis = 1)


In [143]:
# Write the collated table under the shared result folder, using `resu_path`
# rather than a second hard-coded drive path.
# NOTE(review): the inputs were read from univariate/LR_test but the output goes
# to the multivariate folder. Confirm the destination.
main_results.to_excel(resu_path + 'multivariate/main_results.xlsx')