In [1]:
import numpy as np
import pandas as pd
import os
import ast
import datetime
from os.path import isfile, join
from tqdm import tqdm
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta
from openpyxl import load_workbook
from collections import Counter
import warnings 
warnings.filterwarnings('ignore')

os.chdir('M:/Shared drives/CKD_Progression/')

drive = 'M'
main_path = drive + ':/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
main_path = drive + ':/Shared drives/CKD-DW-Sam/CKD_COHORT_Jan2010_Mar2024_ver1/result/CKD_COHORT_Jan2010_Mar2024_v1.csv'
main_path = drive + ':/Shared drives/CKD_Progression/data/CKD_cohort_raw_2023_70325.parquet.gzip'
data_path = drive + ':/Shared drives/CKD_Progression/data/'
docs_path = drive + ':/Shared drives/CKD_Progression/docs/'
save_path = drive + ':/Shared drives/CKD_Progression/save/'
covariates_path = docs_path + 'covariates.csv'
removecols_path = docs_path + 'remove_columns.csv'

def get_patients_new():
    folder_path = docs_path + 'CKD_first_dates_70325_COHORT_2.csv'
    df = pd.read_csv(folder_path, encoding = 'utf-8')
    df = df.drop_duplicates()
    patient_list_flag = df['ENC_HN'].unique().tolist()
    return patient_list_flag

def study_period(df, column, start_date, end_date):
    df[column] = pd.to_datetime(df[column], errors = 'coerce')
    mask = (df[column] >= start_date) & (df[column] <= end_date)
    df = df.loc[mask]
    return df

def exclusion_icd():
    ''' 
    Enlists all ICD codes for the relevant cardiac diseases
    Goal: Remove patients with ICD before CDK3
    '''
    path = docs_path + 'diagnosis and procedure.xlsx'
    sheet_names = ['CVD', 'IHD', 'TIA', 'Hemorrhagic stroke', 'Ischemic stroke', 'Cerebrovascular']
    ICD_CODES_DICT = {}
    for diag in sheet_names:
        df_disease = pd.read_excel(path, sheet_name = diag)
        ICD_CODES_DICT[diag] = df_disease['ICD code'].to_list()
    return ICD_CODES_DICT

def remove_nonexistent(reference, function = 'sum'):
    originals = pd.read_excel(docs_path + 'ms_data_function_ver3.xlsx')
    originals_list = originals[originals['function'] == function]['variable'].tolist()
    reference_list = reference.columns
    return [elem for elem in originals_list if elem in reference_list]

def remove_outliers(df, docs_path = docs_path):
    file_path = docs_path + 'possible_range.xlsx'
    possible_range = pd.read_excel(file_path)
    check_range_columns = possible_range['variable'].tolist()
    upper_values = possible_range['max'].astype(float).tolist()
    lower_values = possible_range['min'].astype(float).tolist()

    for covariate, upper, lower in tqdm(zip(check_range_columns, upper_values, lower_values), 
                                        total = len(check_range_columns), desc = 'Removing outliers'):
        if covariate in df.columns:
            df[covariate] = pd.to_numeric(df[covariate], errors = 'coerce')
            outlier_mask = (df[covariate] < lower) | (df[covariate] > upper)
            df.loc[outlier_mask, covariate] = np.NaN
    return df

def carry_covariates():
    carry_df = pd.read_excel(docs_path + 'ms_data_function_ver3.xlsx')
    forward_list = carry_df[carry_df['carry'] == 'forward']['variable'].tolist()
    forback_list = carry_df[carry_df['carry'] == 'forward_backward']['variable'].tolist()
    lumping_list = carry_df[carry_df['carry'] == 'ignore']['variable'].tolist()
    fllzero_list = carry_df[carry_df['carry'] == 'fill_zero']['variable'].tolist()

    all_columns = da.columns.tolist()
    forward_list = list(set(all_columns).difference(forward_list))
    forback_list = list(set(all_columns).difference(forback_list))
    lumping_list = list(set(all_columns).difference(lumping_list))
    fllzero_list = list(set(all_columns).difference(fllzero_list))

    return forward_list, forback_list, lumping_list, fllzero_list

def carried_values(patient_data):
    forward_list, forback_list, lumping_list, fllzero_list = carry_covariates()
    patient_data[forward_list] = patient_data[forward_list].fillna(method = 'ffill')
    patient_data[forback_list] = patient_data[forback_list].fillna(method = 'ffill')
    patient_data[forback_list] = patient_data[forback_list].fillna(method = 'bfill')
    return patient_data

def determine_outcome(df):
    def update_columns(df, col_name, condition):
        df.loc[condition, col_name] = 1
        df[col_name] = df.groupby(['ENC_HN'])[col_name].ffill().fillna(0)
    condition_patterns = exclusion_icd()
    for condition, patterns in condition_patterns.items():
        pattern_regex = '|'.join(patterns)
        update_columns(df, condition, df['diagnosis_all'].astype(str).str.contains(pattern_regex, na = False))
    df['stroke'] = df[['TIA', 'Hemorrhagic stroke', 'Ischemic stroke']].max(axis = 1)
    df = df.drop(['TIA', 'Hemorrhagic stroke', 'Ischemic stroke'], axis = 1)
    return df

patients = get_patients_new() 
assert pd.Series(patients).nunique() == 70325

In [2]:
covariates = pd.read_csv(covariates_path)
ignore_covariates = covariates[covariates['ignore']    == 'IGNORE']['variable'].to_list()
finals_covariates = covariates[covariates['ignore']    != 'IGNORE']['variable'].to_list()
cleann_covariates = covariates[covariates['clean' ]    == 'CLEAN' ]['variable'].to_list()
explor_covariates = covariates[covariates['explore '] == 'EXPLORE']['variable'].to_list()

finals_covariates.remove('delta')
finals_covariates.remove('modulo')

remove = pd.read_csv(removecols_path).iloc[:, 0]
remove = remove.tolist()
for rem in remove:
    finals_covariates.remove(rem)

In [7]:
# data = pd.read_csv(main_path, encoding = 'utf-8')
# data = data[data['ENC_HN'].isin(patients)] 

In [None]:
data = pd.read_parquet(main_path)

In [173]:
df = data[finals_covariates]
df = df[[col for col in df.columns if col not in ['ENC_HN']][:0] + ['ENC_HN'] +\
        [col for col in df.columns if col not in ['ENC_HN']][0:]]
df = determine_outcome(df)
df['visit_date'] = pd.to_datetime(df['visit_date'], errors = 'coerce')
df = df.sort_values(['ENC_HN', 'visit_date'])
df['delta']  = df.groupby('ENC_HN')['visit_date'].transform(lambda x: (x - x.iloc[0]).dt.days)
df['modulo_030'] = np.floor_divide(df['delta'], 30)
df['modulo_180'] = np.floor_divide(df['delta'], 180)
df['modulo_365'] = np.floor_divide(df['delta'], 365.25)    

In [174]:
df = remove_outliers(df)

mask = (df['BMI'].isnull()) & (df['BW'].notna()) & (df['height'].notna())
df.loc[mask, 'BMI'] = np.divide(df['BW'], np.divide(df['height'], 100) ** 2)
mask = (df['BMI'].notna()) & (df['BW'].isnull()) & (df['height'].notna())
df.loc[mask, 'BW'] = np.multiply(np.power(np.divide(df['height'], 100), 2), df['BMI'])
mask = (df['BMI'].notna()) & (df['BW'].notna()) & (df['height'].isnull())
df.loc[mask, 'height'] = np.sqrt(np.divide(df['BW'], df['BMI'])) * 100

mask = (df['Lipid_HDL'].isnull()) & (df['Lipid_LDL'].notna()) & (df['Lipid_Cholesterol'].notna()) & (df['Lipid_Triglyceride'].notna())
df.loc[mask, 'Lipid_HDL'] = df['Lipid_Cholesterol'] - df['Lipid_LDL'] - np.divide(df['Lipid_Triglyceride'], 5)
mask = (df['Lipid_HDL'].notna()) & (df['Lipid_LDL'].isnull()) & (df['Lipid_Cholesterol'].notna()) & (df['Lipid_Triglyceride'].notna())
df.loc[mask, 'Lipid_LDL'] = df['Lipid_Cholesterol'] - df['Lipid_HDL'] - np.divide(df['Lipid_Triglyceride'], 5)

Removing outliers: 100%|██████████| 29/29 [00:00<00:00, 50.70it/s]


In [175]:
sum_list = remove_nonexistent(df, 'sum')
max_list = remove_nonexistent(df, 'max')
min_list = remove_nonexistent(df, 'min')
fst_list = remove_nonexistent(df, 'first')
lst_list = remove_nonexistent(df, 'last')
ave_list = remove_nonexistent(df, 'mean')

In [176]:
df[min_list] = df[min_list].apply(pd.to_numeric, errors = 'coerce')
df[max_list] = df[max_list].apply(pd.to_numeric, errors = 'coerce')

ds = df.groupby(['ENC_HN', 'modulo_365'])
da = df.groupby(['ENC_HN', 'modulo_365'])[ave_list].mean().reset_index()
da[min_list] = ds[min_list].agg('min').  reset_index()[min_list]
da[fst_list] = ds[fst_list].agg('first').reset_index()[fst_list]
da[lst_list] = ds[lst_list].agg('last'). reset_index()[lst_list]
da[max_list] = ds[max_list].agg('max').  reset_index()[max_list]

In [178]:
baseline_df = da[da['modulo_365'] < 1]
baseline_df = baseline_df[(baseline_df['CKD_stage'] == 'stage_3a')] 
baseline_df = baseline_df[(baseline_df['stroke'] == 0) & (baseline_df['CVD'] == 0) & (baseline_df['Cerebrovascular'] == 0) & (baseline_df['IHD'] == 0)] 

# 32579 patients with CKD3A
# 2614 patients with stroke 
# 4640 patients with IHD
# 314  patients with cerebrovascular
# 8465 patients with CVD
# Total patients removed: 8886 

cohort_patients = baseline_df['ENC_HN'].unique().tolist()
len(cohort_patients)

23693

In [140]:
qoc_cohort = da[da['ENC_HN'].isin(cohort_patients)]
qoc_cohort = qoc_cohort.reset_index(drop = True)
qoc_cohort = qoc_cohort[qoc_cohort['modulo_365'] < 1]

In [155]:
all_columns = qoc_cohort.columns.tolist()
all_missing = pd.DataFrame(qoc_cohort.isnull().sum()).reset_index()
all_missing.columns = ['columns', 'missing']
all_missing['miss_rate'] = all_missing['missing'] / qoc_cohort.shape[0]

In [154]:
execute = True
if execute:
    qoc_cohort .to_csv(save_path + 'qoc_cohort_ver001.csv',  index = False)
    all_missing.to_csv(save_path + 'qoc_mis365_ver001.csv',  index = False)