In [2]:
import numpy as np
import pandas as pd
import os
import ast
import datetime
from os.path import isfile, join
from tqdm import tqdm
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta
from openpyxl import load_workbook
from collections import Counter
import warnings 
warnings.filterwarnings('ignore')

# Drive letter under which the shared drives are mounted; every path below derives from it.
drive = 'J'

# Work from the project root (same location as before, now built from `drive`).
os.chdir(drive + ':/Shared drives/CKD_Progression/')

# Cohort extract loaded by this notebook. Earlier extracts that used to be assigned to
# main_path and were immediately overwritten (kept here for reference only):
#   drive + ':/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
#   drive + ':/Shared drives/CKD-DW-Sam/CKD_COHORT_Jan2010_Mar2024_ver1/result/CKD_COHORT_Jan2010_Mar2024_v1.csv'
main_path = drive + ':/Shared drives/CKD_Progression/data/CKD_cohort_raw_2023_70325.parquet.gzip'

# Project folders.
data_path = drive + ':/Shared drives/CKD_Progression/data/'
docs_path = drive + ':/Shared drives/CKD_Progression/docs/'
save_path = drive + ':/Shared drives/CKD_Progression/save/'

# Covariate catalogue and the list of columns to drop from it.
covariates_path = docs_path + 'covariates.csv'
removecols_path = docs_path + 'remove_columns.csv'

def get_patients_new():
    """Return the distinct ``ENC_HN`` values found in the cohort first-dates CSV.

    Reads ``CKD_first_dates_70325_COHORT_2.csv`` from ``docs_path``, drops fully
    duplicated rows and returns the unique ``ENC_HN`` values as a plain list.
    """
    cohort_csv = docs_path + 'CKD_first_dates_70325_COHORT_2.csv'
    cohort = pd.read_csv(cohort_csv, encoding = 'utf-8').drop_duplicates()
    return cohort['ENC_HN'].unique().tolist()

def study_period(df, column, start_date, end_date):
    """Keep the rows whose ``column`` date lies within ``[start_date, end_date]``.

    ``column`` is converted to datetime on ``df`` itself (values that cannot be
    parsed become ``NaT``), so the caller's frame is modified as a side effect.
    Both bounds are inclusive; ``NaT`` rows fail both comparisons and are dropped.

    Returns the filtered selection of ``df``.
    """
    df[column] = pd.to_datetime(df[column], errors = 'coerce')
    dates = df[column]
    in_window = dates.ge(start_date) & dates.le(end_date)
    return df.loc[in_window]

def exclusion_icd():
    '''
    Collect the ICD codes listed for each disease group.

    One sheet per group is read from 'diagnosis and procedure.xlsx' under
    docs_path; the returned dict maps each sheet name to the list of values
    found in that sheet's 'ICD code' column.
    Goal (from the original note): remove patients with one of these ICD codes
    before CKD3 -- presumably CKD stage 3; confirm.
    '''
    workbook = docs_path + 'diagnosis and procedure.xlsx'
    disease_sheets = ('CVD', 'IHD', 'TIA', 'Hemorrhagic stroke', 'Ischemic stroke', 'Cerebrovascular')
    return {
        sheet: pd.read_excel(workbook, sheet_name = sheet)['ICD code'].to_list()
        for sheet in disease_sheets
    }

def remove_nonexistent(reference, function = 'sum'):
    """Return the variables assigned to ``function`` that are columns of ``reference``.

    The assignment comes from 'ms_data_function_ver3.xlsx' under ``docs_path``:
    rows whose 'function' column equals ``function`` contribute their 'variable'
    value. Variables that are not column labels of ``reference`` are left out;
    the spreadsheet order is preserved.
    """
    spec = pd.read_excel(docs_path + 'ms_data_function_ver3.xlsx')
    wanted = spec.loc[spec['function'] == function, 'variable'].tolist()
    available = reference.columns
    return [name for name in wanted if name in available]

def remove_outliers(df, docs_path = docs_path):
    """Blank out values that fall outside each covariate's plausible range.

    The ranges are read from 'possible_range.xlsx' under ``docs_path`` (columns
    'variable', 'min', 'max'). For every listed variable that is a column of
    ``df``, the column is coerced to numeric (unparseable values become NaN) and
    values below 'min' or above 'max' are replaced with NaN.

    ``df`` is modified in place and also returned.
    """
    possible_range = pd.read_excel(docs_path + 'possible_range.xlsx')
    check_range_columns = possible_range['variable'].tolist()
    upper_values = possible_range['max'].astype(float).tolist()
    lower_values = possible_range['min'].astype(float).tolist()

    bounds = zip(check_range_columns, upper_values, lower_values)
    for covariate, upper, lower in tqdm(bounds, total = len(check_range_columns),
                                        desc = 'Removing outliers'):
        if covariate not in df.columns:
            continue
        df[covariate] = pd.to_numeric(df[covariate], errors = 'coerce')
        outlier_mask = (df[covariate] < lower) | (df[covariate] > upper)
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        df.loc[outlier_mask, covariate] = np.nan
    return df

def carry_covariates(reference = None):
    """Return the variables to carry, grouped by the strategy in the 'carry' column.

    Reads 'ms_data_function_ver3.xlsx' under ``docs_path`` and, for each of the
    strategies 'forward', 'forward_backward', 'ignore' and 'fill_zero', collects
    the 'variable' values flagged with that strategy, keeping only those that are
    columns of ``reference``.

    Parameters
    ----------
    reference : DataFrame, optional
        Frame whose columns decide which variables exist. Defaults to the
        notebook-level aggregated frame ``da``, which must already be defined
        when the function is called.

    Returns
    -------
    tuple of four lists
        ``(forward_list, forback_list, lumping_list, fllzero_list)``, each in
        spreadsheet order without duplicates.
    """
    carry_df = pd.read_excel(docs_path + 'ms_data_function_ver3.xlsx')
    if reference is None:
        reference = da
    available = set(reference.columns)

    def existing(strategy):
        # Keep the flagged variables that exist in the frame. The previous
        # set-difference returned every column that was NOT flagged, which
        # inverted the selection and produced an arbitrary column order.
        flagged = carry_df.loc[carry_df['carry'] == strategy, 'variable']
        return list(dict.fromkeys(name for name in flagged if name in available))

    forward_list = existing('forward')
    forback_list = existing('forward_backward')
    lumping_list = existing('ignore')
    fllzero_list = existing('fill_zero')

    return forward_list, forback_list, lumping_list, fllzero_list

def carried_values(patient_data):
    """Fill gaps in ``patient_data`` according to each variable's carry strategy.

    Columns in the 'forward' list are forward-filled; columns in the
    'forward_backward' list are forward-filled and then backward-filled.
    Filling follows the row order of ``patient_data`` and is not grouped here,
    so the frame is presumably expected to hold a single patient -- confirm
    against the caller.

    ``patient_data`` is modified in place and also returned.
    """
    # NOTE(review): the 'ignore' and 'fill_zero' lists are returned by
    # carry_covariates() but not applied here -- confirm that is intended.
    forward_list, forback_list, _, _ = carry_covariates()
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling
    patient_data[forward_list] = patient_data[forward_list].ffill()
    patient_data[forback_list] = patient_data[forback_list].ffill()
    patient_data[forback_list] = patient_data[forback_list].bfill()
    return patient_data

def determine_outcome(df):
    """Add one indicator column per disease group and a combined 'stroke' column.

    For every group returned by exclusion_icd(), the group's codes are joined
    with '|' into a regular expression and matched against the string form of
    'icd'. Matching rows are set to 1, the value is forward-filled within each
    'ENC_HN' in the current row order, and the remaining gaps become 0.
    'stroke' is the row-wise maximum of 'TIA', 'Hemorrhagic stroke' and
    'Ischemic stroke'; those three columns are then dropped from the result.

    The indicator columns are written onto ``df`` itself; the returned frame is
    the one produced by the final drop.
    """
    icd_text = df['icd'].astype(str)
    for disease, codes in exclusion_icd().items():
        has_code = icd_text.str.contains('|'.join(codes), na = False)
        df.loc[has_code, disease] = 1
        # once flagged, later rows of the same patient stay flagged
        df[disease] = df.groupby(['ENC_HN'])[disease].ffill().fillna(0)

    stroke_parts = ['TIA', 'Hemorrhagic stroke', 'Ischemic stroke']
    df['stroke'] = df[stroke_parts].max(axis = 1)
    return df.drop(stroke_parts, axis = 1)

# Expected number of distinct patients (the same 70325 appears in the cohort file names).
EXPECTED_N_PATIENTS = 70325

patients = get_patients_new()
n_patients = pd.Series(patients).nunique()
# Fail loudly, and say what was found, if the first-dates file does not match the cohort size.
assert n_patients == EXPECTED_N_PATIENTS, (
    'expected %d unique patients, found %d' % (EXPECTED_N_PATIENTS, n_patients)
)

In [3]:
# Covariate catalogue: one row per variable with flag columns 'ignore', 'clean' and 'explore '.
covariates = pd.read_csv(covariates_path)

# Variable names split by flag. The 'explore ' label carries a trailing space as written here.
ignore_covariates = covariates.loc[covariates['ignore'] == 'IGNORE', 'variable'].tolist()
finals_covariates = covariates.loc[covariates['ignore'] != 'IGNORE', 'variable'].tolist()
cleann_covariates = covariates.loc[covariates['clean'] == 'CLEAN', 'variable'].tolist()
explor_covariates = covariates.loc[covariates['explore '] == 'EXPLORE', 'variable'].tolist()

# 'delta' and 'modulo' are taken out of the final list (list.remove raises ValueError if absent).
finals_covariates.remove('delta')
finals_covariates.remove('modulo')

# Also take out every name listed in the first column of the remove-columns file.
remove = pd.read_csv(removecols_path).iloc[:, 0].tolist()
for rem in remove:
    finals_covariates.remove(rem)

In [4]:
# Load the cohort extract at main_path (a parquet file) into a DataFrame.
data = pd.read_parquet(main_path)

In [33]:
def safe_literal_eval(s):
    """Parse ``s`` as a Python literal, returning ``s`` unchanged when it is not one.

    ``ast.literal_eval`` signals malformed input with several exception types
    (ValueError, TypeError, SyntaxError, MemoryError, RecursionError); all of
    them are treated as "not a literal" so that arbitrary cell values can be
    passed through without raising.
    """
    try:
        return ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return s
# Export the rows that have a non-empty UA_protein_dipstick value.
df = data[['ENC_HN', 'visit_date', 'UA_protein_dipstick']]
df = df[df['UA_protein_dipstick'].apply(lambda x: len(x) > 0)]
# DataFrame.to_csv returns None when given a path, so it must not be chained into the
# assignment above; writing in a separate statement keeps `df` as the filtered frame.
df.to_csv(drive + ':/Shared drives/CKD_QOC/data/protein_dipstick.csv', index = False)

In [7]:
df = data.copy()

# Put each patient's visits in chronological order BEFORE deriving outcomes:
# determine_outcome forward-fills the diagnosis flags in row order, so on an unsorted frame
# a flag would be carried onto visits that actually precede the diagnosis.
df['visit_date'] = pd.to_datetime(df['visit_date'], errors = 'coerce')
df = df.sort_values(['ENC_HN', 'visit_date'])
df = determine_outcome(df)

# Days elapsed since the patient's first (earliest) visit.
df['delta']  = df.groupby('ENC_HN')['visit_date'].transform(lambda x: (x - x.iloc[0]).dt.days)

# Follow-up period index: floor(delta / period length), i.e. a bin number, not a remainder.
df['modulo_030'] = np.floor_divide(df['delta'], 30)
df['modulo_180'] = np.floor_divide(df['delta'], 180)
df['modulo_365'] = np.floor_divide(df['delta'], 365.25)

In [9]:
# Variables present in df, grouped by the aggregation function assigned to them in the
# function spreadsheet (one lookup per function, evaluated in the order listed).
sum_list, max_list, min_list, fst_list, lst_list, ave_list = (
    remove_nonexistent(df, agg_name)
    for agg_name in ('sum', 'max', 'min', 'first', 'last', 'mean')
)

In [10]:
# min/max need numeric input: coerce those columns, turning unparseable values into NaN.
df[min_list] = df[min_list].apply(lambda col: pd.to_numeric(col, errors = 'coerce'))
df[max_list] = df[max_list].apply(lambda col: pd.to_numeric(col, errors = 'coerce'))

# One row per patient and 365.25-day period; each variable list gets its own aggregation.
# NOTE(review): sum_list is not aggregated in this cell -- confirm that is intended.
ds = df.groupby(['ENC_HN', 'modulo_365'])
da = ds[ave_list].mean().reset_index()
da[min_list] = ds[min_list].min().reset_index()[min_list]
da[fst_list] = ds[fst_list].first().reset_index()[fst_list]
da[lst_list] = ds[lst_list].last().reset_index()[lst_list]
da[max_list] = ds[max_list].max().reset_index()[max_list]

In [36]:
# baseline_df = da[da['modulo_365'] <= 1]
# baseline_df = baseline_df[(baseline_df['CKD_stage'] == 'stage_3a')] 
# baseline_df = baseline_df[(baseline_df['stroke'] == 0) & (baseline_df['CVD'] == 0) & (baseline_df['Cerebrovascular'] == 0) & (baseline_df['IHD'] == 0)] 

In [39]:
# Write the patient id, visit date and glucose columns of the aggregated frame to the save folder.
da.to_csv(save_path + 'glucose.csv', columns = ['ENC_HN', 'visit_date', 'Chem_glucose'], index = False)