In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import funcs.utils as utils
import funcs.plotting as plot
import funcs.amyloid as amyloid

In [2]:
# Directory holding the processed tables this notebook reads (relative to the working directory)
PROCESSED_DIR = "data/processed"

## 1. Load Data
---

In [3]:
# Raw data: read the processed table and rename its columns via amyloid.ddict_unclean
raw_path = os.path.join(PROCESSED_DIR, "AL_with_ccp_03.tsv")
data_df = pd.read_csv(raw_path, sep="\t", index_col=0).rename(columns=amyloid.ddict_unclean)

# Fix dates: parse every column listed in amyloid.dates, then place the parsed
# date columns in front of the remaining (untouched) columns
date_cols = list(amyloid.dates)
parsed_dates = pd.concat(
    [pd.to_datetime(data_df[col], format="mixed") for col in date_cols],
    axis=1,
    keys=date_cols,
)
data_df = parsed_dates.join(data_df.drop(columns=date_cols))

In [4]:
# Treatments processed: load the table, parse its start dates, and attach each
# row's admission date from data_df (joined on the shared index)
treatments_path = os.path.join(PROCESSED_DIR, "treatments_processed.tsv")
treatments_df = (
    pd.read_csv(treatments_path, sep="\t", index_col=0)
    .assign(**{"Start date": lambda frame: pd.to_datetime(frame["Start date"], format="mixed")})
    .join(data_df["Date of admission to center"])
)

## 2. ESKD Treatment & Timing
---

In [5]:
# Load RRT dates
# NOTE(review): the raw-data directory is defined here because no RAW_DIR constant
# is visible in the configuration cell; consider moving it there.
RAW_DIR = "data/raw"

# Raw export file -> modality label (both peritoneal-dialysis exports share one label)
rrt_modalities = {
    "PeritonealDialysis.csv": "Peritoneal Dialysis",
    "Hemodialysis.csv": "Hemodialysis",
    "KidneyTransplant.csv": "KidneyTransplant",
    "AddnlPD.csv": "Peritoneal Dialysis",
}
rrt_files = list(rrt_modalities)

# Add all modalities: one frame per export, tagged with its modality, stacked together
rrt_df = pd.concat(
    [
        pd.read_csv(os.path.join(RAW_DIR, fname), sep=",", index_col=0).assign(modality=label)
        for fname, label in rrt_modalities.items()
    ]
)

# Fix dates mis-entered in database
# NOTE(review): .loc on an index label overwrites every row carrying that label and
# silently appends a new row if the label is absent — confirm both IDs exist once.
rrt_df.loc[9423, "Date performed"] = "7/27/1994"
rrt_df.loc[9665, "Date performed"] = "4/5/2003"

# Save file
rrt_df.to_csv(os.path.join(PROCESSED_DIR, "rrt_dates_all.tsv"), sep="\t")

# Filter for duplicate entries (ie patient has multiple RRT options, we only want first one).
# "Date performed" is still text at this point, so sort on a parsed datetime key:
# sorting the raw strings would order them lexicographically ("10/1/2000" < "7/27/1994")
# and could keep a later event as the "first" one. The column itself is left as text.
# A stable sort makes ties resolve deterministically in file-load order.
rrt_filt_df = (
    rrt_df.dropna(subset=["Date performed"])
    .sort_values(
        "Date performed",
        key=lambda dates: pd.to_datetime(dates, format="mixed"),
        kind="stable",
    )
    .reset_index()
    .drop_duplicates(subset="Code ID")
    .set_index("Code ID")
)

# Re-attach patients that only have undated entries, keeping a single row per
# Code ID so that joining on the index later cannot multiply patient rows.
undated_ids = np.setdiff1d(rrt_df.index, rrt_filt_df.index)
undated_df = rrt_df.loc[undated_ids, :]
undated_df = undated_df[~undated_df.index.duplicated(keep="first")]
rrt_filt_df = pd.concat([rrt_filt_df, undated_df])

In [6]:
# Create ESKD dataframe: one flag row per Code ID taken from the treatment table
eskd_flags = (
    treatments_df.loc[treatments_df["treatment_eskd"]]
    .reset_index()[["Code ID", "treatment_eskd"]]
    .drop_duplicates(subset="Code ID")
    .set_index("Code ID")
)
eskd_df = data_df.join(eskd_flags)[["treatment_eskd"]]

# Encode the flag as 1/0 (anything that is not True, including missing, becomes 0)
eskd_df["treatment_eskd"] = eskd_df["treatment_eskd"].eq(True).astype("int64")

# Join ESKD dataframe to RRT timing and give the RRT columns descriptive names
rrt_column_names = {
    "Date performed": "Date of RRT Start",
    "End date": "Date of RRT End",
    "modality": "RRT Type",
}
eskd_df = (
    eskd_df.join(rrt_filt_df)
    .drop(columns=["Date of diagnosis"])
    .rename(columns=rrt_column_names)
)

# Put the follow-up columns from data_df in front of the ESKD/RRT columns
followup_cols = [
    "Date of admission to center",
    "Date of diagnosis",
    "Date of death",
    "Date of last visit",
    "time",
    "status",
    "time_from_admission",
]
eskd_df = data_df[followup_cols].join(eskd_df)

# Parse the RRT start/end dates
for rrt_date_col in ("Date of RRT Start", "Date of RRT End"):
    eskd_df[rrt_date_col] = pd.to_datetime(eskd_df[rrt_date_col], format="mixed")

# Get time to ESKD: first with the helper's default start, then from admission
eskd_df["time_to_eskd"] = eskd_df.apply(utils.get_time_eskd, axis=1)
eskd_df["time_to_eskd_from_admission"] = eskd_df.apply(
    utils.get_time_eskd, axis=1, start_time="Date of admission to center"
)

In [7]:
# Inspect rows with treatment_eskd == 0 whose time from admission is negative
eskd_df.query("time_to_eskd_from_admission < 0 and treatment_eskd == 0")

Unnamed: 0_level_0,Date of admission to center,Date of diagnosis,Date of death,Date of last visit,time,status,time_from_admission,treatment_eskd,Date of RRT Start,Date of RRT End,Comment,RRT Type,time_to_eskd,time_to_eskd_from_admission
Code ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013168,2013-10-09,2013-07-24,NaT,2013-10-08,0.208077,0,-0.002738,0,NaT,NaT,,,0.208077,-0.002738
2014033,2014-02-25,2012-09-01,NaT,2014-02-24,1.481177,0,-0.002738,0,NaT,NaT,,,1.481177,-0.002738
2017108,2017-06-30,2017-01-13,NaT,2017-06-26,0.449008,0,-0.010951,0,NaT,NaT,,,0.449008,-0.010951


In [8]:
# Competing risk: add the per-row event indicator first, then the matching time
# measured from admission (assign evaluates these in order, so the second
# callable sees the CR_event column just like the original sequential code)
eskd_df = eskd_df.assign(
    CR_event=lambda frame: frame.apply(utils.get_cr_event, axis=1),
    CR_time=lambda frame: frame.apply(
        utils.get_cr_time, axis=1, start_time="Date of admission to center"
    ),
)

In [18]:
# Rows with treatment_eskd == 0, with eGFR attached from data_df
df = eskd_df.join(data_df["eGFR"]).query("treatment_eskd == 0")

# Low-eGFR subset ordered by time to ESKD from admission
# NOTE(review): 15 is presumably the kidney-failure eGFR cut-off — confirm and name it
df.loc[df["eGFR"] < 15].sort_values("time_to_eskd_from_admission")

Unnamed: 0_level_0,Date of admission to center,Date of diagnosis,Date of death,Date of last visit,time,status,time_from_admission,treatment_eskd,Date of RRT Start,Date of RRT End,Comment,RRT Type,time_to_eskd,time_to_eskd_from_admission,CR_event,CR_time,eGFR
Code ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013168,2013-10-09,2013-07-24,NaT,2013-10-08,0.208077,0,-0.002738,0,NaT,NaT,,,0.208077,-0.002738,0,-0.002738,5.711518
2023168,2023-10-02,2023-10-02,NaT,2023-10-02,0.000000,0,0.000000,0,NaT,NaT,,,0.0,0.0,0,0.000000,14.428905
2014190,2014-11-10,2011-07-22,NaT,2014-11-10,3.304586,0,0.000000,0,NaT,NaT,,,3.304586,0.0,0,0.000000,14.735230
2017105,2017-06-26,2016-05-29,NaT,2017-06-26,1.075975,0,0.000000,0,NaT,NaT,,,1.075975,0.0,0,0.000000,11.444409
2011151,2011-09-06,2011-08-23,NaT,2011-09-06,0.038330,0,0.000000,0,NaT,NaT,,,0.03833,0.0,0,0.000000,11.811013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99101,1999-09-13,1998-11-25,2006-01-05,1999-09-13,7.112936,1,6.313484,0,NaT,NaT,,,7.112936,6.313484,2,6.313484,7.460452
2016003,2016-01-04,2015-07-21,NaT,2022-11-28,7.356605,0,6.899384,0,NaT,NaT,,,7.356605,6.899384,0,6.899384,14.277386
2011128,2011-08-01,2009-05-01,2018-07-18,2011-08-01,9.212868,1,6.962355,0,NaT,NaT,,,9.212868,6.962355,2,6.962355,8.997445
2009015,2009-02-02,2008-06-16,2018-01-29,2009-02-02,9.620808,1,8.988364,0,NaT,NaT,,,9.620808,8.988364,2,8.988364,11.694695


In [10]:
# Export is currently disabled; uncomment the line below to write the ESKD table to disk.
#eskd_df.to_csv("data/processed/treatment_eskd.tsv", sep="\t")