In [12]:
import pandas as pd

# Raw lab results, stored as a tab-separated file two directories above the notebook.
df1 = pd.read_csv(
    "../../Daisy_lab_combined.tsv",
    sep="\t",
)

### Extracting Relevant Values and Creating the Pivot DataFrame

In [13]:
# Lab items to keep; every other value of O_AANVR_UITSLAG_ITEM_LANG is filtered out.
items = [
    'Kalium',
    'Leucocyten',
    'ALAT (GPT)',
    'ASAT (GOT)',
    'Fosfaat anorganisch',
    'Magnesium',
    'Glucose (n.n.)',
]

# Boolean mask: True for rows whose item name is one of the selected items.
item_mask = df1['O_AANVR_UITSLAG_ITEM_LANG'].isin(items)
df_filter = df1.loc[item_mask]

In [14]:
# Turn the filtered long-format lab results into a wide table with one row per
# (patient, date, time) and one column per lab item, then keep only patients
# that have enough rows in that table.

# Minimum number of pivoted rows a patient must have to be kept.
MIN_ROWS_PER_PATIENT = 3

# Work on an explicit copy so the column assignments below do not write into a
# slice of df1 (avoids the SettingWithCopyWarning).
df_new = df_filter.copy()

# Step 1: parse the measurement timestamp
df_new['DT_BEPALING'] = pd.to_datetime(df_new['DT_BEPALING'])

# Step 2: split the timestamp into separate DATE and TIME columns
df_new['DATE'] = df_new['DT_BEPALING'].dt.date
df_new['TIME'] = df_new['DT_BEPALING'].dt.time

# Drop the columns that are not needed downstream (the raw timestamp is
# redundant now that DATE and TIME exist).
df_drop = df_new.drop(
    columns=[
        'Unnamed: 0',
        'AANVRAAG_NUMMER',
        'UITSLAGREGEL',
        'STATUS_AANVRAAG',
        'O_STATUS_UITSLAG',
        'UITSLAG_CONCLUSIE',
        'UITSLAG_TEKST_LAB',
        'DT_BEPALING',
    ]
)

# Reorder columns by position and give them English names.
# rename() returns a new frame here instead of using inplace=True: renaming an
# .iloc slice in place can itself raise the SettingWithCopyWarning.
# NOTE(review): the positional order assumes df_drop has exactly seven columns
# with DATE and TIME last (positions 5 and 6) - TODO confirm against the file.
df_reordered = df_drop.iloc[:, [0, 5, 6, 1, 2, 4, 3]].rename(
    columns={
        'SEQ_ZPAT_PATIENT': 'PATIENT_ID',
        'O_AANVR_UITSLAG_ITEM_LANG': 'VALUE_TYPE',
        'UITSLAG_WAARDE': 'VALUE_RESULT',
        'NORMAALWAARDE': 'NORMAL_RANGE',
    }
)

# Step 3: pivot with VALUE_TYPE as columns and VALUE_RESULT as values.
# aggfunc='first' means that when the same item occurs more than once for one
# (PATIENT_ID, DATE, TIME), only the first non-missing result is kept.
df_pivot = df_reordered.pivot_table(
    index=['PATIENT_ID', 'DATE', 'TIME'],
    columns='VALUE_TYPE',
    values='VALUE_RESULT',
    aggfunc='first',
)

# Step 4: move PATIENT_ID / DATE / TIME from the index back into regular columns
df_pivot = df_pivot.reset_index()

len_before = len(df_pivot)

# Number of pivoted rows per patient, aligned row-by-row with df_pivot so it can
# be used directly as a filter without adding a temporary column.
rows_per_patient = df_pivot.groupby('PATIENT_ID')['PATIENT_ID'].transform('count')

# Keep only patients with at least MIN_ROWS_PER_PATIENT rows
df_pivot = df_pivot[rows_per_patient >= MIN_ROWS_PER_PATIENT]

len_after = len(df_pivot)

print(f"{len_before-len_after} rows were dropped")

# Display the result
df_pivot.head(20)

135 rows were dropped


VALUE_TYPE,PATIENT_ID,DATE,TIME,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
1,17313559,2020-01-29,08:00:00,26,25,1.22,3.8,4.0,3.6,0.73
2,17313559,2020-01-30,08:10:00,25,23,1.11,4.1,4.3,4.1,0.73
3,17313559,2020-02-03,08:05:00,32,20,1.37,4.3,4.3,5.0,0.7
4,17313559,2020-02-05,07:55:00,26,17,1.25,4.4,4.2,4.4,0.7
5,17313559,2020-02-06,08:05:00,24,17,1.23,4.3,4.6,4.5,0.7
6,17313559,2020-02-10,07:30:00,17,16,1.18,4.3,4.2,4.6,0.65
7,17313559,2020-02-13,07:45:00,18,16,1.2,3.8,3.6,4.6,0.64
8,17313559,2020-02-17,08:05:00,15,16,1.4,4.4,4.0,5.9,0.65
9,17313559,2020-02-20,07:55:00,16,16,1.28,3.8,4.2,5.7,0.68
10,17313559,2020-02-24,07:50:00,15,18,1.36,3.2,3.8,4.4,0.67


### Combining the DataFrames

In [15]:
# Load the three anonymized, tab-separated datasets in one pass:
# lab results -> df2, vitals -> df_vitals, combined intake data -> df_bmi.
df2, df_vitals, df_bmi = (
    pd.read_csv(dataset_path, sep="\t")
    for dataset_path in (
        "../../annonymizedDatasets/maskedDAIsy_LabCombined.csv",
        "../../annonymizedDatasets/maskedDAIsy_Vitals.csv",
        "../../annonymizedDatasets/maskedDAIsy_AllDatasetsCombinedWoRepIntakes_v1.tsv",
    )
)

In [16]:
# Lab items to keep from the anonymized lab results.
items = [
    'Kalium',
    'Leucocyten',
    'ALAT (GPT)',
    'ASAT (GOT)',
    'Fosfaat anorganisch',
    'Magnesium',
    'Glucose (n.n.)',
]

# Keep only the rows whose O_ITEM is one of the selected items.
df_items = df2.loc[df2['O_ITEM'].isin(items)]

# Preview the first ten matching rows
df_items.head(n=10)

Unnamed: 0,pid,intid,O_ITEM,seq_num-lab,STATUS_AANVRAAG,O_STATUS_UITSLAG,p_DATE_BEPALING,UITSLAG_WAARDE,NORMAALWAARDE
0,4,1315,ALAT (GPT),1,RB,Gefiatteerd,2142-02-21,39.0,0 - 34
1,4,1315,ASAT (GOT),1,RB,Gefiatteerd,2142-02-21,17.0,0 - 31
4,4,1315,Fosfaat anorganisch,1,RB,Gefiatteerd,2142-02-21,1.29,0.78 - 1.42
5,4,1315,Kalium,1,RB,Gefiatteerd,2142-02-21,4.5,3.5 - 5.3
7,4,1315,Magnesium,1,RB,Gefiatteerd,2142-02-21,0.91,0.70 - 0.91
11,10,934,ALAT (GPT),1,RB,Gefiatteerd,2140-01-29,26.0,0 - 34
12,10,934,ASAT (GOT),1,RB,Gefiatteerd,2140-01-29,25.0,0 - 31
18,10,934,Fosfaat anorganisch,1,RB,Gefiatteerd,2140-01-29,1.22,0.78 - 1.42
19,10,934,Glucose (n.n.),1,RB,Gefiatteerd,2140-01-29,3.8,4.0 - 7.8
21,10,934,Kalium,1,RB,Gefiatteerd,2140-01-29,4.0,3.5 - 5.3


In [17]:
# Tidy the filtered anonymized lab results into a long table with English
# column names: one row per (patient, intake, date, lab item).

# Work on an explicit copy so the assignment below does not write into a slice
# of df2 (avoids the SettingWithCopyWarning).
df_new = df_items.copy()

# Step 1: parse the measurement date
df_new['p_DATE_BEPALING'] = pd.to_datetime(df_new['p_DATE_BEPALING'])

# Drop the columns that are not needed downstream
df_drop = df_new.drop(columns=['seq_num-lab', 'STATUS_AANVRAAG', 'O_STATUS_UITSLAG'])

# Put the columns in their final order by name rather than by position, so a
# changed column layout in the input fails loudly (KeyError) instead of
# silently shuffling the wrong columns. rename() returns a new frame; doing it
# with inplace=True on a slice can raise the SettingWithCopyWarning.
df_reordered = df_drop[
    ['pid', 'intid', 'p_DATE_BEPALING', 'O_ITEM', 'UITSLAG_WAARDE', 'NORMAALWAARDE']
].rename(
    columns={
        'pid': 'PATIENT_ID',
        'intid': 'INTAKE_ID',
        'O_ITEM': 'CHEMICAL_VALUE',
        'UITSLAG_WAARDE': 'VALUE_RESULT',
        'NORMAALWAARDE': 'NORMAL_RANGE',
        'p_DATE_BEPALING': 'DATE',
    }
)

# Display only: df_reordered itself keeps its original row order.
df_reordered.sort_values(by='PATIENT_ID')

Unnamed: 0,PATIENT_ID,INTAKE_ID,DATE,CHEMICAL_VALUE,VALUE_RESULT,NORMAL_RANGE
0,4,1315,2142-02-21,ALAT (GPT),39,0 - 34
1,4,1315,2142-02-21,ASAT (GOT),17,0 - 31
4,4,1315,2142-02-21,Fosfaat anorganisch,1.29,0.78 - 1.42
5,4,1315,2142-02-21,Kalium,4.5,3.5 - 5.3
7,4,1315,2142-02-21,Magnesium,0.91,0.70 - 0.91
...,...,...,...,...,...,...
54023,1601,1076,2140-08-31,Kalium,4.6,3.5 - 5.3
54025,1601,1076,2140-08-31,Leucocyten,3.6,4.3 - 10.0
54026,1601,1076,2140-08-31,Magnesium,0.88,0.66 - 1.07
54086,1601,1076,2140-09-07,ASAT (GOT),11,0 - 31


In [31]:
# Parse the measurement timestamp once and reuse the parsed Series for the
# derived columns.
measured_at = pd.to_datetime(df_vitals['p_DT_METING'])
df_vitals['p_DT_METING'] = measured_at

# Split the timestamp into a calendar date and a time of day.
df_vitals['DATE'] = measured_at.dt.date
df_vitals['TIME'] = measured_at.dt.time

# NOTE(review): not used in any visible cell; looks like an unfinished column
# selection - confirm whether it can be removed or should be completed.
cols_to_keep = ['O_']

# Display only: df_vitals itself keeps its original row order.
df_vitals.sort_values("pid")

Unnamed: 0,pid,intid,Split,seq_num-vitals,p_DT_METING,O_METING,WAARDE1,WAARDE2,DATE,TIME
21996,1,900,Test,2,2140-02-27 11:11:12,Body Mass Index,167.0,159.6,2140-02-27,11:11:12
21797,1,900,Test,1,2140-06-25 11:10:14,Body Mass Index,167.0,159.1,2140-06-25,11:10:14
14149,2,697,Train,5,2138-03-20 13:13:46,Body Mass Index,168.0,46.6,2138-03-20,13:13:46
13279,2,697,Train,1,2138-02-27 19:04:47,Body Mass Index,168.0,45.9,2138-02-27,19:04:47
13337,2,697,Train,2,2138-02-20 18:51:06,Body Mass Index,168.0,46.5,2138-02-20,18:51:06
...,...,...,...,...,...,...,...,...,...,...
69506,1601,1076,Test,149,2140-04-09 09:01:05,Saturatie (%),98.0,,2140-04-09,09:01:05
69553,1601,1076,Test,161,2140-10-15 08:00:00,Temperatuur (c),36.2,,2140-10-15,08:00:00
69616,1601,1076,Test,172,2140-10-09 08:07:49,Tensie / Pols,110.0,83.0,2140-10-09,08:07:49
87697,1606,1637,Train,1,2143-10-13 17:12:28,Body Mass Index,168.0,38.8,2143-10-13,17:12:28


In [20]:
# Parse the start-date column once and reuse the parsed Series below.
start_ts = pd.to_datetime(df_bmi['p_startdate'])
df_bmi['p_startdate'] = start_ts

# Derive a plain calendar date (no time component) from the parsed timestamp.
df_bmi['DATE'] = start_ts.dt.date

# Store the intake id as an integer column.
df_bmi['intid'] = df_bmi['intid'].astype(int)

# Preview the first rows
df_bmi.head()

Unnamed: 0,intid,seq_num-edeq,EDEQ-Score,pid,p_startdate,p_dischargedate,Main-Age,Main-Bsex,edu_level,edu_comp,...,IND_ocd_comorbiditeit,aantal_eerdere_trajecten,andere_comorbiditeiten,duur_stoornis_in_jaren,p_dectool_invultijd,seq_num-dec,uitslag_waarde,verstoord_eetpatroon,volgorde,DATE
0,1491,1,476,0,2143-03-08,,47,Vrouw,"HAVO, HBS, VWO, Atheneum. Gymnasium of MBO (MT...",MBOHAVOVWO,...,0.0,8.0,vermoeden van persoonlijkheidsprobl.,25.0,2142-12-14 00:00:00,1.0,3.0,,1.0,2143-03-08
1,900,1,46,1,2139-07-18,2142-08-03 00:00:00,32,Vrouw,"HAVO, HBS, VWO, Atheneum. Gymnasium of MBO (MT...",MBOHAVOVWO,...,,,,,,,,,,2139-07-18
2,697,1,166,2,2138-01-30,2138-09-14 00:00:00,17,Vrouw,"HAVO, HBS, VWO, Atheneum. Gymnasium of MBO (MT...",MBOHAVOVWO,...,,,,,,,,,,2138-01-30
3,598,1,45,3,2137-06-19,2140-09-14 00:00:00,58,Vrouw,"VMBO-T (MAVO/MULO) afgerond OF HAVO, VWO, Athe...",VMBO-T,...,,,,,,,,,,2137-06-19
4,1315,1,462,4,2142-01-21,2142-10-03 00:00:00,19,Vrouw,"HAVO, HBS, VWO, Atheneum. Gymnasium of MBO (MT...",MBOHAVOVWO,...,0.0,,,,2141-12-22 13:26:28,1.0,3.0,,1.0,2142-01-21


In [28]:
### Merge the dataframes
# Attach patient-level attributes from df_bmi to every lab row in df_reordered.

# Give the attribute columns the English names used in df_reordered. The result
# is rebound to df_bmi (instead of inplace=True) so the cell stays safe to
# re-run and chains cleanly.
df_bmi = df_bmi.rename(
    columns={
        'pid': 'PATIENT_ID',
        'Main-Age': 'AGE',
        'Main-Bsex': 'SEX',
        'duur_stoornis_in_jaren': 'ED_Duration',
    }
)

columns_to_keep = ['PATIENT_ID', 'AGE', 'SEX', 'EDtype', 'ED_Duration', 'Split']

# A left join repeats each lab row once per matching right-hand row, so
# identical attribute rows for the same patient would silently multiply the lab
# data. Collapse exact duplicates before merging.
# NOTE(review): patients with several *different* attribute rows will still
# fan out; if that can happen, consider also joining on the intake id - TODO confirm.
df_patient_info = df_bmi[columns_to_keep].drop_duplicates()

df_merged = df_reordered.merge(df_patient_info, on='PATIENT_ID', how='left')

In [33]:
# Display only: show the merged table ordered by patient; df_merged is not modified.
df_merged.sort_values("PATIENT_ID", ascending=True)

Unnamed: 0,PATIENT_ID,INTAKE_ID,DATE,CHEMICAL_VALUE,VALUE_RESULT,NORMAL_RANGE,AGE,SEX,EDtype,ED_Duration,Split
0,4,1315,2142-02-21,ALAT (GPT),39,0 - 34,19.0,Vrouw,Anorexia nervosa,,Train
1,4,1315,2142-02-21,ASAT (GOT),17,0 - 31,19.0,Vrouw,Anorexia nervosa,,Train
2,4,1315,2142-02-21,Fosfaat anorganisch,1.29,0.78 - 1.42,19.0,Vrouw,Anorexia nervosa,,Train
3,4,1315,2142-02-21,Kalium,4.5,3.5 - 5.3,19.0,Vrouw,Anorexia nervosa,,Train
4,4,1315,2142-02-21,Magnesium,0.91,0.70 - 0.91,19.0,Vrouw,Anorexia nervosa,,Train
...,...,...,...,...,...,...,...,...,...,...,...
21133,1601,1076,2140-08-31,Kalium,4.6,3.5 - 5.3,29.0,Vrouw,Anorexia nervosa,,Test
21134,1601,1076,2140-08-31,Leucocyten,3.6,4.3 - 10.0,29.0,Vrouw,Anorexia nervosa,,Test
21135,1601,1076,2140-08-31,Magnesium,0.88,0.66 - 1.07,29.0,Vrouw,Anorexia nervosa,,Test
21159,1601,1076,2140-09-07,ASAT (GOT),11,0 - 31,29.0,Vrouw,Anorexia nervosa,,Test
