In [1]:
import pandas as pd

df1 = pd.read_csv("../../Daisy_lab_combined.tsv", sep="\t")

### Extracting Relevant Values and Creating Pivot Data Frame

In [2]:
items = ['Kalium', 'Leucocyten', 'ALAT (GPT)', 'ASAT (GOT)', 'Fosfaat anorganisch', 'Magnesium', 'Glucose (n.n.)']

df_filter = df1[df1['O_AANVR_UITSLAG_ITEM_LANG'].isin(items)]

In [3]:
# Ensure you're working with a copy of the dataframe to avoid the SettingWithCopyWarning
df_new = df_filter.copy()

# Step 1: convert to datetime
df_new['DT_BEPALING'] = pd.to_datetime(df_new['DT_BEPALING'])

# Step 2: Extract the date and time separately
df_new['DATE'] = df_new['DT_BEPALING'].dt.date
df_new['TIME'] = df_new['DT_BEPALING'].dt.time

# Drop the unnecessary columns
df_drop = df_new.drop(['Unnamed: 0','AANVRAAG_NUMMER', 'UITSLAGREGEL', 'STATUS_AANVRAAG', 'O_STATUS_UITSLAG','UITSLAG_CONCLUSIE', 'UITSLAG_TEKST_LAB', 'DT_BEPALING'], axis=1)

df_drop.head()
# Reorder columns
df_reordered = df_drop.iloc[:, [0, 5, 6, 1, 2, 4, 3]]

df_reordered.rename(columns={'SEQ_ZPAT_PATIENT': 'PATIENT_ID', 'O_AANVR_UITSLAG_ITEM_LANG': 'VALUE_TYPE', 'UITSLAG_WAARDE': 'VALUE_RESULT', 'NORMAALWAARDE': 'NORMAL_RANGE'}, inplace=True)

# Step 3: Pivot the table using ITEM_OMS as columns and UITSLAG_WAARDE as values
df_pivot = df_reordered.pivot_table(index=['PATIENT_ID', 'DATE', 'TIME'], columns='VALUE_TYPE', values='VALUE_RESULT', aggfunc='first')

# Step 4: Reset the index for a cleaner look and rename the columns
df_pivot = df_pivot.reset_index()

len_before = len(df_pivot)

# Count the number of occurrences per PATIENT_ID
df_pivot['count_per_patient'] = df_pivot.groupby('PATIENT_ID')['PATIENT_ID'].transform('count')

# Keep only patients with 3 or more entries
df_pivot = df_pivot[df_pivot['count_per_patient'] >= 3]

# Drop the helper column
df_pivot = df_pivot.drop(columns=['count_per_patient'])

len_after = len(df_pivot)

print(f"{len_before-len_after} rows were dropped")

# Display the result
df_pivot.head(20)

# df_pivot.to_excel("df_pivot.xlsx", index = False)

135 rows were dropped


VALUE_TYPE,PATIENT_ID,DATE,TIME,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
1,17313559,2020-01-29,08:00:00,26,25,1.22,3.8,4.0,3.6,0.73
2,17313559,2020-01-30,08:10:00,25,23,1.11,4.1,4.3,4.1,0.73
3,17313559,2020-02-03,08:05:00,32,20,1.37,4.3,4.3,5.0,0.7
4,17313559,2020-02-05,07:55:00,26,17,1.25,4.4,4.2,4.4,0.7
5,17313559,2020-02-06,08:05:00,24,17,1.23,4.3,4.6,4.5,0.7
6,17313559,2020-02-10,07:30:00,17,16,1.18,4.3,4.2,4.6,0.65
7,17313559,2020-02-13,07:45:00,18,16,1.2,3.8,3.6,4.6,0.64
8,17313559,2020-02-17,08:05:00,15,16,1.4,4.4,4.0,5.9,0.65
9,17313559,2020-02-20,07:55:00,16,16,1.28,3.8,4.2,5.7,0.68
10,17313559,2020-02-24,07:50:00,15,18,1.36,3.2,3.8,4.4,0.67


## Combining the DataFrames

In [75]:
df2 = pd.read_csv("../../annonymizedDatasets/maskedDAIsy_LabCombined.csv", sep="\t")
df_vitals = pd.read_csv("../../annonymizedDatasets/maskedDAIsy_Vitals.csv", sep="\t")
df_age = pd.read_csv("../../annonymizedDatasets/maskedDAIsy_AllDatasetsCombinedWoRepIntakes_v1.tsv", sep = "\t")

### CHEMICAL DATAFRAME

In [54]:
# FILTERING OUT THE RELEVANT COLUMNS
items = ['Kalium', 'Leucocyten', 'ALAT (GPT)', 'ASAT (GOT)', 'Fosfaat anorganisch', 'Magnesium', 'Glucose (n.n.)']

df_items = df2[df2['O_ITEM'].isin(items)]

# df_items.head(10)

In [62]:
# PIVOTING THE DATAFRAME SO THAT ONE LINE IS ONE DATE'S CHEMICAL MEASUREMENT
df_new = df_items.copy()

# Step 1: convert to datetime
df_new['p_DATE_BEPALING'] = pd.to_datetime(df_new['p_DATE_BEPALING'])

# Drop the unnecessary columns
df_drop = df_new.drop(['STATUS_AANVRAAG', 'O_STATUS_UITSLAG'], axis=1)

# Reorder columns
df_reordered = df_drop.iloc[:, [0, 1, 3, 4, 2, 5, 6]]

df_reordered.rename(columns={'pid': 'PATIENT_ID', 'intid': 'INTAKE_ID','O_ITEM': 'CHEMICAL_VALUE', 'UITSLAG_WAARDE': 'VALUE_RESULT', 'NORMAALWAARDE': 'NORMAL_RANGE', 'p_DATE_BEPALING': 'DATE', 'seq_num-lab': 'SEQUENCE'}, inplace=True)

df_reordered.sort_values(by = 'PATIENT_ID')

df_pivot = df_reordered.pivot_table(index=['PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'DATE'], columns='CHEMICAL_VALUE', values='VALUE_RESULT', aggfunc='first')

df_pivot = df_pivot.rename_axis(None, axis=1)

pivoted_df = df_pivot.reset_index()

# Set a new index as the row number
pivoted_df['ROW'] = range(1, len(pivoted_df) + 1)
pivoted_df.set_index('ROW', inplace=True)

pivoted_df.head(100)


Unnamed: 0_level_0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
ROW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,4,1315,1,2142-02-21,39,17,1.29,,4.5,,0.91
2,10,934,1,2140-01-29,26,25,1.22,3.8,4.0,3.6,0.73
3,10,934,2,2140-01-30,25,23,1.11,4.1,4.3,4.1,0.73
4,10,934,3,2140-02-03,32,20,1.37,4.3,4.3,5.0,0.70
5,10,934,4,2140-02-05,26,17,1.25,4.4,4.2,4.4,0.70
...,...,...,...,...,...,...,...,...,...,...,...
96,25,1294,30,2142-12-19,,,,,,5.6,
97,25,1294,30,2143-09-25,35,46,1.22,,4.2,,1.00
98,25,1294,31,2143-09-25,,,,,,3.3,
99,25,1294,31,2143-09-26,37,47,1.26,,4.0,,0.98


### VITALS DATAFRAME (BMI, BLOOD PRESSURE)

In [69]:
# Step 1: convert to datetime
df_vitals['p_DT_METING'] = pd.to_datetime(df_vitals['p_DT_METING'])

# Step 2: Extract the date and time separately
df_vitals['DATE'] = df_vitals['p_DT_METING'].dt.date
df_vitals['TIME'] = df_vitals['p_DT_METING'].dt.time

# Drop the unnecessary columns
df_drop = df_vitals.drop(['Split', 'p_DT_METING', 'TIME'], axis=1)

# Reorder columns
df_reordered = df_drop.iloc[:, [0, 1, 2, 6, 3, 4, 5]]

df_reordered.rename(columns={'pid': 'PATIENT_ID', 'intid': 'INTAKE_ID','O_METING': 'MEASUREMENT ITEM', 'WAARDE1': 'VALUE 1', 'WAARDE2': 'VALUE 2', 'seq_num-vitals': 'SEQUENCE'}, inplace=True)

df_reordered.sort_values(by = ['PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'DATE'], inplace=True)
df_reordered.head(20)

df_vitals = df_reordered.reset_index()

# Set a new index as the row number
df_vitals['ROW'] = range(1, len(df_vitals) + 1)
df_vitals.set_index('ROW', inplace=True)

df_vitals.head(10)

Unnamed: 0_level_0,index,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,MEASUREMENT ITEM,VALUE 1,VALUE 2
ROW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,21797,1,900,1,2140-06-25,Body Mass Index,167.0,159.1
2,21996,1,900,2,2140-02-27,Body Mass Index,167.0,159.6
3,13279,2,697,1,2138-02-27,Body Mass Index,168.0,45.9
4,13337,2,697,2,2138-02-20,Body Mass Index,168.0,46.5
5,13730,2,697,3,2138-05-22,Body Mass Index,168.0,48.4
6,13988,2,697,4,2138-08-05,Body Mass Index,168.0,48.0
7,14149,2,697,5,2138-03-20,Body Mass Index,168.0,46.6
8,10880,3,598,1,2137-05-07,Temperatuur (c),36.7,
9,11281,3,598,2,2137-05-07,Tensie / Pols,134.0,77.0
10,11311,3,598,3,2137-09-14,Temperatuur (c),36.4,


### AGE AND ED DATAFRAME

In [87]:
df_age['p_startdate'] = pd.to_datetime(df_age['p_startdate'])
df_age['DATE'] = df_age['p_startdate'].dt.date
df_age['intid'] = df_age['intid'].astype(int)

cols = ['intid', 'seq_num-edeq', 'pid', 'DATE', 'Main-Age', 'Main-Bsex', 'BMI', 'duur_stoornis_in_jaren', 'EDtype']

df_clean = df_age.loc[:, cols]

df_clean.rename(columns={'pid': 'PATIENT_ID', 'Main-Age': 'AGE', 'Main-Bsex': 'SEX', 'duur_stoornis_in_jaren': 'ED_Duration', 'intid': 'INTAKE_ID', 'seq_num-edeq' : 'SEQUENCE'}, inplace=True)

df_reordered = df_clean.iloc[:, [2,0,1,4,8,7,5,6]]

df_reordered.head()

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,AGE,EDtype,ED_Duration,SEX,BMI
0,0,1491,1,47,Binge-ED,25.0,Vrouw,38.0
1,1,900,1,32,Binge-ED,,Vrouw,
2,2,697,1,17,Anorexia nervosa,,Vrouw,
3,3,598,1,58,Others,,Vrouw,
4,4,1315,1,19,Anorexia nervosa,,Vrouw,15.3


### MERGE DF_CHEM AND DF_AGE_BMI TOGETHER

In [93]:
df_merged = pivoted_df.merge(df_reordered, on=['PATIENT_ID', 'SEQUENCE', 'INTAKE_ID'], how='left')

df_merged = df_merged.iloc[:, [0,1,2,3, 11, 12, 13, 14, 15, 4,5,6,7,8,9,10]]

df_merged.head()

nan_count = df_merged.isna().sum()

print(len(df_merged))
print(nan_count)

5217
PATIENT_ID                0
INTAKE_ID                 0
SEQUENCE                  0
DATE                      0
AGE                    5021
EDtype                 5021
ED_Duration            5148
SEX                    5021
BMI                    5154
ALAT (GPT)             2149
ASAT (GOT)             2167
Fosfaat anorganisch    2161
Glucose (n.n.)         2505
Kalium                 2140
Leucocyten             2090
Magnesium              2160
dtype: int64


In [94]:
nan_rows = df_merged[df_merged['ASAT (GOT)'].isna()]

print(nan_rows)

      PATIENT_ID  INTAKE_ID  SEQUENCE       DATE  AGE EDtype  ED_Duration  \
28            18       1123         9 2141-03-01  NaN    NaN          NaN   
30            18       1123        10 2141-03-08  NaN    NaN          NaN   
32            18       1123        11 2141-03-09  NaN    NaN          NaN   
34            18       1123        12 2141-03-15  NaN    NaN          NaN   
59            25       1294        12 2141-12-20  NaN    NaN          NaN   
...          ...        ...       ...        ...  ...    ...          ...   
5208        1601       1076        22 2140-09-28  NaN    NaN          NaN   
5210        1601       1076        23 2140-10-05  NaN    NaN          NaN   
5212        1601       1076        24 2140-10-12  NaN    NaN          NaN   
5214        1601       1076        25 2140-10-19  NaN    NaN          NaN   
5216        1601       1076        26 2140-10-26  NaN    NaN          NaN   

      SEX  BMI ALAT (GPT) ASAT (GOT) Fosfaat anorganisch Glucose (n.n.)  \


In [41]:
df_check = df_vitals[df_vitals["pid"] == 1580]

df_check.head(100)

Unnamed: 0,pid,intid,Split,seq_num-vitals,p_DT_METING,O_METING,WAARDE1,WAARDE2,DATE,TIME
66429,1580,948,Train,1,2139-11-13 06:22:02,Temperatuur (c),36.2,,2139-11-13,06:22:02
66432,1580,948,Train,2,2139-11-15 08:27:25,Tensie / Pols,93.0,68.00,2139-11-15,08:27:25
66438,1580,948,Train,3,2139-10-29 21:18:57,Temperatuur (c),36.1,,2139-10-29,21:18:57
66440,1580,948,Train,4,2139-04-11 16:00:58,Tensie / Pols,85.0,59.00,2139-04-11,16:00:58
66441,1580,948,Train,5,2139-05-11 09:16:31,Defaecatie,1.0,,2139-05-11,09:16:31
...,...,...,...,...,...,...,...,...,...,...
66846,1580,948,Train,96,2139-11-16 10:11:50,Body Mass Index,169.0,42.80,2139-11-16,10:11:50
66847,1580,948,Train,97,2139-11-17 13:49:18,Body Mass Index,169.0,42.55,2139-11-17,13:49:18
66864,1580,948,Train,98,2139-11-29 10:16:02,Body Mass Index,169.0,44.20,2139-11-29,10:16:02
66868,1580,948,Train,99,2139-11-25 07:37:47,Body Mass Index,169.0,43.80,2139-11-25,07:37:47


In [42]:
df_check2 = df_pivot[df_pivot.index.get_level_values(0) == 1580]
# df_filtered = df[df.index.get_level_values(0) == 10]

df_check2.head(20)
# df_pivot.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
PATIENT_ID,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1580,2139-10-30,56,29,1.35,3.6,4.0,2.4,0.78
1580,2139-10-31,55,36,1.29,3.7,4.0,2.1,0.8
1580,2139-11-01,60,41,1.32,3.4,4.3,2.3,0.86
1580,2139-11-04,45,23,1.34,3.8,4.6,2.2,0.83
1580,2139-11-05,45,22,1.29,3.5,4.6,2.3,0.77
1580,2139-11-06,45,24,1.38,3.8,5.1,2.2,0.79
1580,2139-11-07,43,23,1.34,3.8,5.0,2.8,0.81
1580,2139-11-08,47,27,1.35,3.9,4.9,2.4,0.82
1580,2139-11-09,42,25,1.26,,4.8,,0.9
1580,2139-11-10,43,27,1.27,,5.0,,0.8


In [43]:
dates_chem = df_check2.index.get_level_values(1).unique().tolist()

dates_vitals = df_check["DATE"].unique().tolist()

dates_chem_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_chem]

dates_vitals_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_vitals]

overlap_dates = set(dates_chem_dates).intersection(dates_vitals_dates)

initial_length_chem = len(dates_chem_dates)
initial_length_vitals = len(dates_vitals_dates)
overlap_length = len(overlap_dates)

print("Initial length of dates_chem:", initial_length_chem)
print("Initial length of dates_vitals:", initial_length_vitals)
print("Overlap length:", overlap_length)

print("Overlapping Dates:", overlap_dates)


Initial length of dates_chem: 34
Initial length of dates_vitals: 129
Overlap length: 16
Overlapping Dates: {datetime.date(2139, 11, 13), datetime.date(2139, 12, 23), datetime.date(2140, 1, 27), datetime.date(2139, 12, 30), datetime.date(2139, 11, 28), datetime.date(2139, 11, 14), datetime.date(2139, 11, 25), datetime.date(2139, 12, 16), datetime.date(2140, 1, 20), datetime.date(2139, 11, 11), datetime.date(2139, 11, 27), datetime.date(2139, 11, 18), datetime.date(2139, 12, 27), datetime.date(2140, 1, 13), datetime.date(2139, 10, 31), datetime.date(2139, 10, 30)}


In [50]:
results = []

for pid in df_vitals["pid"].unique():

    if pid in df_vitals["pid"].values and pid in df_pivot.index.get_level_values(0).unique():
        df_check = df_vitals[df_vitals["pid"] == pid]
        df_check2 = df_pivot[df_pivot.index.get_level_values(0) == pid]

        dates_chem = df_check2.index.get_level_values(1).unique().tolist()
        dates_vitals = df_check["DATE"].unique().tolist()

        dates_chem_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_chem]
        dates_vitals_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_vitals]

        overlap_dates = set(dates_chem_dates).intersection(dates_vitals_dates)

        # calculate lengths
        initial_length_chem = len(dates_chem_dates)
        initial_length_vitals = len(dates_vitals_dates)
        overlap_length = len(overlap_dates)

        # calc overlap
        overlap_percentage = (overlap_length / initial_length_chem) * 100 if initial_length_chem > 0 else 0

        results.append({
            "Patient ID": pid,
            "Chem Dates Count": initial_length_chem,
            "Vitals Dates Count": initial_length_vitals,
            "Overlap Count": overlap_length,
            "Overlap Percentage": overlap_percentage,
            "Overlapping Dates": list(overlap_dates)
        })

summary_df = pd.DataFrame(results)

summary_df_sorted = summary_df.sort_values(by="Patient ID").reset_index(drop=True)

overlap_percentage_stats = {
    "Min": summary_df_sorted["Overlap Percentage"].min(),
    "Max": summary_df_sorted["Overlap Percentage"].max(),
    "Mean": summary_df_sorted["Overlap Percentage"].mean(),
    "Median": summary_df_sorted["Overlap Percentage"].median()
}

print("Overlap Percentage Statistics:")
print(overlap_percentage_stats)

summary_df_sorted.head(100)


Overlap Percentage Statistics:
{'Min': 0.0, 'Max': 100.0, 'Mean': 47.909374459747525, 'Median': 53.13725490196079}


Unnamed: 0,Patient ID,Chem Dates Count,Vitals Dates Count,Overlap Count,Overlap Percentage,Overlapping Dates
0,4,1,11,0,0.000000,[]
1,10,18,68,9,50.000000,"[2140-01-30, 2140-02-17, 2140-02-24, 2140-01-2..."
2,18,11,44,4,36.363636,"[2141-03-15, 2141-02-22, 2141-02-16, 2141-02-03]"
3,24,2,4,0,0.000000,[]
4,25,51,149,22,43.137255,"[2142-10-24, 2141-11-29, 2141-11-15, 2143-10-2..."
...,...,...,...,...,...,...
95,818,1,2,1,100.000000,[2141-06-24]
96,824,24,136,15,62.500000,"[2141-07-29, 2141-08-09, 2140-12-21, 2141-08-2..."
97,834,1,5,1,100.000000,[2140-11-16]
98,841,1,5,0,0.000000,[]
