In [85]:
import pandas as pd
import numpy as np

df1 = pd.read_csv("../../Daisy_lab_combined.tsv", sep="\t")

### Extracting Relevant Values and Creating Pivot Data Frame

In [26]:
items = ['Kalium', 'Leucocyten', 'ALAT (GPT)', 'ASAT (GOT)', 'Fosfaat anorganisch', 'Magnesium', 'Glucose (n.n.)']

df_filter = df1[df1['O_AANVR_UITSLAG_ITEM_LANG'].isin(items)]

In [27]:
# Ensure you're working with a copy of the dataframe to avoid the SettingWithCopyWarning
df_new = df_filter.copy()

# Step 1: convert to datetime
df_new['DT_BEPALING'] = pd.to_datetime(df_new['DT_BEPALING'])

# Step 2: Extract the date and time separately
df_new['DATE'] = df_new['DT_BEPALING'].dt.date
df_new['TIME'] = df_new['DT_BEPALING'].dt.time

# Drop the unnecessary columns
df_drop = df_new.drop(['Unnamed: 0','AANVRAAG_NUMMER', 'UITSLAGREGEL', 'STATUS_AANVRAAG', 'O_STATUS_UITSLAG','UITSLAG_CONCLUSIE', 'UITSLAG_TEKST_LAB', 'DT_BEPALING'], axis=1)

df_drop.head()
# Reorder columns
df_reordered = df_drop.iloc[:, [0, 5, 6, 1, 2, 4, 3]]

df_reordered.rename(columns={'SEQ_ZPAT_PATIENT': 'PATIENT_ID', 'O_AANVR_UITSLAG_ITEM_LANG': 'VALUE_TYPE', 'UITSLAG_WAARDE': 'VALUE_RESULT', 'NORMAALWAARDE': 'NORMAL_RANGE'}, inplace=True)

# Step 3: Pivot the table using ITEM_OMS as columns and UITSLAG_WAARDE as values
df_pivot = df_reordered.pivot_table(index=['PATIENT_ID', 'DATE', 'TIME'], columns='VALUE_TYPE', values='VALUE_RESULT', aggfunc='first')

# Step 4: Reset the index for a cleaner look and rename the columns
df_pivot = df_pivot.reset_index()

len_before = len(df_pivot)

# Count the number of occurrences per PATIENT_ID
df_pivot['count_per_patient'] = df_pivot.groupby('PATIENT_ID')['PATIENT_ID'].transform('count')

# Keep only patients with 3 or more entries
df_pivot = df_pivot[df_pivot['count_per_patient'] >= 3]

# Drop the helper column
df_pivot = df_pivot.drop(columns=['count_per_patient'])

len_after = len(df_pivot)

print(f"{len_before-len_after} rows were dropped")

# Display the result
df_pivot.head(20)

# df_pivot.to_excel("df_pivot.xlsx", index = False)

135 rows were dropped


VALUE_TYPE,PATIENT_ID,DATE,TIME,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
1,17313559,2020-01-29,08:00:00,26,25,1.22,3.8,4.0,3.6,0.73
2,17313559,2020-01-30,08:10:00,25,23,1.11,4.1,4.3,4.1,0.73
3,17313559,2020-02-03,08:05:00,32,20,1.37,4.3,4.3,5.0,0.7
4,17313559,2020-02-05,07:55:00,26,17,1.25,4.4,4.2,4.4,0.7
5,17313559,2020-02-06,08:05:00,24,17,1.23,4.3,4.6,4.5,0.7
6,17313559,2020-02-10,07:30:00,17,16,1.18,4.3,4.2,4.6,0.65
7,17313559,2020-02-13,07:45:00,18,16,1.2,3.8,3.6,4.6,0.64
8,17313559,2020-02-17,08:05:00,15,16,1.4,4.4,4.0,5.9,0.65
9,17313559,2020-02-20,07:55:00,16,16,1.28,3.8,4.2,5.7,0.68
10,17313559,2020-02-24,07:50:00,15,18,1.36,3.2,3.8,4.4,0.67


## Combining the DataFrames

In [105]:
df2 = pd.read_csv("../../annonymizedDatasets/maskedDAIsy_LabCombinedNew.csv", sep="\t")
df_vitals = pd.read_csv("../../annonymizedDatasets/maskedDAIsy_Vitals.csv", sep="\t")
df_age = pd.read_csv("../../annonymizedDatasets/maskedDAIsy_AllDatasetsCombinedWoRepIntakes_v1.tsv", sep = "\t")

### CHEMICAL DATAFRAME

In [29]:
# FILTERING OUT THE RELEVANT COLUMNS
items = ['Kalium', 'Leucocyten', 'ALAT (GPT)', 'ASAT (GOT)', 'Fosfaat anorganisch', 'Magnesium', 'Glucose (n.n.)']

df_items = df2[df2['O_ITEM'].isin(items)]

# df_items.head(10)

In [30]:
# PIVOTING THE DATAFRAME SO THAT ONE LINE IS ONE DATE'S CHEMICAL MEASUREMENT
df_new = df_items.copy()

# Step 1: convert to datetime
df_new['p_DATE_BEPALING'] = pd.to_datetime(df_new['p_DATE_BEPALING'])

# Drop the unnecessary columns
df_drop = df_new.drop(['STATUS_AANVRAAG', 'O_STATUS_UITSLAG'], axis=1)

# Reorder columns
df_reordered = df_drop.iloc[:, [0, 1, 3, 4, 2, 5, 6]]

df_reordered.rename(columns={'pid': 'PATIENT_ID', 'intid': 'INTAKE_ID','O_ITEM': 'CHEMICAL_VALUE', 'UITSLAG_WAARDE': 'VALUE_RESULT', 'NORMAALWAARDE': 'NORMAL_RANGE', 'p_DATE_BEPALING': 'DATE', 'seq_num-lab': 'SEQUENCE'}, inplace=True)

df_reordered.sort_values(by = 'PATIENT_ID')

df_pivot = df_reordered.pivot_table(index=['PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'DATE'], columns='CHEMICAL_VALUE', values='VALUE_RESULT', aggfunc='first')

df_pivot = df_pivot.rename_axis(None, axis=1)

pivoted_df = df_pivot.reset_index()

# Set a new index as the row number
pivoted_df['ROW'] = range(1, len(pivoted_df) + 1)
pivoted_df.set_index('ROW', inplace=True)

val_count = pivoted_df['PATIENT_ID'].value_counts()

pivot_df = pivoted_df[pivoted_df['PATIENT_ID'].isin(val_count[val_count >= 3].index)]

pivot_df.head(100)


Unnamed: 0_level_0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
ROW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,10,934,1,2140-01-29,26,25,1.22,3.8,4.0,3.6,0.73
3,10,934,2,2140-01-30,25,23,1.11,4.1,4.3,4.1,0.73
4,10,934,3,2140-02-03,32,20,1.37,4.3,4.3,5.0,0.70
5,10,934,4,2140-02-05,26,17,1.25,4.4,4.2,4.4,0.70
6,10,934,5,2140-02-06,24,17,1.23,4.3,4.6,4.5,0.70
...,...,...,...,...,...,...,...,...,...,...,...
101,56,794,20,2142-01-03,36,18,1.60,4.5,4.1,,0.90
102,56,794,21,2142-01-03,,,,,,4.2,
103,56,794,21,2142-01-10,39,18,1.71,4.0,4.6,,0.93
104,56,794,22,2142-01-10,,,,,,4.7,


### VITALS DATAFRAME (BMI, BLOOD PRESSURE)

In [110]:
# Step 1: convert to datetime
df_vitals['p_DT_METING'] = pd.to_datetime(df_vitals['p_DT_METING'])

# Step 2: Extract the date and time separately
df_vitals['DATE'] = df_vitals['p_DT_METING'].dt.date
df_vitals['TIME'] = df_vitals['p_DT_METING'].dt.time

# Drop the unnecessary columns
df_drop = df_vitals.drop(['Split', 'p_DT_METING', 'TIME'], axis=1)

# Reorder columns
df_reordered = df_drop.iloc[:, [0, 1, 2, 6, 3, 4, 5]]

df_reordered.rename(columns={'pid': 'PATIENT_ID', 'intid': 'INTAKE_ID','O_METING': 'MEASUREMENT ITEM', 'WAARDE1': 'VALUE 1', 'WAARDE2': 'VALUE 2', 'seq_num-vitals': 'SEQUENCE'}, inplace=True)

df_filtered = df_reordered[df_reordered['MEASUREMENT ITEM'].isin(['Body Mass Index', 'Tensie / Pols', 'Temperatuur (c)'])]

df_filtered.sort_values(by = ['PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'DATE'], inplace=True)
df_filtered.head(20)

vitals_df = df_filtered.reset_index()

# vitals_df.columns
# Set a new index as the row number
# vitals_df.drop('index')
# vitals_df.set_index('ROW', inplace=True)

vitals_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,index,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,MEASUREMENT ITEM,VALUE 1,VALUE 2
0,21797,1,900,1,2140-06-25,Body Mass Index,167.0,159.1
1,21996,1,900,2,2140-02-27,Body Mass Index,167.0,159.6
2,13279,2,697,1,2138-02-27,Body Mass Index,168.0,45.9
3,13337,2,697,2,2138-02-20,Body Mass Index,168.0,46.5
4,13730,2,697,3,2138-05-22,Body Mass Index,168.0,48.4
5,13988,2,697,4,2138-08-05,Body Mass Index,168.0,48.0
6,14149,2,697,5,2138-03-20,Body Mass Index,168.0,46.6
7,10880,3,598,1,2137-05-07,Temperatuur (c),36.7,
8,11281,3,598,2,2137-05-07,Tensie / Pols,134.0,77.0
9,11311,3,598,3,2137-09-14,Temperatuur (c),36.4,


In [111]:
vitals_df['MEASUREMENT ITEM'].value_counts()

Body Mass Index    30735
Tensie / Pols      20597
Temperatuur (c)    20563
Name: MEASUREMENT ITEM, dtype: int64

In [139]:
# Initialize list for transformed rows
transformed_rows = []

# Loop over each unique patient ID in the vitals dataframe
for patient_id in vitals_df['PATIENT_ID'].unique():
    patient_data = vitals_df[vitals_df['PATIENT_ID'] == patient_id]

    # Initialize dictionary to hold the transformed row for each patient
    # We will append a new row for each sequence, so no need to overwrite
    for _, row in patient_data.iterrows():
        # Initialize the transformed_row for each individual row
        transformed_row = {
            'PATIENT_ID': patient_id,
            'DATE': row['DATE'],
            'SEQUENCE': row['SEQUENCE'],
            'INTAKE_ID': row['INTAKE_ID'],
            'BMI': np.nan,  # Default values for calculations
            'BP SYST': np.nan,
            'BP DIAS': np.nan
        }

        measurement = row['MEASUREMENT ITEM']

        # BMI (Body Mass Index) calculation
        if measurement == 'Body Mass Index':
            height_in_meters = row['VALUE 1'] / 100  # Convert height from cm to meters
            weight = row['VALUE 2']  # Weight is in kg
            # Check if height is 0 to avoid division by zero
            if height_in_meters == 0:
                bmi = np.nan  # Assign NaN if height is zero
            else:
                bmi = weight / (height_in_meters ** 2)  # Calculate BMI
            transformed_row['BMI'] = bmi

        # Blood Pressure (Tensie / Pols)
        elif measurement == 'Tensie / Pols':
            systolic = row['VALUE 1']
            diastolic = row['VALUE 2']
            transformed_row['BP SYST'] = systolic
            transformed_row['BP DIAS'] = diastolic

        # For other measurements, just assign the value to a new column
        else:
            transformed_row[measurement] = row['VALUE 1']

        # Append the transformed row to the list
        transformed_rows.append(transformed_row)

# Convert the list of transformed rows into a new DataFrame
transformed_df = pd.DataFrame(transformed_rows)

# Now, we pivot the dataframe to consolidate measurements for each (PATIENT_ID, DATE, INTAKE_ID)
# Using the transformed columns themselves, no need for 'VALUE 1'
pivoted_df = transformed_df.pivot_table(
    index=['PATIENT_ID', 'DATE', 'SEQUENCE', 'INTAKE_ID'],
    columns='MEASUREMENT ITEM',
    aggfunc='first',  # If there are duplicates, keep the first one
    dropna=False  # Keep NaN values for missing measurements
)

# Flatten the multi-index columns created by pivot_table
pivoted_df.columns = [f'{col}' for col in pivoted_df.columns]

# Reset index to flatten the dataframe
pivoted_df.reset_index(inplace=True)

# Sort by PATIENT_ID, DATE, and SEQUENCE for consistency
pivoted_df.sort_values(by=['PATIENT_ID', 'DATE', 'SEQUENCE'], inplace=True)

# Show the resulting DataFrame
pivoted_df.head(100)

KeyError: 'MEASUREMENT ITEM'

### AGE AND ED DATAFRAME

In [69]:
df_age['p_startdate'] = pd.to_datetime(df_age['p_startdate'])
df_age['DATE'] = df_age['p_startdate'].dt.date
df_age['intid'] = df_age['intid'].astype(int)

cols = ['intid', 'seq_num-edeq', 'pid', 'DATE', 'Main-Age', 'Main-Bsex', 'EDtype']

df_clean = df_age.loc[:, cols]

df_clean.rename(columns={'pid': 'PATIENT_ID', 'Main-Age': 'AGE', 'Main-Bsex': 'SEX', 'intid': 'INTAKE_ID', 'seq_num-edeq' : 'SEQUENCE'}, inplace=True)

df_filtered = df_clean[df_clean['EDtype'] == 'Anorexia nervosa']

df_filtered.head()

df_reordered = df_filtered.iloc[:, [2, 0, 1, 3, 4, 5, 6]]

df_reordered.head(20)

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,AGE,SEX,EDtype
2,2,697,1,2138-01-30,17,Vrouw,Anorexia nervosa
4,4,1315,1,2142-01-21,19,Vrouw,Anorexia nervosa
8,9,1407,1,2142-08-02,16,Vrouw,Anorexia nervosa
9,10,934,1,2139-08-27,26,Vrouw,Anorexia nervosa
10,13,185,1,2134-04-11,15,Man,Anorexia nervosa
12,15,1402,1,2142-07-21,21,Vrouw,Anorexia nervosa
13,16,1184,1,2141-03-19,15,Vrouw,Anorexia nervosa
16,19,1653,1,2143-11-28,16,Vrouw,Anorexia nervosa
18,21,614,1,2137-06-22,20,Vrouw,Anorexia nervosa
20,24,1340,1,2142-03-22,27,Vrouw,Anorexia nervosa


### MERGE DF_CHEM AND DF_AGE_BMI TOGETHER

In [79]:
# since we only want AN patients, we should first filter pivot_df based on all the PID that are still in df_reordered
# so that we only have AN patients in pivot_df before merging
pivot_df = pivot_df[pivot_df['PATIENT_ID'].isin(df_reordered['PATIENT_ID'])]
pivot_df.head()

# filter on dates and patient ID's that are already in the pivot_df
# df_filtered = df_reordered[df_reordered[['PATIENT_ID', 'DATE']].isin(pivoted_df[['PATIENT_ID', 'DATE']].drop_duplicates()).all(axis=1)]

df_merged = pivot_df.merge(df_reordered, on=['PATIENT_ID', 'INTAKE_ID'], how='left')

df_merged = df_merged.iloc[:, [0,1,2,3, 12, 13, 14, 15, 4,5,6,7,8,9,10]]

df_merged.rename(columns={'SEQUENCE_x': 'SEQUENCE', 'DATE_x' : 'DATE'}, inplace=True)

df_merged.drop('DATE_y', axis=1, inplace=True)

df_merged.head(100)
# nan_count = df_merged.isna().sum()
#
# print(len(df_merged))
# print(nan_count)

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,AGE,SEX,EDtype,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
0,10,934,1,2140-01-29,26.0,Vrouw,Anorexia nervosa,26,25,1.22,3.8,4.0,3.6,0.73
1,10,934,2,2140-01-30,26.0,Vrouw,Anorexia nervosa,25,23,1.11,4.1,4.3,4.1,0.73
2,10,934,3,2140-02-03,26.0,Vrouw,Anorexia nervosa,32,20,1.37,4.3,4.3,5.0,0.70
3,10,934,4,2140-02-05,26.0,Vrouw,Anorexia nervosa,26,17,1.25,4.4,4.2,4.4,0.70
4,10,934,5,2140-02-06,26.0,Vrouw,Anorexia nervosa,24,17,1.23,4.3,4.6,4.5,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,75,1622,11,2143-10-02,17.0,Vrouw,Anorexia nervosa,,,,,,8.5,
96,75,1622,11,2143-10-09,17.0,Vrouw,Anorexia nervosa,23,28,1.16,,,,0.81
97,75,1622,12,2143-10-09,17.0,Vrouw,Anorexia nervosa,,,,,,8.5,
98,75,1622,12,2143-10-16,17.0,Vrouw,Anorexia nervosa,22,20,1.24,,4.2,,0.80


In [124]:
vitals_new['DATE'] = pd.to_datetime(vitals_new['DATE'])
df_merged['DATE'] = pd.to_datetime(df_merged['DATE'])

df_merged = df_merged.sort_values(by=['PATIENT_ID', 'DATE'], ascending=[True, True])
vitals_new = vitals_new.sort_values(by=['PATIENT_ID', 'DATE'], ascending=[True, True])


df_merged
# # asof merge
# df_m = pd.merge_asof(
#     df_merged,
#     vitals_new,
#     on='DATE',
#     by='PATIENT_ID',
#     direction='nearest',
#     tolerance=pd.Timedelta(days=1)
# )
#
# # Count how many rows in pivot_df didn't find a match
# unmatched_rows = df_merged[~df_merged['DATE'].isin(df_m['DATE'])]
#
# print(f"Number of unmatched rows: {len(unmatched_rows)}")

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,AGE,SEX,EDtype,ALAT (GPT),ASAT (GOT),Fosfaat anorganisch,Glucose (n.n.),Kalium,Leucocyten,Magnesium
0,10,934,1,2140-01-29,26.0,Vrouw,Anorexia nervosa,26,25,1.22,3.8,4.0,3.6,0.73
1,10,934,2,2140-01-30,26.0,Vrouw,Anorexia nervosa,25,23,1.11,4.1,4.3,4.1,0.73
2,10,934,3,2140-02-03,26.0,Vrouw,Anorexia nervosa,32,20,1.37,4.3,4.3,5.0,0.70
3,10,934,4,2140-02-05,26.0,Vrouw,Anorexia nervosa,26,17,1.25,4.4,4.2,4.4,0.70
4,10,934,5,2140-02-06,26.0,Vrouw,Anorexia nervosa,24,17,1.23,4.3,4.6,4.5,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3067,1601,1076,24,2140-10-12,29.0,Vrouw,Anorexia nervosa,,,,,,3.9,
3068,1601,1076,24,2140-10-19,29.0,Vrouw,Anorexia nervosa,<9,10,1.36,4.6,4.3,,0.75
3069,1601,1076,25,2140-10-19,29.0,Vrouw,Anorexia nervosa,,,,,,4.3,
3070,1601,1076,25,2140-10-26,29.0,Vrouw,Anorexia nervosa,10,<8,1.41,5.0,4.5,,0.76


In [127]:
vitals_new

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,BMI,BP SYST,BP DIAS,Temperatuur (c)
1,1,900,2,2140-02-27,57.226864,,,
0,1,900,1,2140-06-25,57.047581,,,
3,2,697,2,2138-02-20,16.475340,,,
2,2,697,1,2138-02-27,16.262755,,,
6,2,697,5,2138-03-20,16.510771,,,
...,...,...,...,...,...,...,...,...
71789,1601,1076,274,2140-12-10,,114.0,78.0,
71790,1601,1076,275,2140-12-10,,,,36.4
71886,1601,1076,414,2140-12-15,18.503600,,,
71894,1606,1637,2,2143-07-11,14.066043,,,


In [43]:
dates_chem = df_check2.index.get_level_values(1).unique().tolist()

dates_vitals = df_check["DATE"].unique().tolist()

dates_chem_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_chem]

dates_vitals_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_vitals]

overlap_dates = set(dates_chem_dates).intersection(dates_vitals_dates)

initial_length_chem = len(dates_chem_dates)
initial_length_vitals = len(dates_vitals_dates)
overlap_length = len(overlap_dates)

print("Initial length of dates_chem:", initial_length_chem)
print("Initial length of dates_vitals:", initial_length_vitals)
print("Overlap length:", overlap_length)

print("Overlapping Dates:", overlap_dates)


Initial length of dates_chem: 34
Initial length of dates_vitals: 129
Overlap length: 16
Overlapping Dates: {datetime.date(2139, 11, 13), datetime.date(2139, 12, 23), datetime.date(2140, 1, 27), datetime.date(2139, 12, 30), datetime.date(2139, 11, 28), datetime.date(2139, 11, 14), datetime.date(2139, 11, 25), datetime.date(2139, 12, 16), datetime.date(2140, 1, 20), datetime.date(2139, 11, 11), datetime.date(2139, 11, 27), datetime.date(2139, 11, 18), datetime.date(2139, 12, 27), datetime.date(2140, 1, 13), datetime.date(2139, 10, 31), datetime.date(2139, 10, 30)}


In [48]:
results = []

for pid in df_vitals["PATIENT_ID"].unique():

    if pid in df_vitals["PATIENT_ID"].values and pid in pivot_df.index.get_level_values(0).unique():
        df_check = df_vitals[df_vitals["PATIENT_ID"] == pid]
        df_check2 = pivot_df[pivot_df['PATIENT_ID'] == pid]

        dates_chem = df_check2['DATE'].unique().tolist()
        dates_vitals = df_check["DATE"].unique().tolist()

        dates_chem_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_chem]
        dates_vitals_dates = [d.date() if isinstance(d, pd.Timestamp) else d for d in dates_vitals]

        overlap_dates = set(dates_chem_dates).intersection(dates_vitals_dates)

        # calculate lengths
        initial_length_chem = len(dates_chem_dates)
        initial_length_vitals = len(dates_vitals_dates)
        overlap_length = len(overlap_dates)

        # calc overlap
        overlap_percentage = (overlap_length / initial_length_chem) * 100 if initial_length_chem > 0 else 0

        results.append({
            "Patient ID": pid,
            "Chem Dates Count": initial_length_chem,
            "Vitals Dates Count": initial_length_vitals,
            "Overlap Count": overlap_length,
            "Overlap Percentage": overlap_percentage,
            "Overlapping Dates": list(overlap_dates)
        })

summary_df = pd.DataFrame(results)

summary_df_sorted = summary_df.sort_values(by="Patient ID").reset_index(drop=True)

overlap_percentage_stats = {
    "Min": summary_df_sorted["Overlap Percentage"].min(),
    "Max": summary_df_sorted["Overlap Percentage"].max(),
    "Mean": summary_df_sorted["Overlap Percentage"].mean(),
    "Median": summary_df_sorted["Overlap Percentage"].median()
}

print("Overlap Percentage Statistics:")
print(overlap_percentage_stats)

summary_df_sorted.head(100)


Overlap Percentage Statistics:
{'Min': 0.0, 'Max': 0.0, 'Mean': 0.0, 'Median': 0.0}


Unnamed: 0,Patient ID,Chem Dates Count,Vitals Dates Count,Overlap Count,Overlap Percentage,Overlapping Dates
0,2,0,5,0,0.0,[]
1,3,0,4,0,0.0,[]
2,4,0,11,0,0.0,[]
3,5,0,14,0,0.0,[]
4,9,0,1,0,0.0,[]
...,...,...,...,...,...,...
95,151,24,57,0,0.0,[]
96,152,0,10,0,0.0,[]
97,156,0,54,0,0.0,[]
98,158,0,33,0,0.0,[]
