In [1]:
import pandas as pd
import random
import datetime as dt
import numpy as np

In [5]:
df_demo = pd.read_csv("../anonymized_Labels_refeeding.csv", sep='\t')
df_lab = pd.read_csv("../anonymized_Labels_refeeding_lab.csv", sep='\t')
df_vitals = pd.read_csv("../anonymized_Labels_refeeding_metingen.csv", sep='\t' )

df_main = pd.read_csv("BEP_imputed.csv")

In [6]:
# Set seed for reproducibility
random.seed(42)

# Get existing values
main_int = df_main['INTAKE_ID'].dropna().astype(int).unique().tolist()
new_int = df_demo['intid'].dropna().astype(int).unique().tolist()
main_pid = df_main['PATIENT_ID'].dropna().astype(int).unique().tolist()
new_pid = df_demo['pid'].dropna().astype(int).unique().tolist()

# Combine sets for uniqueness checks
existing_int_ids = set(main_int + new_int)
existing_pids = set(main_pid + new_pid)

# Determine typical lengths
typical_int_len = int(pd.Series(main_int).astype(str).str.len().median())
typical_pid_len = int(pd.Series(main_pid).astype(str).str.len().median())

# Function to generate a unique integer ID
def generate_unique_id(length, existing_ids):
    while True:
        new_id = random.randint(10**(length - 1), 10**length - 1)
        if new_id not in existing_ids:
            existing_ids.add(new_id)  # Add to set to prevent reuse
            return new_id

# Apply masking for missing values
for idx, row in df_demo.iterrows():
    if pd.isna(row['intid']):
        df_demo.at[idx, 'intid'] = generate_unique_id(typical_int_len, existing_int_ids)
    if pd.isna(row['pid']):
        df_demo.at[idx, 'pid'] = generate_unique_id(typical_pid_len, existing_pids)

# df = df_demo.drop(['cid', 'ggzob_id'], axis=1)
df1 = df_demo.copy()

#### REMOVE LATER, FOR NOW DROP THE DUPLICATES
df1 = df1.drop_duplicates(subset='pid', keep='first')

In [7]:
df1['datum_baseline'] = pd.to_datetime(df1['datum_baseline'])
df1['DATE'] = df1['datum_baseline'].astype('int64') // 10**9  # UNIX timestamp

# Convert INTAKE_ID to int
df1['intid'] = df1['intid'].astype(int)
df1['pid'] = df1['pid'].astype(int)

# Rename columns
df1.rename(columns={
    'pid': 'PATIENT_ID',
    'intid': 'INTAKE_ID',
    'leeftijd_baseline': 'AGE',
    'Geslacht': 'SEX'
}, inplace=True)

# Map gender to binary
df1['SEX'] = df1['SEX'].map({'Vrouw': 1, 'Man': 0})

# Reorder columns
df1 = df1[['PATIENT_ID', 'INTAKE_ID', 'DATE', 'SEX', 'AGE', 'cid','ggzob_id']]

  df1['datum_baseline'] = pd.to_datetime(df1['datum_baseline'])


In [8]:
df1

Unnamed: 0,PATIENT_ID,INTAKE_ID,DATE,SEX,AGE,cid,ggzob_id
0,56,794,1537833600,1,52,74144132,787361
1,325,1666,1706486400,1,21,443935059,893618
2,340,440,1455148800,1,50,457621420,547117
7,530,1190,1617062400,1,44,747031514,874023
9,893,1689,1706572800,1,25,1253153039,894537
10,1048,1305,1640304000,1,55,1441486767,879294
12,1171,1157,1611014400,1,22,1587832861,871683
13,1231,1556,1685491200,1,19,1665746275,889516
14,1317,1314,1638748800,1,20,1766861724,879459
15,350,5506,1720051200,1,33,1791108947,896935


In [9]:
set_a = set(df1['ggzob_id'].unique().tolist())
set_b = set(df_lab['ggzob_id'].unique().tolist())

overlap = set_a & set_b
only_in_a = set_a - set_b
only_in_b = set_b - set_a

print(overlap, only_in_a, only_in_b)

set() {787361, 889633, 879459, 871683, 874023, 896935, 894537, 889516, 547117, 862285, 893618, 886579, 879294} {':Metabolisme: NORMAAL', ':Opmerking Protrombinetijd:  Streefgebied', 'Onderstaande resultaten zijn doorgebeld door Marquise Benilia ', ':Dubieus  7 - 10 IU/ml', ':Antwoord/advies: Past NIET bij hoge gevoeligheid voor coumarines. Op basis van het VKORC1', '900940', ':CYP3A4 genotype', 'symptomen uitblijven.', '894537', '825641', '880114', 'Resulaten per fax doorgegeven op 04.12.2019 (16.13) door  Danielle van den Nobelen aan fax', 'Resulaten per fax doorgegeven op 12.04.2022 (14.40) door  Cindy van de Werp - Vromans aan fax', ':CYP2C19 genotype', '879459', 'Onderstaande resultaten zijn doorgebeld door Tamara Brandenburg - Dummer ', '--------------------', ':Antwoord/advies: Past bij normale OATP1B1 (SLCO1B1) transporter activiteit.', '547117', ':Positief   > 10 IU/ml', 'Aan :  GGZ Oost Brabant / kim', '862285', 'Datum : 15.08.2022 om 13:48', ':Conclusie: CYP3A41/1 (2 actieve 

In [10]:
df_lab = pd.read_csv("../anonymized_Labels_refeeding_lab.csv", sep='\t')

 ### LAB DATAFRAME

In [11]:
# Filter relevant lab items
df_lab = df_lab[df_lab['O_AANVR_UITSLAG_ITEM_LANG'].isin([
    'Kalium', 'Leucocyten', 'ALAT (GPT)', 'ASAT (GOT)',
    'Fosfaat anorganisch', 'Magnesium', 'Glucose (n.n.)'
])].copy()

# Convert date and extract UNIX timestamp
df_lab['DT_BEPALING'] = pd.to_datetime(df_lab['DT_BEPALING'], dayfirst=True, errors='coerce')
df_lab['DATE'] = df_lab['DT_BEPALING'].dt.normalize()
df_lab['DATE'] = df_lab['DATE'].astype('int64') // 10 ** 9

df_lab['ggzob_id'] = df_lab['ggzob_id'].astype(int)

# Drop unneeded columns
df_lab.drop(columns=['STATUS_AANVRAAG', 'O_STATUS_UITSLAG', 'DT_BEPALING', 'SEQ_ZPAT_PATIENT', 'UITSLAG_CONCLUSIE', 'UITSLAG_TEKST_LAB', 'NORMAALWAARDE', 'AANVRAAG_NUMMER', 'UITSLAGREGEL', 'intid'], inplace=True)

df_lab

Unnamed: 0,O_AANVR_UITSLAG_ITEM_LANG,UITSLAG_WAARDE,ggzob_id,DATE
10045,Leucocyten,4.4,859692,1575849600
10046,Leucocyten,4.4,859692,-9223372037
10047,Leucocyten,4.0,859692,1575244800
10048,Leucocyten,6.2,859692,1574899200
10049,Leucocyten,4.8,859692,1575417600
...,...,...,...,...
18942,Fosfaat anorganisch,1.24,889516,1743552000
23099,Leucocyten,Negatief,871683,1640563200
23100,Leucocyten,Negatief,871683,1674432000
23101,Leucocyten,Negatief,871683,1665446400


In [12]:
def clean_column(value):
    """
    Cleans individual lab result values by handling special cases.

    This function is designed to clean lab result values that may include:
    - Values like "<8", which are converted to one less than the number (e.g., "<8" → 7)
    - Purely alphabetical values (e.g., "NEG", "POS"), which are converted to NaN
    - Any numeric values are returned as-is

    :param value: A single lab result value (typically string or numeric)
    :return: A cleaned numeric value or NaN if the original was non-numeric text
    """
    if isinstance(value, str):
        if value.startswith("<"):
            return int(value[1:]) - 1  # Convert "<8" to 7
        elif value.isalpha():  # Check if the value is only letters
            return np.nan  # Replace letters with NaN
    return value  # Keep numeric values as they are

In [13]:
df_lab = df_lab[df_lab['ggzob_id'].isin(df1['ggzob_id'])]

# get the pseudonimized patient_id and intake_id from the df_demo
df_lab = df_lab.merge(
    df1[['ggzob_id', 'PATIENT_ID', 'INTAKE_ID']],
    on='ggzob_id',
    how='left')

# Rename columns
df_lab.rename(columns={
    'O_AANVR_UITSLAG_ITEM_LANG': 'CHEMICAL_VALUE',
    'UITSLAG_WAARDE': 'VALUE_RESULT'
}, inplace=True)

# Sort and pivot
df_lab.sort_values(by='PATIENT_ID', inplace=True)

df_lab = df_lab.pivot_table(
    index=['PATIENT_ID', 'INTAKE_ID', 'DATE', 'ggzob_id'],
    columns='CHEMICAL_VALUE',
    values='VALUE_RESULT',
    aggfunc='first'
).rename_axis(None, axis=1).reset_index()

# Add row index
df_lab['ROW'] = range(1, len(df_lab) + 1)
df_lab.set_index('ROW', inplace=True)

# Keep only patients with 3 or more measurements
patient_counts = df_lab['PATIENT_ID'].value_counts()
df_lab = df_lab[df_lab['PATIENT_ID'].isin(patient_counts[patient_counts >= 3].index)]

# Rename columns to clean variable names
df_lab.rename(columns={
    'ASAT (GOT)': 'AST',
    'ALAT (GPT)': 'ALT',
    'Fosfaat anorganisch': 'Phosphate',
    'Kalium': 'Potassium',
    'Leucocyten': 'Leucocytes',
    'Glucose (n.n.)': 'Glucose'
}, inplace=True)

# Clean and convert columns
for col in ['ALT', 'AST']:
    df_lab[col] = df_lab[col].apply(clean_column)

to_convert = ['Magnesium', 'ALT', 'AST', 'Phosphate', 'Glucose', 'Potassium', 'Leucocytes']
df_lab[to_convert] = df_lab[to_convert].apply(pd.to_numeric, errors='coerce')

df_lab['SEQUENCE'] = df_lab.groupby(['PATIENT_ID', 'INTAKE_ID'])['DATE'].rank(method='first').astype(int)

df_lab

Unnamed: 0_level_0,PATIENT_ID,INTAKE_ID,DATE,ggzob_id,ALT,AST,Phosphate,Glucose,Potassium,Leucocytes,Magnesium,SEQUENCE
ROW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,56,794,-9223372037,787361,51.0,11.0,1.04,3.9,4.3,2.3,0.90,1
2,56,794,1634688000,787361,120.0,59.0,0.38,3.6,4.8,3.9,0.90,2
3,56,794,1634774400,787361,133.0,63.0,0.88,4.0,4.4,3.2,0.86,3
4,56,794,1634860800,787361,122.0,44.0,1.30,4.4,4.7,3.9,0.81,4
5,56,794,1635120000,787361,123.0,41.0,1.15,3.9,4.5,3.2,0.88,5
...,...,...,...,...,...,...,...,...,...,...,...,...
373,1593,1025,1582502400,862285,20.0,21.0,1.51,4.7,4.3,4.4,0.85,9
374,1593,1025,1583107200,862285,24.0,19.0,1.45,4.7,4.2,4.8,0.82,10
375,1593,1025,1583712000,862285,26.0,21.0,1.44,4.6,4.1,4.6,0.87,11
376,1593,1025,1584316800,862285,25.0,20.0,1.60,,4.2,4.8,0.81,12


### VITALS DATAFRAME

In [403]:
df_vitals = pd.read_csv("../anonymized_Labels_refeeding_metingen.csv", sep='\t' )
df_vitals

Unnamed: 0,DT_METING,OPMERKING,O_METING,WAARDE1,WAARDE2,cid,ggzob_id,intid,pid
0,26-02-2016 09:45,Oor,Temperatuur (c),355,,4.576214e+08,547117,,340.0
1,26-02-2016 22:05,pols: 63 (),Tensie / Pols,119,84,4.576214e+08,547117,,340.0
2,27-02-2016 07:47,Oor,Temperatuur (c),355,,4.576214e+08,547117,,340.0
3,26-03-2016 10:55,BMI 14.5,Body Mass Index,162,3805,4.576214e+08,547117,,340.0
4,05-04-2016 07:53,BMI 14.84,Body Mass Index,162,3895,4.576214e+08,547117,,340.0
...,...,...,...,...,...,...,...,...,...
10405,26-01-2025 08:15,,Bloedsuiker (mmol/L),41,,4.576214e+08,900233,,340.0
10406,28-02-2025 11:03,BMI 14.95,Body Mass Index,1605,385,4.576214e+08,900233,,340.0
10407,31-03-2025 09:34,zittend pols: 64 (),Tensie / Pols,117,84,4.576214e+08,900233,,340.0
10408,09-04-2025 09:40,zittend pols: 64 (),Tensie / Pols,90,69,1.441487e+09,900940,,1048.0


In [404]:
# Convert to datetime and extract date/time
df_vitals['DT_METING'] = pd.to_datetime(df_vitals['DT_METING'], dayfirst=True, errors='coerce')
df_vitals['DATE'] = df_vitals['DT_METING'].dt.normalize()
df_vitals['TIME'] = df_vitals['DT_METING'].dt.time
df_vitals['DATE'] = df_vitals['DATE'].astype('int64') // 10**9  # Unix timestamp

# Filter relevant measurement items
df_vitals = df_vitals[df_vitals['O_METING'].isin([
    'Body Mass Index', 'Tensie / Pols', 'Temperatuur (c)'
])].copy()

df_vitals[['ggzob_id', 'cid']] = df_vitals[['ggzob_id', 'cid']].astype(int)

df_vitals = df_vitals[df_vitals['ggzob_id'].isin(df1['ggzob_id'])]

# get the pseudonimized patient_id and intake_id from the df_demo
df_vitals = df_vitals.merge(
    df1[['ggzob_id', 'PATIENT_ID', 'INTAKE_ID']],
    on='ggzob_id',
    how='left'
)

# Drop unnecessary columns
df_vitals.drop(columns=['DT_METING', 'TIME', 'OPMERKING', 'intid', 'pid'], inplace=True)

# Rename columns
df_vitals.rename(columns={
    'O_METING': 'MEASUREMENT ITEM',
    'WAARDE1': 'VALUE 1',
    'WAARDE2': 'VALUE 2'
}, inplace=True)

# Sort and reset index
df_vitals.sort_values(by=['PATIENT_ID', 'INTAKE_ID', 'DATE'], inplace=True)
df_vitals.reset_index(drop=True, inplace=True)

# Melt value columns
df_melted = df_vitals.melt(
    id_vars=['PATIENT_ID', 'INTAKE_ID', 'ggzob_id', 'cid', 'DATE', 'MEASUREMENT ITEM'],
    value_vars=['VALUE 1', 'VALUE 2'],
    var_name='VALUE_TYPE',
    value_name='VALUE'
)

# Append ' 1' or ' 2' based on VALUE_TYPE to MEASUREMENT ITEM
df_melted['MEASUREMENT ITEM'] = (df_melted['MEASUREMENT ITEM'] + ' ' + df_melted['VALUE_TYPE'].str.extract(r'(\d)')[0].fillna(''))

# Pivot to wide format
df_pivot = df_melted.pivot_table(
    index=['PATIENT_ID', 'INTAKE_ID', 'DATE'],
    columns='MEASUREMENT ITEM',
    values='VALUE',
    aggfunc='first'
).reset_index()

# Handle integer-like columns with commas
cols_to_int = ['Body Mass Index 1', 'Tensie / Pols 1', 'Tensie / Pols 2']
for col in cols_to_int:
    df_pivot[col] = pd.to_numeric(df_pivot[col], errors='coerce').astype('Int64')  # Convert to nullable int

# Handle float columns (only need one conversion per column)
cols_to_float = ['Body Mass Index 2', 'Temperatuur (c) 1']
for col in cols_to_float:
    df_pivot[col] = df_pivot[col].astype(str).str.replace(',', '.', regex=False)
    df_pivot[col] = pd.to_numeric(df_pivot[col], errors='coerce').astype(float)

# Calculate Height and BMI
df_pivot['Height (m)'] = df_pivot['Body Mass Index 1'] / 100
df_pivot['BMI'] = df_pivot['Body Mass Index 2'] / (df_pivot['Height (m)'] ** 2)

# Drop Body Mass Index 1 (height in cm)
df_pivot.drop(columns=['Body Mass Index 1'], inplace=True)

# Rename measurement columns
df_pivot.rename(columns={
    'Tensie / Pols 1': 'Systolic',
    'Tensie / Pols 2': 'Diastolic',
    'Temperatuur (c) 1': 'Temperature (C)',
    'Body Mass Index 2': 'Weight (kg)'
}, inplace=True)

# Reorder columns
df_pivot = df_pivot[[
    'PATIENT_ID', 'INTAKE_ID', 'DATE',
    'Weight (kg)', 'Height (m)', 'BMI',
    'Systolic', 'Diastolic', 'Temperature (C)'
]]


# Recalculate SEQUENCE
df_pivot.sort_values(by=['PATIENT_ID', 'INTAKE_ID', 'DATE'], inplace=True)
df_pivot['SEQUENCE'] = df_pivot.groupby(['PATIENT_ID', 'INTAKE_ID']).cumcount() + 1

# Aggregate: keep only first values per patient-intake-date
df_final = df_pivot.groupby(['PATIENT_ID', 'INTAKE_ID', 'DATE'], as_index=False).agg({
    'Weight (kg)': 'first',
    'Height (m)': 'first',
    'BMI': 'first',
    'Temperature (C)': 'first',
    'Systolic': 'first',
    'Diastolic': 'first'
})

# Add SEQUENCE again after aggregation
df_final['SEQUENCE'] = df_final.groupby(['PATIENT_ID', 'INTAKE_ID']).cumcount() + 1
df_final = df_final.sort_values(by=['PATIENT_ID', 'INTAKE_ID', 'DATE'])

df_final.columns.name = None

desired_order = [
    'PATIENT_ID', 'INTAKE_ID', 'DATE', 'SEQUENCE',
    'Height (m)', 'Weight (kg)', 'BMI',
    'Systolic', 'Diastolic', 'Temperature (C)'
]

df_final = df_final[desired_order]

In [405]:
df_final

Unnamed: 0,PATIENT_ID,INTAKE_ID,DATE,SEQUENCE,Height (m),Weight (kg),BMI,Systolic,Diastolic,Temperature (C)
0,56,794,1538438400,1,1.65,45.20,16.602388,116,74,36.4
1,56,794,1538524800,2,1.65,43.95,16.143251,103,76,35.6
2,56,794,1538611200,3,1.65,44.15,16.216713,107,91,36.2
3,56,794,1538697600,4,1.65,43.55,15.996327,103,70,35.9
4,56,794,1538784000,5,1.65,43.85,16.10652,112,79,36.1
...,...,...,...,...,...,...,...,...,...,...
1397,1593,1025,1598313600,78,1.71,53.40,18.262029,,,
1398,1593,1025,1598572800,79,1.71,53.70,18.364625,,,
1399,1593,1025,1598918400,80,1.71,53.80,18.398824,,,
1400,1593,1025,1599177600,81,1.71,54.10,18.501419,,,


### MERGING

In [406]:
# vitals = df_final
# demograph = df1
# lab = df_lab

# Ensure only patients in both datasets are kept
df_lab = df_lab[df_lab['PATIENT_ID'].isin(df1['PATIENT_ID'])].copy()

# Merge lab and age data
df_merged = df_lab.merge(df1, on=['PATIENT_ID', 'INTAKE_ID'], how='left')

# Rename overlapping columns to keep only relevant ones
df_merged.rename(columns={
    'DATE_x': 'DATE',
    'ggzob_id_x': 'ggzob_id',
}, inplace=True)

# Drop unnecessary duplicates
df_merged.drop(columns=['DATE_y', 'ggzob_id_y'], inplace=True)

# Reorder columns
df_merged = df_merged[[
    'PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'cid','ggzob_id', 'DATE', 'AGE', 'SEX',
    'ALT', 'AST', 'Phosphate', 'Glucose', 'Potassium', 'Leucocytes', 'Magnesium'
]]

# Aggregate by patient and date, taking the first valid entry per group
df_tog = df_merged.groupby(['PATIENT_ID', 'DATE'], as_index=False).agg({
    'INTAKE_ID': 'first',
    'SEQUENCE': 'first',
    'AGE': 'first',
    'SEX': 'first',
    'ALT': 'first',
    'AST': 'first',
    'Phosphate': 'first',
    'Glucose': 'first',
    'Potassium': 'first',
    'Leucocytes': 'first',
    'Magnesium': 'first'
})

df_tog

Unnamed: 0,PATIENT_ID,DATE,INTAKE_ID,SEQUENCE,AGE,SEX,ALT,AST,Phosphate,Glucose,Potassium,Leucocytes,Magnesium
0,56,-9223372037,794,1,52,1,51.0,11.0,1.04,3.9,4.3,2.3,0.90
1,56,1634688000,794,2,52,1,120.0,59.0,0.38,3.6,4.8,3.9,0.90
2,56,1634774400,794,3,52,1,133.0,63.0,0.88,4.0,4.4,3.2,0.86
3,56,1634860800,794,4,52,1,122.0,44.0,1.30,4.4,4.7,3.9,0.81
4,56,1635120000,794,5,52,1,123.0,41.0,1.15,3.9,4.5,3.2,0.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,1593,1582502400,1025,9,17,1,20.0,21.0,1.51,4.7,4.3,4.4,0.85
372,1593,1583107200,1025,10,17,1,24.0,19.0,1.45,4.7,4.2,4.8,0.82
373,1593,1583712000,1025,11,17,1,26.0,21.0,1.44,4.6,4.1,4.6,0.87
374,1593,1584316800,1025,12,17,1,25.0,20.0,1.60,,4.2,4.8,0.81


In [407]:
# Merge lab+age data with vitals on DATE, PATIENT_ID, INTAKE_ID
df_combined = pd.merge(
    df_tog, df_final,
    on=['DATE', 'PATIENT_ID', 'INTAKE_ID'],
    how='left',
    suffixes=('_final', '_merged')
)

# Rename and drop duplicate SEQUENCE columns
df_combined.rename(columns={'SEQUENCE_final': 'SEQUENCE'}, inplace=True)
df_combined.drop(columns=['SEQUENCE_merged'], inplace=True)


# Reorder columns
new_column_order = [
    'PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'DATE',
    'AGE', 'SEX', 'Weight (kg)', 'Height (m)', 'BMI',
    'Temperature (C)', 'Systolic', 'Diastolic',
    'ALT', 'AST', 'Phosphate', 'Glucose',
    'Potassium', 'Leucocytes', 'Magnesium'
]
df_combined = df_combined[new_column_order]

# Convert lab result columns to numeric
columns_to_convert = [
    'ALT', 'AST', 'Phosphate', 'Glucose',
    'Potassium', 'Leucocytes', 'Magnesium'
]
df_combined[columns_to_convert] = df_combined[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Fill in missing heights per patient
df_combined['Height (m)'] = df_combined.groupby('PATIENT_ID')['Height (m)'].transform(lambda x: x.ffill().bfill())

# Clean specific outlier or invalid values
df_combined['Temperature (C)'] = df_combined['Temperature (C)'].replace([45.5], float('nan'))
df_combined['Systolic'] = df_combined['Systolic'].replace(0, float('nan'))
df_combined['Diastolic'] = df_combined['Diastolic'].replace(0, float('nan'))

In [408]:
df_combined

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,AGE,SEX,Weight (kg),Height (m),BMI,Temperature (C),Systolic,Diastolic,ALT,AST,Phosphate,Glucose,Potassium,Leucocytes,Magnesium
0,56,794,1,-9223372037,52,1,,1.65,,,,,51.0,11.0,1.04,3.9,4.3,2.3,0.90
1,56,794,2,1634688000,52,1,39.2,1.65,14.398531,37.2,108,69,120.0,59.0,0.38,3.6,4.8,3.9,0.90
2,56,794,3,1634774400,52,1,38.7,1.65,14.214876,36.6,103,73,133.0,63.0,0.88,4.0,4.4,3.2,0.86
3,56,794,4,1634860800,52,1,39.0,1.65,14.325069,35.0,108,71,122.0,44.0,1.30,4.4,4.7,3.9,0.81
4,56,794,5,1635120000,52,1,39.0,1.65,14.325069,34.7,118,80,123.0,41.0,1.15,3.9,4.5,3.2,0.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,1593,1025,9,1582502400,17,1,40.9,1.71,13.98721,,,,20.0,21.0,1.51,4.7,4.3,4.4,0.85
372,1593,1025,10,1583107200,17,1,41.4,1.71,14.158203,,,,24.0,19.0,1.45,4.7,4.2,4.8,0.82
373,1593,1025,11,1583712000,17,1,42.1,1.71,14.397592,,,,26.0,21.0,1.44,4.6,4.1,4.6,0.87
374,1593,1025,12,1584316800,17,1,42.7,1.71,14.602784,37.4,,,25.0,20.0,1.60,,4.2,4.8,0.81


In [None]:
# Still to do:

# add the labels for RFS (use the rule system from Mladena)
# put into large preprocessing file
# save as CSV

### Aspen Criteria

# A decrease in any 1, 2, or 3 of serum phosphorus, potassium, and/or magnesium levels by
# 10%~20% (mild RS)
# 20%~30% (moderate RS)
# or >30% and/or organ dysfunction resulting from a decrease in any of these and/or due to thiamin deficiency (severe RS)


In [24]:
df = pd.read_csv('BEP_imputed_TEST.csv')

In [28]:
# Define electrolytes to monitor
electrolytes = ['Phosphate', 'Potassium', 'Magnesium']

# Create placeholder columns for drop %
for col in electrolytes:
    df[f'{col}_DROP_%'] = None

# Group by patient-intake
rfs_labeled_groups = []

for _, group in df.groupby(['PATIENT_ID', 'INTAKE_ID']):
    group = group.sort_values('SEQUENCE')
    baseline = group.iloc[0]

    # Calculate % drop from baseline
    for col in electrolytes:
        base_value = baseline[col]
        group[f'{col}_DROP_%'] = (base_value - group[col]) / base_value * 100

    # Create RFS label if any electrolyte drop ≥ 10%
    group['RFS'] = ((group[[f'{col}_DROP_%' for col in electrolytes]] >= 10).any(axis=1)).astype(int)

    rfs_labeled_groups.append(group)

# Combine groups
df_final = pd.concat(rfs_labeled_groups).reset_index(drop=True)

# Drop the temporary drop % columns
drop_cols = [f'{col}_DROP_%' for col in electrolytes]
df_final.drop(columns=drop_cols, inplace=True)

# Move RFS column just after 'SEX'
cols = df_final.columns.tolist()
if 'RFS' in cols:
    cols.remove('RFS')
    insert_at = cols.index('SEX') + 1 if 'SEX' in cols else len(cols)
    cols.insert(insert_at, 'RFS')
    df_final = df_final[cols]

df_final

Unnamed: 0,PATIENT_ID,INTAKE_ID,SEQUENCE,DATE,AGE,SEX,RFS,Weight (kg),Height (m),BMI,Temperature (C),Systolic,Diastolic,ALT,AST,Phosphate,Glucose,Potassium,Leucocytes,Magnesium
0,56,794,1,-9223372037,52.0,1,0,34.2,1.65,12.561983,35.2,104,54,173.0,11.0,1.04,3.9,4.3,2.2,0.81
1,56,794,2,1634688000,52.0,1,1,39.2,1.65,14.398531,37.2,108,69,120.0,59.0,0.38,3.6,4.8,3.9,0.90
2,56,794,3,1634774400,52.0,1,1,38.7,1.65,14.214876,36.6,103,73,133.0,63.0,0.88,4.0,4.4,3.2,0.86
3,56,794,4,1634860800,52.0,1,0,39.0,1.65,14.325069,35.0,108,71,122.0,44.0,1.30,4.4,4.7,3.9,0.81
4,56,794,5,1635120000,52.0,1,0,39.0,1.65,14.325069,34.7,118,80,123.0,41.0,1.15,3.9,4.5,3.2,0.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,1593,1025,9,1582502400,17.0,1,0,40.9,1.71,13.987210,36.3,94,79,20.0,21.0,1.51,4.7,4.3,4.4,0.85
372,1593,1025,10,1583107200,17.0,1,1,41.4,1.71,14.158203,36.5,119,85,24.0,19.0,1.45,4.7,4.2,4.8,0.82
373,1593,1025,11,1583712000,17.0,1,0,42.1,1.71,14.397592,36.3,116,71,26.0,21.0,1.44,4.6,4.1,4.6,0.87
374,1593,1025,12,1584316800,17.0,1,1,42.7,1.71,14.602784,37.4,120,77,25.0,20.0,1.60,4.3,4.2,4.8,0.81


In [29]:
# For each PATIENT_ID, check if at least one RFS = 1 exists
patients_with_rfs = df_final.groupby('PATIENT_ID')['RFS'].any()

# Get the list of patients where RFS was not detected (should be empty if labeling worked)
patients_missing_rfs = patients_with_rfs[~patients_with_rfs].index.tolist()

# Print results
if patients_missing_rfs:
    print("⚠️ Labeling failed for the following patients (no RFS detected):")
    print(patients_missing_rfs)
else:
    print("✅ Labeling is correct: All patients have at least one RFS instance.")

rfs_sequences_all = (
    df_final[df_final['RFS'] == 1]
    .groupby('PATIENT_ID')['SEQUENCE']
    .apply(list)
)

print("\n✅ Labeling is correct: The following patients have at least one RFS detection.")
print("See patient numbers and accompanying sequence below (e.g. 2 is second timepoint in admission data")
print(rfs_sequences_all)

⚠️ Labeling failed for the following patients (no RFS detected):
[1317]

✅ Labeling is correct: The following patients have at least one RFS detection.
See patient numbers and accompanying sequence below (e.g. 2 is second timepoint in admission data
PATIENT_ID
56        [2, 3, 27, 36, 38, 47, 52, 59, 60, 61, 97, 124]
325     [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ...
530                           [2, 3, 4, 5, 9, 12, 13, 14]
588           [8, 10, 11, 12, 17, 20, 22, 24, 25, 26, 29]
893                                                   [4]
1048    [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
1171        [3, 6, 9, 10, 11, 12, 13, 14, 15, 17, 19, 21]
1231    [3, 4, 5, 6, 8, 11, 15, 30, 33, 34, 40, 41, 43...
1576    [2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...
1593                                             [10, 12]
Name: SEQUENCE, dtype: object


In [1]:
import pandas as pd

# Load your DataFrame
df = pd.read_csv("BEP_imputed_TEST.csv")  # or your train file

df.columns

Index(['PATIENT_ID', 'INTAKE_ID', 'SEQUENCE', 'DATE', 'AGE', 'SEX', 'RFS',
       'Weight (kg)', 'Height (m)', 'BMI', 'Temperature (C)', 'Systolic',
       'Diastolic', 'ALT', 'AST', 'Phosphate', 'Glucose', 'Potassium',
       'Leucocytes', 'Magnesium'],
      dtype='object')

In [None]:
['Weight (kg)', 'BMI', 'Temperature (C)', 'Systolic','Diastolic', 'ALT', 'AST', 'Phosphate', 'Glucose', 'Potassium', 'Leucocytes', 'Magnesium']