In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create diagnosis dates matrix

In [None]:
# Main CSV export; every `dd.read_csv(full_data_path, ...)` cell below selects its columns from this file.
full_data_path = "./sepsis_research/data/ukb675429.csv"

In [42]:
# Input participant table - choosing eids - LPs individuals (~250K)
lps_clinical_table_age = pd.read_csv(
    "./MLHC/Data/raw_data/Participant_table/Participant_table all metabolmics data available.csv"
)
lps_clinical_table_sex = pd.read_csv(
    "./MLHC/Data/raw_data/Participant_table/Participant_table all metabolmics data available with gender.csv"
)

# The age column is removed from the first table before the merge.
# NOTE(review): presumably the second table carries the same column, which would
# otherwise collide on merge — confirm.
lps_clinical_table_age = lps_clinical_table_age.drop(
    columns=["Age when attended assessment centre | Instance 0"]
)

# One row per participant id present in both tables (default inner merge).
full_clinical_table = pd.merge(lps_clinical_table_age, lps_clinical_table_sex, on="Participant ID")

In [22]:
# Input icd10 list
# Later cells read its 'category', 'date_key_number' and 'coding' columns.
icd10_info = pd.read_csv("./MLHC/Data/raw_data/icd10_tree_information.csv")

In [23]:
# Split the ICD-10 table by category: "Cancer" rows versus everything else.
is_cancer = icd10_info['category'] == "Cancer"

# Non-cancer rows are kept only when they have a date_key_number; the key is
# normalised to an integer-looking string so it can be compared with column names.
icd10_info_non_cancer = icd10_info.loc[~is_cancer].dropna(subset=['date_key_number']).copy()
icd10_info_non_cancer['date_key_number'] = (
    icd10_info_non_cancer['date_key_number'].astype(int).astype(str)
)
non_canacer_col_keys = icd10_info_non_cancer['date_key_number'].tolist()
print("Number of non cancer icd10:", len(icd10_info_non_cancer))

# Cancer rows are identified by their 'coding' value instead of a date key.
icd10_info_cancer = icd10_info.loc[is_cancer].copy()
print("Number of cancer icd10:", len(icd10_info_cancer))
cancer_codes = icd10_info_cancer["coding"].to_list()

Number of non cancer icd10: 1130
Number of cancer icd10: 90


Get cancer diagnosis dates

In [24]:
def find_code(row, wanted_codes):
    """Return the position of the first entry of `row` that is in `wanted_codes`.

    Parameters:
        row: a pandas Series to search.
        wanted_codes: list-like of values to look for.

    Returns:
        The zero-based position of the first matching entry, or 0 when nothing
        matches (so position 0 doubles as the "not found" value).
    """
    hit_positions = np.flatnonzero(row.isin(wanted_codes).to_numpy())
    return hit_positions[0] if hit_positions.size else 0

In [25]:
# This first read is used only for its column names.
data = dd.read_csv(full_data_path)

# Take only cancer columns
cancer_prefixes = ('40006', '40005')
wanted_columns_mask = [field == 'eid' or field.startswith(cancer_prefixes) for field in data.columns]

wanted_columns = data.columns[wanted_columns_mask]

# Every selected column except eid is read as a string.
d = dict.fromkeys((field for field in wanted_columns if field != 'eid'), "str")

data = dd.read_csv(full_data_path, usecols=list(wanted_columns), dtype=d)

# Take only chosen eids
chosen_eids = full_clinical_table["Participant ID"]
data_filtered_cancer = data[data["eid"].isin(chosen_eids)]

# Materialise the dask frame as a pandas DataFrame.
data_filtered_cancer = data_filtered_cancer.compute(num_workers=80)

In [None]:
# Match cancer diagnosis date to the diagnosis code
# Drop participants that have no value in any column after the first one.
data_filtered_cancer = data_filtered_cancer.dropna(how='all', subset=data_filtered_cancer.columns[1:]).reset_index(drop=True)

# Code frame: eid plus every column whose name starts with '40006'.
chosen_cols_mask_cancer = [(field.startswith('40006') or field == 'eid') for field in data_filtered_cancer.columns]
chosen_cols_cancer = data_filtered_cancer.columns[chosen_cols_mask_cancer]

cancer_df = data_filtered_cancer[chosen_cols_cancer].copy().reset_index(drop=True)
# Keep only the first three characters of every string value; non-strings (e.g. missing
# values) pass through unchanged. Series.map is applied per column because
# DataFrame.applymap is deprecated in recent pandas releases.
cancer_df.iloc[:, 1:] = cancer_df.iloc[:, 1:].apply(
    lambda column: column.map(lambda x: x[:3] if isinstance(x, str) else x)
)
print(cancer_df.head())
print(cancer_df.shape)

# Date frame: eid plus every column whose name starts with '40005'.
chosen_cols_mask_cancer_date = [(field.startswith('40005') or field == 'eid') for field in data_filtered_cancer.columns]
chosen_cols_cancer_date = data_filtered_cancer.columns[chosen_cols_mask_cancer_date]

cancer_date_df = data_filtered_cancer[chosen_cols_cancer_date].copy().reset_index(drop=True)
print(cancer_date_df.head())
print(cancer_date_df.shape)

In [None]:
# Build one column per cancer code holding, for each participant, the value from
# cancer_date_df at the position where that code first appears in cancer_df
# (NaN when the code does not appear in the row).
# NOTE(review): positions found in cancer_df are used to index cancer_date_df, so both
# frames must list eid first and then their code/date columns in the same order — confirm.

# Loop-invariant pieces are computed once instead of once per code.
date_values = cancer_date_df.to_numpy()
row_positions = np.arange(cancer_df.shape[0])

date_by_code = {}
for code in cancer_codes:
    # Vectorised equivalent of testing each row with isin([code]).
    code_hits = cancer_df.isin([code]).to_numpy()
    has_code = code_hits.any(axis=1)
    # argmax gives the first True per row; rows without a hit are masked out below.
    first_hit = code_hits.argmax(axis=1)
    date_by_code[f'{code}'] = np.where(has_code, date_values[row_positions, first_hit], np.nan)

# Attach all code columns at once rather than inserting them one by one.
patient_to_cancer_df = pd.concat(
    [cancer_df[["eid"]], pd.DataFrame(date_by_code, index=cancer_df.index)],
    axis=1,
)

# Keep only participants with at least one non-null cancer date.
has_any_date = patient_to_cancer_df.iloc[:, 1:].notna().any(axis=1)
patient_to_cancer_df = patient_to_cancer_df[has_any_date]
print(patient_to_cancer_df.shape)
patient_to_cancer_df.head(15)

Get non-cancer diagnosis dates

In [None]:
# This first read is used only for its column names.
data = dd.read_csv(full_data_path)

# Take non cancer first occurrences columns and death dates
# Columns are matched on their exact field id (the text before the first '-'), which is
# the same key the later renaming step uses. A bare prefix test could also select
# fields whose id merely begins with a wanted key and that the renaming cannot map.
non_cancer_field_ids = set(non_canacer_col_keys)
wanted_columns_mask = [
    field == 'eid'
    or field.startswith(('40000-', '3140-'))
    or field.split('-')[0] in non_cancer_field_ids
    for field in data.columns
]

wanted_columns = data.columns[wanted_columns_mask]

# Every selected column except eid is read as a string.
d = {field: "str" for field in wanted_columns if field != 'eid'}

data = dd.read_csv(full_data_path, dtype=d, usecols=list(wanted_columns))

# Take only chosen eids
data_filtered_non_cancer = data[data["eid"].isin(full_clinical_table["Participant ID"])]

# Materialise the dask frame as a pandas DataFrame.
data_filtered_non_cancer = data_filtered_non_cancer.compute(num_workers=80)

In [34]:
# Lookup from a date-column key to the ICD-10 coding it belongs to.
col_map_non_cancer = dict(zip(icd10_info_non_cancer['date_key_number'], icd10_info_non_cancer['coding']))

# Column clean-up, applied in order:
#   1. drop the 3140 columns of instances 1-3;
#   2. give the two 40000 columns and 3140-0.0 readable names;
#   3. drop the second death-date column;
#   4. strip the '-<instance>' suffix from the remaining column names;
#   5. replace date keys by their ICD-10 coding.
data_filtered_non_cancer = (
    data_filtered_non_cancer
    .drop(columns=['3140-1.0', '3140-2.0', '3140-3.0'])
    .rename(columns={'40000-0.0': 'dod', '40000-1.0': 'death_date_1', '3140-0.0': "Pregnant | Instance 0"})
    .drop(columns=['death_date_1'])
    .rename(columns=lambda col: col.split('-')[0])
    .rename(columns=col_map_non_cancer)
)

In [43]:
# Attach the participant-table columns to the diagnosis table.
clinical_data_non_cancer = (
    data_filtered_non_cancer
    .merge(full_clinical_table, left_on="eid", right_on="Participant ID", how='inner')
    # Drop duplicate id column
    .drop(columns="Participant ID")
)

# df2_cols = ["Age when attended assessment centre | Instance 0", "Sex", "Date of attending assessment centre | Instance 0", "Treatment/medication code | Instance 0", "Pregnant | Instance 0"]
df2_cols = ["Age when attended assessment centre | Instance 0", "Sex", "Date of attending assessment centre | Instance 0"]
df1_cols = [col for col in data_filtered_non_cancer.columns if col != "eid"]

# Final column order: eid, participant-table columns, then the diagnosis-table columns;
# the attendance date is shortened to 'doa'.
clinical_data_non_cancer = (
    clinical_data_non_cancer[["eid", *df2_cols, *df1_cols]]
    .rename(columns={'Date of attending assessment centre | Instance 0': 'doa'})
)

In [44]:
# Merge cancer dates and all other icd10 codes
# Left merge: every row of clinical_data_non_cancer is kept; participants absent from
# patient_to_cancer_df get missing values in the cancer columns.
full_clinical_data = clinical_data_non_cancer.merge(patient_to_cancer_df, on="eid", how="left")

In [None]:
# Calculate estimated dob
# Attendance date minus the recorded age, with the age converted to days at 365.25
# days per year; the result is truncated to midnight.
attendance_date = pd.to_datetime(full_clinical_data["doa"])
age_at_attendance = pd.to_timedelta(
    full_clinical_data["Age when attended assessment centre | Instance 0"] * 365.25, unit="D"
)
dob_tmp = (attendance_date - age_at_attendance).dt.normalize()

# Placed at column position 4.
full_clinical_data.insert(4, "dob", dob_tmp)

In [46]:
# Persist the combined table; the "Filter individuals and split to groups" section reloads this file.
full_clinical_data.to_csv("./MLHC/Data/raw_data/patient_to_all_icd10_dates_lps_individuals.csv", index=False)

# Create anonymization maps

In [24]:
# Only the eid column is needed here, so it is requested directly. The previous
# header scan selected nothing but 'eid', and its dtype dictionary was always empty.
data = dd.read_csv(full_data_path, usecols=['eid'])

# Materialise the dask frame as a pandas DataFrame holding every eid in the export.
full_data_eids = data.compute(num_workers=80)

In [26]:
# Shuffle all rows with a fixed random_state, then renumber the index from 0.
full_data_eids_shuffled = full_data_eids.sample(frac=1, random_state=42).reset_index(drop=True)

In [34]:
# Map ukb eid to anonymous eid
# The unique eids are computed once so that every column of the map has the same length.
unique_eids = full_data_eids_shuffled['eid'].drop_duplicates()
anonymous_eid = range(1, len(unique_eids) + 1)

# Map ukb eid to random number of days to shift all dates by
# A seeded generator makes the map reproducible across reruns; previously the shifts
# came from the unseeded global numpy state and were sized by the number of rows
# rather than the number of unique eids. The range stays [-3000, 3000).
rng = np.random.default_rng(seed=42)
date_shift = rng.integers(-3000, 3000, size=len(unique_eids))

eid_anonymization = pd.DataFrame({'eid': unique_eids.copy(), 'anonymous_eid': anonymous_eid, 'date_shift_days': date_shift})

eid_anonymization.to_csv("./MLHC/Data/preprocessed_data/eid_anonymization_map.csv", index=False)

In [None]:
# Map icd10 code to anonymous number
# NOTE(review): this reads the "olink" table, while the cells above write the "lps"
# table — confirm this is the intended source for the code list.
full_data = pd.read_csv("./MLHC/Data/raw_data/patient_to_all_icd10_dates_olink_individuals.csv")

# Every column from position 8 onward is treated as an ICD-10 code column.
icd10_codes = list(full_data.columns[8:])
codes_anony = [position + 1 for position in range(len(icd10_codes))]

# Shuffle the numbers 1..N in place with a fixed seed.
rng = np.random.default_rng(seed=42)
rng.shuffle(codes_anony)

icd10_anonymization = pd.DataFrame({"original_code": icd10_codes, "anon_code": codes_anony})

icd10_anonymization.to_csv("./MLHC/Data/preprocessed_data/icd10_anonymization_map.csv", index=False)

In [42]:
# Map blood sample name to anonymous number
icd10_anonymization = pd.read_csv("./MLHC/Data/preprocessed_data/icd10_anonymization_map.csv")
blood_tests_info = pd.read_csv("./MLHC/Data/raw_data/blood_tests_information.csv")

# Blood codes start right after the number of rows in the ICD-10 map, one per blood test.
first_blood_code = icd10_anonymization.shape[0] + 1
blood_codes_anonymous = list(range(first_blood_code, first_blood_code + blood_tests_info.shape[0]))

# Shuffle the codes in place with a fixed seed.
rng = np.random.default_rng(seed=42)
rng.shuffle(blood_codes_anonymous)

blood_anonymization = pd.DataFrame(
    {"original_name": blood_tests_info["title"], "anon_code": blood_codes_anonymous}
)

blood_anonymization.to_csv("./MLHC/Data/preprocessed_data/blood_anonymization_map.csv", index=False)

# Filter individuals and split to groups

In [None]:
# Reload the combined diagnosis-date table saved by the "Create diagnosis dates matrix" section.
raw_df = pd.read_csv("./MLHC/Data/raw_data/patient_to_all_icd10_dates_lps_individuals.csv")

In [49]:
# filter out individuals with abnormal diagnosis dates
# A row is removed when any of its cells equals one of these date strings.
special_codes = ["1901-01-01", "1902-02-02", "1903-03-03", "2037-07-07"]

has_special_date = raw_df.isin(special_codes).any(axis=1)
filtered_df = raw_df.loc[~has_special_date].copy()

# filter out individuals with less than 5 diagnosis dates
# Columns from position 7 onward are counted as diagnosis-date columns.
icd10_cols = list(raw_df.columns[7:])
mask = filtered_df[icd10_cols].count(axis=1) >= 5
filtered_df = filtered_df.loc[mask].copy()


In [50]:
# Split eid to train, validation, test
# 20% of the rows are held out first ...
train_df, temp_df = train_test_split(filtered_df, test_size=0.2, random_state=42)

# ... and the held-out part is then halved into validation and test.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [51]:
# Label every eid with the name of the split it ended up in.
eid_to_split = {}
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    eid_to_split.update(dict.fromkeys(split_df["eid"], split_name))

filtered_df["split_group"] = filtered_df["eid"].map(eid_to_split)

# Move split_group to column position 1.
cols = [col for col in filtered_df.columns if col != "split_group"]
cols.insert(1, "split_group")
filtered_df = filtered_df[cols]

In [52]:
# Persist the filtered table including its split_group column.
filtered_df.to_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_lps_individuals_filtered.csv", index=False)

# Anonymize data

In [None]:
# Reload the filtered table together with the eid and ICD-10 anonymization maps.
raw_df = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_lps_individuals_filtered.csv")
eid_anonymization_map = pd.read_csv("./MLHC/Data/preprocessed_data/eid_anonymization_map.csv")
icd10_anonymization_map = pd.read_csv("./MLHC/Data/preprocessed_data/icd10_anonymization_map.csv")

In [81]:
# Filter wanted columns
# Columns from position 8 onward are taken as the ICD-10 columns (positional, so this
# depends on the column layout of raw_df).
icd10_cols = raw_df.columns[8:].tolist()
# Keep the id, split label, sex and the three date columns, followed by the ICD-10 columns.
anonymous_df = raw_df[["eid", "split_group", "Sex", "dob", "doa", "dod"] + icd10_cols].copy()

In [None]:
# The result is not assigned: this cell only displays the parsed "doa" column as its output.
anonymous_df["doa"].apply(pd.to_datetime, errors="coerce")

In [None]:
# Date shift for the following: dob, dod, doa and diagnosis
shift_lookup = eid_anonymization_map[["eid", "date_shift_days"]]
anonymous_df = pd.merge(anonymous_df, shift_lookup, on="eid", how="left")

# Move date_shift_days to column position 1.
cols = [col for col in anonymous_df.columns if col != "date_shift_days"]
cols.insert(1, "date_shift_days")
anonymous_df = anonymous_df[cols]

# Parse the three named date columns value by value, and every column from position 7
# onward column by column; values that cannot be parsed become NaT.
for date_col in ("doa", "dob", "dod"):
    anonymous_df[date_col] = anonymous_df[date_col].apply(pd.to_datetime, errors="coerce")
remaining_date_cols = anonymous_df.columns[7:]
anonymous_df[remaining_date_cols] = anonymous_df[remaining_date_cols].apply(pd.to_datetime, errors="coerce")

# Every column from position 4 onward is shifted by the row's own number of days.
cols_to_shift = anonymous_df.columns[4:]
shift_amount = pd.to_timedelta(anonymous_df["date_shift_days"], unit="D")
anonymous_df[cols_to_shift] = anonymous_df[cols_to_shift].add(shift_amount, axis=0)

# The shift itself is not kept in the output table.
anonymous_df = anonymous_df.drop(columns=["date_shift_days"])

In [83]:
# Lookups: original eid -> anonymous eid, original ICD-10 code -> anonymous code.
eid_to_aeid = dict(zip(eid_anonymization_map['eid'], eid_anonymization_map['anonymous_eid']))
icd10_to_aicd10 = dict(zip(icd10_anonymization_map['original_code'], icd10_anonymization_map['anon_code']))

# eid is moved into the index so that a single rename call can translate row labels
# (eids) and column labels (codes) together; labels without a mapping are left as they are.
anonymous_df = (
    anonymous_df
    .set_index('eid')
    .rename(index=eid_to_aeid, columns=icd10_to_aicd10)
    .reset_index()
)

In [84]:
# Persist the anonymized table (renamed eids and codes, shifted dates).
anonymous_df.to_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_lps_individuals_anonymous.csv", index=False)

# Create JSON data file

In [None]:
# Reload the anonymized table; no date parsing is requested here.
data_df = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_lps_individuals_anonymous.csv")

In [86]:
# Names of the six non-diagnosis columns (not referenced by the cells shown below).
fixed_cols = ["eid", "Sex", "split_group", "dob", "doa", "dod"]
# Every column from position 6 onward is treated as an ICD-10 column.
icd10_cols = data_df.columns[6:]

In [89]:
# Get only females
# Keeps rows whose "Sex" value is exactly the string "Female".
data_df = data_df[data_df["Sex"] == "Female"]

In [90]:
# Build the JSON structure
def _as_date_text(value):
    """Render `value` as text, using its .date() when it has one."""
    return str(value.date()) if hasattr(value, "date") else str(value)


def _text_or_none(value):
    """Render `value` as text, or None when it is missing."""
    return str(value) if pd.notna(value) else None


# One entry per row, keyed by the eid rendered as a string.
json_data = {}

for _, row in data_df.iterrows():
    # Select only non-null ICD10 columns for this row
    diagnoses = row[icd10_cols].dropna()

    json_data[str(row["eid"])] = {
        "birth_date": _text_or_none(row["dob"]),
        "death_date": _text_or_none(row["dod"]),
        "attendance_date": _text_or_none(row["doa"]),
        # One event per diagnosis: the column name is the code, the cell is the date.
        "events": [
            {"diagdate": _as_date_text(v), "codes": code, "type": "ICD10"}
            for code, v in diagnoses.items()
        ],
        "split_group": row["split_group"],
    }


In [91]:
# Write the per-participant records as indented JSON.
with open("./MLHC/Data/preprocessed_data/LPs_ind/lps_females_anonymous.json", "w") as f:
    json.dump(json_data, f, indent=2)

In [213]:
# Read the JSON file back; note that this rebinds the name `data`, which earlier cells use for dask frames.
with open("./MLHC/Data/preprocessed_data/LPs_ind/lps_females_anonymous.json", "r") as f:
    data = json.load(f)

# Add blood samples

Create base matrix

In [None]:
# Reload the filtered (not anonymized) diagnosis table and the blood-test metadata.
disease_table_filtered = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_lps_individuals_filtered.csv")
blood_tests_info = pd.read_csv("./MLHC/Data/raw_data/blood_tests_information.csv")

In [12]:
# Blood-test field ids as strings, so they can be compared with column names of the main export.
blood_test_col_keys = [str(i) for i in blood_tests_info['field_id'].tolist()]

In [17]:
# This first read is used only for its column names.
data = dd.read_csv(full_data_path)

# Select eid plus, for every blood-test field id, the columns starting with '<id>-0.0'.
instance0_names = tuple(prefix + '-0.0' for prefix in blood_test_col_keys)
wanted_columns_mask = [field == 'eid' or field.startswith(instance0_names) for field in data.columns]

wanted_columns = data.columns[wanted_columns_mask]

# Every selected column except eid is read as a string.
d = dict.fromkeys((field for field in wanted_columns if field != 'eid'), "str")

data = dd.read_csv(full_data_path, usecols=list(wanted_columns), dtype=d)

# Take only chosen eids
kept_eids = disease_table_filtered["eid"]
data_filtered = data[data["eid"].isin(kept_eids)]

# Materialise the dask frame as a pandas DataFrame.
data_filtered = data_filtered.compute(num_workers=80)

In [19]:
# Lookup from a blood-test field id to its title.
col_map_blood_test = dict(zip(blood_test_col_keys, blood_tests_info['title']))

# Strip the '-<instance>' suffix from every column name, then replace field ids by titles.
data_filtered = (
    data_filtered
    .rename(columns=lambda col: col.split('-')[0])
    .rename(columns=col_map_blood_test)
)

In [20]:
# Default (inner) merge: only eids present in both tables are kept; blood columns are appended on the right.
merged_table = disease_table_filtered.merge(data_filtered, on="eid")

In [23]:
# Persist the diagnosis + blood table before the additional filters are applied.
merged_table.to_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_and_blood_lps_individuals_filtered.csv", index=False)

Additional filters on individuals

In [None]:
# Reload the diagnosis + blood table from disk (column dtypes are re-inferred by read_csv).
merged_table = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_and_blood_lps_individuals_filtered.csv")

In [91]:
# filter out pregnant individuals during the blood tests (removes 150 patients)
# A row is kept when its pregnancy value is neither 1 nor 2; missing values are kept.
pregnancy_status = merged_table["Pregnant | Instance 0"]
merged_table = merged_table[(pregnancy_status != 1) & (pregnancy_status != 2)]

In [52]:
# Written under with_blood/ because that is where the "Feature filtration" section reads
# this file from; the previous target (LPs_ind/ directly) did not match that read path.
merged_table.to_csv("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/patient_to_all_icd10_dates_and_blood_lps_individuals_farther_filtered.csv", index=False)

Feature filtration (missingness) and standard scaling

In [None]:
# Load the further-filtered diagnosis + blood table from the with_blood directory.
merged_table = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/patient_to_all_icd10_dates_and_blood_lps_individuals_farther_filtered.csv")


In [177]:
# Report the share of missing values for each of the last 30 columns, highest first.
# The columns above 80% are removed in the next cell.
blood_missing_percent = (
    merged_table.iloc[:, -30:]
    .isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

print("Percentage of missing values per feature:\n")
for col, pct in blood_missing_percent.items():
    print(f"{col:25} {pct:6.2f}%")

Percentage of missing values per feature:

Rheumatoid factor          91.26%
Oestradiol                 85.39%
Lipoprotein A              23.88%
Direct bilirubin           19.19%
Testosterone               14.07%
SHBG                       13.21%
Apolipoprotein A           12.90%
Phosphate                  12.54%
Total protein              12.50%
Glucose                    12.47%
Calcium                    12.42%
HDL cholesterol            12.41%
Albumin                    12.38%
Vitamin D                   8.65%
IGF-1                       4.92%
Apolipoprotein B            4.88%
Total bilirubin             4.79%
Glycated haemoglobin (HbA1c)   4.76%
Aspartate aminotransferase   4.74%
C-reactive protein          4.60%
LDL direct                  4.58%
Urate                       4.52%
Triglycerides               4.47%
Urea                        4.47%
Creatinine                  4.44%
Gamma glutamyltransferase   4.43%
Alanine aminotransferase    4.42%
Cystatin C                  4.40%
C

In [178]:
# Remove the two blood metrics reported above with more than 80% missing values.
merged_table = merged_table.drop(columns=["Rheumatoid factor", "Oestradiol"])

In [186]:
# Standardization
# The last 28 columns are taken as the blood-test columns.
blood_columns = list(merged_table.columns[-28:])

split_groups = merged_table["split_group"]
scaled_train = merged_table.loc[split_groups == "train"].copy()
scaled_val = merged_table.loc[split_groups == "val"].copy()
scaled_test = merged_table.loc[split_groups == "test"].copy()

# The scaler is fitted on the training rows only and then applied to all three splits.
scaler = StandardScaler().fit(scaled_train[blood_columns])

for split_frame in (scaled_train, scaled_val, scaled_test):
    split_frame[blood_columns] = scaler.transform(split_frame[blood_columns])

In [188]:
# Stack the three scaled splits back together and restore the original row order via the index.
merged_table_scaled = pd.concat([scaled_train, scaled_val, scaled_test], axis=0, sort=False).sort_index()


In [192]:
# Persist the scaled table; the "Anonymize data" section below reloads it.
merged_table_scaled.to_csv("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/patient_to_all_icd10_dates_and_blood_lps_individuals_filtered_scaled.csv", index=False)


Anonymize data

In [None]:
# Inputs for anonymizing the blood data: the scaled table (original eids), the already
# anonymized diagnosis table, and the three anonymization maps.
merged_table = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/patient_to_all_icd10_dates_and_blood_lps_individuals_filtered_scaled.csv")
anon_df = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/patient_to_all_icd10_dates_lps_individuals_anonymous.csv")
eid_anonymization_map = pd.read_csv("./MLHC/Data/preprocessed_data/eid_anonymization_map.csv")
icd10_anonymization_map = pd.read_csv("./MLHC/Data/preprocessed_data/icd10_anonymization_map.csv")
blood_anonymization_map = pd.read_csv("./MLHC/Data/preprocessed_data/blood_anonymization_map.csv")

In [196]:
# Appends the remaining columns of the eid map to the right of merged_table (inner merge on eid).
merged_table = merged_table.merge(eid_anonymization_map, on="eid")

In [197]:
# anon_df["eid"] is compared with the anonymous ids here: keep only rows that also appear in merged_table.
anon_df = anon_df[anon_df["eid"].isin(merged_table["anonymous_eid"])]

In [202]:
# The slice [-30:-1] takes the last 30 columns of merged_table except the very last one.
# NOTE(review): expected to be the 28 blood columns plus anonymous_eid, leaving out
# date_shift_days — confirm against the actual column order.
# anonymous_eid is used only as the join key and is dropped afterwards.
anon_df = anon_df.merge(merged_table[merged_table.columns[-30:-1]], right_on="anonymous_eid", left_on="eid", how="left").drop("anonymous_eid", axis=1)

In [207]:
# Replace blood-test names in the column labels by their anonymous codes.
col_map_blood_test = dict(zip(blood_anonymization_map["original_name"], blood_anonymization_map["anon_code"]))
anon_df.rename(columns=col_map_blood_test, inplace=True)

In [209]:
# Persist the anonymized table that now also carries the scaled blood columns.
anon_df.to_csv("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/patient_to_all_icd10_dates_and_blood_lps_individuals_anonymous_scaled.csv", index=False)

Create JSON

In [None]:
# Reload the anonymized diagnosis + blood table; no date parsing is requested here.
data_df = pd.read_csv("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/patient_to_all_icd10_dates_and_blood_lps_individuals_anonymous_scaled.csv")

In [237]:
# Names of the six non-event columns (not referenced by the cells shown below).
fixed_cols = ["eid", "Sex", "split_group", "dob", "doa", "dod"]
# Positional layout: 6 leading columns, then the ICD-10 columns, then the last 28 columns as blood tests.
icd10_cols = data_df.columns[6:-28]
blood_cols = data_df.columns[-28:]

In [238]:
# Get only females
# Keeps rows whose "Sex" value is exactly the string "Female".
data_df = data_df[data_df["Sex"] == "Female"]

In [239]:
# Build the JSON structure
# One entry per row, keyed by the eid rendered as a string.
json_data = {}

for _, row in data_df.iterrows():
    events = []

    # ICD10 events: one per non-null ICD10 column; the column name is the code and
    # the cell value is the diagnosis date.
    for code, v in row[icd10_cols].dropna().items():
        diag_date = str(v.date()) if hasattr(v, "date") else str(v)
        events.append({"diagdate": diag_date, "codes": code, "type": "ICD10"})

    # Blood events: one per non-null blood column, all dated with the row's "doa";
    # the cell value goes into "codes" and the column name into "type".
    doa_value = row["doa"]
    if hasattr(doa_value, "date"):
        curr_blood_date = str(doa_value.date())
    else:
        curr_blood_date = str(doa_value)

    for test_code, val in row[blood_cols].dropna().items():
        events.append({"diagdate": curr_blood_date, "codes": val, "type": str(test_code)})

    # Dates are written as text, or None when missing.
    record = {}
    record["birth_date"] = str(row["dob"]) if pd.notna(row["dob"]) else None
    record["death_date"] = str(row["dod"]) if pd.notna(row["dod"]) else None
    record["attendance_date"] = str(row["doa"]) if pd.notna(row["doa"]) else None
    record["events"] = events
    record["split_group"] = row["split_group"]

    json_data[str(row["eid"])] = record


In [240]:
# Write the per-participant records (diagnoses + blood events) as indented JSON.
with open("./MLHC/Data/preprocessed_data/LPs_ind/with_blood/lps_females_anonymous_w_blood.json", "w") as f:
    json.dump(json_data, f, indent=2)