In [1]:
import pandas as pd
import numpy as np
data_dir = "data/"
data_path = "/orcd/pool/003/dbertsim_shared/ukb/"

In [2]:
# Load the cleaned blood-protein table.
df = pd.read_csv(f"{data_path}blood_protein_cancers_clean.csv")

# Columns to discard: the medication field plus every column whose lower-cased
# name mentions an operation, a cancer, or a time to diagnosis.
drop_cols = ['Medication for cholesterol, blood pressure or diabetes']
for keyword in ("operation", "cancer", "time_to_diagnosis"):
    drop_cols += [col for col in df.columns if keyword in col.lower()]
df = df.drop(columns=drop_cols)

# Participant ids present in this table; later cells filter other tables with it.
eids = set(df['eid'])

  df = pd.read_csv(f"{data_path}blood_protein_cancers_clean.csv")


In [3]:
def rename_columns(df, field_dict):
    """Rename field-id columns to their titles, leaving ``eid`` untouched.

    Every column other than ``eid`` is expected to look like ``p<field_id>``,
    optionally followed by ``_<suffix>``. The field id is the text between the
    first ``p`` and the next underscore; it is converted to ``int`` and looked
    up in ``field_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose non-``eid`` columns are named after field ids.
    field_dict : Mapping[int, str]
        Field id -> title.

    Returns
    -------
    pandas.DataFrame
        A renamed copy of ``df``; the input frame is not modified. Columns that
        share a field id (different suffixes) all receive the same title.

    Raises
    ------
    IndexError
        If a column name contains no ``p``.
    ValueError
        If the extracted field id is not an integer.
    KeyError
        If the field id is missing from ``field_dict``.
    """
    new_names = {}
    for column in df.columns:
        if column == "eid":
            continue
        # "p53_i0" -> "53": drop the leading "p" and any "_<suffix>" part.
        field_id = column.split("p")[1].split("_")[0]
        new_names[column] = field_dict[int(field_id)]

    # One rename for the whole frame instead of one full-frame copy per column.
    return df.rename(columns=new_names)


# Field metadata (ids and titles) and the category table.
field = pd.read_csv(f"{data_dir}field.tsv", sep="\t")
category = pd.read_csv(f"{data_dir}category.tsv", sep="\t")

# Lookup from field id to field title.
field_dict = dict(zip(field["field_id"], field["title"]))

# Participant time-stamp table, with field-id columns renamed to their titles.
time_stamps_raw = pd.read_csv(f"{data_dir}time_stamps_participant.csv")
df_lab_time = rename_columns(time_stamps_raw, field_dict)

# Mapping ICD10 codes to cancers

In [4]:
# The cancer type/date columns are spread over two participant-level files.
df_cancer1 = pd.read_csv(f"{data_dir}cancer_type_and_date_participant.csv")
df_cancer2 = pd.read_csv(f"{data_dir}cancer_type_and_date_2_participant.csv")

# Join on the participant id rather than gluing the files together by row
# position: a positional concat silently pairs the wrong participants whenever
# the two files are not in exactly the same row order. `validate` makes a
# duplicated eid fail loudly instead of multiplying rows.
df_cancer = df_cancer1.merge(
    df_cancer2,
    on="eid",
    how="left",
    validate="one_to_one",
)

  df_cancer2 = pd.read_csv(f"{data_dir}cancer_type_and_date_2_participant.csv")


In [5]:
# Swap the raw field ids embedded in the column names for readable titles.
df_cancer = df_cancer.rename(
    columns=lambda name: name
    .replace('p40005', 'Date of cancer diagnosis')
    .replace('p40006', 'Type of cancer: ICD10')
)

# group ICD
for slot in range(22):
    # Text before the first space is the ICD10 code, the rest is its name.
    code_and_name = df_cancer[f"Type of cancer: ICD10_i{slot}"].str.split(" ", n=1, expand=True)
    df_cancer[[f"diag_{slot}_icd10_code", f"diag_{slot}_icd10_name"]] = code_and_name

    # Part of the code before the first "." is the grouped code, the rest the decimal.
    group_and_decimal = df_cancer[f"diag_{slot}_icd10_code"].str.split(".", n=1, expand=True)
    df_cancer[[f"diag_{slot}_icd10_grouped", f"diag_{slot}_icd10_decimal"]] = group_and_decimal

# Keep only participants that are in the blood-protein table.
df_cancer = df_cancer[df_cancer['eid'].isin(eids)]

In [6]:
# -----------------------------
# 1) Long table for 22 diagnosis slots
# -----------------------------
N_SLOTS = 22
icd_cols = []
date_cols = []
for slot in range(N_SLOTS):
    icd_cols.append(f"diag_{slot}_icd10_grouped")
    date_cols.append(f"Date of cancer diagnosis_i{slot}")

# Row-major flattening keeps the N_SLOTS values of one participant adjacent,
# which lines up with each eid being repeated N_SLOTS times.
slot_eids = np.repeat(df_cancer["eid"].to_numpy(), N_SLOTS)
slot_icd = df_cancer[icd_cols].to_numpy().reshape(-1)
slot_dates = df_cancer[date_cols].to_numpy().reshape(-1)

long = pd.DataFrame({"eid": slot_eids, "icd10": slot_icd, "dx_date": slot_dates})

# Unparseable or missing dates become NaT.
long["dx_date"] = pd.to_datetime(long["dx_date"], errors="coerce")

# -----------------------------
# 2) ICD -> cancer map
# -----------------------------
CANCER_ICD_MAP = {
    "lung": {"C33", "C34"},
    "colorectal": {"C18", "C19", "C20"},
    "stomach": {"C16"}
}

# Invert the map: grouped ICD code -> cancer type.
icd_to_type = {}
for cancer, icds in CANCER_ICD_MAP.items():
    icd_to_type.update(dict.fromkeys(icds, cancer))

# Codes outside the map get NaN as their cancer type.
long["cancer_type"] = long["icd10"].map(icd_to_type)

# Keep only slots that carry both a mapped cancer type and a valid date.
has_type_and_date = long["cancer_type"].notna() & long["dx_date"].notna()
long_cancer = long.loc[has_type_and_date].copy()

# -----------------------------
# 3) Earliest diagnosis per (eid, cancer_type)
# -----------------------------
first_dx = (
    long_cancer
    .groupby(["eid", "cancer_type"])["dx_date"]
    .min()
    .rename("diagnosis_date")
    .reset_index()
)

# -----------------------------
# 4) Time-to-dx + baseline-present flag
# -----------------------------
assessment_dates = df_lab_time[['eid','Date of attending assessment centre']]
first_dx = first_dx.merge(assessment_dates, how="left", on="eid")

first_dx["assessment_date"] = pd.to_datetime(
    first_dx["Date of attending assessment centre"], errors="coerce"
)

# Signed gap in years; negative when the diagnosis precedes the assessment.
days_to_dx = (first_dx["diagnosis_date"] - first_dx["assessment_date"]).dt.days
first_dx["time_to_diagnosis"] = days_to_dx / 365.25

# A diagnosis up to 30 days after the assessment still counts as present at
# baseline. A missing time compares False, so the flag is 0 in that case.
buffer_years = 30 / 365.25
first_dx["baseline_present"] = first_dx["time_to_diagnosis"].le(buffer_years).astype(int)

# -----------------------------
# 5) Wide format (adds diagnosis date too)
# -----------------------------
df_out1 = first_dx.pivot(
    index="eid",
    columns="cancer_type",
    values=["baseline_present", "time_to_diagnosis", "diagnosis_date"]
)

# Flatten the (metric, cancer) column pairs: the baseline flag becomes
# "<cancer>_cancer", every other metric "<cancer>_cancer_<metric>".
df_out1.columns = [
    f"{cancer}_cancer{'' if metric=='baseline_present' else '_' + metric}"
    for metric, cancer in df_out1.columns
]
df_out1 = df_out1.reset_index()

# df_out1 only has rows for participants with at least one mapped cancer, so a
# missing flag here means "this participant has a different cancer type".
# A cancer type with no cases at all produces no pivot columns, so create them
# explicitly instead of failing on a missing column.
for cancer in CANCER_ICD_MAP:
    flag_col = f"{cancer}_cancer"
    ttd_col = f"{cancer}_cancer_time_to_diagnosis"
    if flag_col not in df_out1.columns:
        df_out1[flag_col] = 0
    if ttd_col not in df_out1.columns:
        df_out1[ttd_col] = np.nan
    df_out1[flag_col] = df_out1[flag_col].fillna(0).astype(int)

# The raw diagnosis dates are not part of the output table.
df_out1 = df_out1.drop(columns = [col for col in df_out1.columns if "diagnosis_date" in col])

In [7]:
# Preview the first rows of the wide per-cancer table.
df_out1.head()

Unnamed: 0,eid,colorectal_cancer,lung_cancer,stomach_cancer,colorectal_cancer_time_to_diagnosis,lung_cancer_time_to_diagnosis,stomach_cancer_time_to_diagnosis
0,1011206,0,0,0,,,12.4846
1,1014381,0,0,0,3.00616,,
2,1020994,1,0,0,-0.473648,,
3,1021100,0,0,0,10.940452,,
4,1025520,0,0,0,1.629021,,


# Preprocess ICD10 codes to non-cancer diagnoses 

In [8]:
# Admission-level table; the preview below shows dnx_hesin_id, eid, ins_index and admidate.
df_diag1 = pd.read_csv(f"{data_path}time_stamps_hesin.csv")
# Diagnosis-level table; carries the ICD10 text in `diag_icd10`.
df_diag2 = pd.read_csv(f"{data_path}time_stamps_hesin_diag.csv")

In [9]:
# Rename only the column that is exactly "eid" so it matches the join key used
# by the diagnosis table. An exact-match rename is a no-op when the cell is run
# again, whereas a substring replace would rewrite the already-renamed column
# into "hesin_diag$hesin_diag$eid".
df_diag1 = df_diag1.rename(columns={'eid': 'hesin_diag$eid'})
df_diag1.head()

Unnamed: 0,dnx_hesin_id,hesin_diag$eid,ins_index,admidate
0,1778835-13,1778835,13,2015-09-11
1,4819045-0,4819045,0,1999-02-15
2,3497026-1,3497026,1,2015-12-23
3,3746661-9,3746661,9,2016-02-12
4,1039009-23,1039009,23,2021-09-08


In [10]:
# Use the plain participant id name for the participant column.
df_diag2 = df_diag2.rename(columns=lambda name: name.replace('participant$eid', 'eid'))

# Text before the first space is the ICD10 code, the rest is its description.
code_and_name = df_diag2["diag_icd10"].str.split(" ", n=1, expand=True)
df_diag2[["diag_icd10_code", "diag_icd10_name"]] = code_and_name

# Part of the code before the first "." is the grouped code, the rest the decimal.
group_and_decimal = df_diag2["diag_icd10_code"].str.split(".", n=1, expand=True)
df_diag2[["diag_icd10_grouped", "diag_icd10_decimal"]] = group_and_decimal
df_diag2.head()

Unnamed: 0,dnx_hesin_diag_id,eid,hesin_diag$eid,ins_index,arr_index,level,diag_icd10,diag_icd10_code,diag_icd10_name,diag_icd10_grouped,diag_icd10_decimal
0,1000029-1-0,1000029,1000029,1,0,Primary/main diagnosis,O26.8 Other specified pregnancy-related condit...,O26.8,Other specified pregnancy-related conditions,O26,8
1,1000029-1-1,1000029,1000029,1,1,Secondary diagnosis,Z04.3 Examination and observation following ot...,Z04.3,Examination and observation following other ac...,Z04,3
2,1000029-2-3,1000029,1000029,2,3,Secondary diagnosis,Z35.1 Supervision of pregnancy with history of...,Z35.1,Supervision of pregnancy with history of abort...,Z35,1
3,1000029-2-2,1000029,1000029,2,2,Secondary diagnosis,O72.1 Other immediate postpartum haemorrhage,O72.1,Other immediate postpartum haemorrhage,O72,1
4,1000029-2-1,1000029,1000029,2,1,Secondary diagnosis,Z37.0 Single live birth,Z37.0,Single live birth,Z37,0


In [11]:
# Attach the admission date to every diagnosis row via the admission key.
admission_keys = ["hesin_diag$eid", "ins_index"]
df_diag = df_diag2[["eid", *admission_keys, "diag_icd10_grouped"]].merge(
    df_diag1[[*admission_keys, "admidate"]],
    on=admission_keys,
    how="left",
)

# Keep only participants that are in the blood-protein table.
df_diag = df_diag[df_diag['eid'].isin(eids)]
df_diag.head()

Unnamed: 0,eid,hesin_diag$eid,ins_index,diag_icd10_grouped,admidate
0,1000029,1000029,1,O26,2003-04-07
1,1000029,1000029,1,Z04,2003-04-07
2,1000029,1000029,2,Z35,2003-08-31
3,1000029,1000029,2,O72,2003-08-31
4,1000029,1000029,2,Z37,2003-08-31


In [12]:
# Grouped ICD10 codes that define each non-cancer diagnosis.
# NOTE(review): E08, E09 and J4A look like ICD-10-CM (US clinical modification)
# codes. If the hospital records are coded in WHO ICD-10 they will never match,
# while the WHO diabetes codes E12/E14 and asthma code J46 are not listed here.
# Confirm the intended code sets.
DIAG_ICD_MAP = {
    "ischemia": {"I20", "I21", "I22", "I23", "I24", "I25"},
    "stroke": {"I60", "I61", "I62", "I63", "I64", "I65", "I66", "I67", "I68", "I69"},
    "alzheimers": {"G30"},
    "copd": {"J44"},
    "lower_resp": {"J40","J41","J42","J43","J45","J47","J4A"},
    "kidney": {"N17", "N18", "N19"},
    "diabetes": {"E08", "E09", "E10", "E11", "E13"},
    "hhd": {"I11"},
}

# Invert the map: grouped ICD code -> diagnosis label.
icd_to_diag = {}
for diag, icds in DIAG_ICD_MAP.items():
    icd_to_diag.update(dict.fromkeys(icds, diag))

# Codes outside the map get NaN as their diagnosis.
df_diag["diagnosis"] = df_diag["diag_icd10_grouped"].map(icd_to_diag)

# Keep only rows with both a mapped diagnosis and an admission date.
has_dx_and_date = df_diag["diagnosis"].notna() & df_diag["admidate"].notna()
df_diag = df_diag.loc[has_dx_and_date].copy()

# Earliest diagnosis per (eid, diagnosis).
# The minimum is taken on the raw `admidate` values, before they are parsed;
# for the YYYY-MM-DD strings shown in the previews this matches date order.
first_dx = (
    df_diag
    .groupby(["eid", "diagnosis"])["admidate"]
    .min()
    .rename("diagnosis_date")
    .reset_index()
)

# Merge assessment date
assessment_dates = df_lab_time[['eid','Date of attending assessment centre']]
first_dx = first_dx.merge(assessment_dates, how="left", on="eid")

# Parse both date columns; unparseable values become NaT.
for date_col in ("Date of attending assessment centre", "diagnosis_date"):
    first_dx[date_col] = pd.to_datetime(first_dx[date_col], errors="coerce")

# Signed gap in years; negative when the diagnosis precedes the assessment.
days_to_dx = (
    first_dx["diagnosis_date"] - first_dx["Date of attending assessment centre"]
).dt.days
first_dx["time_to_diagnosis"] = days_to_dx / 365.25

# A diagnosis up to 30 days after the assessment still counts as present at baseline.
buffer_years = 30 / 365.25
first_dx["baseline_present"] = first_dx["time_to_diagnosis"].le(buffer_years).astype(int)

# Wide format
df_out2 = first_dx.pivot(
    index="eid",
    columns="diagnosis",
    values=["baseline_present", "time_to_diagnosis", "diagnosis_date"]
)

# Flatten the (metric, diagnosis) column pairs: the baseline flag becomes
# "<diag>", every other metric "<diag>_<metric>".
df_out2.columns = [
    f"{diag}{'' if metric == 'baseline_present' else '_' + metric}"
    for metric, diag in df_out2.columns
]
df_out2 = df_out2.reset_index()

# df_out2 only has rows for participants with at least one mapped diagnosis, so
# a missing flag here means "this participant has a different diagnosis".
# A diagnosis with no cases at all produces no pivot columns, so create them
# explicitly instead of failing with a KeyError.
for diag in DIAG_ICD_MAP:
    ttd_col = f"{diag}_time_to_diagnosis"
    if diag not in df_out2.columns:
        df_out2[diag] = 0
    if ttd_col not in df_out2.columns:
        df_out2[ttd_col] = np.nan
    df_out2[diag] = df_out2[diag].fillna(0).astype(int)

# The raw diagnosis dates are not part of the output table.
df_out2 = df_out2.drop(columns = [col for col in df_out2.columns if "diagnosis_date" in col])

In [13]:
# Attach the cancer and non-cancer outcome tables to the cohort.
# Outcome columns already present on `df` (i.e. this cell was run before) are
# dropped first, so a re-run replaces them instead of creating _x/_y duplicates.
for outcomes in (df_out1, df_out2):
    outcome_cols = [col for col in outcomes.columns if col != "eid"]
    df = df.drop(columns=outcome_cols, errors="ignore").merge(outcomes, on="eid", how="left")

# The left merge leaves NaN flags for participants that do not appear in an
# outcome table at all. No record means the condition is not present at
# baseline, so store 0 and keep the flags as integers. The time-to-diagnosis
# columns stay NaN for them.
flag_cols = [f"{cancer}_cancer" for cancer in CANCER_ICD_MAP] + list(DIAG_ICD_MAP)
df[flag_cols] = df[flag_cols].fillna(0).astype(int)

In [14]:
# Sum of each baseline flag: non-cancer diagnoses first, then cancers.
flag_columns = [(diag, diag) for diag in DIAG_ICD_MAP]
flag_columns += [(cancer, f"{cancer}_cancer") for cancer in CANCER_ICD_MAP]
for label, column in flag_columns:
    print(f"{label}: {df[column].sum()}")

ischemia: 2334.0
stroke: 441.0
alzheimers: 6.0
copd: 386.0
lower_resp: 1797.0
kidney: 319.0
diabetes: 1266.0
hhd: 13.0
lung: 45.0
colorectal: 246.0
stomach: 17.0


In [15]:
# Persist the cohort table together with the merged outcome columns.
df.to_csv(f"{data_path}blood_protein_diagnoses_clean_new.csv", index=False)

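## Train/val/test split of dataset
1. Drop patients with at least 50% of their protein (olink) values missing, then drop protein columns with at least 50% missing
2. Split into train, validation, and test sets using iterative stratification on the time-to-diagnosis strata of every diagnosis (cancer and non-cancer)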

In [17]:
# Reload the table from the same path the export cell above writes to.
df = pd.read_csv(f"{data_path}blood_protein_diagnoses_clean_new.csv")

In [18]:
olink_cols = [col for col in df.columns if col.startswith("olink")]
# The same cut-off is applied to rows first and then to columns.
threshold = 0.5

# Fraction of missing olink values per row (patient)
row_missing_fraction = df[olink_cols].isna().mean(axis=1)

# Rows with at least `threshold` of their olink values missing are removed
rows_to_drop = row_missing_fraction[row_missing_fraction >= threshold].index
preprocessed_df = df[row_missing_fraction < threshold]

# Fraction of missing values per olink column, computed on the remaining rows
missing_fraction = preprocessed_df[olink_cols].isna().mean()

# Columns with at least `threshold` missing are removed (only 3 columns)
cols_to_drop = missing_fraction[missing_fraction >= threshold].index
preprocessed_df = preprocessed_df.drop(columns=cols_to_drop)

In [19]:
# Report how many rows (patients) and olink columns the missingness filter removes.
print(f"dropping patients: {len(rows_to_drop)}")
print(f"dropping cols: {len(cols_to_drop)}")

dropping patients: 6705
dropping cols: 3


In [20]:
# Outcome labels used for stratification. For each label the following cells
# read the column "<label>_time_to_diagnosis" and write "<label>_strata".
DIAG_COLS = [
    "lung_cancer",
    "colorectal_cancer",
    "stomach_cancer",
    "ischemia",
    "stroke",
    "alzheimers",
    "copd",
    "lower_resp",
    "kidney",
    "hhd",
    "diabetes"
]

def bin_ttd(x, buffer_years=30 / 365.25):
    """Bin a time to diagnosis (in years) into a stratum label.

    Parameters
    ----------
    x : float or missing
        Years from assessment to diagnosis; negative when the diagnosis came
        first. Missing values (NaN/None) mean no diagnosis was recorded.
    buffer_years : float, default 30 / 365.25
        Grace period after the assessment that still counts as already
        diagnosed. The default is 30 days.

    Returns
    -------
    str
        ``"NA"``  - ``x`` is missing;
        ``"<0"``  - ``x <= buffer_years`` (despite the label, this includes the
        buffer window after the assessment);
        ``"0-1"`` - ``buffer_years < x <= 1``;
        ``">1"``  - ``x > 1``.
    """
    if pd.isna(x):
        return "NA"
    if x <= buffer_years:
        return "<0"
    # Anything reaching this point is already above the buffer.
    if x <= 1:
        return "0-1"
    return ">1"

def proportions(frame, label):
    """Count rows per time-to-diagnosis stratum for one label.

    Despite the name, this returns raw counts (``normalize=False``), not
    fractions.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the column ``"<label>_strata"``.
    label : str
        Label prefix of the strata column.

    Returns
    -------
    pandas.Series
        Counts indexed by ``["<0", "0-1", ">1", "NA"]`` in that order. A
        stratum with no rows is reported as 0.
    """
    counts = frame[f"{label}_strata"].value_counts(normalize=False)
    # fill_value keeps the counts integer; reindex + fillna would turn the whole
    # series into floats as soon as one stratum is absent.
    return counts.reindex(["<0", "0-1", ">1", "NA"], fill_value=0)

In [21]:
# Derive the stratum label of every patient for each outcome and show the counts.
for diag in DIAG_COLS:
    strata_col = f"{diag}_strata"
    time_to_dx = preprocessed_df[f"{diag}_time_to_diagnosis"]
    preprocessed_df[strata_col] = time_to_dx.map(bin_ttd)
    print(f"\n{diag}:\n", proportions(preprocessed_df, diag))


lung_cancer:
 <0        42
0-1       29
>1       459
NA     45760
Name: lung_cancer_strata, dtype: int64

colorectal_cancer:
 <0       214
0-1       38
>1       564
NA     45474
Name: colorectal_cancer_strata, dtype: int64

stomach_cancer:
 <0        14
0-1        3
>1        79
NA     46194
Name: stomach_cancer_strata, dtype: int64

ischemia:
 <0      1979
0-1      203
>1      3515
NA     40593
Name: ischemia_strata, dtype: int64

stroke:
 <0       369
0-1       70
>1      2018
NA     43833
Name: stroke_strata, dtype: int64

alzheimers:
 <0         4
0-1        5
>1       555
NA     45726
Name: alzheimers_strata, dtype: int64

copd:
 <0       337
0-1       90
>1      1894
NA     43969
Name: copd_strata, dtype: int64

lower_resp:
 <0      1562
0-1      237
>1      3642
NA     40849
Name: lower_resp_strata, dtype: int64

kidney:
 <0       214
0-1       52
>1      3913
NA     42111
Name: kidney_strata, dtype: int64

hhd:
 <0        11.0
0-1        0.0
>1        26.0
NA     46253.0
Name:

In [22]:
## Create train, validation, and test datasets for predicting current cancer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
# Sanity check: compare prevalences per label in full vs train vs test
def prevalence(table, cols):
    """Summarise the given columns of ``table``.

    Returns a DataFrame indexed by ``cols`` with two columns: ``prevalence``
    (the column mean) and ``n`` (the column sum).
    """
    means = []
    totals = []
    for col in cols:
        values = table[col]
        means.append(values.mean())
        totals.append(values.sum())
    return pd.DataFrame({"prevalence": means, "n": totals}, index=cols)

def multilabel_stratified_split(
    df: pd.DataFrame,
    test_size=0.4,
    random_state=42,
    time=0,
    label_cols=None,
):
    """Split ``df`` into train/test with matched time-to-diagnosis strata.

    For every label, the strata ``"<0"``, ``"0-1"`` and ``">1"`` of its
    ``"<label>_strata"`` column are turned into boolean indicator columns. The
    split is stratified on all indicators jointly, so each stratum has
    approximately the same prevalence in both parts. A train/test prevalence
    summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"<label>_strata"`` for every label.
    test_size : float, default 0.4
        Passed to ``MultilabelStratifiedShuffleSplit``.
    random_state : int, default 42
        Passed to ``MultilabelStratifiedShuffleSplit``.
    time : int, default 0
        Unused; kept so existing calls keep working.
    label_cols : sequence of str, optional
        Labels to stratify on. Defaults to the notebook-level ``DIAG_COLS``.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Copies of the selected rows of ``df``, with no indicator columns added.
    """
    if label_cols is None:
        label_cols = DIAG_COLS

    # 1) Build the multi-label target matrix (n_samples x n_labels*3).
    # The indicators live in their own small frame, so the (wide) feature table
    # is never copied just to hold them.
    Y_cols = []
    indicator_arrays = []
    for label in label_cols:
        strata = df[f"{label}_strata"]
        for stratum in ("<0", "0-1", ">1"):
            Y_cols.append(f"{label}_time_to_diagnosis_{stratum}")
            indicator_arrays.append((strata == stratum).to_numpy())
    Y = np.column_stack(indicator_arrays)
    # Positional frame: row i corresponds to row i of df, as do the split indices.
    indicators = pd.DataFrame(Y, columns=Y_cols)

    # 2) Set up the multi-label stratified splitter
    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )

    # 3) Run the split; indices are row positions in df
    (train_idx, test_idx), = msss.split(df, Y)

    train_df = df.iloc[train_idx].copy()
    test_df = df.iloc[test_idx].copy()

    # Sanity check: per-stratum prevalence and count in each part.
    summary = pd.concat(
        {
            "train": prevalence(indicators.iloc[train_idx], Y_cols),
            "test": prevalence(indicators.iloc[test_idx], Y_cols),
        },
        axis=1,
    )
    print(summary)

    return train_df, test_df

# Make the split: 70% train, then the held-out 30% is halved into validation
# and test.
train_df, validtest_df = multilabel_stratified_split(preprocessed_df, test_size=0.3)
valid_df, test_df = multilabel_stratified_split(validtest_df, test_size=0.5)

# Write each part to its own CSV under data_path.
split_outputs = {
    "ukb_diag_train.csv": train_df,
    "ukb_diag_valid.csv": valid_df,
    "ukb_diag_test.csv": test_df,
}
for filename, split_df in split_outputs.items():
    split_df.to_csv(f"{data_path}{filename}", index=False)

                                             train             test      
                                        prevalence     n prevalence     n
lung_cancer_time_to_diagnosis_<0          0.000895    29   0.000936    13
lung_cancer_time_to_diagnosis_0-1         0.000617    20   0.000648     9
lung_cancer_time_to_diagnosis_>1          0.009906   321   0.009937   138
colorectal_cancer_time_to_diagnosis_<0    0.004629   150   0.004609    64
colorectal_cancer_time_to_diagnosis_0-1   0.000833    27   0.000792    11
colorectal_cancer_time_to_diagnosis_>1    0.012190   395   0.012170   169
stomach_cancer_time_to_diagnosis_<0       0.000309    10   0.000288     4
stomach_cancer_time_to_diagnosis_0-1      0.000062     2   0.000072     1
stomach_cancer_time_to_diagnosis_>1       0.001697    55   0.001728    24
ischemia_time_to_diagnosis_<0             0.042743  1385   0.042774   594
ischemia_time_to_diagnosis_0-1            0.004382   142   0.004393    61
ischemia_time_to_diagnosis_>1         