In [None]:
import pandas as pd
import numpy as np
# Relative directory holding the UK Biobank schema tables and participant exports read below.
data_dir = "../data/"
# Shared storage for the cleaned cohort table and the files written by this notebook.
# Paths are built by plain string concatenation, so the trailing slash is required.
data_path = "/orcd/pool/003/dbertsim_shared/ukb/"

In [2]:
# Load the cleaned cohort table.
df = pd.read_csv(f"{data_path}blood_protein_cancers_clean.csv")

# Columns removed before the outcomes are rebuilt: two fixed columns plus every
# column whose lower-cased name mentions one of the tokens below.
drop_cols = ['assessment_date', 'Medication for cholesterol, blood pressure or diabetes']
for token in ("operation", "cancer", "time_to_diagnosis"):
    drop_cols.extend(col for col in df.columns if token in col.lower())

df = df.drop(columns=drop_cols)

# Participant ids of the cohort; used later to restrict the cancer register rows.
eids = set(df['eid'])

### Mapping column names

In [3]:
def rename_columns(df, field_dict):
    """Rename raw export columns such as ``p40005_i0`` to their field titles.

    Every column except ``eid`` is reduced to its numeric field id (the text
    after the first ``"p"`` and before the first ``"_"``), which is then looked
    up in ``field_dict``. Instance/array suffixes are discarded, so several
    columns of the same field end up sharing one title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-``eid`` columns follow the ``p<field_id>[_...]`` pattern.
    field_dict : dict
        Mapping from integer field id to field title.

    Returns
    -------
    pandas.DataFrame
        A renamed copy of ``df``; the input frame is not modified.

    Raises
    ------
    IndexError
        If a column name contains no ``"p"``.
    ValueError
        If the extracted field id is not an integer.
    KeyError
        If the field id is missing from ``field_dict``.
    """
    def _title(column):
        if column == "eid":
            return column
        # "p40005_i0" -> "40005"
        field_id = column.split("p")[1].split("_")[0]
        return field_dict[int(field_id)]

    # One rename for all columns: renaming column by column copies the whole
    # frame on every iteration.
    return df.rename(columns=_title)


# Schema tables shipped as tab-separated files.
field, category = (
    pd.read_csv(f"{data_dir}{schema_file}", sep="\t")
    for schema_file in ("field.tsv", "category.tsv")
)
# Lookup consumed by rename_columns: field id -> field title.
field_dict = dict(zip(field["field_id"], field["title"]))

In [4]:
# Participant time-stamp export, with raw field-id columns replaced by field titles.
lab_time_raw = pd.read_csv(f"{data_dir}time_stamps_participant.csv")
df_lab_time = rename_columns(lab_time_raw, field_dict)

## Mapping ICD codes to cancers

In [5]:
# The cancer type/date export is split across two participant-level files.
df_cancer1 = pd.read_csv(f"{data_dir}cancer_type_and_date_participant.csv")
df_cancer2 = pd.read_csv(f"{data_dir}cancer_type_and_date_2_participant.csv")

# Join on the participant id instead of gluing the frames side by side: a
# positional concat silently pairs columns from different participants whenever
# the two files are not in the same row order. `validate` makes duplicated ids
# fail loudly rather than multiply rows.
df_cancer = df_cancer1.merge(df_cancer2, on="eid", how="left", validate="one_to_one")

# Kept for parity with the previous version of this cell, which left
# `df_cancer2` without its id column.
df_cancer2 = df_cancer2.drop(columns=['eid'])

In [6]:
# Replace the raw field-id prefixes with readable names (instance suffixes such as "_i0" are kept).
df_cancer = (
    df_cancer
    .rename(columns=lambda name: name.replace('p40005', 'Date of cancer diagnosis'))
    .rename(columns=lambda name: name.replace('p40006', 'Type of cancer: ICD10'))
)

# group ICD
# For each of the 22 diagnosis slots: split "<code> <name>" at the first space,
# then split the code at the first "." into its grouped part and its decimal part.
for slot in range(22):
    code_and_name = df_cancer[f"Type of cancer: ICD10_i{slot}"].str.split(" ", n=1, expand=True)
    df_cancer[[f"diag_{slot}_icd10_code", f"diag_{slot}_icd10_name"]] = code_and_name

    grouped_and_decimal = df_cancer[f"diag_{slot}_icd10_code"].str.split(".", n=1, expand=True)
    df_cancer[[f"diag_{slot}_icd10_grouped", f"diag_{slot}_icd10_decimal"]] = grouped_and_decimal

# Keep only participants that are part of the cohort loaded above.
in_cohort = df_cancer['eid'].isin(eids)
df_cancer = df_cancer[in_cohort]

In [7]:
def icd_to_cancer(icd_cancer_map, df_cancer, n_slots=22, assessment_dates=None):
    """Build per-participant cancer outcome columns from register diagnoses.

    Parameters
    ----------
    icd_cancer_map : dict
        Cancer name -> collection of grouped ICD-10 codes (e.g. ``"C50"``).
        If a code is listed under several names, the last one wins.
    df_cancer : pandas.DataFrame
        One row per participant with ``eid`` and, for every slot ``i`` in
        ``range(n_slots)``, the columns ``diag_{i}_icd10_grouped`` and
        ``Date of cancer diagnosis_i{i}``.
    n_slots : int, default 22
        Number of diagnosis slots to read from ``df_cancer``.
    assessment_dates : pandas.DataFrame, optional
        Frame with ``eid`` and ``Date of attending assessment centre``.
        Defaults to the notebook-level ``df_lab_time``.

    Returns
    -------
    pandas.DataFrame
        One row per ``eid`` that has at least one mapped diagnosis with a
        parseable date. For every cancer name found in the data:

        * ``{name}_cancer`` - 1 if the earliest diagnosis of that type is at
          most 30 days after the assessment date, 0 otherwise (also 0 when the
          assessment date is missing); NaN when the participant has no
          diagnosis of that type.
        * ``{name}_cancer_time_to_diagnosis`` - years from assessment to the
          earliest diagnosis of that type (negative when diagnosed before).
    """
    if assessment_dates is None:
        # Backward-compatible default: the table loaded earlier in the notebook.
        assessment_dates = df_lab_time

    # -----------------------------
    # 1) Long table: one row per (participant, diagnosis slot)
    # -----------------------------
    icd_cols = [f"diag_{i}_icd10_grouped" for i in range(n_slots)]
    date_cols = [f"Date of cancer diagnosis_i{i}" for i in range(n_slots)]

    # Row-major ravel keeps each participant's slots together, matching np.repeat.
    long = pd.DataFrame({
        "eid": np.repeat(df_cancer["eid"].values, n_slots),
        "icd10": df_cancer[icd_cols].to_numpy().ravel(),
        "dx_date": df_cancer[date_cols].to_numpy().ravel(),
    })
    long["dx_date"] = pd.to_datetime(long["dx_date"], errors="coerce")

    # -----------------------------
    # 2) ICD -> cancer name
    # -----------------------------
    icd_to_type = {
        icd: cancer
        for cancer, icds in icd_cancer_map.items()
        for icd in icds
    }
    long["cancer_type"] = long["icd10"].map(icd_to_type)
    long_cancer = long[long["cancer_type"].notna() & long["dx_date"].notna()]

    # -----------------------------
    # 3) Earliest diagnosis per (eid, cancer_type)
    # -----------------------------
    first_dx = (
        long_cancer
        .groupby(["eid", "cancer_type"], as_index=False)
        .agg(diagnosis_date=("dx_date", "min"))
    )

    # -----------------------------
    # 4) Time to diagnosis + present-at-baseline flag
    # -----------------------------
    first_dx = first_dx.merge(
        assessment_dates[["eid", "Date of attending assessment centre"]],
        on="eid",
        how="left",
    )
    first_dx["assessment_date"] = pd.to_datetime(
        first_dx["Date of attending assessment centre"], errors="coerce"
    )
    first_dx["time_to_diagnosis"] = (
        (first_dx["diagnosis_date"] - first_dx["assessment_date"]).dt.days / 365.25
    )

    # Diagnoses up to 30 days after the assessment still count as present at baseline.
    buffer_years = 30 / 365.25
    first_dx["baseline_present"] = (
        first_dx["time_to_diagnosis"] <= buffer_years
    ).astype(int)

    # -----------------------------
    # 5) Wide format
    # -----------------------------
    # Only the two returned metrics are pivoted. The diagnosis date used to be
    # pivoted as well and then dropped; keeping the datetime column out of the
    # multi-value pivot keeps the numeric outputs as float64.
    df_out = first_dx.pivot(
        index="eid",
        columns="cancer_type",
        values=["baseline_present", "time_to_diagnosis"],
    )
    df_out.columns = [
        f"{cancer}_cancer{'' if metric == 'baseline_present' else '_' + metric}"
        for metric, cancer in df_out.columns
    ]
    return df_out.reset_index()

In [8]:
# Site-specific cancers: each name maps to the grouped (three-character) ICD-10
# codes that icd_to_cancer counts as that cancer. The names become the
# "<name>_cancer" / "<name>_cancer_time_to_diagnosis" output columns.
CANCER_ICD_MAP = {
        "skin": {"C43", "C44"},
        "breast": {"C50"},
        "prostate": {"C61"},
        "colorectal": {"C18", "C19", "C20"},
        "lung": {"C33", "C34"},
        "lymphoma": {"C81", "C82", "C83", "C84", "C85"},
        "kidney": {"C64", "C65"},
        "leukemia": {"C91", "C92", "C93", "C94", "C95"},
        "bladder": {"C67"},
        "pancreatic": {"C25"},
        "brain": {"C70", "C71", "C72"},
        "stomach": {"C16"},
    }
# Outcome table for the site-specific cancers; only participants with at least
# one mapped diagnosis get a row.
df_out1 = icd_to_cancer(CANCER_ICD_MAP, df_cancer)
df_out1.head()

Unnamed: 0,eid,bladder_cancer,brain_cancer,breast_cancer,colorectal_cancer,kidney_cancer,leukemia_cancer,lung_cancer,lymphoma_cancer,pancreatic_cancer,...,breast_cancer_time_to_diagnosis,colorectal_cancer_time_to_diagnosis,kidney_cancer_time_to_diagnosis,leukemia_cancer_time_to_diagnosis,lung_cancer_time_to_diagnosis,lymphoma_cancer_time_to_diagnosis,pancreatic_cancer_time_to_diagnosis,prostate_cancer_time_to_diagnosis,skin_cancer_time_to_diagnosis,stomach_cancer_time_to_diagnosis
0,1000083,0,0,0,0,0,0,0,0,0,...,,,,,,,,,1.886379,
1,1000133,0,0,0,0,0,0,0,0,0,...,,,,,,,,,3.028063,
2,1001769,0,0,0,0,0,0,0,0,0,...,,,,,,,,9.585216,,
3,1002052,0,0,0,0,0,0,0,0,0,...,,,,,,,,,-14.524298,
4,1002121,0,0,0,0,0,0,0,0,0,...,,,,,,,,,-2.058864,


In [9]:
# Broader cancer groups: each group name maps to the grouped (three-character)
# ICD-10 codes that icd_to_cancer assigns to it. "skin" and "breast" are left
# out here because CANCER_ICD_MAP already produces columns with those names.
CANCER_GROUP_ICD_MAP = {
    "oral_pharynx": {
        "C00", "C01", "C02", "C03", "C04", "C05", "C06",
        "C07", "C08", "C09", "C10", "C11", "C12", "C13", "C14"
    },

    "digestive_organs": {
        "C15", "C16", "C17", "C18", "C19", "C20",
        "C21", "C22", "C23", "C24", "C25", "C26"
    },

    "respiratory_intrathoracic": {
        "C30", "C31", "C32", "C33", "C34",
        "C37", "C38", "C39"
    },

    "bone_cartilage": {"C40", "C41"},

    # "skin": {"C43", "C44"}, # these are duplicates

    "mesothelial_soft_tissue": {"C45", "C46", "C47", "C48", "C49"},

    # "breast": {"C50"}, # duplicates

    "female_genital": {"C51", "C52", "C53", "C54", "C55", "C56", "C57", "C58"},

    "male_genital": {"C60", "C61", "C62", "C63"},

    "urinary_tract": {"C64", "C65", "C66", "C67", "C68"},

    "eye_brain_cns": {"C69", "C70", "C71", "C72"},

    "endocrine": {"C73", "C74", "C75"},

    "ill_defined_secondary": {"C76", "C77", "C78", "C79", "C80"},
    
    "in_situ": {"D00", "D01", "D02", "D03", "D04", "D05", "D06", "D07", "D09"},

    "hematologic": {
        "C81", "C82", "C83", "C84", "C85",  # lymphomas
        "C88",                              # immunoproliferative
        "C90",                              # multiple myeloma
        "C91", "C92", "C93", "C94", "C95",  # leukemias
        "C96"                               # other lymphoid/hematopoietic
    }
}

# Outcome table for the cancer groups; only participants with at least one
# mapped diagnosis get a row.
df_out2 = icd_to_cancer(CANCER_GROUP_ICD_MAP, df_cancer)
df_out2.head()

Unnamed: 0,eid,bone_cartilage_cancer,digestive_organs_cancer,endocrine_cancer,eye_brain_cns_cancer,female_genital_cancer,hematologic_cancer,ill_defined_secondary_cancer,in_situ_cancer,male_genital_cancer,...,eye_brain_cns_cancer_time_to_diagnosis,female_genital_cancer_time_to_diagnosis,hematologic_cancer_time_to_diagnosis,ill_defined_secondary_cancer_time_to_diagnosis,in_situ_cancer_time_to_diagnosis,male_genital_cancer_time_to_diagnosis,mesothelial_soft_tissue_cancer_time_to_diagnosis,oral_pharynx_cancer_time_to_diagnosis,respiratory_intrathoracic_cancer_time_to_diagnosis,urinary_tract_cancer_time_to_diagnosis
0,1001096,0,0,0,0,0,0,0,0,0,...,,,,,,,11.351129,,,
1,1001769,0,0,0,0,0,0,0,0,0,...,,,,,,9.585216,,,,
2,1002503,0,0,0,0,0,0,0,0,0,...,,,,,9.273101,,,,,
3,1002891,0,0,0,0,0,0,0,0,0,...,,,4.199863,,,,,,,
4,1007355,0,0,0,0,0,0,0,0,0,...,,,,,,9.344285,,,,


In [37]:
# Attach both outcome tables to the cohort. Outcome columns left over from an
# earlier run of this cell are dropped first, so re-running it does not create
# "_x"/"_y" duplicates.
for outcomes in (df_out1, df_out2):
    stale_cols = [col for col in outcomes.columns if col != "eid" and col in df.columns]
    df = df.drop(columns=stale_cols).merge(outcomes, on="eid", how="left")

# Participants without a row in the outcome tables have NaN flags after the left
# merge; they are controls, so the flag becomes 0.
for cancer in [*CANCER_ICD_MAP, *CANCER_GROUP_ICD_MAP]:
    flag_col = f"{cancer}_cancer"
    ttd_col = f"{flag_col}_time_to_diagnosis"
    # A cancer type with no case in the cohort produces no column at all;
    # create it explicitly so the cells below can rely on its presence.
    if flag_col not in df.columns:
        df[flag_col] = 0
    if ttd_col not in df.columns:
        df[ttd_col] = np.nan
    df[flag_col] = df[flag_col].fillna(0).astype(int)

In [11]:
# Number of cohort participants with a recorded time to diagnosis, per cancer
# type and then per cancer group.
for cancer in (*CANCER_ICD_MAP, *CANCER_GROUP_ICD_MAP):
    has_diagnosis = df[f'{cancer}_cancer_time_to_diagnosis'].notna()
    print(f"{cancer}: {sum(has_diagnosis)}")

skin: 4465
breast: 1885
prostate: 1670
colorectal: 944
lung: 593
lymphoma: 427
kidney: 252
leukemia: 220
bladder: 197
pancreatic: 153
brain: 111
stomach: 115
oral_pharynx: 192
digestive_organs: 1541
respiratory_intrathoracic: 660
bone_cartilage: 12
mesothelial_soft_tissue: 142
female_genital: 597
male_genital: 1740
urinary_tract: 462
eye_brain_cns: 138
endocrine: 106
ill_defined_secondary: 126
in_situ: 1613
hematologic: 812


In [12]:
# Preview of the cohort table after the outcome columns were merged in.
df.head()

Unnamed: 0,eid,Age at recruitment,Sex_male,Ethnic background,Body mass index (BMI),"Systolic blood pressure, automated reading","Diastolic blood pressure, automated reading",Townsend deprivation index at recruitment,Smoking status,Alcohol intake frequency.,...,eye_brain_cns_cancer_time_to_diagnosis,female_genital_cancer_time_to_diagnosis,hematologic_cancer_time_to_diagnosis,ill_defined_secondary_cancer_time_to_diagnosis,in_situ_cancer_time_to_diagnosis,male_genital_cancer_time_to_diagnosis,mesothelial_soft_tissue_cancer_time_to_diagnosis,oral_pharynx_cancer_time_to_diagnosis,respiratory_intrathoracic_cancer_time_to_diagnosis,urinary_tract_cancer_time_to_diagnosis
0,1000083,49,0,British,24.7295,116.0,71.0,-3.96,Previous,Three or four times a week,...,,,,,,,,,,
1,1000380,62,0,British,31.2026,124.0,81.0,-5.0,Never,Daily or almost daily,...,,,,,,,,,,
2,1001803,47,0,Any other white background,24.2187,98.0,57.0,2.0,Never,Never,...,,,,,,,,,,
3,1002917,52,1,British,20.1477,132.0,67.0,-4.23,Current,Special occasions only,...,,,,,,,,,,
4,1003287,69,0,British,28.1479,166.0,61.0,6.38,Previous,Three or four times a week,...,,,,,,,,,,


In [38]:
# Persist the cohort with the rebuilt outcome columns next to the input file.
df.to_csv(f"{data_path}blood_protein_cancers_clean_new.csv", index=False)

In [None]:
# Participants with a second register entry (slot 1).
# NOTE(review): this cell referenced an undefined `temp`; `df_cancer` is the only
# frame in the notebook that carries this column - confirm that is what was meant.
df_cancer[df_cancer['Type of cancer: ICD10_i1'].notna()].head()

## Train/val/test split of cancer dataset
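1. Drop participants with at least 50% of their Olink protein measurements missing, then drop protein columns that are at least 50% missing among the remaining participants
2. Split into train and test using iterative stratification on the time-to-diagnosis strata (each time frame) of every cancer group plus skin and breast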

In [40]:
# df = pd.read_csv(f"{data_path}blood_protein_cancers_clean_new.csv")

olink_cols = [col for col in df.columns if col.startswith("olink")]
# Shared cut-off: a participant or a protein column is dropped when at least
# this fraction of its Olink values is missing.
threshold = 0.5

# Fraction of missing Olink values per participant (row).
row_missing_fraction = df[olink_cols].isna().mean(axis=1)

# Participants at or above the cut-off are removed.
rows_to_drop = df.index[row_missing_fraction >= threshold]

# Explicit copy: strata columns are added to this frame further down, which
# must not write into a slice of `df`.
preprocessed_df = df[row_missing_fraction < threshold].copy()

# Fraction of missing values per protein, computed on the retained participants.
missing_fraction = preprocessed_df[olink_cols].isna().mean()

# Protein columns at or above the cut-off are removed.
cols_to_drop = missing_fraction[missing_fraction >= threshold].index
preprocessed_df = preprocessed_df.drop(columns=cols_to_drop)

In [41]:
# Report how much the missingness filter removed.
print(
    f"dropping patients: {len(rows_to_drop)}\n"
    f"dropping cols: {len(cols_to_drop)}"
)

dropping patients: 6705
dropping cols: 3


In [42]:
# Labels used for stratification: every cancer group plus skin and breast.
CANCER_COLS = [*CANCER_GROUP_ICD_MAP, "skin", "breast"]

def bin_ttd(x):
    """Map a time to diagnosis (in years) to its stratum label.

    Returns "NA" for missing values, "<0" for anything up to and including
    30 days after assessment, "0-1" for later diagnoses up to and including
    one year, and ">1" for everything beyond one year.
    """
    if pd.isna(x):
        return "NA"
    if x > 1:
        return ">1"
    baseline_window = 30 / 365.25
    return "<0" if x <= baseline_window else "0-1"

def proportions(frame, label):
    """Count the rows of ``frame`` in each stratum of ``{label}_strata``.

    Despite the name, raw counts are returned (not fractions), always in the
    order "<0", "0-1", ">1", "NA"; strata that do not occur are reported as 0.
    """
    strata_order = ["<0", "0-1", ">1", "NA"]
    counts = frame[f"{label}_strata"].value_counts(normalize=False)
    ordered = counts.reindex(strata_order)
    return ordered.fillna(0)

In [43]:
# Derive the stratum of every participant for each stratification label and
# show how many participants fall into each stratum.
for cancer in CANCER_COLS:
    time_to_dx = preprocessed_df[f"{cancer}_cancer_time_to_diagnosis"]
    preprocessed_df[f"{cancer}_strata"] = time_to_dx.apply(bin_ttd)
    strata_counts = proportions(preprocessed_df, cancer)
    print(f"\n{cancer}:\n", strata_counts)


oral_pharynx:
 <0        45
0-1        7
>1       116
NA     46122
Name: oral_pharynx_strata, dtype: int64

digestive_organs:
 <0       263
0-1       65
>1      1009
NA     44953
Name: digestive_organs_strata, dtype: int64

respiratory_intrathoracic:
 <0        69
0-1       31
>1       488
NA     45702
Name: respiratory_intrathoracic_strata, dtype: int64

bone_cartilage:
 <0         7.0
0-1        0.0
>1         4.0
NA     46279.0
Name: bone_cartilage_strata, dtype: float64

mesothelial_soft_tissue:
 <0        25
0-1        6
>1        89
NA     46170
Name: mesothelial_soft_tissue_strata, dtype: int64

female_genital:
 <0       213
0-1       22
>1       289
NA     45766
Name: female_genital_strata, dtype: int64

male_genital:
 <0       346
0-1       70
>1      1091
NA     44783
Name: male_genital_strata, dtype: int64

urinary_tract:
 <0        90
0-1       13
>1       290
NA     45897
Name: urinary_tract_strata, dtype: int64

eye_brain_cns:
 <0        26
0-1        5
>1        93
NA  

In [44]:
## Create train, validation, and test datasets for predicting current cancer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
# Sanity check helper: lets the label prevalences of two splits be compared.
def prevalence(table, cols):
    """Return, for each column in ``cols``, its mean ("prevalence") and sum ("n")."""
    means, totals = [], []
    for col in cols:
        values = table[col]
        means.append(values.mean())
        totals.append(values.sum())
    return pd.DataFrame({"prevalence": means, "n": totals}, index=cols)

def multilabel_stratified_split(
    df: pd.DataFrame,
    test_size=0.4,
    random_state=42,
    time=0,
    label_cols=None,
):
    """Split ``df`` into train/test with multi-label stratification.

    For every label and every stratum in ("<0", "0-1", ">1") a boolean
    indicator is derived from the ``{label}_strata`` column; the split is
    stratified on all indicators at once, so each (label, stratum) pair has
    approximately the same prevalence in both parts. A train/test prevalence
    table is printed as a sanity check.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``{label}_strata`` column for every label.
    test_size : float, default 0.4
        Fraction of rows assigned to the test split.
    random_state : int, default 42
        Seed passed to the splitter.
    time : int, default 0
        Unused; kept so existing calls keep working.
    label_cols : sequence of str, optional
        Labels to stratify on. Defaults to the notebook-level ``CANCER_COLS``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames, with every column whose name contains
        "strata" removed.
    """
    if label_cols is None:
        label_cols = CANCER_COLS

    # 1) Multi-label target matrix (n_samples x n_indicators). It is built as
    #    its own frame so the (wide) input does not have to be deep-copied.
    indicators = pd.DataFrame(
        {
            f"{label}_time_to_diagnosis_{strata}": (df[f"{label}_strata"] == strata).to_numpy()
            for label in label_cols
            for strata in ["<0", "0-1", ">1"]
        },
        index=df.index,
    )
    Y_cols = list(indicators.columns)
    Y = indicators.to_numpy()

    # 2) Multi-label stratified splitter
    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )

    # 3) Single split; the indices are row positions in df
    (train_idx, test_idx), = msss.split(df, Y)

    strata_cols = [col for col in df.columns if "strata" in col]
    train_df = df.iloc[train_idx].drop(columns=strata_cols)
    test_df = df.iloc[test_idx].drop(columns=strata_cols)

    # 4) Sanity check: indicator prevalences in train vs test
    summary = pd.concat(
        {
            "train": prevalence(indicators.iloc[train_idx], Y_cols),
            "test": prevalence(indicators.iloc[test_idx], Y_cols),
        },
        axis=1,
    )
    print(summary)

    return train_df, test_df

# Make the split (80% train / 20% test) and store both parts.
train_df, test_df = multilabel_stratified_split(preprocessed_df, test_size=0.2)

for split_name, split_frame in (("train", train_df), ("test", test_df)):
    split_frame.to_csv(f"{data_path}ukb_cancer_{split_name}_new.csv", index=False)

                                                     train             test  \
                                                prevalence     n prevalence   
oral_pharynx_time_to_diagnosis_<0                 0.000972    36   0.000972   
oral_pharynx_time_to_diagnosis_0-1                0.000162     6   0.000108   
oral_pharynx_time_to_diagnosis_>1                 0.002511    93   0.002484   
digestive_organs_time_to_diagnosis_<0             0.005671   210   0.005725   
digestive_organs_time_to_diagnosis_0-1            0.001404    52   0.001404   
digestive_organs_time_to_diagnosis_>1             0.021792   807   0.021819   
respiratory_intrathoracic_time_to_diagnosis_<0    0.001485    55   0.001512   
respiratory_intrathoracic_time_to_diagnosis_0-1   0.000675    25   0.000648   
respiratory_intrathoracic_time_to_diagnosis_>1    0.010531   390   0.010585   
bone_cartilage_time_to_diagnosis_<0               0.000162     6   0.000108   
bone_cartilage_time_to_diagnosis_0-1              0.

In [45]:
# Stratum counts of the site-specific cancers in the train and test splits.
for cancer in CANCER_ICD_MAP.keys():
    ttd_col = f"{cancer}_cancer_time_to_diagnosis"
    strata_col = f"{cancer}_strata"
    for split_frame in (train_df, test_df):
        split_frame[strata_col] = split_frame[ttd_col].apply(bin_ttd)
    train_counts = proportions(train_df, cancer)
    test_counts = proportions(test_df, cancer)
    print(f"{train_counts}\n{test_counts}\n")

<0       886
0-1      111
>1      2082
NA     33953
Name: skin_strata, dtype: int64
<0      221
0-1      28
>1      520
NA     8489
Name: skin_strata, dtype: int64

<0       616
0-1       57
>1       650
NA     35709
Name: breast_strata, dtype: int64
<0      154
0-1      14
>1      162
NA     8928
Name: breast_strata, dtype: int64

<0       244
0-1       56
>1       862
NA     35870
Name: prostate_strata, dtype: int64
<0       55
0-1      14
>1      213
NA     8976
Name: prostate_strata, dtype: int64

<0       166
0-1       32
>1       445
NA     36389
Name: colorectal_strata, dtype: int64
<0       48
0-1       6
>1      119
NA     9085
Name: colorectal_strata, dtype: int64

<0        33
0-1       24
>1       367
NA     36608
Name: lung_strata, dtype: int64
<0        9
0-1       5
>1       92
NA     9152
Name: lung_strata, dtype: int64

<0        97
0-1       12
>1       191
NA     36732
Name: lymphoma_strata, dtype: int64
<0       27
0-1       3
>1       40
NA     9188
Name: lymphoma_

## Add follow-up dates

In [None]:
# NOTE(review): this rebinds `data_path` WITHOUT the trailing slash used at the
# top of the notebook; every path in this cell therefore adds the "/" itself.
data_path = "/orcd/pool/003/dbertsim_shared/ukb"

# NOTE(review): these exports are read from "data/", whereas the earlier cells
# read from `data_dir` ("../data/") - confirm the intended working directory.
df_lab_time = rename_columns(pd.read_csv("data/time_stamps_participant.csv"), field_dict)
df_assessment_centre = rename_columns(pd.read_csv("data/assessment_centre.csv"), field_dict)
df = pd.merge(df_lab_time, df_assessment_centre, how='left', on='eid')

ASSESSMENT_CENTRE_TO_COUNTRY = {
    # England
    "Barts": "England",
    "Birmingham": "England",
    "Bristol": "England",
    "Bury": "England",
    "Cheadle (revisit)": "England",
    "Croydon": "England",
    "Hounslow": "England",
    "Leeds": "England",
    "Liverpool": "England",
    "Manchester": "England",
    "Middlesborough": "England",
    "Newcastle": "England",
    "Nottingham": "England",
    "Oxford": "England",
    "Reading": "England",
    "Sheffield": "England",
    "Stockport (pilot)": "England",
    "Stoke": "England",
    "Cheadle (imaging)": "England",
    "Reading (imaging)": "England",
    "Newcastle (imaging)": "England",
    "Bristol (imaging)": "England",

    # Scotland
    "Edinburgh": "Scotland",
    "Glasgow": "Scotland",

    # Wales
    "Cardiff": "Wales",
    "Swansea": "Wales",
    "Wrexham": "Wales",
}
# Administrative censoring date applied to each country.
COUNTRY_CENSOR_DATE = {
    "England":  pd.Timestamp("2023-05-31"),
    "Scotland": pd.Timestamp("2023-09-30"),
    "Wales":    pd.Timestamp("2016-12-31"),
}

df["country"] = df["UK Biobank assessment centre"].map(ASSESSMENT_CENTRE_TO_COUNTRY)
df["admin_censor_date"] = pd.to_datetime(df["country"].map(COUNTRY_CENSOR_DATE))
# A missing or unmapped centre gets the earliest censoring date (2016-12-31),
# i.e. the shortest follow-up any country would give.
earliest_censor_date = min(COUNTRY_CENSOR_DATE.values())
df["admin_censor_date"] = df["admin_censor_date"].fillna(earliest_censor_date)

df['Date of attending assessment centre'] = pd.to_datetime(df['Date of attending assessment centre'])

# Follow-up in years from the assessment visit to the administrative censoring date.
df['time_to_follow_up'] = (df["admin_censor_date"] - df["Date of attending assessment centre"]).dt.days / 365.25

# NOTE(review): the split cell above writes "ukb_cancer_{train,test}_new.csv" and
# no validation file, while this cell reads and overwrites
# "ukb_cancer_{train,valid,test}.csv" - confirm which files are meant.
split_names = ("train", "valid", "test")
stale_strata_cols = ['cancer_strata', 'breast_strata', 'prostate_strata', 'lung_strata',
                     'colorectal_strata', 'bladder_strata', 'pancreatic_strata']
follow_up = df[['eid', 'time_to_follow_up']]

# Read and transform every split before writing anything, so a missing file
# does not leave the splits half-updated.
split_frames = {}
for split_name in split_names:
    split_frame = pd.read_csv(f"{data_path}/ukb_cancer_{split_name}.csv")
    # The files are overwritten in place below. Dropping a follow-up column left
    # by an earlier run (and tolerating strata columns that are already gone)
    # keeps this cell safe to re-run.
    split_frame = split_frame.drop(columns=['time_to_follow_up'], errors='ignore')
    split_frame = pd.merge(split_frame, follow_up, how='left', on='eid')
    split_frames[split_name] = split_frame.drop(columns=stale_strata_cols, errors='ignore')

df_train, df_valid, df_test = (split_frames[split_name] for split_name in split_names)

for split_name, split_frame in split_frames.items():
    split_frame.to_csv(f"{data_path}/ukb_cancer_{split_name}.csv", index=False)
