In [1]:
import pandas as pd
import numpy as np
data_dir = "../data/"

In [8]:
# Load the two cancer-registry exports (cancer type and diagnosis date per slot).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype warnings on wide exports.
df_diag1 = pd.read_csv(f"{data_dir}cancer_type_and_date_participant.csv", low_memory=False)
df_diag2 = pd.read_csv(f"{data_dir}cancer_type_and_date_2_participant.csv", low_memory=False)

# Join on the participant id instead of gluing the frames side by side: a
# positional concat silently pairs the wrong participants whenever the two
# exports differ in row order or length. validate= raises if either file
# repeats an eid. Row order of the result is not guaranteed to match the files;
# everything downstream joins on eid, so that does not matter.
df_diag = df_diag1.merge(df_diag2, on="eid", how="outer", validate="one_to_one")

  df_diag2 = pd.read_csv(f"{data_dir}cancer_type_and_date_2_participant.csv")


In [9]:
# Swap the raw UKB field prefixes in every column name for readable labels
# (the instance suffix such as "_i0" is left in place).
df_diag = df_diag.rename(
    columns=lambda name: name
    .replace("p40005", "Date of cancer diagnosis")
    .replace("p40006", "Type of cancer: ICD10")
)

In [10]:
# Group ICD: split each slot's "CODE name" string into code and name, then split
# the code at the first dot into its leading block and the remainder.
N_SLOTS = 22  # diagnosis slots _i0 to _i21
for i in range(N_SLOTS):
    # split() without expand=True yields one list per row, so .str[0] / .str[1]
    # always give exactly two pieces (missing piece -> NaN). With expand=True the
    # result has a single column when no value in the slot contains the
    # separator, and the two-column assignment then raises ValueError.
    code_and_name = df_diag[f"Type of cancer: ICD10_i{i}"].str.split(" ", n=1)
    df_diag[f"diag_{i}_icd10_code"] = code_and_name.str[0]
    df_diag[f"diag_{i}_icd10_name"] = code_and_name.str[1]

    block_and_decimal = df_diag[f"diag_{i}_icd10_code"].str.split(".", n=1)
    df_diag[f"diag_{i}_icd10_grouped"] = block_and_decimal.str[0]
    df_diag[f"diag_{i}_icd10_decimal"] = block_and_decimal.str[1]

## Exploration of cancer groups

In [35]:
# Every per-slot column that holds the leading ICD-10 block
diag_cols = [
    col for col in df_diag.columns
    if col.startswith("diag_") and "icd10_grouped" in col
]

# Flatten row by row, discard missing entries, keep values in first-seen order
flat_groups = pd.Series(df_diag[diag_cols].to_numpy().ravel())
all_icd10_groups = flat_groups.dropna().unique()

print(f"{len(all_icd10_groups)} unique ICD10 groups")
all_icd10_groups

array(['C44', 'C71', 'C73', 'C34', 'D06', 'D09', 'C67', 'C80', 'C50',
       'D04', 'C20', 'C65', 'C22', 'C54', 'C18', 'C62', 'C19', 'C91',
       'C43', 'D39', 'C84', 'C61', 'D05', 'D41', 'C15', 'C53', 'D34',
       'D47', 'C13', 'C64', 'C85', 'C96', 'C56', 'C90', 'C37', 'D07',
       'D48', 'D03', 'C09', 'D01', 'C81', 'C16', 'C21', 'C92', 'C25',
       'C78', 'C82', 'D32', 'C45', 'C00', 'D11', 'C17', 'C07', 'C83',
       'C77', 'C69', 'D45', 'C23', 'D33', 'C68', 'C32', 'C48', 'C04',
       'D00', 'C03', 'D02', 'C63', 'C57', 'C88', 'C55', 'C02', 'C38',
       'D43', 'D35', 'D46', 'C05', 'C49', 'C01', 'C24', 'C14', 'C51',
       'D37', 'D40', 'C66', 'C41', 'C12', 'C75', 'C08', 'C06', 'D42',
       'C31', 'C94', 'C70', 'C40', 'O01', 'D44', 'C30', 'C74', 'C52',
       'C72', 'C76', 'C11', 'C79', 'C60', 'D18', 'C10', 'D38', 'C93',
       'C26', 'C46', 'C33', 'C58', 'C42', 'C47', 'C95', 'D21', 'D27',
       'D30', 'D12', 'D16', 'C86', 'C97', 'D36', 'D15', 'D29', 'C39',
       'D10', 'D13']

In [8]:
# Inclusive (first, last, label) number ranges for codes starting with "C".
# Numbers not covered by any range (27-29, 42, 59, 98, 99) map to "other".
_C_BLOCK_GROUPS = (
    (0, 14, "head_neck"),
    (15, 15, "esophageal"),
    (16, 16, "gastric"),
    (17, 17, "small_bowel"),
    (18, 21, "colorectal"),
    (22, 22, "liver"),
    (23, 24, "biliary"),
    (25, 25, "pancreatic"),
    (26, 26, "gi_other"),
    (30, 39, "lung_thoracic"),
    (40, 41, "bone"),
    (43, 44, "skin"),
    (45, 49, "soft_tissue"),
    (50, 50, "breast"),
    (51, 58, "gynecologic"),
    (60, 63, "male_genital"),
    (64, 68, "urinary"),
    (69, 72, "brain_cns"),
    (73, 75, "endocrine"),
    (76, 80, "unknown_primary"),
    (81, 96, "hematologic"),
    (97, 97, "multiple_cancer"),
)

# Inclusive (first, last, label) number ranges for codes starting with "D".
_D_BLOCK_GROUPS = (
    (0, 9, "in_situ"),
    (10, 36, "benign"),
    (37, 48, "uncertain"),
)


def map_icd10_group(code):
    """Map an ICD-10 code (or its 3-character block) to a coarse group label.

    Parameters
    ----------
    code : str or missing value
        A code such as ``"C50"`` or ``"C50.9"``. Only the first three
        characters are used. Surrounding whitespace and letter case are ignored.

    Returns
    -------
    str or float
        ``np.nan`` when ``code`` is missing; otherwise one of the group labels
        from the range tables, ``"gestational"`` for block ``O01``, or
        ``"other"`` for anything else, including malformed codes.
    """
    if pd.isna(code):
        return np.nan

    block = str(code).strip().upper()[:3]  # e.g. C50, D05, C34

    # Require one leading character followed by exactly two ASCII digits.
    # Without this check int() raises on codes like "Cxx", and a truncated
    # code such as "C5" would be parsed as 5 and mislabelled as head_neck.
    digits = block[1:]
    if len(block) != 3 or not (digits.isascii() and digits.isdigit()):
        return "other"

    chapter = block[0]
    number = int(digits)

    ranges = {"C": _C_BLOCK_GROUPS, "D": _D_BLOCK_GROUPS}.get(chapter, ())
    for first, last, label in ranges:
        if first <= number <= last:
            return label

    if block == "O01":
        return "gestational"

    return "other"


In [9]:
# Derive a coarse group label for each of the 22 diagnosis slots.
slot_columns = [
    (f"diag_{slot}_icd10_grouped", f"Type_of_cancer_i{slot}")
    for slot in range(22)
]
for grouped_col, label_col in slot_columns:
    df_diag[label_col] = df_diag[grouped_col].map(map_icd10_group)

In [10]:
# All per-slot group-label columns
type_cols = list(df_diag.filter(regex=r"^Type_of_cancer_i").columns)

# Pool every slot into one series, drop the empty slots, tally the labels
pooled_labels = pd.Series(df_diag[type_cols].to_numpy().reshape(-1))
cancer_type_counts = pooled_labels.dropna().value_counts()

cancer_type_counts

skin               46199
breast             20801
male_genital       16763
in_situ            16141
colorectal          9925
hematologic         7811
lung_thoracic       6212
gynecologic         5782
urinary             4688
uncertain           4097
head_neck           1889
benign              1677
pancreatic          1648
esophageal          1394
brain_cns           1348
soft_tissue         1270
unknown_primary     1052
gastric             1035
endocrine            999
liver                810
biliary              430
small_bowel          398
bone                 125
other                 46
gi_other              45
gestational           38
multiple_cancer        1
dtype: int64

## Add skin cancer columns

In [2]:
def rename_columns(df, field_dict):
    """Rename UK Biobank field columns (e.g. ``p53_i0``) to their field titles.

    The instance/array suffix (everything from the first underscore) is
    dropped, so several instances of one field end up with the same title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are ``eid`` plus ``p<field_id>[_suffix]`` names.
        Columns that do not follow that pattern are left unchanged.
    field_dict : dict
        Maps integer field id to the title used as the new column name.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a ``p<field_id>`` column refers to an id missing from ``field_dict``.
    """
    titles = {}
    for col in df.columns:
        if col == "eid" or not isinstance(col, str) or not col.startswith("p"):
            continue
        # "p53_i0" -> "53": take what sits between the leading "p" and the
        # first underscore, anchored at the start of the name so that a "p"
        # elsewhere in the name cannot be picked up.
        field_id = col[1:].split("_", 1)[0]
        if not field_id.isdecimal():
            continue
        if int(field_id) not in field_dict:
            raise KeyError(f"column {col!r}: field id {field_id} not found in field_dict")
        titles[col] = field_dict[int(field_id)]

    # One rename for the whole frame instead of one full-frame copy per column.
    return df.rename(columns=titles)


# Field and category lookup tables (tab-separated)
field = pd.read_table("../data/field.tsv")
field_dict = {
    field_id: title
    for field_id, title in zip(field["field_id"], field["title"])
}
category = pd.read_table("../data/category.tsv")

In [11]:
data_path = "/orcd/pool/003/dbertsim_shared/ukb"

# Timestamp export with field ids replaced by titles
time_stamps_raw = pd.read_csv("../data/time_stamps_participant.csv")
df_lab_time = rename_columns(time_stamps_raw, field_dict)

# Attach each participant's assessment-centre date to the diagnosis frame
assessment_dates = df_lab_time[["eid", "Date of attending assessment centre"]]
df = df_diag.merge(assessment_dates, how="left", on="eid")

In [None]:
# 1) assessment date
df["assessment_date"] = pd.to_datetime(df["Date of attending assessment centre"], errors="coerce")

# 2) build long table for the 22 diagnosis slots (one row per participant x slot)
icd_cols  = [f"diag_{i}_icd10_grouped" for i in range(22)]
date_cols = [f"Date of cancer diagnosis_i{i}" for i in range(22)]  # adjust if your date col name differs

# np.repeat gives each eid 22 times in a row, matching the row-major ravel()
# of the 22 slot columns.
long = pd.DataFrame({
    "eid": np.repeat(df_diag["eid"].values, 22),
    "icd10": df_diag[icd_cols].to_numpy().ravel(),
    "dx_date": df_diag[date_cols].to_numpy().ravel(),
})

long["dx_date"] = pd.to_datetime(long["dx_date"], errors="coerce")

# 3) keep only skin cancer ICD blocks (melanoma + non-melanoma)
skin_codes = {"C43", "C44"}   # add "D03","D04" if you want in-situ too
skin = long[long["icd10"].isin(skin_codes) & long["dx_date"].notna()].copy()

# 4) earliest skin dx per patient
skin_first = skin.groupby("eid", as_index=False)["dx_date"].min()

# 5) join assessment date and compute years from assessment to diagnosis
skin_first = skin_first.merge(df[["eid", "assessment_date"]], on="eid", how="left")
skin_first["skin_time_to_diagnosis"] = (skin_first["dx_date"] - skin_first["assessment_date"]).dt.days / 365.25

# 6) baseline-present flag with 30-day buffer: diagnosed before, or within
#    30 days after, the assessment date. The flag and the reset below must use
#    the column created in step 5; a different column name here raises KeyError.
buffer_years = 30 / 365.25
skin_first["skin_cancer"] = (skin_first["skin_time_to_diagnosis"] <= buffer_years).astype(int)
skin_first.loc[skin_first["skin_cancer"] == 1, "skin_time_to_diagnosis"] = 0.0

# 7) merge onto df
df_out = df.merge(skin_first[["eid", "skin_cancer", "skin_time_to_diagnosis"]], on="eid", how="left")

# fill no-records: participants without any skin diagnosis get flag 0
df_out["skin_cancer"] = df_out["skin_cancer"].fillna(0).astype(int)


In [43]:
# Attach the skin-cancer columns to every cohort split and write one
# "<split>_with_skin" file each. All three files are read back afterwards, so
# all three are produced here rather than editing COHORT and re-running by hand.
COHORTS = ("train", "valid", "test")
skin_columns = df_out[["eid", "skin_cancer", "skin_time_to_diagnosis"]]

# The loop variable keeps the name COHORT, and "test" comes last, so COHORT,
# df_cohort and df_merged end up holding the test split.
for COHORT in COHORTS:
    df_cohort = pd.read_csv(f"{data_path}/ukb_cancer_{COHORT}.csv")
    df_merged = pd.merge(df_cohort, skin_columns, how="left", on="eid")
    df_merged.to_csv(f"{data_path}/ukb_cancer_{COHORT}_with_skin.csv", index=False)

In [46]:
# Read back the three splits that now carry the skin-cancer columns
df_train, df_valid, df_test = (
    pd.read_csv(f"{data_path}/ukb_cancer_{split}_with_skin.csv")
    for split in ("train", "valid", "test")
)

In [55]:
# Pool the three splits under their own name. Rebinding `df` here would replace
# the diagnosis frame (with "Date of attending assessment centre") that the
# metadata section further down still reads from `df`.
df_all_cohorts = pd.concat([df_train, df_valid, df_test])

# Keep only participants that have a cancer time-to-diagnosis value
df_cancer = df_all_cohorts.loc[df_all_cohorts['cancer_time_to_diagnosis'].notna()]

In [56]:
outcomes = ["breast", "prostate", "lung", "colorectal", "bladder", "pancreatic"]
time_cols = [outcome + "_time_to_diagnosis" for outcome in outcomes]

# A participant counts when at least one of the listed time-to-diagnosis columns
# is filled in. The printed value is the mean of the 0/1 flag, i.e. a proportion.
none_of_listed = df_cancer[time_cols].isna().all(axis=1)
any_cancer = (~none_of_listed).astype(int)
print(f"Number of patients with the top six cancers: {any_cancer.mean()}")

Number of patients with the top six cancers: 0.41267797637079673


In [57]:
outcomes = ["breast", "prostate", "lung", "colorectal", "bladder", "pancreatic", "skin"]
time_cols = [outcome + "_time_to_diagnosis" for outcome in outcomes]

# Same flag as for the six-cancer list, now with skin included. The printed
# value is the mean of the 0/1 flag, i.e. a proportion.
none_of_listed = df_cancer[time_cols].isna().all(axis=1)
any_cancer = (~none_of_listed).astype(int)
print(f"Number of patients with the top seven (+skin) cancers: {any_cancer.mean()}")

Number of patients with the top seven (+skin) cancers: 0.7048621629809149


## Create cancer times metadata dataframe

In [28]:
# NOTE(review): this cell expects `df` to be the diagnosis frame merged with
# "Date of attending assessment centre", and `df_diag` to hold the per-slot
# ICD-10 / date columns -- confirm neither name has been rebound before running.

# -----------------------------
# 1) Assessment date
# -----------------------------
df["assessment_date"] = pd.to_datetime(
    df["Date of attending assessment centre"], errors="coerce"
)

# -----------------------------
# 2) Long table for 22 diagnosis slots
# -----------------------------
N_SLOTS = 22
icd_cols  = [f"diag_{i}_icd10_grouped" for i in range(N_SLOTS)]
date_cols = [f"Date of cancer diagnosis_i{i}" for i in range(N_SLOTS)]

# np.repeat gives each eid N_SLOTS times in a row, matching the row-major
# ravel() of the slot columns.
long = pd.DataFrame({
    "eid": np.repeat(df_diag["eid"].values, N_SLOTS),
    "icd10": df_diag[icd_cols].to_numpy().ravel(),
    "dx_date": df_diag[date_cols].to_numpy().ravel(),
})

long["dx_date"] = pd.to_datetime(long["dx_date"], errors="coerce")

# -----------------------------
# 3) ICD -> cancer map
# -----------------------------
CANCER_ICD_MAP = {
    "skin": {"C43", "C44"},          # add D03/D04 if desired
    "breast": {"C50", "D05"},
    "prostate": {"C61"},
    # "lung": {"C34"},
    # "colorectal": {"C18", "C20"},
    "liver": {"C22"},
    "pancreatic": {"C25"},
    "bladder": {"C67"},
    "trachea_bronchus_lung": {"C33", "C34"},
    "colon_rectum": {"C18", "C19", "C20"},
    "stomach": {"C16"}
}

# Invert to ICD block -> cancer name for a direct Series.map lookup
icd_to_type = {
    icd: cancer
    for cancer, icds in CANCER_ICD_MAP.items()
    for icd in icds
}

long["cancer_type"] = long["icd10"].map(icd_to_type)

# Keep only rows with a mapped cancer type and a parseable diagnosis date
long_cancer = long[
    long["cancer_type"].notna() & long["dx_date"].notna()
].copy()

# -----------------------------
# 4) Earliest diagnosis per (eid, cancer_type)
# -----------------------------
first_dx = (
    long_cancer
    .groupby(["eid", "cancer_type"], as_index=False)
    .agg(diagnosis_date=("dx_date", "min"))
)

# -----------------------------
# 5) Time-to-dx + baseline-present flag
# -----------------------------
first_dx = first_dx.merge(
    df[["eid", "assessment_date"]],
    on="eid",
    how="left"
)

# Years from assessment to first diagnosis (negative if diagnosed earlier)
first_dx["time_to_diagnosis"] = (
    (first_dx["diagnosis_date"] - first_dx["assessment_date"])
    .dt.days / 365.25
)

# Diagnosed before, or within 30 days after, assessment counts as present at
# baseline; those rows get a time-to-diagnosis of 0.
buffer_years = 30 / 365.25
first_dx["baseline_present"] = (
    first_dx["time_to_diagnosis"] <= buffer_years
).astype(int)

first_dx.loc[
    first_dx["baseline_present"] == 1,
    "time_to_diagnosis"
] = 0.0

# -----------------------------
# 6) Wide format (adds diagnosis date too)
# -----------------------------
wide = first_dx.pivot(
    index="eid",
    columns="cancer_type",
    values=["baseline_present", "time_to_diagnosis", "diagnosis_date"]
)

# Flatten the (metric, cancer) column pairs to "<cancer>_<metric>", with the
# baseline flag named "<cancer>_cancer".
wide.columns = [
    f"{cancer}_{'cancer' if metric=='baseline_present' else metric}"
    for metric, cancer in wide.columns
]
wide = wide.reset_index()

# -----------------------------
# 7) Merge back to main df
# -----------------------------
df_out = df.merge(wide, on="eid", how="left")

# fill flags for people with no diagnosis records
for cancer in CANCER_ICD_MAP:
    flag_col = f"{cancer}_cancer"
    if flag_col not in df_out.columns:
        # No record of this cancer type in the data, so the pivot produced no
        # columns for it. Create them empty so the output schema does not
        # depend on the data (DataFrame.get would return None here and the
        # chained .fillna would raise AttributeError).
        df_out[flag_col] = 0
        df_out[f"{cancer}_time_to_diagnosis"] = np.nan
        df_out[f"{cancer}_diagnosis_date"] = pd.NaT
    df_out[flag_col] = df_out[flag_col].fillna(0).astype(int)


In [14]:
# Columns that were only needed to derive the per-cancer outputs
diag_derived_cols = [name for name in df_out.columns if name.startswith("diag_")]
raw_type_cols = [name for name in df_out.columns if name.startswith("Type of")]
# NOTE(review): the "Type_of_cancer_i*" columns (underscores) do not match the
# "Type of" prefix and are kept -- confirm that is intended.
drop_cols = [
    "Date of attending assessment centre",
    *diag_derived_cols,
    *date_cols,
    *raw_type_cols,
]

df_out = df_out.drop(columns=drop_cols)

In [42]:
# df_out.to_csv(f"{data_path}/ukb_cancer_time_metadata.csv", index=False)

In [15]:
# Participants present in the cleaned blood-protein cancer table
protein_table_path = "../data/blood_protein_cancers_clean.csv"
df_cancer = pd.read_csv(protein_table_path)
eids = {eid for eid in df_cancer["eid"]}

  df_cancer = pd.read_csv("../data/blood_protein_cancers_clean.csv")


In [16]:
# Restrict the metadata to participants whose eid is in `eids`
in_protein_cohort = df_out["eid"].isin(eids)
df_out = df_out[in_protein_cohort]

In [26]:
# Baseline-flag distribution (0/1 counts) for each reported cancer type
reported_cancers = (
    "skin", "prostate", "breast", "bladder",
    "trachea_bronchus_lung", "colon_rectum", "stomach",
)
for cancer in reported_cancers:
    flag_counts = df_out[f"{cancer}_cancer"].value_counts()
    print(flag_counts)

0    51719
1     1276
Name: skin_cancer, dtype: int64
0    52648
1      347
Name: prostate_cancer, dtype: int64
0    51989
1     1006
Name: breast_cancer, dtype: int64
0    52950
1       45
Name: bladder_cancer, dtype: int64
0    52950
1       45
Name: trachea_bronchus_lung_cancer, dtype: int64
0    52749
1      246
Name: colon_rectum_cancer, dtype: int64
0    52978
1       17
Name: stomach_cancer, dtype: int64


In [24]:
# Number of participants per cancer type with 0 < time-to-diagnosis <= 1 year
reported_cancers = (
    "skin", "prostate", "breast", "bladder",
    "trachea_bronchus_lung", "colon_rectum", "stomach",
)
for cancer in reported_cancers:
    years_to_dx = df_out[f"{cancer}_time_to_diagnosis"]
    within_first_year = years_to_dx.gt(0) & years_to_dx.le(1)
    print(within_first_year.sum())

157
79
87
9
30
43
3
