In [4]:
### LOAD PACKAGES ###
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.cluster import AgglomerativeClustering
from scipy.stats import kruskal,chisquare
import altair as alt
import ugtm
import textwrap
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy 
from scipy.cluster.hierarchy import dendrogram, linkage
import os
from tqdm.notebook import trange, tqdm
from tqdm.auto import tqdm
from itertools import chain
%matplotlib inline


In [2]:
import warnings

# 忽略所有 SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [3]:
def get_fields(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields) & data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_fields_all(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields)].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_data_fields(fields, data, data_field):
    f = get_fields(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

def get_data_fields_all(fields, data, data_field):
    f = get_fields_all(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

In [4]:
def map_column_to_meaning(df, column_name, data_path, file):
    # 读取数据文件
    coding1001 = pd.read_csv(f"{data_path}/{file}", sep="\t")
    
    # 将 coding 列转换为字符串类型
    coding1001['coding'] = coding1001['coding'].astype('str')
    
    # 将列重命名为指定的 column_name
    coding1001.rename(columns={"coding": column_name}, inplace=True)
    
    # 将 column_name 列转换为字符串类型
    df[column_name] = df[column_name].astype('str')
    
    # 创建 code 到 meaning 的映射字典
    code_to_meaning = dict(zip(coding1001[column_name], coding1001['meaning']))
    
    # 使用映射字典替换 column_name 列的值，并将其转换为分类类型
    df[column_name] = df[column_name].map(code_to_meaning).astype('category')

In [5]:
from datetime import datetime, timedelta

def datetime_from_dec_year(dec_year):
    start = dec_year
    year = int(start)
    rem = start - year

    base = datetime(year, 1, 1)
    result = base + timedelta(seconds=(base.replace(year=base.year + 1) - base).total_seconds() * rem)
    #result.strftime("%Y-%m-%d")
    return result.date()

def extract_map_self_reported(data, data_field, code_map):
    pbar = tqdm(total=16)
    ### codes
    fields = ["20002"]; pbar.update(1)
    raw = get_data_fields_all(fields, data, data_field); pbar.update(1)
    col = "noncancer_illness_code_selfreported_f20002"; pbar.update(1)
    temp = pd.wide_to_long(raw, stubnames=[col], i="eid", j="instance_index", sep="_", suffix="\w+").reset_index(); pbar.update(1)
    codes = temp.rename(columns={col:"code"})\
        .assign(code=lambda x: x.code.astype(str))\
        .replace("None", np.nan) \
        .replace("nan", np.nan) \
        .dropna(subset=["code"], axis=0)\
        .assign(code=lambda x: x.code.astype(int)) \
        .merge(code_map, how="left",on="code") \
        .sort_values(["eid", "instance_index"]) \
        .reset_index(drop=True); pbar.update(1)
    
    ### dates
    fields = ["20008"]; pbar.update(1)
    raw = get_data_fields_all(fields, data, data_field); pbar.update(1)
    col="interpolated_year_when_noncancer_illness_first_diagnosed_f20008"; pbar.update(1)
    temp = pd.wide_to_long(raw, stubnames=[col], i="eid", j="instance_index", sep="_", suffix="\w+").reset_index(); pbar.update(1)
    dates = temp.rename(columns={col:"date"})\
        .dropna(subset=["date"], axis=0)\
        .sort_values(["eid", "instance_index"]) \
        .reset_index(drop=True); pbar.update(1)

    dates = dates[dates.date!=-1]; pbar.update(1)
    dates = dates[dates.date!=-3]; pbar.update(1)
    dates.date = dates.date.apply(datetime_from_dec_year); pbar.update(1)
    
    test = codes.merge(dates, how="left", on=["eid", "instance_index"]).assign(origin="self_reported").copy(); pbar.update(1)
    
    test["instance_index"] = test["instance_index"].astype("string"); pbar.update(1)
    test[['instance','n']] = test.instance_index.str.split("_",expand=True); pbar.update(1)
    pbar.close()
    
    return test[["eid", "origin", 'instance','n', "code", "meaning", "date"]]

## Get the path and data for this study

In [6]:
data_geno_path = "D:/UKBiobank/Geno"
data_base_path = "D:/UKBiobank/brain/2_datasets_pre"
save_data_path = "D:/UKBiobank/ML_Stroke/Data"
data_sup1_path = "D:/UKBiobank/Sup1"
data_sup4_path = "D:/UKBiobank/Sup4"
data_sup5_path = "D:/UKBiobank/Sup5"
data_exam_path = "D:/UKBiobank/Exam"
data_path = "D:/UKBiobank/Green"

In [7]:
data_geno = pd.read_feather(f"{data_geno_path}/ukb_data.feather")
data_geno_field = pd.read_feather(f"{data_geno_path}/ukb_data_field.feather")
data_geno_columns = data_geno.columns.to_list()

In [8]:
fields_geno = [
    "22009", # PCA Geno
]
temp_geno = get_data_fields_all(fields_geno, data_geno, data_geno_field)

In [9]:
data = pd.read_feather(f"{data_base_path}/ukb_data.feather")
data_field = pd.read_feather(f"{data_base_path}/ukb_data_field.feather")
data_columns = data.columns.to_list()
data_sup4 = pd.read_feather(f"{data_sup4_path}/ukb_data.feather")
data_sup4_field = pd.read_feather(f"{data_sup4_path}/ukb_data_field.feather")
data_sup4_columns = data_sup4.columns.to_list()
data_sup5 = pd.read_feather(f"{data_sup5_path}/ukb_data.feather")
data_sup5_field = pd.read_feather(f"{data_sup5_path}/ukb_data_field.feather")
data_sup5_columns = data_sup5.columns.to_list()
data_sup_1 = pd.read_feather(f"{data_sup1_path}/ukb_data.feather")
data_field_sup_1 = pd.read_feather(f"{data_sup1_path}/ukb_data_field.feather")
data_sup_1_columns = data_sup_1.columns.to_list()
data_cmr = pd.read_feather(f"{data_exam_path}/ukb_data.feather")
data_cmr_field = pd.read_feather(f"{data_exam_path}/ukb_data_field.feather")
data_cmr_columns = data_cmr.columns.to_list()


In [40]:
fields_blood = [
    "30030",	# Haematocrit percentage
    "30070",	# Red blood cell (erythrocyte) distribution width
    "30080",	# Platelet count
    "30100",	# Mean platelet (thrombocyte) volume
    "30110",	# Platelet distribution width
    "30140",	# Neutrophill count
    "30180",	# Lymphocyte percentage
    "30190",	# Monocyte percentage
    "30600",	# Albumin
    "30620",	# Alanine aminotransferase
    "30660",	# Direct bilirubin
    "30690",	# Cholesterol
    "30700",	# Creatinine
    "30710",	# C-reactive protein
    "30730",	# Gamma glutamyltransferase
    "30740",	# Glucose
    "30750",	# Glycated haemoglobin (HbA1c)
    "30760",	# HDL cholesterol
    "30870",	# Triglycerides
    "30880",	# Urate
    "30720",	# Cystatin C
    "30790",	# Lipoprotein A
    "30780",	# LDL direct
    "30650",	# Aspartate aminotransferase
    "30830",	# SHBG
    "30850",	# Testosterone
    "30000",	# White blood cell (leukocyte) count
]
temp1 = get_data_fields(fields_blood, data, data_field)
temp2 = get_data_fields(fields_blood, data_sup4, data_sup4_field)
temp3 = get_data_fields(fields_blood, data_sup5, data_sup5_field)

In [None]:
temp = temp1.merge(temp2, how="left", on="eid").merge(temp3, how="left", on="eid")
temp.to_feather(os.path.join(save_data_path, 'temp_blood.feather'))

In [41]:
temp = temp_geno.merge(temp1, how="left", on="eid").merge(temp2, how="left", on="eid").merge(temp3, how="left", on="eid")
temp.to_feather(os.path.join(save_data_path, 'temp_geno.feather'))

In [43]:
temp = pd.read_feather(os.path.join(save_data_path, 'temp_geno.feather'))

In [12]:
fields_cmr = [
    "24102",	# LV stroke volume
    "24105",	# LV myocardial mass
    "24100",	# LV end diastolic volume
    "24101",	# LV end systolic volume
    "24103",	# LV ejection fraction
    "24181",	# LV longitudinal strain global
    "24110",	# LA maximum volume
    "24113",	# LA ejection fraction
]

def get_fields_image(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields) & data_field["field.tab"].str.contains("f\\.\\d+\\.2\\.\\d")].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_data_fields_image(fields, data, data_field):
    f = get_fields_image(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

temp_cmr = get_data_fields_image(fields_cmr, data_cmr, data_cmr_field)

temp_cmr.to_feather(os.path.join(save_data_path, 'temp_cmr.feather'))

temp_cmr = pd.read_feather(os.path.join(save_data_path, 'temp_cmr.feather'))


In [13]:
# load data
ukbiobank_variable_definitions = pd.read_csv("D:/UKBiobank/AF_PHENOTYPE_GTM/data/ukbiobank/ukbiobank_variable_definitions.csv")
phecode_df = pd.read_csv("D:/UKBiobank/AF_PHENOTYPE_GTM/data/ukbiobank/phecode_icd10_mappings.csv", encoding="latin_1")
# af_definitions = pd.read_csv("D:/UKBiobank/AF_PHENOTYPE_GTM/data/ukbiobank/af_definitions.csv")

In [14]:
# diagnose information for AF from self report, diagnosis and death record
codes_self_reported = pd.read_feather(os.path.join('D:/UKBiobank/Green/Data/', 'self_reported.feather'))
hes_diagnoses = pd.read_feather(os.path.join('D:/UKBiobank/Green/Data/', 'hes_diagnoses.feather'))
death_codes = pd.read_feather(os.path.join('D:/UKBiobank/Green/Data/', 'death_codes.feather'))
diagnoses_codes = codes_self_reported.append(hes_diagnoses.append(death_codes)).sort_values(["eid", "date"]).dropna(subset=["date"], axis=0).reset_index(drop=True)

  diagnoses_codes = codes_self_reported.append(hes_diagnoses.append(death_codes)).sort_values(["eid", "date"]).dropna(subset=["date"], axis=0).reset_index(drop=True)


In [15]:
diagnoses_codes_unique = diagnoses_codes.drop_duplicates(subset='eid')

## Prepare Stroke diagnosis

In [16]:
time0_col="date_of_attending_assessment_centre_f53_0_0"

In [17]:
from joblib import Parallel, delayed

def had_diagnosis_before_per_ph(df_before, ph, ph_codes, temp):
    df_ph = df_before[df_before.code.astype('str').isin(ph_codes)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(phenotype=True) 
    return temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype

def had_diagnosis_before(data, diagnoses_codes, phenotypes, time0=time0_col):
    diagnoses_codes_time = diagnoses_codes.merge(data[["eid", time0]], how="left", on="eid")
    
    temp = data[["eid"]].copy()
    df_before = diagnoses_codes_time[diagnoses_codes_time.date < diagnoses_codes_time[time0]]
                                                                                         
    df_phs = Parallel(n_jobs=20, require="sharedmem")(delayed(had_diagnosis_before_per_ph)(df_before, ph, phenotypes[ph], temp) for ph in tqdm(list(phenotypes)))
    for ph, df_ph_series in zip(tqdm(list(phenotypes)), df_phs): temp[ph] = df_ph_series
    
    return temp.sort_values("eid")

In [18]:
l10_basic = {
    "ALL Stroke": ['1081', '1086','1491', '1583', 
               '4309', '4319','4349','4369'] + 
            [f"I60{i:01}" for i in range(10)] + 
            [f"I61{i:01}" for i in chain(range(0,7), range(8,10))] +
            [f"I62{i:01}" for i in chain(range(0,2), range(9,10))] +
            [f"I63{i:01}" for i in chain(range(0,7), range(8,10))] +
            ['I64'],
    'Self report - Stroke': ['1081'],
    'Self report - Subarachnoid haemorrhage': ['1086'],
    'Self report - Brain haemorrhage': ['1491'],
    'Self report - Ischaemic stroke': ['1583'],
    'ICD 9 - Subarachnoid haemorrhage': ['4309'],
    'ICD 9 - Intracerebral haemorrhage': ['4319'],
    # 'ICD 9 - Other and unspecified intracranial haemorrhage': ['4320'],
    # 'ICD 9 - Occlusion of cerebral arteries': ['4340', '4341', '4349'],
    'ICD 9 - Occlusion of cerebral arteries': ['4349'],
    'ICD 9 - Acute, but ill-defined, cerebrovascular disease': ['4369'],
    'ICD 10 - Subarachnoid haemorrhage': [f"I60{i:01}" for i in range(10)],
    'ICD 10 - Intracerebral haemorrhage': [f"I61{i:01}" for i in chain(range(0,7), range(8,10))],
    'ICD 10 - Other nontraumatic intracranial haemorrhage': [f"I62{i:01}" for i in chain(range(0,2), range(9,10))],
    'ICD 10 - Cerebral infarction':[f"I63{i:01}" for i in chain(range(0,7), range(8,10))],
    'ICD 10 - Stroke, not specified as haemorrhage or infarction': ['I64']
}

In [7]:
[f"I63{i:01}" for i in chain(range(0,7), range(8,10))]

['I630', 'I631', 'I632', 'I633', 'I634', 'I635', 'I636', 'I638', 'I639']

In [19]:
l10 = {k: v for k, v in l10_basic.items() if len(v)!=0}

In [20]:
basics = pd.read_feather(os.path.join('D:/UKBiobank/Green/Data/', 'temp_basics.feather'))

In [21]:
diagnoses_before = had_diagnosis_before(basics, diagnoses_codes, l10, time0=time0_col)

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [22]:
diagnoses_before['ALL Stroke'].value_counts()

False    494431
True       7780
Name: ALL Stroke, dtype: int64

In [23]:
free_of_stroke = diagnoses_before[diagnoses_before['ALL Stroke'] == False].eid
basics_filtered = basics[basics['eid'].isin(free_of_stroke)].reset_index()

### excluding baseline stroke 7780, 494431  remained

In [24]:
basics_with_follow = basics_filtered[basics_filtered['eid'].isin(diagnoses_codes_unique.eid)].reset_index()

### excluding 20251 without follow up, 474180  remained

In [25]:
from dateutil.relativedelta import relativedelta
import datetime

def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None):
    if level is not None: diagnoses_codes = diagnoses_codes.query("level==@level")
    diagnoses_codes_time0 = diagnoses_codes.merge(data[["eid", time0_col]], how="left", on="eid")
    
    #cens_time_right = max(diagnoses_codes.sort_values('date').groupby('origin').tail(1).date.to_list())
    cens_time_right = datetime.date(2023, 9, 30)
    print(f"t_0: {time0_col}")
    print(f"t_cens: {cens_time_right}")
    
    df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) & 
                                        (diagnoses_codes_time0.date < cens_time_right)]
    
    temp = data[["eid", time0_col]].copy()
    for ph, ph_codes in tqdm(endpoint_list.items()):
        regex = "|".join(ph_codes)
        ph_df = df_interval[df_interval.code.astype('str').str.contains(regex, case=False)] \
            .sort_values('date').groupby('eid').head(1).assign(phenotype=1, date=lambda x: x.date)
        temp_ph = temp.merge(ph_df, how="left", on="eid").fillna('0')
        temp_ph.phenotype = temp_ph.phenotype.astype('int')
        
        temp[ph+"_event"], temp[ph+"_event_date"], temp[ph+"_origin"] = temp_ph.phenotype, temp_ph.date, temp_ph.origin
        
        fill_date = {ph+"_event_date" : lambda x: [cens_time_right if event==0 else event_date for event, event_date in zip(x[ph+"_event"], x[ph+"_event_date"])]}
        calc_tte = {ph+"_event_time" : lambda x: [(event_date-time0).days/365.25  for time0, event_date in zip(x[time0_col], x[ph+"_event_date"])]}
        
        temp = temp.assign(**fill_date).assign(**calc_tte).drop([ph+"_event_date"], axis=1)
        
    temp = temp.drop([time0_col], axis=1)     
    
    return temp.drop_duplicates()

In [26]:
endpoints_diagnoses = extract_endpoints_tte(basics_with_follow, diagnoses_codes, l10_basic, time0_col)

t_0: date_of_attending_assessment_centre_f53_0_0
t_cens: 2023-09-30


  0%|          | 0/14 [00:00<?, ?it/s]

In [27]:
endpoints_diagnoses

Unnamed: 0,eid,ALL Stroke_event,ALL Stroke_origin,ALL Stroke_event_time,Self report - Stroke_event,Self report - Stroke_origin,Self report - Stroke_event_time,Self report - Subarachnoid haemorrhage_event,Self report - Subarachnoid haemorrhage_origin,Self report - Subarachnoid haemorrhage_event_time,...,ICD 10 - Intracerebral haemorrhage_event_time,ICD 10 - Other nontraumatic intracranial haemorrhage_event,ICD 10 - Other nontraumatic intracranial haemorrhage_origin,ICD 10 - Other nontraumatic intracranial haemorrhage_event_time,ICD 10 - Cerebral infarction_event,ICD 10 - Cerebral infarction_origin,ICD 10 - Cerebral infarction_event_time,"ICD 10 - Stroke, not specified as haemorrhage or infarction_event","ICD 10 - Stroke, not specified as haemorrhage or infarction_origin","ICD 10 - Stroke, not specified as haemorrhage or infarction_event_time"
0,1000016,0,0,13.842574,0,0,13.842574,0,0,13.842574,...,13.842574,0,0,13.842574,0,0,13.842574,0,0,13.842574
1,1000029,0,0,15.307324,0,0,15.307324,0,0,15.307324,...,15.307324,0,0,15.307324,0,0,15.307324,0,0,15.307324
2,1000033,0,0,14.995209,0,0,14.995209,0,0,14.995209,...,14.995209,0,0,14.995209,0,0,14.995209,0,0,14.995209
3,1000045,0,0,14.242300,0,0,14.242300,0,0,14.242300,...,14.242300,0,0,14.242300,0,0,14.242300,0,0,14.242300
4,1000051,0,0,14.954141,0,0,14.954141,0,0,14.954141,...,14.954141,0,0,14.954141,0,0,14.954141,0,0,14.954141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474175,6023671,0,0,14.277892,0,0,14.277892,0,0,14.277892,...,14.277892,0,0,14.277892,0,0,14.277892,0,0,14.277892
474176,6023682,0,0,14.220397,0,0,14.220397,0,0,14.220397,...,14.220397,0,0,14.220397,0,0,14.220397,0,0,14.220397
474177,6023690,0,0,13.817933,0,0,13.817933,0,0,13.817933,...,13.817933,0,0,13.817933,0,0,13.817933,0,0,13.817933
474178,6023707,0,0,13.295003,0,0,13.295003,0,0,13.295003,...,13.295003,0,0,13.295003,0,0,13.295003,0,0,13.295003


In [28]:
stroke_event = endpoints_diagnoses[endpoints_diagnoses['ALL Stroke_event']==1].reset_index(drop=True)

In [29]:
stroke_event.to_feather(os.path.join(save_data_path, 'temp_stroke_new.feather'))

In [30]:
from joblib import Parallel, delayed

def had_diagnosis_before_per_ph(df_before, ph, ph_codes, temp):
    df_ph = df_before[df_before.code.astype('str').isin(ph_codes)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(phenotype=True) 
    return temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype

def had_diagnosis_before_image(data, diagnoses_codes, phenotypes, time0=time0_col):
    diagnoses_codes_time = diagnoses_codes.merge(data[["eid", time0]], how="left", on="eid")
    
    temp = data[["eid"]].copy()
    cens_time_right = datetime.date(2014, 1, 1)
    df_before = diagnoses_codes_time[diagnoses_codes_time.date < cens_time_right]
                                                                                         
    df_phs = Parallel(n_jobs=20, require="sharedmem")(delayed(had_diagnosis_before_per_ph)(df_before, ph, phenotypes[ph], temp) for ph in tqdm(list(phenotypes)))
    for ph, df_ph_series in zip(tqdm(list(phenotypes)), df_phs): temp[ph] = df_ph_series
    
    return temp.sort_values("eid")

In [31]:
diagnoses_befor_image = had_diagnosis_before_image(basics, diagnoses_codes, l10, time0=time0_col)

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [32]:
diagnoses_befor_image['ALL Stroke'].value_counts()

False    491366
True      10845
Name: ALL Stroke, dtype: int64

In [33]:
diagnoses_befor_image.to_feather(os.path.join(save_data_path, 'diagnoses_before_image.feather'))

In [34]:
free_of_stroke_image = diagnoses_befor_image[diagnoses_befor_image['ALL Stroke'] == False].eid
basics_filtered_image = basics[basics['eid'].isin(free_of_stroke_image)].reset_index()

In [35]:
basics_with_follow_image = basics_filtered_image[basics_filtered_image['eid'].isin(diagnoses_codes_unique.eid)].reset_index()
endpoints_diagnoses_image = extract_endpoints_tte(basics_with_follow_image, diagnoses_codes, l10_basic, time0_col)

t_0: date_of_attending_assessment_centre_f53_0_0
t_cens: 2023-09-30


  0%|          | 0/14 [00:00<?, ?it/s]

In [36]:
stroke_event_image = endpoints_diagnoses_image[endpoints_diagnoses_image['ALL Stroke_event']==1].reset_index(drop=True)

In [37]:
stroke_event_image.to_feather(os.path.join(save_data_path, 'temp_stroke_image.feather'))

In [39]:
endpoints_diagnoses_image.to_feather(os.path.join(save_data_path, 'temp_stroke_diagnosis_image.feather'))