In [None]:
import gzip
from tqdm import tqdm
import pandas as pd
import numpy as np

In [None]:
# Accumulator filled by the parsing loop: patient id -> list of column values.
patients = {}

# Family-history fields. The parsing loop sets the "family history" flag when
# one of these fields carries the value "10".
family_history = {
    "20107": "illness of father",
    "20110": "illness of mother",
    "20111": "illness of siblings",
}

# ICD-10 codes that cause the parsing loop to set the "AD" flag.
icd_codes = ["G309", "G301", "G300", "G308", "F000", "F002", "F009", "F001"]

# Diagnosis fields that are searched for any of the codes in `icd_codes`.
ad = {
    "41202": "main ICD10",
    "41204": "secondary ICD10",
    "41270": "Diagnoses - ICD10",
}

# Measurement fields exported as one column each: field id -> column name.
fields = {
    "34": "year of birth",
    "50": "height",
    "189": "townsend deprivation index",
    "3062": "Forced vital capacity",
    "3063": "FEV1",
    "3064": "Peak expiratory flow",
    "3148": "Heel bone mineral density",
    "4079": "Diastolic blood pressure",
    "4080": "Systolic blood pressure",
    "21001": "BMI",
    "21002": "weight",
    "30000": "White blood cell (leukocyte) count",
    "30010": "Red blood cell (erythrocyte) count",
    "30020": "Haemoglobin concentration",
    "30030": "Haematocrit percentage",
    "30040": "Mean corpuscular volume",
    "30060": "Mean corpuscular haemoglobin concentration",
    "30080": "Platelet count",
    "30090": "Platelet crit",
    "30100": "Mean platelet (thrombocyte) volume",
    "30120": "Lymphocyte count",
    "30130": "Monocyte count",
    "30140": "Neutrophill count",
    "30150": "Eosinophill count",
    "30160": "Basophill count",
    "30250": "Reticulocyte count",
    "30505": "Microalbumin in urine result flag",
    "30515": "Creatinine (enzymatic) in urine result flag",
    "30525": "Potassium in urine result flag",
    "30535": "Sodium in urine result flag",
    "30600": "Albumin",
    "30610": "Alkaline phosphatase",
    "30620": "Alanine aminotransferase",
    "30630": "Apolipoprotein A",
    "30640": "Apolipoprotein B",
    "30650": "Aspartate aminotransferase",
    "30660": "Direct bilirubin",
    "30670": "Urea",
    "30680": "Calcium",
    "30690": "Cholesterol",
    "30700": "Creatinine",
    "30710": "C-reactive protein",
    "30720": "Cystatin C",
    "30730": "Gamma glutamyltransferase",
    "30740": "Glucose",
    "30750": "Glycated haemoglobin (HbA1c)",
    "30760": "HDL cholesterol",
    "30770": "IGF-1",
    "30780": "LDL direct",
    "30790": "Lipoprotein A",
    "30800": "Oestradiol",
    "30810": "Phosphate",
    "30820": "Rheumatoid factor",
    "30830": "SHBG",
    "30840": "Total bilirubin",
    "30850": "Testosterone",
    "30860": "Total protein",
    "30870": "Triglycerides",
    "30880": "Urate",
    "30890": "Vitamin D",
}

# Slot order of the measurement columns: every key of `fields` in declaration
# order, followed by "22009" (genetic principal components), which occupies
# the final 40 columns rather than a single one.
cols = list(fields) + ["22009"]

# Header row: the two flags, one name per measurement field, then "gcp 1".."gcp 40".
colnames = [
    "AD",
    "family history",
    *(fields[field_id] for field_id in cols[:-1]),
    *(f"gcp {component}" for component in range(1, 41)),
]

# Pivot the long-format export into one fixed-width list per patient.
#
# Row layout (matches `colnames`):
#   [0]  AD flag
#   [1]  family-history flag
#   [2:] one slot per entry of `cols`; the last entry ("22009") is widened to
#        40 slots, one per principal component.
#
# NOTE(review): each record is presumably (patient id, field id, instance,
# array index, ..., value) — the code only relies on positions 0-3 and the
# last position; confirm against the export format.

# Position of every field in a patient row, computed once instead of calling
# cols.index() for every matching record of the file.
slot_of = {field_id: position + 2 for position, field_id in enumerate(cols)}
pc_first_slot = slot_of["22009"]
icd_code_set = frozenset(icd_codes)
n_value_slots = len(cols) + 39  # 60 single fields + 40 principal components

with gzip.open('ukbb_tab/ukb47948.long.tsv.gz', 'rb') as f:
    next(f, None)  # skip the header line
    for raw in tqdm(f):
        rec = raw.decode().strip().split("\t")
        pid, field_id, value = rec[0], rec[1], rec[-1]

        row = patients.get(pid)
        if row is None:
            # Unset measurement slots stay as empty strings.
            row = patients[pid] = [False, False] + [""] * n_value_slots

        if field_id in family_history and value == "10":
            row[1] = True
        elif field_id in ad and value in icd_code_set:
            row[0] = True
        elif field_id in fields and rec[2] == "0" and rec[3] == "0":
            # Only the first instance / first array element is kept.
            row[slot_of[field_id]] = value
        elif field_id == "22009":  # "22009": "genetic principal components"
            pc_index = int(rec[3])
            # Components are stored 1-based; anything outside 1..40 would
            # land in a neighbouring column, so it is ignored.
            if 1 <= pc_index <= 40:
                row[pc_first_slot + pc_index - 1] = value

In [None]:
# Assemble the per-patient rows into a DataFrame (index = patient id) and
# convert every column that parses cleanly to a numeric dtype.
# The column-name strings are kept exactly as they are written to the CSV files.
colnames = ["AD", "family history"] + [fields[a] for a in cols[:-1]] + [f"genotype principle component {i}" for i in range(1,41)]
pts = pd.DataFrame.from_dict(patients, orient='index', columns=colnames)


def _to_numeric_or_keep(column):
    """Return `column` converted by pd.to_numeric, or unchanged if it cannot be parsed.

    Explicit replacement for `pd.to_numeric(..., errors='ignore')`, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


pts1 = pts.apply(_to_numeric_or_keep)

In [None]:
# Preview the first rows of the converted patient table.
pts1.head()

In [None]:
# Shape (rows, columns) of the rows whose "AD" flag is set.
pts1.loc[pts1["AD"].eq(True)].shape

In [None]:
pd.options.display.max_rows = 999
# Columns with fewer than 1% missing values.
# Select with the boolean mask; taking `.index` of the mask itself would list
# every column regardless of the threshold.
missing_frac = pts1.isna().mean(axis=0)
missing_frac[missing_frac < 0.01].index

In [None]:
# Drop rows that are missing a value in any nearly complete column
# (columns with fewer than 1% missing values). The mask is applied before
# taking `.index`; otherwise every column would be passed as `subset`.
_missing_frac = pts1.isna().mean(axis=0)
pts2 = pts1.dropna(subset=_missing_frac[_missing_frac < 0.01].index.tolist())

In [None]:
# Shape (rows, columns) of pts2, i.e. what remains after the dropna step.
pts2.shape

In [None]:
# Fraction of missing values in each column of pts1.
pts1.isna().mean(axis="index")

In [None]:
# Build a smaller table: every AD case plus a reproducible random sample of controls.
N_CONTROLS = 50000  # upper bound on the number of sampled controls

true_idx = list(pts1[pts1["AD"]==True].index)
control_idx = pts1[pts1["AD"]==False].index

np.random.seed(1)  # fixed seed so the same controls are drawn on every run
# Cap the sample size: sampling without replacement raises ValueError when
# more items are requested than are available.
false_idx_subset = list(
    np.random.choice(control_idx, size=min(N_CONTROLS, len(control_idx)), replace=False)
)
pts_subset = pts1.loc[true_idx + false_idx_subset]

In [None]:
# Build a ydata-profiling report object over the full patient table (pts1).
# NOTE(review): profiling every row and column may be slow on a large table — confirm this is intended.
from ydata_profiling import ProfileReport
profile = ProfileReport(pts1, title="patients")

In [None]:
# Write the report currently held in `profile` to your_report.html (relative path).
profile.to_file("your_report.html")

In [None]:
# Save the full table and the case/control subset as CSV files (relative paths).
for _frame, _csv_path in (
    (pts1, "AD_patients.csv"),
    (pts_subset, "AD_patients_subset.csv"),
):
    _frame.to_csv(_csv_path)

In [None]:
# Profile the case/control subset and write it to its own report file.
# The import repeats the one in the earlier profiling cell, so this cell does
# not depend on that cell having been run.
# Note: `profile` is rebound here, replacing the report object built from pts1.
from ydata_profiling import ProfileReport
profile = ProfileReport(pts_subset, title="patients")
profile.to_file("your_report_subset.html")