# Preprocess tabular data 
Adapt from https://github.com/paulhager/MMCL-Tabular-Imaging/blob/main/data/create_cardiac_tabular_dataset.ipynb

In [1]:
import os
import csv
from os.path import join
import re
import random
import multiprocessing as mp
from glob import glob

import numpy as np
import torch
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import seaborn as sns
import nibabel as nib
from matplotlib import pyplot as plt
from torchvision import transforms
from tqdm import tqdm

from typing import List, Union
import operator

import sys
sys.path.append("../../")
# from utils.tabular_utils import *
from tabular_utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.options.display.max_columns = 700

# Set Paths

**IMPORTANT:** Set the correct base path to the data folder containing the raw data here. Set SAVE to True if you want to load the full data from scratch and save all the generated tables. Leave it to False if you are using files already created and just want to see what was done to the data.

In [3]:
BASE_PATH = '/bigdata/siyi/data/UKBB/features'
SUBJECT_DATA =  '/vol/biodata/data/biobank/18545/data'                           
RAW_DATA_PATH = '/vol/biodata/data/biobank/18545/downloaded'          
EXTRACTED_DATA_PATH = ''
DATA_PATH = join(RAW_DATA_PATH,'ukb675732.csv')
CARDIAC_FEATURES_PATH = join(BASE_PATH,'cardiac_features_18545.csv')
CARDIAC_PATIENTS_PATH = join(BASE_PATH,'cardiac_features_18545_imaging.csv')
CLEAN_FEATURES_PATH = join(BASE_PATH,'cardiac_features_clean_18545.csv')
DATADICT_PATH = join(BASE_PATH,'Data_Dictionary_Showcase.csv')
BRIDGE_PATH = join(BASE_PATH,'Bridge_eids_60520_87802.csv')

SAVE = True

# Load Data

In [4]:
datadict_df = pd.read_csv(DATADICT_PATH,quotechar='"',escapechar='\\')
# There are two BMI fields with the same name. One is measured by impedance though (instead of the standard way) and thus geets a different name
datadict_df.loc[datadict_df['FieldID']==23104,'Field']='Body mass index (BMI) Impedance'

In [None]:
datadict_df

Define dtypes of the columns for faster reading and better object types

In [6]:
# Need to use pandas Int64 to represent integers because missing values are floats normally in pandas
datatype_dict = {'Integer':"Int64", 'Categorical single':object, 'Date':str, 'Text':str, 'Continuous':float,
       'Time':str, 'Compound':object, 'Categorical multiple':object}

dtype = {}
dates = []
field_id2name = {}
for indx, row in datadict_df.iterrows():
    baseID = row['FieldID']
    instances = row['Instances']
    array = row['Array']
    field_id2name[baseID] = row['Field']
    for instance in range(instances):
        for arr in range(array):
            ID = '{}-{}.{}'.format(baseID,instance,arr)
            value_type = row['ValueType']
            if value_type == 'Time' or value_type == 'Date':
                dates.append(ID)
            dt = datatype_dict[value_type]
            if baseID==46:
                dt = float
            dtype[ID] = dt

In [7]:
print(len(dtype))

32647


In [8]:
def read_csv(filename):
        return pd.read_csv(filename,header=None)

def multithread_read(glob_str: str) -> pd.DataFrame:
        files = glob(glob_str)
        files.sort()
        print(files)
        threads = len(files)
        with mp.Pool(processes=threads) as pool:
                df_list = pool.map(read_csv,files)
        final_frame = pd.concat(df_list,ignore_index=True)
                
        print (f"There are {len(final_frame)} rows of data")
        return final_frame

def multi_merge(glob_str: str) -> pd.DataFrame:
        files = glob(glob_str)
        files.sort()
        print(files)
        merged_df = pd.read_csv(files[0])
        print(len(merged_df), files[0])
        for file in tqdm(files[1:]):
                df = pd.read_csv(file)
                print(len(df), file)
                merged_df = pd.merge(merged_df,df,on='eid')
        print (f"There are {len(merged_df)} rows of data")
        print (f"There are {len(merged_df.columns)} columns of data")
        return merged_df

# Dataset Management

## Cardiac Features

# Pre-proceed

In [None]:
data_df = pd.read_csv(CARDIAC_FEATURES_PATH)
rename(data_df=data_df, datadict_df=datadict_df)

In [None]:
data_df

0=Initial visit. 1=Followup. 2=Imaging. 3=Imaging followup.

Most important and the one that will be used downstream is 2 as it coincides with images. If 2 is not populated for a patient, initial visit (0) will be used.

Many numeric fields have some faulty entries. Invalid data is coerced to NA.

In [None]:
data_df_coverage = data_df.notna().sum()/len(data_df)*100
data_df_coverage

In [13]:
print(len(set(data_df['eid'])))
# print(len(set(data_df['eid_old'])))

502465


### Alcohol Intake Frequency

Field 6 is never

In [None]:
field_id = 'Alcohol intake frequency.-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

-3 is "Prefer not to answer". Replace with NA

In [16]:
data_df.loc[data_df[field_id]==-3, field_id] = pd.NA

In [None]:
plot_hist(data_df=data_df, field_id=field_id)

### Pulse Wave Arterial Stiffness Index

In [None]:
field_id = 'Pulse wave Arterial Stiffness index-2.0'
plot_hist(data_df=data_df, field_id=field_id)
notna_rows = check_coverage(data_df=data_df, field_ids=[field_id])
print(f'Mean: {data_df[field_id].mean()}, Std: {data_df[field_id].std()}')

In [None]:
vals = grab_sorted_values(data_df=data_df, field_id=field_id)
print(vals[:10])

In [30]:
remove_outliers(data_df=data_df, field_id=field_id, limit=50, greater=True)

In [None]:
plot_hist(data_df=data_df, field_id=field_id)

### Sex

In [None]:
field_id = 'Sex-0.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

### Age

In [None]:
field_id = 'Age when attended assessment centre-2.0'
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# Check differences between imaging and initial visit.
age_diff = data_df['Age when attended assessment centre-2.0']-data_df['Age when attended assessment centre-0.0']
age_diff = age_diff.dropna()
plt.hist(age_diff)

In [None]:
for x in data_df.columns:
    if 'Diastolic blood pressure' in x:
        print(x)

Most seem to have come between 7 and 12 years after initial visit. A lot can change in this time frame so try to avoid stats gathered at initial visit

### [None] Systolic Blood Pressure - Manual Reading

Two blood pressure readings are taken moments apart so each visit's value is averaged to one number before further processing

Blood pressure is very moment specific and thus only the imaging visit values are taken.

As we don't have manual reading pressure in our dataset, we directly use automated reading pressure

In [None]:
field_id = 'Systolic blood pressure, automated reading-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
calc_and_save_mean(data_df=data_df, field_id=field_id)
field_id = 'Systolic blood pressure, automated reading-2.mean'
plot_hist(data_df=data_df, field_id=field_id)
notna_rows = check_coverage(data_df=data_df, field_ids=[field_id])
print(f'Mean: {data_df[field_id].mean()}, Std: {data_df[field_id].std()}')
# field_id = 'Systolic blood pressure, manual reading-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# calc_and_save_mean(data_df=data_df, field_id=field_id)
# field_id = 'Systolic blood pressure, manual reading-2.mean'
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# Examine coverage of multiple readings across subjects
field_ids = ['Systolic blood pressure, automated reading-2.0']
_ = check_coverage(data_df=data_df, field_ids=field_ids)
field_ids = ['Systolic blood pressure, automated reading-2.1']
_ = check_coverage(data_df=data_df, field_ids=field_ids)
field_ids = ['Systolic blood pressure, automated reading-2.0','Systolic blood pressure, automated reading-2.1']
_ = check_coverage(data_df=data_df, field_ids=field_ids)
field_ids = ['Systolic blood pressure, automated reading-2.mean']
_ = check_coverage(data_df=data_df, field_ids=field_ids)
# field_ids = ['Systolic blood pressure, manual reading-2.0']
# _ = check_coverage(data_df=data_df, field_ids=field_ids)
# field_ids = ['Systolic blood pressure, manual reading-2.1']
# _ = check_coverage(data_df=data_df, field_ids=field_ids)
# field_ids = ['Systolic blood pressure, manual reading-2.0','Systolic blood pressure, manual reading-2.1']
# _ = check_coverage(data_df=data_df, field_ids=field_ids)
# field_ids = ['Systolic blood pressure, manual reading-2.mean']
# _ = check_coverage(data_df=data_df, field_ids=field_ids)

In [None]:
# Check differences in mean bp between initial visit and imaging visit. Could be interesting to use this and/or initial visit at later time as additional indication of health over life. Maybe with weighting by time between initial and imaging
field_id = 'Systolic blood pressure, automated reading-0.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
field_id = 'Systolic blood pressure, automated reading-0.0'
calc_and_save_mean(data_df=data_df, field_id=field_id)
field_ids = ['Systolic blood pressure, automated reading-0.0','Systolic blood pressure, automated reading-0.1','Systolic blood pressure, automated reading-2.0','Systolic blood pressure, automated reading-2.1','Systolic blood pressure, automated reading-0.mean','Systolic blood pressure, automated reading-2.mean']
common_ids = check_coverage(data_df=data_df, field_ids=field_ids)
sys_bp_df = data_df[common_ids][field_ids]
sys_bp_df['dif_visit_means'] = sys_bp_df.apply(lambda row: row['Systolic blood pressure, automated reading-2.mean']-row['Systolic blood pressure, automated reading-0.mean'],axis=1)
sys_bp_df['dif_visit_means']
plot_hist(data_df=sys_bp_df,field_id='dif_visit_means')

# field_id = 'Systolic blood pressure, manual reading-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# field_id = 'Systolic blood pressure, manual reading-0.0'
# calc_and_save_mean(data_df=data_df, field_id=field_id)
# field_ids = ['Systolic blood pressure, manual reading-0.0','Systolic blood pressure, manual reading-0.1','Systolic blood pressure, manual reading-2.0','Systolic blood pressure, manual reading-2.1','Systolic blood pressure, manual reading-0.mean','Systolic blood pressure, manual reading-2.mean']
# common_ids = check_coverage(data_df=data_df, field_ids=field_ids)
# sys_bp_df = data_df[common_ids][field_ids]
# sys_bp_df['dif_visit_means'] = sys_bp_df.apply(lambda row: row['Systolic blood pressure, manual reading-2.mean']-row['Systolic blood pressure, manual reading-0.mean'],axis=1)
# sys_bp_df['dif_visit_means']
# plot_hist(data_df=sys_bp_df,field_id='dif_visit_means')

### Diastolic Blood Pressure - Manual Reading

In [None]:
field_id = 'Diastolic blood pressure, automated reading-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
calc_and_save_mean(data_df=data_df, field_id=field_id)
field_id = 'Diastolic blood pressure, automated reading-2.mean'
plot_hist(data_df=data_df, field_id=field_id)

# field_id = 'Diastolic blood pressure, manual reading-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# calc_and_save_mean(data_df=data_df, field_id=field_id)
# field_id = 'Diastolic blood pressure, manual reading-2.mean'
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Pulse Rate (during bp measurement) 
We don't have 'Pulse rate (during blood-pressure measurement)' feature, skip

In [16]:
for x in data_df.columns:
    if 'Pulse rate' in x or 'pulse rate' in x:
        print(x)

Pulse rate, automated reading-0.0
Pulse rate, automated reading-0.1
Pulse rate, automated reading-1.0
Pulse rate, automated reading-1.1
Pulse rate, automated reading-2.0
Pulse rate, automated reading-2.1
Pulse rate, automated reading-3.0
Pulse rate, automated reading-3.1


In [None]:
# field_id = 'Pulse rate (during blood-pressure measurement)-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# calc_and_save_mean(data_df=data_df, field_id=field_id)
# field_id = 'Pulse rate (during blood-pressure measurement)-2.mean'
# plot_hist(data_df=data_df, field_id=field_id)

### Pulse Rate, automated reading

In [None]:
field_id = 'Pulse rate, automated reading-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
calc_and_save_mean(data_df=data_df, field_id=field_id)
field_id = 'Pulse rate, automated reading-2.mean'
plot_hist(data_df=data_df, field_id=field_id)

### Diastolic Blood Pressure - Automated Reading

In [None]:
field_id = 'Diastolic blood pressure, automated reading-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
calc_and_save_mean(data_df=data_df, field_id=field_id)
field_id = 'Diastolic blood pressure, automated reading-2.mean'
plot_hist(data_df=data_df, field_id=field_id)

### Systolic Blood Pressure - Automated Reading

In [None]:
field_id = 'Systolic blood pressure, automated reading-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)
vals = grab_sorted_values(data_df=data_df, field_id=field_id)
print(vals[:200])

In [None]:
remove_outliers(data_df=data_df, field_id=field_id, limit=500, greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
calc_and_save_mean(data_df=data_df, field_id=field_id)
field_id = 'Systolic blood pressure, automated reading-2.mean'
plot_hist(data_df=data_df, field_id=field_id)

Check relationship between automated and manual readings. Should only use one in final vector

Uncomment if you have two readings

In [None]:
# # Systolic
# field_id_manual = 'Systolic blood pressure, manual reading-2.0'
# field_id_auto = 'Systolic blood pressure, automated reading-2.0'
# field_ids=[field_id_manual,field_id_auto]
# field_id_indices = check_coverage(data_df=data_df, field_ids=field_ids)
# _ = check_coverage(data_df=data_df, field_ids=[field_id_manual])
# _ = check_coverage(data_df=data_df, field_ids=[field_id_auto])

In [None]:
# field_id_manual = 'Systolic blood pressure, manual reading-2.mean'
# field_id_auto = 'Systolic blood pressure, automated reading-2.mean'
# field_ids=[field_id_manual,field_id_auto]
# field_id_indices = check_coverage(data_df=data_df, field_ids=field_ids)
# _ = check_coverage(data_df=data_df, field_ids=[field_id_manual])
# _ = check_coverage(data_df=data_df, field_ids=[field_id_auto])

In [None]:
# field_id_manual = 'Systolic blood pressure, manual reading-2.mean'
# field_id_auto = 'Systolic blood pressure, automated reading-2.mean'
# systolic_bp_df = data_df[~data_df[field_id_manual].isna() & ~data_df[field_id_auto].isna()][[field_id_manual, field_id_auto]]
# dif_id = 'systolic bp auto - manual'
# systolic_bp_df[dif_id] = systolic_bp_df.apply(lambda row: row[field_id_auto]-row[field_id_manual],axis=1)
# plot_hist(data_df=systolic_bp_df,field_id=dif_id)

There is no overlap between manual and auto for a specific reading, but it could be that for the first reading a manual value was entered and for the second an automatic value (or vice-versa). This leads there to be some subjects that have a mean manual reading as well as a mean automatic reading. Since these were taken at two different time points we treat them as the same modality and just take the mean of the two

### Combine manual and automatic

In [None]:
# # Systolic
# field_id_manual = 'Systolic blood pressure, manual reading-2.mean'
# field_id_auto = 'Systolic blood pressure, automated reading-2.mean'
# field_id = 'Systolic blood pressure-2.mean'
# data_df[field_id] = data_df[[field_id_manual,field_id_auto]].mean(axis=1)
# _ = check_coverage(data_df=data_df, field_ids=[field_id])

In [None]:
# We don't have manual feature, thus using automated
# Systolic
field_id = 'Systolic blood pressure-2.mean'
field_id_auto = 'Systolic blood pressure, automated reading-2.mean'
data_df[field_id] = data_df[field_id_auto]
_ = check_coverage(data_df=data_df, field_ids=[field_id])

In [None]:
# # Diastolic
# field_id_manual = 'Diastolic blood pressure, manual reading-2.mean'
# field_id_auto = 'Diastolic blood pressure, automated reading-2.mean'
# field_id = 'Diastolic blood pressure-2.mean'
# data_df[field_id] = data_df[[field_id_manual,field_id_auto]].mean(axis=1)
# _ = check_coverage(data_df=data_df, field_ids=[field_id])

In [None]:
# Diastolic
field_id = 'Diastolic blood pressure-2.mean'
field_id_auto = 'Diastolic blood pressure, automated reading-2.mean'
data_df[field_id] = data_df[field_id_auto]
_ = check_coverage(data_df=data_df, field_ids=[field_id])

In [None]:
# # Pulse rate
# field_id_manual = 'Pulse rate (during blood-pressure measurement)-2.mean'
# field_id_auto = 'Pulse rate, automated reading-2.mean'
# field_id = 'Pulse rate-2.mean'
# data_df[field_id] = data_df[[field_id_manual,field_id_auto]].mean(axis=1)
# _ = check_coverage(data_df=data_df, field_ids=[field_id])

In [None]:
# Pulse rate
field_id = 'Pulse rate-2.mean'
field_id_auto = 'Pulse rate, automated reading-2.mean'
data_df[field_id] = data_df[field_id_auto]
_ = check_coverage(data_df=data_df, field_ids=[field_id])

### Body Fat Percentage

In [None]:
field_id = 'Body fat percentage-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

### Whole Body Fat Mass

In [None]:
field_id = 'Whole body fat mass-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

### [None] Whole Body Fat-free Mass
We don't have 'Whole body fat-free mass-2.0', skip

In [None]:
for x in data_df.columns:
    if 'body fat' in x or 'Body fat' in x:
        print(x)

In [None]:
# field_id = 'Whole body fat-free mass-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Whole Body Water Mass

In [None]:
for x in data_df.columns:
    if 'body' in x or 'Body' in x:
        print(x)

In [None]:
# field_id = 'Whole body water mass-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### Body mass index (BMI)

In [None]:
field_id = 'Body mass index (BMI)-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df,field_id=field_id)

### [None] Processed Meat Intake

-1	Do not know

-3	Prefer not to answer

In [None]:
for x in data_df.columns:
    if 'intake' in x or 'Intake' in x:
        print(x)

In [None]:
# field_id = 'Processed meat intake-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# plot_hist(data_df=data_df,field_id=field_id)

### [None] Beef Intake

In [None]:
# field_id = 'Beef intake-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# plot_hist(data_df=data_df,field_id=field_id)

### [None] Pork Intake

In [None]:
# field_id = 'Pork intake-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Lamb/mutton intake

In [None]:
# field_id = 'Lamb/mutton intake-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Cooked vegetable intake-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-10, field_id] = 0 # -10 is less than one so gets set to 0
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Salad / raw vegetable intake-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
# data_df.loc[data_df[field_id]==-10, field_id] = 0 # -10 is less than one so gets set to 0
# plot_hist(data_df=data_df, field_id=field_id)

### Ethnic Background

Should ensure only 1000 (caucasian) is included to prevent ethnic confounders

In [None]:
field_id = 'Ethnic background-0.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df,field_id=field_id)

### Overall Health Rating

Self reported

In [None]:
field_id = 'Overall health rating-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
plot_hist(data_df=data_df, field_id=field_id)

### Diabetes diagnosed by doctor

In [None]:
field_name = 'Diabetes diagnosed by doctor'
instance_array_size = 4
option_array_size = 1
answer = 1

diabetes_diagnosed = check_answer(data_df, answer, field_name, instance_array_size, option_array_size)
print(f'{sum(diabetes_diagnosed)} subjects indicated diabetes diagnosis by doctor')
data_df['Diabetes diagnosis'] = diabetes_diagnosed

In [None]:
# Check if we have an age for a diagnosis
field_name = 'Age diabetes diagnosed'
for i in range(4):
    field_id = f'{field_name}-{i}.0'
    update_through_age(data_df, field_id, 'Diabetes diagnosis')
print(f'{data_df["Diabetes diagnosis"].sum()} subjects indicated diabetes diagnosis by doctor or age for diagnosis')

In [None]:
print(data_df['Diabetes diagnosis'].sum())
print(data_df['Diabetes diagnosis'].sum()/len(data_df))

Check if diabetes medication is being taken further down

### Vascular/heart problems diagnosed by doctor

If code -7 (None of the above) was selected, then no additional choices were allowed. -> 0

If code -3 (Prefer not to answer) was selected, then no additional choices were allowed. -> pd.NA

These were both set to 0 corresponding to "None of the above" option

Here all visits are examined as the questions concerns any diagnosis in their history. A diagnosis specified in future visits is added to list of previous visits. Each field is also saved as an array of size 4 since multiple choices were allowed. These are converted into four new columns. For some reason only three visits are recorded here (repeat imaging not included)

In [None]:
field_name = 'Vascular/heart problems diagnosed by doctor'
instance_array_size=3
option_array_size=4

for i in range(instance_array_size):
    for j in range(option_array_size):
        field_id = f'{field_name}-{i}.{j}'
        data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
        data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
        data_df.loc[data_df[field_id]==-7, field_id] = 0
        
# Should only cosider up to imaging visit as an occurrence after imaging could represent a different biological state
data_df['Heart attack diagnosed by doctor'] = check_answer(data_df=data_df, answer=1, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['Angina diagnosed by doctor'] = check_answer(data_df=data_df, answer=2, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['Stroke diagnosed by doctor'] = check_answer(data_df=data_df, answer=3, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['High blood pressure diagnosed by doctor'] = check_answer(data_df=data_df, answer=4, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)

In [None]:
field_id = f'Vascular/heart problems diagnosed by doctor-2.0'
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
_ = sns.barplot(x=['Heart attack', 'Angina', 'Stroke', 'High blood pressure'], y=[sum(data_df['Heart attack diagnosed by doctor']), sum(data_df['Angina diagnosed by doctor']), sum(data_df['Stroke diagnosed by doctor']), sum(data_df['High blood pressure diagnosed by doctor'])])

### Age heart attack diagnosed

Should only be filled for people that indicated heart attack in 6150

-1 represents "Do not know"

-3 represents "Prefer not to answer"

**TODO: -1 is a perfectly valid answer and shouldn't be set to NA. Maybe set to mean of distribution?**

Take as valid option for ever having heart attack but remove when calculating time between image and heart attack

In [None]:
for i in range(3):
  field_id = f'Age heart attack diagnosed-{i}.0'
  data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
  data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
  plot_hist(data_df=data_df, field_id=field_id)

**NOTE** Accepting dates for heart attack after imaging. Question is now: Who has such poor cardiac health that they already have had heart attack or will have one soon. Could make second field in future for just "Future heart attack".

In [None]:
field_id="Date of myocardial infarction-0.0"
data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
plot_hist(data_df,field_id)
update_through_age(data_df, diag_id='Heart attack diagnosed by doctor', age_id=field_id)

In [None]:
field_id="Date of STEMI-0.0"
data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
plot_hist(data_df,field_id)
update_through_age(data_df, diag_id='Heart attack diagnosed by doctor', age_id=field_id)

In [None]:
field_id="Date of NSTEMI-0.0"
data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
plot_hist(data_df,field_id)
update_through_age(data_df, diag_id='Heart attack diagnosed by doctor', age_id=field_id)

In [None]:
_ = sns.barplot(x=['Heart attack', 'Angina', 'Stroke', 'High blood pressure'], y=[sum(data_df['Heart attack diagnosed by doctor']), sum(data_df['Angina diagnosed by doctor']), sum(data_df['Stroke diagnosed by doctor']), sum(data_df['High blood pressure diagnosed by doctor'])])

Check overlap between those that indicated heart attack in 6150 and gave a response to 3894

In [None]:
field_id_age = 'Age heart attack diagnosed-2.0'
heart_attack_df = data_df[data_df['Heart attack diagnosed by doctor']==True]
print(f"{len(heart_attack_df)} subjects indicated heart attack")
age_heart_attack_imaging_df = data_df[check_coverage(data_df=data_df, field_ids=[field_id_age])]
print("{} gave an actual age for when their heart attack was diagnosed".format(len(heart_attack_df[(heart_attack_df['Age heart attack diagnosed-2.0']!=-3)&(heart_attack_df['Age heart attack diagnosed-2.0']!=-1)])))
indicated_heart_attack_all_visits_df = age_heart_attack_imaging_df[(age_heart_attack_imaging_df['Heart attack diagnosed by doctor']==True)]
no_indicated_heart_attack_all_visits_df = age_heart_attack_imaging_df[(age_heart_attack_imaging_df['Heart attack diagnosed by doctor']==False)]
print(f"{len(indicated_heart_attack_all_visits_df)} subjects that provided an age for their heart attack indicated that they had a heart attack in one of the three visits")

During the imaging visit, 1134 subjects gave an age for when they had a heart attack but only 671 subjects indicated such in the required field 'Vascular/heart problems diagnosed by doctor'. 

We can add all subjects that indicated a date for a heart attack to the diagnosed heart attack column 

In [None]:
diag_id ='Heart attack diagnosed by doctor'

print(sum(data_df[diag_id]))
for i in range(4):
    age_id = f'Age heart attack diagnosed-{i}.0'
    update_through_age(data_df=data_df, diag_id=diag_id, age_id=age_id)
print(sum(data_df[diag_id]))

### Age high blood pressure diagnosed

In [None]:
for i in range(4):
    field_id = f'Age high blood pressure diagnosed-{i}.0'
    data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
    plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id_bp = 'Age high blood pressure diagnosed-2.0'
age_bp_imaging_df = data_df[check_coverage(data_df=data_df,field_ids=[field_id_bp])]
indicated_bp_all_visits_df = age_bp_imaging_df[(age_bp_imaging_df['High blood pressure diagnosed by doctor']==True)]
print(f"{len(indicated_bp_all_visits_df)} subjects that provided an age for their high bp indicated that they had high bp in one of the three visits")

We can add all subjects that indicated a date for high bp to the diagnosed high bp column 

In [None]:
diag_id = 'High blood pressure diagnosed by doctor'

print(sum(data_df[diag_id]))
for i in range(4):
    age_id = f'Age high blood pressure diagnosed-{i}.0'
    update_through_age(data_df=data_df, diag_id=diag_id, age_id=age_id)
print(sum(data_df[diag_id]))

### [None] Age angina diagnosed

In [None]:
for x in data_df.columns:
    if 'angina' in x or 'Angina' in x:
        print(x)

In [None]:
# for i in range(4):
#     field_id = f'Age angina diagnosed-{i}.0'
#     data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
#     plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id_bp = 'Age angina diagnosed-2.0'
# age_bp_imaging_df = data_df[check_coverage(data_df=data_df,field_ids=[field_id_bp])]
# indicated_bp_all_visits_df = age_bp_imaging_df[(age_bp_imaging_df['Angina diagnosed by doctor']==True)]
# print(f"{len(indicated_bp_all_visits_df)} subjects that provided an age for their high bp indicated that they had high bp in one of the three visits")

We can add all subjects that indicated a date for high bp to the diagnosed high bp column 

In [None]:
# diag_id = 'Angina diagnosed by doctor'

# print(sum(data_df[diag_id]))
# for i in range(4):
#     age_id = f'Age angina diagnosed-{i}.0'
#     update_through_age(data_df=data_df, diag_id=diag_id, age_id=age_id)
# print(sum(data_df[diag_id]))

### Age stroke diagnosed

In [None]:
for i in range(4):
    field_id = f'Age stroke diagnosed-{i}.0'
    data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
    plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id_bp = 'Age stroke diagnosed-2.0'
age_bp_imaging_df = data_df[check_coverage(data_df=data_df,field_ids=[field_id_bp])]
indicated_bp_all_visits_df = age_bp_imaging_df[(age_bp_imaging_df['Stroke diagnosed by doctor']==True)]
print(f"{len(indicated_bp_all_visits_df)} subjects that provided an age for their high bp indicated that they had high bp in one of the three visits")

We can add all subjects that indicated a date for high bp to the diagnosed high bp column 

In [None]:
diag_id = 'Stroke diagnosed by doctor'

print(sum(data_df[diag_id]))
for i in range(4):
    age_id = f'Age stroke diagnosed-{i}.0'
    update_through_age(data_df=data_df, diag_id=diag_id, age_id=age_id)
print(sum(data_df[diag_id]))

### Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones

Female version of field 6177 (Medication for cholesterol, blood pressure or diabetes)

If code -7 ("None of the above") was selected, then no additional choices were allowed.    -> 0 

If code -1 ("Do not know") was selected, then no additional choices were allowed.          -> pd.NA 

If code -3 ("Prefer not to answer") was selected, then no additional choices were allowed. -> pd.NA 

Array with maximum length 4 (despite there being 5 options. Apparently no one indicated all 5 at the same time)

**TODO: Only consider imaging visit?**

In [None]:
field_name = 'Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones'
instance_array_size=4
option_array_size=4

for i in range(instance_array_size):
    for j in range(option_array_size):
        field_id = f'{field_name}-{i}.{j}'
        data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
        data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
        data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
        data_df.loc[data_df[field_id]==-7, field_id] = 0
        

data_df['Cholesterol lowering medication regularly taken'] = check_answer(data_df=data_df, answer=1, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['Blood pressure medication regularly taken'] = check_answer(data_df=data_df, answer=2, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['Insulin medication regularly taken'] = check_answer(data_df=data_df, answer=3, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['Hormone replacement therapy medication regularly taken'] = check_answer(data_df=data_df, answer=4, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)
data_df['Oral contraceptive pill or minipill medication regularly taken'] = check_answer(data_df=data_df, answer=5, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size)

In [None]:
field_name = 'Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones'
option_array_size=4

data_df['Cholesterol lowering medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=1, visit=0, field_name=field_name, option_array_size=option_array_size)
data_df['Cholesterol lowering medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=1, visit=2, field_name=field_name, option_array_size=option_array_size)

data_df['Blood pressure medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=2, visit=0, field_name=field_name, option_array_size=option_array_size)
data_df['Blood pressure medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=2, visit=2, field_name=field_name, option_array_size=option_array_size)

data_df['Insulin medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=3, visit=0, field_name=field_name, option_array_size=option_array_size)
data_df['Insulin medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=3, visit=2, field_name=field_name, option_array_size=option_array_size)

data_df['Hormone replacement therapy medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=4, visit=0, field_name=field_name, option_array_size=option_array_size)
data_df['Hormone replacement therapy medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=4, visit=2, field_name=field_name, option_array_size=option_array_size)

data_df['Oral contraceptive pill or minipill medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=5, visit=0, field_name=field_name, option_array_size=option_array_size)
data_df['Oral contraceptive pill or minipill medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=5, visit=2, field_name=field_name, option_array_size=option_array_size)

In [None]:
m2 = data_df['Blood pressure medication regularly taken-2.0']
m0 = data_df['Blood pressure medication regularly taken-0.0']
just_2 = np.select([~m2 & m0],[True],False)
print(f'{just_2.sum()} subjects stoped taking blood pressure medication between the initial visit and the imaging visit')

In [None]:
# Update diabetes field
diabetes_diagnosed = check_answer(data_df=data_df, answer=3, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size, agnostic_field='Diabetes diagnosis')
data_df['Diabetes diagnosis'] = diabetes_diagnosed
print(f'{sum(diabetes_diagnosed)} subjects indicated diabetes diagnosis by doctor, age for diagnosis, or that they are taking insulin medication regularly')

In [None]:
# Set female specific fields to NA for men
data_df.loc[data_df['Sex-0.0']==1,'Hormone replacement therapy regularly taken'] = pd.NA
data_df.loc[data_df['Sex-0.0']==1,'Oral contraceptive pill or minipill regularly taken'] = pd.NA

In [None]:
field_id = f'Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones-0.0'
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
ax = sns.barplot(x=['Cholesterol lowering medication', 'Blood pressure medication', 'Insulin', 'Hormone replacement therapy', 'Oral contraceptive pill or minipill'], y=[data_df['Cholesterol lowering medication regularly taken'].sum(), data_df['Blood pressure medication regularly taken'].sum(), data_df['Insulin medication regularly taken'].sum(), data_df['Hormone replacement therapy medication regularly taken'].sum(), data_df['Oral contraceptive pill or minipill medication regularly taken'].sum()])
ax.tick_params(axis='x', rotation=45)

### Medication for cholesterol, blood pressure or diabetes

Male version of field 6153 (Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones)

If code -7 ("None of the above") was selected, then no additional choices were allowed.    -> 0 

If code -1 ("Do not know") was selected, then no additional choices were allowed.          -> pd.NA 

If code -3 ("Prefer not to answer") was selected, then no additional choices were allowed. -> pd.NA 

Array with maximum length 3 for all three options concurrently

In [None]:
field_name = 'Medication for cholesterol, blood pressure or diabetes'
instance_array_size=4
option_array_size=3

for i in range(instance_array_size):
    for j in range(option_array_size):
        field_id = f'{field_name}-{i}.{j}'
        data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
        data_df.loc[data_df[field_id]==-1, field_id] = pd.NA
        data_df.loc[data_df[field_id]==-3, field_id] = pd.NA
        data_df.loc[data_df[field_id]==-7, field_id] = 0
        
data_df['Cholesterol lowering medication regularly taken'] = check_answer(data_df=data_df, answer=1, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size, agnostic_field='Cholesterol lowering medication regularly taken')
data_df['Blood pressure medication regularly taken'] = check_answer(data_df=data_df, answer=2, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size, agnostic_field='Blood pressure medication regularly taken')
data_df['Insulin medication regularly taken'] = check_answer(data_df=data_df, answer=3, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size, agnostic_field='Insulin medication regularly taken')

In [None]:
field_name = 'Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones'
option_array_size=4

data_df['Cholesterol lowering medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=1, visit=0, field_name=field_name, option_array_size=option_array_size, agnostic_field='Cholesterol lowering medication regularly taken-0.0')
data_df['Cholesterol lowering medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=1, visit=2, field_name=field_name, option_array_size=option_array_size, agnostic_field='Cholesterol lowering medication regularly taken-2.0')

data_df['Blood pressure medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=2, visit=0, field_name=field_name, option_array_size=option_array_size, agnostic_field='Blood pressure medication regularly taken-0.0')
data_df['Blood pressure medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=2, visit=2, field_name=field_name, option_array_size=option_array_size, agnostic_field='Blood pressure medication regularly taken-2.0')

data_df['Insulin medication regularly taken-0.0'] = check_answer_single_visit(data_df=data_df, answer=3, visit=0, field_name=field_name, option_array_size=option_array_size, agnostic_field='Insulin medication regularly taken-0.0')
data_df['Insulin medication regularly taken-2.0'] = check_answer_single_visit(data_df=data_df, answer=3, visit=2, field_name=field_name, option_array_size=option_array_size, agnostic_field='Insulin medication regularly taken-2.0')

In [None]:
# Update diabetes field
diabetes_diagnosed=check_answer(data_df=data_df, answer=3, field_name=field_name, instance_array_size=instance_array_size, option_array_size=option_array_size, agnostic_field='Diabetes diagnosis')
data_df['Diabetes diagnosis']=diabetes_diagnosed
print(f'{sum(diabetes_diagnosed)} subjects indicated diabetes diagnosis by doctor, age for diagnosis, or that they are taking insulin medication regularly')

In [None]:
field_id = f'Medication for cholesterol, blood pressure or diabetes-0.0'
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
ax = sns.barplot(x=['Cholesterol lowering medication', 'Blood pressure medication', 'Insulin'], y=[data_df['Cholesterol lowering medication regularly taken'].sum(), data_df['Blood pressure medication regularly taken'].sum(), data_df['Insulin medication regularly taken'].sum()])
ax.tick_params(axis='x', rotation=45)

### [None] Pacemaker

'Participants asked by interviewer if they have a pace-maker before the body impedance measures. Impedance measurement was not performed if they had a pace-maker.'

Should be checked against the operation code for pacemaker

In [None]:
for x in data_df.columns:
    if 'Pace' in x or 'pace' in x or 'maker' in x or 'Maker' in x:
        print(x)

In [None]:
# field_id = 'Pace-maker-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# cardiac_codes = ['1096','1548','1549']
# field_name = 'Operation code'
# data_df['Pacemaker operations performed'] = operations_performed(data_df=data_df, field_name=field_name, operation_codes=cardiac_codes)

In [None]:
# pace_maker_eids = data_df.loc[data_df['Pace-maker-2.0']==1]['eid']
# pace_maker_ops_eids = data_df.loc[data_df['Pacemaker operations performed']>0]['eid']
# print('{} subjects said they had a pacemaker'.format(len(pace_maker_eids)))
# print('{} subjects did NOT specify they had surgery for a pacemaker'.format(len(set(pace_maker_eids)-set(pace_maker_ops_eids))))

Apparently none of the people that specified they have a pacemaker at imaging time also specified a pacemaker operation

In [None]:
# field_id = 'Pace-maker-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# pace_maker_eids = data_df.loc[data_df['Pace-maker-0.0']==1]['eid']
# pace_maker_ops_eids = data_df.loc[data_df['Pacemaker operations performed']>0]['eid']
# print('{} subjects said they had a pacemaker'.format(len(pace_maker_eids)))
# print('{} subjects did NOT specify they had surgery for a pacemaker'.format(len(set(pace_maker_eids)-set(pace_maker_ops_eids))))

417 subjects said they had a pacemaker before the body impedance measure but did not specify it in their surgical recollection 

**TODO: Do we create a separate field for pacemaker and take the superset of the operation field and this field?**

### [None] Operation Code

Max array size 32. All 4 instances

**TODO: Weight by severity?** one column for me one for dr

**TODO: Repeat operations?**

In [None]:
for x in data_df.columns:
    if 'operation' in x or 'Operation' in x:
        print(x)

In [None]:
# # Cardiac codes taken from file CardiacOperationCodes.csv
# field_name = 'Operation code'
# cardiac_codes = [1069,1070,1071,1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1514,1515,1516,1545,1599,1552,1604,1548,1549,1550,1551,1523,1524,1553,1554,1393,1479,1555,1476]

# data_df['Cardiac operations performed'] = operations_performed(data_df=data_df, field_name=field_name, operation_codes=cardiac_codes) 

In [None]:
# plot_hist(data_df=data_df, field_id='Cardiac operations performed')

### Total thigh fat-free muscle volume

In [None]:
field_id = 'Total thigh fat-free muscle volume-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

### Total trunk fat volume

In [None]:
field_id = 'Total trunk fat volume-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

### [None] Total adipose tissue volume

In [None]:
for x in data_df.columns:
    if 'volume' in x or 'Volume' in x:
        print(x)

In [None]:
# field_id = 'Total adipose tissue volume-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Total abdominal adipose tissue index

In [None]:
for x in data_df.columns:
    if 'ratio' in x or 'ratio' in x:
        print(x)

In [None]:
# field_id = 'Total abdominal adipose tissue index-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Weight-to-muscle ratio

In [None]:
# field_id = 'Weight-to-muscle ratio-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Abdominal fat ratio

In [None]:
# field_id = 'Abdominal fat ratio-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Ever addicted to any substance or behaviour

In [None]:
# field_id = 'Ever addicted to any substance or behaviour-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

**NOTE: Very general. Followup questions specify yes answers and need to be parsed as such.**

### Ever physically dependent on alcohol

**NOTE** Only asked if answered yes to 20406 (Ever addicted to alcohol) which was only asked if answered yes to 20401 (Ever addicted to any substance or behaviour). Requires checks against other fields to fill in rest of nos. Initial field is quite sparse though so effort not made here to do that

In [None]:
#field_id = 'Ever physically dependent on alcohol-0.0'
#data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
#plot_hist(data_df=data_df, field_id=field_id)

### [None] Ever addicted to alcohol

In [None]:
for x in data_df.columns:
    if 'addict' in x:
        print(x)

In [None]:
# field_id = 'Ever addicted to alcohol-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Ongoing addiction to alcohol

In [None]:
# field_id = 'Ongoing addiction to alcohol-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Ever addicted to a behaviour or miscellanous

In [None]:
# field_id = 'Ever addicted to a behaviour or miscellanous-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Ongoing behavioural or miscellanous addiction

In [None]:
# field_id = 'Ongoing behavioural or miscellanous addiction-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

**NOTE: Answered by practically no one. Remove**

### [None] Ever addicted to illicit or recreational drugs

In [None]:
# field_id = 'Ever addicted to illicit or recreational drugs-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

**NOTE: Answered by practically no one. Remove**

### [None] Substance of prescription or over-the-counter medication addiction

In [None]:
# field_id = 'Ongoing addiction or dependence on illicit or recreational drugs-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

### Substance of prescription or over-the-counter medication addiction

### Substances taken for anxiety

In [None]:
for x in data_df.columns:
    if 'anxiety' in x:
        print(x)

In [None]:
field_id = 'Substances taken for anxiety-0.3'
data_df[field_id].value_counts()

In [None]:
field_id = 'Substances taken for anxiety-0.1'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Substances taken for anxiety

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

### [None] Substances taken for anxiety

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

NOTE: Online follow-up question so relatively close to imaging 

### [None] Substances taken for anxiety

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

NOTE: Online follow-up question so relatively close to imaging 

### [None] Substances taken for anxiety

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

NOTE: Online follow-up question so relatively close to imaging 

### [None] Substances taken for anxiety

Needs to be fleshed out by looking for who answered "no" to 20421 and giving them the value of 0 here. Response rate too low though so left to future work

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

NOTE: Online follow-up question so relatively close to imaging 

### Substances taken for anxiety

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

NOTE: Online follow-up question so relatively close to imaging 

### [None] Substances taken for anxiety

In [None]:
# field_id = 'Professional informed about anxiety-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-818,field_id] = pd.NA # Prefer not to answer
# data_df.loc[data_df[field_id]==-121,field_id] = pd.NA # Do not know
# plot_hist(data_df=data_df, field_id=field_id)

### Substances taken for anxiety

### Activities undertaken to treat anxiety

### Blood biochemistry

**NOTE: Only taken at initial assessment and first repeat. Too far away from imaging. Remove**

### Blood cell count

**Very low coverage. (10%) Removed**

In [None]:
for x in data_df.columns:
    if 'Impedance' in x or 'impedance' in x:
        print(x)

In [None]:
# field_id = f'Total mass-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# remove_outliers(data_df,field_id,limit=100,greater=False)
#plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Basal metabolic rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
remove_outliers(data_df,field_id,limit=10,greater=False)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'Impedance of whole body-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# remove_outliers(data_df,field_id,limit=10,greater=False)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Waist circumference-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
remove_outliers(data_df,field_id,limit=10,greater=False)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Hip circumference-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
remove_outliers(data_df,field_id,limit=10,greater=False)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'Standing height-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# remove_outliers(data_df,field_id,limit=10,greater=False)
# plot_hist(data_df=data_df, field_id=field_id)

**NOTE: Skip in favor of weight which is average of multiple fields**

In [None]:
# field_id = f'Weight (pre-imaging)-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'Height-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# remove_outliers(data_df,field_id,limit=10,greater=False)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'Sitting height-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# remove_outliers(data_df,field_id,limit=10,greater=False)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Weight-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id] = data_df.loc[data_df[field_id].isna(),'Weight (pre-imaging)-2.0'] # This is an aggregate field but has some missing values. Missing values are taken from pre-imaging weighing
remove_outliers(data_df,field_id,limit=2,greater=False)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
data_df['Weight-2.0']

In [None]:
field_id = f'Average heart rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

The readings taken from Wenjias paper for some reason are super low coverage in UKBB. Going to use the values we calculated ourselves.

In [None]:
field_id = f'Average heart rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Average heart rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'QRS duration-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
remove_outliers(data_df,field_id,limit=2,greater=False)
remove_outliers(data_df,field_id,limit=200,greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Average heart rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Average heart rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'RR interval-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# remove_outliers(data_df,field_id,limit=2,greater=False)
# #remove_outliers(data_df,field_id,limit=700,greater=True)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Average heart rate-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Body surface area-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Tobacco smoking-0.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-818,field_id]=pd.NA
data_df.loc[data_df[field_id]==111,field_id]=3
data_df.loc[data_df[field_id]==112,field_id]=2
data_df.loc[data_df[field_id]==113,field_id]=1
data_df.loc[data_df[field_id]==114,field_id]=0
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = f'Shortness of breath walking on level ground-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id] = pd.NA
data_df.loc[data_df[field_id]==-3,field_id] = pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'Worrier / anxious feelings-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# #data_df.loc[data_df[field_id]==-1,field_id] = pd.NA
# #data_df.loc[data_df[field_id]==-3,field_id] = pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = f'Tense / \'highly strung\'-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id]==-1,field_id] = pd.NA
# data_df.loc[data_df[field_id]==-3,field_id] = pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

NMR metabolomics only collected during initial visit. **Skip**. 23400-23480

In [None]:
field_id = 'Number of days/week walked 10+ minutes-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-2,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Duration of walks-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
data_df.loc[data_df['Number of days/week walked 10+ minutes-2.0']==0,field_id]=0 # If answered 0 to previous questions, this question is not asked, even though their answer would be 0
remove_outliers(data_df,field_id,limit=1000,greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Number of days/week of moderate physical activity 10+ minutes-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Duration of moderate activity-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
data_df.loc[data_df['Number of days/week of moderate physical activity 10+ minutes-2.0']==0,field_id]=0 # If answered 0 to previous questions, this question is not asked, even though their answer would be 0
remove_outliers(data_df,field_id,limit=1000,greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Number of days/week of vigorous physical activity 10+ minutes-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Duration of vigorous activity-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
data_df.loc[data_df['Number of days/week of vigorous physical activity 10+ minutes-2.0']==0,field_id]=0 # If answered 0 to previous questions, this question is not asked, even though their answer would be 0
remove_outliers(data_df,field_id,limit=1000,greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Usual walking pace-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-7,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of stair climbing in last 4 weeks-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means didnt indicate in questions 6164 so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of walking for pleasure in last 4 weeks-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means didnt indicate in questions 6164 so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Duration walking for pleasure-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't walk for pleasure so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# remove_outliers(data_df,field_id,limit=1000,greater=True)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of strenuous sports in last 4 weeks-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means didnt indicate in questions 6164 so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Duration of strenuous sports-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
remove_outliers(data_df,field_id,limit=1000,greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Duration of light DIY-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Time spent watching television (TV)-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-10,field_id]=0 # -10 means less than half hour
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Time spent using computer-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-10,field_id]=0
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Time spent driving-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-10,field_id]=0
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of heavy DIY in last 4 weeks-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Duration of heavy DIY-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# remove_outliers(data_df,field_id,limit=1000,greater=True)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of other exercises in last 4 weeks-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Duration of other exercises-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# remove_outliers(data_df,field_id,limit=1000,greater=True)
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
option_array_size=5
for i in range(4):
  field_id = f'Types of physical activity in last 4 weeks-2.{i}'
  data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
  data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
  data_df.loc[data_df[field_id]==-3,field_id]=pd.NA

In [None]:
field_name='Types of physical activity in last 4 weeks'

data_df['Walking for pleasure-2.0'] = check_answer_single_visit(data_df=data_df, answer=1, visit=2, field_name=field_name, option_array_size=option_array_size)

plot_hist(data_df=data_df, field_id='Walking for pleasure-2.0')

In [None]:
field_name='Types of physical activity in last 4 weeks'

data_df['Other exercises-2.0'] = check_answer_single_visit(data_df=data_df, answer=2, visit=2, field_name=field_name, option_array_size=option_array_size)

plot_hist(data_df=data_df, field_id='Other exercises-2.0')

In [None]:
field_name='Types of physical activity in last 4 weeks'

data_df['Strenuous sports-2.0'] = check_answer_single_visit(data_df=data_df, answer=3, visit=2, field_name=field_name, option_array_size=option_array_size)

plot_hist(data_df=data_df, field_id='Strenuous sports-2.0')

In [None]:
field_name='Types of physical activity in last 4 weeks'

data_df['Light DIY-2.0'] = check_answer_single_visit(data_df=data_df, answer=4, visit=2, field_name=field_name, option_array_size=option_array_size)

plot_hist(data_df=data_df, field_id='Light DIY-2.0')

In [None]:
field_name='Types of physical activity in last 4 weeks'

data_df['Heavy DIY-2.0'] = check_answer_single_visit(data_df=data_df, answer=5, visit=2, field_name=field_name, option_array_size=option_array_size)

plot_hist(data_df=data_df, field_id='Heavy DIY-2.0')

In [None]:
field_name='Types of physical activity in last 4 weeks'

data_df['No physical activity-2.0'] = check_answer_single_visit(data_df=data_df, answer=-7, visit=2, field_name=field_name, option_array_size=option_array_size)

plot_hist(data_df=data_df, field_id='No physical activity-2.0')

In [None]:
# field_id = 'Systolic brachial blood pressure-2.0'
# remove_outliers(data_df,field_id,limit=1,greater=False)
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Diastolic brachial blood pressure-2.0'
remove_outliers(data_df,field_id,limit=1,greater=False)
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# fields = ["Heart rate during PWA","Systolic brachial blood pressure during PWA","Diastolic brachial blood pressure during PWA","Peripheral pulse pressure during PWA","Central systolic blood pressure during PWA","Central pulse pressure during PWA","Number of beats in waveform average for PWA","Central augmentation pressure during PWA","Augmentation index for PWA","Cardiac output during PWA","End systolic pressure during PWA","End systolic pressure index during PWA","Total peripheral resistance during PWA","Stroke volume during PWA","Mean arterial pressure during PWA","Cardiac index during PWA"]
fields = ["Heart rate during PWA","Systolic brachial blood pressure during PWA","Diastolic brachial blood pressure during PWA","Peripheral pulse pressure during PWA","Central systolic blood pressure during PWA","Central pulse pressure during PWA","Number of beats in waveform average for PWA","Central augmentation pressure during PWA","Augmentation index for PWA","Cardiac output during PWA","End systolic pressure during PWA","End systolic pressure index during PWA","Total peripheral resistance during PWA","Stroke volume during PWA","Cardiac index during PWA"]
for field in fields:
  field_id = f'{field}-2.0'
  data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
  #plt.figure()
  #plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Sleep duration-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Sleeplessness / insomnia-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Current tobacco smoking-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Past tobacco smoking-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id].isna(),field_id]=1 # If NA, indicated that they are currently smoking so assign to all day smoker category?
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Smoking/smokers in household-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=1 # If NA, indicated that they are currently smoking so assign to on smoker in household
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Exposure to tobacco smoke at home-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id].isna(),field_id]=7 # If NA, indicated that they are currently smoking so say 7 hours of tabacco smoke per week outside home
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
remove_outliers(data_df, field_id, limit=50, greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Exposure to tobacco smoke outside home-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id].isna(),field_id]=7 # If NA, indicated that they are currently smoking so say 7 hours of tabacco smoke per week outside home
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
remove_outliers(data_df, field_id, limit=50, greater=True)
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Smoking status-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Pack years of smoking-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # Only asked to those that smoke, so non-smokers get value 0
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Pack years adult smoking as proportion of life span exposed to smoking-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # Only asked to those that smoke, so non-smokers get value 0
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Ever smoked-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id="Date of stroke-0.0"
# data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
# plot_hist(data_df,field_id)

# diag_id = 'Stroke diagnosed by doctor'
# update_through_age(data_df=data_df, diag_id=diag_id, age_id=field_id)

In [None]:
# field_id="Date of ischaemic stroke-0.0"
# data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
# plot_hist(data_df,field_id)

# diag_id = 'Stroke diagnosed by doctor'
# update_through_age(data_df=data_df, diag_id=diag_id, age_id=field_id)

In [None]:
# field_id="Date of intracerebral haemorrhage-0.0"
# data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
# plot_hist(data_df,field_id)

# diag_id = 'Stroke diagnosed by doctor'
# update_through_age(data_df=data_df, diag_id=diag_id, age_id=field_id)

In [None]:
# field_id="Date of subarachnoid haemorrhage-0.0"
# data_df[field_id] = pd.to_datetime(data_df[field_id],errors='coerce')
# plot_hist(data_df,field_id)

# diag_id = 'Stroke diagnosed by doctor'
# update_through_age(data_df=data_df, diag_id=diag_id, age_id=field_id)

In [None]:
field_id = 'Age when attended assessment centre-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Alcohol usually taken with meals-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
# data_df.loc[data_df[field_id]==-6,field_id]=2 # -6 means it varies so assign to 2
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Alcohol drinker status-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of drinking alcohol-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# #data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# #data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]<0,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Frequency of consuming six or more units of alcohol-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# #data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# #data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]<0,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Amount of alcohol drunk on a typical drinking day-0.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# #data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# #data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]<0,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Falls in the last year-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
#data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
#data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]<0,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
field_id = 'Long-standing illness, disability or infirmity-2.0'
data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
data_df.loc[data_df[field_id]==-3,field_id]=pd.NA
plot_hist(data_df=data_df, field_id=field_id)

In [None]:
# field_id = 'Weight change compared with 1 year ago-2.0'
# data_df[field_id] = pd.to_numeric(data_df[field_id],errors='coerce')
# #data_df.loc[data_df[field_id].isna(),field_id]=0 # NA means doesn't do so answer is 0
# #data_df.loc[data_df[field_id]==-1,field_id]=pd.NA
# data_df.loc[data_df[field_id]<0,field_id]=pd.NA
# plot_hist(data_df=data_df, field_id=field_id)

In [545]:
cleaned_features_path = join(BASE_PATH,'cardiac_features_18545_clean_wo_clinical.csv')
# SAVE=False
if SAVE:
  data_df.to_csv(cleaned_features_path,index=False)
else:
  data_df = pd.read_csv(cleaned_features_path)
print(len(data_df.columns))
print(len(set(data_df['eid'])))

889
502465


In [546]:
data_df = pd.read_csv(join(BASE_PATH,'cardiac_features_18545_clean_wo_clinical.csv'))
original_feature_vector_df = pd.read_csv('/vol/biodata/data/biobank/18545/clinical/clinical_measures_39k_collated.csv')
original_feature_vector_df.rename(columns={'Unnamed: 0': 'eid'}, inplace=True)
new_data_df = pd.merge(data_df,original_feature_vector_df[['eid','LVEDV (mL)','LVESV (mL)','LVSV (mL)','LVEF (%)','LVCO (L/min)','LVM (g)','RVEDV (mL)','RVESV (mL)','RVSV (mL)','RVEF (%)']],on='eid', how='left')
print(len(new_data_df.columns))
print(len(set(new_data_df['eid'])))

  data_df = pd.read_csv(join(BASE_PATH,'cardiac_features_18545_clean_wo_clinical.csv'))


899
502465


In [547]:
data_df = new_data_df

In [548]:
# data_df['Long-standing illness, disability or infirmity-2.0'].value_counts()
# data_df['Shortness of breath walking on level ground-2.0'].value_counts()
print(data_df.shape, new_data_df.shape)

(502465, 899) (502465, 899)


## Save Cleaned Features

In [549]:
cleaned_features_path = join(BASE_PATH,'cardiac_features_18545_clean.csv')
if SAVE:
  data_df.to_csv(cleaned_features_path,index=False)
else:
  data_df = pd.read_csv(cleaned_features_path)
print(SAVE)

True


In [550]:
# Drop non-white subjects
print(len(data_df))
data_df.drop(data_df[data_df['Ethnic background-0.0']!=1001].index, inplace=True)
print(len(data_df))

502465
442554


## Convert Dataframe to Vector

In [551]:
def cardiac_features_to_vector_df(df):
  vec = []
  vec.append(df['eid'])
  # vec.append(df['eid_old'])
  vec.append(df['Pulse wave Arterial Stiffness index-2.0'])
  vec.append(df['Systolic blood pressure-2.mean'])
  vec.append(df['Diastolic blood pressure-2.mean'])
  vec.append(df['Pulse rate-2.mean'])
  vec.append(df['Body fat percentage-2.0'])
  vec.append(df['Whole body fat mass-2.0'])
  # vec.append(df['Whole body fat-free mass-2.0'])
  # vec.append(df['Whole body water mass-2.0'])
  vec.append(df['Body mass index (BMI)-2.0'])
  # vec.append(df['Cooked vegetable intake-2.0'])
  # vec.append(df['Salad / raw vegetable intake-2.0'])
  # vec.append(df['Cardiac operations performed'])
  # vec.append(df['Total mass-2.0'])
  vec.append(df['Basal metabolic rate-2.0'])
  # vec.append(df['Impedance of whole body-2.0'])
  vec.append(df['Waist circumference-2.0'])
  vec.append(df['Hip circumference-2.0'])
  # vec.append(df['Standing height-2.0'])
  # vec.append(df['Height-2.0'])
  # vec.append(df['Sitting height-2.0'])
  vec.append(df['Weight-2.0'])
  vec.append(df['Ventricular rate-2.0'])
  vec.append(df['P duration-2.0'])
  vec.append(df['QRS duration-2.0'])
  # vec.append(df['PQ interval-2.0'])
  # vec.append(df['RR interval-2.0'])
  # vec.append(df['PP interval-2.0'])
  vec.append(df['Cardiac output-2.0'])
  vec.append(df['Cardiac index-2.0'])
  vec.append(df['Average heart rate-2.0'])
  vec.append(df['Body surface area-2.0'])
  vec.append(df['Duration of walks-2.0'])
  vec.append(df['Duration of moderate activity-2.0'])
  vec.append(df['Duration of vigorous activity-2.0'])
  # vec.append(df['Time spent watching television (TV)-2.0'])
  # vec.append(df['Time spent using computer-2.0'])
  # vec.append(df['Time spent driving-2.0'])
  vec.append(df['Heart rate during PWA-2.0'])
  vec.append(df['Systolic brachial blood pressure during PWA-2.0'])
  vec.append(df['Diastolic brachial blood pressure during PWA-2.0'])
  vec.append(df['Peripheral pulse pressure during PWA-2.0'])
  vec.append(df['Central systolic blood pressure during PWA-2.0'])
  vec.append(df['Central pulse pressure during PWA-2.0'])
  vec.append(df['Number of beats in waveform average for PWA-2.0'])
  vec.append(df['Central augmentation pressure during PWA-2.0'])
  vec.append(df['Augmentation index for PWA-2.0'])
  vec.append(df['Cardiac output during PWA-2.0'])
  vec.append(df['End systolic pressure during PWA-2.0'])
  vec.append(df['End systolic pressure index during PWA-2.0'])
  vec.append(df['Total peripheral resistance during PWA-2.0'])
  vec.append(df['Stroke volume during PWA-2.0'])
  # vec.append(df['Mean arterial pressure during PWA-2.0'])
  vec.append(df['Cardiac index during PWA-2.0'])
  vec.append(df['Sleep duration-2.0'])
  vec.append(df['Exposure to tobacco smoke at home-2.0'])
  vec.append(df['Exposure to tobacco smoke outside home-2.0'])
  # vec.append(df['Pack years of smoking-2.0'])
  # vec.append(df['Pack years adult smoking as proportion of life span exposed to smoking-2.0'])
  vec.append(df['LVEDV (mL)'])
  vec.append(df['LVESV (mL)'])
  vec.append(df['LVSV (mL)'])
  vec.append(df['LVEF (%)'])
  vec.append(df['LVCO (L/min)'])
  vec.append(df['LVM (g)'])
  vec.append(df['RVEDV (mL)'])
  vec.append(df['RVESV (mL)'])
  vec.append(df['RVSV (mL)'])
  vec.append(df['RVEF (%)'])
  
  # vec.append(df['Worrier / anxious feelings-2.0'].apply(clean_categorical))
  vec.append(df['Shortness of breath walking on level ground-2.0'].apply(clean_categorical))
  vec.append(df['Sex-0.0'].apply(clean_categorical))
  vec.append(df['Diabetes diagnosis'].apply(clean_categorical))
  vec.append(df['Heart attack diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['Angina diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['Stroke diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['High blood pressure diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['Cholesterol lowering medication regularly taken'].apply(clean_categorical))
  vec.append(df['Blood pressure medication regularly taken'].apply(clean_categorical))
  vec.append(df['Insulin medication regularly taken'].apply(clean_categorical))
  vec.append(df['Hormone replacement therapy medication regularly taken'].apply(clean_categorical))
  vec.append(df['Oral contraceptive pill or minipill medication regularly taken'].apply(clean_categorical))
  # vec.append(df['Pace-maker-2.0'].apply(clean_categorical))
  # vec.append(df['Ever had diabetes (Type I or Type II)-0.0'].apply(clean_categorical))
  vec.append(df['Long-standing illness, disability or infirmity-2.0'].apply(clean_categorical))
  # vec.append(df['Tense / \'highly strung\'-2.0'].apply(clean_categorical))
  vec.append(df['Ever smoked-2.0'].apply(clean_categorical))

  vec.append(df['Sleeplessness / insomnia-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3, one_based=True)))
  # vec.append(df['Frequency of heavy DIY in last 4 weeks-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=7)))
  vec.append(df['Alcohol intake frequency.-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6, one_based=True)))
  # vec.append(df['Processed meat intake-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6)))
  # vec.append(df['Beef intake-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6)))
  # vec.append(df['Pork intake-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6)))
  # vec.append(df['Lamb/mutton intake-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6)))
  vec.append(df['Overall health rating-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=4, one_based=True)))
  # vec.append(df['Alcohol usually taken with meals-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3)))
  vec.append(df['Alcohol drinker status-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3)))
  # vec.append(df['Frequency of drinking alcohol-0.0'].apply(lambda col: one_hot_encode(value=col, num_classes=5)))
  # vec.append(df['Frequency of consuming six or more units of alcohol-0.0'].apply(lambda col: one_hot_encode(value=col, num_classes=5, one_based=True)))
  # vec.append(df['Amount of alcohol drunk on a typical drinking day-0.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6)))
  vec.append(df['Falls in the last year-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3, one_based=True)))
  # vec.append(df['Weight change compared with 1 year ago-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3)))
  vec.append(df['Number of days/week walked 10+ minutes-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  vec.append(df['Number of days/week of moderate physical activity 10+ minutes-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  vec.append(df['Number of days/week of vigorous physical activity 10+ minutes-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  vec.append(df['Usual walking pace-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3, one_based=True)))
  # vec.append(df['Frequency of stair climbing in last 4 weeks-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=6)))
  # vec.append(df['Frequency of walking for pleasure in last 4 weeks-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=7)))
  # vec.append(df['Duration walking for pleasure-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  # vec.append(df['Frequency of strenuous sports in last 4 weeks-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=7)))
  vec.append(df['Duration of strenuous sports-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  # vec.append(df['Duration of light DIY-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  # vec.append(df['Duration of heavy DIY-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  # vec.append(df['Frequency of other exercises in last 4 weeks-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=7)))
  # vec.append(df['Duration of other exercises-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=8)))
  vec.append(df['Current tobacco smoking-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3)))
  vec.append(df['Past tobacco smoking-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=4, one_based=True)))
  # vec.append(df['Smoking/smokers in household-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3)))
  vec.append(df['Smoking status-2.0'].apply(lambda col: one_hot_encode(value=col, num_classes=3)))
  return vec

In [552]:
cardiac_features_encoding_df = cardiac_features_to_vector_df(data_df)
cardiac_features_encoding_df = pd.concat(cardiac_features_encoding_df,axis=1)
cardiac_features_encoding_df = cardiac_features_encoding_df.reset_index(drop=True)

In [553]:
print(len(cardiac_features_encoding_df.columns))

77


In [555]:

if SAVE:
    cardiac_features_encoding_df.to_csv(join(BASE_PATH,'cardiac_feature_18545_vector.csv'),index=False)
else:
    cardiac_features_encoding_df = pd.read_csv(join(BASE_PATH,'cardiac_feature_18545_vector.csv'))

In [556]:
assert(data_df.reset_index(drop=True)['eid']==cardiac_features_encoding_df['eid']).all()

In [None]:
cardiac_features_encoding_df

### Labels

In [558]:
diagnoses_field_list = []
for x in data_df.columns:
    if 'Diagnoses' in x:
        diagnoses_field_list.append(x)
print(diagnoses_field_list)

['Diagnoses - ICD10-0.0', 'Diagnoses - ICD10-0.1', 'Diagnoses - ICD10-0.2', 'Diagnoses - ICD10-0.3', 'Diagnoses - ICD10-0.4', 'Diagnoses - ICD10-0.5', 'Diagnoses - ICD10-0.6', 'Diagnoses - ICD10-0.7', 'Diagnoses - ICD10-0.8', 'Diagnoses - ICD10-0.9', 'Diagnoses - ICD10-0.10', 'Diagnoses - ICD10-0.11', 'Diagnoses - ICD10-0.12', 'Diagnoses - ICD10-0.13', 'Diagnoses - ICD10-0.14', 'Diagnoses - ICD10-0.15', 'Diagnoses - ICD10-0.16', 'Diagnoses - ICD10-0.17', 'Diagnoses - ICD10-0.18', 'Diagnoses - ICD10-0.19', 'Diagnoses - ICD10-0.20', 'Diagnoses - ICD10-0.21', 'Diagnoses - ICD10-0.22', 'Diagnoses - ICD10-0.23', 'Diagnoses - ICD10-0.24', 'Diagnoses - ICD10-0.25', 'Diagnoses - ICD10-0.26', 'Diagnoses - ICD10-0.27', 'Diagnoses - ICD10-0.28', 'Diagnoses - ICD10-0.29', 'Diagnoses - ICD10-0.30', 'Diagnoses - ICD10-0.31', 'Diagnoses - ICD10-0.32', 'Diagnoses - ICD10-0.33', 'Diagnoses - ICD10-0.34', 'Diagnoses - ICD10-0.35', 'Diagnoses - ICD10-0.36', 'Diagnoses - ICD10-0.37', 'Diagnoses - ICD10-0.

In [559]:
field_name = 'Diagnoses - ICD10'
infarction_codes = set(['I210', 'I211', 'I212', 'I213', 'I214', 'I219', 'I252'])
CAD_codes = set(['I200', 'I201', 'I208', 'I209', 
                 'I210', 'I211', 'I212', 'I213', 'I214', 'I219',
                 'I220', 'I221', 'I228', 'I229',
                 'I240', 'I248', 'I249'
                 'I250', 'I251', 'I252', 'I253', 'I254', 'I255', 'I256', 'I258', 'I259'])
dilated_cardiomyopathy = ['I420']
heart_failure = ['I500', 'I501', 'I509']
atherosclerosis = ['I700', 'I7000', 'I7001', 'I701', 'I7010', 'I7011', 'I702', 'I7020', 'I7021', 'I708', 'I7080', 'I7081', 'I709', 'I7090', 'I7091']
arterial_embolism_thrombosis = ['I740', 'I741', 'I742', 'I743', 'I744', 'I745', 'I748', 'I749']

broad_CAD = []
for c in [CAD_codes,dilated_cardiomyopathy,heart_failure,atherosclerosis,arterial_embolism_thrombosis]:
    broad_CAD.extend(c)
    
# Cerebral infarction as indication of poor artery health?
data_df = data_df.reset_index(drop=True)
cardiac_features_encoding_df = cardiac_features_encoding_df.reset_index(drop=True)

for target_codes, name in [(infarction_codes,'Infarction'), (CAD_codes, 'CAD'), (broad_CAD, 'CAD_broad')]:
    superset = pd.Series([False for i in range(len(cardiac_features_encoding_df))])
    for i in range(223):  # 243
        field_id = f'{field_name}-0.{i}'
        superset = (superset | (data_df[field_id].isin(target_codes)))
    print(sum(superset))
    cardiac_features_encoding_df[name] = superset.astype(int)

20666
48285
55119


In [560]:
cardiac_features_encoding_df.loc[cardiac_features_encoding_df['Heart attack diagnosed by doctor']!=cardiac_features_encoding_df['Infarction'],'Infarction']=1 # Some had heart attack diagnosed but not ICD code
print(len(cardiac_features_encoding_df.loc[cardiac_features_encoding_df['Heart attack diagnosed by doctor']!=cardiac_features_encoding_df['Infarction'],'Infarction']))

4759


In [561]:
def det_htn(row):
  sys_limit = 140
  dia_limit = 90

  if ((row['Systolic blood pressure-2.mean']>sys_limit) and (row['Diastolic blood pressure-2.mean']>dia_limit)) or (row['High blood pressure diagnosed by doctor']==1)or (row['Blood pressure medication regularly taken']==1):
    return 1
  else:
    return 0


cardiac_features_encoding_df['Hypertension'] = cardiac_features_encoding_df.apply(lambda row: det_htn(row),axis=1)

In [562]:

if SAVE:
  cardiac_features_encoding_df.to_csv(join(BASE_PATH,'cardiac_feature_18545_vector_labeled.csv'),index=False)
else:
  cardiac_features_encoding_df = pd.read_csv(join(BASE_PATH,'cardiac_feature_18545_vector_labeled.csv'))
print(SAVE)

True


# Convert Dataframe to Vector - No One-Hot

In [563]:
def cardiac_features_to_vector_no_onehot_df(df):
  vec = []
  vec.append(df['eid'])
  # vec.append(df['eid_old'])
  vec.append(df['Pulse wave Arterial Stiffness index-2.0'])
  vec.append(df['Systolic blood pressure-2.mean'])
  vec.append(df['Diastolic blood pressure-2.mean'])
  vec.append(df['Pulse rate-2.mean'])
  vec.append(df['Body fat percentage-2.0'])
  vec.append(df['Whole body fat mass-2.0'])
  # vec.append(df['Whole body fat-free mass-2.0'])
  # vec.append(df['Whole body water mass-2.0'])
  vec.append(df['Body mass index (BMI)-2.0'])
  # vec.append(df['Cooked vegetable intake-2.0'])
  # vec.append(df['Salad / raw vegetable intake-2.0'])
  # vec.append(df['Cardiac operations performed'])
  # vec.append(df['Total mass-2.0'])
  vec.append(df['Basal metabolic rate-2.0'])
  # vec.append(df['Impedance of whole body-2.0'])
  vec.append(df['Waist circumference-2.0'])
  vec.append(df['Hip circumference-2.0'])
  # vec.append(df['Standing height-2.0'])
  # vec.append(df['Height-2.0'])
  # vec.append(df['Sitting height-2.0'])
  vec.append(df['Weight-2.0'])
  vec.append(df['Ventricular rate-2.0'])
  vec.append(df['P duration-2.0'])
  vec.append(df['QRS duration-2.0'])
  # vec.append(df['PQ interval-2.0'])
  # vec.append(df['RR interval-2.0'])
  # vec.append(df['PP interval-2.0'])
  vec.append(df['Cardiac output-2.0'])
  vec.append(df['Cardiac index-2.0'])
  vec.append(df['Average heart rate-2.0'])
  vec.append(df['Body surface area-2.0'])
  vec.append(df['Duration of walks-2.0'])
  vec.append(df['Duration of moderate activity-2.0'])
  vec.append(df['Duration of vigorous activity-2.0'])
  # vec.append(df['Time spent watching television (TV)-2.0'])
  # vec.append(df['Time spent using computer-2.0'])
  # vec.append(df['Time spent driving-2.0'])
  vec.append(df['Heart rate during PWA-2.0'])
  vec.append(df['Systolic brachial blood pressure during PWA-2.0'])
  vec.append(df['Diastolic brachial blood pressure during PWA-2.0'])
  vec.append(df['Peripheral pulse pressure during PWA-2.0'])
  vec.append(df['Central systolic blood pressure during PWA-2.0'])
  vec.append(df['Central pulse pressure during PWA-2.0'])
  vec.append(df['Number of beats in waveform average for PWA-2.0'])
  vec.append(df['Central augmentation pressure during PWA-2.0'])
  vec.append(df['Augmentation index for PWA-2.0'])
  vec.append(df['Cardiac output during PWA-2.0'])
  vec.append(df['End systolic pressure during PWA-2.0'])
  vec.append(df['End systolic pressure index during PWA-2.0'])
  vec.append(df['Total peripheral resistance during PWA-2.0'])
  vec.append(df['Stroke volume during PWA-2.0'])
  # vec.append(df['Mean arterial pressure during PWA-2.0'])
  vec.append(df['Cardiac index during PWA-2.0'])
  vec.append(df['Sleep duration-2.0'])
  vec.append(df['Exposure to tobacco smoke at home-2.0'])
  vec.append(df['Exposure to tobacco smoke outside home-2.0'])
  # vec.append(df['Pack years of smoking-2.0'])
  # vec.append(df['Pack years adult smoking as proportion of life span exposed to smoking-2.0'])
  vec.append(df['LVEDV (mL)'])
  vec.append(df['LVESV (mL)'])
  vec.append(df['LVSV (mL)'])
  vec.append(df['LVEF (%)'])
  vec.append(df['LVCO (L/min)'])
  vec.append(df['LVM (g)'])
  vec.append(df['RVEDV (mL)'])
  vec.append(df['RVESV (mL)'])
  vec.append(df['RVSV (mL)'])
  vec.append(df['RVEF (%)'])
  
  # vec.append(df['Worrier / anxious feelings-2.0'].apply(clean_categorical))
  vec.append(df['Shortness of breath walking on level ground-2.0'].apply(clean_categorical))
  vec.append(df['Sex-0.0'].apply(clean_categorical))
  vec.append(df['Diabetes diagnosis'].apply(clean_categorical))
  vec.append(df['Heart attack diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['Angina diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['Stroke diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['High blood pressure diagnosed by doctor'].apply(clean_categorical))
  vec.append(df['Cholesterol lowering medication regularly taken'].apply(clean_categorical))
  vec.append(df['Blood pressure medication regularly taken'].apply(clean_categorical))
  vec.append(df['Insulin medication regularly taken'].apply(clean_categorical))
  vec.append(df['Hormone replacement therapy medication regularly taken'].apply(clean_categorical))
  vec.append(df['Oral contraceptive pill or minipill medication regularly taken'].apply(clean_categorical))
  # vec.append(df['Pace-maker-2.0'].apply(clean_categorical))
  # vec.append(df['Ever had diabetes (Type I or Type II)-0.0'].apply(clean_categorical))
  vec.append(df['Long-standing illness, disability or infirmity-2.0'].apply(clean_categorical))
  # vec.append(df['Tense / \'highly strung\'-2.0'].apply(clean_categorical))
  vec.append(df['Ever smoked-2.0'].apply(clean_categorical))

  vec.append(df['Sleeplessness / insomnia-2.0'].apply(clean_categorical))
  # vec.append(df['Frequency of heavy DIY in last 4 weeks-2.0'].apply(clean_categorical))
  vec.append(df['Alcohol intake frequency.-2.0'].apply(clean_categorical))
  # vec.append(df['Processed meat intake-2.0'].apply(clean_categorical))
  # vec.append(df['Beef intake-2.0'].apply(clean_categorical))
  # vec.append(df['Pork intake-2.0'].apply(clean_categorical))
  # vec.append(df['Lamb/mutton intake-2.0'].apply(clean_categorical))
  vec.append(df['Overall health rating-2.0'].apply(clean_categorical))
  # vec.append(df['Alcohol usually taken with meals-2.0'].apply(clean_categorical))
  vec.append(df['Alcohol drinker status-2.0'].apply(clean_categorical))
  # vec.append(df['Frequency of drinking alcohol-0.0'].apply(clean_categorical))
  # vec.append(df['Frequency of consuming six or more units of alcohol-0.0'].apply(clean_categorical))
  # vec.append(df['Amount of alcohol drunk on a typical drinking day-0.0'].apply(clean_categorical))
  vec.append(df['Falls in the last year-2.0'].apply(clean_categorical))
  # vec.append(df['Weight change compared with 1 year ago-2.0'].apply(clean_categorical))
  vec.append(df['Number of days/week walked 10+ minutes-2.0'].apply(clean_categorical))
  vec.append(df['Number of days/week of moderate physical activity 10+ minutes-2.0'].apply(clean_categorical))
  vec.append(df['Number of days/week of vigorous physical activity 10+ minutes-2.0'].apply(clean_categorical))
  vec.append(df['Usual walking pace-2.0'].apply(clean_categorical))
  # vec.append(df['Frequency of stair climbing in last 4 weeks-2.0'].apply(clean_categorical))
  # vec.append(df['Frequency of walking for pleasure in last 4 weeks-2.0'].apply(clean_categorical))
  # vec.append(df['Duration walking for pleasure-2.0'].apply(clean_categorical))
  # vec.append(df['Frequency of strenuous sports in last 4 weeks-2.0'].apply(clean_categorical))
  vec.append(df['Duration of strenuous sports-2.0'].apply(clean_categorical))
  # vec.append(df['Duration of light DIY-2.0'].apply(clean_categorical))
  # vec.append(df['Duration of heavy DIY-2.0'].apply(clean_categorical))
  # vec.append(df['Frequency of other exercises in last 4 weeks-2.0'].apply(clean_categorical))
  # vec.append(df['Duration of other exercises-2.0'].apply(clean_categorical))
  vec.append(df['Current tobacco smoking-2.0'].apply(clean_categorical))
  vec.append(df['Past tobacco smoking-2.0'].apply(clean_categorical))
  # vec.append(df['Smoking/smokers in household-2.0'].apply(clean_categorical))
  vec.append(df['Smoking status-2.0'].apply(clean_categorical))
  return vec

In [564]:
cardiac_features_encoding_df = cardiac_features_to_vector_no_onehot_df(data_df)
cardiac_features_encoding_df = pd.concat(cardiac_features_encoding_df,axis=1)
cardiac_features_encoding_df = cardiac_features_encoding_df.reset_index(drop=True)

In [565]:
print(len(cardiac_features_encoding_df))
print(len(cardiac_features_encoding_df.columns))

442554
77


In [566]:
# one_based_features = ['Sleeplessness / insomnia-2.0','Alcohol intake frequency.-2.0','Overall health rating-2.0','Frequency of consuming six or more units of alcohol-0.0','Falls in the last year-2.0','Usual walking pace-2.0','Past tobacco smoking-2.0']
one_based_features = ['Sleeplessness / insomnia-2.0','Alcohol intake frequency.-2.0','Overall health rating-2.0','Falls in the last year-2.0','Usual walking pace-2.0','Past tobacco smoking-2.0']
cardiac_features_encoding_df[one_based_features] = cardiac_features_encoding_df[one_based_features] - 1

In [567]:
if SAVE:
    cardiac_features_encoding_df.to_csv(join(BASE_PATH,'cardiac_feature_18545_vector_noOH.csv'),index=False)
else:
    cardiac_features_encoding_df = pd.read_csv(join(BASE_PATH,'cardiac_feature_18545_vector_noOH.csv'))
print(SAVE)

True


### Labels

In [568]:
field_name = 'Diagnoses - ICD10'
infarction_codes = set(['I210', 'I211', 'I212', 'I213', 'I214', 'I219', 'I252'])
CAD_codes = set(['I200', 'I201', 'I208', 'I209', 
                 'I220', 'I221', 'I228', 'I229',
                 'I210', 'I211', 'I212', 'I213', 'I214', 'I219',
                 'I240', 'I248', 'I249'
                 'I250', 'I251', 'I252', 'I253', 'I254', 'I255', 'I256', 'I258', 'I259'])
dilated_cardiomyopathy = ['I420']
heart_failure = ['I500', 'I501', 'I509']
atherosclerosis = ['I700', 'I7000', 'I7001', 'I701', 'I7010', 'I7011', 'I702', 'I7020', 'I7021', 'I708', 'I7080', 'I7081', 'I709', 'I7090', 'I7091']
arterial_embolism_thrombosis = ['I740', 'I741', 'I742', 'I743', 'I744', 'I745', 'I748', 'I749']

broad_CAD = []
for c in [CAD_codes,dilated_cardiomyopathy,heart_failure,atherosclerosis,arterial_embolism_thrombosis]:
    broad_CAD.extend(c)
    
# Cerebral infarction as indication of poor artery health?
data_df = data_df.reset_index(drop=True)
cardiac_features_encoding_df = cardiac_features_encoding_df.reset_index(drop=True)

for target_codes, name in [(infarction_codes,'Infarction'), (CAD_codes, 'CAD'), (broad_CAD, 'CAD_broad')]:
    superset = pd.Series([False for i in range(len(cardiac_features_encoding_df))])
    for i in range(223):  # 243
        field_id = f'{field_name}-0.{i}'
        superset = (superset | (data_df[field_id].isin(target_codes)))
    print(sum(superset))
    cardiac_features_encoding_df[name] = superset.astype(int)

cardiac_features_encoding_df.loc[cardiac_features_encoding_df['Heart attack diagnosed by doctor']!=cardiac_features_encoding_df['Infarction'],'Infarction']=1 # Some had heart attack diagnosed but not ICD code

def det_htn(row):
  sys_limit = 140
  dia_limit = 90

  if ((row['Systolic blood pressure-2.mean']>sys_limit) and (row['Diastolic blood pressure-2.mean']>dia_limit)) or (row['High blood pressure diagnosed by doctor']==1)or (row['Blood pressure medication regularly taken']==1):
    return 1
  else:
    return 0


cardiac_features_encoding_df['Hypertension'] = cardiac_features_encoding_df.apply(lambda row: det_htn(row),axis=1)

SAVE=True
if SAVE:
  cardiac_features_encoding_df.to_csv(join(BASE_PATH,'cardiac_feature_18545_vector_labeled_noOH.csv'),index=False)
else:
  cardiac_features_encoding_df = pd.read_csv(join(BASE_PATH,'cardiac_feature_18545_vector_labeled_noOH.csv'))

20666
48285
55119


In [569]:
print(len(cardiac_features_encoding_df))
print(len(cardiac_features_encoding_df.columns))

442554
81
