### This notebook consists of four sections:
### Step 1: Extract clinical phenotype data from dataset['participant'] 
### Step 2: Extract Olink proteomics from dataset['olink_instance_0']
### Step 3: Extract operation records from dataset['hesin_oper'] (and diagnoses from dataset['hesin_diag'])
### Step 4: Extract death records from dataset['death'] and dataset['death_cause']

In [None]:
# Import packages
# dxpy allows python to interact with the platform storage
# Note: This notebook is using spark since the size of the dataset we're extracting
# (i.e. the number of fields) is too large for a single node instance.
import dxpy
import dxdata

import pandas as pd
import subprocess
import glob
import os
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


# Spark initialization.
# getOrCreate() reuses the SparkContext that is already running when this cell is
# executed again, so a re-run no longer fails because a context already exists.
# NOTE: the configuration below is only applied when the context is first created;
# after changing it, select Kernel -> Restart kernel and run this cell again.
# Need to adjust this buffer otherwise will get an error in toPandas() call.
# The value has no unit suffix, which Spark interprets as MiB.
conf = pyspark.SparkConf().set("spark.kryoserializer.buffer.max", "1024")

sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession(sc)
sqlContext = SQLContext(sc)

In [None]:
# Show the installed dxdata version; as the last expression it becomes the cell output.
dxdata.__version__

In [None]:
# silence warning
import warnings
warnings.filterwarnings('ignore')

# Re-enable warnings after your code if you want to see warnings again in subsequent cells
# warnings.filterwarnings('default')

In [None]:
# Automatically discover the dispensed database name and dataset id.

# Database: a data object of class 'database' in the root folder whose name matches
# the glob 'app*'; describe=True makes the result carry the 'describe' metadata
# from which the name is read.
dispensed_database = dxpy.find_one_data_object(
    folder='/',
    classname='database',
    name_mode='glob',
    name='app*',
    describe=True,
)
dispensed_database_name = dispensed_database['describe']['name']

# Dataset: a data object of type 'Dataset' in the root folder whose name matches
# the glob 'app*.dataset'; only its id is needed.
dispensed_dataset = dxpy.find_one_data_object(
    folder='/',
    typename='Dataset',
    name_mode='glob',
    name='app*.dataset',
)
dispensed_dataset_id = dispensed_dataset['id']

## Access dataset

In [None]:
# Load the dataset by the id discovered in the previous cell.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

### Dataset 'entities' are virtual tables linked to one another.

The main entity is 'participant' and corresponds to most pheno fields. Additional entities correspond to linked health care data.
Entities starting with 'hesin' are for hospital records; entities starting with 'gp' are for GP records, etc.

In [None]:
# List the entities of the loaded dataset (shown as the cell output).
dataset.entities

### Step 1: Accessing the main 'participant' entity
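The extraction code follows the DNAnexus OpenBio example notebook [ukb-rap-pheno-basic.ipynb](https://github.com/dnanexus/OpenBio/blob/master/UKB_notebooks/ukb-rap-pheno-basic.ipynb).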

In [None]:
# Handle to the main 'participant' entity; the field lookup helpers below search it.
participant = dataset['participant']

In [None]:
def fields_for_id(field_id):
    """Return all field objects for a given UKB showcase field id.

    Searches the module-level `participant` entity for every field named
    ``p<field_id>``, optionally followed by an instance suffix (``_i<N>``) and/or
    an array suffix (``_a<N>``), and returns the matches in natural order, so that
    ``..._a2`` sorts before ``..._a10``.

    Args:
        field_id: Showcase field id, as an int or a string (e.g. 41270 or '41270').

    Returns:
        list: The matching field objects, sorted by field name.
    """
    # Function-scope import so the helper stays self-contained.
    import re

    def _natural_key(name):
        # re.split with a capturing group yields text, digits, text, digits, ...;
        # the odd positions are the digit runs, which are compared numerically.
        # This replaces distutils.version.LooseVersion: distutils is deprecated
        # (PEP 632) and is no longer part of the standard library from Python 3.12.
        return [int(part) if index % 2 else part
                for index, part in enumerate(re.split(r'(\d+)', name))]

    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))
    return sorted(fields, key=lambda f: _natural_key(f.name))

def field_names_for_id(field_id):
    """Return the names of all fields for a given UKB showcase field id.

    Args:
        field_id: Showcase field id, passed straight through to fields_for_id().

    Returns:
        list: The ``name`` of every field returned by fields_for_id(), in the
        same order.
    """
    names = []
    for field in fields_for_id(field_id):
        names.append(field.name)
    return names

Obtain field name for all instances and arrays for each field_id

### Risk factors
"We defined risk factors at the first assessment as follows: diabetes diagnosed by a doctor (field #2443), BMI (field #21001), current smoking (field #20116), hypertension, family history of heart disease, and high cholesterol. For hypertension we used an expanded definition including self-reported high blood pressure (either on blood pressure medication, data fields #6177, #6153; systolic blood pressure >140 mm Hg, fields #4080, #93; or diastolic blood pressure >90 mm Hg, data fields #4079, #94). For family history of heart disease, we considered history in any first-degree relative (father, mother, sibling; fields #20107, 20110, and 20111, respectively). For high cholesterol, we considered individuals with self-reported high cholesterol at assessment, as well as diagnoses in the HES/death records (ICD-9 272.0; ICD-10 E78.0). For the analyses of the number of elevated risk factors, we considered diagnosed diabetes (yes/no), hypertension at assessment (yes/no), BMI >30 kg/m², smoking at assessment (yes/no), high cholesterol (yes/no), and family history of heart disease (yes/no)."

J Am Coll Cardiol. 2018 Oct 16;72(16):1883-1893. 
https://pubmed.ncbi.nlm.nih.gov/30309464/

In [None]:
# Showcase field ids to look up: risk factors, family history and the Olink plate id
field_ids = [
    '2443', '20116',                   # diabetes, current smoking
    '6177', '6153',                    # blood pressure medication
    '4080', '93', '4079', '94',        # systolic / diastolic blood pressure
    '20107', '20110', '20111',         # family history: father, mother, sibling
    '30901',                           # Olink plate id
]
# Flatten the per-id lists of field names into one list (shown as the cell output)
[name for field_id in field_ids for name in field_names_for_id(field_id)]

#### Looking up fields by title keyword

If you remember part of the field title, use these:

In [None]:
def fields_by_title_keyword(keyword):
    """Return all field objects whose title contains the given keyword.

    The match is a case-insensitive substring test against each field's title in
    the module-level `participant` entity. Results are returned in natural order
    of the field name, so that ``..._a2`` sorts before ``..._a10``.

    Args:
        keyword (str): Text to look for inside the field titles.

    Returns:
        list: The matching field objects, sorted by field name.
    """
    # Function-scope import so the helper stays self-contained.
    import re

    def _natural_key(name):
        # re.split with a capturing group yields text, digits, text, digits, ...;
        # the odd positions are the digit runs, which are compared numerically.
        # This replaces distutils.version.LooseVersion: distutils is deprecated
        # (PEP 632) and is no longer part of the standard library from Python 3.12.
        return [int(part) if index % 2 else part
                for index, part in enumerate(re.split(r'(\d+)', name))]

    # Lower-case the keyword once rather than once per field.
    needle = keyword.lower()
    fields = list(participant.find_fields(lambda f: needle in f.title.lower()))
    return sorted(fields, key=lambda f: _natural_key(f.name))

def field_names_by_title_keyword(keyword):
    """Return the names of all fields whose title contains the given keyword.

    Args:
        keyword: Title keyword, passed straight through to fields_by_title_keyword().

    Returns:
        list: The ``name`` of every field returned by fields_by_title_keyword(),
        in the same order.
    """
    names = []
    for field in fields_by_title_keyword(keyword):
        names.append(field.name)
    return names

def field_titles_by_title_keyword(keyword):
    """Return the titles of all fields whose title contains the given keyword.

    Args:
        keyword: Title keyword, passed straight through to fields_by_title_keyword().

    Returns:
        list: The ``title`` of every field returned by fields_by_title_keyword(),
        in the same order.
    """
    titles = []
    for field in fields_by_title_keyword(keyword):
        titles.append(field.title)
    return titles

# Further information: https://github.com/dnanexus/OpenBio/blob/master/UKB_notebooks/ukb-rap-pheno-basic.ipynb

### Grabbing fields into a Spark DataFrame

The `participant.retrieve_fields` function can be used to construct a Spark DataFrame of the given fields.

By default, this retrieves data as encoded by UK Biobank. For example, field p31 (participant sex) will be returned as an integer column with values of 0 and 1. To receive decoded values, supply the `coding_values='replace'` argument.

For more information, see [Tips for Retrieving Fields](https://dnanexus.gitbook.io/uk-biobank-rap/getting-started/working-with-ukb-data#tips-for-retrieving-fields) in the documentation.

## Extract clinical data 

In [None]:
# Clinical fields to extract from the 'participant' entity.
# All assessment-centre fields are taken from instance 0 (the first assessment),
# which is the time point at which the risk factors are defined.
field_names = (
    ['eid',
     'p31',  # sex
     'p21022',  # age at recruitment
     'p21001_i0',  # BMI
     'p54_i0',  # UK Biobank assessment centre
     'p53_i0',  # Date of attending assessment centre
     'p40000_i0',  # Date of death
     'p191',  # date lost to follow-up
     'p20003_i0_a0',  # medication
     'p6150_i0',  # having had a heart attack diagnosed by a doctor
     'p20002_i0_a0',  # non-cancer illnesses that self-reported as heart attack
     'p20004_i0_a0',  # self-reported operation including PTCA, CABG, or triple heart bypass
     'p2443_i0',  # diabetes
     'p20116_i0',  # current smoking
     'p20160_i0',  # ever smoked
     # 'p6177_i0' used to be listed twice; it is requested once so that the
     # extracted table does not end up with a duplicated column.
     'p6177_i0',  # Medication for cholesterol, blood pressure or diabetes | Instance 0
     'p6153_i0',  # blood pressure medication (second medication field of the hypertension definition)
     'p4080_i0_a0',  # systolic blood pressure
     'p93_i0_a0',  # systolic blood pressure
     'p4079_i0_a0',  # diastolic blood pressure
     'p94_i0_a0',  # diastolic blood pressure
     'p20107_i0',  # illness of father
     # Was 'p20110_i1' (instance 1); changed to instance 0 to match the father and
     # sibling fields and the first-assessment risk-factor definition.
     # NOTE(review): confirm nothing downstream expects the old 'p20110_i1' column.
     'p20110_i0',  # illness of mother
     'p20111_i0',  # illness of sibling
     'p30901_i0',  # olink plateID
     ]
    + field_names_for_id('41270')
    + field_names_for_id('41280')
)

# Guard against the same field being requested twice.
assert len(field_names) == len(set(field_names)), "duplicate entries in field_names"

# 41270 = ICD10; 41280 = Date of first in-patient diagnosis

• Further information on the date of first in-patient diagnosis can be found at https://biobank.ndph.ox.ac.uk/crystal/field.cgi?id=41280.
According to that page, the corresponding ICD-10 diagnosis codes can be found in data-field 41270, and the two fields can be linked using the array structure.

• CAD definition and risk factors:
J Am Coll Cardiol. 2018 Oct 16;72(16):1883-1893. 
https://pubmed.ncbi.nlm.nih.gov/30309464/


## Grabbing fields into a Spark DataFrame
The participant.retrieve_fields function can be used to construct a Spark DataFrame of the given fields.

By default, this retrieves data as encoded by UK Biobank. For example, field p31 (participant sex) will be returned as an integer column with values of 0 and 1. To receive decoded values, supply the coding_values='replace' argument.

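For more information, see [Tips for Retrieving Fields](https://dnanexus.gitbook.io/uk-biobank-rap/getting-started/working-with-ukb-data#tips-for-retrieving-fields) in the documentation.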

In [None]:
# Grabbing fields into a Spark DataFrame
# No coding_values argument is passed, so values come back as encoded by UK Biobank.
df = participant.retrieve_fields(names=field_names, engine=dxdata.connect())

In [None]:
# See the first five entries as a Pandas DataFrame:
# limit(5) restricts the Spark DataFrame before conversion, keeping the preview small.
df.limit(5).toPandas()

In [None]:
# If the preview above looks good, convert the entire Spark DataFrame to a pandas DataFrame.
# `pdf` is the clinical table that is written to clinical_data.tsv further down.
pdf = df.toPandas()

In [None]:
# Print the column names of the extracted clinical table as a sanity check.
print(pdf.columns)

In [None]:
# Saving as TSV file (tab-separated, without the pandas index column).
# This file is uploaded to the project in the last cell of the notebook.
pdf.to_csv('clinical_data.tsv', sep='\t', index=False)

### Step 2: Extract Olink proteomics from dataset['olink_instance_0']

In [None]:
# Handle to the 'olink_instance_0' entity of the dataset.
olink = dataset['olink_instance_0']
# olink.fields # uncomment to list all fields of the entity

In [None]:
# Inspect what kind of object olink.fields returns (the type is shown as the cell output).
temp_list = olink.fields
type(temp_list)

In [None]:
# Names of all fields of the olink entity. This is the same "take f.name of each
# field" pattern as field_names_for_id(), applied to olink.fields directly.
olink_all_field_names = [f.name for f in olink.fields]

In [None]:
# To extract only a few proteins, build a short list instead, e.g. ['eid', 'col6a3'].
# Check: show the first five field names.
olink_all_field_names[:5] # check

In [None]:
# Spark DataFrame holding every field of the olink entity.
dfo = olink.retrieve_fields(names=olink_all_field_names, engine=dxdata.connect())

In [None]:
# to check
# dfo.head(5)

### In case of extracting particular proteins only:

```python
olink_field_names = ['eid', 'col6a3']
dfo = olink.retrieve_fields(names=olink_field_names, engine=dxdata.connect())
```

In [None]:
# See the first five entries as a Pandas DataFrame:
# dfo.limit(5).toPandas()

In [None]:
# Count the rows of the olink Spark DataFrame (shown as the cell output).
dfo.count() # check rows (53016 individual's data)

In [None]:
# Convert the whole olink Spark DataFrame to pandas in one go.
# This is memory intensive; the chunked alternative is in a later cell.
pdfo = dfo.toPandas()

In [None]:
# (rows, columns) of the pandas olink table; the value recorded from a previous run follows.
pdfo.shape # (53016, 2924)

In [None]:
# Alternative approach
# pdfo = dfo.toPandas() is very memory intensive. So instead, the Spark DataFrame can be
# converted to pandas chunk by chunk (if needed).
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Total number of rows in the DataFrame
total_rows = dfo.count()

# Number of chunks
num_chunks = 10

# Rows per chunk, rounded up (ceiling division) so that the last chunk picks up the remainder
rows_per_chunk = (total_rows // num_chunks) + (total_rows % num_chunks > 0)

# Initialize an empty list to store each chunk's pandas DataFrame
chunks_list = []

# Number every row 0 .. total_rows-1 in a 'row_id' column used to cut the chunks.
# The numbered DataFrame is persisted so that all chunks are cut from one and the same
# numbering: without it the plan is re-evaluated for every chunk, and
# monotonically_increasing_id() is documented as non-deterministic.
dfo_with_id = dfo.withColumn(
    "row_id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1
).persist()

try:
    for i in range(num_chunks):
        # Half-open row range [start_index, end_index) of the current chunk
        start_index = i * rows_per_chunk
        end_index = start_index + rows_per_chunk

        # Select the chunk by an explicit range. The previous
        # filter(row_id >= start).limit(n) had no upper bound and no ordering, so
        # which rows limit() returned was not guaranteed.
        chunk_df = dfo_with_id.filter(
            (dfo_with_id.row_id >= start_index) & (dfo_with_id.row_id < end_index)
        ).orderBy("row_id")

        # Convert the chunk to a pandas DataFrame and append to the list
        chunk_pd_df = chunk_df.drop("row_id").toPandas()
        chunks_list.append(chunk_pd_df)
finally:
    # Release the cached copy whether or not the conversion succeeded
    dfo_with_id.unpersist()

# Concatenate all chunks to form the final pandas DataFrame.
# The result goes to `pdfo` (the olink table used by the following cells); it was
# previously written to `pdf`, which overwrote the clinical table from Step 1.
pdfo = pd.concat(chunks_list, ignore_index=True)

# Checking the shape of the final DataFrame
print(pdfo.shape)

In [None]:
# Show the top-left 5 x 5 corner of the olink table as a quick check.
pdfo.iloc[:5, :5] # check

In [None]:
# Save the olink table as a TSV file (tab-separated, without the pandas index column).
pdfo.to_csv('olink_data.tsv', sep='\t', index=False)

### Step 3: Extract operation records from dataset['hesin_oper']

In [None]:
# Handle to the 'hesin_oper' entity (operation records).
operation = dataset['hesin_oper']

In [None]:
# List all fields of the operation entity (shown as the cell output).
# operation.fields[1:5] # shorter check
operation.fields

In [None]:
# Collect and print the names of all fields of the operation entity.
operation_all_field_names = [f.name for f in operation.fields]
print(operation_all_field_names)

In [None]:
# Spark DataFrame holding every field of the operation entity.
df_ope = operation.retrieve_fields(names=operation_all_field_names, engine=dxdata.connect())

In [None]:
# Peek at the first five rows of the operation records (shown as the cell output).
df_ope.head(5)

In [None]:
# Same preview, rendered as a pandas DataFrame; only five rows are converted.
df_ope.limit(5).toPandas().head() # check

In [None]:
# Convert the operation records to pandas chunk by chunk.
# The imports are repeated here so that this cell also runs when the optional olink
# chunking cell above was skipped.
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Total number of rows in the DataFrame
total_rows = df_ope.count()

# Number of chunks
num_chunks = 10

# Rows per chunk, rounded up (ceiling division) so that the last chunk picks up the remainder
rows_per_chunk = (total_rows // num_chunks) + (total_rows % num_chunks > 0)

# Initialize an empty list to store each chunk's pandas DataFrame
chunks_list = []

# Number every row 0 .. total_rows-1 in a 'row_id' column used to cut the chunks.
# The numbered DataFrame is persisted so that all chunks are cut from one and the same
# numbering: without it the plan is re-evaluated for every chunk, and
# monotonically_increasing_id() is documented as non-deterministic.
df_ope_with_id = df_ope.withColumn(
    "row_id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1
).persist()

try:
    for i in range(num_chunks):
        # Half-open row range [start_index, end_index) of the current chunk
        start_index = i * rows_per_chunk
        end_index = start_index + rows_per_chunk

        # Select the chunk by an explicit range. The previous
        # filter(row_id >= start).limit(n) had no upper bound and no ordering, so
        # which rows limit() returned was not guaranteed.
        chunk_df = df_ope_with_id.filter(
            (df_ope_with_id.row_id >= start_index) & (df_ope_with_id.row_id < end_index)
        ).orderBy("row_id")

        # Convert the chunk to a pandas DataFrame and append to the list
        chunk_pd_df = chunk_df.drop("row_id").toPandas()
        chunks_list.append(chunk_pd_df)
finally:
    # Release the cached copy whether or not the conversion succeeded
    df_ope_with_id.unpersist()

# Concatenate all chunks to form the final pandas DataFrame
pdf_ope = pd.concat(chunks_list, ignore_index=True)

# Checking the shape of the final DataFrame
print(pdf_ope.shape)

In [None]:
# Save the operation records as a TSV file (tab-separated, without the pandas index column).
pdf_ope.to_csv('operation_data.tsv', sep = '\t', index = False)

In [None]:
# hesin is optional
# hesin = dataset['hesin']
# hesin_all_field_names = [f.name for f in hesin.fields]
# print(hesin_all_field_names)
# df_hesin = hesin.retrieve_fields(names=hesin_all_field_names, engine=dxdata.connect())
# df_hesin.limit(5).toPandas().head() # check

In [None]:
# Retrieve hesin_diag, which contains ICD10.
# Steps: get the entity, collect and print all of its field names, build a Spark
# DataFrame with every field, then preview five rows as a pandas DataFrame.
hesin_diag = dataset['hesin_diag']
hesin_diag_all_field_names = [f.name for f in hesin_diag.fields]
print(hesin_diag_all_field_names)
df_hesin_diag = hesin_diag.retrieve_fields(names=hesin_diag_all_field_names, engine=dxdata.connect())
df_hesin_diag.limit(5).toPandas().head() # check

In [None]:
# Again, a single toPandas() call is too memory intensive:
# pdf_hesin_diag = df_hesin_diag.dropna(subset = ['diag_icd10']).toPandas()
# so the diagnosis records are converted to pandas chunk by chunk instead.
# The imports are repeated here so that this cell also runs when the earlier
# chunking cells were skipped.
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Total number of rows in the DataFrame
total_rows = df_hesin_diag.count()

# Number of chunks
num_chunks = 10

# Rows per chunk, rounded up (ceiling division) so that the last chunk picks up the remainder
rows_per_chunk = (total_rows // num_chunks) + (total_rows % num_chunks > 0)

# Initialize an empty list to store each chunk's pandas DataFrame
chunks_list = []

# Number every row 0 .. total_rows-1 in a 'row_id' column used to cut the chunks.
# The numbered DataFrame is persisted so that all chunks are cut from one and the same
# numbering: without it the plan is re-evaluated for every chunk, and
# monotonically_increasing_id() is documented as non-deterministic.
df_hesin_diag_with_id = df_hesin_diag.withColumn(
    "row_id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1
).persist()

try:
    for i in range(num_chunks):
        # Half-open row range [start_index, end_index) of the current chunk
        start_index = i * rows_per_chunk
        end_index = start_index + rows_per_chunk

        # Select the chunk by an explicit range. The previous
        # filter(row_id >= start).limit(n) had no upper bound and no ordering, so
        # which rows limit() returned was not guaranteed.
        chunk_df = df_hesin_diag_with_id.filter(
            (df_hesin_diag_with_id.row_id >= start_index)
            & (df_hesin_diag_with_id.row_id < end_index)
        ).orderBy("row_id")

        # Convert the chunk to a pandas DataFrame and append to the list
        chunk_pd_df = chunk_df.drop("row_id").toPandas()
        chunks_list.append(chunk_pd_df)
finally:
    # Release the cached copy whether or not the conversion succeeded
    df_hesin_diag_with_id.unpersist()

# Concatenate all chunks to form the final pandas DataFrame
pdf_hesin_diag = pd.concat(chunks_list, ignore_index=True)

In [None]:
# Print (rows, columns) of the assembled diagnosis table.
print(pdf_hesin_diag.shape)

In [None]:
# Save the hesin_diag table as a TSV file (tab-separated, without the pandas index column).
# Note that the file is named 'operation_icd10_data.tsv' although its rows come from hesin_diag.
pdf_hesin_diag.to_csv('operation_icd10_data.tsv', sep = '\t', index = False)

In [None]:
# merge df_ope and df_hesin_diag (this is too memory intensive)
# pdf_ope_diag = pd.merge(pdf_ope, pdf_hesin_diag, on='eid', how='inner')
# pdf_ope_diag.head() # check
# pdf_ope_diag.to_csv('operation_data_with_icd10.tsv', sep = '\t', index = False)

#### Optional: retrieve the main `hesin` (hospital records) entity

```python
hesin = dataset['hesin']
hesin_all_field_names = [f.name for f in hesin.fields]
print(hesin_all_field_names)
df_hesin = hesin.retrieve_fields(names=hesin_all_field_names, engine=dxdata.connect())
df_hesin.limit(5).toPandas().head() # check
```

#### Step 4: Extract death record
4.1. `<Entity "death">`
4.2. `<Entity "death_cause">`

In [None]:
# 4.1 Handle to the 'death' entity.
death = dataset['death']

In [None]:
# List all fields of the death entity (shown as the cell output).
death.fields

In [None]:
# Collect and print all field names of the death entity, retrieve every field into a
# Spark DataFrame, and peek at the first five rows (shown as the cell output).
death_all_field_names = [f.name for f in death.fields]
print(death_all_field_names)
df_death = death.retrieve_fields(names=death_all_field_names, engine=dxdata.connect())
df_death.head(5)

In [None]:
# Convert the death records to a pandas DataFrame in a single call.
pdf_death = df_death.toPandas() # convert to pandas data frame

In [None]:
# Save the death records as a TSV file (tab-separated, without the pandas index column).
pdf_death.to_csv('death_data.tsv', sep= '\t', index= False)

In [None]:
# 4.2 Handle to the 'death_cause' entity.
death_cause = dataset['death_cause']

In [None]:
# Print all fields of the death_cause entity.
print(death_cause.fields)

In [None]:
# Collect and print all field names of the death_cause entity, retrieve every field
# into a Spark DataFrame, and peek at the first five rows (shown as the cell output).
death_cause_all_field_names = [f.name for f in death_cause.fields]
print(death_cause_all_field_names)
df_death_cause = death_cause.retrieve_fields(names=death_cause_all_field_names, engine=dxdata.connect())
df_death_cause.head(5)

In [None]:
# Convert the death-cause records to a pandas DataFrame in a single call.
pdf_death_cause = df_death_cause.toPandas() # convert to pandas data frame

In [None]:
# Save the death-cause records as a TSV file (tab-separated, without the pandas index column).
pdf_death_cause.to_csv('death_cause_data.tsv', sep= '\t', index= False)

### Upload the resulting files and the current notebook (after saving it)

In [None]:
%%bash
# Upload every result file and this notebook to the same project folder, one at a time.
for f in \
    clinical_data.tsv \
    olink_data.tsv \
    operation_data.tsv \
    operation_icd10_data.tsv \
    death_data.tsv \
    death_cause_data.tsv \
    ukbrap_extract_clinical_olink_ope_data.ipynb
do
    dx upload "$f" --dest UKB:/data/03.incident_CAD/
done