# Example query with pyspark

In [None]:
import dxpy
import dxdata

import pandas as pd
import subprocess
import glob
import os
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Spark initialization. Run this cell only once per kernel session; do not rerun it
# unless you first select Kernel -> Restart kernel.
# The Kryo serializer buffer limit is raised here; otherwise the later toPandas()
# calls fail with an error.
# NOTE(review): "1024" carries no unit suffix -- presumably interpreted as MiB; confirm
# against the Spark configuration docs.
conf = pyspark.SparkConf().set("spark.kryoserializer.buffer.max", "1024")

# Build the Spark context from that configuration, then the session and SQL context on top of it.
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)
# NOTE(review): sqlContext is not referenced again in the visible part of the notebook.
sqlContext = SQLContext(sc)

In [None]:
# Show the installed dxdata version (displayed as the cell output).
dxdata.__version__

In [None]:
# Suppress every warning raised from this point on.
import warnings
warnings.simplefilter('ignore')

# To see warnings again in subsequent cells, re-enable them after your code:
# warnings.simplefilter('default')

In [None]:
# Automatically discover the dispensed database name and dataset id.

# Database: glob-match 'app*' in the root folder; describe=True so the name is returned.
dispensed_database = dxpy.find_one_data_object(
    classname='database', name='app*', name_mode='glob', folder='/', describe=True
)
dispensed_database_name = dispensed_database['describe']['name']

# Dataset: glob-match 'app*.dataset' in the root folder; only its id is needed.
dispensed_dataset = dxpy.find_one_data_object(
    typename='Dataset', name='app*.dataset', name_mode='glob', folder='/'
)
dispensed_dataset_id = dispensed_dataset['id']

## Access dataset

In [None]:
# Load the dataset identified by the dataset id discovered above.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [None]:
# List the entities of the loaded dataset (displayed as the cell output).
dataset.entities

In [None]:
# Select the 'participant' entity; the field-lookup helpers and the clinical query use it.
participant = dataset['participant']

In [None]:
def fields_for_id(field_id):
    """Return all field objects for a given UKB showcase field id.

    Searches the ``participant`` entity for every field named
    ``p<field_id>``, optionally followed by an instance suffix (``_i<n>``)
    and/or an array suffix (``_a<n>``).

    Args:
        field_id: Showcase field id. It is converted with ``str()``, so
            ``41270`` and ``'41270'`` are equivalent.

    Returns:
        The matching field objects as a list in natural name order:
        numeric parts compare as numbers, so ``_a2`` sorts before ``_a10``.
    """
    # Function-scope import, as in the rest of this notebook's helpers.
    # distutils.version.LooseVersion is avoided: distutils is deprecated
    # (PEP 632) and removed in Python 3.12.
    import re

    def natural_key(field):
        # re.split with a capturing group alternates text / digit chunks, so
        # odd positions are always digit runs. Text is therefore only ever
        # compared with text and numbers with numbers (no TypeError).
        chunks = re.split(r'(\d+)', field.name)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))
    return sorted(fields, key=natural_key)

def field_names_for_id(field_id):
    """Return the names of all fields for a given UKB showcase field id."""
    matching_fields = fields_for_id(field_id)
    return [field.name for field in matching_fields]

Obtain the field names for all instances and array indices of each field_id.

### Clinical variables query
Ref: J Am Coll Cardiol. 2018 Oct 16;72(16):1883-1893. 
https://pubmed.ncbi.nlm.nih.gov/30309464/

In [None]:
# example query
field_ids = ['2443', '20116']
# Flatten the per-id name lists into one list (displayed as the cell output).
[name for field_id in field_ids for name in field_names_for_id(field_id)]

#### Looking up fields by title keyword

If you remember part of the field title, use these:

In [None]:
def fields_by_title_keyword(keyword):
    """Return all field objects whose title contains the given keyword.

    The match is a case-insensitive substring test against each field's
    title on the ``participant`` entity.

    Args:
        keyword: Text to look for in the field titles.

    Returns:
        The matching field objects as a list in natural name order:
        numeric parts compare as numbers, so ``_a2`` sorts before ``_a10``.
    """
    # Function-scope import, as in the rest of this notebook's helpers.
    # distutils.version.LooseVersion is avoided: distutils is deprecated
    # (PEP 632) and removed in Python 3.12.
    import re

    def natural_key(field):
        # re.split with a capturing group alternates text / digit chunks, so
        # odd positions are always digit runs. Text is therefore only ever
        # compared with text and numbers with numbers (no TypeError).
        chunks = re.split(r'(\d+)', field.name)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    # Lower-case the keyword once instead of once per field.
    needle = keyword.lower()
    fields = participant.find_fields(lambda f: needle in f.title.lower())
    return sorted(fields, key=natural_key)

def field_names_by_title_keyword(keyword):
    """Return the names of all fields whose title contains the given keyword."""
    matching_fields = fields_by_title_keyword(keyword)
    return [field.name for field in matching_fields]

def field_titles_by_title_keyword(keyword):
    """Return the titles of all fields whose title contains the given keyword."""
    matching_fields = fields_by_title_keyword(keyword)
    return [field.title for field in matching_fields]

# Further information: https://github.com/dnanexus/OpenBio/blob/master/UKB_notebooks/ukb-rap-pheno-basic.ipynb

### Grabbing fields into a Spark DataFrame

## Extract clinical data 

In [None]:
# example query
# 41270 = ICD10; 41280 = Date of first in-patient diagnosis
field_names = [
    'eid',
    'p31',  # sex
    'p21022',  # age at recruitment
    'p21001_i0',  # BMI
    'p54_i0',  # UK Biobank assessment centre
    'p53_i0',  # Date of attending assessment centre p53_i0
    # Every array column of the two diagnosis fields, in the same order as before.
    *field_names_for_id('41270'),
    *field_names_for_id('41280'),
]

• Further information on Date of first in-patient diagnosis (Field 41280) can be found at https://biobank.ndph.ox.ac.uk/crystal/field.cgi?id=41280
The corresponding ICD-10 diagnosis codes can be found in Field 41270, and the two fields can be linked using the array structure.

• CAD definition and risk factors:
J Am Coll Cardiol. 2018 Oct 16;72(16):1883-1893. 
https://pubmed.ncbi.nlm.nih.gov/30309464/


## Grabbing fields into a Spark DataFrame

In [None]:
# Grabbing fields into a Spark DataFrame:
# retrieve the columns listed in field_names from the participant entity
# through a fresh dxdata connection.
df = participant.retrieve_fields(names=field_names, engine=dxdata.connect())

In [None]:
# See the first five entries as a Pandas DataFrame.
# limit(5) is applied before toPandas(), so only those five rows are converted.
df.limit(5).toPandas()

In [None]:
# If the preview above looks good, convert the entire Spark DataFrame to a pandas DataFrame.
pdf = df.toPandas()

In [None]:
# Print the column labels of the converted pandas DataFrame as a quick check.
print(pdf.columns)

In [None]:
# Save the clinical data as a tab-separated file, without the pandas index column.
pdf.to_csv('clinical_data.tsv', sep='\t', index=False)

### Step 3: Extract operation records from dataset['hesin_oper']

In [None]:
# Select the 'hesin_oper' entity (the operation records named in the Step 3 heading).
operation = dataset['hesin_oper']

In [None]:
# List every field of the hesin_oper entity (displayed as the cell output).
# operation.fields[1:5] # check -- alternative that shows only a slice
operation.fields

In [None]:
# Collect the name of every field on the hesin_oper entity and print them.
operation_all_field_names = [field.name for field in operation.fields]
print(operation_all_field_names)

In [None]:
# Retrieve all hesin_oper fields through a fresh dxdata connection.
df_ope = operation.retrieve_fields(names=operation_all_field_names, engine=dxdata.connect())

In [None]:
# Peek at the first five rows of the operation table.
df_ope.head(5)

In [None]:
# Same five-row preview, converted to pandas for a tabular display.
df_ope.limit(5).toPandas().head() # check

In [None]:
# Convert the full operation table to pandas first; without this assignment
# pdf_ope is undefined and the export below raises NameError.
pdf_ope = df_ope.toPandas()
# Save the operation records as a tab-separated file, without the pandas index column.
pdf_ope.to_csv('operation_data.tsv', sep='\t', index=False)

In [None]:
# Retrieve hesin_diag, which contains ICD10
hesin_diag = dataset['hesin_diag']
hesin_diag_all_field_names = [field.name for field in hesin_diag.fields]
print(hesin_diag_all_field_names)
df_hesin_diag = hesin_diag.retrieve_fields(
    names=hesin_diag_all_field_names,
    engine=dxdata.connect(),
)
df_hesin_diag.limit(5).toPandas().head()  # check

In [None]:
# Convert the hesin_diag Spark DataFrame to pandas; without this assignment
# pdf_hesin_diag is undefined here and in the TSV export that follows.
pdf_hesin_diag = df_hesin_diag.toPandas()
print(pdf_hesin_diag.shape)

In [None]:
# Save the hesin_diag table as a tab-separated file, without the pandas index column.
pdf_hesin_diag.to_csv('operation_icd10_data.tsv', sep = '\t', index = False)

# Retrieve the 'hesin' entity with all of its fields.
# NOTE(review): this loads 'hesin', not the death record (that is Step 4 below).
# NOTE(review): df_hesin is not converted or saved anywhere in the visible notebook.
hesin = dataset['hesin']
hesin_all_field_names = [f.name for f in hesin.fields]
print(hesin_all_field_names)
df_hesin = hesin.retrieve_fields(names=hesin_all_field_names, engine=dxdata.connect())
df_hesin.limit(5).toPandas().head() # check

#### Step 4: Extract death record
4.1. Entity `death`
4.2. Entity `death_cause`

In [None]:
# Select the 'death' entity (Step 4.1).
death = dataset['death']

In [None]:
# List every field of the death entity (displayed as the cell output).
death.fields

In [None]:
# Retrieve every field of the death entity and peek at the first five rows.
death_all_field_names = [field.name for field in death.fields]
print(death_all_field_names)
df_death = death.retrieve_fields(
    names=death_all_field_names,
    engine=dxdata.connect(),
)
df_death.head(5)

In [None]:
# Convert the full death table from a Spark DataFrame to a pandas DataFrame.
pdf_death = df_death.toPandas() # convert to pandas data frame

In [None]:
# Save the death records as a tab-separated file, without the pandas index column.
pdf_death.to_csv('death_data.tsv', sep= '\t', index= False)

In [None]:
# Select the 'death_cause' entity (Step 4.2); without this assignment
# death_cause is undefined here and in the retrieval cell that follows.
death_cause = dataset['death_cause']
print(death_cause.fields)

In [None]:
# Retrieve every field of the death_cause entity and peek at the first five rows.
death_cause_all_field_names = [field.name for field in death_cause.fields]
print(death_cause_all_field_names)
df_death_cause = death_cause.retrieve_fields(
    names=death_cause_all_field_names,
    engine=dxdata.connect(),
)
df_death_cause.head(5)

In [None]:
# Convert the full death_cause table from a Spark DataFrame to a pandas DataFrame.
pdf_death_cause = df_death_cause.toPandas() # convert to pandas data frame

In [None]:
# Save the death-cause records as a tab-separated file, without the pandas index column.
pdf_death_cause.to_csv('death_cause_data.tsv', sep= '\t', index= False)

### upload the resultant files and the current notebook (after saving it)

In [None]:
%%bash
# Upload each exported TSV, one dx upload call per file, to the same project folder.
for tsv in clinical_data.tsv operation_data.tsv operation_icd10_data.tsv death_data.tsv death_cause_data.tsv; do
    dx upload "$tsv" --dest UKB:/data/03.incident_CAD/
done

References:

https://github.com/dnanexus/OpenBio/blob/master/UKB_notebooks/ukb-rap-pheno-basic.ipynb

https://github.com/dnanexus/UKB_RAP/blob/main/proteomics/0_extract_phenotype_protein_data.ipynb