In [None]:
import os
import pandas as pd
from datetime import datetime
# Timestamp captured when this cell is executed.
start = datetime.now()

# Workspace settings read from the environment; each is None when the variable is unset.
bucket = os.environ.get("WORKSPACE_BUCKET")
dataset = os.environ.get("WORKSPACE_CDR")

**Author**: Stephan Cordogan

This document prepares a cohort of individuals for GWAS.  It includes each individual's case/control status, as well as age and sex to be used as covariates.  Meniscus tears are the example disease in this notebook; another disease can be substituted.

This document uses some code from the document 02_Hail_part1_Prepare Phenotype, with Authors: Francis Ratsimbazafy, Jennifer Zhang and Contributors: Christopher Lord, Nicole Deflaux, Kelsey Mayo, Lee Lichtenstein, CH Albach.  

# Create your cohort of WGS individuals with sex and date of birth

In [None]:
# Fully-qualified dataset prefix for every table referenced by the query.
workspace_cdr = os.environ["WORKSPACE_CDR"]

# Cohort query: one row per person flagged has_whole_genome_variant = 1 in
# cb_search_person, returning the birth datetime and the concept name of
# sex_at_birth_concept_id (aliased as dragen_sex_ploidy).
dataset_mt_sql = f"""
    SELECT
        person.person_id,
        person.birth_datetime as date_of_birth,
        p_sex_at_birth_concept.concept_name as dragen_sex_ploidy  
    FROM
        `{workspace_cdr}.person` person 
    LEFT JOIN
        `{workspace_cdr}.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id
    WHERE
        person.PERSON_ID IN (
            SELECT
                distinct person_id  
            FROM
                `{workspace_cdr}.cb_search_person` cb_search_person  
            WHERE
                cb_search_person.person_id IN (
                    SELECT
                        person_id 
                    FROM
                        `{workspace_cdr}.cb_search_person` p 
                    WHERE
                        has_whole_genome_variant = 1 
                ) 
            )"""

# Run the query in BigQuery and load the result into a DataFrame.
dataset_mt_df = pd.read_gbq(
    dataset_mt_sql,
    dialect="standard",
    # The Storage API is used only when the environment variable is present.
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

dataset_mt_df.head(5)

# Below are 3 methods to define cases and controls within your cohort, each increasingly restrictive.  

To execute one of the methods, ensure that the other two methods are commented out (prefixed with `#`) and the chosen method is not.  In my experience, the more restrictive methods yield better results.  The most restrictive method requires a minimum of two separate visits, which has precedent in Tcheandjieu et al. (linked below).  They also required a minimum of two distinct ICD codes, which we did not find to improve performance.  Closer examination of the dataset builder tool showed that many individuals with an ICD code had condition_start_datetime and condition_end_datetime on the same day.

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9419655/

The first method defines cases and controls identically to the dataset builder tool, using concept IDs, which are broad, overarching IDs specific to a disease. This will include all of the patients with moderately associated ICD codes present. In this instance, we use the concept ID for meniscus tears.  The code below is taken directly from the dataset builder tool and is used only to define cases and controls; covariates need to be specified in the cell above.

In [None]:
# Fully-qualified dataset prefix for every table referenced by the query.
workspace_cdr = os.environ["WORKSPACE_CDR"]

# Concept ID that defines a case. The notebook text identifies 4035415 as the
# concept ID for meniscus tears; change it here to study another condition.
case_concept_id = 4035415

# Case query: persons flagged has_whole_genome_variant = 1 who also have an
# event in cb_search_all_events whose concept is selected through cb_criteria
# paths containing the id of the criteria row for case_concept_id.
dataset_hasmt_person_sql = f"""
    SELECT
        person.person_id,
        person.gender_concept_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        person.race_concept_id,
        p_race_concept.concept_name as race,
        person.ethnicity_concept_id,
        p_ethnicity_concept.concept_name as ethnicity,
        person.sex_at_birth_concept_id,
        p_sex_at_birth_concept.concept_name as sex_at_birth 
    FROM
        `{workspace_cdr}.person` person 
    LEFT JOIN
        `{workspace_cdr}.concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `{workspace_cdr}.concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `{workspace_cdr}.concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `{workspace_cdr}.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id  
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `{workspace_cdr}.cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `{workspace_cdr}.cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) 
            AND cb_search_person.person_id IN (SELECT
                criteria.person_id 
            FROM
                (SELECT
                    DISTINCT person_id, entry_date, concept_id 
                FROM
                    `{workspace_cdr}.cb_search_all_events` 
                WHERE
                    (concept_id IN(SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `{workspace_cdr}.cb_criteria` c 
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id       
                        FROM
                            `{workspace_cdr}.cb_criteria` cr       
                        WHERE
                            concept_id IN ({case_concept_id})       
                            AND full_text LIKE '%_rank1]%'      ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1) 
                    AND is_standard = 1 )) criteria ) )"""

# Run the query in BigQuery and load the result into a DataFrame.
dataset_hasmt_person_df = pd.read_gbq(
    dataset_hasmt_person_sql,
    dialect="standard",
    # The Storage API is used only when the environment variable is present.
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

# Collect the person_ids returned by the case query into a set for membership tests.
dataset_hasmt_person_ids = set(dataset_hasmt_person_df['person_id'])

# Build the labelled cohort as a NEW frame: has_mt is 1 when person_id is among
# the case ids and 0 otherwise.
#
# assign() returns a copy, so dataset_mt_df itself is left without a has_mt
# column. Previously mt_final_cohort was just another name for dataset_mt_df,
# which meant that (a) later cells that add columns to mt_final_cohort also
# modified dataset_mt_df, and (b) switching to one of the merge-based methods
# without re-running the cohort query merged onto a frame that already had
# has_mt, producing has_mt_x / has_mt_y columns instead of has_mt.
mt_final_cohort = dataset_mt_df.assign(
    has_mt=dataset_mt_df['person_id'].isin(dataset_hasmt_person_ids).astype(int)
)

The second method defines cases and controls by their ICD and SNOMED codes, systems used by doctors to code medical data.  The presence of an associated code designates an individual as a case. The codes need to be specified in the cell immediately below.  If there is only a single code for ICD9, ICD10, or SNOMED, enter it twice: a one-element Python tuple is formatted with a trailing comma, which breaks the SQL `IN (...)` clause the codes are inserted into.

In [None]:
# Total Meniscus tears
# Diagnosis codes that designate a case, one tuple per vocabulary. Each tuple is
# interpolated into a SQL `IN (...)` clause by the code-based methods below, so
# every tuple needs at least two entries (see the note above this cell).
# Each code is listed once; 'S83.289' used to appear twice in the ICD-10 tuple.
condition_codes_mt_icd9 = ('836.0', '836.1')
condition_codes_mt_icd10 = (
    'S83.28', 'S83.281', 'S83.282',
    'S83.281A', 'S83.282A', 'S83.27', 'S83.289', 'S83.289A', 'S83.271',
    'S83.241A', 'S83.242A', 'S83.221A', 'S83.222A',
)
condition_codes_mt_SNOMED = ('239720000', '302933001', '302932006')

# #LATERAL

# condition_codes_mt_icd9 = tuple(['836.1', '836.1'])
# condition_codes_mt_icd10 = tuple(['S83.28', 'S83.281', 'S83.282', 
#  'S83.281A', 'S83.282A', 'S83.27', 'S83.289', 'S83.289A', 'S83.271'])
# condition_codes_mt_SNOMED = tuple(['302933001','302933001'])

# #MEDIAL

# condition_codes_mt_icd9 = tuple(['836.0', '836.0'])
# condition_codes_mt_icd10 = tuple(['S83.241A', 'S83.242A', 'S83.221A', 'S83.222A', 'S83.289'])
# condition_codes_mt_SNOMED = tuple(['302932006','302932006'])

# #GENERAL

# condition_codes_mt_SNOMED = tuple(['239720000', '239720000'])

In [None]:
# # This code retrieves concept ID for conditions matching those codes, defining the query

# query = f"""                                
# SELECT 
#     c.concept_id
# FROM `{dataset}.concept` c
# JOIN `{dataset}.condition_occurrence` co ON c.concept_id = co.condition_source_concept_id
# WHERE (vocabulary_id='ICD9CM' AND concept_code IN {condition_codes_mt_icd9})
#     OR (vocabulary_id='ICD10CM' AND concept_code IN {condition_codes_mt_icd10}) 
#     OR (vocabulary_id='SNOMED' AND concept_code IN {condition_codes_mt_SNOMED})

# GROUP BY c.concept_name, c.concept_code,c.concept_id
# """

# # This code executes the query

# condition_concepts_mt_df  = pd.read_gbq(query, dialect = "standard")

# # This code retrieves the unique person IDs from the person table for those who have any of the condition codes above, and adds the indicator variable has_mt for them

# query = f"""                                
# SELECT person.person_id, 
#    -- Add an indicator variable.
#     1 AS has_mt
# FROM `{dataset}.person` person
# WHERE
#     person_id IN (SELECT person_id
#                   FROM `{dataset}.condition_occurrence`
#                   WHERE condition_source_concept_id IN {tuple(condition_concepts_mt_df['concept_id'])})
# """
# mt_cohort = pd.read_gbq(query, dialect="standard")

# mt_final_cohort = (dataset_mt_df.merge(mt_cohort, on='person_id', how='left')
#               .fillna(value={'has_mt': 0})
#              )
# mt_final_cohort['has_mt'].value_counts()

The third method defines cases and controls by their ICD and SNOMED codes, with a minimum of two instances of a code for an individual to be classified as a case. This is the most restrictive method.

In [None]:
# # This code retrieves concept ID for conditions matching those codes, defining the query 
# # Individuals included must have 2+ instances of a code

# query = f"""
# SELECT 
#     co.person_id,
#     c.concept_id,
#     COUNT(DISTINCT co.condition_occurrence_id) AS occurrence_count
# FROM `{dataset}.concept` c
# JOIN `{dataset}.condition_occurrence` co ON c.concept_id = co.condition_source_concept_id
# WHERE (vocabulary_id='ICD9CM' AND concept_code IN {condition_codes_mt_icd9})
#     OR (vocabulary_id='ICD10CM' AND concept_code IN {condition_codes_mt_icd10}) 
#     OR (vocabulary_id='SNOMED' AND concept_code IN {condition_codes_mt_SNOMED})
# GROUP BY co.person_id, c.concept_id
# HAVING COUNT(DISTINCT co.condition_occurrence_id) > 1
# """

# # This code executes the query

# condition_concepts_mt_df = pd.read_gbq(query, dialect="standard")

# # This code retrieves the unique person IDs from the person table for those who have any of the condition codes above, and adds the indicator variable has_mt for them

# person_ids = tuple(condition_concepts_mt_df['person_id'].unique())

# query = f"""
# SELECT person.person_id, 
#    -- Add an indicator variable.
#     1 AS has_mt
# FROM `{dataset}.person` person
# WHERE person_id IN {person_ids}
# """

# mt_cohort = pd.read_gbq(query, dialect="standard")

# mt_final_cohort = (dataset_mt_df.merge(mt_cohort, on='person_id', how='left')
#               .fillna(value={'has_mt': 0})
#              )
# mt_final_cohort['has_mt'].value_counts()

In [None]:
# Show the dtype of each column of mt_final_cohort.
mt_final_cohort.dtypes

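This code computes each participant's age as a continuous variable: the time elapsed between their date of birth and the moment the notebook is run, expressed in years (365.25 days per year).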

In [None]:
# Timezone-aware (UTC) instant against which ages are measured.
current_date = pd.Timestamp('now', tz='UTC')

# 'age' is the elapsed time since date_of_birth as a Timedelta;
# 'age_yrs' expresses it in years of 365.25 days.
mt_final_cohort['age'] = current_date - mt_final_cohort['date_of_birth']
mt_final_cohort['age_yrs'] = mt_final_cohort['age'] / pd.Timedelta(days=365.25)
mt_final_cohort.head(5)

In [None]:
# One-hot encode the cohort with person_id kept aside as the index, restore
# person_id as a column, then make sure has_mt is stored as an integer.
demographics = (
    pd.get_dummies(mt_final_cohort.set_index(['person_id']))
    .reset_index()
    .astype({'has_mt': int})
)
demographics.head()

The code below specifies the proportion of the total controls to be included in the GWAS.  The number of controls should be ~4x larger than the number of cases.  There are no consequences to a much larger control cohort other than the increased computational costs. The code below includes 2.5% of overall controls in the GWAS.

In [None]:
# Cases: every row with has_mt == 1 is kept.
df_has_mt_1 = demographics.loc[demographics['has_mt'].eq(1)]

# Controls: a 2.5% sample of the rows with has_mt == 0; the fixed random_state
# makes the draw repeatable.
df_has_mt_0 = demographics.loc[demographics['has_mt'].eq(0)].sample(frac=0.025, random_state=1)

# Stack cases above the sampled controls and renumber the rows from 0.
demographics_reduced = pd.concat([df_has_mt_1, df_has_mt_0], ignore_index=True)


In [None]:
# Column sums of the two sex indicator columns, plus the overall row count.
females = demographics_reduced.dragen_sex_ploidy_Female.sum()
males = demographics_reduced.dragen_sex_ploidy_Male.sum()
total = demographics_reduced.shape[0]
print(females, males, total)

In [None]:
# Number of rows in demographics_reduced (the cases plus the sampled controls).
# The message names the actual frame; it previously referred to "df_reduced",
# which does not exist in this notebook.
num_rows = len(demographics_reduced)
print(f"Number of rows in demographics_reduced: {num_rows}")

The first statement below removes individuals who are not classified as either female or male.

In [None]:
# Keep only the rows whose Female and Male indicator values differ.
filtered_data = demographics_reduced.loc[
    demographics_reduced['dragen_sex_ploidy_Female'].ne(demographics_reduced['dragen_sex_ploidy_Male'])
]

# Pick the phenotype columns, give the sex indicators shorter names, and store
# is_male as an integer.
phenotypes = (
    filtered_data
    .loc[:, ["person_id", "has_mt", "dragen_sex_ploidy_Female", "dragen_sex_ploidy_Male", "age_yrs"]]
    .rename(columns={'dragen_sex_ploidy_Female': 'is_female', 'dragen_sex_ploidy_Male': 'is_male'})
    .astype({'is_male': int})
)

num_rows = phenotypes.shape[0]
print(f"Number of rows in phenotypes: {num_rows}")

In [None]:
phenotypes["person_id"] = phenotypes["person_id"].astype(str)
    
phenotypes.to_csv('genomics_mt_phenotypes.tsv', index=False, sep='\t')

# save phenotypes to the bucket
!gsutil cp 'genomics_mt_phenotypes.tsv' {bucket}/data/

In [None]:
# Count how many rows of phenotypes carry each has_mt value.
phenotypes['has_mt'].value_counts()

In [None]:
# List the contents of the data/ folder under the bucket path held in `bucket`.
!gsutil -m ls {bucket}/data/