### eMERGE MDD definition
Algorithm

1. Has at least one instance (ever) of any diagnosis code for depression, and
2. Fails to qualify by the 2/30/180 rule (at least 2 diagnosis dates that are at least 30 and no more than 180 days apart) for a diagnosis of depression with psychosis, and
3. Qualifies by the 2/30/180 rule for a diagnosis of major depression

In [None]:
%reload_ext google.cloud.bigquery
from datetime import date
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
from IPython.display import display, HTML
import os

# Locate the workspace CDR. The value of WORKSPACE_CDR is split on "."; the
# first component is used as the table-path prefix and the second as the CDR
# version in every `prefix.CDR_version.table` reference below.
dataset = os.getenv("WORKSPACE_CDR")
if not dataset:
    # Fail with a clear message instead of AttributeError on None.split().
    raise RuntimeError(
        "Environment variable WORKSPACE_CDR is not set; cannot locate the CDR dataset."
    )
CDR_split = dataset.split(".")
if len(CDR_split) < 2:
    raise ValueError(
        "WORKSPACE_CDR must contain at least two dot-separated parts, got: " + repr(dataset)
    )
CDR_version = CDR_split[1]
prefix = CDR_split[0]

In [None]:
# Demographics for every person flagged with has_whole_genome_variant = 1 in
# cb_search_person; this is the genotyped cohort used for cases and controls.
query = f"""SELECT DISTINCT person.person_id
    ,gender_concept_id,race_concept_id,year_of_birth
    FROM 
    `{prefix}.{CDR_version}.person` person 
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `{prefix}.{CDR_version}.cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `{prefix}.{CDR_version}.cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) )"""

# The BigQuery Storage API is used only when the environment advertises it.
demo_patients = pd.read_gbq(
    query,
    dialect="standard",
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook",
)

In [None]:
# Type 1 (depression with psychosis) codes, rendered as a comma-separated list
# of single-quoted SQL string literals.
type1_icd9 = ",".join(f"'{code}'" for code in ("296.34", "298"))
type1_icd10 = ",".join(f"'{code}'" for code in ("F32.3", "F33.3"))

In [None]:
# Type 2 (major depression) codes, rendered as a comma-separated list of
# single-quoted SQL string literals.
type2_icd9 = ",".join(
    f"'{code}'"
    for code in (
        "296.2", "296.21", "296.22", "296.23", "296.25", "296.26",
        "296.3", "296.31", "296.32", "296.33", "296.35", "296.36",
    )
)
type2_icd10 = ",".join(
    f"'{code}'"
    for code in (
        "F32.0", "F32.1", "F32.2", "F32.4", "F32.5", "F32.9",
        "F33.0", "F33.1", "F33.2", "F33.40", "F33.41", "F33.42",
        "F33.9",
    )
)

In [None]:
# Type 3 codes as quoted, comma-separated SQL literals.
# NOTE(review): the notebook never labels this group; presumably the remaining
# depression-related diagnoses - confirm against the phenotype definition.
type3_icd9 = "'{}'".format("','".join(["311", "300.4", "309.1"]))
type3_icd10 = "'{}'".format("','".join(["F32.89", "F33.8", "F43.21"]))

In [None]:
# Additional codes that disqualify a person from being a control, rendered as
# quoted, comma-separated SQL literals.
exlude_from_control_icd9 = ",".join(
    f"'{code}'" for code in ("296.24", "296.82", "301.12", "301.13", "309")
)
exlude_from_control_icd10 = ",".join(
    f"'{code}'"
    for code in (
        "F06.32", "F06.31", "F32.81", "F34.0", "F34.1",
        "F34.81", "F34.89", "F34.9", "F39", "F43.23",
        "F53",
    )
)

In [None]:
# Every depression code used for case finding (the type 1, type 2 and type 3
# lists combined), as quoted, comma-separated SQL literals.
all_codes_icd9 = ",".join(
    f"'{code}'"
    for code in (
        "296.34", "298",
        "296.2", "296.21", "296.22", "296.23", "296.25", "296.26",
        "296.3", "296.31", "296.32", "296.33", "296.35", "296.36",
        "311", "300.4", "309.1",
    )
)
all_codes_icd10 = ",".join(
    f"'{code}'"
    for code in (
        "F32.3", "F33.3",
        "F32.0", "F32.1", "F32.2", "F32.4", "F32.5", "F32.9",
        "F33.0", "F33.1", "F33.2", "F33.40", "F33.41", "F33.42", "F33.9",
        "F32.89", "F33.8", "F43.21",
    )
)

In [None]:
# Pull every condition_occurrence row whose source concept is one of the
# depression codes (ICD9CM or ICD10CM), joined to its concept metadata.
query = f"""
SELECT distinct * 
FROM 
    (SELECT DISTINCT person_id, condition_source_concept_id, condition_source_value, condition_start_date 
        FROM `{prefix}.{CDR_version}.condition_occurrence`) AS cond 
     INNER JOIN 
        (SELECT DISTINCT concept_id, concept_name, concept_code, vocabulary_id 
            FROM `{prefix}.{CDR_version}.concept` 
            where (concept_code in ({all_codes_icd9}) 
            and vocabulary_id ='ICD9CM') or (concept_code in ({all_codes_icd10})
            and vocabulary_id ='ICD10CM')) as concept 
            on concept.concept_id = cond.condition_source_concept_id
"""
all_dep_codes = pd.read_gbq(query, dialect="standard")

In [None]:
def _aggregate(x):
        d = {}
        distances =x.condition_start_date.diff()
        d['at_least_30_days_and_no_more_than_180_days'] = np.any((np.abs(distances.dt.days)>=30) & 
                                                                 (np.abs(distances.dt.days)<=180))
        d['Count'] = len(x)
        d['condition_start_date'] = x['condition_start_date'].min()
        return pd.Series(d, index=['at_least_30_days_and_no_more_than_180_days', 'Count', 'condition_start_date'])

# Find, and then exclude, all patients that qualify for depression with
# psychosis by the 2/30/180 rule (type 1).
type1_inds = all_dep_codes.loc[all_dep_codes['concept_code'].isin(['296.34', '298',
                                                                  'F32.3', 'F33.3'])]
# One row per person/date/code, ordered by date. sort_values returns a new
# frame here, so nothing is modified in place on a frame derived from a slice.
type1_inds_nodups = (
    type1_inds
    .drop_duplicates(subset=['person_id', 'condition_start_date', 'concept_code'])
    .sort_values(by='condition_start_date')
)
# Collapse to one summary row per person.
type1_inds_nodups = type1_inds_nodups.groupby(['person_id'], as_index=False).apply(_aggregate)
type1_final = type1_inds_nodups[(type1_inds_nodups.Count>=2)&
                                (type1_inds_nodups.at_least_30_days_and_no_more_than_180_days)]

# Keep every depression row belonging to a person who did NOT qualify as type 1.
dep_cases_notype1 = all_dep_codes.loc[~all_dep_codes['person_id'].isin(type1_final['person_id'])]

In [None]:
# Qualifies for MDD by the 2/30/180 rule (type 2).
# The mask is built from dep_cases_notype1 itself; a mask taken from
# all_dep_codes only works through implicit index alignment with the subset.
dep_cases_notype1 = dep_cases_notype1.loc[dep_cases_notype1['concept_code'].isin([
    "296.2", "296.21", "296.22", "296.23",
    "296.25", "296.26", "296.3", "296.31",
    "296.32", "296.33", "296.35", "296.36",
    "F32.0", "F32.1", "F32.2", "F32.4",
    "F32.5", "F32.9", "F33.0", "F33.1",
    "F33.2", "F33.40", "F33.41", "F33.42",
    "F33.9"])]
# One row per person/date/code, ordered by date; sorted without inplace=True
# so no frame derived from a slice is mutated.
type2_nodups = (
    dep_cases_notype1
    .drop_duplicates(subset=['person_id', 'condition_start_date', 'concept_code'])
    .sort_values(by='condition_start_date')
)
# Collapse to one summary row per person, then apply the 2/30/180 criteria.
type2_nodups = type2_nodups.groupby(['person_id'], as_index=False).apply(_aggregate)
type2_final = type2_nodups[(type2_nodups.Count>=2)&
                                (type2_nodups.at_least_30_days_and_no_more_than_180_days)]


In [None]:
# Restrict the qualifying MDD cases to people in the genotyped cohort, report
# how many remain, and save them.
cases = type2_final.loc[type2_final["person_id"].isin(demo_patients["person_id"])]
print(cases.shape)
cases.to_csv(path_or_buf="MDD_cases_emerge_algorithm.csv")

In [None]:
## Get Controls

# Codes whose presence rules a person out as a control: every case-finding
# depression code plus the extra exclude-from-control codes, rendered as
# quoted, comma-separated SQL literals.
all_codes_icd9_for_controls = ",".join(
    f"'{code}'"
    for code in (
        "296.34", "298",
        "296.2", "296.21", "296.22", "296.23", "296.25", "296.26",
        "296.3", "296.31", "296.32", "296.33", "296.35", "296.36",
        "311", "300.4", "309.1",
        "296.24", "296.82", "301.12", "301.13", "309",
    )
)
all_codes_icd10_for_controls = ",".join(
    f"'{code}'"
    for code in (
        "F32.3", "F33.3",
        "F32.0", "F32.1", "F32.2", "F32.4", "F32.5", "F32.9",
        "F33.0", "F33.1", "F33.2", "F33.40", "F33.41", "F33.42", "F33.9",
        "F32.89", "F33.8", "F43.21",
        "F06.32", "F06.31", "F32.81", "F34.0", "F34.1",
        "F34.81", "F34.89", "F34.9", "F39", "F43.23",
        "F53",
    )
)

In [None]:
# Person ids of everyone with at least one condition_occurrence row whose
# source concept is in the control-exclusion code lists (ICD9CM or ICD10CM).
query = f"""
SELECT distinct person_id  
FROM 
    (SELECT DISTINCT person_id, condition_source_concept_id, condition_source_value 
        FROM `{prefix}.{CDR_version}.condition_occurrence`) AS cond 
     INNER JOIN 
        (SELECT DISTINCT concept_id, concept_name, concept_code, vocabulary_id 
            FROM `{prefix}.{CDR_version}.concept` 
            where (concept_code in ({all_codes_icd9_for_controls}) 
            and vocabulary_id ='ICD9CM') or (concept_code in ({all_codes_icd10_for_controls})
            and vocabulary_id ='ICD10CM')) as concept 
            on concept.concept_id = cond.condition_source_concept_id
"""
control_dep_codes = pd.read_gbq(query, dialect="standard")

In [None]:
# Controls are the genotyped individuals with none of the control-exclusion
# diagnosis codes on record.
controls = demo_patients[~demo_patients["person_id"].isin(control_dep_codes['person_id'])]
# A bare `controls.shape` in the middle of a cell is never displayed; print it,
# as the cases cell does.
print(controls.shape)
controls.to_csv("./Controls_emerge_definition.csv")

In [None]:
# Label controls 0 and cases 1, keep only the id and label, and stack them.
# assign() builds new frames, so no column is written into a filtered slice
# of another DataFrame (which triggers SettingWithCopyWarning).
controls = controls.assign(Case=0)[['person_id', 'Case']]
cases = cases.assign(Case=1)[['person_id', 'Case']]
cases_and_controls = pd.concat([cases, controls])

In [None]:
# Save the combined person_id / Case table.
cases_and_controls.to_csv(path_or_buf="MDDcases_control_emerge_definition.csv")