# SID Genetics Study Step 1: Phenotype Preparation 

## Objective
The purpose of this notebook is to assign type 2 diabetes mellitus status, find end of follow-up (EoF) dates for possible participants, and find starting eligible statin users and non-users.

# Pulling in Data

**Objective**: The purpose of this section is to load packages and pull in data from the All of Us Research Program (AoURP). The AoURP dataset code (R and SQL) is generated using the AoURP's cohort builder.

In [None]:
# Load packages and citations

# install.packages('allofus')
# install.packages('tidyverse')
# install.packages('bigrquery')

library(allofus)
library(tidyverse)
library(bigrquery)

citation('allofus')
citation('tidyverse')
citation('bigrquery')

In [None]:
# Bring in all AoU participants

# This query represents dataset "T2D Diagnosis Redo" for domain "person" and was generated for All of Us Controlled Tier Dataset v7
# The query reads every row of `person` (there is no WHERE clause) and returns
# person_id, birth_datetime (aliased date_of_birth) and the concept names for
# gender, race, ethnicity and sex at birth, each obtained by a LEFT JOIN of
# `concept` on the corresponding *_concept_id column.
# NOTE: paste() receives a single string, so sep="" has no effect; the SQL text
# is stored exactly as written below.
dataset_84009189_person_sql <- paste("
    SELECT
        person.person_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        p_race_concept.concept_name as race,
        p_ethnicity_concept.concept_name as ethnicity,
        p_sex_at_birth_concept.concept_name as sex_at_birth 
    FROM
        `person` person 
    LEFT JOIN
        `concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
person_84009189_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "person_84009189",
  "person_84009189_*.csv"
)
message(
  str_glue(
    "The data will be written to {person_84009189_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_84009189_person_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = person_84009189_path,
  destination_format = "CSV"
)

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {person_84009189_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces the text columns of the person dataset to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(gender = col_character(), race = col_character(),
                     ethnicity = col_character(), sex_at_birth = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_84009189_person_df <- read_bq_export_from_workspace_bucket(person_84009189_path)

dim(dataset_84009189_person_df)

# head(dataset_84009189_person_df, 5)

In [None]:
# Bring in glucose, fasting glucose, and HbA1c measurements
library(tidyverse)
library(bigrquery)

# This query represents dataset "T2D Diagnosis Redo" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v7
# The query returns person_id, measurement_concept_id, measurement_datetime and
# value_as_number from `measurement`, plus the concept names of the measurement
# and of its unit (LEFT JOINs on `concept`).
# Rows are kept only when measurement_concept_id is returned by the
# `cb_criteria` subquery: standard (is_standard = 1), selectable criteria whose
# `path` contains the id of a criteria row matching one of the seven listed
# concept ids with full_text LIKE '%_rank1]%'.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_84009189_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        measurement.measurement_datetime,
        measurement.value_as_number,
        m_unit.concept_name as unit_concept_name 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (3000483, 3003309, 3004410, 3004501, 3005673, 3007263, 3037110)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
measurement_84009189_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "measurement_84009189",
  "measurement_84009189_*.csv"
)
message(
  str_glue(
    "The data will be written to {measurement_84009189_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_84009189_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_84009189_path,
  destination_format = "CSV"
)

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_84009189_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces the text columns of the measurement dataset to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(standard_concept_name = col_character(),
                     unit_concept_name = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_84009189_measurement_df <- read_bq_export_from_workspace_bucket(measurement_84009189_path)

dim(dataset_84009189_measurement_df)

# head(dataset_84009189_measurement_df, 5)

In [None]:
# Type 1 diabetes medications
library(tidyverse)
library(bigrquery)

# This query represents dataset "T1D Medication" for domain "drug" and was generated for All of Us Controlled Tier Dataset v7
# The query returns person_id, drug_concept_id and the exposure start/end
# datetimes from `drug_exposure`, plus the drug's concept name (LEFT JOIN on
# `concept`).
# Rows are kept only when drug_concept_id is a `descendant_id` in
# `cb_criteria_ancestor` of a standard, selectable `cb_criteria` concept whose
# `path` contains the id of a criteria row matching one of the listed concept
# ids with full_text LIKE '%_rank1]%'.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_82559853_drug_sql <- paste("
    SELECT
        d_exposure.person_id,
        d_exposure.drug_concept_id,
        d_standard_concept.concept_name as standard_concept_name,
        d_exposure.drug_exposure_start_datetime,
        d_exposure.drug_exposure_end_datetime 
    FROM
        ( SELECT
            * 
        FROM
            `drug_exposure` d_exposure 
        WHERE
            (
                drug_concept_id IN (SELECT
                    DISTINCT ca.descendant_id 
                FROM
                    `cb_criteria_ancestor` ca 
                JOIN
                    (SELECT
                        DISTINCT c.concept_id       
                    FROM
                        `cb_criteria` c       
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id             
                        FROM
                            `cb_criteria` cr             
                        WHERE
                            concept_id IN (1502905, 1513876, 1516976, 1517998, 1531601, 1544838, 1550023, 1567198, 1586346, 1596977, 19013951, 35602717, 46221581)             
                            AND full_text LIKE '%_rank1]%'       ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1) b 
                        ON (ca.ancestor_id = b.concept_id)))) d_exposure 
        LEFT JOIN
            `concept` d_standard_concept 
                ON d_exposure.drug_concept_id = d_standard_concept.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
drug_82559853_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "drug_82559853",
  "drug_82559853_*.csv"
)
message(
  str_glue(
    "The data will be written to {drug_82559853_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_82559853_drug_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = drug_82559853_path,
  destination_format = "CSV"
)


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {drug_82559853_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces standard_concept_name to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(standard_concept_name = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_82559853_drug_df <- read_bq_export_from_workspace_bucket(drug_82559853_path)

dim(dataset_82559853_drug_df)

# head(dataset_82559853_drug_df, 5)

In [None]:
# Type 1 diabetes ICD 9 and 10 codes
library(tidyverse)
library(bigrquery)

# This query represents dataset "T1D ICD" for domain "condition" and was generated for All of Us Controlled Tier Dataset v7
# The query returns person_id and the condition start/end datetimes from
# `condition_occurrence`, plus the name and code of the source concept (LEFT
# JOIN on `concept` via condition_source_concept_id).
# Rows are kept only when condition_source_concept_id is returned by the
# `cb_criteria` subquery: non-standard (is_standard = 0), selectable criteria
# whose `path` contains the id of a criteria row matching one of the listed
# concept ids with full_text LIKE '%_rank1]%'.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_15893752_condition_sql <- paste("
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_start_datetime,
        c_occurrence.condition_end_datetime,
        c_source_concept.concept_name as source_concept_name,
        c_source_concept.concept_code as source_concept_code 
    FROM
        ( SELECT
            * 
        FROM
            `condition_occurrence` c_occurrence 
        WHERE
            (
                condition_source_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (35206878, 35206879, 44819501, 44819502, 44819504, 44820682, 44820683, 44820684, 44821787, 44822934, 44822935, 44822936, 44824071, 44825264, 44829881, 44831046, 44832190, 44832191, 44832192, 44833368, 44834549, 44836918, 45533018, 45537960, 45542736, 45547622, 45547623, 45547624, 45552379, 45552381, 45552382, 45552383, 45557110, 45566729, 45576438, 45581350, 45600636, 45600637, 45600638, 45600639, 45600640, 45605398)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 0 
                    AND is_selectable = 1)
            )) c_occurrence 
    LEFT JOIN
        `concept` c_source_concept 
            ON c_occurrence.condition_source_concept_id = c_source_concept.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
condition_15893752_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "condition_15893752",
  "condition_15893752_*.csv"
)
message(
  str_glue(
    "The data will be written to {condition_15893752_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_15893752_condition_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = condition_15893752_path,
  destination_format = "CSV"
)


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {condition_15893752_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces the source concept name and code to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(source_concept_name = col_character(),
                     source_concept_code = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_15893752_condition_df <- read_bq_export_from_workspace_bucket(condition_15893752_path)

dim(dataset_15893752_condition_df)

# head(dataset_15893752_condition_df, 5)

In [None]:
# Type 2 diabetes medications
library(tidyverse)
library(bigrquery)

# This query represents dataset "T2D Medication" for domain "drug" and was generated for All of Us Controlled Tier Dataset v7
# The query returns person_id, drug_concept_id and the exposure start/end
# datetimes from `drug_exposure`, plus the drug's concept name (LEFT JOIN on
# `concept`).
# Rows are kept only when drug_concept_id is a `descendant_id` in
# `cb_criteria_ancestor` of a standard, selectable `cb_criteria` concept whose
# `path` contains the id of a criteria row matching one of the listed concept
# ids with full_text LIKE '%_rank1]%'.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_63474191_drug_sql <- paste("
    SELECT
        d_exposure.person_id,
        d_exposure.drug_concept_id,
        d_standard_concept.concept_name as standard_concept_name,
        d_exposure.drug_exposure_start_datetime,
        d_exposure.drug_exposure_end_datetime 
    FROM
        ( SELECT
            * 
        FROM
            `drug_exposure` d_exposure 
        WHERE
            (
                drug_concept_id IN (SELECT
                    DISTINCT ca.descendant_id 
                FROM
                    `cb_criteria_ancestor` ca 
                JOIN
                    (SELECT
                        DISTINCT c.concept_id       
                    FROM
                        `cb_criteria` c       
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id             
                        FROM
                            `cb_criteria` cr             
                        WHERE
                            concept_id IN (1502809, 1502826, 1503297, 1510202, 1515249, 1516766, 1518148, 1525215, 1529331, 1547504, 1559684, 1560171, 1580747, 1583722, 1594973, 1597756, 40166035, 40170911, 40239216, 43013884, 43526465, 44506754, 44785829, 44816332, 45774435, 45774751, 793143)             
                            AND full_text LIKE '%_rank1]%'       ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1) b 
                        ON (ca.ancestor_id = b.concept_id)))) d_exposure 
        LEFT JOIN
            `concept` d_standard_concept 
                ON d_exposure.drug_concept_id = d_standard_concept.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
drug_63474191_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "drug_63474191",
  "drug_63474191_*.csv"
)
message(
  str_glue(
    "The data will be written to {drug_63474191_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_63474191_drug_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = drug_63474191_path,
  destination_format = "CSV"
)


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {drug_63474191_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces standard_concept_name to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(standard_concept_name = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_63474191_drug_df <- read_bq_export_from_workspace_bucket(drug_63474191_path)

dim(dataset_63474191_drug_df)

# head(dataset_63474191_drug_df, 5)

In [None]:
# Type 2 diabetes ICD 9 and 10 codes
library(tidyverse)
library(bigrquery)

# This query represents dataset "T2D ICD" for domain "condition" and was generated for All of Us Controlled Tier Dataset v7
# The query returns person_id and the condition start/end datetimes from
# `condition_occurrence`, plus the name and code of the source concept (LEFT
# JOIN on `concept` via condition_source_concept_id).
# Rows are kept only when condition_source_concept_id is returned by the
# `cb_criteria` subquery: non-standard (is_standard = 0), selectable criteria
# whose `path` contains the id of a criteria row matching one of the listed
# concept ids with full_text LIKE '%_rank1]%'.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_95475017_condition_sql <- paste("
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_start_datetime,
        c_occurrence.condition_end_datetime,
        c_source_concept.concept_name as source_concept_name,
        c_source_concept.concept_code as source_concept_code 
    FROM
        ( SELECT
            * 
        FROM
            `condition_occurrence` c_occurrence 
        WHERE
            (
                condition_source_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (35206881, 35206882, 44819500, 44824073, 44826460, 44826461, 44827616, 44827617, 44828795, 44829879, 44829882, 44831045, 44831047, 44832193, 44832194, 44833366, 44833367, 44836914, 44836915, 44836916, 45533019, 45533021, 45533023, 45542738, 45547626, 45547627, 45561949, 45566731, 45581352, 45581353, 45581355, 45586139, 45586140, 45591027, 45591031, 45595798, 45595799, 45600642, 45605401, 45605403, 45605405)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 0 
                    AND is_selectable = 1)
            )) c_occurrence 
    LEFT JOIN
        `concept` c_source_concept 
            ON c_occurrence.condition_source_concept_id = c_source_concept.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
condition_95475017_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "condition_95475017",
  "condition_95475017_*.csv"
)
message(
  str_glue(
    "The data will be written to {condition_95475017_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_95475017_condition_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = condition_95475017_path,
  destination_format = "CSV"
)


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {condition_95475017_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces the source concept name and code to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(source_concept_name = col_character(),
                     source_concept_code = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_95475017_condition_df <- read_bq_export_from_workspace_bucket(condition_95475017_path)

dim(dataset_95475017_condition_df)

# head(dataset_95475017_condition_df, 5)

In [None]:
# Rename and check AoU generated data sets
# Each table is copied to a short working name; the number of distinct
# participants and the table dimensions are then displayed as a sanity check.

# All participants
all_person <- dataset_84009189_person_df
n_distinct(all_person$person_id)
dim(all_person)
# head(all_person)

# Glucose, fasting glucose, and HbA1c measurements
all_diabetes_meas <- dataset_84009189_measurement_df
n_distinct(all_diabetes_meas$person_id)
dim(all_diabetes_meas)
# head(all_diabetes_meas)

# Type 1 diabetes medications
t1d_drug <- dataset_82559853_drug_df
n_distinct(t1d_drug$person_id)
dim(t1d_drug)
# head(t1d_drug)

# Type 1 diabetes ICD codes
t1d_icd <- dataset_15893752_condition_df
n_distinct(t1d_icd$person_id)
dim(t1d_icd)
# head(t1d_icd)

# Type 2 diabetes medications
t2d_drug <- dataset_63474191_drug_df
n_distinct(t2d_drug$person_id)
dim(t2d_drug)
# head(t2d_drug)

# Type 2 diabetes ICD codes
t2d_icd <- dataset_95475017_condition_df
n_distinct(t2d_icd$person_id)
dim(t2d_icd)
# head(t2d_icd)

In [None]:
# All of Us Death dataframe
library(tidyverse)
library(bigrquery)

# This query represents dataset "EOF Data Frame" for domain "death" and was generated for All of Us Controlled Tier Dataset v7
# The query returns the seven listed columns for every row of `death`; there
# are no joins and no WHERE clause.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_95436229_death_sql <- paste("
    SELECT
        death.person_id, 
        death.death_date, 
        death.death_datetime, 
        death.death_type_concept_id, 
        death.cause_concept_id, 
        death.cause_source_value, 
        death.cause_source_concept_id 
    FROM
        `death` death", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
death_95436229_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "death_95436229",
  "death_95436229_*.csv"
)
message(
  str_glue(
    "The data will be written to {death_95436229_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_95436229_death_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = death_95436229_path,
  destination_format = "CSV"
)

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {death_95436229_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces cause_source_value to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(cause_source_value = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_95436229_death_df <- read_bq_export_from_workspace_bucket(death_95436229_path)

dim(dataset_95436229_death_df)

# head(dataset_95436229_death_df, 5)

In [None]:
# Statin data frame
library(tidyverse)
library(bigrquery)

# This query represents dataset "Statins" for domain "drug" and was generated for All of Us Controlled Tier Dataset v7
# The query returns person_id and the exposure start/end datetimes from
# `drug_exposure`, plus three concept names obtained by LEFT JOINs on
# `concept`: the drug, the drug type and the drug source concept.
# Rows are kept only when drug_concept_id is a `descendant_id` in
# `cb_criteria_ancestor` of a standard, selectable `cb_criteria` concept whose
# `path` contains the id of a criteria row matching one of the eight listed
# concept ids with full_text LIKE '%_rank1]%'.
# NOTE: paste() receives a single string, so sep="" has no effect.
dataset_97634019_drug_sql <- paste("
    SELECT
        d_exposure.person_id,
        d_standard_concept.concept_name as standard_concept_name,
        d_exposure.drug_exposure_start_datetime,
        d_exposure.drug_exposure_end_datetime,
        d_type.concept_name as drug_type_concept_name,
        d_source_concept.concept_name as source_concept_name 
    FROM
        ( SELECT
            * 
        FROM
            `drug_exposure` d_exposure 
        WHERE
            (
                drug_concept_id IN (SELECT
                    DISTINCT ca.descendant_id 
                FROM
                    `cb_criteria_ancestor` ca 
                JOIN
                    (SELECT
                        DISTINCT c.concept_id       
                    FROM
                        `cb_criteria` c       
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id             
                        FROM
                            `cb_criteria` cr             
                        WHERE
                            concept_id IN (1510813, 1539403, 1545958, 1549686, 1551860, 1592085, 1592180, 40165636)             
                            AND full_text LIKE '%_rank1]%'       ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1) b 
                        ON (ca.ancestor_id = b.concept_id)))) d_exposure 
        LEFT JOIN
            `concept` d_standard_concept 
                ON d_exposure.drug_concept_id = d_standard_concept.concept_id 
        LEFT JOIN
            `concept` d_type 
                ON d_exposure.drug_type_concept_id = d_type.concept_id 
        LEFT JOIN
            `concept` d_source_concept 
                ON d_exposure.drug_source_concept_id = d_source_concept.concept_id", sep="")

# Cloud Storage destination for the CSV shards exported from BigQuery.
# The date component makes same-day exports overwrite each other, while exports
# made on a different day go to a new folder so earlier copies are kept.
drug_97634019_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "drug_97634019",
  "drug_97634019_*.csv"
)
message(
  str_glue(
    "The data will be written to {drug_97634019_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# This only needs to run once; afterwards the CSVs can be read directly.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_97634019_drug_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = drug_97634019_path,
  destination_format = "CSV"
)


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {drug_97634019_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage into one tibble.
#
# export_path: Cloud Storage URI (wildcards allowed) handed to `gsutil ls`.
# col_types:   readr column specification applied to every shard; the default
#              forces the three concept-name columns to character.
# Returns the row-bound contents of all listed shards. Stops with an error when
# the listing fails or matches no files.
read_bq_export_from_workspace_bucket <- function(
    export_path,
    col_types = cols(standard_concept_name = col_character(),
                     drug_type_concept_name = col_character(),
                     source_concept_name = col_character())) {
  # Capture stdout only, so gsutil warnings/errors are never mistaken for
  # shard paths and passed on to `gsutil cat`.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed (status ", ls_status, ") for ", export_path, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because it is interpolated into a shell command line.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
dataset_97634019_drug_df <- read_bq_export_from_workspace_bucket(drug_97634019_path)

dim(dataset_97634019_drug_df)

# head(dataset_97634019_drug_df, 5)

# Implementing T2D Diagnosis Algorithm

**Objective**: The purpose of this section is to determine the type 2 diabetes mellitus (T2D) status of participants in AoU, using Northwestern University's Type 2 Diabetes Mellitus diagnosis algorithm: https://phekb.org/phenotype/type-2-diabetes-mellitus.

In [None]:
# Earliest T2D diagnosis code per participant.
# Output columns: person_id, start_date, reason, value, t2d_icd (always 1).
t2d_icd_status <- t2d_icd %>%
  arrange(person_id, condition_start_datetime) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  select(person_id, start_date = condition_start_datetime) %>%
  mutate(reason = "t2d icd", value = NA, t2d_icd = 1) %>%
  distinct()

length(unique(t2d_icd_status$person_id))
dim(t2d_icd_status)
# head(t2d_icd_status)

In [None]:
# Participants with T2D diagnosis codes at 2+ distinct start datetimes; keep the
# earliest record for each.
# NOTE(review): distinctness is checked on the full datetime, not the calendar
# date -- two codes at different times on the same day would qualify. Confirm
# whether `as.Date()` granularity was intended.
t2d_icd2_status <- t2d_icd %>%
  arrange(person_id, condition_start_datetime) %>%
  group_by(person_id) %>%
  # Two or more distinct values already implies two or more rows.
  filter(n_distinct(condition_start_datetime) >= 2) %>%
  slice_head(n = 1) %>%
  select(person_id, start_date = condition_start_datetime) %>%
  mutate(reason = "t2d icd x2", value = NA, t2d_icd_2x = 1) %>%
  distinct()

length(unique(t2d_icd2_status$person_id))
dim(t2d_icd2_status)
# head(t2d_icd2_status)

In [None]:
# Earliest abnormal diabetes lab per participant.
# A measurement is abnormal when its value crosses the cut-off paired with its
# concept id below (> 200, >= 125 or >= 6.5 depending on the concept).
# `reason` holds the concept id (as text) and `value` the measured number.
t2d_lab_status <- all_diabetes_meas %>%
  filter(
    (measurement_concept_id == 3000483 & value_as_number > 200) |
      (measurement_concept_id == 3004501 & value_as_number > 200) |
      (measurement_concept_id == 3037110 & value_as_number >= 125) |
      (measurement_concept_id == 3004410 & value_as_number >= 6.5) |
      (measurement_concept_id == 3007263 & value_as_number >= 6.5) |
      (measurement_concept_id == 3003309 & value_as_number >= 6.5) |
      (measurement_concept_id == 3005673 & value_as_number >= 6.5)
  ) %>%
  arrange(person_id, measurement_datetime) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  select(person_id,
         start_date = measurement_datetime,
         reason = measurement_concept_id,
         value = value_as_number) %>%
  mutate(reason = as.character(reason), lab_status = 1) %>%
  distinct()

length(unique(t2d_lab_status$person_id))
dim(t2d_lab_status)
# head(t2d_lab_status)

In [None]:
# Earliest T1D medication exposure per participant (flag: t1d_drug = 1).
# NOTE(review): `reason` is labelled "t2d rx" here even though these are T1D
# drugs. The later implicit join with `t2d_drug_status` matches on `reason`, so
# do not relabel this without also giving that join an explicit `by`.
t1d_drug_status <- t1d_drug %>%
  arrange(person_id, drug_exposure_start_datetime) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  select(person_id, start_date = drug_exposure_start_datetime) %>%
  mutate(reason = "t2d rx", value = NA, t1d_drug = 1) %>%
  distinct()

length(unique(t1d_drug_status$person_id))
dim(t1d_drug_status)
# head(t1d_drug_status)

In [None]:
# Earliest T2D medication exposure per participant (flag: t2d_drug = 1).
t2d_drug_status <- t2d_drug %>%
  arrange(person_id, drug_exposure_start_datetime) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  select(person_id, start_date = drug_exposure_start_datetime) %>%
  mutate(reason = "t2d rx", value = NA, t2d_drug = 1) %>%
  distinct()

length(unique(t2d_drug_status$person_id))
dim(t2d_drug_status)
# head(t2d_drug_status)

In [None]:
# Earliest T1D diagnosis code per participant (flag: t1d_icd = 1).
t1d_icd_status <- t1d_icd %>%
  arrange(person_id, condition_start_datetime) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  select(person_id, start_date = condition_start_datetime) %>%
  mutate(reason = "t1d icd", value = NA, t1d_icd = 1) %>%
  distinct()

length(unique(t1d_icd_status$person_id))
dim(t1d_icd_status)
# head(t1d_icd_status)

In [None]:
# Attach each participant's first T1D drug date (as `start_date_t1d`) to their
# first T2D drug record; the right join keeps every T2D drug row.
# NOTE(review): no `by` is supplied, so the keys are all shared columns
# (person_id, reason, value). This lines up only because both inputs carry the
# same `reason` text and an NA `value` -- confirm that is intended.
rename_drug_status <- right_join(
  rename(t1d_drug_status, start_date_t1d = start_date),
  t2d_drug_status
)

length(unique(rename_drug_status$person_id))
dim(rename_drug_status)
head(rename_drug_status)

# Participants exposed to both drug classes whose first T2D drug date falls at
# least one calendar day before their first T1D drug date.
t2d_before_t1d_drug <- rename_drug_status %>%
  filter(
    !is.na(start_date_t1d),
    as.numeric(difftime(as.Date(start_date_t1d), as.Date(start_date), units = "days")) >= 1
  ) %>%
  mutate(t2d_before_t1d_drug = 1,
         reason = "t2d rx before t1d rx",
         value = NA) %>%
  select(person_id, start_date, reason, value, t2d_before_t1d_drug)

length(unique(t2d_before_t1d_drug$person_id))
dim(t2d_before_t1d_drug)
# head(t2d_before_t1d_drug)

In [None]:
# Stack every piece of algorithm evidence into one long table: the inputs are
# full-joined left to right on their shared columns, so each row keeps the
# flag(s) of the evidence it came from. Missing flags are then set to 0 and
# present ones to 1.
t2d_determination_df <- list(
  t2d_drug_status,
  t1d_drug_status,
  t2d_icd_status,
  t2d_icd2_status,
  t2d_lab_status,
  t1d_icd_status,
  t2d_before_t1d_drug
) %>%
  reduce(full_join) %>%
  arrange(person_id, start_date) %>%
  mutate(across(
    c(t2d_drug, t1d_drug, t2d_icd, t2d_icd_2x, lab_status, t1d_icd, t2d_before_t1d_drug),
    ~ as.numeric(!is.na(.x))
  )) %>%
  distinct()

length(unique(t2d_determination_df$person_id))
dim(t2d_determination_df)
# head(t2d_determination_df, 10)

In [None]:
# Check counts for all parts of the diagnosis algorithm
# Each table counts ROWS of t2d_determination_df (one row per piece of
# evidence), not participants; useNA = "always" adds an NA column so any flag
# that was not recoded to 0/1 would be visible.

# Rows carrying a first T2D drug exposure
print("t2d_drug")
table(t2d_determination_df$t2d_drug, useNA = "always")

# Rows carrying a first T1D drug exposure
print("t1d_drug")
table(t2d_determination_df$t1d_drug, useNA = "always")

# Rows carrying a first T2D diagnosis code
print("t2d_icd")
table(t2d_determination_df$t2d_icd, useNA = "always")

# Rows from participants with 2+ distinct T2D diagnosis datetimes
print("t2d_icd_2x")
table(t2d_determination_df$t2d_icd_2x, useNA = "always")

# Rows carrying a first abnormal lab value
print("lab_status")
table(t2d_determination_df$lab_status, useNA = "always")

# Rows carrying a first T1D diagnosis code
print("t1d_icd")
table(t2d_determination_df$t1d_icd, useNA = "always")

# Rows where the first T2D drug preceded the first T1D drug
print("t2d_before_t1d_drug")
table(t2d_determination_df$t2d_before_t1d_drug, useNA = "always")

In [None]:
# Exclude participants with any T1D diagnosis, then label each remaining
# participant with the first algorithm case they satisfy (NA = not a case).
#
# Two participant-level fixes compared with a row-wise reading of the flags:
#   * Exclusion: every evidence type sits on its own row, so a row filter on
#     `t1d_icd == 0` would only drop the T1D row and keep the participant.
#     `!any(t1d_icd == 1)` removes the whole participant.
#   * Case 1 requires NO T1D drug exposure. `any(t1d_drug == 0)` is TRUE as
#     soon as one non-T1D-drug row exists (e.g. the diagnosis row), so the
#     requirement is expressed as `!any(t1d_drug == 1)`.
#
# Cases (evaluated in order):
#   1: T2D diagnosis + T2D drug, no T1D drug
#   2: T2D diagnosis + T2D drug started before T1D drug
#   3: T2D diagnosis on 2+ occasions + T1D drug
#   4: (T2D drug or T2D diagnosis) + abnormal lab
t2d_cases_df <- t2d_determination_df %>%
  group_by(person_id) %>%
  filter(!any(t1d_icd == 1)) %>%
  mutate(case = case_when(
    any(t2d_icd == 1) & any(t2d_drug == 1) & !any(t1d_drug == 1) ~ 1,
    any(t2d_icd == 1) & any(t2d_before_t1d_drug == 1) ~ 2,
    any(t2d_icd_2x == 1) & any(t1d_drug == 1) ~ 3,
    (any(t2d_drug == 1) | any(t2d_icd == 1)) & any(lab_status == 1) ~ 4
  ))

table(t2d_cases_df$case, useNA = "always")
length(unique(t2d_cases_df$person_id))
dim(t2d_cases_df)
# head(t2d_cases_df, 10)

In [None]:
# One row per participant: their earliest piece of evidence (by start_date),
# along with the case label assigned to them.
t2d_cases_df_counts <- t2d_cases_df %>%
  arrange(person_id, start_date) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  select(person_id, start_date, reason, value, case)

table(t2d_cases_df_counts$case, useNA = "always")
length(unique(t2d_cases_df_counts$person_id))
dim(t2d_cases_df_counts)
# head(t2d_cases_df_counts)

In [None]:
# Keep only participants that received a case label (drop `case` = NA).
t2d_cases_df_final <- filter(t2d_cases_df_counts, complete.cases(case))

table(t2d_cases_df_final$case, useNA = "always")
length(unique(t2d_cases_df_final$person_id))
dim(t2d_cases_df_final)
# head(t2d_cases_df_final)

# Determining End of Follow Up

**Objective**: The purpose of this section is to determine either the event date (date of T2D diagnosis) or the right-censoring date (date of the last fasting glucose, random glucose, or HbA1c measurement, or date of death) for participants in AoU.

In [None]:
# Last diabetes-related measurement per participant: sort by datetime and keep
# the final row of each person's records.
all_diabetes_meas_eof <- all_diabetes_meas %>%
  arrange(person_id, measurement_datetime) %>%
  group_by(person_id) %>%
  slice_tail(n = 1) %>%
  select(person_id, measurement_datetime)

length(unique(all_diabetes_meas_eof$person_id))
dim(all_diabetes_meas_eof)
# head(all_diabetes_meas_eof)

In [None]:
# Death datetime per deceased participant; when several are recorded, the row
# sorted last (latest datetime) is kept.
death_eof <- dataset_95436229_death_df %>%
  arrange(person_id, death_datetime) %>%
  group_by(person_id) %>%
  slice_tail(n = 1) %>%
  select(person_id, death_datetime)

length(unique(death_eof$person_id))
dim(death_eof)
# head(death_eof)

In [None]:
# T2D event date per case: the `start_date` selected in the previous section.
t2d_cases_df_eof <- t2d_cases_df_final %>%
  select(all_of(c("person_id", "start_date")))

length(unique(t2d_cases_df_eof$person_id))
dim(t2d_cases_df_eof)
# head(t2d_cases_df_eof)

In [None]:
# Combine potential EoF dates into one data frame:
#   * participants with no measurement, death or T2D date are dropped
#   * EoF = T2D diagnosis date when available ("Event")
#   * otherwise ("Censor") EoF = death date if it is on/after the last
#     measurement date, else the last measurement; whichever exists if only
#     one of the two is present
#   * EoF age = years between date of birth and EoF, rounded to 1 decimal
#
# `case_when()` is used instead of nested `ifelse()` because `ifelse()` strips
# the Date/POSIXct class and returns bare numbers, which `as.Date()` then
# either rejects or misreads as a day count.
# NOTE(review): `case_when()` needs start_date, death_datetime and
# measurement_datetime to share a compatible type -- confirm all three are
# read in the same way.
eof_df <- all_person %>%
  full_join(all_diabetes_meas_eof) %>%
  full_join(death_eof) %>%
  full_join(t2d_cases_df_eof) %>%
  filter(!(is.na(measurement_datetime) & is.na(death_datetime) & is.na(start_date))) %>%
  mutate(
    eof_datetime = case_when(
      !is.na(start_date) ~ start_date,
      is.na(death_datetime) ~ measurement_datetime,
      is.na(measurement_datetime) ~ death_datetime,
      as.Date(death_datetime) >= as.Date(measurement_datetime) ~ death_datetime,
      TRUE ~ measurement_datetime
    ),
    t2d_status = ifelse(!is.na(start_date), "Event", "Censor"),
    eof_age = round(
      as.numeric(difftime(as.Date(eof_datetime), as.Date(date_of_birth), units = "days")) / 365.2425,
      digits = 1
    )
  ) %>%
  select(person_id, date_of_birth, eof_datetime, eof_age, t2d_status)


length(unique(eof_df$person_id))
dim(eof_df)
# head(eof_df)

In [None]:
# Sanity-check EoF age by status: box plot plus per-group summary statistics
# (missing ages are ignored in the statistics and counted in "NA's").
boxplot(eof_age ~ t2d_status, data = eof_df, col = c("coral", "lightblue"))
eof_df %>%
  group_by(t2d_status) %>%
  summarise(
    Count = n(),
    Min = min(eof_age, na.rm = TRUE),
    `1st Qu.` = quantile(eof_age, probs = 0.25, na.rm = TRUE)[[1]],
    Median = median(eof_age, na.rm = TRUE),
    Mean = mean(eof_age, na.rm = TRUE),
    SD = sd(eof_age, na.rm = TRUE),
    `3rd Qu.` = quantile(eof_age, probs = 0.75, na.rm = TRUE)[[1]],
    Max = max(eof_age, na.rm = TRUE),
    `NA's` = sum(is.na(eof_age))
  )
                                            

# Determining Eligible Statin Users

**Objective**: The purpose of this section is to find statin users in AoU meeting the eligibility criteria for this study. The eligibility criteria are as follows:
- At least one statin prescription
- At least 30 days of follow-up after starting statins (to give enough time for NOD to possibly develop)
- At least 6 months (180 days) of follow-up before starting statins (to verify that the statin initiation date is the first time receiving statins)

The first prescribed statin type and dosage for each eligible statin user was also extracted. Since this study is designed on an intention-to-treat basis, the first prescription was assumed to be carried throughout the rest of the study period. 

In [None]:
# Statin prescriptions for participants that have an EoF date, restricted to
# prescriptions whose start date is strictly before the EoF date.
statin_rx <- eof_df %>%
  inner_join(dataset_97634019_drug_df) %>%
  filter(as.Date(drug_exposure_start_datetime) < as.Date(eof_datetime))

length(unique(statin_rx$person_id))
dim(statin_rx)
# head(statin_rx)

In [None]:
# Helper function to extract statin dose from standard concept name
library(stringr)
# Extract the statin strength (e.g. "10 MG") from a drug concept name.
#
# Args:
#   x: A drug concept name (character). Combination products list their
#      ingredients separated by "/". Only the first element of `x` is used, so
#      call this once per name (e.g. inside a one-row-per-group mutate).
#
# Returns:
#   A character vector with every "<number> MG" token found in the ingredient
#   that contains "statin"; NA_character_ when `x` is empty or NA, when no
#   ingredient contains "statin", or when that ingredient has no MG dose.
#
# Raises:
#   An error when more than one ingredient contains "statin", to protect
#   against ambiguous combination products.
extract_dose <- function(x) {
    if (length(x) == 0) return(NA_character_)

    # Split a combination product into its ingredients; a single-ingredient
    # name (or NA) comes back unchanged as a length-1 vector.
    parts <- strsplit(as.character(x[[1]]), "/", fixed = TRUE)[[1]]

    # Keep only the ingredient(s) that mention a statin.
    statin_part <- parts[grepl("statin", parts, fixed = TRUE)]
    if (length(statin_part) > 1) stop("Whoops; detected more than one statin")
    if (length(statin_part) == 0) return(NA_character_)

    # Pull out every "<digits/dots> MG" token from the statin ingredient.
    dose <- regmatches(
        statin_part,
        gregexpr("[[:digit:]\\.]+ MG", statin_part, perl = TRUE)
    )[[1]]
    if (length(dose) == 0) return(NA_character_)

    dose
}

# extract_dose(statin_rx[[33, "standard_concept_name"]])
# class(statin_rx[[33, "standard_concept_name"]])

In [None]:
# First statin prescription per participant: initiation date, age at
# initiation (years, 1 decimal) and the statin name + dose parsed from the
# standard concept name.
statin_init <- statin_rx %>%
  arrange(person_id, drug_exposure_start_datetime) %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  mutate(
    statin_init_date = drug_exposure_start_datetime,
    statin_init_age = round(
      as.numeric(difftime(as.Date(statin_init_date), as.Date(date_of_birth), units = "days")) / 365.2425,
      digits = 1
    ),
    statin_type_start = str_extract(standard_concept_name, pattern = "[:alpha:]+statin"),
    statin_dose_start = extract_dose(standard_concept_name)
  ) %>%
  select(person_id, statin_init_date, statin_init_age, statin_type_start, statin_dose_start)

length(unique(statin_init$person_id))
dim(statin_init)
# head(statin_init)

In [None]:
# Last statin prescription per participant (row sorted last by exposure end
# datetime): end date, age at end, and statin name + dose. The first-
# prescription columns from `statin_init` are joined on at the end.
# A prescription with no end date, or ending the day it starts, is given an
# end date of start + 1 day. Dates are converted to text inside `ifelse()`
# because `ifelse()` would otherwise drop the Date class.
statin_end <- statin_rx %>%
  arrange(person_id, drug_exposure_end_datetime) %>%
  group_by(person_id) %>%
  slice_tail(n = 1) %>%
  mutate(
    statin_end_date = ifelse(
      as.Date(drug_exposure_start_datetime) == as.Date(drug_exposure_end_datetime) |
        is.na(drug_exposure_end_datetime),
      as.character(as.Date(drug_exposure_start_datetime) + 1),
      as.character(as.Date(drug_exposure_end_datetime))
    ),
    statin_end_age = round(
      as.numeric(difftime(as.Date(statin_end_date), as.Date(date_of_birth), units = "days")) / 365.2425,
      digits = 1
    ),
    statin_type_end = str_extract(standard_concept_name, pattern = "[:alpha:]+statin"),
    statin_dose_end = extract_dose(standard_concept_name)
  ) %>%
  select(person_id, statin_end_date, statin_end_age, statin_type_end, statin_dose_end) %>%
  full_join(statin_init)

length(unique(statin_end$person_id))
dim(statin_end)
# head(statin_end)

In [None]:
# Add statin init/end age, date, type, and dose to the statin rx df, then per
# participant (prescriptions ordered by start date):
#     Assign Rx IDs ("RX1", "RX2", ...)
#     Add 1 day to Rx which only have start dates (or end the day they start)
#     Calculate start + end ages (years, 1 decimal)
#     Calculate days supply (end date - start date, in days)
# `row_number()` replaces `1:length(person_id)`, which yields c(1, 0) on an
# empty slice and would break the mutate when there are no prescriptions.
statin_rx_df <- full_join(statin_rx, statin_end) %>%
  group_by(person_id) %>%
  arrange(person_id, drug_exposure_start_datetime) %>%
  mutate(
    # Dates are turned into text because `ifelse()` drops the Date class.
    new_rx_end = ifelse(
      as.Date(drug_exposure_start_datetime) == as.Date(drug_exposure_end_datetime) |
        is.na(drug_exposure_end_datetime),
      as.character(as.Date(drug_exposure_start_datetime) + 1),
      as.character(as.Date(drug_exposure_end_datetime))
    )
  ) %>%
  mutate(
    rx_start_age = round(
      as.numeric(difftime(as.Date(drug_exposure_start_datetime), as.Date(date_of_birth), units = "days")) / 365.2425,
      digits = 1
    ),
    rx_end_age = round(
      as.numeric(difftime(as.Date(new_rx_end), as.Date(date_of_birth), units = "days")) / 365.2425,
      digits = 1
    ),
    rx_days_supply = round(
      as.numeric(difftime(as.Date(new_rx_end), as.Date(drug_exposure_start_datetime), units = "days")),
      digits = 1
    ),
    rx_id = paste0("RX", row_number())
  ) %>%
  select(person_id, date_of_birth, eof_datetime, eof_age, t2d_status, statin_init_date,
         statin_init_age, statin_type_start, statin_dose_start, rx_id,
         drug_exposure_start_datetime, rx_start_age, new_rx_end, rx_end_age, statin_end_date,
         statin_end_age, statin_type_end, statin_dose_end, rx_days_supply) %>%
  rename(rx_start_date = drug_exposure_start_datetime,
         rx_end_date = new_rx_end)


length(unique(statin_rx_df$person_id))
dim(statin_rx_df)
# head(statin_rx_df)

In [None]:
# Helper function to calculate total time while considering overlaps - Code provided by lab member
#
# Merges overlapping prescription intervals and sums the length (in days) of
# the merged intervals, so overlapping supply is not counted twice.
#
# Args:
#   data:      Data frame with one row per prescription.
#   start_col: Name of the interval start column (default "rx_start_date").
#   end_col:   Name of the interval end column (default "new_rx_end_date").
#              NOTE(review): `statin_rx_df` built in this notebook names its
#              end column `rx_end_date`; pass `end_col = "rx_end_date"` there.
#
# Returns:
#   Total covered time in days (numeric); 0 when `data` has no rows. Rows whose
#   start is after their end are ignored.
#
# Raises:
#   An error when either column is missing from `data`.
calculate_total_time <- function(data, start_col = "rx_start_date",
                                 end_col = "new_rx_end_date") {
    stopifnot(all(c(start_col, end_col) %in% names(data)))

    # Nothing to accumulate (also avoids iterating over 1:0).
    if (nrow(data) == 0) return(0)

    # Sort intervals by start date
    ord <- order(data[[start_col]])
    starts <- data[[start_col]][ord]
    ends <- data[[end_col]][ord]

    # Running total plus the bounds of the interval currently being merged
    total_days <- 0
    run_start <- NULL
    run_end <- NULL

    for (i in seq_along(starts)) {
        cur_start <- starts[i]
        cur_end <- ends[i]

        if (cur_start <= cur_end) {
            if (is.null(run_start)) {
                # First valid interval opens the first run
                run_start <- cur_start
                run_end <- cur_end
            } else if (cur_start > run_end) {
                # Gap: close the current run and start a new one
                total_days <- total_days +
                    as.numeric(difftime(run_end, run_start, units = "days"))
                run_start <- cur_start
                run_end <- cur_end
            } else if (cur_end > run_end) {
                # Overlap that extends the current run
                run_end <- cur_end
            }
        }
    }

    # Close the final open run
    if (!is.null(run_start)) {
        total_days <- total_days +
            as.numeric(difftime(run_end, run_start, units = "days"))
    }

    total_days
}

In [None]:
# Generate observation period table for eligible statin user cohort
library(allofus)

con <- aou_connect()

# Build the observation-period table for the `statin_rx_df` cohort, collect it
# into memory, and store person_id as integer.
su_observation_period_tbl <- aou_observation_period(cohort = statin_rx_df, collect = TRUE) %>%
  mutate(person_id = as.integer(person_id))

dim(su_observation_period_tbl)
# head(su_observation_period_tbl)

In [None]:
# Flag each statin user on the two eligibility conditions:
#   enough_followup     = 1 when EoF is >= 30 days after statin initiation
#   enough_pre_followup = 1 when statin initiation is > 180 days after the
#                         start of the observation period
# NOTE(review): the section text says "at least" 180 days, but the comparison
# here is strictly greater than 180 -- confirm which is intended.
statin_rx_df_c <- statin_rx_df %>%
  full_join(su_observation_period_tbl) %>%
  group_by(person_id) %>%
  mutate(
    follow_up_time_1 = as.numeric(
      difftime(as.Date(eof_datetime), as.Date(statin_init_date), units = 'days')
    ),
    enough_followup = as.numeric(follow_up_time_1 >= 30),
    pre_followup = as.numeric(
      difftime(as.Date(statin_init_date), observation_period_start_date, units = "days")
    ),
    enough_pre_followup = as.numeric(pre_followup > 180)
  )

# Intention-to-treat cohort: prescriptions of users meeting both conditions.
statin_rx_df_itt <- filter(statin_rx_df_c, enough_followup == 1, enough_pre_followup == 1)

length(unique(statin_rx_df_itt$person_id))
dim(statin_rx_df_itt)
# head(statin_rx_df_itt)

In [None]:
# Drop the prescription-level and derived follow-up columns, then de-duplicate
# so the remaining participant-level rows are distinct.
statin_rx_df_itt_distinct <- statin_rx_df_itt %>%
  select(-all_of(c(
    'rx_id', 'rx_start_date', 'rx_start_age',
    'rx_end_date', 'rx_end_age', 'rx_days_supply',
    'observation_period_start_date', 'observation_period_end_date',
    'follow_up_time_1', 'pre_followup', 'enough_pre_followup'
  ))) %>%
  distinct()

length(unique(statin_rx_df_itt_distinct$person_id))
dim(statin_rx_df_itt_distinct)
# head(statin_rx_df_itt_distinct)

In [None]:
# Write the eligible statin user tables to local CSVs, then copy each one into
# the `sid_pheno_files/` folder of the workspace bucket.
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Distinct statin users
write.csv(statin_rx_df_itt_distinct, file = "statin_rx_df_v2_distinct.csv")
system(sprintf("gsutil cp ./%s %s/sid_pheno_files/", "statin_rx_df_v2_distinct.csv", my_bucket),
       intern = TRUE)

# Prescription-level rows for the same users
write.csv(statin_rx_df_itt, file = "statin_rx_df_v2_rx.csv")
system(sprintf("gsutil cp ./%s %s/sid_pheno_files/", "statin_rx_df_v2_rx.csv", my_bucket),
       intern = TRUE)

# Determining Eligible Non-users

**Objective**: The purpose of this section is to find participants in AoU who have either never taken statins or have started statins with less than 30 days of follow-up afterward. 

In [None]:
# Find eligible non-users from EoF df: participants with no statin
# prescription before EoF, or with fewer than 30 days of follow-up after
# starting statins.
#
# The follow-up flag is looked up in `statin_rx_df_c` (ALL statin users) rather
# than in the already-filtered eligible-user table. Joining to the filtered
# table makes `enough_followup == 0` impossible to match and silently counts
# statin users who only failed the pre-follow-up criterion as non-users.
non_users_eof_df <- eof_df %>%
  left_join(
    statin_rx_df_c %>%
      ungroup() %>%
      distinct(person_id, enough_followup),
    by = "person_id"
  ) %>%
  # NA flag = no statin record at all; 0 = started statins < 30 days before EoF
  filter(is.na(enough_followup) | enough_followup == 0) %>%
  select(person_id, date_of_birth, eof_datetime, eof_age, t2d_status)


length(unique(non_users_eof_df$person_id))
dim(non_users_eof_df)
# head(non_users_eof_df)

In [None]:
# Count event status in non-users vs. statin users
# Tabulate t2d_status ("Event"/"Censor") in each cohort; useNA = "always" adds
# an NA cell so missing statuses would be visible.
nu_table <- table(non_users_eof_df$t2d_status, useNA = "always")
su_table <- table(statin_rx_df_itt_distinct$t2d_status, useNA = "always")

# Counts followed by proportions for non-users
print("Non-users")
nu_table
prop.table(nu_table)

# Counts followed by proportions for eligible statin users
print("Statin Users")
su_table
prop.table(su_table)

In [None]:
# Write the eligible non-user table to a local CSV, then copy it into the
# `sid_pheno_files/` folder of the workspace bucket.
write.csv(non_users_eof_df, file = "non_users_eof_df_v2.csv")
system(sprintf("gsutil cp ./%s %s/sid_pheno_files/", "non_users_eof_df_v2.csv", my_bucket),
       intern = TRUE)