In [None]:
### Script that applies the following qc metrics to AoU labs
### ****Uses similar workflow as https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-020-00820-8
###
### - Filter for INPATIENT/OUTPATIENT labs (removes EMERGENCY and NULL)
### - Remove NAs, NULLs, infinite, non-numeric, <0 when inappropriate
### - Remove duplicates
### - Remove outliers (>4 SDs)
###
### Later
### - Calculate min, median, max
### - Inverse normal quantile transformation (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8643141/)

In [None]:
# Load "config.R" for utility functions.
# Sourcing it will also trigger loading of:
#
#   - user_config.JSON (including key for project_config)
#   - project_config.JSON
#   - preprocessing_visualizations.R
#   - preprocessing_functions.R
#
# NOTE(review): `user` is presumably read by the config script to pick the
# matching entry in user_config.JSON — confirm. Later cells use `data_path`,
# which is not defined in this notebook and is presumably set by the config.
# NOTE(review): the comment above says "config.R" but the file sourced is
# "config.r"; on a case-sensitive filesystem these differ — confirm the name.

user <- "Jan" 
source("config.r")



#If certain packages not installed yet via requirements.txt, install them here via
# install.packages("package_name")

# Setup

## Extract pt demographics

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "All participants demographics" for domain "person" and was generated for All of Us Controlled Tier Dataset v8
# It returns, for every row of `person`: person_id, birth_datetime (aliased to
# date_of_birth) and the gender / race / ethnicity / sex-at-birth concept IDs,
# each paired with its concept_name through a LEFT JOIN on `concept` (so a row
# is kept even when a concept ID has no match; the name is then NULL).
# The unqualified table names are resolved by whatever dataset the query is
# run against. paste() with one argument and sep="" returns the text unchanged.
dataset_74259454_person_sql <- paste("
    SELECT
        person.person_id,
        person.gender_concept_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        person.race_concept_id,
        p_race_concept.concept_name as race,
        person.ethnicity_concept_id,
        p_ethnicity_concept.concept_name as ethnicity,
        person.sex_at_birth_concept_id,
        p_sex_at_birth_concept.concept_name as sex_at_birth 
    FROM
        `person` person 
    LEFT JOIN
        `concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id", sep="")

# Build the Cloud Storage destination (a wildcard URI) for the BigQuery export.
# NOTE: The path embeds today's date, so exporting several times on one day
#       overwrites the earlier copy, while exports on different days go to
#       separate folders (historical copies survive dataset-definition changes).
person_74259454_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "person_74259454",
  "person_74259454_*.csv"
)
message(
  str_glue(
    'The data will be written to {person_74259454_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_74259454_person_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = person_74259454_path,
  destination_format = "CSV"
)



## Read in pt demographics

In [None]:
library(tidyverse)
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {person_74259454_path}` to copy these files
#       to the Jupyter disk.

#' Read a (possibly sharded) BigQuery CSV export from Cloud Storage.
#'
#' Lists the objects matching `export_path` with `gsutil ls`, streams each one
#' through `gsutil cat` into `read_csv()`, and row-binds the resulting chunks.
#'
#' @param export_path Cloud Storage path or wildcard pattern of the CSV files.
#' @return A tibble containing the rows of every matching CSV file.
#' Stops with the captured `gsutil` output if the listing fails or is empty.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Force the text columns to character so every chunk gets the same types.
  col_types <- cols(
    gender = col_character(),
    race = col_character(),
    ethnicity = col_character(),
    sex_at_birth = col_character()
  )

  # stderr is captured so that a failure can be reported below; without the
  # status check the error text would be treated as a list of CSV paths.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop(
      "`gsutil ls ", export_path, "` failed with status ", status, ":\n",
      paste(listing, collapse = "\n"),
      call. = FALSE
    )
  }
  csv_files <- as.character(listing)
  if (length(csv_files) == 0) {
    stop("No files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path so unusual characters cannot break the command.
      read_csv(
        pipe(paste("gsutil cat", shQuote(csv))),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}

#person_74259454_path <- "gs://fc-secure-14ec18bd-6438-4be3-b1ec-fa12149aa148/bq_exports/davidz1@researchallofus.org/20231021/person_74259454/person_74259454_*.csv"

dataset_74259454_person_df <- read_bq_export_from_workspace_bucket(person_74259454_path)

dim(dataset_74259454_person_df)

head(dataset_74259454_person_df, 5)

In [None]:
# Save each participant's race and ethnicity labels as a tab-separated table.
# TRUE/FALSE are spelled out because the T/F aliases can be reassigned.
race_ethnicity <- dataset_74259454_person_df %>%
  select(person_id, race, ethnicity)
write.table(
  race_ethnicity,
  file = paste0(data_path, "/dataframes/AllofUs_v8_race_ethnicity.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


In [None]:
# Keep only the participant ID and birth timestamp columns.
DOBs <- select(dataset_74259454_person_df, person_id, date_of_birth)

dim(DOBs)
head(DOBs, n = 5)

# Creatinine

## Extract pt creatinine

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Creatinine" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# Structure of the query:
#   - Inner SELECT: rows of `measurement` whose measurement_concept_id is a
#     standard, selectable `cb_criteria` concept whose `path` contains the id
#     of the criteria row(s) for concept 40775801 (presumably the creatinine
#     concept, given the dataset name — confirm), further restricted to
#     persons having at least one `cb_search_all_events` entry for those
#     same concepts.
#   - Outer LEFT JOINs: attach concept names for the standard concept, type,
#     operator, value, unit and source concept, plus the visit concept name
#     reached through `visit_occurrence`.
# paste() with one argument and sep="" returns the text unchanged.
dataset_64357831_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    40775801
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (
                                                SELECT
                                                    DISTINCT c.concept_id 
                                                FROM
                                                    `cb_criteria` c 
                                                JOIN
                                                    (
                                                        select
                                                            cast(cr.id as string) as id 
                                                        FROM
                                                            `cb_criteria` cr 
                                                        WHERE
                                                            concept_id IN (40775801) 
                                                            AND full_text LIKE '%_rank1]%'
                                                    ) a 
                                                        ON (
                                                            c.path LIKE CONCAT('%.',
                                                        a.id,
                                                        '.%') 
                                                        OR c.path LIKE CONCAT('%.',
                                                        a.id) 
                                                        OR c.path LIKE CONCAT(a.id,
                                                        '.%') 
                                                        OR c.path = a.id) 
                                                    WHERE
                                                        is_standard = 1 
                                                        AND is_selectable = 1
                                                    ) 
                                                    AND is_standard = 1 
                                            )
                                        ) criteria 
                                    ) ))
                        ) measurement 
                    LEFT JOIN
                        `concept` m_standard_concept 
                            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
                    LEFT JOIN
                        `concept` m_type 
                            ON measurement.measurement_type_concept_id = m_type.concept_id 
                    LEFT JOIN
                        `concept` m_operator 
                            ON measurement.operator_concept_id = m_operator.concept_id 
                    LEFT JOIN
                        `concept` m_value 
                            ON measurement.value_as_concept_id = m_value.concept_id 
                    LEFT JOIN
                        `concept` m_unit 
                            ON measurement.unit_concept_id = m_unit.concept_id 
                    LEFT JOIn
                        `visit_occurrence` v 
                            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
                    LEFT JOIN
                        `concept` m_visit 
                            ON v.visit_concept_id = m_visit.concept_id 
                    LEFT JOIN
                        `concept` m_source_concept 
                            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination (a wildcard URI) for the BigQuery export.
# NOTE: The path embeds today's date, so exporting several times on one day
#       overwrites the earlier copy, while exports on different days go to
#       separate folders (historical copies survive dataset-definition changes).
measurement_64357831_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_64357831",
  "measurement_64357831_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_64357831_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_64357831_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_64357831_path,
  destination_format = "CSV"
)



In [None]:
library(tidyverse)
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_64357831_path}` to copy these files
#       to the Jupyter disk.

#' Read a (possibly sharded) BigQuery measurement CSV export from Cloud Storage.
#'
#' Lists the objects matching `export_path` with `gsutil ls`, streams each one
#' through `gsutil cat` into `read_csv()`, and row-binds the resulting chunks.
#' This redefinition replaces the earlier function of the same name; it differs
#' only in the columns that are forced to character.
#'
#' @param export_path Cloud Storage path or wildcard pattern of the CSV files.
#' @return A tibble containing the rows of every matching CSV file.
#' Stops with the captured `gsutil` output if the listing fails or is empty.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Force the text columns to character so every chunk gets the same types.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr is captured so that a failure can be reported below; without the
  # status check the error text would be treated as a list of CSV paths.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop(
      "`gsutil ls ", export_path, "` failed with status ", status, ":\n",
      paste(listing, collapse = "\n"),
      call. = FALSE
    )
  }
  csv_files <- as.character(listing)
  if (length(csv_files) == 0) {
    stop("No files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path so unusual characters cannot break the command.
      read_csv(
        pipe(paste("gsutil cat", shQuote(csv))),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
#measurement_64357831_path <- "gs://fc-secure-14ec18bd-6438-4be3-b1ec-fa12149aa148/bq_exports/davidz1@researchallofus.org/20231021/measurement_64357831/measurement_64357831_*.csv"
dataset_64357831_measurement_df <- read_bq_export_from_workspace_bucket(measurement_64357831_path)

dim(dataset_64357831_measurement_df)

head(dataset_64357831_measurement_df, 5)

## Process pt creatinine

In [None]:
# Creatinine: use the exported measurement table as the working `labs` data,
# then show its column names and the number of distinct participants.
labs <- dataset_64357831_measurement_df
names(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))


In [None]:
# When TRUE, the global 4-SD outlier filter at the end of the pipeline is skipped.
keep_outliers <- FALSE

# Apply the QC steps listed at the top of the file to the creatinine labs:
#   1. drop labs whose visit name mentions Emergency, Intensive or Urgent
#   2. keep a single unit (milligram per deciliter)
#   3. drop missing, "NULL" and non-finite entries, then exact duplicate rows
#   4. per person, keep only values below 10x that person's median
#   5. unless keep_outliers, drop values more than 4 SDs from the overall mean
# NOTE(review): grepl() returns FALSE for NA, so labs with a missing visit
# name pass step 1, although the file header says NULL visits are removed —
# confirm which behaviour is intended.
labs_formatted <- labs %>%
  filter(
    !grepl("Emergency", visit_occurrence_concept_name) &
      !grepl("Intensive", visit_occurrence_concept_name) &
      !grepl("Urgent", visit_occurrence_concept_name)
  ) %>%
  filter(unit_concept_name == "milligram per deciliter") %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
  na.omit() %>%
  # na.omit() leaves +/-Inf in place; a single infinite value would turn the
  # overall mean/SD below into Inf/NaN and wipe out every row.
  filter(is.finite(value_as_number)) %>%
  distinct() %>%
  group_by(person_id) %>%
  filter(value_as_number < (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Both bounds use the same mean and SD. Two chained filters would compute
  # the upper bound from data that had already lost its lower tail.
  filter(
    keep_outliers |
      abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number)
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Minimum, median and maximum of the QC'd creatinine values.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format creatinine table (tab-separated, unquoted, no row names).
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Creatinine_wout_outliers_long_102123.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# eGFR

## Extract pt eGFR

In [None]:
# library(tidyverse)
# library(bigrquery)

# # This query represents dataset "Pts with eGFR" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# dataset_05954982_measurement_sql <- paste("
#     SELECT
#         measurement.person_id,
#         measurement.measurement_concept_id,
#         m_standard_concept.concept_name as standard_concept_name,
#         m_standard_concept.concept_code as standard_concept_code,
#         m_standard_concept.vocabulary_id as standard_vocabulary,
#         measurement.measurement_datetime,
#         measurement.measurement_type_concept_id,
#         m_type.concept_name as measurement_type_concept_name,
#         measurement.operator_concept_id,
#         m_operator.concept_name as operator_concept_name,
#         measurement.value_as_number,
#         measurement.value_as_concept_id,
#         m_value.concept_name as value_as_concept_name,
#         measurement.unit_concept_id,
#         m_unit.concept_name as unit_concept_name,
#         measurement.range_low,
#         measurement.range_high,
#         measurement.visit_occurrence_id,
#         m_visit.concept_name as visit_occurrence_concept_name,
#         measurement.measurement_source_value,
#         measurement.measurement_source_concept_id,
#         m_source_concept.concept_name as source_concept_name,
#         m_source_concept.concept_code as source_concept_code,
#         m_source_concept.vocabulary_id as source_vocabulary,
#         measurement.unit_source_value,
#         measurement.value_source_value 
#     FROM
#         ( SELECT
#             * 
#         FROM
#             `measurement` measurement 
#         WHERE
#             (
#                 measurement_concept_id IN  (
#                     SELECT
#                         DISTINCT c.concept_id 
#                     FROM
#                         `cb_criteria` c 
#                     JOIN
#                         (
#                             select
#                                 cast(cr.id as string) as id 
#                             FROM
#                                 `cb_criteria` cr 
#                             WHERE
#                                 concept_id IN (
#                                     1029770
#                                 ) 
#                                 AND full_text LIKE '%_rank1]%'
#                         ) a 
#                             ON (
#                                 c.path LIKE CONCAT('%.',
#                             a.id,
#                             '.%') 
#                             OR c.path LIKE CONCAT('%.',
#                             a.id) 
#                             OR c.path LIKE CONCAT(a.id,
#                             '.%') 
#                             OR c.path = a.id) 
#                         WHERE
#                             is_standard = 1 
#                             AND is_selectable = 1
#                         )
#                 )  
#                 AND (
#                     measurement.PERSON_ID IN (
#                         SELECT
#                             distinct person_id  
#                         FROM
#                             `cb_search_person` cb_search_person  
#                         WHERE
#                             cb_search_person.person_id IN (
#                                 SELECT
#                                     criteria.person_id 
#                                 FROM
#                                     (SELECT
#                                         DISTINCT person_id,
#                                         entry_date,
#                                         concept_id 
#                                     FROM
#                                         `cb_search_all_events` 
#                                     WHERE
#                                         (
#                                             concept_id IN (
#                                                 SELECT
#                                                     DISTINCT c.concept_id 
#                                                 FROM
#                                                     `cb_criteria` c 
#                                                 JOIN
#                                                     (
#                                                         select
#                                                             cast(cr.id as string) as id 
#                                                         FROM
#                                                             `cb_criteria` cr 
#                                                         WHERE
#                                                             concept_id IN (1029770) 
#                                                             AND full_text LIKE '%_rank1]%'
#                                                     ) a 
#                                                         ON (
#                                                             c.path LIKE CONCAT('%.',
#                                                         a.id,
#                                                         '.%') 
#                                                         OR c.path LIKE CONCAT('%.',
#                                                         a.id) 
#                                                         OR c.path LIKE CONCAT(a.id,
#                                                         '.%') 
#                                                         OR c.path = a.id) 
#                                                     WHERE
#                                                         is_standard = 1 
#                                                         AND is_selectable = 1
#                                                     ) 
#                                                     AND is_standard = 1 
#                                             )
#                                         ) criteria 
#                                     ) ))
#                         ) measurement 
#                     LEFT JOIN
#                         `concept` m_standard_concept 
#                             ON measurement.measurement_concept_id = m_standard_concept.concept_id 
#                     LEFT JOIN
#                         `concept` m_type 
#                             ON measurement.measurement_type_concept_id = m_type.concept_id 
#                     LEFT JOIN
#                         `concept` m_operator 
#                             ON measurement.operator_concept_id = m_operator.concept_id 
#                     LEFT JOIN
#                         `concept` m_value 
#                             ON measurement.value_as_concept_id = m_value.concept_id 
#                     LEFT JOIN
#                         `concept` m_unit 
#                             ON measurement.unit_concept_id = m_unit.concept_id 
#                     LEFT JOIn
#                         `visit_occurrence` v 
#                             ON measurement.visit_occurrence_id = v.visit_occurrence_id 
#                     LEFT JOIN
#                         `concept` m_visit 
#                             ON v.visit_concept_id = m_visit.concept_id 
#                     LEFT JOIN
#                         `concept` m_source_concept 
#                             ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# # Formulate a Cloud Storage destination path for the data exported from BigQuery.
# # NOTE: By default data exported multiple times on the same day will overwrite older copies.
# #       But data exported on a different days will write to a new location so that historical
# #       copies can be kept as the dataset definition is changed.
# measurement_05954982_path <- file.path(
#   Sys.getenv("WORKSPACE_BUCKET"),
#   "bq_exports",
#   Sys.getenv("OWNER_EMAIL"),
#   strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
#   "measurement_05954982",
#   "measurement_05954982_*.csv")
# message(str_glue('The data will be written to {measurement_05954982_path}. Use this path when reading ',
#                  'the data into your notebooks in the future.'))

# # Perform the query and export the dataset to Cloud Storage as CSV files.
# # NOTE: You only need to run `bq_table_save` once. After that, you can
# #       just read data from the CSVs in Cloud Storage.
# bq_table_save(
#   bq_dataset_query(Sys.getenv("WORKSPACE_CDR"), dataset_05954982_measurement_sql, billing = Sys.getenv("GOOGLE_PROJECT")),
#   measurement_05954982_path,
#   destination_format = "CSV")



In [None]:
# # Read the data directly from Cloud Storage into memory.
# # NOTE: Alternatively you can `gsutil -m cp {measurement_05954982_path}` to copy these files
# #       to the Jupyter disk.
# read_bq_export_from_workspace_bucket <- function(export_path) {
#   col_types <- cols(standard_concept_name = col_character(), standard_concept_code = col_character(), standard_vocabulary = col_character(), measurement_type_concept_name = col_character(), operator_concept_name = col_character(), value_as_concept_name = col_character(), unit_concept_name = col_character(), visit_occurrence_concept_name = col_character(), measurement_source_value = col_character(), source_concept_name = col_character(), source_concept_code = col_character(), source_vocabulary = col_character(), unit_source_value = col_character(), value_source_value = col_character())
#   bind_rows(
#     map(system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE),
#         function(csv) {
#           message(str_glue('Loading {csv}.'))
#           chunk <- read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
#           if (is.null(col_types)) {
#             col_types <- spec(chunk)
#           }
#           chunk
#         }))
# }
# #measurement_05954982_path <- "gs://fc-secure-14ec18bd-6438-4be3-b1ec-fa12149aa148/bq_exports/davidz1@researchallofus.org/20231021/measurement_05954982/measurement_05954982_*.csv"
# dataset_05954982_measurement_df <- read_bq_export_from_workspace_bucket(measurement_05954982_path)

# dim(dataset_05954982_measurement_df)

# head(dataset_05954982_measurement_df, 5)

## Process pt eGFR

In [None]:
# # eGFR
# labs <- dataset_05954982_measurement_df
# colnames(labs)
# length(unique(labs$person_id))

# #as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)

# #new_labs <- labs %>%
# #    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
# #          !grepl("Urgent", visit_occurrence_concept_name))

# #as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# new_labs <- labs %>%
#     filter(unit_concept_name == "milliliter per minute per 1.73 square meter" |
#            unit_concept_name == "No matching concept" |
#            unit_concept_name == "milliliter per minute")
# length(unique(new_labs$person_id))
# min(new_labs$value_as_number, na.rm=T)
# max(new_labs$value_as_number, na.rm=T)


In [None]:
# keep_outliers = FALSE

# # Do all the filtering specified at the top of the file
# labs_formatted <- labs %>%
#     filter(!grepl("Emergency", visit_occurrence_concept_name) & 
#            !grepl("Intensive", visit_occurrence_concept_name) &
#            !grepl("Urgent", visit_occurrence_concept_name)) %>%
#     filter(unit_concept_name == "milliliter per minute per 1.73 square meter" |
#            unit_concept_name == "No matching concept" |
#            unit_concept_name == "milliliter per minute") %>%
#     select(person_id, measurement_datetime, value_as_number) %>%
#     mutate_at(c('person_id'), as.character) %>%
#     mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
#     na.omit() %>%
#     distinct() %>%
#     group_by(person_id) %>%
#     filter(value_as_number < (10 * median(value_as_number))) %>%
#     ungroup() %>%
#     filter(keep_outliers | value_as_number >= mean(value_as_number) - (4 * sd(value_as_number))) %>%
#     filter(keep_outliers | value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))) 

# dim(labs_formatted)
# head(labs_formatted, 5)


In [None]:
# min(labs_formatted$value_as_number)
# median(labs_formatted$value_as_number)
# max(labs_formatted$value_as_number)


In [None]:
# write.table(labs_formatted, 
#     file=paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-eGFR_wout_outliers_long_102123.txt"), 
#     row.names=FALSE, quote=FALSE, sep="\t")


# Glucose

## Extract pt glucose

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Glucose" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
dataset_83179845_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    37065054
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (
                                                SELECT
                                                    DISTINCT c.concept_id 
                                                FROM
                                                    `cb_criteria` c 
                                                JOIN
                                                    (
                                                        select
                                                            cast(cr.id as string) as id 
                                                        FROM
                                                            `cb_criteria` cr 
                                                        WHERE
                                                            concept_id IN (37065054) 
                                                            AND full_text LIKE '%_rank1]%'
                                                    ) a 
                                                        ON (
                                                            c.path LIKE CONCAT('%.',
                                                        a.id,
                                                        '.%') 
                                                        OR c.path LIKE CONCAT('%.',
                                                        a.id) 
                                                        OR c.path LIKE CONCAT(a.id,
                                                        '.%') 
                                                        OR c.path = a.id) 
                                                    WHERE
                                                        is_standard = 1 
                                                        AND is_selectable = 1
                                                    ) 
                                                    AND is_standard = 1 
                                            )
                                        ) criteria 
                                    ) ))
                        ) measurement 
                    LEFT JOIN
                        `concept` m_standard_concept 
                            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
                    LEFT JOIN
                        `concept` m_type 
                            ON measurement.measurement_type_concept_id = m_type.concept_id 
                    LEFT JOIN
                        `concept` m_operator 
                            ON measurement.operator_concept_id = m_operator.concept_id 
                    LEFT JOIN
                        `concept` m_value 
                            ON measurement.value_as_concept_id = m_value.concept_id 
                    LEFT JOIN
                        `concept` m_unit 
                            ON measurement.unit_concept_id = m_unit.concept_id 
                    LEFT JOIn
                        `visit_occurrence` v 
                            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
                    LEFT JOIN
                        `concept` m_visit 
                            ON v.visit_concept_id = m_visit.concept_id 
                    LEFT JOIN
                        `concept` m_source_concept 
                            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export.
# NOTE: The date component means that exports made on the same day overwrite
#       each other, while exports made on different days land in separate
#       folders, so older copies survive changes to the dataset definition.
measurement_83179845_path <- Sys.getenv("WORKSPACE_BUCKET") %>%
  file.path(
    "bq_exports",
    Sys.getenv("OWNER_EMAIL"),
    strftime(lubridate::now(), "%Y%m%d"),  # Drop this component to always overwrite the same location.
    "measurement_83179845",
    "measurement_83179845_*.csv"
  )
message(
  str_glue(
    'The data will be written to {measurement_83179845_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV shards.
# NOTE: This only needs to run once; afterwards the CSVs can be read straight
#       from Cloud Storage.
Sys.getenv("WORKSPACE_CDR") %>%
  bq_dataset_query(
    dataset_83179845_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ) %>%
  bq_table_save(
    measurement_83179845_path,
    destination_format = "CSV"
  )



In [None]:
library(tidyverse)
library(bigrquery)

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_83179845_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage and row-bind
# the shards into a single tibble.
#
# Args:
#   export_path: gs:// URI of the exported shards; may contain a `*` wildcard,
#     which is handed to `gsutil ls` for expansion.
# Returns:
#   A tibble containing the rows of all listed shards, in listing order.
# Errors:
#   Stops if `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Pin the free-text columns to character so that every shard is parsed with
  # the same column types and the shards can be row-bound safely.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr = TRUE any gsutil diagnostic line would
  # be returned alongside the object names and then "loaded" as a CSV path.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  exit_status <- attr(shard_uris, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls` failed for ", export_path,
         " (exit status ", exit_status, ").", call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because the command string is interpreted by a shell.
      read_csv(pipe(paste("gsutil cat", shQuote(csv))),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
# To load an earlier export instead of the path built above, uncomment the
# next line and point it at that export:
#measurement_83179845_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20231029/measurement_83179845/measurement_83179845_*.csv"
dataset_83179845_measurement_df <- measurement_83179845_path %>%
  read_bq_export_from_workspace_bucket()

# Show the table dimensions, then preview the first five rows.
dataset_83179845_measurement_df %>% dim()

dataset_83179845_measurement_df %>% head(n = 5)

## Process pt glucose

In [None]:
# Glucose: quick look at the raw export before any filtering.
labs <- dataset_83179845_measurement_df
colnames(labs)
# Number of distinct participants with at least one row in the export.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)

# Largest raw value across all units (spelled-out TRUE: `T` can be reassigned).
max(labs$value_as_number, na.rm = TRUE)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the global 4 SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Apply the QC steps listed at the top of the file to the glucose labs:
#   1. drop rows whose visit concept name mentions Emergency, Intensive or
#      Urgent (rows with a missing visit name are kept, because grepl()
#      returns FALSE for NA);
#   2. keep only values reported in mg/dL;
#   3. drop missing ("NULL"/NA) and non-finite values, then exact duplicates;
#   4. per person, drop values at or above 10x that person's median;
#   5. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
# NOTE(review): negative values are not removed here, unlike in the ALT
# pipeline -- confirm whether that is intended for glucose.
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(unit_concept_name == "milligram per deciliter") %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~ na_if(.x, "NULL"))) %>%
    drop_na() %>%
    # An infinite value would make mean() infinite and sd() NaN below, which
    # turns every comparison into NA and silently empties the table.
    filter(is.finite(value_as_number)) %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both tails are tested in ONE filter so that the lower and upper bounds
    # use the same mean and SD; two chained filters would recompute them for
    # the upper bound after the lower tail had already been removed.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Range check on the cleaned glucose values: minimum, median, maximum.
with(labs_formatted, min(value_as_number))
with(labs_formatted, median(value_as_number))
with(labs_formatted, max(value_as_number))


In [None]:
# Save the cleaned long-format glucose table as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Glucose_wout_outliers_long_102923.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)

# A1C

## Extract pt A1C

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with A1c" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   * Inner sub-select: keeps `measurement` rows whose measurement_concept_id
#     is a standard, selectable `cb_criteria` concept located at or beneath
#     (by `path` matching) the criteria entries for concept IDs 3004410 and
#     3005673 (presumably the HbA1c concepts -- confirm against the OMOP
#     vocabulary), and whose person has at least one standard event for those
#     two concept IDs in `cb_search_all_events`.
#   * Outer select: LEFT JOINs `concept` to attach readable names for the
#     standard concept, measurement type, operator, value, unit and source
#     concept, and goes through `visit_occurrence` to attach the visit
#     concept name (exposed as visit_occurrence_concept_name).
# Table names are unqualified; they are presumably resolved against the
# dataset handed to bq_dataset_query() in the export step -- confirm.
# paste() with a single string and sep="" returns that string unchanged.
dataset_74098089_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3004410, 3005673
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3005673, 3004410) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export.
# NOTE: The date component means that exports made on the same day overwrite
#       each other, while exports made on different days land in separate
#       folders, so older copies survive changes to the dataset definition.
measurement_74098089_path <- Sys.getenv("WORKSPACE_BUCKET") %>%
  file.path(
    "bq_exports",
    Sys.getenv("OWNER_EMAIL"),
    strftime(lubridate::now(), "%Y%m%d"),  # Drop this component to always overwrite the same location.
    "measurement_74098089",
    "measurement_74098089_*.csv"
  )
message(
  str_glue(
    'The data will be written to {measurement_74098089_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV shards.
# NOTE: This only needs to run once; afterwards the CSVs can be read straight
#       from Cloud Storage.
Sys.getenv("WORKSPACE_CDR") %>%
  bq_dataset_query(
    dataset_74098089_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ) %>%
  bq_table_save(
    measurement_74098089_path,
    destination_format = "CSV"
  )



In [None]:
library(tidyverse)
library(bigrquery)

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_74098089_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage and row-bind
# the shards into a single tibble.
#
# Args:
#   export_path: gs:// URI of the exported shards; may contain a `*` wildcard,
#     which is handed to `gsutil ls` for expansion.
# Returns:
#   A tibble containing the rows of all listed shards, in listing order.
# Errors:
#   Stops if `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Pin the free-text columns to character so that every shard is parsed with
  # the same column types and the shards can be row-bound safely.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr = TRUE any gsutil diagnostic line would
  # be returned alongside the object names and then "loaded" as a CSV path.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  exit_status <- attr(shard_uris, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls` failed for ", export_path,
         " (exit status ", exit_status, ").", call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because the command string is interpreted by a shell.
      read_csv(pipe(paste("gsutil cat", shQuote(csv))),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}

# To load an earlier export instead of the path built above, uncomment the
# next line and point it at that export:
#measurement_74098089_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20231030/measurement_74098089/measurement_74098089_*.csv"
dataset_74098089_measurement_df <- measurement_74098089_path %>%
  read_bq_export_from_workspace_bucket()

# Show the table dimensions, then preview the first five rows.
dataset_74098089_measurement_df %>% dim()

dataset_74098089_measurement_df %>% head(n = 5)

## Process pt A1C

In [None]:
# A1C: quick look at the raw export before any filtering.
labs <- dataset_74098089_measurement_df
colnames(labs)
# Number of distinct participants with at least one row in the export.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)
# Median of the values reported in percent (spelled-out TRUE: `T` can be
# reassigned). Rows with a missing unit index as NA and are dropped by na.rm.
median(labs$value_as_number[labs$unit_concept_name == 'percent'], na.rm = TRUE)

#max(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the global 4 SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Apply the QC steps listed at the top of the file to the A1C labs:
#   1. drop rows whose visit concept name mentions Emergency, Intensive or
#      Urgent (rows with a missing visit name are kept, because grepl()
#      returns FALSE for NA);
#   2. drop rows reported in one of the excluded units, and rows with a
#      missing unit (the original `!=` comparisons dropped those implicitly);
#   3. drop missing ("NULL"/NA) and non-finite values, then exact duplicates;
#   4. per person, drop values at or above 10x that person's median;
#   5. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
# NOTE(review): negative values are not removed here, unlike in the ALT
# pipeline -- confirm whether that is intended for A1C.
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(!is.na(unit_concept_name) &
           !unit_concept_name %in% c("gram per deciliter",
                                     "Seconds per CO",
                                     "per gram of hemoglobin",
                                     "second")) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~ na_if(.x, "NULL"))) %>%
    drop_na() %>%
    # An infinite value would make mean() infinite and sd() NaN below, which
    # turns every comparison into NA and silently empties the table.
    filter(is.finite(value_as_number)) %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both tails are tested in ONE filter so that the lower and upper bounds
    # use the same mean and SD; two chained filters would recompute them for
    # the upper bound after the lower tail had already been removed.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Range check on the cleaned A1C values: minimum, median, maximum.
with(labs_formatted, min(value_as_number))
with(labs_formatted, median(value_as_number))
with(labs_formatted, max(value_as_number))


In [None]:
# Save the cleaned long-format HbA1c table as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-HbA1c_wout_outliers_long_102923.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# ALT

## Extract pt ALT

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with ALT" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   * Inner sub-select: keeps `measurement` rows whose measurement_concept_id
#     is a standard, selectable `cb_criteria` concept located at or beneath
#     (by `path` matching) the criteria entry for concept ID 3006923
#     (presumably the ALT concept -- confirm against the OMOP vocabulary),
#     and whose person has at least one standard event for that concept ID
#     in `cb_search_all_events`.
#   * Outer select: LEFT JOINs `concept` to attach readable names for the
#     standard concept, measurement type, operator, value, unit and source
#     concept, and goes through `visit_occurrence` to attach the visit
#     concept name (exposed as visit_occurrence_concept_name).
# Table names are unqualified; they are presumably resolved against the
# dataset handed to bq_dataset_query() in the export step -- confirm.
# paste() with a single string and sep="" returns that string unchanged.
dataset_35250829_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3006923
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3006923) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export.
# NOTE: The date component means that exports made on the same day overwrite
#       each other, while exports made on different days land in separate
#       folders, so older copies survive changes to the dataset definition.
measurement_35250829_path <- Sys.getenv("WORKSPACE_BUCKET") %>%
  file.path(
    "bq_exports",
    Sys.getenv("OWNER_EMAIL"),
    strftime(lubridate::now(), "%Y%m%d"),  # Drop this component to always overwrite the same location.
    "measurement_35250829",
    "measurement_35250829_*.csv"
  )
message(
  str_glue(
    'The data will be written to {measurement_35250829_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV shards.
# NOTE: This only needs to run once; afterwards the CSVs can be read straight
#       from Cloud Storage.
Sys.getenv("WORKSPACE_CDR") %>%
  bq_dataset_query(
    dataset_35250829_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ) %>%
  bq_table_save(
    measurement_35250829_path,
    destination_format = "CSV"
  )



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_35250829_path}` to copy these files
#       to the Jupyter disk.
# Read every CSV shard of a BigQuery export from Cloud Storage and row-bind
# the shards into a single tibble.
#
# Args:
#   export_path: gs:// URI of the exported shards; may contain a `*` wildcard,
#     which is handed to `gsutil ls` for expansion.
# Returns:
#   A tibble containing the rows of all listed shards, in listing order.
# Errors:
#   Stops if `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Pin the free-text columns to character so that every shard is parsed with
  # the same column types and the shards can be row-bound safely.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr = TRUE any gsutil diagnostic line would
  # be returned alongside the object names and then "loaded" as a CSV path.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  exit_status <- attr(shard_uris, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls` failed for ", export_path,
         " (exit status ", exit_status, ").", call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the URI because the command string is interpreted by a shell.
      read_csv(pipe(paste("gsutil cat", shQuote(csv))),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
# To load an earlier export instead of the path built above, uncomment the
# next line and point it at that export:
#measurement_35250829_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_35250829/measurement_35250829_*.csv"
dataset_35250829_measurement_df <- measurement_35250829_path %>%
  read_bq_export_from_workspace_bucket()

# Show the table dimensions, then preview the first five rows.
dataset_35250829_measurement_df %>% dim()

dataset_35250829_measurement_df %>% head(n = 5)

## Process pt ALT

In [None]:
# ALT: quick look at the raw export before any filtering.
labs <- dataset_35250829_measurement_df
colnames(labs)
# Number of distinct participants with at least one row in the export.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)
# Smallest value among rows whose unit is exactly 'u/L' (spelled-out TRUE:
# `T` can be reassigned). min() returns Inf with a warning if no row matches.
# NOTE(review): confirm 'u/L' is the unit label used in this export.
min(labs$value_as_number[labs$unit_concept_name == 'u/L'], na.rm = TRUE)

#max(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end of the
# pipeline (the per-person median filter is always applied).
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Drop measurements whose visit type mentions Emergency, Intensive or Urgent.
  # grepl() returns FALSE for NA, so rows without a visit type are kept.
  filter(
    !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)
  ) %>%
  # Drop measurements reported in the listed units. Rows with a missing unit
  # are dropped too, exactly as a chain of `!=` comparisons would drop them.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c(
      "milliliter",
      "milligram per deciliter",
      "unified atomic mass unit",
      "mIU/mL"
    )
  ) %>%
  # Keep non-negative values; the comparison also removes NA values.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing, then drop incomplete rows and
  # exact duplicate rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per-person filter: keep a value only if it is below 10x that person's
  # median. NOTE(review): a person whose median is 0 loses every row here.
  group_by(person_id) %>%
  filter(value_as_number < (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Cohort-wide filter: both bounds are evaluated in a single filter() call so
  # they use the same mean and SD. Two chained filters would compute the upper
  # bound from data already trimmed by the lower bound.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
        value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Minimum, median and maximum of the values that survived filtering.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% max()

In [None]:
# Save the filtered long-format table as tab-separated text, without row
# names and without quoting.
write.table(
  labs_formatted,
  file = paste0(
    data_path,
    "/dataframes/blood/AllofUs_v8_labs-ALT_wout_outliers_long_012924.txt"
  ),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# AST

## Extract pt AST

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with AST" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# Structure of the SQL text below:
#   * Inner SELECT: rows of `measurement` whose measurement_concept_id is a
#     standard, selectable `cb_criteria` concept whose `path` contains the id of
#     a `cb_criteria` row for concept 3013721 (full_text LIKE '%_rank1]%'), and
#     whose person_id appears in `cb_search_person` and has a standard
#     `cb_search_all_events` row for concept 3013721.
#   * Outer SELECT: LEFT JOINs `concept` to attach names/codes for the standard
#     concept, measurement type, operator, value, unit and source concept, and
#     joins `visit_occurrence` to attach the visit concept name.
# Table names are unqualified in the SQL text.
# The whole query is a single string literal; paste(..., sep="") is given only
# that one string.
dataset_63436989_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3013721
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3013721) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export. Because the
# path contains today's date, exporting again on the same day overwrites that
# day's files, while exporting on a later day writes to a new folder so older
# copies are kept as the dataset definition changes.
measurement_63436989_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Remove this date component if the export should always overwrite.
  strftime(lubridate::now(), "%Y%m%d"),
  "measurement_63436989",
  "measurement_63436989_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_63436989_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# `bq_table_save` only needs to run once; afterwards the CSVs can be read
# directly from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_63436989_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_63436989_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_63436989_path}` to copy these files
#       to the Jupyter disk.
#' Read a sharded BigQuery CSV export from Cloud Storage into one data frame.
#'
#' Lists the objects matching `export_path` with `gsutil ls`, streams each one
#' through `gsutil cat` into `read_csv()`, and row-binds the results.
#'
#' @param export_path Cloud Storage path or wildcard pattern handed to
#'   `gsutil ls`.
#' @return The row-bound contents of every listed CSV shard. Stops with an
#'   error if the listing fails or returns no `gs://` objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Columns forced to character; every other column keeps read_csv()'s guess.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only. Capturing stderr as well would return gsutil
  # diagnostics in the same vector as the object URLs, and each of those lines
  # would then be passed to `gsutil cat` as if it were a file.
  listing <- system2(
    "gsutil",
    args = c("ls", export_path),
    stdout = TRUE,
    stderr = ""
  )
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop(
      "`gsutil ls` failed for ", export_path,
      " (exit status ", exit_status, ").",
      call. = FALSE
    )
  }

  # Keep only object URLs in case gsutil printed anything else on stdout.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  shards <- map(csv_files, function(csv) {
    message(str_glue("Loading {csv}."))
    # shQuote() protects object names containing shell metacharacters.
    read_csv(
      pipe(str_glue("gsutil cat {shQuote(csv)}")),
      col_types = col_types,
      show_col_types = FALSE
    )
  })
  bind_rows(shards)
}
#measurement_63436989_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_63436989/measurement_63436989_*.csv"
# Load every CSV shard found under the export path into one data frame.
dataset_63436989_measurement_df <- measurement_63436989_path %>%
  read_bq_export_from_workspace_bucket()

# Dimensions (rows, columns) of the loaded export, then its first five rows.
dim(dataset_63436989_measurement_df)
dataset_63436989_measurement_df %>% head(n = 5)

## Process pt AST

In [None]:
# AST: working copy of the exported measurements used by the QC cells below.
labs <- dataset_63436989_measurement_df
colnames(labs)
# Number of distinct person_id values in the export.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit name, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Largest value reported in "unit per liter". TRUE is spelled out because `T`
# is an ordinary variable that can be reassigned.
max(labs$value_as_number[labs$unit_concept_name == "unit per liter"], na.rm = TRUE)

#max(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end of the
# pipeline (the per-person median filter is always applied).
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Drop measurements whose visit type mentions Emergency, Intensive or Urgent.
  # grepl() returns FALSE for NA, so rows without a visit type are kept.
  filter(
    !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)
  ) %>%
  # Drop measurements reported in the listed units. Rows with a missing unit
  # are dropped too, exactly as a chain of `!=` comparisons would drop them.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c(
      "micro-international unit per milliliter",
      "hour",
      "Henry",
      "unified atomic mass unit",
      "milligram per liter",
      "mIU/mL"
    )
  ) %>%
  # Keep non-negative values; the comparison also removes NA values.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing, then drop incomplete rows and
  # exact duplicate rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per-person filter: keep a value only if it is below 10x that person's
  # median. NOTE(review): a person whose median is 0 loses every row here.
  group_by(person_id) %>%
  filter(value_as_number < (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Cohort-wide filter: both bounds are evaluated in a single filter() call so
  # they use the same mean and SD. Two chained filters would compute the upper
  # bound from data already trimmed by the lower bound.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
        value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Minimum, median and maximum of the values that survived filtering.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% max()

In [None]:
# Save the filtered long-format table as tab-separated text, without row
# names and without quoting.
write.table(
  labs_formatted,
  file = paste0(
    data_path,
    "/dataframes/blood/AllofUs_v8_labs-AST_wout_outliers_long_012924.txt"
  ),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# GGT

## Extract pt GGT

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with GGT" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# Structure of the SQL text below:
#   * Inner SELECT: rows of `measurement` whose measurement_concept_id is a
#     standard, selectable `cb_criteria` concept whose `path` contains the id of
#     a `cb_criteria` row for concept 3026910 (full_text LIKE '%_rank1]%'), and
#     whose person_id appears in `cb_search_person` and has a standard
#     `cb_search_all_events` row for concept 3026910.
#   * Outer SELECT: LEFT JOINs `concept` to attach names/codes for the standard
#     concept, measurement type, operator, value, unit and source concept, and
#     joins `visit_occurrence` to attach the visit concept name.
# Table names are unqualified in the SQL text.
# The whole query is a single string literal; paste(..., sep="") is given only
# that one string.
dataset_00353129_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3026910
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3026910) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export. Because the
# path contains today's date, exporting again on the same day overwrites that
# day's files, while exporting on a later day writes to a new folder so older
# copies are kept as the dataset definition changes.
measurement_00353129_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Remove this date component if the export should always overwrite.
  strftime(lubridate::now(), "%Y%m%d"),
  "measurement_00353129",
  "measurement_00353129_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_00353129_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# `bq_table_save` only needs to run once; afterwards the CSVs can be read
# directly from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_00353129_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_00353129_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_00353129_path}` to copy these files
#       to the Jupyter disk.
#' Read a sharded BigQuery CSV export from Cloud Storage into one data frame.
#'
#' Lists the objects matching `export_path` with `gsutil ls`, streams each one
#' through `gsutil cat` into `read_csv()`, and row-binds the results.
#'
#' @param export_path Cloud Storage path or wildcard pattern handed to
#'   `gsutil ls`.
#' @return The row-bound contents of every listed CSV shard. Stops with an
#'   error if the listing fails or returns no `gs://` objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Columns forced to character; every other column keeps read_csv()'s guess.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only. Capturing stderr as well would return gsutil
  # diagnostics in the same vector as the object URLs, and each of those lines
  # would then be passed to `gsutil cat` as if it were a file.
  listing <- system2(
    "gsutil",
    args = c("ls", export_path),
    stdout = TRUE,
    stderr = ""
  )
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop(
      "`gsutil ls` failed for ", export_path,
      " (exit status ", exit_status, ").",
      call. = FALSE
    )
  }

  # Keep only object URLs in case gsutil printed anything else on stdout.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  shards <- map(csv_files, function(csv) {
    message(str_glue("Loading {csv}."))
    # shQuote() protects object names containing shell metacharacters.
    read_csv(
      pipe(str_glue("gsutil cat {shQuote(csv)}")),
      col_types = col_types,
      show_col_types = FALSE
    )
  })
  bind_rows(shards)
}
#measurement_00353129_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_00353129/measurement_00353129_*.csv"
# Load every CSV shard found under the export path into one data frame.
dataset_00353129_measurement_df <- measurement_00353129_path %>%
  read_bq_export_from_workspace_bucket()

# Dimensions (rows, columns) of the loaded export, then its first five rows.
dim(dataset_00353129_measurement_df)
dataset_00353129_measurement_df %>% head(n = 5)

## Process pt GGT

In [None]:
# GGT: working copy of the exported measurements used by the QC cells below.
labs <- dataset_00353129_measurement_df
colnames(labs)
# Number of distinct person_id values in the export.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit name, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median of the values reported in "u/L". TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
median(labs$value_as_number[labs$unit_concept_name == "u/L"], na.rm = TRUE)

#max(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end of the
# pipeline (the per-person median filter is always applied).
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Drop measurements whose visit type mentions Emergency, Intensive or Urgent.
  # grepl() returns FALSE for NA, so rows without a visit type are kept.
  filter(
    !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)
  ) %>%
  # Drop measurements reported in the listed units. Rows with a missing unit
  # are dropped too, exactly as a chain of `!=` comparisons would drop them.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c(
      "milligram per deciliter",
      "mIU/mL",
      "microliter",
      "u/L"
    )
  ) %>%
  # Keep non-negative values; the comparison also removes NA values.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing, then drop incomplete rows and
  # exact duplicate rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per-person filter: keep a value only if it is below 10x that person's
  # median. NOTE(review): a person whose median is 0 loses every row here.
  group_by(person_id) %>%
  filter(value_as_number < (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Cohort-wide filter: both bounds are evaluated in a single filter() call so
  # they use the same mean and SD. Two chained filters would compute the upper
  # bound from data already trimmed by the lower bound.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
        value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Minimum, median and maximum of the values that survived filtering.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% max()

In [None]:
# Save the filtered long-format table as tab-separated text, without row
# names and without quoting.
write.table(
  labs_formatted,
  file = paste0(
    data_path,
    "/dataframes/blood/AllofUs_v8_labs-GGT_wout_outliers_long_012924.txt"
  ),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Alk-Phos

## Extract pt Alk-Phos

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Alk-Phos" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# Structure of the SQL text below:
#   * Inner SELECT: rows of `measurement` whose measurement_concept_id is a
#     standard, selectable `cb_criteria` concept whose `path` contains the id of
#     a `cb_criteria` row for concept 3035995 (full_text LIKE '%_rank1]%'), and
#     whose person_id appears in `cb_search_person` and has a standard
#     `cb_search_all_events` row for concept 3035995.
#   * Outer SELECT: LEFT JOINs `concept` to attach names/codes for the standard
#     concept, measurement type, operator, value, unit and source concept, and
#     joins `visit_occurrence` to attach the visit concept name.
# Table names are unqualified in the SQL text.
# The whole query is a single string literal; paste(..., sep="") is given only
# that one string.
dataset_92884239_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3035995
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3035995) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export. Because the
# path contains today's date, exporting again on the same day overwrites that
# day's files, while exporting on a later day writes to a new folder so older
# copies are kept as the dataset definition changes.
measurement_92884239_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Remove this date component if the export should always overwrite.
  strftime(lubridate::now(), "%Y%m%d"),
  "measurement_92884239",
  "measurement_92884239_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_92884239_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# `bq_table_save` only needs to run once; afterwards the CSVs can be read
# directly from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_92884239_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_92884239_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_92884239_path}` to copy these files
#       to the Jupyter disk.
#' Read a sharded BigQuery CSV export from Cloud Storage into one data frame.
#'
#' Lists the objects matching `export_path` with `gsutil ls`, streams each one
#' through `gsutil cat` into `read_csv()`, and row-binds the results.
#'
#' @param export_path Cloud Storage path or wildcard pattern handed to
#'   `gsutil ls`.
#' @return The row-bound contents of every listed CSV shard. Stops with an
#'   error if the listing fails or returns no `gs://` objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Columns forced to character; every other column keeps read_csv()'s guess.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only. Capturing stderr as well would return gsutil
  # diagnostics in the same vector as the object URLs, and each of those lines
  # would then be passed to `gsutil cat` as if it were a file.
  listing <- system2(
    "gsutil",
    args = c("ls", export_path),
    stdout = TRUE,
    stderr = ""
  )
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop(
      "`gsutil ls` failed for ", export_path,
      " (exit status ", exit_status, ").",
      call. = FALSE
    )
  }

  # Keep only object URLs in case gsutil printed anything else on stdout.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  shards <- map(csv_files, function(csv) {
    message(str_glue("Loading {csv}."))
    # shQuote() protects object names containing shell metacharacters.
    read_csv(
      pipe(str_glue("gsutil cat {shQuote(csv)}")),
      col_types = col_types,
      show_col_types = FALSE
    )
  })
  bind_rows(shards)
}
#measurement_92884239_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_92884239/measurement_92884239_*.csv"
# Load every CSV shard found under the export path into one data frame.
dataset_92884239_measurement_df <- measurement_92884239_path %>%
  read_bq_export_from_workspace_bucket()

# Dimensions (rows, columns) of the loaded export, then its first five rows.
dim(dataset_92884239_measurement_df)
dataset_92884239_measurement_df %>% head(n = 5)

## Process pt Alk-Phos

In [None]:
# Alk-Phos: working copy of the exported measurements used by the QC cells below.
labs <- dataset_92884239_measurement_df
colnames(labs)
# Number of distinct person_id values in the export.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit name, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median of the values reported in "microgram per liter". TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
median(labs$value_as_number[labs$unit_concept_name == "microgram per liter"], na.rm = TRUE)

#max(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# When TRUE, the cohort-wide 4-SD outlier removal at the end of the pipeline is skipped.
keep_outliers <- FALSE

# Apply the QC steps listed in the file header to the raw measurement rows.
labs_formatted <- labs %>%
  # Exclude labs whose visit name mentions Emergency, Intensive or Urgent.
  # grepl() returns FALSE for NA, so rows without a visit name are retained.
  # NOTE(review): the file header says NULL visits should be removed — confirm.
  filter(
    !grepl("Emergency", visit_occurrence_concept_name),
    !grepl("Intensive", visit_occurrence_concept_name),
    !grepl("Urgent", visit_occurrence_concept_name)
  ) %>%
  # Drop rows with a missing unit or with one of the units excluded for this lab.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c(
      "microgram per liter",
      "gram per liter",
      "no value",
      "avidity index",
      "milligram per deciliter"
    )
  ) %>%
  # Keep non-negative values only; rows with an NA value are dropped here too.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" in character columns as missing.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  drop_na() %>%
  distinct() %>%
  # Per-person rule: keep values below 10x that person's own median.
  # A person whose median is 0 therefore loses all of their rows.
  group_by(person_id) %>%
  filter(value_as_number < 10 * median(value_as_number)) %>%
  ungroup() %>%
  # Cohort-wide rule: keep values within mean +/- 4 SD. Both bounds are
  # evaluated in one filter() so they share the same mean and SD; two
  # sequential filters would recompute them after the first trim.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - 4 * sd(value_as_number) &
         value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Range and median of the filtered values, shown as cell output for a quick sanity check.
min(labs_formatted$value_as_number)
median(labs_formatted$value_as_number)
max(labs_formatted$value_as_number)


In [None]:
# Save the filtered long-format table (person_id, measurement_datetime,
# value_as_number) as unquoted tab-separated text below `data_path`.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Alk-Phos_wout_outliers_long_012924.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# SHBG

## Extract pt SHBG

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with SHBG" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# BigQuery SQL, kept as one string literal (paste() with a single argument
# and sep = "" returns it unchanged). The query:
#   * keeps `measurement` rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose path contains the id of the
#     criteria row for concept 3004248, and whose person has an event for
#     concept 3004248 in `cb_search_all_events`;
#   * LEFT JOINs `concept` (and `visit_occurrence`) to attach readable names
#     for the standard concept, type, operator, value, unit, visit and source.
# Table names are unqualified; the dataset is supplied when the query is run.
dataset_13171762_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3004248
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3004248) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Formulate a Cloud Storage destination path for the data exported from BigQuery.
# NOTE: By default data exported multiple times on the same day will overwrite older copies.
#       But data exported on a different days will write to a new location so that historical
#       copies can be kept as the dataset definition is changed.
# Destination pattern:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_13171762/measurement_13171762_*.csv
measurement_13171762_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Date-stamped folder; remove this element to make every export overwrite the same location.
  format(lubridate::now(), "%Y%m%d"),
  "measurement_13171762",
  "measurement_13171762_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_13171762_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV. This only needs to run once per export; afterwards the
# CSV files can be read straight from the bucket.
bq_table_save(
  x = bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_13171762_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_13171762_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_13171762_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read every CSV file matching `export_path` and row-bind them.
  #
  # Args:
  #   export_path: Cloud Storage URI handed to `gsutil ls`; may contain a
  #     wildcard.
  # Returns:
  #   One data frame holding the rows of all matching files.
  # Raises:
  #   An error if the listing fails or matches no files.

  # Read the name/code/source-value columns as character in every file so
  # the per-file results can be row-bound without type conflicts.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: merging stderr into the listing would turn gsutil
  # warnings and errors into bogus "file paths" for the loop below.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls ", export_path, "` failed with status ", ls_status, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Stream the file through `gsutil cat` instead of copying it to disk.
      read_csv(
        pipe(str_glue('gsutil cat {shQuote(csv)}')),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
# Path of an export made on 2024-01-29, kept for reference.
# NOTE(review): presumably uncommented to re-read that export instead of re-exporting — confirm.
#measurement_13171762_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_13171762/measurement_13171762_*.csv"
# Read all exported CSV files under the path into a single data frame.
dataset_13171762_measurement_df <- read_bq_export_from_workspace_bucket(measurement_13171762_path)

# Number of rows and columns in the raw export.
dim(dataset_13171762_measurement_df)

# Preview the first five rows.
head(dataset_13171762_measurement_df, 5)

## Process pt SHBG

In [None]:
# SHBG: take a first look at the raw export before filtering.
labs <- dataset_13171762_measurement_df
colnames(labs)
# Number of distinct participants with at least one row.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit name, most frequent first (table() leaves out NA units).
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median value among the rows reported in one specific unit.
median(labs$value_as_number[labs$unit_concept_name == "nanomole per liter"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# When TRUE, the cohort-wide 4-SD outlier removal at the end of the pipeline is skipped.
keep_outliers <- FALSE

# Apply the QC steps listed in the file header to the raw measurement rows.
labs_formatted <- labs %>%
  # Exclude labs whose visit name mentions Emergency, Intensive or Urgent.
  # grepl() returns FALSE for NA, so rows without a visit name are retained.
  # NOTE(review): the file header says NULL visits should be removed — confirm.
  filter(
    !grepl("Emergency", visit_occurrence_concept_name),
    !grepl("Intensive", visit_occurrence_concept_name),
    !grepl("Urgent", visit_occurrence_concept_name)
  ) %>%
  # Drop rows with a missing or empty unit; no specific unit is excluded for this lab.
  filter(
    !is.na(unit_concept_name),
    unit_concept_name != ""
  ) %>%
  # Keep non-negative values only; rows with an NA value are dropped here too.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" in character columns as missing.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  drop_na() %>%
  distinct() %>%
  # Per-person rule: keep values below 10x that person's own median.
  # A person whose median is 0 therefore loses all of their rows.
  group_by(person_id) %>%
  filter(value_as_number < 10 * median(value_as_number)) %>%
  ungroup() %>%
  # Cohort-wide rule: keep values within mean +/- 4 SD. Both bounds are
  # evaluated in one filter() so they share the same mean and SD; two
  # sequential filters would recompute them after the first trim.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - 4 * sd(value_as_number) &
         value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Range, median and SD of the filtered values, shown as cell output for a quick sanity check.
min(labs_formatted$value_as_number)
median(labs_formatted$value_as_number)
max(labs_formatted$value_as_number)
sd(labs_formatted$value_as_number)


In [None]:
# Save the filtered long-format table (person_id, measurement_datetime,
# value_as_number) as unquoted tab-separated text below `data_path`.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-SHBG_wout_outliers_long_012924.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Triglycerides

## Extract pt triglycerides

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Triglycerides" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# BigQuery SQL, kept as one string literal (paste() with a single argument
# and sep = "" returns it unchanged). The query:
#   * keeps `measurement` rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose path contains the id of the
#     criteria row for concept 3022192, and whose person has an event for
#     concept 3022192 in `cb_search_all_events`;
#   * LEFT JOINs `concept` (and `visit_occurrence`) to attach readable names
#     for the standard concept, type, operator, value, unit, visit and source.
# Table names are unqualified; the dataset is supplied when the query is run.
dataset_06590776_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3022192
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3022192) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Formulate a Cloud Storage destination path for the data exported from BigQuery.
# NOTE: By default data exported multiple times on the same day will overwrite older copies.
#       But data exported on a different days will write to a new location so that historical
#       copies can be kept as the dataset definition is changed.
# Destination pattern:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_06590776/measurement_06590776_*.csv
measurement_06590776_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Date-stamped folder; remove this element to make every export overwrite the same location.
  format(lubridate::now(), "%Y%m%d"),
  "measurement_06590776",
  "measurement_06590776_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_06590776_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV. This only needs to run once per export; afterwards the
# CSV files can be read straight from the bucket.
bq_table_save(
  x = bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_06590776_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_06590776_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_06590776_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read every CSV file matching `export_path` and row-bind them.
  #
  # Args:
  #   export_path: Cloud Storage URI handed to `gsutil ls`; may contain a
  #     wildcard.
  # Returns:
  #   One data frame holding the rows of all matching files.
  # Raises:
  #   An error if the listing fails or matches no files.

  # Read the name/code/source-value columns as character in every file so
  # the per-file results can be row-bound without type conflicts.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: merging stderr into the listing would turn gsutil
  # warnings and errors into bogus "file paths" for the loop below.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls ", export_path, "` failed with status ", ls_status, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Stream the file through `gsutil cat` instead of copying it to disk.
      read_csv(
        pipe(str_glue('gsutil cat {shQuote(csv)}')),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
# Path of an export made on 2024-01-29, kept for reference.
# NOTE(review): presumably uncommented to re-read that export instead of re-exporting — confirm.
#measurement_06590776_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_06590776/measurement_06590776_*.csv"
# Read all exported CSV files under the path into a single data frame.
dataset_06590776_measurement_df <- read_bq_export_from_workspace_bucket(measurement_06590776_path)

# Number of rows and columns in the raw export.
dim(dataset_06590776_measurement_df)

# Preview the first five rows.
head(dataset_06590776_measurement_df, 5)

## Process pt Triglycerides

In [None]:
# Triglycerides: take a first look at the raw export before filtering.
labs <- dataset_06590776_measurement_df
colnames(labs)
# Number of distinct participants with at least one row.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit name, most frequent first (table() leaves out NA units).
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median value among the rows reported in one specific unit.
median(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# When TRUE, the cohort-wide 4-SD outlier removal at the end of the pipeline is skipped.
keep_outliers <- FALSE

# Apply the QC steps listed in the file header to the raw measurement rows.
labs_formatted <- labs %>%
  # Exclude labs whose visit name mentions Emergency, Intensive or Urgent.
  # grepl() returns FALSE for NA, so rows without a visit name are retained.
  # NOTE(review): the file header says NULL visits should be removed — confirm.
  filter(
    !grepl("Emergency", visit_occurrence_concept_name),
    !grepl("Intensive", visit_occurrence_concept_name),
    !grepl("Urgent", visit_occurrence_concept_name)
  ) %>%
  # Drop rows with a missing unit or with one of the units excluded for this lab.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c(
      "gram per deciliter",
      "unit"
    )
  ) %>%
  # Keep non-negative values only; rows with an NA value are dropped here too.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" in character columns as missing.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  drop_na() %>%
  distinct() %>%
  # Per-person rule: keep values below 10x that person's own median.
  # A person whose median is 0 therefore loses all of their rows.
  group_by(person_id) %>%
  filter(value_as_number < 10 * median(value_as_number)) %>%
  ungroup() %>%
  # Cohort-wide rule: keep values within mean +/- 4 SD. Both bounds are
  # evaluated in one filter() so they share the same mean and SD; two
  # sequential filters would recompute them after the first trim.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - 4 * sd(value_as_number) &
         value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Range, centre and spread of the filtered values, shown as cell output for a quick sanity check.
min(labs_formatted$value_as_number)
median(labs_formatted$value_as_number)
mean(labs_formatted$value_as_number)
max(labs_formatted$value_as_number)
sd(labs_formatted$value_as_number)


In [None]:
# Save the filtered long-format table (person_id, measurement_datetime,
# value_as_number) as unquoted tab-separated text below `data_path`.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Triglycerides_wout_outliers_long_012924.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Chol-HDL

## Extract pt Chol-HDL

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Chol-HDL" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# BigQuery SQL, kept as one string literal (paste() with a single argument
# and sep = "" returns it unchanged). The query:
#   * keeps `measurement` rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose path contains the id of the
#     criteria row for concept 3007070, and whose person has an event for
#     concept 3007070 in `cb_search_all_events`;
#   * LEFT JOINs `concept` (and `visit_occurrence`) to attach readable names
#     for the standard concept, type, operator, value, unit, visit and source.
# Table names are unqualified; the dataset is supplied when the query is run.
dataset_96734629_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3007070
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3007070) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Formulate a Cloud Storage destination path for the data exported from BigQuery.
# NOTE: By default data exported multiple times on the same day will overwrite older copies.
#       But data exported on a different days will write to a new location so that historical
#       copies can be kept as the dataset definition is changed.
# Destination pattern:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_96734629/measurement_96734629_*.csv
measurement_96734629_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Date-stamped folder; remove this element to make every export overwrite the same location.
  format(lubridate::now(), "%Y%m%d"),
  "measurement_96734629",
  "measurement_96734629_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_96734629_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV. This only needs to run once per export; afterwards the
# CSV files can be read straight from the bucket.
bq_table_save(
  x = bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_96734629_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_96734629_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_96734629_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read every CSV file matching `export_path` and row-bind them.
  #
  # Args:
  #   export_path: Cloud Storage URI handed to `gsutil ls`; may contain a
  #     wildcard.
  # Returns:
  #   One data frame holding the rows of all matching files.
  # Raises:
  #   An error if the listing fails or matches no files.

  # Read the name/code/source-value columns as character in every file so
  # the per-file results can be row-bound without type conflicts.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: merging stderr into the listing would turn gsutil
  # warnings and errors into bogus "file paths" for the loop below.
  shard_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(shard_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls ", export_path, "` failed with status ", ls_status, call. = FALSE)
  }
  shard_uris <- shard_uris[nzchar(shard_uris)]
  if (length(shard_uris) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(shard_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Stream the file through `gsutil cat` instead of copying it to disk.
      read_csv(
        pipe(str_glue('gsutil cat {shQuote(csv)}')),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
# Path of an export made on 2024-01-29, kept for reference.
# NOTE(review): presumably uncommented to re-read that export instead of re-exporting — confirm.
#measurement_96734629_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_96734629/measurement_96734629_*.csv"
# Read all exported CSV files under the path into a single data frame.
dataset_96734629_measurement_df <- read_bq_export_from_workspace_bucket(measurement_96734629_path)

# Number of rows and columns in the raw export.
dim(dataset_96734629_measurement_df)

# Preview the first five rows.
head(dataset_96734629_measurement_df, 5)

## Process pt Chol-HDL

In [None]:
# Chol-HDL: take a first look at the raw export before filtering.
labs <- dataset_96734629_measurement_df
colnames(labs)
# Number of distinct participants with at least one row.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit name, most frequent first (table() leaves out NA units).
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median value among the rows reported in one specific unit.
median(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4 SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent visits.
    # grepl() returns FALSE for NA, so rows without a visit name are kept.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Drop the excluded units. Rows with a missing unit are dropped as well
    # (the previous `!=` comparisons did this implicitly, since NA != x is NA).
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("gram per deciliter",
                                     "milligram per liter",
                                     "milliliter")) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    # NOTE(review): if person_id is read as a double, as.character() renders round
    # values such as 1000000 as "1e+06" -- confirm ids never take such values.
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: keep only values below 10x that participant's median.
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population-level outlier removal. Both bounds are evaluated in a single filter()
    # so they use the same mean and SD; two chained filters would recompute the
    # mean/SD for the upper bound after the lower bound had already removed rows.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC-filtered values.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC-filtered long-format table as tab-separated text
# (header kept, no row names, no quoting).
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Chol-HDL_wout_outliers_long_012924.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


# Chol-LDL

## Extract pt Chol-LDL

In [None]:
library(tidyverse)
library(bigrquery)

# Query for dataset "Pts with Chol-LDL" (domain "measurement"), generated for
# All of Us Controlled Tier Dataset v8. The SQL text is kept exactly as generated.
dataset_60162566_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3009966, 3028288, 3028437, 3053341
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3028288, 3028437, 3009966, 3053341) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so exporting again on the same day overwrites
#       that day's copy, while exports on different days are written to new locations
#       and historical copies are kept as the dataset definition changes.
measurement_60162566_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_60162566", "measurement_60162566_*.csv")
message(str_glue(
  'The data will be written to {measurement_60162566_path}. Use this path when reading ',
  'the data into your notebooks in the future.'))

# Run the query against the workspace CDR and export the result to Cloud Storage as CSV.
# NOTE: `bq_table_save` only needs to run once; afterwards the CSVs in
#       Cloud Storage can be read directly.
bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_60162566_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")) %>%
  bq_table_save(measurement_60162566_path, destination_format = "CSV")



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_60162566_path}` to copy these files
#       to the Jupyter disk.
#
# read_bq_export_from_workspace_bucket(export_path)
#   export_path: Cloud Storage URI (wildcards allowed) matching the exported CSV files.
#   Returns:     a single data frame with the rows of every matched file, bound in
#                the order `gsutil ls` lists them.
#   Errors:      stops if `gsutil ls` exits with a non-zero status or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the free-text columns as character; all other columns are left for readr to guess.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character())

  # Capture stdout only: with stderr = TRUE any gsutil warning or error text would be
  # returned as if it were an object URI and then handed to `gsutil cat`.
  # shQuote() keeps the shell from touching the wildcard; gsutil expands it itself.
  csv_uris <- system2('gsutil', args = c('ls', shQuote(export_path)), stdout = TRUE)
  exit_status <- attr(csv_uris, 'status')
  if (!is.null(exit_status) && exit_status != 0) {
    stop('`gsutil ls` failed with exit status ', exit_status, ' for ', export_path,
         call. = FALSE)
  }
  if (length(csv_uris) == 0) {
    stop('No exported files found at ', export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
# Commented-out hard-coded path to an earlier export (presumably kept so that
# export can be re-read without re-running the query -- confirm before using).
#measurement_60162566_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_60162566/measurement_60162566_*.csv"
dataset_60162566_measurement_df <- measurement_60162566_path %>%
  read_bq_export_from_workspace_bucket()

# Row/column counts, then a preview of the first five rows.
dim(dataset_60162566_measurement_df)
head(dataset_60162566_measurement_df, n = 5)

## Process pt Chol-LDL

In [None]:
# Chol-LDL: inspect the raw export before applying QC.
labs <- dataset_60162566_measurement_df
colnames(labs)
# Number of distinct participants with at least one measurement.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Unit frequencies, most common first. useNA = "ifany" also lists rows with a
# missing unit, which table() would otherwise hide.
as.data.frame(table(labs$unit_concept_name, useNA = "ifany")) %>% arrange(-Freq)
# Median value among rows recorded with one specific unit (NA values ignored).
median(labs$value_as_number[labs$unit_concept_name == 'nanomole per liter'], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4 SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent visits.
    # grepl() returns FALSE for NA, so rows without a visit name are kept.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Drop the excluded units. Rows with a missing unit are dropped as well
    # (the previous `!=` comparisons did this implicitly, since NA != x is NA).
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("nanomole per liter",
                                     "nanometer",
                                     "gram per deciliter",
                                     "international unit per liter",
                                     "microgram per liter",
                                     "milligram per 24 hours",
                                     "unit per liter")) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    # NOTE(review): if person_id is read as a double, as.character() renders round
    # values such as 1000000 as "1e+06" -- confirm ids never take such values.
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: keep only values below 10x that participant's median.
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population-level outlier removal. Both bounds are evaluated in a single filter()
    # so they use the same mean and SD; two chained filters would recompute the
    # mean/SD for the upper bound after the lower bound had already removed rows.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC-filtered values.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC-filtered long-format table as tab-separated text
# (header kept, no row names, no quoting).
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Chol-LDL_wout_outliers_long_012924.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


# Chol-Total

## Extract pt Chol-Total

In [None]:
library(tidyverse)
library(bigrquery)

# Query for dataset "Pts with Chol-Total" (domain "measurement"), generated for
# All of Us Controlled Tier Dataset v8. The SQL text is kept exactly as generated.
dataset_04523275_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3027114
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3027114) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so exporting again on the same day overwrites
#       that day's copy, while exports on different days are written to new locations
#       and historical copies are kept as the dataset definition changes.
measurement_04523275_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_04523275", "measurement_04523275_*.csv")
message(str_glue(
  'The data will be written to {measurement_04523275_path}. Use this path when reading ',
  'the data into your notebooks in the future.'))

# Run the query against the workspace CDR and export the result to Cloud Storage as CSV.
# NOTE: `bq_table_save` only needs to run once; afterwards the CSVs in
#       Cloud Storage can be read directly.
bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_04523275_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")) %>%
  bq_table_save(measurement_04523275_path, destination_format = "CSV")



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_04523275_path}` to copy these files
#       to the Jupyter disk.
#
# read_bq_export_from_workspace_bucket(export_path)
#   export_path: Cloud Storage URI (wildcards allowed) matching the exported CSV files.
#   Returns:     a single data frame with the rows of every matched file, bound in
#                the order `gsutil ls` lists them.
#   Errors:      stops if `gsutil ls` exits with a non-zero status or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the free-text columns as character; all other columns are left for readr to guess.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character())

  # Capture stdout only: with stderr = TRUE any gsutil warning or error text would be
  # returned as if it were an object URI and then handed to `gsutil cat`.
  # shQuote() keeps the shell from touching the wildcard; gsutil expands it itself.
  csv_uris <- system2('gsutil', args = c('ls', shQuote(export_path)), stdout = TRUE)
  exit_status <- attr(csv_uris, 'status')
  if (!is.null(exit_status) && exit_status != 0) {
    stop('`gsutil ls` failed with exit status ', exit_status, ' for ', export_path,
         call. = FALSE)
  }
  if (length(csv_uris) == 0) {
    stop('No exported files found at ', export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
# Commented-out hard-coded path to an earlier export (presumably kept so that
# export can be re-read without re-running the query -- confirm before using).
#measurement_04523275_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240129/measurement_04523275/measurement_04523275_*.csv"
dataset_04523275_measurement_df <- measurement_04523275_path %>%
  read_bq_export_from_workspace_bucket()

# Row/column counts, then a preview of the first five rows.
dim(dataset_04523275_measurement_df)
head(dataset_04523275_measurement_df, n = 5)

## Process pt Chol-Total

In [None]:
# Chol-Total: inspect the raw export before applying QC.
labs <- dataset_04523275_measurement_df
colnames(labs)
# Number of distinct participants with at least one measurement.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Unit frequencies, most common first. useNA = "ifany" also lists rows with a
# missing unit, which table() would otherwise hide.
as.data.frame(table(labs$unit_concept_name, useNA = "ifany")) %>% arrange(-Freq)
# Largest value among rows recorded with one specific unit (NA values ignored).
max(labs$value_as_number[labs$unit_concept_name == 'milligram per deciliter'], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4 SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent visits.
    # grepl() returns FALSE for NA, so rows without a visit name are kept.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Drop the excluded units. Rows with a missing unit are dropped as well
    # (the previous `!=` comparisons did this implicitly, since NA != x is NA).
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("millimole per liter",
                                     "milligram per 24 hours",
                                     "unit")) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    # NOTE(review): if person_id is read as a double, as.character() renders round
    # values such as 1000000 as "1e+06" -- confirm ids never take such values.
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: keep only values below 10x that participant's median.
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population-level outlier removal. Both bounds are evaluated in a single filter()
    # so they use the same mean and SD; two chained filters would recompute the
    # mean/SD for the upper bound after the lower bound had already removed rows.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC-filtered values.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC-filtered long-format table as tab-separated text
# (header kept, no row names, no quoting).
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Chol-Total_wout_outliers_long_012924.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


# BMI

## Extract pt BMI

In [None]:
library(tidyverse)
library(bigrquery)

# Query for dataset "Pts with BMI" (domain "measurement"), generated for
# All of Us Controlled Tier Dataset v8. The SQL text is kept exactly as generated.
dataset_42695967_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    3038553
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        )
                )  
                AND (
                    measurement.PERSON_ID IN (
                        SELECT
                            distinct person_id  
                        FROM
                            `cb_search_person` cb_search_person  
                        WHERE
                            cb_search_person.person_id IN (
                                SELECT
                                    criteria.person_id 
                                FROM
                                    (SELECT
                                        DISTINCT person_id,
                                        entry_date,
                                        concept_id 
                                    FROM
                                        `cb_search_all_events` 
                                    WHERE
                                        (
                                            concept_id IN (3038553) 
                                            AND is_standard = 1 
                                        )) criteria ) 
                            )
                        )
                ) measurement 
            LEFT JOIN
                `concept` m_standard_concept 
                    ON measurement.measurement_concept_id = m_standard_concept.concept_id 
            LEFT JOIN
                `concept` m_type 
                    ON measurement.measurement_type_concept_id = m_type.concept_id 
            LEFT JOIN
                `concept` m_operator 
                    ON measurement.operator_concept_id = m_operator.concept_id 
            LEFT JOIN
                `concept` m_value 
                    ON measurement.value_as_concept_id = m_value.concept_id 
            LEFT JOIN
                `concept` m_unit 
                    ON measurement.unit_concept_id = m_unit.concept_id 
            LEFT JOIn
                `visit_occurrence` v 
                    ON measurement.visit_occurrence_id = v.visit_occurrence_id 
            LEFT JOIN
                `concept` m_visit 
                    ON v.visit_concept_id = m_visit.concept_id 
            LEFT JOIN
                `concept` m_source_concept 
                    ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so exporting again on the same day overwrites
#       that day's copy, while exports on different days are written to new locations
#       and historical copies are kept as the dataset definition changes.
measurement_42695967_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_42695967", "measurement_42695967_*.csv")
message(str_glue(
  'The data will be written to {measurement_42695967_path}. Use this path when reading ',
  'the data into your notebooks in the future.'))

# Run the query against the workspace CDR and export the result to Cloud Storage as CSV.
# NOTE: `bq_table_save` only needs to run once; afterwards the CSVs in
#       Cloud Storage can be read directly.
bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_42695967_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")) %>%
  bq_table_save(measurement_42695967_path, destination_format = "CSV")



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_42695967_path}` to copy these files
#       to the Jupyter disk.
#
# read_bq_export_from_workspace_bucket(export_path)
#   export_path: Cloud Storage URI (wildcards allowed) matching the exported CSV files.
#   Returns:     a single data frame with the rows of every matched file, bound in
#                the order `gsutil ls` lists them.
#   Errors:      stops if `gsutil ls` exits with a non-zero status or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the free-text columns as character; all other columns are left for readr to guess.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character())

  # Capture stdout only: with stderr = TRUE any gsutil warning or error text would be
  # returned as if it were an object URI and then handed to `gsutil cat`.
  # shQuote() keeps the shell from touching the wildcard; gsutil expands it itself.
  csv_uris <- system2('gsutil', args = c('ls', shQuote(export_path)), stdout = TRUE)
  exit_status <- attr(csv_uris, 'status')
  if (!is.null(exit_status) && exit_status != 0) {
    stop('`gsutil ls` failed with exit status ', exit_status, ' for ', export_path,
         call. = FALSE)
  }
  if (length(csv_uris) == 0) {
    stop('No exported files found at ', export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
# Commented-out hard-coded path to an earlier export (presumably kept so that
# export can be re-read without re-running the query -- confirm before using).
#measurement_42695967_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240207/measurement_42695967/measurement_42695967_*.csv"
dataset_42695967_measurement_df <- measurement_42695967_path %>%
  read_bq_export_from_workspace_bucket()

# Row/column counts, then a preview of the first five rows.
dim(dataset_42695967_measurement_df)
head(dataset_42695967_measurement_df, n = 5)


## Process pt BMI

In [None]:
# BMI: inspect the raw export before applying QC.
labs <- dataset_42695967_measurement_df
colnames(labs)
# Number of distinct participants with at least one measurement.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Unit frequencies, most common first. useNA = "ifany" also lists rows with a
# missing unit, which table() would otherwise hide.
as.data.frame(table(labs$unit_concept_name, useNA = "ifany")) %>% arrange(-Freq)
# Median value among rows recorded with one specific unit (NA values ignored).
median(labs$value_as_number[labs$unit_concept_name == "square meter"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Number of distinct participants that have a measurement recorded with this unit.
new_labs <- labs %>%
    filter(unit_concept_name == "square meter")
length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final mean +/- 4 SD outlier filter.
keep_outliers <- FALSE

# QC pipeline (see the list at the top of the file):
#   1. drop Emergency / Intensive / Urgent visits; rows with a missing visit
#      type are kept because grepl() returns FALSE for NA
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. keep strictly positive values, then drop "NULL" strings, NAs and
#      exact duplicates
#   4. per person, drop values at or above 10x that person's median
#   5. unless keep_outliers, drop values more than 4 SD from the overall mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("link for Gunter's chain (US)", "ratio"))) %>%
    filter(value_as_number > 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in one filter() so they share the same mean and
    # SD; two chained filter() calls would recompute them for the upper bound
    # after the lower-bound rows had already been removed.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the filtered BMI values, shown in this order:
# minimum, median, mean, maximum, standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the filtered BMI measurements as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/AllofUs_v8_vitals-BMI_wout_outliers_long_020724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Basophils

## Extract basophil counts/percentages

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Basophils" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# It selects measurement rows whose concept is reached through the cb_criteria
# entries for concept ids 37042222 and 37070839, and left-joins the concept
# table to add readable names for the concept, type, operator, value, unit,
# visit and source columns.
dataset_42532320_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37042222, 37070839)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37070839, 37042222)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Build the Cloud Storage destination for the data exported from BigQuery:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_42532320/
# NOTE: Exporting again on the same day overwrites that day's files, while an
#       export on a different day is written to a new dated folder, so older
#       copies are kept as the dataset definition changes.
measurement_42532320_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_42532320",
  "measurement_42532320_*.csv"
)
message(str_glue('The data will be written to {measurement_42532320_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_42532320_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_42532320_path,
  destination_format = "CSV"
)



In [None]:
library(tidyverse)
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_42532320_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read every CSV shard matching `export_path` and row-bind the shards.
  #
  # Args:
  #   export_path: Cloud Storage URL (wildcards allowed) passed to `gsutil ls`.
  # Returns:
  #   One data frame holding the rows of all matching CSV files.
  # Raises:
  #   An error when `gsutil ls` exits with a non-zero status.

  # Force these columns to character in every shard; readr would otherwise
  # guess a type separately for each file.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture only stdout as the file listing. stderr goes to a temp file so
  # that gsutil diagnostics are never mistaken for CSV paths, yet can still
  # be reported if the listing fails.
  gsutil_errors <- tempfile()
  on.exit(unlink(gsutil_errors), add = TRUE)
  csv_files <- system2("gsutil", args = c("ls", export_path),
                       stdout = TRUE, stderr = gsutil_errors)

  # system2() attaches a "status" attribute only when the exit code is non-zero.
  exit_status <- attr(csv_files, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    details <- if (file.exists(gsutil_errors)) {
      paste(readLines(gsutil_errors, warn = FALSE), collapse = "\n")
    } else {
      ""
    }
    stop("`gsutil ls ", export_path, "` failed (exit status ", exit_status,
         "):\n", details, call. = FALSE)
  }

  bind_rows(
    map(csv_files,
        function(csv) {
          message(str_glue('Loading {csv}.'))
          read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
        }))
}
# To read a previous export instead of today's, uncomment the line below and
# point it at that export's path.
#measurement_42532320_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240526/measurement_42532320/measurement_42532320_*.csv"
dataset_42532320_measurement_df <-
  read_bq_export_from_workspace_bucket(measurement_42532320_path)

# Number of rows and columns loaded.
dim(dataset_42532320_measurement_df)

# Preview of the first five rows.
head(dataset_42532320_measurement_df, n = 5)

## Process basophil counts (thousand/uL)

In [None]:
# Basophil counts: keep the measurements whose standard concept name
# contains "volume".
labs <- dataset_42532320_measurement_df %>%
    filter(grepl("volume", standard_concept_name))
colnames(labs)
# Number of distinct participants with such a measurement.
length(unique(labs$person_id))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Units reported for these measurements, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)
# Quartiles of the values recorded with the unit "per microliter".
quantile(labs$value_as_number[labs$unit_concept_name == "per microliter"], na.rm = TRUE)

#median(labs$value_as_number, na.rm = TRUE)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final mean +/- 4 SD outlier filter.
keep_outliers <- FALSE

# QC pipeline (see the list at the top of the file):
#   1. drop Emergency / Intensive / Urgent visits; rows with a missing visit
#      type are kept because grepl() returns FALSE for NA
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. keep non-negative values, then drop "NULL" strings, NAs and exact
#      duplicates
#   4. per person, drop values above 10x that person's median
#   5. unless keep_outliers, drop values more than 4 SD from the overall mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("billion per liter",
                                      "per microliter",
                                      "no value",
                                      "percent",
                                      "cells per microliter",
                                      "cells/uL",
                                      "nL",
                                      "uL",
                                      "thousand",
                                      "cubic millimeter",
                                      "the number ten",
                                      "kilounit per liter",
                                      "number ten"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in one filter() so they share the same mean and
    # SD; two chained filter() calls would recompute them for the upper bound
    # after the lower-bound rows had already been removed.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the filtered basophil counts, shown in this order:
# minimum, median, mean, maximum, standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the filtered basophil counts as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Basophil-counts_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process basophil percentages (%)

In [None]:
# Basophil percentages: keep the measurements whose standard concept name
# contains "leukocytes".
labs <- dataset_42532320_measurement_df %>%
    filter(grepl("leukocytes", standard_concept_name))
colnames(labs)
# Number of distinct participants with such a measurement.
length(unique(labs$person_id))

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Units reported for these measurements, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(-Freq)
# Quartiles of the values recorded with the unit "no value".
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm = TRUE)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final mean +/- 4 SD outlier filter.
keep_outliers <- FALSE

# QC pipeline (see the list at the top of the file):
#   1. drop Emergency / Intensive / Urgent visits; rows with a missing visit
#      type are kept because grepl() returns FALSE for NA
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. keep non-negative values, then drop "NULL" strings, NAs and exact
#      duplicates
#   4. per person, drop values above 10x that person's median
#   5. unless keep_outliers, drop values more than 4 SD from the overall mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("thousand per microliter",
                                      "thousand per cubic millimeter",
                                      "cells per microliter",
                                      "billion per liter",
                                      "gram per liter",
                                      "per microliter",
                                      "cells"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in one filter() so they share the same mean and
    # SD; two chained filter() calls would recompute them for the upper bound
    # after the lower-bound rows had already been removed.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the filtered basophil percentages, shown in this order:
# minimum, median, mean, maximum, standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the filtered basophil percentages as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Basophil-percentages_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Eosinophils

## Extract eosinophil counts/percentages

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Eosinophils" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# It selects measurement rows whose concept is reached through the cb_criteria
# entries for concept ids 37054426 and 37060474, and left-joins the concept
# table to add readable names for the concept, type, operator, value, unit,
# visit and source columns.
dataset_12778330_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37054426, 37060474)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37060474, 37054426)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Build the Cloud Storage destination for the data exported from BigQuery:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_12778330/
# NOTE: Exporting again on the same day overwrites that day's files, while an
#       export on a different day is written to a new dated folder, so older
#       copies are kept as the dataset definition changes.
measurement_12778330_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_12778330",
  "measurement_12778330_*.csv"
)
message(str_glue('The data will be written to {measurement_12778330_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_12778330_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_12778330_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_12778330_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read every CSV shard matching `export_path` and row-bind the shards.
  #
  # Args:
  #   export_path: Cloud Storage URL (wildcards allowed) passed to `gsutil ls`.
  # Returns:
  #   One data frame holding the rows of all matching CSV files.
  # Raises:
  #   An error when `gsutil ls` exits with a non-zero status.

  # Force these columns to character in every shard; readr would otherwise
  # guess a type separately for each file.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture only stdout as the file listing. stderr goes to a temp file so
  # that gsutil diagnostics are never mistaken for CSV paths, yet can still
  # be reported if the listing fails.
  gsutil_errors <- tempfile()
  on.exit(unlink(gsutil_errors), add = TRUE)
  csv_files <- system2("gsutil", args = c("ls", export_path),
                       stdout = TRUE, stderr = gsutil_errors)

  # system2() attaches a "status" attribute only when the exit code is non-zero.
  exit_status <- attr(csv_files, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    details <- if (file.exists(gsutil_errors)) {
      paste(readLines(gsutil_errors, warn = FALSE), collapse = "\n")
    } else {
      ""
    }
    stop("`gsutil ls ", export_path, "` failed (exit status ", exit_status,
         "):\n", details, call. = FALSE)
  }

  bind_rows(
    map(csv_files,
        function(csv) {
          message(str_glue('Loading {csv}.'))
          read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
        }))
}
# To read a previous export instead of today's, uncomment the line below and
# point it at that export's path.
#measurement_12778330_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_12778330/measurement_12778330_*.csv"
dataset_12778330_measurement_df <-
  read_bq_export_from_workspace_bucket(measurement_12778330_path)

# Number of rows and columns loaded.
dim(dataset_12778330_measurement_df)

# Preview of the first five rows.
head(dataset_12778330_measurement_df, n = 5)

## Process eosinophil counts (thousand/uL)

In [None]:
# Eosinophil counts: keep the measurements whose standard concept name
# contains "volume".
labs <- filter(dataset_12778330_measurement_df,
               grepl("volume", standard_concept_name))
colnames(labs)
# Number of distinct participants with such a measurement.
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Units reported for these measurements, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))
#quantile(labs$value_as_number[labs$unit_concept_name == "per microliter"], na.rm=T)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final mean +/- 4 SD outlier filter.
keep_outliers <- FALSE

# QC pipeline (see the list at the top of the file):
#   1. drop Emergency / Intensive / Urgent visits; rows with a missing visit
#      type are kept because grepl() returns FALSE for NA
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. keep non-negative values, then drop "NULL" strings, NAs and exact
#      duplicates
#   4. per person, drop values above 10x that person's median
#   5. unless keep_outliers, drop values more than 4 SD from the overall mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("percent",
                                      "billion per liter",
                                      "per microliter",
                                      "nL",
                                      "cells per microliter",
                                      "no value",
                                      "cells/uL",
                                      "thousand",
                                      "cubic millimeter",
                                      "per cubic millimeter",
                                      "gram per liter",
                                      "the number ten",
                                      "kilounit per liter",
                                      "number ten",
                                      "microliter"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in one filter() so they share the same mean and
    # SD; two chained filter() calls would recompute them for the upper bound
    # after the lower-bound rows had already been removed.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the filtered eosinophil counts, shown in this order:
# minimum, median, mean, maximum, standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the filtered eosinophil counts as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Eosinophil-counts_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process eosinophil percentages (%)

In [None]:
# Eosinophil percentages: keep the measurements whose standard concept name
# contains "leukocytes".
labs <- filter(dataset_12778330_measurement_df,
               grepl("leukocytes", standard_concept_name))
colnames(labs)
# Number of distinct participants with such a measurement.
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Units reported for these measurements, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))
#quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm=T)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final mean +/- 4 SD outlier filter.
keep_outliers <- FALSE

# QC pipeline (see the list at the top of the file):
#   1. drop Emergency / Intensive / Urgent visits; rows with a missing visit
#      type are kept because grepl() returns FALSE for NA
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. keep non-negative values, then drop "NULL" strings, NAs and exact
#      duplicates
#   4. per person, drop values above 10x that person's median
#   5. unless keep_outliers, drop values more than 4 SD from the overall mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("thousand per microliter",
                                      "cells per microliter",
                                      "billion per liter",
                                      "thousand per cubic millimeter",
                                      "cells",
                                      "per microliter",
                                      "unit per liter"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in one filter() so they share the same mean and
    # SD; two chained filter() calls would recompute them for the upper bound
    # after the lower-bound rows had already been removed.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the filtered eosinophil percentages, shown in this order:
# minimum, median, mean, maximum, standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the filtered eosinophil percentages as a tab-separated text file
# (header row kept, no row names, no quoting).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Eosinophil-percentages_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Reticulocytes

## Extract reticulocyte immature fraction/counts/percentages

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Reticulocytes" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# It selects measurement rows whose concept is reached through the cb_criteria
# entries for concept ids 37034184, 37060058 and 37071361, and left-joins the
# concept table to add readable names for the concept, type, operator, value,
# unit, visit and source columns.
dataset_38270090_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37034184, 37060058, 37071361)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37071361, 37034184, 37060058)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Destination in Cloud Storage for the BigQuery export, laid out as
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_38270090/measurement_38270090_*.csv
# NOTE: Because the export date is one of the path components, exporting again on
#       the same day targets the same location (older copies are overwritten),
#       while an export on another day goes to a new folder, so historical copies
#       survive changes to the dataset definition.
measurement_38270090_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Drop this component to always export to the same location.
  "measurement_38270090",
  "measurement_38270090_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_38270090_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_38270090_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_38270090_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_38270090_path}` to copy these files
#       to the Jupyter disk.
#
# read_bq_export_from_workspace_bucket(export_path)
#   export_path: gs:// path (wildcards allowed) of the exported CSV files.
#   Returns:     a single data frame with the rows of every matching CSV file
#                bound together.
#   Errors:      stops if `gsutil ls` fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Pin the text columns to character so every file is parsed with the same types.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr merged in, any warning or error line printed
  # by gsutil would be treated as a file to load. The path is shell-quoted so the
  # wildcard reaches gsutil untouched.
  csv_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(csv_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed for ", export_path, " (exit status ", ls_status, ").",
         call. = FALSE)
  }
  if (length(csv_uris) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  bind_rows(
    map(csv_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_38270090_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_38270090/measurement_38270090_*.csv"
dataset_38270090_measurement_df <- read_bq_export_from_workspace_bucket(measurement_38270090_path)

# Dimensions and first rows of the loaded export.
dim(dataset_38270090_measurement_df)

head(dataset_38270090_measurement_df, 5)

## Process immature reticulocyte fraction (%)

In [None]:
# Immature reticulocyte fraction: keep rows whose standard concept name contains
# "Immature", plus rows named exactly "Reticulocyte production index".
# NOTE(review): confirm the production index is meant to be pooled with the
#               immature-fraction measurements.
labs <- dataset_38270090_measurement_df %>%
    filter(
        grepl(pattern = "Immature", x = standard_concept_name) |
            standard_concept_name == "Reticulocyte production index"
    )

# Columns available and number of distinct participants in the subset.
names(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)

# Unit frequencies (most common first), then the value distribution of the rows
# whose unit is "no value".
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
labs %>%
    filter(unit_concept_name == "no value") %>%
    pull(value_as_number) %>%
    quantile(na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC for the immature reticulocyte fraction:
#   1. drop measurements whose visit concept name mentions Emergency, Intensive or Urgent
#   2. drop rows with a missing unit name or one of the excluded units
#   3. keep non-negative values only (rows with a missing value are dropped as well)
#   4. keep person / datetime / value, blank out literal "NULL" strings, drop NAs and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers, drop values more than 4 SDs from the overall mean
#      (the upper bound is computed after the lower-bound filter has been applied)
labs_formatted <- labs %>%
    filter(
        !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name),
        !is.na(unit_concept_name),
        !(unit_concept_name %in% c("picogram", "microliter")),
        value_as_number >= 0
    ) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), function(col) na_if(col, "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= 10 * median(value_as_number)) %>%
    ungroup() %>%
    filter(keep_outliers | value_as_number >= mean(value_as_number) - 4 * sd(value_as_number)) %>%
    filter(keep_outliers | value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, n = 5)


In [None]:
# Summary of the QC'd values: minimum, median, mean, maximum and standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-separated file (no quoting, no row names).
write.table(
    labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Reticulocyte-immature-fraction_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process reticulocyte counts (thousand/uL)

In [None]:
# Reticulocyte counts: keep rows whose standard concept name contains "volume".
labs <- dataset_38270090_measurement_df %>%
    filter(grepl(pattern = "volume", x = standard_concept_name))

# Columns available and number of distinct participants in the subset.
names(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)

# Unit frequencies, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
#quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm=T)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC for reticulocyte counts:
#   1. drop measurements whose visit concept name mentions Emergency, Intensive or Urgent
#   2. drop rows with a missing unit name or one of the excluded units
#   3. keep non-negative values only (rows with a missing value are dropped as well)
#   4. keep person / datetime / value, blank out literal "NULL" strings, drop NAs and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers, drop values more than 4 SDs from the overall mean
#      (the upper bound is computed after the lower-bound filter has been applied)
labs_formatted <- labs %>%
    filter(
        !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name),
        !is.na(unit_concept_name),
        !(unit_concept_name %in% c(
            "million per cubic millimeter",
            "percent",
            "million per microliter",
            "no value",
            "per microliter",
            "nL",
            "Meter per microliter",
            "billion per liter",
            "cells per microliter",
            "million per liter",
            "microunit per liter",
            "cells/uL",
            "Meter per cubic millimeter",
            "million per milliliter",
            "the number ten"
        )),
        value_as_number >= 0
    ) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), function(col) na_if(col, "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= 10 * median(value_as_number)) %>%
    ungroup() %>%
    filter(keep_outliers | value_as_number >= mean(value_as_number) - 4 * sd(value_as_number)) %>%
    filter(keep_outliers | value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, n = 5)


In [None]:
# Summary of the QC'd values: minimum, median, mean, maximum and standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-separated file (no quoting, no row names).
write.table(
    labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Reticulocyte-counts_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process reticulocyte percentages (%)

In [None]:
# Reticulocyte percentages: keep rows whose standard concept name contains "erythrocytes".
labs <- dataset_38270090_measurement_df %>%
    filter(grepl(pattern = "erythrocytes", x = standard_concept_name))

# Columns available and number of distinct participants in the subset.
names(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)

# Unit frequencies (most common first), then the value distribution of the rows
# whose unit is "no value".
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
labs %>%
    filter(unit_concept_name == "no value") %>%
    pull(value_as_number) %>%
    quantile(na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC for reticulocyte percentages:
#   1. drop measurements whose visit concept name mentions Emergency, Intensive or Urgent
#   2. drop rows with a missing unit name or one of the excluded units
#   3. keep non-negative values only (rows with a missing value are dropped as well)
#   4. keep person / datetime / value, blank out literal "NULL" strings, drop NAs and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers, drop values more than 4 SDs from the overall mean
#      (the upper bound is computed after the lower-bound filter has been applied)
labs_formatted <- labs %>%
    filter(
        !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name),
        !is.na(unit_concept_name),
        !(unit_concept_name %in% c(
            "thousand per cubic millimeter",
            "gram per liter",
            "picogram",
            "microunit per liter"
        )),
        value_as_number >= 0
    ) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), function(col) na_if(col, "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # `<=`, not `<`: with a strict comparison a participant whose median is 0
    # loses every measurement (0 < 0 is FALSE). `<=` also matches the other
    # lab pipelines in this notebook.
    filter(value_as_number <= 10 * median(value_as_number)) %>%
    ungroup() %>%
    filter(keep_outliers | value_as_number >= mean(value_as_number) - 4 * sd(value_as_number)) %>%
    filter(keep_outliers | value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, n = 5)


In [None]:
# Summary of the QC'd values: minimum, median, mean, maximum and standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-separated file (no quoting, no row names).
write.table(
    labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Reticulocyte-percentages_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Lymphocytes

## Extract lymphocyte counts/percentages

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Lymphocytes" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   - Keeps `measurement` rows whose measurement_concept_id is a standard, selectable
#     `cb_criteria` concept whose `path` contains the id of a criteria node for
#     concept ids 37031129 or 37060904.
#   - Restricts those rows to persons in `cb_search_person` that have a standard event
#     for the same concept set in `cb_search_all_events`.
#   - LEFT JOINs `concept` to attach readable names for the measurement, type, operator,
#     value, unit and source concepts, and `visit_occurrence` (plus `concept`) to attach
#     the visit concept name.
# The SQL text is a runtime string: do not reformat it.
dataset_64593798_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37031129, 37060904)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37060904, 37031129)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Destination in Cloud Storage for the BigQuery export, laid out as
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_64593798/measurement_64593798_*.csv
# NOTE: Because the export date is one of the path components, exporting again on
#       the same day targets the same location (older copies are overwritten),
#       while an export on another day goes to a new folder, so historical copies
#       survive changes to the dataset definition.
measurement_64593798_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Drop this component to always export to the same location.
  "measurement_64593798",
  "measurement_64593798_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_64593798_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_64593798_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_64593798_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_64593798_path}` to copy these files
#       to the Jupyter disk.
#
# read_bq_export_from_workspace_bucket(export_path)
#   export_path: gs:// path (wildcards allowed) of the exported CSV files.
#   Returns:     a single data frame with the rows of every matching CSV file
#                bound together.
#   Errors:      stops if `gsutil ls` fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Pin the text columns to character so every file is parsed with the same types.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr merged in, any warning or error line printed
  # by gsutil would be treated as a file to load. The path is shell-quoted so the
  # wildcard reaches gsutil untouched.
  csv_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(csv_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed for ", export_path, " (exit status ", ls_status, ").",
         call. = FALSE)
  }
  if (length(csv_uris) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  bind_rows(
    map(csv_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_64593798_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_64593798/measurement_64593798_*.csv"
dataset_64593798_measurement_df <- read_bq_export_from_workspace_bucket(measurement_64593798_path)

# Dimensions and first rows of the loaded export.
dim(dataset_64593798_measurement_df)

head(dataset_64593798_measurement_df, 5)

## Process lymphocyte counts (thousand/uL)

In [None]:
# Lymphocyte counts: keep rows whose standard concept name contains "volume".
labs <- dataset_64593798_measurement_df %>%
    filter(grepl(pattern = "volume", x = standard_concept_name))

# Columns available and number of distinct participants in the subset.
names(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)

# Unit frequencies (most common first), then the value distribution of the rows
# whose unit is "No matching concept".
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
labs %>%
    filter(unit_concept_name == "No matching concept") %>%
    pull(value_as_number) %>%
    quantile(na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC for lymphocyte counts:
#   1. drop measurements whose visit concept name mentions Emergency, Intensive or Urgent
#   2. drop rows with a missing unit name or one of the excluded units
#   3. keep non-negative values only (rows with a missing value are dropped as well)
#   4. keep person / datetime / value, blank out literal "NULL" strings, drop NAs and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers, drop values more than 4 SDs from the overall mean
#      (the upper bound is computed after the lower-bound filter has been applied)
labs_formatted <- labs %>%
    filter(
        !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name),
        !is.na(unit_concept_name),
        !(unit_concept_name %in% c(
            "percent",
            "billion per liter",
            "per microliter",
            "no value",
            "nL",
            "cells per microliter",
            "/mcL",
            "cells/uL",
            "thousand",
            "cubic millimeter",
            "cells per cubic millimeter",
            "uL",
            "gram per liter",
            "microliter",
            "kelvin",
            "the number ten",
            "kilounit per liter",
            "millimeter",
            "unit per liter",
            "number ten"
        )),
        value_as_number >= 0
    ) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), function(col) na_if(col, "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= 10 * median(value_as_number)) %>%
    ungroup() %>%
    filter(keep_outliers | value_as_number >= mean(value_as_number) - 4 * sd(value_as_number)) %>%
    filter(keep_outliers | value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, n = 5)


In [None]:
# Summary of the QC'd values: minimum, median, mean, maximum and standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-separated file (no quoting, no row names).
write.table(
    labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Lymphocyte-counts_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process lymphocyte percentages (%)

In [None]:
# Lymphocyte percentages: keep rows whose standard concept name contains "leukocytes".
labs <- dataset_64593798_measurement_df %>%
    filter(grepl(pattern = "leukocytes", x = standard_concept_name))

# Columns available and number of distinct participants in the subset.
names(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)

# Unit frequencies (most common first), then the value distribution of the rows
# whose unit is "no value".
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
labs %>%
    filter(unit_concept_name == "no value") %>%
    pull(value_as_number) %>%
    quantile(na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC for lymphocyte percentages:
#   1. drop measurements whose visit concept name mentions Emergency, Intensive or Urgent
#   2. drop rows with a missing unit name or one of the excluded units
#   3. keep non-negative values only (rows with a missing value are dropped as well)
#   4. keep person / datetime / value, blank out literal "NULL" strings, drop NAs and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers, drop values more than 4 SDs from the overall mean
#      (the upper bound is computed after the lower-bound filter has been applied)
labs_formatted <- labs %>%
    filter(
        !grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name),
        !is.na(unit_concept_name),
        !(unit_concept_name %in% c(
            "thousand per microliter",
            "thousand per cubic millimeter",
            "cells per microliter",
            "billion per liter",
            "cells",
            "cubic millimeter",
            "per microliter"
        )),
        value_as_number >= 0
    ) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), function(col) na_if(col, "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= 10 * median(value_as_number)) %>%
    ungroup() %>%
    filter(keep_outliers | value_as_number >= mean(value_as_number) - 4 * sd(value_as_number)) %>%
    filter(keep_outliers | value_as_number <= mean(value_as_number) + 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, n = 5)


In [None]:
# Summary of the QC'd values: minimum, median, mean, maximum and standard deviation.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-separated file (no quoting, no row names).
write.table(
    labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Lymphocyte-percentages_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Monocytes

## Extract monocyte counts/percentages

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Monocytes" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   - Keeps `measurement` rows whose measurement_concept_id is a standard, selectable
#     `cb_criteria` concept whose `path` contains the id of a criteria node for
#     concept ids 37040193 or 37060555.
#   - Restricts those rows to persons in `cb_search_person` that have a standard event
#     for the same concept set in `cb_search_all_events`.
#   - LEFT JOINs `concept` to attach readable names for the measurement, type, operator,
#     value, unit and source concepts, and `visit_occurrence` (plus `concept`) to attach
#     the visit concept name.
# The SQL text is a runtime string: do not reformat it.
dataset_95770305_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37040193, 37060555)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37060555, 37040193)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Destination in Cloud Storage for the BigQuery export, laid out as
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/<YYYYMMDD>/measurement_95770305/measurement_95770305_*.csv
# NOTE: Because the export date is one of the path components, exporting again on
#       the same day targets the same location (older copies are overwritten),
#       while an export on another day goes to a new folder, so historical copies
#       survive changes to the dataset definition.
measurement_95770305_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Drop this component to always export to the same location.
  "measurement_95770305",
  "measurement_95770305_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_95770305_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_95770305_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_95770305_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_95770305_path}` to copy these files
#       to the Jupyter disk.
#
# read_bq_export_from_workspace_bucket(export_path)
#   export_path: gs:// path (wildcards allowed) of the exported CSV files.
#   Returns:     a single data frame with the rows of every matching CSV file
#                bound together.
#   Errors:      stops if `gsutil ls` fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Pin the text columns to character so every file is parsed with the same types.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr merged in, any warning or error line printed
  # by gsutil would be treated as a file to load. The path is shell-quoted so the
  # wildcard reaches gsutil untouched.
  csv_uris <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)
  ls_status <- attr(csv_uris, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("`gsutil ls` failed for ", export_path, " (exit status ", ls_status, ").",
         call. = FALSE)
  }
  if (length(csv_uris) == 0) {
    stop("No exported files found at ", export_path, ".", call. = FALSE)
  }

  bind_rows(
    map(csv_uris, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_95770305_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_95770305/measurement_95770305_*.csv"
dataset_95770305_measurement_df <- read_bq_export_from_workspace_bucket(measurement_95770305_path)

# Dimensions and first rows of the loaded export.
dim(dataset_95770305_measurement_df)

head(dataset_95770305_measurement_df, 5)

## Process monocyte counts (thousand/uL)

In [None]:
# Monocyte counts: keep the rows whose standard concept name contains "volume".
labs <- dataset_95770305_measurement_df %>%
  filter(grepl("volume", standard_concept_name))
colnames(labs)
length(unique(labs$person_id))

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Value distribution of the rows whose unit is "No matching concept".
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
quantile(
  labs$value_as_number[labs$unit_concept_name == "No matching concept"],
  na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end.
keep_outliers <- FALSE

# QC pipeline for monocyte counts:
#   1. Drop rows whose visit name contains "Emergency", "Intensive" or "Urgent"
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA).
#   2. Drop rows whose unit is one of the excluded units listed below; rows
#      with a missing unit are dropped as well.
#   3. Drop negative and missing values.
#   4. Keep person, datetime and value; turn "NULL" ids into NA; drop rows
#      with any missing field; remove exact duplicates.
#   5. Per person, drop values above 10x that person's median.
#   6. Unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
labs_formatted <- labs %>%
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  filter(
    !is.na(unit_concept_name),
    !(unit_concept_name %in% c(
      "percent",
      "per microliter",
      "billion per liter",
      "nL",
      "no value",
      "cells per microliter",
      "cells/uL",
      "thousand",
      "cubic millimeter",
      "gram per liter",
      "the number ten",
      "kilounit per liter",
      "number ten"
    ))
  ) %>%
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Both bounds are evaluated in one filter so they share the same mean and SD;
  # two sequential filters would recompute them on already-trimmed data.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
         value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd values: minimum, median, mean, maximum, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-delimited text file
# (no row names, no quoting).
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Monocyte-counts_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


## Process monocyte percentages (%)

In [None]:
# Monocyte percentages: keep the rows whose standard concept name contains
# "leukocytes".
labs <- dataset_95770305_measurement_df %>%
  filter(grepl("leukocytes", standard_concept_name))
colnames(labs)
length(unique(labs$person_id))

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Value distribution of the rows whose unit is "no value".
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
quantile(
  labs$value_as_number[labs$unit_concept_name == "no value"],
  na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end.
keep_outliers <- FALSE

# QC pipeline for monocyte percentages:
#   1. Drop rows whose visit name contains "Emergency", "Intensive" or "Urgent"
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA).
#   2. Drop rows whose unit is one of the excluded units listed below; rows
#      with a missing unit are dropped as well.
#   3. Drop negative and missing values.
#   4. Keep person, datetime and value; turn "NULL" ids into NA; drop rows
#      with any missing field; remove exact duplicates.
#   5. Per person, drop values above 10x that person's median.
#   6. Unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
labs_formatted <- labs %>%
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  filter(
    !is.na(unit_concept_name),
    !(unit_concept_name %in% c(
      "thousand per microliter",
      "cells per microliter",
      "thousand per cubic millimeter",
      "billion per liter",
      "kelvin",
      "cells",
      "femtoliter",
      "milliliter",
      "unit per liter"
    ))
  ) %>%
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Both bounds are evaluated in one filter so they share the same mean and SD;
  # two sequential filters would recompute them on already-trimmed data.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
         value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd values: minimum, median, mean, maximum, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-delimited text file
# (no row names, no quoting).
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Monocyte-percentages_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Neutrophils

## Extract neutrophil counts/percentages

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Neutrophils" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   - Selects rows from `measurement` whose measurement_concept_id is one of the
#     standard, selectable `cb_criteria` concepts whose path contains the
#     criteria ids of concept ids 37028517 and 37045722.
#   - Restricts to persons in `cb_search_person` that have at least one matching
#     event in `cb_search_all_events` for the same concept set.
#   - LEFT JOINs `concept` to attach readable names for the standard concept,
#     measurement type, operator, value, unit and source concept, and goes
#     through `visit_occurrence` to attach the visit concept name.
# Table names are unqualified; the query is run further down with
# bq_dataset_query() against the dataset named by WORKSPACE_CDR.
dataset_99967281_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37028517, 37045722)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37028517, 37045722)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so repeated exports on the same day
#       overwrite each other, while exports on different days go to separate
#       locations (keeping historical copies as the dataset definition changes).
measurement_99967281_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_99967281",
  "measurement_99967281_*.csv"
)
message(
  str_glue(
    "The data will be written to {measurement_99967281_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_99967281_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_99967281_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_99967281_path}` to copy these files
#       to the Jupyter disk.

#' Load every CSV file of a BigQuery export into a single data frame.
#'
#' @param export_path Cloud Storage path (may contain a `*` wildcard) that is
#'   handed to `gsutil ls` to enumerate the exported CSV files.
#' @return The row-bound contents of all listed CSV files. The text columns
#'   named in `col_types` are forced to character; all other column types are
#'   left to `read_csv()`.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr is captured so a failure can be reported with gsutil's own message.
  listing <- system2(
    "gsutil",
    args = c("ls", shQuote(export_path)),
    stdout = TRUE,
    stderr = TRUE
  )
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop(
      "`gsutil ls` failed for ", export_path, ":\n",
      paste(listing, collapse = "\n"),
      call. = FALSE
    )
  }

  # Keep only object URLs; warnings or blank lines mixed into the captured
  # output must not be passed to `gsutil cat` as if they were files.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue("Loading {csv}."))
      read_csv(
        pipe(paste("gsutil cat", shQuote(csv))),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
#measurement_99967281_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_99967281/measurement_99967281_*.csv"
dataset_99967281_measurement_df <- read_bq_export_from_workspace_bucket(measurement_99967281_path)

dim(dataset_99967281_measurement_df)

head(dataset_99967281_measurement_df, 5)

## Process neutrophil counts (thousand/uL)

In [None]:
# Neutrophil counts: keep the rows whose standard concept name contains "volume".
labs <- dataset_99967281_measurement_df %>%
  filter(grepl("volume", standard_concept_name))
colnames(labs)
length(unique(labs$person_id))

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Value distribution of the rows whose unit is "No matching concept".
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
quantile(
  labs$value_as_number[labs$unit_concept_name == "No matching concept"],
  na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end.
keep_outliers <- FALSE

# QC pipeline for neutrophil counts:
#   1. Drop rows whose visit name contains "Emergency", "Intensive" or "Urgent"
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA).
#   2. Drop rows whose unit is one of the excluded units listed below; rows
#      with a missing unit are dropped as well.
#   3. Drop negative and missing values.
#   4. Keep person, datetime and value; turn "NULL" ids into NA; drop rows
#      with any missing field; remove exact duplicates.
#   5. Per person, drop values above 10x that person's median.
#   6. Unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
labs_formatted <- labs %>%
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  filter(
    !is.na(unit_concept_name),
    !(unit_concept_name %in% c(
      "percent",
      "per microliter",
      "billion per liter",
      "nL",
      "cells per microliter",
      "no value",
      "/mm3",
      "thousand",
      "cubic millimeter",
      "gram per liter",
      "cells/uL",
      "microliter",
      "Percentage unit",
      "the number ten",
      "kilounit per liter",
      "/mcL",
      "number ten",
      "kelvin"
    ))
  ) %>%
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Both bounds are evaluated in one filter so they share the same mean and SD;
  # two sequential filters would recompute them on already-trimmed data.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
         value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd values: minimum, median, mean, maximum, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-delimited text file
# (no row names, no quoting).
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Neutrophil-counts_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


## Process neutrophil percentages (%)

In [None]:
# Neutrophil percentages: keep the rows whose standard concept name contains
# "leukocytes".
labs <- dataset_99967281_measurement_df %>%
  filter(grepl("leukocytes", standard_concept_name))
colnames(labs)
length(unique(labs$person_id))

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Value distribution of the rows whose unit is "no value".
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
quantile(
  labs$value_as_number[labs$unit_concept_name == "no value"],
  na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end.
keep_outliers <- FALSE

# QC pipeline for neutrophil percentages:
#   1. Drop rows whose visit name contains "Emergency", "Intensive" or "Urgent"
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA).
#   2. Drop rows whose unit is one of the excluded units listed below; rows
#      with a missing unit are dropped as well.
#   3. Drop negative and missing values.
#   4. Keep person, datetime and value; turn "NULL" ids into NA; drop rows
#      with any missing field; remove exact duplicates.
#   5. Per person, drop values above 10x that person's median.
#   6. Unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
labs_formatted <- labs %>%
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  filter(
    !is.na(unit_concept_name),
    !(unit_concept_name %in% c(
      "thousand per microliter",
      "cells",
      "thousand per cubic millimeter",
      "billion per liter",
      "nL",
      "femtoliter"
    ))
  ) %>%
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Both bounds are evaluated in one filter so they share the same mean and SD;
  # two sequential filters would recompute them on already-trimmed data.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
         value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd values: minimum, median, mean, maximum, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

# Default quantiles (0%, 25%, 50%, 75%, 100%) of the same values.
quantile(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-delimited text file
# (no row names, no quoting).
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Neutrophil-percentages_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Erythrocytes

## Extract nucleated erythrocyte counts/percentages, erythrocyte counts/distribution width

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Erythrocytes" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   - Selects rows from `measurement` whose measurement_concept_id is one of the
#     standard, selectable `cb_criteria` concepts whose path contains the
#     criteria ids of concept ids 37026965, 37040514, 37041261 and 37059055.
#   - Restricts to persons in `cb_search_person` that have at least one matching
#     event in `cb_search_all_events` for the same concept set.
#   - LEFT JOINs `concept` to attach readable names for the standard concept,
#     measurement type, operator, value, unit and source concept, and goes
#     through `visit_occurrence` to attach the visit concept name.
# Table names are unqualified; the query is run further down with
# bq_dataset_query() against the dataset named by WORKSPACE_CDR.
dataset_15073300_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37026965, 37040514, 37041261, 37059055)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37041261, 37026965, 37059055, 37040514)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so repeated exports on the same day
#       overwrite each other, while exports on different days go to separate
#       locations (keeping historical copies as the dataset definition changes).
measurement_15073300_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_15073300",
  "measurement_15073300_*.csv"
)
message(
  str_glue(
    "The data will be written to {measurement_15073300_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_15073300_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_15073300_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_15073300_path}` to copy these files
#       to the Jupyter disk.

# The export path is rebuilt here so this cell can run without the export cell.
# NOTE: the path contains today's date, so it only matches an export made on
#       the same day; otherwise set the path explicitly (see the commented
#       assignment below the function).
measurement_15073300_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_15073300",
  "measurement_15073300_*.csv")
message(str_glue('The data will be written to {measurement_15073300_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))



#' Load every CSV file of a BigQuery export into a single data frame.
#'
#' @param export_path Cloud Storage path (may contain a `*` wildcard) that is
#'   handed to `gsutil ls` to enumerate the exported CSV files.
#' @return The row-bound contents of all listed CSV files. The text columns
#'   named in `col_types` are forced to character; all other column types are
#'   left to `read_csv()`.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr is captured so a failure can be reported with gsutil's own message.
  listing <- system2(
    "gsutil",
    args = c("ls", shQuote(export_path)),
    stdout = TRUE,
    stderr = TRUE
  )
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop(
      "`gsutil ls` failed for ", export_path, ":\n",
      paste(listing, collapse = "\n"),
      call. = FALSE
    )
  }

  # Keep only object URLs; warnings or blank lines mixed into the captured
  # output must not be passed to `gsutil cat` as if they were files.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue("Loading {csv}."))
      read_csv(
        pipe(paste("gsutil cat", shQuote(csv))),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
#measurement_15073300_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_15073300/measurement_15073300_*.csv"
dataset_15073300_measurement_df <- read_bq_export_from_workspace_bucket(measurement_15073300_path)

dim(dataset_15073300_measurement_df)

head(dataset_15073300_measurement_df, 5)

## Process nucleated erythrocyte counts (thousand/uL)

In [None]:
# Nucleated erythrocyte counts: keep the rows whose standard concept name
# contains both "Nucleated" and "volume".
labs <- filter(
  dataset_15073300_measurement_df,
  grepl("Nucleated", standard_concept_name),
  grepl("volume", standard_concept_name)
)
colnames(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))
#quantile(labs$value_as_number[labs$unit_concept_name == "thousand per microliter"], na.rm=T, probs = seq(0, 1, 0.05))

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end.
keep_outliers <- FALSE

# QC pipeline for nucleated erythrocyte counts:
#   1. Drop rows whose visit name contains "Emergency", "Intensive" or "Urgent"
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA).
#   2. Drop rows whose unit is one of the excluded units listed below; rows
#      with a missing unit are dropped as well.
#   3. Drop negative and missing values.
#   4. Keep person, datetime and value; turn "NULL" ids into NA; drop rows
#      with any missing field; remove exact duplicates.
#   5. Per person, drop values above 10x that person's median.
#   6. Unless keep_outliers is TRUE, drop values more than 4 SDs from the mean.
labs_formatted <- labs %>%
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  filter(
    !is.na(unit_concept_name),
    !(unit_concept_name %in% c(
      "percent",
      "billion per liter",
      "per 100 white blood cells",
      "nL",
      "per hundred",
      "thousand",
      "cells per microliter",
      "per microliter",
      "no value",
      "counts",
      "trillion per liter"
    ))
  ) %>%
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Both bounds are evaluated in one filter so they share the same mean and SD;
  # two sequential filters would recompute them on already-trimmed data.
  filter(
    keep_outliers |
      (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
         value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd values: minimum, median, mean, maximum, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd long-format table as a tab-delimited text file
# (no row names, no quoting).
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Nucleated-Erythrocyte-counts_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


## Process nucleated erythrocyte percentages (%)

In [None]:
# Nucleated erythrocyte percentage: keep the rows whose standard concept name
# contains both "Nucleated" and "leukocytes".
labs <- filter(
  dataset_15073300_measurement_df,
  grepl("Nucleated", standard_concept_name),
  grepl("leukocytes", standard_concept_name)
)
colnames(labs)
n_distinct(labs$person_id)

#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reported unit, most common first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))
#quantile(labs$value_as_number[labs$unit_concept_name == "thousand per microliter"], na.rm=T, probs = seq(0, 1, 0.05))

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for nucleated erythrocyte percentages (steps listed at the top
# of the file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. drop negative values (rows with a missing value fail the test too)
#   4. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("thousand per microliter",
                                      "counts",
                                      "thousand per cubic millimeter",
                                      "milliliter per deciliter"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test. NOTE(review): confirm that is intended for this lab.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Default quantiles (0%, 25%, 50%, 75%, 100%) of the same values.
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Nucleated-Erythrocyte-percentages_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process erythrocyte counts (million/uL)

In [None]:
# Erythrocyte counts: keep measurements whose standard concept name contains
# "volume" but neither "Nucleated" nor "distribution".
labs <- filter(
    dataset_15073300_measurement_df,
    !grepl("Nucleated", standard_concept_name),
    grepl("volume", standard_concept_name),
    !grepl("distribution", standard_concept_name)
)
names(labs)
n_distinct(labs$person_id)

# Frequency of each reported unit, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quartiles of the values reported as "thousand per cubic millimeter".
labs %>%
    filter(unit_concept_name == "thousand per cubic millimeter") %>%
    pull(value_as_number) %>%
    quantile(probs = seq(0, 1, 0.25), na.rm = TRUE)

# Optional diagnostics kept from exploration (uncomment to run):
#as.data.frame(table(labs$standard_concept_name)) %>% arrange(-Freq)
#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#median(labs$value_as_number, na.rm=TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for erythrocyte counts (steps listed at the top of the file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. drop negative values (rows with a missing value fail the test too)
#   4. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("nL",
                                      "per microliter",
                                      "microlitre/ml",
                                      "microunit per liter",
                                      "no value",
                                      "per high power field",
                                      "microliter",
                                      "cells per microliter",
                                      "million",
                                      "trillion cells per liter",
                                      "liter per minute",
                                      "the number ten",
                                      "unit per 10 cells",
                                      "cubic millimeter",
                                      "percent",
                                      "number ten"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Erythrocyte-counts_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process erythrocyte distribution width (%)

In [None]:
# Erythrocyte distribution width: keep measurements whose standard concept
# name contains "distribution".
labs <- filter(
    dataset_15073300_measurement_df,
    grepl("distribution", standard_concept_name)
)
names(labs)
n_distinct(labs$person_id)

# Frequency of each standard concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Frequency of each reported unit, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quartiles of the values whose unit is "No matching concept".
labs %>%
    filter(unit_concept_name == "No matching concept") %>%
    pull(value_as_number) %>%
    quantile(probs = seq(0, 1, 0.25), na.rm = TRUE)

# Optional diagnostics kept from exploration (uncomment to run):
#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#median(labs$value_as_number, na.rm=TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for erythrocyte distribution width (steps listed at the top of
# the file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. drop negative values (rows with a missing value fail the test too)
#   4. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("femtoliter",
                                      "unit",
                                      "no value",
                                      "thousand per microliter",
                                      "microgram per liter"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Default quantiles (0%, 25%, 50%, 75%, 100%) of the same values.
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Erythrocyte-distribution-width_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Hemoglobin

## Extract hemoglobin measurements

In [None]:
library(tidyverse)
library(bigrquery)

# Workbench-generated query for dataset "Pts with Hemoglobin" (domain
# "measurement", All of Us Controlled Tier Dataset v8). It returns measurement
# rows with the names of their standard concept, type, operator, value, unit,
# visit and source concept joined in from `concept`.
dataset_06003545_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37045413, 37068065, 37070108, 37072252)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37068065, 37070108, 37045413, 37072252)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Cloud Storage destination for the CSV export. The date component means a
# re-export on the same day overwrites that day's files, while exports on
# different days land in separate folders and are kept as history.
measurement_06003545_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_06003545", "measurement_06003545_*.csv"
)
message(str_glue(
  "The data will be written to {measurement_06003545_path}. Use this path when reading ",
  "the data into your notebooks in the future."
))

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the CSVs can be
#       read straight from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_06003545_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_06003545_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_06003545_path}` to copy these files
#       to the Jupyter disk.

# Load every CSV shard matched by `export_path` and row-bind them.
#   export_path: Cloud Storage path (may contain a wildcard) passed to `gsutil ls`.
#   Returns the result of bind_rows() over one read_csv() table per listed file.
#   Stops with an error if `gsutil ls` fails or lists no gs:// objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # These text columns are forced to character; the remaining columns are left
  # to read_csv's type guessing.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr is captured together with stdout so it can be reported on failure.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop("`gsutil ls` failed for ", export_path, ":\n",
         paste(listing, collapse = "\n"), call. = FALSE)
  }
  # Keep only object URLs so that diagnostic lines are never treated as files.
  csv_files <- grep("^gs://", listing, value = TRUE)
  if (length(csv_files) == 0) {
    stop("No files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # The URL is shell-quoted because the command line is run through a pipe.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_06003545_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240527/measurement_06003545/measurement_06003545_*.csv"
dataset_06003545_measurement_df <- read_bq_export_from_workspace_bucket(measurement_06003545_path)

dim(dataset_06003545_measurement_df)

head(dataset_06003545_measurement_df, 5)

## Process hematocrit (%)

In [None]:
# Hematocrit: keep measurements whose standard concept name contains
# "Hematocrit".
labs <- filter(
    dataset_06003545_measurement_df,
    grepl("Hematocrit", standard_concept_name)
)
names(labs)
n_distinct(labs$person_id)

# Frequency of each standard concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Frequency of each reported unit, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Optional diagnostics kept from exploration (uncomment to run):
#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#quantile(labs$value_as_number[labs$unit_concept_name == "thousand per microliter"], na.rm=TRUE, probs = seq(0, 1, 0.05))
#median(labs$value_as_number, na.rm=TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for hematocrit (steps listed at the top of the file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. drop negative values (rows with a missing value fail the test too)
#   4. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("milliliter per deciliter",
                                      "no value",
                                      "gram per deciliter",
                                      "milligram per gram of creatinine",
                                      "second",
                                      "thousand per microliter"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Hematocrit_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Hemoglobin concentration (g/dL)

In [None]:
# Hemoglobin concentration: keep measurements whose standard concept name
# contains "Hemoglobin".
labs <- filter(
    dataset_06003545_measurement_df,
    grepl("Hemoglobin", standard_concept_name)
)
names(labs)
n_distinct(labs$person_id)

# Frequency of each standard concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Frequency of each reported unit, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quartiles of the values whose unit is recorded as "no value".
labs %>%
    filter(unit_concept_name == "no value") %>%
    pull(value_as_number) %>%
    quantile(probs = seq(0, 1, 0.25), na.rm = TRUE)

# Optional diagnostics kept from exploration (uncomment to run):
#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#median(labs$value_as_number, na.rm=TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for hemoglobin concentration (steps listed at the top of the
# file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. rescale "gram per liter" values by 1/10
#   4. drop negative values (rows with a missing value fail the test too)
#   5. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   6. per person, drop values above 10x that person's median
#   7. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("million per microliter",
                                      "thousand per microliter",
                                      "deciliter",
                                      "gram per milliliter",
                                      "liter",
                                      "milligram",
                                      "Unit of presentation",
                                      "percent",
                                      "femtoliter",
                                      "international unit per liter",
                                      "microliter",
                                      "Percent",
                                      "picogram"))) %>%
    # g/L -> g/dL so these rows share the scale named in the section heading.
    mutate(value_as_number = ifelse(unit_concept_name == "gram per liter",
                                    value_as_number / 10,
                                    value_as_number)) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
# NOTE(review): the file name spells "Hemaglobin"; it is left unchanged here
# because other code may read this exact path - confirm before renaming.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Hemaglobin-concentration_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Mean corpuscular hemoglobin (picogram)

In [None]:
# MCH: keep measurements whose standard concept name contains "MCH" but not
# "MCHC".
labs <- filter(
    dataset_06003545_measurement_df,
    grepl("MCH", standard_concept_name),
    !grepl("MCHC", standard_concept_name)
)
names(labs)
n_distinct(labs$person_id)

# Frequency of each standard concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Frequency of each reported unit, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quartiles of the values whose unit is "No matching concept".
labs %>%
    filter(unit_concept_name == "No matching concept") %>%
    pull(value_as_number) %>%
    quantile(probs = seq(0, 1, 0.25), na.rm = TRUE)

# Optional diagnostics kept from exploration (uncomment to run):
#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#median(labs$value_as_number, na.rm=TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for mean corpuscular hemoglobin (steps listed at the top of the
# file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. drop negative values (rows with a missing value fail the test too)
#   4. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("per gram",
                                      "no value",
                                      "microgram",
                                      "gram per deciliter",
                                      "percent",
                                      "mcg",
                                      "pg/cell",
                                      "Percent",
                                      "femtoliter",
                                      "mL/kg"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Mean-Corpuscular-Hemoglobin_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Mean corpuscular hemoglobin concentration (g/dL)

In [None]:
# MCHC: keep measurements whose standard concept name contains "MCHC".
labs <- filter(
    dataset_06003545_measurement_df,
    grepl("MCHC", standard_concept_name)
)
names(labs)
n_distinct(labs$person_id)

# Frequency of each standard concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Frequency of each reported unit, most frequent first.
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quartiles of the values whose unit is "No matching concept".
labs %>%
    filter(unit_concept_name == "No matching concept") %>%
    pull(value_as_number) %>%
    quantile(probs = seq(0, 1, 0.25), na.rm = TRUE)

# Optional diagnostics kept from exploration (uncomment to run):
#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#median(labs$value_as_number, na.rm=TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# FALSE applies the global 4-SD trim at the end of the pipeline; TRUE skips it.
keep_outliers <- FALSE

# QC filtering for mean corpuscular hemoglobin concentration (steps listed at
# the top of the file):
#   1. drop measurements from emergency / intensive-care / urgent-care visits
#      (rows with a missing visit name are kept: grepl() returns FALSE for NA)
#   2. drop rows whose unit is missing or is one of the excluded units
#   3. drop negative values (rows with a missing value fail the test too)
#   4. keep person_id, timestamp and value; remove "NULL"/NA rows and duplicates
#   5. per person, drop values above 10x that person's median
#   6. unless keep_outliers is TRUE, drop values more than 4 SDs from the mean
labs_formatted <- labs %>%
    filter(!grepl("Emergency", visit_occurrence_concept_name),
           !grepl("Intensive", visit_occurrence_concept_name),
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # `%in%` never yields NA, so the removal of missing-unit rows (which a chain
    # of `!=` comparisons performs implicitly inside filter()) is made explicit.
    filter(!is.na(unit_concept_name),
           !(unit_concept_name %in% c("Percentage unit",
                                      "picogram",
                                      "no value",
                                      "Percent",
                                      "femtoliter",
                                      "gram per liter"))) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    # If a person's median is 0, every positive value of that person fails this
    # test.
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both 4-SD bounds are evaluated in one filter() so they share the same mean
    # and SD; two chained filter() calls would compute the upper bound on data
    # already trimmed by the lower bound.
    filter(keep_outliers |
           abs(value_as_number - mean(value_as_number)) <= 4 * sd(value_as_number))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered lab values: minimum, median, mean,
# maximum and standard deviation, each evaluated as its own expression.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Persist the filtered long-format table as a tab-separated text file
# (no row names, no quoting) below `data_path`.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Mean-Corpuscular-Hemoglobin-concentration_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Cell volumes

## Extract cell volume measurements

In [None]:
library(tidyverse)
library(bigrquery)

# Workbench-generated query for dataset "Pts with Cell Volumes" (domain
# "measurement", All of Us Controlled Tier Dataset v8). It returns measurement
# rows with the names of their standard concept, type, operator, value, unit,
# visit and source concept joined in from `concept`.
dataset_02226542_measurement_sql <- paste0("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37065843, 37071701)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37071701, 37065843)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id")

# Cloud Storage destination for the CSV export. The date component means a
# re-export on the same day overwrites that day's files, while exports on
# different days land in separate folders and are kept as history.
measurement_02226542_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_02226542", "measurement_02226542_*.csv"
)
message(str_glue(
  "The data will be written to {measurement_02226542_path}. Use this path when reading ",
  "the data into your notebooks in the future."
))

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the CSVs can be
#       read straight from Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_02226542_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_02226542_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_02226542_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(standard_concept_name = col_character(), standard_concept_code = col_character(), standard_vocabulary = col_character(), measurement_type_concept_name = col_character(), operator_concept_name = col_character(), value_as_concept_name = col_character(), unit_concept_name = col_character(), visit_occurrence_concept_name = col_character(), measurement_source_value = col_character(), source_concept_name = col_character(), source_concept_code = col_character(), source_vocabulary = col_character(), unit_source_value = col_character(), value_source_value = col_character())
  bind_rows(
    map(system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE),
        function(csv) {
          message(str_glue('Loading {csv}.'))
          chunk <- read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
          if (is.null(col_types)) {
            col_types <- spec(chunk)
          }
          chunk
        }))
}
# Uncomment to read a previous export instead of the path built for today's date.
#measurement_02226542_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_02226542/measurement_02226542_*.csv"

# Load every exported CSV shard into a single data frame.
dataset_02226542_measurement_df <- read_bq_export_from_workspace_bucket(
  measurement_02226542_path
)

# Quick sanity check: dimensions and the first five rows.
dim(dataset_02226542_measurement_df)

head(dataset_02226542_measurement_df, n = 5)

## Process mean corpuscular volume (femtoliters)

In [None]:
# MCV: keep the measurements whose standard concept name contains "MCV".
labs <- filter(dataset_02226542_measurement_df, grepl("MCV", standard_concept_name))
names(labs)
length(unique(labs[["person_id"]]))

# Frequency of each standard concept name, most common first.
arrange(as.data.frame(table(labs[["standard_concept_name"]])), desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
arrange(as.data.frame(table(labs[["unit_concept_name"]])), desc(Freq))
# Quartiles of the values recorded with the unit concept name "no value".
quantile(
    labs[["value_as_number"]][labs[["unit_concept_name"]] == "no value"],
    probs = seq(0, 1, 0.25),
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Exclude units that do not apply to this lab. Rows with a missing unit are
    # dropped here too, because `NA != "..."` evaluates to NA.
    filter(unit_concept_name != "cubic millimeter" &
           unit_concept_name != "percent" &
           unit_concept_name != "millimole per liter") %>%
    # Keep finite, non-negative values only (this also drops NA/NaN). A single
    # infinite value would make the mean/sd below non-finite and empty the result.
    filter(is.finite(value_as_number), value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per person: drop values more than 10x that person's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population level: keep values within mean +/- 4 SD. Both bounds are checked
    # in one filter() so they use the same mean and SD; two chained filters would
    # recompute them on data already trimmed by the first bound.
    filter(
        keep_outliers |
            (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
             value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
    )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values: min, median, mean, max, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Export the filtered table as tab-separated text: header row, no row names, no quoting.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Mean-Corpuscular-Volume_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process mean platelet volume (femtoliters)

In [None]:
# Mean platelet volume: keep the measurements whose standard concept name contains "Platelet".
labs <- filter(dataset_02226542_measurement_df, grepl("Platelet", standard_concept_name))
names(labs)
length(unique(labs[["person_id"]]))

# Frequency of each standard concept name, most common first.
arrange(as.data.frame(table(labs[["standard_concept_name"]])), desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
arrange(as.data.frame(table(labs[["unit_concept_name"]])), desc(Freq))
# Quartiles of the values recorded with the unit concept name "no value".
quantile(
    labs[["value_as_number"]][labs[["unit_concept_name"]] == "no value"],
    probs = seq(0, 1, 0.25),
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Exclude units that do not apply to this lab. Rows with a missing unit are
    # dropped here too, because `NA != "..."` evaluates to NA.
    filter(unit_concept_name != "percent" &
           unit_concept_name != "thousand per microliter") %>%
    # Keep finite, non-negative values only (this also drops NA/NaN). A single
    # infinite value would make the mean/sd below non-finite and empty the result.
    filter(is.finite(value_as_number), value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per person: drop values more than 10x that person's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population level: keep values within mean +/- 4 SD. Both bounds are checked
    # in one filter() so they use the same mean and SD; two chained filters would
    # recompute them on data already trimmed by the first bound.
    filter(
        keep_outliers |
            (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
             value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
    )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values: min, median, mean, max, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Export the filtered table as tab-separated text: header row, no row names, no quoting.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Mean-Platelet-Volume_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Platelets

## Extract platelet count/crit/distribution width

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Platelets" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL text below does:
#   - Inner subquery: keeps `measurement` rows whose measurement_concept_id is a
#     standard, selectable `cb_criteria` concept whose `path` contains the id of a
#     criteria entry for concept ids 37037425, 4097621 or 44787151, and whose
#     person_id is found (via `cb_search_person` / `cb_search_all_events`) to have
#     an event with one of those same concepts.
#   - Outer LEFT JOINs: look up names in `concept` for the standard concept,
#     measurement type, operator, value, unit and source concept, and go through
#     `visit_occurrence` to get the concept name of the visit.
# Table names are unqualified.
# NOTE(review): presumably they are resolved against the dataset passed to
# bq_dataset_query() where this string is used - confirm.
# With a single string argument, paste(..., sep="") returns the text unchanged.
dataset_78924662_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37037425, 4097621, 44787151)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (44787151, 4097621, 37037425)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: By default, data exported several times on the same day overwrites the
#       earlier copy, while data exported on different days is written to a new
#       location, so historical copies are kept as the dataset definition changes.
measurement_78924662_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Comment out the next line if you want the export to always overwrite.
  strftime(lubridate::now(), "%Y%m%d"),
  "measurement_78924662",
  "measurement_78924662_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_78924662_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud Storage
# as CSV files.
# NOTE: `bq_table_save` only needs to run once. Afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_78924662_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_78924662_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_78924662_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()`, and row-binds the pieces.
#
# @param export_path Cloud Storage path (wildcards allowed) of the exported CSV files.
# @return One tibble holding the rows of every listed CSV file.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # These text columns are pinned to character; all remaining columns are left
  # to read_csv's type guessing.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  listing <- system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE)

  # stderr is captured into `listing`, so after a failed `gsutil ls` it holds
  # error text rather than object paths. system2() records a non-zero exit code
  # in the "status" attribute; fail here instead of passing that text to `gsutil cat`.
  exit_status <- attr(listing, 'status')
  if (!is.null(exit_status) && exit_status != 0) {
    stop('`gsutil ls ', export_path, '` failed (exit status ', exit_status, '):\n',
         paste(listing, collapse = '\n'), call. = FALSE)
  }
  if (length(listing) == 0) {
    stop('No exported files found at ', export_path, call. = FALSE)
  }

  bind_rows(
    map(listing, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    }))
}
# Uncomment to read a previous export instead of the path built for today's date.
#measurement_78924662_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_78924662/measurement_78924662_*.csv"

# Load every exported CSV shard into a single data frame.
dataset_78924662_measurement_df <- read_bq_export_from_workspace_bucket(
  measurement_78924662_path
)

# Quick sanity check: dimensions and the first five rows.
dim(dataset_78924662_measurement_df)

head(dataset_78924662_measurement_df, n = 5)

## Process platelet counts (thousand/uL)

In [None]:
# Platelet count: keep concept names containing "volume" or "adequacy",
# excluding any that contain "distribution".
labs <- filter(
    dataset_78924662_measurement_df,
    grepl("volume|adequacy", standard_concept_name) &
        !grepl("distribution", standard_concept_name)
)
names(labs)
length(unique(labs[["person_id"]]))

# Frequency of each standard concept name, most common first.
arrange(as.data.frame(table(labs[["standard_concept_name"]])), desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
arrange(as.data.frame(table(labs[["unit_concept_name"]])), desc(Freq))
# Quartiles of the values recorded with the unit concept name "no value".
quantile(
    labs[["value_as_number"]][labs[["unit_concept_name"]] == "no value"],
    probs = seq(0, 1, 0.25),
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Keep only the units accepted for this lab. Rows with a missing unit are dropped.
    filter(unit_concept_name %in% c("thousand per microliter",
                                    "No matching concept",
                                    "thousand per cubic millimeter")) %>%
    # Keep finite, non-negative values only (this also drops NA/NaN). A single
    # infinite value would make the mean/sd below non-finite and empty the result.
    filter(is.finite(value_as_number), value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per person: drop values more than 10x that person's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population level: keep values within mean +/- 4 SD. Both bounds are checked
    # in one filter() so they use the same mean and SD; two chained filters would
    # recompute them on data already trimmed by the first bound.
    filter(
        keep_outliers |
            (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
             value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
    )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values: min, median, mean, max, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])


In [None]:
# Export the filtered table as tab-separated text: header row, no row names, no quoting.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Platelet-counts_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process platelet crit (%)

In [None]:
# Platelet crit: keep the measurements whose standard concept name contains "hematocrit".
labs <- filter(dataset_78924662_measurement_df, grepl("hematocrit", standard_concept_name))
names(labs)
length(unique(labs[["person_id"]]))

# Frequency of each standard concept name, most common first.
arrange(as.data.frame(table(labs[["standard_concept_name"]])), desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
arrange(as.data.frame(table(labs[["unit_concept_name"]])), desc(Freq))
# Quartiles of the values recorded with the unit concept name "no value".
quantile(
    labs[["value_as_number"]][labs[["unit_concept_name"]] == "no value"],
    probs = seq(0, 1, 0.25),
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
# (no unit filter is applied for this lab).
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Keep finite, non-negative values only (this also drops NA/NaN). A single
    # infinite value would make the mean/sd below non-finite and empty the result.
    filter(is.finite(value_as_number), value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per person: drop values more than 10x that person's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population level: keep values within mean +/- 4 SD. Both bounds are checked
    # in one filter() so they use the same mean and SD; two chained filters would
    # recompute them on data already trimmed by the first bound.
    filter(
        keep_outliers |
            (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
             value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
    )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values: min, median, mean, max, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

# Quartiles of `labs`, i.e. the values before the filtering above.
# NOTE(review): presumably a deliberate look at the raw distribution - confirm
# that `labs_formatted` was not intended here.
quantile(labs[["value_as_number"]], na.rm = TRUE)

## Process platelet distribution width (%)

In [None]:
# Platelet distribution width: keep the measurements whose standard concept name
# contains "distribution".
labs <- filter(dataset_78924662_measurement_df, grepl("distribution", standard_concept_name))
names(labs)
length(unique(labs[["person_id"]]))

# Frequency of each standard concept name, most common first.
arrange(as.data.frame(table(labs[["standard_concept_name"]])), desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
arrange(as.data.frame(table(labs[["unit_concept_name"]])), desc(Freq))
# Quartiles of the values recorded with the unit concept name "no value".
quantile(
    labs[["value_as_number"]][labs[["unit_concept_name"]] == "no value"],
    probs = seq(0, 1, 0.25),
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end of the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Exclude the unit that does not apply to this lab. Rows with a missing unit
    # are dropped here too, because `NA != "..."` evaluates to NA.
    filter(unit_concept_name != "thousand per microliter") %>%
    # Keep finite, non-negative values only (this also drops NA/NaN). A single
    # infinite value would make the mean/sd below non-finite and empty the result.
    filter(is.finite(value_as_number), value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per person: drop values more than 10x that person's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population level: keep values within mean +/- 4 SD. Both bounds are checked
    # in one filter() so they use the same mean and SD; two chained filters would
    # recompute them on data already trimmed by the first bound.
    filter(
        keep_outliers |
            (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
             value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
    )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values: min, median, mean, max, SD.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

# Quartiles of `labs`, i.e. the values before the filtering above.
# NOTE(review): presumably a deliberate look at the raw distribution - confirm
# that `labs_formatted` was not intended here.
quantile(labs[["value_as_number"]], na.rm = TRUE)

# Leukocytes

## Extract leukocyte counts

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Leukocytes" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL text below does:
#   - Inner subquery: keeps `measurement` rows whose measurement_concept_id is a
#     standard, selectable `cb_criteria` concept whose `path` contains the id of a
#     criteria entry for concept id 37043992, and whose person_id is found (via
#     `cb_search_person` / `cb_search_all_events`) to have an event with such a concept.
#   - Outer LEFT JOINs: look up names in `concept` for the standard concept,
#     measurement type, operator, value, unit and source concept, and go through
#     `visit_occurrence` to get the concept name of the visit.
# Table names are unqualified.
# NOTE(review): presumably they are resolved against the dataset passed to
# bq_dataset_query() where this string is used - confirm.
# With a single string argument, paste(..., sep="") returns the text unchanged.
dataset_86273426_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37043992)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37043992)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: By default, data exported several times on the same day overwrites the
#       earlier copy, while data exported on different days is written to a new
#       location, so historical copies are kept as the dataset definition changes.
measurement_86273426_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # Comment out the next line if you want the export to always overwrite.
  strftime(lubridate::now(), "%Y%m%d"),
  "measurement_86273426",
  "measurement_86273426_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_86273426_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud Storage
# as CSV files.
# NOTE: `bq_table_save` only needs to run once. Afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_86273426_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  measurement_86273426_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_86273426_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()`, and row-binds the pieces.
#
# @param export_path Cloud Storage path (wildcards allowed) of the exported CSV files.
# @return One tibble holding the rows of every listed CSV file.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # These text columns are pinned to character; all remaining columns are left
  # to read_csv's type guessing.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  listing <- system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE)

  # stderr is captured into `listing`, so after a failed `gsutil ls` it holds
  # error text rather than object paths. system2() records a non-zero exit code
  # in the "status" attribute; fail here instead of passing that text to `gsutil cat`.
  exit_status <- attr(listing, 'status')
  if (!is.null(exit_status) && exit_status != 0) {
    stop('`gsutil ls ', export_path, '` failed (exit status ', exit_status, '):\n',
         paste(listing, collapse = '\n'), call. = FALSE)
  }
  if (length(listing) == 0) {
    stop('No exported files found at ', export_path, call. = FALSE)
  }

  bind_rows(
    map(listing, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    }))
}
# Uncomment to read a previous export instead of the path built for today's date.
#measurement_86273426_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_86273426/measurement_86273426_*.csv"

# Load every exported CSV shard into a single data frame.
dataset_86273426_measurement_df <- read_bq_export_from_workspace_bucket(
  measurement_86273426_path
)

# Quick sanity check: dimensions and the first five rows.
dim(dataset_86273426_measurement_df)

head(dataset_86273426_measurement_df, n = 5)

## Process leukocyte counts (thousand/uL)

In [None]:
# Leukocyte counts: the whole export is used; the concept-name filter is disabled.
labs <- dataset_86273426_measurement_df #%>%
    #filter(grepl("volume", standard_concept_name))
names(labs)
length(unique(labs[["person_id"]]))

# Frequency of each standard concept name, most common first.
arrange(as.data.frame(table(labs[["standard_concept_name"]])), desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
arrange(as.data.frame(table(labs[["unit_concept_name"]])), desc(Freq))
# Quartiles of the values whose unit concept name is "No matching concept".
quantile(
    labs[["value_as_number"]][labs[["unit_concept_name"]] == "No matching concept"],
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC pipeline for leukocyte counts (see the filtering steps listed at the top of the file).
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    # Rows with a missing visit name are kept because grepl() returns FALSE for NA.
    filter(!grepl("Emergency", visit_occurrence_concept_name) &
           !grepl("Intensive", visit_occurrence_concept_name) &
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # Keep only the accepted units; rows with a missing unit are dropped.
    # NOTE(review): "Kelvin per cubic millimeter" is presumably a mis-mapped "K/mm3" - confirm.
    filter(unit_concept_name %in% c("thousand per microliter",
                                    "No matching concept",
                                    "thousand per cubic millimeter",
                                    "Kelvin per cubic millimeter")) %>%
    # Negative counts are invalid (this also drops rows with a missing value).
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values greater than 10x that participant's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide: keep values within mean +/- 4 SD. Both bounds are evaluated in a single
    # filter() so they use the same mean and SD; filtering the lower bound first would make
    # the upper bound depend on statistics recomputed from already-trimmed data.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Descriptive statistics of the QC'd leukocyte values (each expression prints its result).
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Five-number summary (0%, 25%, 50%, 75%, 100%).
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the QC'd long-format table as a tab-separated file without quotes or row names.
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Leukocyte-counts_wout_outliers_long_052724.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


# Albumin

## Extract albumin

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Albumin" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   - Inner sub-select: keeps `measurement` rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a criteria entry for
#     concept 37048896, restricted to persons in `cb_search_person` who have at least one
#     such event in `cb_search_all_events`.
#   - Outer select: LEFT JOINs `concept` to attach the names of the standard concept,
#     measurement type, operator, value, unit and source concept, and joins
#     `visit_occurrence` to attach the visit concept name (visit_occurrence_concept_name).
# NOTE: paste() receives a single string here, so sep="" does not alter the query text.
dataset_61770793_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37048896)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37048896)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: Exports made on the same day overwrite each other by default, whereas an export
#       made on a different day is written to a new dated location, so historical copies
#       are kept as the dataset definition changes.
measurement_61770793_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component to make every export overwrite.
  "measurement_61770793",
  "measurement_61770793_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_61770793_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_61770793_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")
) %>%
  bq_table_save(measurement_61770793_path, destination_format = "CSV")



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_61770793_path}` to copy these files
#       to the Jupyter disk.
# Read a BigQuery CSV export (possibly split into several shards) from Cloud Storage
# into a single data frame.
#
# Args:
#   export_path: gs:// path or wildcard pattern that is handed to `gsutil ls`.
# Returns:
#   All listed shards, read with read_csv() and stacked with bind_rows().
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Parse the free-text columns as character in every shard so that the shards
  # can be row-bound without type conflicts.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only. With stderr = TRUE any gsutil warning or error line would end up
  # in the listing and be passed to `gsutil cat` as if it were an object path.
  stderr_file <- tempfile("gsutil_ls_stderr_")
  on.exit(unlink(stderr_file), add = TRUE)
  listing <- system2("gsutil", args = c("ls", shQuote(export_path)),
                     stdout = TRUE, stderr = stderr_file)
  exit_status <- attr(listing, "status")
  csv_files <- listing[nzchar(listing)]
  if ((!is.null(exit_status) && exit_status != 0) || length(csv_files) == 0) {
    gsutil_output <- if (file.exists(stderr_file)) readLines(stderr_file, warn = FALSE) else character(0)
    stop("Could not list any export files at ", export_path, ": ",
         paste(gsutil_output, collapse = "\n"), call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # shQuote() keeps special characters in the object name away from the shell.
      read_csv(pipe(paste("gsutil cat", shQuote(csv))),
               col_types = col_types, show_col_types = FALSE)
    }))
}
# Optional override: uncomment to read from a fixed, previously exported location.
#measurement_61770793_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_61770793/measurement_61770793_*.csv"
# Load every CSV shard of the export into one data frame.
dataset_61770793_measurement_df <- measurement_61770793_path %>%
  read_bq_export_from_workspace_bucket()

# Quick sanity checks: dimensions and the first five rows.
dim(dataset_61770793_measurement_df)

head(dataset_61770793_measurement_df, n = 5)

## Process albumin (g/dL)

In [None]:
# Albumin: start from the full extract (the concept-name filter below is disabled).
labs <- dataset_61770793_measurement_df #%>%
    #filter(grepl("volume", standard_concept_name))
colnames(labs)
# Number of distinct participants with at least one row.
n_distinct(labs$person_id)

# Frequency of each standard concept, most common first.
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first; used to pick the units excluded during QC.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Value distribution of rows whose unit is recorded as "no value".
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC pipeline for albumin (see the filtering steps listed at the top of the file).
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    # Rows with a missing visit name are kept because grepl() returns FALSE for NA.
    filter(!grepl("Emergency", visit_occurrence_concept_name) &
           !grepl("Intensive", visit_occurrence_concept_name) &
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # Exclude the listed units. Rows with a missing unit are dropped as well, exactly as
    # the chained `!=` comparisons did (NA != "x" is NA, which filter() discards).
    filter(!is.na(unit_concept_name) &
           !(unit_concept_name %in% c("milligram per liter",
                                      "microgram per milligram",
                                      "microgram per milliliter",
                                      "percent",
                                      "milligram per deciliter",
                                      "Percent",
                                      "unit per liter",
                                      "cells/uL"))) %>%
    # Convert g/L to g/dL; values in any other remaining unit are left unchanged.
    mutate(value_as_number = ifelse(unit_concept_name == "gram per liter", value_as_number / 10, value_as_number)) %>%
    # Negative concentrations are invalid (this also drops rows with a missing value).
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values greater than 10x that participant's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide: keep values within mean +/- 4 SD. Both bounds are evaluated in a single
    # filter() so they use the same mean and SD; filtering the lower bound first would make
    # the upper bound depend on statistics recomputed from already-trimmed data.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Descriptive statistics of the QC'd albumin values (each expression prints its result).
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Five-number summary (0%, 25%, 50%, 75%, 100%).
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the QC'd long-format table as a tab-separated file without quotes or row names.
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Albumin_wout_outliers_long_052724.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


# Apolipoproteins

## Extract APOL-A, APOL-B

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Apolipoproteins" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   - Inner sub-select: keeps `measurement` rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a criteria entry for
#     concept 40772580 or 40779220, restricted to persons in `cb_search_person` who have at
#     least one such event in `cb_search_all_events`.
#   - Outer select: LEFT JOINs `concept` to attach the names of the standard concept,
#     measurement type, operator, value, unit and source concept, and joins
#     `visit_occurrence` to attach the visit concept name (visit_occurrence_concept_name).
# NOTE: paste() receives a single string here, so sep="" does not alter the query text.
dataset_03352512_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (40772580, 40779220)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (40772580, 40779220)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: Exports made on the same day overwrite each other by default, whereas an export
#       made on a different day is written to a new dated location, so historical copies
#       are kept as the dataset definition changes.
measurement_03352512_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component to make every export overwrite.
  "measurement_03352512",
  "measurement_03352512_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_03352512_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_03352512_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")
) %>%
  bq_table_save(measurement_03352512_path, destination_format = "CSV")



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_03352512_path}` to copy these files
#       to the Jupyter disk.
# Read a BigQuery CSV export (possibly split into several shards) from Cloud Storage
# into a single data frame.
#
# Args:
#   export_path: gs:// path or wildcard pattern that is handed to `gsutil ls`.
# Returns:
#   All listed shards, read with read_csv() and stacked with bind_rows().
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Parse the free-text columns as character in every shard so that the shards
  # can be row-bound without type conflicts.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only. With stderr = TRUE any gsutil warning or error line would end up
  # in the listing and be passed to `gsutil cat` as if it were an object path.
  stderr_file <- tempfile("gsutil_ls_stderr_")
  on.exit(unlink(stderr_file), add = TRUE)
  listing <- system2("gsutil", args = c("ls", shQuote(export_path)),
                     stdout = TRUE, stderr = stderr_file)
  exit_status <- attr(listing, "status")
  csv_files <- listing[nzchar(listing)]
  if ((!is.null(exit_status) && exit_status != 0) || length(csv_files) == 0) {
    gsutil_output <- if (file.exists(stderr_file)) readLines(stderr_file, warn = FALSE) else character(0)
    stop("Could not list any export files at ", export_path, ": ",
         paste(gsutil_output, collapse = "\n"), call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # shQuote() keeps special characters in the object name away from the shell.
      read_csv(pipe(paste("gsutil cat", shQuote(csv))),
               col_types = col_types, show_col_types = FALSE)
    }))
}
# Optional override: uncomment to read from a fixed, previously exported location.
#measurement_03352512_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_03352512/measurement_03352512_*.csv"
# Load every CSV shard of the export into one data frame.
dataset_03352512_measurement_df <- measurement_03352512_path %>%
  read_bq_export_from_workspace_bucket()

# Quick sanity checks: dimensions and the first five rows.
dim(dataset_03352512_measurement_df)

head(dataset_03352512_measurement_df, n = 5)

## Process APOL-A1 (mg/mL)

In [None]:
# APOA1: keep only the Apolipoprotein A-I mass/volume concept from the apolipoprotein extract.
labs <- dataset_03352512_measurement_df %>%
    filter(standard_concept_name == "Apolipoprotein A-I [Mass/volume] in Serum or Plasma")
colnames(labs)
# Number of distinct participants with at least one row.
n_distinct(labs$person_id)

# Frequency of each standard concept (a single concept remains after the filter above).
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median of the values recorded as mg/mL, the unit that the QC step excludes.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
median(labs$value_as_number[labs$unit_concept_name == 'milligram per milliliter'], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC pipeline for APOA1 (see the filtering steps listed at the top of the file).
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    # Rows with a missing visit name are kept because grepl() returns FALSE for NA.
    filter(!grepl("Emergency", visit_occurrence_concept_name) &
           !grepl("Intensive", visit_occurrence_concept_name) &
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # Rows labelled "milligram per milliliter" (and rows with a missing unit) are excluded.
    filter(unit_concept_name != "milligram per milliliter") %>%
    # Negative concentrations are invalid (this also drops rows with a missing value).
    filter(value_as_number >= 0) %>%
    # Every remaining value is divided by 100 (the mg/dL -> mg/mL factor).
    # NOTE(review): this assumes all remaining units are mg/dL - confirm against the
    # unit frequency table printed during exploration.
    mutate(value_as_number = value_as_number/100) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: keep values strictly below 10x that participant's median.
    # NOTE(review): the leukocyte and albumin pipelines use `<=` here - confirm `<` is intended.
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide: keep values within mean +/- 4 SD. Both bounds are evaluated in a single
    # filter() so they use the same mean and SD; filtering the lower bound first would make
    # the upper bound depend on statistics recomputed from already-trimmed data.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Descriptive statistics of the QC'd APOA1 values (each expression prints its result).
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Save the QC'd long-format table as a tab-separated file without quotes or row names.
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-APOL-A1_wout_outliers_long_052724.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


## Process APOL-B (mg/mL)

In [None]:
# APOB: keep only the Apolipoprotein B mass/volume concept from the apolipoprotein extract.
labs <- dataset_03352512_measurement_df %>%
    filter(standard_concept_name == "Apolipoprotein B [Mass/volume] in Serum or Plasma")
colnames(labs)
# Number of distinct participants with at least one row.
n_distinct(labs$person_id)

# Frequency of each standard concept (a single concept remains after the filter above).
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Median of the values recorded as mg/mL, the unit that the QC step excludes.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
median(labs$value_as_number[labs$unit_concept_name == 'milligram per milliliter'], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier removal at the end of the pipeline.
keep_outliers <- FALSE

# QC pipeline for APOB (see the filtering steps listed at the top of the file).
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive-care or urgent-care visits.
    # Rows with a missing visit name are kept because grepl() returns FALSE for NA.
    filter(!grepl("Emergency", visit_occurrence_concept_name) &
           !grepl("Intensive", visit_occurrence_concept_name) &
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # Rows labelled "milligram per milliliter" (and rows with a missing unit) are excluded.
    filter(unit_concept_name != "milligram per milliliter") %>%
    # Negative concentrations are invalid (this also drops rows with a missing value).
    filter(value_as_number >= 0) %>%
    # Every remaining value is divided by 100 (the mg/dL -> mg/mL factor).
    # NOTE(review): this assumes all remaining units are mg/dL - confirm against the
    # unit frequency table printed during exploration.
    mutate(value_as_number = value_as_number/100) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: keep values strictly below 10x that participant's median.
    # NOTE(review): the leukocyte and albumin pipelines use `<=` here - confirm `<` is intended.
    group_by(person_id) %>%
    filter(value_as_number < (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide: keep values within mean +/- 4 SD. Both bounds are evaluated in a single
    # filter() so they use the same mean and SD; filtering the lower bound first would make
    # the upper bound depend on statistics recomputed from already-trimmed data.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Descriptive statistics of the QC'd APOB values (each expression prints its result).
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()


In [None]:
# Save the QC'd long-format table as a tab-separated file without quotes or row names.
labs_formatted %>%
    write.table(
        file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-APOL-B_wout_outliers_long_052724.txt"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE)


# CRP

## Extract CRP

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with CRP" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL below does:
#   - Inner sub-select: keeps `measurement` rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a criteria entry for
#     concept 37025552, restricted to persons in `cb_search_person` who have at least one
#     such event in `cb_search_all_events`.
#   - Outer select: LEFT JOINs `concept` to attach the names of the standard concept,
#     measurement type, operator, value, unit and source concept, and joins
#     `visit_occurrence` to attach the visit concept name (visit_occurrence_concept_name).
# NOTE: paste() receives a single string here, so sep="" does not alter the query text.
dataset_74918429_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37025552)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37025552)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: Exports made on the same day overwrite each other by default, whereas an export
#       made on a different day is written to a new dated location, so historical copies
#       are kept as the dataset definition changes.
measurement_74918429_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component to make every export overwrite.
  "measurement_74918429",
  "measurement_74918429_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_74918429_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be read
#       straight from the CSVs in Cloud Storage.
bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_74918429_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")
) %>%
  bq_table_save(measurement_74918429_path, destination_format = "CSV")



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_74918429_path}` to copy these files
#       to the Jupyter disk.
# Read a BigQuery CSV export (possibly split into several shards) from Cloud Storage
# into a single data frame.
#
# Args:
#   export_path: gs:// path or wildcard pattern that is handed to `gsutil ls`.
# Returns:
#   All listed shards, read with read_csv() and stacked with bind_rows().
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Parse the free-text columns as character in every shard so that the shards
  # can be row-bound without type conflicts.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only. With stderr = TRUE any gsutil warning or error line would end up
  # in the listing and be passed to `gsutil cat` as if it were an object path.
  stderr_file <- tempfile("gsutil_ls_stderr_")
  on.exit(unlink(stderr_file), add = TRUE)
  listing <- system2("gsutil", args = c("ls", shQuote(export_path)),
                     stdout = TRUE, stderr = stderr_file)
  exit_status <- attr(listing, "status")
  csv_files <- listing[nzchar(listing)]
  if ((!is.null(exit_status) && exit_status != 0) || length(csv_files) == 0) {
    gsutil_output <- if (file.exists(stderr_file)) readLines(stderr_file, warn = FALSE) else character(0)
    stop("Could not list any export files at ", export_path, ": ",
         paste(gsutil_output, collapse = "\n"), call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # shQuote() keeps special characters in the object name away from the shell.
      read_csv(pipe(paste("gsutil cat", shQuote(csv))),
               col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_74918429_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_74918429/measurement_74918429_*.csv"
dataset_74918429_measurement_df <- read_bq_export_from_workspace_bucket(measurement_74918429_path)

dim(dataset_74918429_measurement_df)

head(dataset_74918429_measurement_df, 5)

## Process CRP (mg/L)

In [None]:
# CRP: work on the whole export (the concept-name filter below is disabled).
labs <- dataset_74918429_measurement_df #%>%
    #filter(grepl("volume", standard_concept_name))
colnames(labs)
n_distinct(labs$person_id)

# Row counts per measurement concept, most frequent first.
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit, then the value distribution within selected units, used
# to decide which units the QC step keeps, converts or drops.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
quantile(labs$value_as_number[labs$unit_concept_name == "milligram per liter"], na.rm = TRUE)
quantile(labs$value_as_number[labs$unit_concept_name == "No matching concept"], na.rm = TRUE)
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final population-wide +/- 4 SD trimming.
keep_outliers <- FALSE

# QC for CRP, with values harmonised to mg/L. Steps, in order:
#   1. drop draws whose visit type mentions Emergency, Intensive or Urgent
#      (rows with a missing visit type are kept: grepl() is FALSE on NA);
#   2. drop rows with a missing unit or a unit that is excluded below;
#   3. convert mg/dL values to mg/L (x10);
#   4. drop negative or missing values, "NULL" ids and exact duplicates;
#   5. per person, drop values above 10x that person's median;
#   6. unless keep_outliers is TRUE, keep values within mean +/- 4 SD.
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # The NA check makes explicit what the former `!=` comparisons did
    # implicitly: a row with a missing unit is dropped.
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("Unit of presentation",
                                     "titer",
                                     "standardized quality unit",
                                     "nanogram per milliliter",
                                     "percent")) %>%
    mutate(value_as_number = ifelse(unit_concept_name %in% c("milligram per deciliter", "mg/dL"),
                                    value_as_number * 10,
                                    value_as_number)) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in ONE filter() so they use the same mean and
    # SD. Two chained filters would compute the upper bound on data already
    # trimmed by the lower bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd CRP values: range, centre, spread, quartiles.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the QC'd long-format CRP table as a tab-separated text file, without
# row names or quoting. `data_path` is not defined in this cell (presumably it
# comes from config.r; verify).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-CRP_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Calcium/Phosphate

## Extract calcium/phosphate

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Ca-Phos" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * The inner SELECT keeps `measurement` rows whose measurement_concept_id is
#     a standard, selectable `cb_criteria` concept whose `path` contains the id
#     of a rank-1 criteria row for concept IDs 37035164 / 37062662 (presumably
#     the calcium and phosphate concepts, per the dataset name; verify).
#   * It further restricts to persons in `cb_search_person` who have a standard
#     event for one of those concepts in `cb_search_all_events`.
#   * The outer LEFT JOINs attach human-readable names from `concept` for the
#     measurement, type, operator, value, unit and source concepts, and the
#     visit type via `visit_occurrence`.
# Table names are unqualified, so the dataset they resolve against is chosen by
# whoever runs the query.
# NOTE: paste() receives a single string here, so sep="" has no effect.
dataset_67813221_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37035164, 37062662)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37062662, 37035164)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Cloud Storage destination for the BigQuery export, laid out as
#   <workspace bucket>/bq_exports/<owner email>/<YYYYMMDD>/measurement_67813221/
# Re-exporting on the same day overwrites that day's copy; exporting on another
# day writes to a new dated folder, so older copies are kept as the dataset
# definition changes.
measurement_67813221_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "measurement_67813221",
  "measurement_67813221_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_67813221_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the dataset query in BigQuery and export the result to Cloud Storage as
# CSV files. This only needs to run once; afterwards the CSVs can be read
# straight from the bucket.
bq_table_save(
  x = bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    query = dataset_67813221_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uri = measurement_67813221_path,
  destination_format = "CSV"
)



In [None]:
# Rebuild the Cloud Storage path of the export so this cell can run without the
# export cell. NOTE: the path embeds today's date, so it only matches an export
# made today; to read an older export, set the path explicitly (see the
# commented-out example below the function).
measurement_67813221_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if the export was written without a date folder.
  "measurement_67813221",
  "measurement_67813221_*.csv")
# This cell only reads, so the message says "read" rather than "written".
message(str_glue('The data will be read from {measurement_67813221_path}.'))

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_67813221_path}` to copy these files
#       to the Jupyter disk.

# Read every CSV file matching `export_path` and row-bind them into one table.
#
# Args:
#   export_path: Cloud Storage path or wildcard pattern handed to `gsutil ls`.
# Returns:
#   The rows of all listed CSV files combined with bind_rows(). The concept
#   name/code/vocabulary and *_source_value columns are forced to character so
#   every file parses with the same column types.
# Stops with an error if `gsutil ls` fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr = TRUE any gsutil warning or error line
  # would be mixed into the listing and later treated as a CSV path.
  # shQuote() keeps the shell from touching the wildcard; gsutil expands it.
  csv_files <- system2("gsutil", args = c("ls", shQuote(export_path)),
                       stdout = TRUE, stderr = "")
  # system2() sets a "status" attribute only when the command exits non-zero.
  if (!is.null(attr(csv_files, "status")) || length(csv_files) == 0) {
    stop("Could not list any exported CSV files at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
#measurement_67813221_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_67813221/measurement_67813221_*.csv"
dataset_67813221_measurement_df <- read_bq_export_from_workspace_bucket(measurement_67813221_path)

# Quick sanity checks on the loaded export: dimensions and first rows.
dim(dataset_67813221_measurement_df)

head(dataset_67813221_measurement_df, 5)

## Process calcium (mg/dL)

In [None]:
# Calcium: subset the Ca/Phos export to concepts whose name contains "Calcium".
labs <- dataset_67813221_measurement_df %>%
    filter(grepl("Calcium", standard_concept_name))
colnames(labs)
n_distinct(labs$person_id)

# Row counts per measurement concept, most frequent first.
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit, then the value distribution of rows recorded with the
# "no value" unit, used to decide how the QC step treats each unit.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final population-wide +/- 4 SD trimming.
keep_outliers <- FALSE

# QC for calcium, with values harmonised to mg/dL. Steps, in order:
#   1. drop draws whose visit type mentions Emergency, Intensive or Urgent
#      (rows with a missing visit type are kept: grepl() is FALSE on NA);
#   2. drop rows with a missing unit or a unit that is excluded below;
#   3. convert mmol/L values to mg/dL (divide by 0.2495);
#   4. drop negative or missing values, "NULL" ids and exact duplicates;
#   5. per person, drop values above 10x that person's median;
#   6. unless keep_outliers is TRUE, keep values within mean +/- 4 SD.
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # The NA check makes explicit what the former `!=` comparisons did
    # implicitly: a row with a missing unit is dropped.
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("milligram per milliliter",
                                     "milligram per deciliter calculated",
                                     "gram per deciliter",
                                     "milligram per 24 hours")) %>%
    mutate(value_as_number = ifelse(unit_concept_name == "millimole per liter",
                                    value_as_number * (1/0.2495),
                                    value_as_number)) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in ONE filter() so they use the same mean and
    # SD. Two chained filters would compute the upper bound on data already
    # trimmed by the lower bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd calcium values: range, centre, spread,
# quartiles.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the QC'd long-format calcium table as a tab-separated text file, without
# row names or quoting. `data_path` is not defined in this cell (presumably it
# comes from config.r; verify).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Calcium_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process phosphate (mg/dL)

In [None]:
# Phosphate: subset the Ca/Phos export to concepts whose name contains
# "Phosphate".
labs <- dataset_67813221_measurement_df %>%
    filter(grepl("Phosphate", standard_concept_name))
colnames(labs)
n_distinct(labs$person_id)

# Row counts per measurement concept, most frequent first.
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit, then the value distribution of rows recorded with the
# "no value" unit, used to decide how the QC step treats each unit.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final population-wide +/- 4 SD trimming.
keep_outliers <- FALSE

# QC for phosphate. Steps, in order:
#   1. drop draws whose visit type mentions Emergency, Intensive or Urgent
#      (rows with a missing visit type are kept: grepl() is FALSE on NA);
#   2. drop rows with a missing unit or the "milligram per gram" unit
#      (no unit conversion is applied to the remaining rows);
#   3. drop negative or missing values, "NULL" ids and exact duplicates;
#   4. per person, drop values above 10x that person's median;
#   5. unless keep_outliers is TRUE, keep values within mean +/- 4 SD.
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # The NA check makes explicit what the former `!=` comparison did
    # implicitly: a row with a missing unit is dropped.
    filter(!is.na(unit_concept_name),
           unit_concept_name != "milligram per gram") %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in ONE filter() so they use the same mean and
    # SD. Two chained filters would compute the upper bound on data already
    # trimmed by the lower bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd phosphate values: range, centre, spread,
# quartiles.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the QC'd long-format phosphate table as a tab-separated text file,
# without row names or quoting. `data_path` is not defined in this cell
# (presumably it comes from config.r; verify).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Phosphate_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Cystatin C

## Extract cystatin c

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Cystatin C" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * The inner SELECT keeps `measurement` rows whose measurement_concept_id is
#     a standard, selectable `cb_criteria` concept whose `path` contains the id
#     of a rank-1 criteria row for concept ID 37043215 (presumably the
#     cystatin C concept, per the dataset name; verify).
#   * It further restricts to persons in `cb_search_person` who have a standard
#     event for one of those concepts in `cb_search_all_events`.
#   * The outer LEFT JOINs attach human-readable names from `concept` for the
#     measurement, type, operator, value, unit and source concepts, and the
#     visit type via `visit_occurrence`.
# Table names are unqualified, so the dataset they resolve against is chosen by
# whoever runs the query.
# NOTE: paste() receives a single string here, so sep="" has no effect.
dataset_37766621_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37043215)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37043215)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Cloud Storage destination for the BigQuery export, laid out as
#   <workspace bucket>/bq_exports/<owner email>/<YYYYMMDD>/measurement_37766621/
# Re-exporting on the same day overwrites that day's copy; exporting on another
# day writes to a new dated folder, so older copies are kept as the dataset
# definition changes.
measurement_37766621_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "measurement_37766621",
  "measurement_37766621_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_37766621_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the dataset query in BigQuery and export the result to Cloud Storage as
# CSV files. This only needs to run once; afterwards the CSVs can be read
# straight from the bucket.
bq_table_save(
  x = bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    query = dataset_37766621_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uri = measurement_37766621_path,
  destination_format = "CSV"
)



In [None]:
# Rebuild the Cloud Storage path of the export so this cell can run without the
# export cell. NOTE: the path embeds today's date, so it only matches an export
# made today; to read an older export, set the path explicitly (see the
# commented-out example below the function).
measurement_37766621_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if the export was written without a date folder.
  "measurement_37766621",
  "measurement_37766621_*.csv")
# This cell only reads, so the message says "read" rather than "written".
message(str_glue('The data will be read from {measurement_37766621_path}.'))



# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_37766621_path}` to copy these files
#       to the Jupyter disk.

# Read every CSV file matching `export_path` and row-bind them into one table.
#
# Args:
#   export_path: Cloud Storage path or wildcard pattern handed to `gsutil ls`.
# Returns:
#   The rows of all listed CSV files combined with bind_rows(). The concept
#   name/code/vocabulary and *_source_value columns are forced to character so
#   every file parses with the same column types.
# Stops with an error if `gsutil ls` fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: with stderr = TRUE any gsutil warning or error line
  # would be mixed into the listing and later treated as a CSV path.
  # shQuote() keeps the shell from touching the wildcard; gsutil expands it.
  csv_files <- system2("gsutil", args = c("ls", shQuote(export_path)),
                       stdout = TRUE, stderr = "")
  # system2() sets a "status" attribute only when the command exits non-zero.
  if (!is.null(attr(csv_files, "status")) || length(csv_files) == 0) {
    stop("Could not list any exported CSV files at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
#measurement_37766621_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_37766621/measurement_37766621_*.csv"
dataset_37766621_measurement_df <- read_bq_export_from_workspace_bucket(measurement_37766621_path)

# Quick sanity checks on the loaded export: dimensions and first rows.
dim(dataset_37766621_measurement_df)

head(dataset_37766621_measurement_df, 5)

## Process cystatin c (mg/L)

In [None]:
# Cystatin C: work on the whole export (the concept-name filter below is
# disabled).
labs <- dataset_37766621_measurement_df #%>%
    #filter(grepl("Phosphate", standard_concept_name))
colnames(labs)
n_distinct(labs$person_id)

# Row counts per measurement concept, most frequent first.
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row counts per unit, then the value distribution of rows recorded with the
# "no value" unit, used to decide how the QC step treats each unit.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the final population-wide +/- 4 SD trimming.
keep_outliers <- FALSE

# QC for cystatin C. Steps, in order:
#   1. drop draws whose visit type mentions Emergency, Intensive or Urgent
#      (rows with a missing visit type are kept: grepl() is FALSE on NA);
#   2. drop negative or missing values, "NULL" ids and exact duplicates
#      (no unit filtering or conversion is applied for this lab);
#   3. per person, drop values above 10x that person's median;
#   4. unless keep_outliers is TRUE, keep values within mean +/- 4 SD.
labs_formatted <- labs %>%
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Both bounds are evaluated in ONE filter() so they use the same mean and
    # SD. Two chained filters would compute the upper bound on data already
    # trimmed by the lower bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the QC'd cystatin C values: range, centre, spread,
# quartiles.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the QC'd long-format cystatin C table as a tab-separated text file,
# without row names or quoting. `data_path` is not defined in this cell
# (presumably it comes from config.r; verify).
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Cystatin-C_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Bilirubins

## Extract bilirubin total/direct/indirect

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Bilirubins" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * The inner SELECT keeps `measurement` rows whose measurement_concept_id is
#     a standard, selectable `cb_criteria` concept whose `path` contains the id
#     of a rank-1 criteria row for concept IDs 3007359 / 3024128 / 3027597
#     (presumably the total/direct/indirect bilirubin concepts, per the section
#     heading; verify).
#   * It further restricts to persons in `cb_search_person` who have a standard
#     event in `cb_search_all_events` for one of those three concept IDs
#     directly (no criteria-path expansion in this part).
#   * The outer LEFT JOINs attach human-readable names from `concept` for the
#     measurement, type, operator, value, unit and source concepts, and the
#     visit type via `visit_occurrence`.
# Table names are unqualified, so the dataset they resolve against is chosen by
# whoever runs the query.
# NOTE: paste() receives a single string here, so sep="" has no effect.
dataset_87077757_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (3007359, 3024128, 3027597)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN (3027597, 3007359, 3024128) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Cloud Storage destination for the BigQuery export, laid out as
#   <workspace bucket>/bq_exports/<owner email>/<YYYYMMDD>/measurement_87077757/
# Re-exporting on the same day overwrites that day's copy; exporting on another
# day writes to a new dated folder, so older copies are kept as the dataset
# definition changes.
measurement_87077757_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component to always overwrite.
  "measurement_87077757",
  "measurement_87077757_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_87077757_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the dataset query in BigQuery and export the result to Cloud Storage as
# CSV files. This only needs to run once; afterwards the CSVs can be read
# straight from the bucket.
bq_table_save(
  x = bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    query = dataset_87077757_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uri = measurement_87077757_path,
  destination_format = "CSV"
)



In [None]:
measurement_87077757_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_87077757",
  "measurement_87077757_*.csv")
message(str_glue('The data will be written to {measurement_87077757_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_87077757_path}` to copy these files
#       to the Jupyter disk.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(standard_concept_name = col_character(), standard_concept_code = col_character(), standard_vocabulary = col_character(), measurement_type_concept_name = col_character(), operator_concept_name = col_character(), value_as_concept_name = col_character(), unit_concept_name = col_character(), visit_occurrence_concept_name = col_character(), measurement_source_value = col_character(), source_concept_name = col_character(), source_concept_code = col_character(), source_vocabulary = col_character(), unit_source_value = col_character(), value_source_value = col_character())
  bind_rows(
    map(system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE),
        function(csv) {
          message(str_glue('Loading {csv}.'))
          chunk <- read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
          if (is.null(col_types)) {
            col_types <- spec(chunk)
          }
          chunk
        }))
}
# To read a previous export instead of today's, uncomment and edit this path:
#measurement_87077757_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_87077757/measurement_87077757_*.csv"

# Load all exported CSV chunks into one data frame.
dataset_87077757_measurement_df <-
  read_bq_export_from_workspace_bucket(export_path = measurement_87077757_path)

# Quick sanity check: dimensions and first rows.
dim(dataset_87077757_measurement_df)

head(dataset_87077757_measurement_df, n = 5)

## Process bilirubin total (mg/dL)

In [None]:
# Bilirubin Total: subset the extract to the total-bilirubin concept and
# inspect which concept names and units are present.
labs <- filter(
  dataset_87077757_measurement_df,
  standard_concept_name == "Bilirubin.total [Mass/volume] in Serum or Plasma"
)
names(labs)
n_distinct(labs$person_id)

# Row counts per concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Row counts per unit, most frequent first.
#arrange(as.data.frame(table(labs$visit_occurrence_concept_name)), desc(Freq))
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Median value among the rows reported in microgram per deciliter.
labs %>%
  filter(unit_concept_name == "microgram per deciliter") %>%
  pull(value_as_number) %>%
  median(na.rm = TRUE)

# Optional ad-hoc checks (disabled):
#max(labs$value_as_number, na.rm = TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Exclude measurements from emergency, intensive-care and urgent-care visits.
  # Rows with a missing visit name are kept (grepl() returns FALSE for NA).
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  # Exclude the listed units; rows with a missing unit are dropped as well.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c(
      "microgram per deciliter",
      "gram per deciliter",
      "unit per liter",
      "mg/mL"
    )
  ) %>%
  # Negative (and missing) values are removed.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing before dropping incomplete rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per person: drop values above 10x that person's median.
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Population-level outliers: both bounds use the same mean and SD. Applying
  # the lower and upper bound in two separate filter() calls would recompute
  # mean/SD for the upper bound after the first pass had already removed rows.
  filter(
    keep_outliers |
      between(
        value_as_number,
        mean(value_as_number) - (4 * sd(value_as_number)),
        mean(value_as_number) + (4 * sd(value_as_number))
      )
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the QC'd values: min, median, mean, max, SD, then quantiles.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

quantile(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd table as a tab-separated text file (no row names, no quoting).
# `data_path` is defined outside this cell.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Bilirubin-Total_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


## Process bilirubin direct (mg/dL)

In [None]:
# Bilirubin Direct: subset the extract to the direct-bilirubin concept and
# inspect which concept names and units are present.
labs <- filter(
  dataset_87077757_measurement_df,
  standard_concept_name == "Bilirubin.direct [Mass/volume] in Serum or Plasma"
)
names(labs)
n_distinct(labs$person_id)

# Row counts per concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Row counts per unit, most frequent first.
#arrange(as.data.frame(table(labs$visit_occurrence_concept_name)), desc(Freq))
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Median value among the rows reported in microgram per deciliter.
labs %>%
  filter(unit_concept_name == "microgram per deciliter") %>%
  pull(value_as_number) %>%
  median(na.rm = TRUE)

# Optional ad-hoc checks (disabled):
#max(labs$value_as_number, na.rm = TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Exclude measurements from emergency, intensive-care and urgent-care visits.
  # Rows with a missing visit name are kept (grepl() returns FALSE for NA).
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  # Exclude the listed unit; rows with a missing unit are dropped as well.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c("microgram per deciliter")
  ) %>%
  # Negative (and missing) values are removed.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing before dropping incomplete rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per person: drop values above 10x that person's median.
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Population-level outliers: both bounds use the same mean and SD. Applying
  # the lower and upper bound in two separate filter() calls would recompute
  # mean/SD for the upper bound after the first pass had already removed rows.
  filter(
    keep_outliers |
      between(
        value_as_number,
        mean(value_as_number) - (4 * sd(value_as_number)),
        mean(value_as_number) + (4 * sd(value_as_number))
      )
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the QC'd values: min, median, mean, max, SD, then quantiles.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

quantile(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd table as a tab-separated text file (no row names, no quoting).
# `data_path` is defined outside this cell.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Bilirubin-Direct_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


## Process bilirubin indirect (mg/dL)

In [None]:
# Bilirubin Indirect: subset the extract to the indirect-bilirubin concept and
# inspect which concept names and units are present.
labs <- filter(
  dataset_87077757_measurement_df,
  standard_concept_name == "Bilirubin.indirect [Mass/volume] in Serum or Plasma"
)
names(labs)
n_distinct(labs$person_id)

# Row counts per concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Row counts per unit, most frequent first.
#arrange(as.data.frame(table(labs$visit_occurrence_concept_name)), desc(Freq))
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Median value among the rows reported in microgram per deciliter.
labs %>%
  filter(unit_concept_name == "microgram per deciliter") %>%
  pull(value_as_number) %>%
  median(na.rm = TRUE)

# Optional ad-hoc checks (disabled):
#max(labs$value_as_number, na.rm = TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Exclude measurements from emergency, intensive-care and urgent-care visits.
  # Rows with a missing visit name are kept (grepl() returns FALSE for NA).
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  # Exclude the listed unit; rows with a missing unit are dropped as well.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c("nanogram per milliliter")
  ) %>%
  # Negative (and missing) values are removed.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing before dropping incomplete rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per person: drop values above 10x that person's median.
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Population-level outliers: both bounds use the same mean and SD. Applying
  # the lower and upper bound in two separate filter() calls would recompute
  # mean/SD for the upper bound after the first pass had already removed rows.
  filter(
    keep_outliers |
      between(
        value_as_number,
        mean(value_as_number) - (4 * sd(value_as_number)),
        mean(value_as_number) + (4 * sd(value_as_number))
      )
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the QC'd values: min, median, mean, max, SD, then quantiles.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

quantile(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd table as a tab-separated text file (no row names, no quoting).
# `data_path` is defined outside this cell.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Bilirubin-Indirect_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# IGF-1

## Extract IGF-1

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with IGF-1" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
dataset_10238626_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37027959)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37027959)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: Because the path contains today's date, repeated exports on the same
#       day overwrite each other, while exports on different days go to
#       separate locations, so historical copies are kept as the dataset
#       definition changes.
measurement_10238626_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component if the export should always overwrite.
  "measurement_10238626",
  "measurement_10238626_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_10238626_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be
#       read straight from the CSVs in Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_10238626_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_10238626_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_10238626_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()`, and row-binds the chunks.
#
# Args:
#   export_path: gs:// path (wildcards allowed) of the exported CSV files.
#
# Returns:
#   A single tibble containing the rows of all listed CSV files.
#
# Stops with an error if `gsutil ls` exits with a non-zero status.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Parse the free-text columns as character in every chunk so that the
  # chunks can be row-bound without type conflicts on these columns.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: if stderr were captured too, any gsutil warning or
  # error line would be mistaken for a CSV path in the loop below.
  csv_paths <- system2(
    "gsutil",
    args = c("ls", shQuote(export_path)),
    stdout = TRUE
  )
  exit_status <- attr(csv_paths, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop(
      "`gsutil ls ", export_path, "` failed with exit status ", exit_status,
      call. = FALSE
    )
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path so the shell passes it to gsutil unchanged.
      read_csv(
        pipe(str_glue("gsutil cat {shQuote(csv)}")),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
# To read a previous export instead of today's, uncomment and edit this path:
#measurement_10238626_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_10238626/measurement_10238626_*.csv"

# Load all exported CSV chunks into one data frame.
dataset_10238626_measurement_df <-
  read_bq_export_from_workspace_bucket(export_path = measurement_10238626_path)

# Quick sanity check: dimensions and first rows.
dim(dataset_10238626_measurement_df)

head(dataset_10238626_measurement_df, n = 5)

## Process IGF-1 (ng/mL)

In [None]:
# IGF-1: use every row of the extract (no standard_concept_name filter is
# applied) and inspect which concept names and units are present.
labs <- dataset_10238626_measurement_df
names(labs)
n_distinct(labs$person_id)

# Row counts per concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Row counts per unit, most frequent first.
#arrange(as.data.frame(table(labs$visit_occurrence_concept_name)), desc(Freq))
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quantiles of the values among rows whose unit is recorded as 'no value'.
labs %>%
  filter(unit_concept_name == "no value") %>%
  pull(value_as_number) %>%
  quantile(na.rm = TRUE)

# Optional ad-hoc checks (disabled):
#max(labs$value_as_number, na.rm = TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Exclude measurements from emergency, intensive-care and urgent-care visits.
  # Rows with a missing visit name are kept (grepl() returns FALSE for NA).
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  # Exclude rows whose unit is "No matching concept"; rows with a missing unit
  # are dropped as well.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c("No matching concept")
  ) %>%
  # Negative (and missing) values are removed.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing before dropping incomplete rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per person: drop values above 10x that person's median.
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Population-level outliers: both bounds use the same mean and SD. Applying
  # the lower and upper bound in two separate filter() calls would recompute
  # mean/SD for the upper bound after the first pass had already removed rows.
  filter(
    keep_outliers |
      between(
        value_as_number,
        mean(value_as_number) - (4 * sd(value_as_number)),
        mean(value_as_number) + (4 * sd(value_as_number))
      )
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the QC'd values: min, median, mean, max, SD, then quantiles.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

quantile(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd table as a tab-separated text file (no row names, no quoting).
# `data_path` is defined outside this cell.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-IGF-1_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Lipoprotein A

## Extract Lipoprotein A

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Lipoprotein A" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
dataset_20410432_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37060496)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37060496)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: Because the path contains today's date, repeated exports on the same
#       day overwrite each other, while exports on different days go to
#       separate locations, so historical copies are kept as the dataset
#       definition changes.
measurement_20410432_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component if the export should always overwrite.
  "measurement_20410432",
  "measurement_20410432_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_20410432_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be
#       read straight from the CSVs in Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_20410432_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_20410432_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_20410432_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()`, and row-binds the chunks.
#
# Args:
#   export_path: gs:// path (wildcards allowed) of the exported CSV files.
#
# Returns:
#   A single tibble containing the rows of all listed CSV files.
#
# Stops with an error if `gsutil ls` exits with a non-zero status.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Parse the free-text columns as character in every chunk so that the
  # chunks can be row-bound without type conflicts on these columns.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: if stderr were captured too, any gsutil warning or
  # error line would be mistaken for a CSV path in the loop below.
  csv_paths <- system2(
    "gsutil",
    args = c("ls", shQuote(export_path)),
    stdout = TRUE
  )
  exit_status <- attr(csv_paths, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop(
      "`gsutil ls ", export_path, "` failed with exit status ", exit_status,
      call. = FALSE
    )
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path so the shell passes it to gsutil unchanged.
      read_csv(
        pipe(str_glue("gsutil cat {shQuote(csv)}")),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
# To read a previous export instead of today's, uncomment and edit this path:
#measurement_20410432_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_20410432/measurement_20410432_*.csv"

# Load all exported CSV chunks into one data frame.
dataset_20410432_measurement_df <-
  read_bq_export_from_workspace_bucket(export_path = measurement_20410432_path)

# Quick sanity check: dimensions and first rows.
dim(dataset_20410432_measurement_df)

head(dataset_20410432_measurement_df, n = 5)

## Process Lipoprotein A (mg/dL)

In [None]:
# Lipoprotein A: use every row of the extract (no standard_concept_name filter
# is applied) and inspect which concept names and units are present.
labs <- dataset_20410432_measurement_df
names(labs)
n_distinct(labs$person_id)

# Row counts per concept name, most frequent first.
arrange(as.data.frame(table(labs$standard_concept_name)), desc(Freq))

# Row counts per unit, most frequent first.
#arrange(as.data.frame(table(labs$visit_occurrence_concept_name)), desc(Freq))
arrange(as.data.frame(table(labs$unit_concept_name)), desc(Freq))

# Quantiles of the values among rows whose unit is recorded as 'no value'.
labs %>%
  filter(unit_concept_name == "no value") %>%
  pull(value_as_number) %>%
  quantile(na.rm = TRUE)

# Optional ad-hoc checks (disabled):
#max(labs$value_as_number, na.rm = TRUE)
#new_labs <- labs %>%
#    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name))
#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "milligram per deciliter")
#length(unique(new_labs$person_id))



In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
  # Exclude measurements from emergency, intensive-care and urgent-care visits.
  # Rows with a missing visit name are kept (grepl() returns FALSE for NA).
  filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
  # Exclude the listed molar units; rows with a missing unit are dropped as well.
  filter(
    !is.na(unit_concept_name),
    !unit_concept_name %in% c("nanomole per liter", "nmol/L")
  ) %>%
  # Negative (and missing) values are removed.
  filter(value_as_number >= 0) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(person_id = as.character(person_id)) %>%
  # Treat the literal string "NULL" as missing before dropping incomplete rows.
  mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
  na.omit() %>%
  distinct() %>%
  # Per person: drop values above 10x that person's median.
  group_by(person_id) %>%
  filter(value_as_number <= (10 * median(value_as_number))) %>%
  ungroup() %>%
  # Population-level outliers: both bounds use the same mean and SD. Applying
  # the lower and upper bound in two separate filter() calls would recompute
  # mean/SD for the upper bound after the first pass had already removed rows.
  filter(
    keep_outliers |
      between(
        value_as_number,
        mean(value_as_number) - (4 * sd(value_as_number)),
        mean(value_as_number) + (4 * sd(value_as_number))
      )
  )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary of the QC'd values: min, median, mean, max, SD, then quantiles.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

quantile(labs_formatted[["value_as_number"]])


In [None]:
# Save the QC'd table as a tab-separated text file (no row names, no quoting).
# `data_path` is defined outside this cell.
write.table(
  x = labs_formatted,
  file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Lipoprotein-A_wout_outliers_long_052724.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Testosterone

## Extract testosterone

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Testosterone" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
dataset_29984346_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37049505)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37049505)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: Because the path contains today's date, repeated exports on the same
#       day overwrite each other, while exports on different days go to
#       separate locations, so historical copies are kept as the dataset
#       definition changes.
measurement_29984346_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Remove this component if the export should always overwrite.
  "measurement_29984346",
  "measurement_29984346_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_29984346_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only needs to run once; afterwards the data can be
#       read straight from the CSVs in Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_29984346_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_29984346_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_29984346_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()`, and row-binds the chunks.
#
# Args:
#   export_path: gs:// path (wildcards allowed) of the exported CSV files.
#
# Returns:
#   A single tibble containing the rows of all listed CSV files.
#
# Stops with an error if `gsutil ls` exits with a non-zero status.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Parse the free-text columns as character in every chunk so that the
  # chunks can be row-bound without type conflicts on these columns.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Capture stdout only: if stderr were captured too, any gsutil warning or
  # error line would be mistaken for a CSV path in the loop below.
  csv_paths <- system2(
    "gsutil",
    args = c("ls", shQuote(export_path)),
    stdout = TRUE
  )
  exit_status <- attr(csv_paths, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop(
      "`gsutil ls ", export_path, "` failed with exit status ", exit_status,
      call. = FALSE
    )
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path so the shell passes it to gsutil unchanged.
      read_csv(
        pipe(str_glue("gsutil cat {shQuote(csv)}")),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}
#measurement_29984346_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_29984346/measurement_29984346_*.csv"
# Load every exported CSV file into a single data frame.
dataset_29984346_measurement_df <- measurement_29984346_path %>%
  read_bq_export_from_workspace_bucket()

# Row and column counts.
dataset_29984346_measurement_df %>% dim()

# First five rows.
dataset_29984346_measurement_df %>% head(n = 5)

## Process testosterone (ng/dL)

In [None]:
# Testosterone
labs <- dataset_29984346_measurement_df #%>%
    #filter(grepl("Phosphate", standard_concept_name))

# Available columns and number of distinct participants.
colnames(labs)
length(unique(labs$person_id))

# Row count per measurement concept, most frequent first (table() omits NA).
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row count per unit, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Quantiles of the values reported in pg/mL. Rows with a missing unit index to
# NA and are discarded by na.rm.
quantile(labs$value_as_number[labs$unit_concept_name == "picogram per milliliter"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop emergency, intensive-care and urgent-care visits. grepl() returns
    # FALSE for NA, so measurements with no visit concept name are kept.
    # NOTE(review): the file header says NULL visits are removed - confirm.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Drop units that are not converted below. Rows with a missing unit are
    # dropped as well, exactly as the previous chain of `!=` comparisons did.
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("nanomole per liter",
                                     "percent",
                                     "microgram per deciliter",
                                     "picogram per milliliter")) %>%
    # Negative values are invalid; this comparison also removes NA values.
    filter(value_as_number >= 0) %>%
    # ng/mL -> ng/dL. Every other remaining unit is taken to be ng/dL already.
    mutate(value_as_number = ifelse(unit_concept_name == "nanogram per milliliter", value_as_number * 100, value_as_number)) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows
    # and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values above 10x that participant's own median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide 4-SD window. Both bounds are evaluated in one filter() so
    # they share the same mean and SD; filtering the lower bound first would
    # shift the statistics used for the upper bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Minimum, quartiles and maximum.
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the filtered long-format table as a tab-separated file without quoting
# or row names.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Testosterone_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Total Protein

## Extract total protein

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Total Protein" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * Selects measurement rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a
#     `cb_criteria` row for concept 4152983.
#   * Restricts to persons in `cb_search_person` who have a standard
#     `cb_search_all_events` row with concept_id 4152983.
#   * LEFT JOINs `concept` (and `visit_occurrence`) to add readable names for
#     the measurement, type, operator, value, unit, visit and source concepts.
# NOTE(review): 4152983 is presumably the total protein concept - confirm.
# paste() receives a single string here, so it returns the SQL text unchanged.
dataset_62703067_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (4152983)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN (4152983) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so exports made on the same day
#       overwrite each other, while an export made on a different day is
#       written to a new location and older copies are kept as the dataset
#       definition changes.
measurement_62703067_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_62703067",
  "measurement_62703067_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_62703067_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once. Afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_62703067_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_62703067_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_62703067_path}` to copy these files
#       to the Jupyter disk.
#
# Args:
#   export_path: Cloud Storage path or wildcard pattern passed to `gsutil ls`.
# Returns:
#   One data frame: the `bind_rows()` of every listed CSV file. The columns
#   named in `col_types` are read as character; the rest are guessed by
#   `read_csv`.
# Raises:
#   An error when `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr stays captured so a failure can be reported with gsutil's own text.
  listing <- system2('gsutil', args = c('ls', shQuote(export_path)), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"), call. = FALSE)
  }

  # Because stderr is merged into the output, keep only object URLs; otherwise
  # a warning line would be handed to `gsutil cat` as if it were a CSV path.
  csv_paths <- listing[startsWith(listing, "gs://")]
  if (length(csv_paths) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # shQuote() keeps the object URL intact when it passes through the shell.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')), col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_62703067_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_62703067/measurement_62703067_*.csv"
# Load every exported CSV file into a single data frame.
dataset_62703067_measurement_df <- measurement_62703067_path %>%
  read_bq_export_from_workspace_bucket()

# Row and column counts.
dataset_62703067_measurement_df %>% dim()

# First five rows.
dataset_62703067_measurement_df %>% head(n = 5)

## Process total protein (g/dL)

In [None]:
# Total Protein
labs <- dataset_62703067_measurement_df #%>%
    #filter(grepl("Phosphate", standard_concept_name))

# Available columns and number of distinct participants.
colnames(labs)
length(unique(labs$person_id))

# Row count per measurement concept, most frequent first (table() omits NA).
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row count per unit, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Quantiles of the values whose unit name is "g/dL". Rows with a missing unit
# index to NA and are discarded by na.rm.
# NOTE(review): other cells in this notebook match spelled-out unit names
# (e.g. "milligram per deciliter"); confirm "g/dL" appears in the table above.
quantile(labs$value_as_number[labs$unit_concept_name == "g/dL"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop emergency, intensive-care and urgent-care visits. grepl() returns
    # FALSE for NA, so measurements with no visit concept name are kept.
    # NOTE(review): the file header says NULL visits are removed - confirm.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Drop the listed units. Rows with a missing unit are dropped as well,
    # exactly as the previous chain of `!=` comparisons did.
    # NOTE(review): this exclusion list is identical to the testosterone
    # cell's; confirm it covers the non-g/dL units seen for total protein.
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("nanomole per liter",
                                     "percent",
                                     "microgram per deciliter",
                                     "picogram per milliliter")) %>%
    # Negative values are invalid; this comparison also removes NA values.
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows
    # and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values above 10x that participant's own median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide 4-SD window. Both bounds are evaluated in one filter() so
    # they share the same mean and SD; filtering the lower bound first would
    # shift the statistics used for the upper bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Minimum, quartiles and maximum.
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the filtered long-format table as a tab-separated file without quoting
# or row names.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Total-Protein_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Urate/Urea

## Extract urate/urea

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with Urate_Urea" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * Selects measurement rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a
#     `cb_criteria` row for concept 37027482 or 37029793.
#   * Restricts to persons in `cb_search_person` who have a standard
#     `cb_search_all_events` row for a concept resolved the same way from
#     concepts 3034204 and 37029793.
#   * LEFT JOINs `concept` (and `visit_occurrence`) to add readable names for
#     the measurement, type, operator, value, unit, visit and source concepts.
# NOTE(review): the measurement list (37027482, 37029793) and the person
# list (3034204, 37029793) differ in their first concept - confirm intended.
# paste() receives a single string here, so it returns the SQL text unchanged.
dataset_32274867_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37027482, 37029793)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (3034204, 37029793)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so exports made on the same day
#       overwrite each other, while an export made on a different day is
#       written to a new location and older copies are kept as the dataset
#       definition changes.
measurement_32274867_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_32274867",
  "measurement_32274867_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_32274867_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once. Afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_32274867_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_32274867_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_32274867_path}` to copy these files
#       to the Jupyter disk.
#
# Args:
#   export_path: Cloud Storage path or wildcard pattern passed to `gsutil ls`.
# Returns:
#   One data frame: the `bind_rows()` of every listed CSV file. The columns
#   named in `col_types` are read as character; the rest are guessed by
#   `read_csv`.
# Raises:
#   An error when `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr stays captured so a failure can be reported with gsutil's own text.
  listing <- system2('gsutil', args = c('ls', shQuote(export_path)), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"), call. = FALSE)
  }

  # Because stderr is merged into the output, keep only object URLs; otherwise
  # a warning line would be handed to `gsutil cat` as if it were a CSV path.
  csv_paths <- listing[startsWith(listing, "gs://")]
  if (length(csv_paths) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # shQuote() keeps the object URL intact when it passes through the shell.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')), col_types = col_types, show_col_types = FALSE)
    }))
}
#measurement_32274867_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_32274867/measurement_32274867_*.csv"
# Load every exported CSV file into a single data frame.
dataset_32274867_measurement_df <- measurement_32274867_path %>%
  read_bq_export_from_workspace_bucket()

# Row and column counts.
dataset_32274867_measurement_df %>% dim()

# First five rows.
dataset_32274867_measurement_df %>% head(n = 5)

## Process urate (mg/dL)

In [None]:
# Urate
# Keep only rows whose standard concept name contains "Urate".
labs <- dataset_32274867_measurement_df %>%
    filter(grepl("Urate", standard_concept_name))

# Available columns and number of distinct participants.
colnames(labs)
length(unique(labs$person_id))

# Row count per measurement concept, most frequent first (table() omits NA).
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row count per unit, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Quantiles of the values whose unit name is "no value". Rows with a missing
# unit index to NA and are discarded by na.rm.
quantile(labs$value_as_number[labs$unit_concept_name == "no value"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop emergency, intensive-care and urgent-care visits. grepl() returns
    # FALSE for NA, so measurements with no visit concept name are kept.
    # NOTE(review): the file header says NULL visits are removed - confirm.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # Drop the listed units. Rows with a missing unit are dropped as well,
    # exactly as the previous pair of `!=` comparisons did.
    filter(!is.na(unit_concept_name),
           !unit_concept_name %in% c("microgram per deciliter", "percent")) %>%
    # Negative values are invalid; this comparison also removes NA values.
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows
    # and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values above 10x that participant's own median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide 4-SD window. Both bounds are evaluated in one filter() so
    # they share the same mean and SD; filtering the lower bound first would
    # shift the statistics used for the upper bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Minimum, quartiles and maximum.
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the filtered long-format table as a tab-separated file without quoting
# or row names.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Urate_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


## Process urea (mg/dL)

In [None]:
# Urea
# Keep only rows whose standard concept name contains "Urea".
labs <- dataset_32274867_measurement_df %>%
    filter(grepl("Urea", standard_concept_name))

# Available columns and number of distinct participants.
colnames(labs)
length(unique(labs$person_id))

# Row count per measurement concept, most frequent first (table() omits NA).
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Row count per unit, most frequent first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))
# Quantiles of the values reported in mg/dL. Rows with a missing unit index to
# NA and are discarded by na.rm.
quantile(labs$value_as_number[labs$unit_concept_name == "milligram per deciliter"], na.rm = TRUE)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the cohort-wide 4-SD outlier filter at the end.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop emergency, intensive-care and urgent-care visits. grepl() returns
    # FALSE for NA, so measurements with no visit concept name are kept.
    # NOTE(review): the file header says NULL visits are removed - confirm.
    filter(!grepl("Emergency|Intensive|Urgent", visit_occurrence_concept_name)) %>%
    # NOTE(review): unlike the other lab cells, no unit filter is applied
    # here, so values in every unit (and with missing units) are pooled -
    # confirm all urea units are mg/dL-compatible.
    # Negative values are invalid; this comparison also removes NA values.
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows
    # and exact duplicates.
    mutate(across(where(is.character), ~na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values above 10x that participant's own median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Cohort-wide 4-SD window. Both bounds are evaluated in one filter() so
    # they share the same mean and SD; filtering the lower bound first would
    # shift the statistics used for the upper bound.
    filter(keep_outliers |
           (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
            value_as_number <= mean(value_as_number) + (4 * sd(value_as_number))))

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the filtered values.
labs_formatted %>% pull(value_as_number) %>% min()
labs_formatted %>% pull(value_as_number) %>% median()
labs_formatted %>% pull(value_as_number) %>% mean()
labs_formatted %>% pull(value_as_number) %>% max()
labs_formatted %>% pull(value_as_number) %>% sd()

# Minimum, quartiles and maximum.
labs_formatted %>% pull(value_as_number) %>% quantile()


In [None]:
# Save the filtered long-format table as a tab-separated file without quoting
# or row names.
write.table(
    x = labs_formatted,
    file = paste0(data_path, "/dataframes/blood/AllofUs_v8_labs-Urea_wout_outliers_long_052724.txt"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)


# Vitamin D

## Extract vitamin D

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "Pts with VitD" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * Selects measurement rows whose measurement_concept_id is a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a
#     `cb_criteria` row for concept 37036354.
#   * Restricts to persons in `cb_search_person` who have a standard
#     `cb_search_all_events` row for a concept resolved the same way from
#     concept 37036354.
#   * LEFT JOINs `concept` (and `visit_occurrence`) to add readable names for
#     the measurement, type, operator, value, unit, visit and source concepts.
# NOTE(review): 37036354 is presumably the vitamin D concept - confirm.
# paste() receives a single string here, so it returns the SQL text unchanged.
dataset_63812257_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (37036354)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (37036354)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage destination for the data exported from BigQuery.
# NOTE: The path contains today's date, so exports made on the same day
#       overwrite each other, while an export made on a different day is
#       written to a new location and older copies are kept as the dataset
#       definition changes.
measurement_63812257_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  format(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_63812257",
  "measurement_63812257_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_63812257_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result to Cloud
# Storage as CSV files.
# NOTE: `bq_table_save` only has to run once. Afterwards the CSVs can be read
#       straight from Cloud Storage.
bq_table_save(
  x = bq_dataset_query(
    x = Sys.getenv("WORKSPACE_CDR"),
    query = dataset_63812257_measurement_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  destination_uris = measurement_63812257_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_63812257_path}` to copy these files
#       to the Jupyter disk.
#
# Args:
#   export_path: Cloud Storage path or wildcard pattern passed to `gsutil ls`.
# Returns:
#   One data frame: the `bind_rows()` of every listed CSV file. The columns
#   named in `col_types` are read as character; the rest are guessed by
#   `read_csv`.
# Raises:
#   An error when `gsutil ls` exits with a non-zero status or lists no objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr stays captured so a failure can be reported with gsutil's own text.
  listing <- system2('gsutil', args = c('ls', shQuote(export_path)), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"), call. = FALSE)
  }

  # Because stderr is merged into the output, keep only object URLs; otherwise
  # a warning line would be handed to `gsutil cat` as if it were a CSV path.
  csv_paths <- listing[startsWith(listing, "gs://")]
  if (length(csv_paths) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # shQuote() keeps the object URL intact when it passes through the shell.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')), col_types = col_types, show_col_types = FALSE)
    }))
}
# To point at an earlier export instead, uncomment and edit the line below.
#measurement_63812257_path <- "gs://fc-secure-b96fb036-3379-4be0-8834-e7e486f2b76e/bq_exports/davidz1@researchallofus.org/20240528/measurement_63812257/measurement_63812257_*.csv"

# Pull every exported CSV into one in-memory data frame.
dataset_63812257_measurement_df <- read_bq_export_from_workspace_bucket(
  export_path = measurement_63812257_path
)

# Quick sanity checks: dimensions and the first five rows.
dim(dataset_63812257_measurement_df)

head(dataset_63812257_measurement_df, n = 5)

## Process vitamin D (ng/mL)

In [None]:
# Vitamin D
# Start from the full measurement export; the concept-name filter below is
# currently disabled.
labs <- dataset_63812257_measurement_df #%>%
    #filter(grepl("Urate", standard_concept_name))

# Available columns and number of distinct participants.
colnames(labs)
length(unique(labs$person_id))

# Frequency of each measurement concept, most common first.
as.data.frame(table(labs$standard_concept_name)) %>% arrange(desc(Freq))

#as.data.frame(table(labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
# Frequency of each reporting unit, most common first.
as.data.frame(table(labs$unit_concept_name)) %>% arrange(desc(Freq))

# Distribution of values whose unit is recorded as "no value". `which()`
# skips rows with a missing unit instead of indexing with NA.
quantile(
    labs$value_as_number[which(labs$unit_concept_name == "no value")],
    na.rm = TRUE
)

#median(labs$value_as_number, na.rm=T)

#new_labs <- labs %>%
#    filter(!grepl("Emergency", visit_occurrence_concept_name) & !grepl("Intensive", visit_occurrence_concept_name) &
#          !grepl("Urgent", visit_occurrence_concept_name))

#as.data.frame(table(new_labs$visit_occurrence_concept_name)) %>% arrange(-Freq)
#new_labs <- labs %>%
#    filter(unit_concept_name == "square meter")
#length(unique(new_labs$person_id))


In [None]:
# Set to TRUE to skip the population-level 4-SD outlier filter at the end of
# the pipeline.
keep_outliers <- FALSE

# Do all the filtering specified at the top of the file
labs_formatted <- labs %>%
    # Drop measurements taken during emergency, intensive or urgent visits.
    filter(!grepl("Emergency", visit_occurrence_concept_name) &
           !grepl("Intensive", visit_occurrence_concept_name) &
           !grepl("Urgent", visit_occurrence_concept_name)) %>%
    # Drop values reported in pg/mL or mg/mL. Rows with a missing unit are
    # dropped here as well, because the comparison evaluates to NA.
    filter(unit_concept_name != "pg/mL" &
           unit_concept_name != "mg/mL") %>%
    # Negative values are not valid for this lab.
    filter(value_as_number >= 0) %>%
    select(person_id, measurement_datetime, value_as_number) %>%
    mutate(person_id = as.character(person_id)) %>%
    # Treat the literal string "NULL" as missing, then drop incomplete rows
    # and exact duplicates.
    mutate(across(where(is.character), ~ na_if(., "NULL"))) %>%
    na.omit() %>%
    distinct() %>%
    # Per participant: drop values more than 10x that participant's median.
    group_by(person_id) %>%
    filter(value_as_number <= (10 * median(value_as_number))) %>%
    ungroup() %>%
    # Population-level outliers: keep values within mean +/- 4 SD. Both bounds
    # are evaluated in a single filter so they share the same mean and SD;
    # two sequential filters would compute the upper bound on data already
    # trimmed by the lower bound.
    filter(
        keep_outliers |
            (value_as_number >= mean(value_as_number) - (4 * sd(value_as_number)) &
             value_as_number <= mean(value_as_number) + (4 * sd(value_as_number)))
    )

dim(labs_formatted)
head(labs_formatted, 5)


In [None]:
# Summary statistics of the cleaned lab values: range, central tendency,
# spread, and the default quantiles.
min(labs_formatted[["value_as_number"]])
median(labs_formatted[["value_as_number"]])
mean(labs_formatted[["value_as_number"]])
max(labs_formatted[["value_as_number"]])
sd(labs_formatted[["value_as_number"]])

quantile(labs_formatted[["value_as_number"]])


In [None]:
# Persist the cleaned long-format table as a tab-separated text file
# (header row kept, no row names, no quoting).
# NOTE(review): the file name hard-codes "wout_outliers" - confirm it is still
# accurate if `keep_outliers` is ever set to TRUE.
write.table(
    x = labs_formatted,
    file = paste0(
        data_path,
        "/dataframes/blood/AllofUs_v8_labs-Vitamin-D_wout_outliers_long_052724.txt"
    ),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)
