# Helpers

In [None]:
library(tidyverse)  # Data wrangling packages.


In [None]:
# Load "config.R" for utility functions.
# Sourcing it is also expected to trigger loading of:
    
    # user_config.JSON (including key for project_config)
    # project_config.JSON
    # preprocessing_visualizations.R
    # preprocessing_functions.R
# NOTE(review): the list above is taken from the original comment; the contents
# of the sourced file are not visible here, so confirm against config.r.
# NOTE(review): the comment says "config.R" but the call below sources
# "config.r"; on a case-sensitive file system only one of them exists - confirm.

# `user` is assigned before sourcing; presumably config.r reads it to select
# the matching user configuration - verify in config.r.
user <- "Jan" 
source("config.r")



# If certain packages are not installed yet via requirements.txt, install them here via
# install.packages("package_name")

In [None]:
# This snippet assumes that you run setup first

# List every object stored in the workspace's Google Cloud Storage bucket.

# The bucket URL is read from the WORKSPACE_BUCKET environment variable.
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Without a bucket URL the command below would run `gsutil ls -r` with no
# target at all, so fail early with a clear message instead.
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set.", call. = FALSE)
}

# Recursive listing; `intern = TRUE` returns the command output as a
# character vector (one element per line) so the notebook displays it.
system(paste0("gsutil ls -r ", my_bucket), intern = TRUE)


# Person data

In [None]:
# This query represents dataset "All ICD Codes" for domain "person" and was generated for All of Us Registered Tier Dataset v7
person_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/person_86377274/person_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {person_86377274_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the demographic columns as text instead of letting readr guess.
  col_types <- cols(
    gender = col_character(),
    race = col_character(),
    ethnicity = col_character(),
    sex_at_birth = col_character()
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    })
  )
}

dataset_86377274_person_df <- read_bq_export_from_workspace_bucket(person_86377274_path)

dim(dataset_86377274_person_df)

head(dataset_86377274_person_df, 5)

In [None]:
### Columns to keep: None
### Can get any necessary demographic information later using controlled tier access

# EHR Data Old Scripts

## EHR Data: Observations

In [None]:
# This query represents dataset "All ICD Codes" for domain "observation" and was generated for All of Us Registered Tier Dataset v7
observation_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/observation_86377274/observation_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {observation_86377274_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the name/code/value text columns as character instead of letting
  # readr guess their type.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    observation_type_concept_name = col_character(),
    value_as_string = col_character(),
    value_as_concept_name = col_character(),
    qualifier_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    observation_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    qualifier_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    })
  )
}

dataset_86377274_observation_df <- read_bq_export_from_workspace_bucket(observation_86377274_path)

dim(dataset_86377274_observation_df)

head(dataset_86377274_observation_df, 5)

In [None]:
#unique(dataset_86377274_observation_df$source_concept_code)
#unique(dataset_86377274_observation_df$qualifier_concept_name)
#colnames(dataset_86377274_observation_df)

### Columns to keep:
# Reduce the observation table to the shared "condition_*" layout: keep the
# listed columns, rename the two observation-specific ones, insert an all-NA
# end datetime right after the start datetime and tag the rows' origin.
dataset_86377274_observation_df_filtered <- dataset_86377274_observation_df %>%
  select(all_of(c(
    "person_id",
    "observation_datetime",
    "observation_type_concept_name",
    "visit_occurrence_id",
    "visit_occurrence_concept_name",
    "source_concept_name",
    "source_concept_code",
    "source_vocabulary"
  ))) %>%
  rename(
    condition_start_datetime = observation_datetime,
    condition_type_concept_name = observation_type_concept_name
  ) %>%
  add_column(condition_end_datetime = NA_character_, .after = "condition_start_datetime") %>%
  add_column(condition_type = "observation")
dataset_86377274_observation_df_filtered

## EHR Data: Procedures

In [None]:
# This query represents dataset "All ICD Codes" for domain "procedure" and was generated for All of Us Registered Tier Dataset v7
procedure_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/procedure_86377274/procedure_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {procedure_86377274_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the name/code/value text columns as character instead of letting
  # readr guess their type.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    procedure_type_concept_name = col_character(),
    modifier_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    procedure_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    modifier_source_value = col_character()
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    })
  )
}

dataset_86377274_procedure_df <- read_bq_export_from_workspace_bucket(procedure_86377274_path)

dim(dataset_86377274_procedure_df)

head(dataset_86377274_procedure_df, 5)

In [None]:
#unique(dataset_86377274_procedure_df$source_concept_code)

### Columns to keep:
# Reduce the procedure table to the shared "condition_*" layout: keep the
# listed columns, rename the two procedure-specific ones, insert an all-NA
# end datetime right after the start datetime and tag the rows' origin.
dataset_86377274_procedure_df_filtered <- dataset_86377274_procedure_df %>%
  select(all_of(c(
    "person_id",
    "procedure_datetime",
    "procedure_type_concept_name",
    "visit_occurrence_id",
    "visit_occurrence_concept_name",
    "source_concept_name",
    "source_concept_code",
    "source_vocabulary"
  ))) %>%
  rename(
    condition_start_datetime = procedure_datetime,
    condition_type_concept_name = procedure_type_concept_name
  ) %>%
  add_column(condition_end_datetime = NA_character_, .after = "condition_start_datetime") %>%
  add_column(condition_type = "procedure")
dataset_86377274_procedure_df_filtered

## EHR Data: Measurements

In [None]:
# This query represents dataset "All ICD Codes" for domain "measurement" and was generated for All of Us Registered Tier Dataset v7
measurement_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/measurement_86377274/measurement_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_86377274_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the name/code/value text columns as character instead of letting
  # readr guess their type.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    })
  )
}

dataset_86377274_measurement_df <- read_bq_export_from_workspace_bucket(measurement_86377274_path)

dim(dataset_86377274_measurement_df)

head(dataset_86377274_measurement_df, 5)

In [None]:
#unique(dataset_86377274_measurement_df$source_concept_code)
#colnames(dataset_86377274_measurement_df)

### Columns to keep:
# Reduce the measurement table to the shared "condition_*" layout: keep the
# listed columns, rename the two measurement-specific ones, insert an all-NA
# end datetime right after the start datetime and tag the rows' origin.
dataset_86377274_measurement_df_filtered <- dataset_86377274_measurement_df %>%
  select(all_of(c(
    "person_id",
    "measurement_datetime",
    "measurement_type_concept_name",
    "visit_occurrence_id",
    "visit_occurrence_concept_name",
    "source_concept_name",
    "source_concept_code",
    "source_vocabulary"
  ))) %>%
  rename(
    condition_start_datetime = measurement_datetime,
    condition_type_concept_name = measurement_type_concept_name
  ) %>%
  add_column(condition_end_datetime = NA_character_, .after = "condition_start_datetime") %>%
  add_column(condition_type = "measurement")
dataset_86377274_measurement_df_filtered

## EHR Data: Conditions

In [None]:
# This query represents dataset "All ICD Codes" for domain "condition" and was generated for All of Us Registered Tier Dataset v7
condition_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/condition_86377274/condition_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {condition_86377274_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()`, reduces every file to four columns
# plus a `condition_type` tag, and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A tibble with person_id, condition_start_datetime, source_concept_code,
#   source_vocabulary and condition_type (always "condition").
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Read the name/code/value text columns as character instead of letting
  # readr guess their type.
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    condition_type_concept_name = col_character(),
    stop_reason = col_character(),
    visit_occurrence_concept_name = col_character(),
    condition_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    condition_status_source_value = col_character(),
    condition_status_concept_name = col_character()
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      chunk <- read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
      # Subsetting per file (rather than after binding) keeps only the narrow
      # column set in memory. Wider alternative kept for reference:
      #chunk_filtered <- chunk[c("person_id", "condition_start_datetime", "condition_end_datetime", "condition_type_concept_name", "visit_occurrence_id", "visit_occurrence_concept_name", "source_concept_name", "source_concept_code", "source_vocabulary")]
      chunk[c("person_id", "condition_start_datetime", "source_concept_code", "source_vocabulary")] %>%
        add_column(condition_type = "condition")
    })
  )
}

dataset_86377274_condition_df <- read_bq_export_from_workspace_bucket(condition_86377274_path)

dim(dataset_86377274_condition_df)

head(dataset_86377274_condition_df, 5)

# EHR Data New

## Observations

In [None]:
# This query represents dataset "All ICD Codes" for domain "observation" and was generated for All of Us Registered Tier Dataset v8
observation_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/observation_86377274/observation_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` (reading only the columns named in
# `col_types`), renames `observation_datetime` to `condition_start_datetime`,
# tags the rows with condition_type = "observation" and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Only specify col_types for columns we actually need
  col_types <- cols(
    person_id = col_double(),
    observation_datetime = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    .default = col_skip()  # Skip all other columns during reading
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Rename datetime column and add condition_type
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE) %>%
        rename(condition_start_datetime = observation_datetime) %>%
        add_column(condition_type = "observation")
    })
  )
}

dataset_86377274_observation_df <- read_bq_export_from_workspace_bucket(observation_86377274_path)

dim(dataset_86377274_observation_df)
head(dataset_86377274_observation_df, 5)

## Procedures

In [None]:
# This query represents dataset "All ICD Codes" for domain "procedure" and was generated for All of Us Registered Tier Dataset v8
procedure_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/procedure_86377274/procedure_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` (reading only the columns named in
# `col_types`), renames `procedure_datetime` to `condition_start_datetime`,
# tags the rows with condition_type = "procedure" and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Only specify col_types for columns we actually need
  col_types <- cols(
    person_id = col_double(),
    procedure_datetime = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    .default = col_skip()  # Skip all other columns during reading
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Rename datetime column and add condition_type
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE) %>%
        rename(condition_start_datetime = procedure_datetime) %>%
        add_column(condition_type = "procedure")
    })
  )
}

dataset_86377274_procedure_df <- read_bq_export_from_workspace_bucket(procedure_86377274_path)

dim(dataset_86377274_procedure_df)
head(dataset_86377274_procedure_df, 5)

## Measurements

In [None]:
# This query represents dataset "All ICD Codes" for domain "measurement" and was generated for All of Us Registered Tier Dataset v8
measurement_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/measurement_86377274/measurement_86377274_*.csv"

# Read the data directly from Cloud Storage into memory.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into `read_csv()` (reading only the columns named in
# `col_types`), renames `measurement_datetime` to `condition_start_datetime`,
# tags the rows with condition_type = "measurement" and row-binds the pieces.
#
# Args:
#   export_path: gs:// URL (wildcards allowed) of the exported CSV files.
# Returns:
#   A single tibble holding the rows of all matched CSV files.
# Errors:
#   Stops when `gsutil ls` exits with a non-zero status or lists no gs:// URL.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Only specify col_types for columns we actually need
  col_types <- cols(
    person_id = col_double(),
    measurement_datetime = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    .default = col_skip()  # Skip all other columns during reading
  )

  # stderr is captured together with stdout so that a failure can be reported
  # verbatim; a non-zero exit code is exposed through the "status" attribute.
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  exit_status <- attr(listing, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    stop("`gsutil ls ", export_path, "` failed:\n", paste(listing, collapse = "\n"),
         call. = FALSE)
  }

  # Keep only object URLs: diagnostic lines written to stderr are not files
  # and must not be handed to `gsutil cat`.
  csv_files <- listing[startsWith(listing, "gs://")]
  if (length(csv_files) == 0) {
    stop("No files matched ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Rename datetime column and add condition_type
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE) %>%
        rename(condition_start_datetime = measurement_datetime) %>%
        add_column(condition_type = "measurement")
    })
  )
}

dataset_86377274_measurement_df <- read_bq_export_from_workspace_bucket(measurement_86377274_path)

dim(dataset_86377274_measurement_df)
head(dataset_86377274_measurement_df, 5)

## Merging Observations, procedures, measurements

In [None]:
# Combine the observation, procedure and measurement tables, write the result
# to a local tab-separated file and copy that file to the workspace bucket.

# Set up output file path
# NOTE(review): `data_path` is not defined in this notebook section;
# presumably it comes from the sourced config file - confirm.
prelim_output_file <- paste0(data_path, "/dataframes/AllofUs_v8_obs_proc_meas_codes.txt")
final_output_file <- paste0(data_path, "/dataframes/df_diagnosis.txt")
condition_output_file <- paste0(data_path, "/dataframes/AllofUs_v8_condition_codes.txt")

# Start with existing datasets (observation, procedure, measurement)
message("Starting with existing datasets...")
print(paste("Observation rows:", nrow(dataset_86377274_observation_df)))
print(paste("Procedure rows:", nrow(dataset_86377274_procedure_df)))
print(paste("Measurement rows:", nrow(dataset_86377274_measurement_df)))

# Combine non-condition datasets first.
# rbind() is kept on purpose: it fails loudly if the three tables do not share
# the same column names instead of padding missing columns with NA.
all_codes <- rbind(dataset_86377274_observation_df,
                   dataset_86377274_procedure_df,
                   dataset_86377274_measurement_df)

message(paste("Combined non-condition data:", nrow(all_codes), "rows"))

# Write initial data to file (conditions are memory-heavy so this is a back up in case the kernel crashes)
write_delim(all_codes, prelim_output_file, delim = "\t")
message("Written initial datasets to file")

# Copy to bucket
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
# The local path is passed as-is (no "./" prefix) so that both relative and
# absolute values of `data_path` resolve to the file written above.
upload_log <- system(paste0("gsutil cp ", prelim_output_file, " ", my_bucket, "/data/"), intern = TRUE)
# With intern = TRUE a non-zero exit code is reported via the "status"
# attribute; only announce success when the copy actually succeeded.
if (is.null(attr(upload_log, "status"))) {
  message("File saved to bucket!")
} else {
  warning("gsutil cp failed for ", prelim_output_file, call. = FALSE)
}

# Clean up to free memory
rm(dataset_86377274_observation_df, dataset_86377274_procedure_df, dataset_86377274_measurement_df)
gc()

## Conditions

In [None]:
# Stream the exported condition CSV files in small batches into one local
# tab-separated file, so that only one batch is held in memory at a time, then
# copy the available output files to the workspace bucket.

# Final output file
# NOTE(review): `data_path` is not defined in this notebook section;
# presumably it comes from the sourced config file - confirm.
condition_output_file <- paste0(data_path, "/dataframes/AllofUs_v8_condition_codes.txt")

# Get list of all files
condition_86377274_path <- "gs://fc-secure-cde9a0f0-7d5a-4045-98bb-fb1d394a535b/bq_exports/janclusmann@researchallofus.org/20250709/condition_86377274/condition_86377274_*.csv"
# stderr is captured together with stdout so that a failure can be reported
# verbatim; a non-zero exit code is exposed through the "status" attribute.
condition_listing <- system2("gsutil", args = c("ls", condition_86377274_path), stdout = TRUE, stderr = TRUE)
condition_listing_status <- attr(condition_listing, "status")
if (!is.null(condition_listing_status) && condition_listing_status != 0) {
  stop("`gsutil ls ", condition_86377274_path, "` failed:\n",
       paste(condition_listing, collapse = "\n"), call. = FALSE)
}
# Keep only object URLs: diagnostic lines written to stderr are not files.
all_files <- condition_listing[startsWith(condition_listing, "gs://")]
if (length(all_files) == 0) {
  stop("No files matched ", condition_86377274_path, call. = FALSE)
}

# Initialize the output file with headers
first_batch <- TRUE
total_rows <- 0

# Process files in small batches and append to final file
batch_size <- 5
num_batches <- ceiling(length(all_files) / batch_size)

# Only specify col_types for columns we actually need. The specification is
# identical for every file, so it is built once outside the loop.
col_types <- cols(
  person_id = col_double(),
  condition_start_datetime = col_character(),
  source_concept_name = col_character(),
  source_concept_code = col_character(),
  source_vocabulary = col_character(),
  .default = col_skip()  # Skip all other columns during reading
)

# seq_len() yields an empty sequence for zero batches (1:0 would not).
for (i in seq_len(num_batches)) {
  start_idx <- (i - 1) * batch_size + 1
  end_idx <- min(i * batch_size, length(all_files))

  message(str_glue('Processing batch {i}/{num_batches} (files {start_idx} to {end_idx})'))

  file_batch <- all_files[start_idx:end_idx]

  batch_data <- bind_rows(
    map(file_batch, function(csv) {
      message(str_glue('  Loading {basename(csv)}'))
      # Only the needed columns are read, so no further column filtering is
      # required; just tag the rows with their origin.
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE) %>%
        add_column(condition_type = "condition")
    })
  )

  # The first batch creates the file and writes the header row; every later
  # batch is appended without a header.
  write_delim(batch_data, condition_output_file, delim = "\t",
              append = !first_batch, col_names = first_batch)
  first_batch <- FALSE

  total_rows <- total_rows + nrow(batch_data)
  message(str_glue('  Batch {i} complete. Added {nrow(batch_data)} rows. Total so far: {total_rows}'))

  # Clean up immediately
  rm(batch_data)
  gc()

  # Small delay
  Sys.sleep(1)
}

message(str_glue('Processing for condition data complete! Final file: {condition_output_file} with {total_rows} rows'))

# Copy to bucket
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
# `prelim_output_file` and `final_output_file` are expected to exist in the
# session from the preceding merge cell. A file that has not been written yet
# is skipped instead of producing a failing gsutil call.
files_to_upload <- c(prelim_output_file, condition_output_file, final_output_file)
all_uploaded <- TRUE
for (local_file in files_to_upload) {
  if (!file.exists(local_file)) {
    message(str_glue('Skipping upload of {local_file}: file does not exist.'))
    next
  }
  # The local path is passed as-is (no "./" prefix) so that both relative and
  # absolute values of `data_path` resolve correctly.
  upload_log <- system(paste0("gsutil cp ", local_file, " ", my_bucket, "/data/"), intern = TRUE)
  # With intern = TRUE a non-zero exit code is reported via "status".
  if (!is.null(attr(upload_log, "status"))) {
    all_uploaded <- FALSE
    warning("gsutil cp failed for ", local_file, call. = FALSE)
  }
}

if (all_uploaded) {
  message("File saved to bucket!")
}

In [None]:
# Re-load the two intermediate files, stack them into the final diagnosis
# table, write it locally and copy it to the workspace bucket.

# Set up output file path
# NOTE(review): `data_path` is not defined in this notebook section;
# presumably it comes from the sourced config file - confirm.
prelim_output_file <- paste0(data_path, "/dataframes/AllofUs_v8_obs_proc_meas_codes.txt")
final_output_file <- paste0(data_path, "/dataframes/df_diagnosis.txt")
condition_output_file <- paste0(data_path, "/dataframes/AllofUs_v8_condition_codes.txt")

# Use the same column types the loaders used when the files were created.
# Letting readr guess could turn numeric-looking codes into numbers (dropping
# e.g. trailing zeros) and could give the two tables different column types.
codes_col_types <- cols(
  person_id = col_double(),
  condition_start_datetime = col_character(),
  source_concept_name = col_character(),
  source_concept_code = col_character(),
  source_vocabulary = col_character(),
  condition_type = col_character()
)

# Re-load opm data and condition data
message("Loading observation/procedure/measurement data...")
opm_data <- read_delim(prelim_output_file, delim = "\t", col_types = codes_col_types, show_col_types = FALSE)
message(paste("OPM data loaded:", nrow(opm_data), "rows"))

message("Loading condition data...")
condition_data <- read_delim(condition_output_file, delim = "\t", col_types = codes_col_types, show_col_types = FALSE)
message(paste("Condition data loaded:", nrow(condition_data), "rows"))

# Combine the datasets.
# rbind() is kept on purpose: it fails loudly if the two tables do not share
# the same column names instead of padding missing columns with NA.
message("Combining datasets...")
all_codes <- rbind(opm_data, condition_data)

# Clean up intermediate datasets to free memory
rm(opm_data, condition_data)
gc()

# Final summary
total_rows <- nrow(all_codes)
message(str_glue('Processing complete!'))
message(str_glue('Final file: {final_output_file}'))
message(str_glue('Total rows: {total_rows}'))

# Save the combined dataset
message("Saving combined dataset...")
write_delim(all_codes, final_output_file, delim = "\t")
message("Combined dataset saved locally")

# Copy to bucket
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
upload_log <- system(paste0("gsutil cp ", final_output_file, " ", my_bucket, "/data/"), intern = TRUE)
# With intern = TRUE a non-zero exit code is reported via the "status"
# attribute; only announce success when the copy actually succeeded.
if (is.null(attr(upload_log, "status"))) {
  message("File saved to bucket!")
} else {
  warning("gsutil cp failed for ", final_output_file, call. = FALSE)
}

# Clean up
rm(all_codes)
gc()

# Optional: Check the final file structure (first 5 rows only)
message("Final file structure:")
final_check <- read_delim(final_output_file, delim = "\t", n_max = 5, col_types = codes_col_types, show_col_types = FALSE)
print(head(final_check))
print(paste("Final file dimensions:", paste(dim(final_check), collapse = " x ")))

## Additional info

In [None]:
# Load the final diagnosis table and show its column names together with the
# smallest and largest value of `condition_start_datetime`.
# `final_output_file` is expected to exist in the session from the merge cell.
#
# Column types are stated explicitly (matching the types used when the data
# was first loaded) so that readr does not guess and turn numeric-looking
# codes into numbers.
all_codes <- read_delim(
  final_output_file,
  delim = "\t",
  col_types = cols(
    person_id = col_double(),
    condition_start_datetime = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    condition_type = col_character()
  ),
  show_col_types = FALSE
)


colnames(all_codes)
# NA values are deliberately not removed: a missing datetime shows up as NA.
print(min(all_codes$condition_start_datetime))
print(max(all_codes$condition_start_datetime))

In [None]:
# Display the value of the WORKSPACE_BUCKET environment variable (used in the
# cells above as the destination bucket for `gsutil`).
Sys.getenv('WORKSPACE_BUCKET')
