In [None]:
library(tidyverse)
library(bigrquery)
library(writexl)
library(readxl)
library(ggplot2)
library(scales)  # For comma formatting
library(data.table)
library(dplyr)
library(ggplot2)
library(knitr)
library(readr)
library(stringr)

In [None]:
# Tabulate the distinct values of one column.
#
# Counts how often each distinct value of `column_name` occurs in `df`,
# adds each value's share of all rows, and sorts the table from the most to
# the least frequent value.
#
# Arguments:
#   df          - data frame holding the column to tabulate.
#   column_name - name of that column, given as a single string.
#
# Returns a data frame with one row per distinct value and the columns
#   Count.Var1      - the value itself,
#   Count.Freq      - number of rows carrying that value,
#   Proportion.Freq - Count.Freq divided by the sum of all counts,
#   Proportion.Var1 - a copy of Count.Var1.
#
# Stops with an error when `column_name` is not a single string naming a
# column of `df`.
summarize_column <- function(df, column_name) {
  if (!is.character(column_name) || length(column_name) != 1L ||
      !column_name %in% colnames(df)) {
    stop(
      "`column_name` must be a single string naming a column of `df`.",
      call. = FALSE
    )
  }

  df %>%
    # count() is data-masking, so the string name is looked up via `.data`.
    count(.data[[column_name]]) %>%
    mutate(proportion = n / sum(n)) %>%
    arrange(desc(n)) %>%
    rename(
      # rename() is a tidyselect context: `.data[[...]]` is deprecated there
      # (tidyselect >= 1.2.0); all_of() is the supported way to pass a name
      # held in a character variable.
      Count.Var1 = all_of(column_name),
      Count.Freq = n,
      Proportion.Freq = proportion
    ) %>%
    mutate(Proportion.Var1 = Count.Var1)
}

# Add a combined `race_ethnicity` column derived from `ethnicity` and `race`.
#
# The rules below are tried top to bottom and the first match wins:
#   1. ethnicity "Hispanic or Latino" becomes "Latinx", whatever the race;
#   2. the non-response race values collapse into "No answer";
#   3. a handful of long race labels are shortened;
#   4. every other row keeps its `race` value unchanged.
#
# Arguments:
#   df - data frame with `ethnicity` and `race` columns.
#
# Returns `df` with the extra column `race_ethnicity`.
merge_categories <- function(df) {
  # Race values that all mean "no usable answer was given".
  unanswered <- c(
    "I prefer not to answer",
    "None of these",
    "PMI: Skip",
    "None Indicated"
  )

  mutate(
    df,
    race_ethnicity = case_when(
      ethnicity == "Hispanic or Latino" ~ "Latinx",
      # `.env$` makes sure the local vector is used even if `df` happens to
      # contain a column with the same name.
      race %in% .env$unanswered ~ "No answer",
      # The label deliberately contains a line break.
      race == "Black or African American" ~ "Black / African\nAmerican",
      race == "More than one population" ~ "More than one",
      race == "Middle Eastern or North African" ~ "Middle Eastern",
      race == "Native Hawaiian or Other Pacific Islander" ~ "Pacific Islander",
      TRUE ~ race
    )
  )
}

# Build the three demographic frequency tables for a participant table.
#
# Arguments:
#   df - data frame with `race` and `ethnicity` columns.
#
# Returns a named list of tables produced by summarize_column():
#   race           - tabulation of the `race` column,
#   ethnicity      - tabulation of the `ethnicity` column,
#   race_ethnicity - tabulation of the combined label that
#                    merge_categories() derives from both columns.
process_and_summarize <- function(df) {
  list(
    race = summarize_column(df, "race"),
    ethnicity = summarize_column(df, "ethnicity"),
    # The combined label does not exist in `df`; derive it first.
    race_ethnicity = df %>%
      merge_categories() %>%
      summarize_column("race_ethnicity")
  )
}

# Load an export and compute its demographic summaries in one call.
#
# Arguments:
#   path - location handed unchanged to
#          read_bq_export_from_workspace_bucket().
#
# Returns a list with two elements:
#   data      - the data frame returned by the reader,
#   summaries - the result of process_and_summarize() on that data frame.
read_and_process_data <- function(path) {
  exported <- read_bq_export_from_workspace_bucket(path)

  list(
    data = exported,
    summaries = process_and_summarize(exported)
  )
}

In [None]:
# Race / ethnicity summary for the full `person_03039562` export.
# The path is a wildcard pattern matching the export's CSV files.
person_03039562_path <- "gs://fc-secure-7ce90512-cbaf-4591-985d-a48ed28a7fda/bq_exports/janclusmann@researchallofus.org/20240730/person_03039562/person_03039562_*.csv"

# Load the export and compute its summaries in one step.
result <- read_and_process_data(person_03039562_path)

# The summaries are already computed at this point; the id column is renamed
# so that later joins can use the key `eid`.
result$data <- rename(result$data, eid = person_id)

# Print every summary table under its heading.
iwalk(
  c(
    race = "Race Summary:",
    ethnicity = "Ethnicity Summary:",
    race_ethnicity = "Race-Ethnicity Summary:"
  ),
  function(heading, key) {
    print(heading)
    print(result$summaries[[key]])
  }
)

# Persist the combined race / ethnicity counts of the whole cohort.
write_xlsx(result$summaries$race_ethnicity, "HCC/Ethnicity_counts_all.xlsx")

In [None]:
# Same race / ethnicity statistics, restricted to positive cases.
df_y <- read_csv("HCC/y_outer_basic_all.csv")

# Keep only participants that have a row with `status == 1` in `df_y`.
# This is an inner join, so the columns of `df_y` are appended as well.
filtered_df <- inner_join(
  result$data,
  filter(df_y, status == 1),
  by = "eid"
)
filtered_summaries <- process_and_summarize(filtered_df)

# Show the combined table in the notebook output.
filtered_summaries$race_ethnicity

write_xlsx(filtered_summaries$race_ethnicity, "HCC/Ethnicity_counts_hcc.xlsx")