# Helpers and functions

In [None]:
# Load "config.r" for utility functions.
# According to the original notes, sourcing it also triggers loading of:
#
#   - user_config.JSON (including the key for project_config)
#   - project_config.JSON
#   - preprocessing_visualizations.R
#   - preprocessing_functions.R
#
# NOTE(review): `data_path` and the attached packages used in later cells are
# presumably set up by config.r — confirm.

# NOTE(review): `user` is presumably read by config.r to pick the matching
# entry in user_config.JSON — confirm.
user <- "Jan" 
source("config.r")



# If a package has not been installed via requirements.txt yet, install it here with
# install.packages("package_name")

In [None]:
#' Summarize the value frequencies of one column
#'
#' Counts how often each distinct value of `column_name` occurs in `df`,
#' adds each value's share of all rows, and sorts by descending count.
#'
#' @param df A data frame.
#' @param column_name Name of the column to summarize, as a single string.
#' @return A data frame with the columns `Count.Var1` (the value),
#'   `Count.Freq` (number of rows with that value), `Proportion.Freq`
#'   (`Count.Freq` divided by the total number of rows) and `Proportion.Var1`
#'   (a copy of `Count.Var1`).
summarize_column <- function(df, column_name) {
  df %>%
    count(.data[[column_name]]) %>%
    mutate(proportion = n / sum(n)) %>%
    arrange(desc(n)) %>%
    rename(
      # rename() is a tidyselect context: `.data[[...]]` is deprecated there,
      # all_of() is the supported way to select a column by a string.
      Count.Var1 = all_of(column_name),
      Count.Freq = n,
      Proportion.Freq = proportion
    ) %>%
    # The value column is duplicated under a second name on purpose.
    mutate(Proportion.Var1 = Count.Var1)
}

#' Collapse `race` and `ethnicity` into one `race_ethnicity` label
#'
#' The rules are applied top to bottom; the first match wins:
#' a "Hispanic or Latino" ethnicity takes precedence over any race value,
#' the listed non-answers become "No answer", a few long race labels are
#' shortened, and every other value of `race` (including NA) is kept as is.
#'
#' @param df A data frame with the columns `race` and `ethnicity`.
#' @return `df` with an added (or overwritten) `race_ethnicity` column.
merge_categories <- function(df) {
  no_answer_labels <- c(
    "I prefer not to answer",
    "None of these",
    "PMI: Skip",
    "None Indicated"
  )

  mutate(
    df,
    race_ethnicity = case_when(
      ethnicity == "Hispanic or Latino" ~ "Latinx",
      # `.env$` pins the lookup to the local vector, never to a data column
      race %in% .env$no_answer_labels ~ "No answer",
      race == "Black or African American" ~ "Black / African\nAmerican",
      race == "More than one population" ~ "More than one",
      race == "Middle Eastern or North African" ~ "Middle Eastern",
      race == "Native Hawaiian or Other Pacific Islander" ~ "Pacific Islander",
      TRUE ~ race
    )
  )
}

#' Build the race, ethnicity and combined race/ethnicity frequency tables
#'
#' @param df A data frame with the columns `race` and `ethnicity`.
#' @return A named list with three frequency tables produced by
#'   `summarize_column()`: `race`, `ethnicity` and `race_ethnicity`
#'   (the latter computed after `merge_categories()` has been applied).
process_and_summarize <- function(df) {
  list(
    race = summarize_column(df, "race"),
    ethnicity = summarize_column(df, "ethnicity"),
    # The combined label only exists after the categories have been merged
    race_ethnicity = summarize_column(merge_categories(df), "race_ethnicity")
  )
}

#' Load the covariates table and summarize race / ethnicity
#'
#' Tries the local tab-separated file
#' `<data_path>/dataframes/df_covariates_1.2.txt` first. If that fails for any
#' reason, the data is read from the workspace bucket via
#' `read_bq_export_from_workspace_bucket(gs_path)` instead.
#'
#' @param data_path Directory that contains the `dataframes` folder.
#' @param gs_path Bucket path (may contain wildcards) used as fallback.
#' @return A list with `data` (the loaded data frame) and `summaries`
#'   (the result of `process_and_summarize()` on that data frame).
read_and_process_data <- function(data_path, gs_path) {
  covariates_file <- file.path(data_path, "dataframes", "df_covariates_1.2.txt")

  # Try to load df_covariates from file
  df_covariates <- tryCatch({
    message("Trying to load df_covariates from file...")
    # `file =` forces fread() to treat the string as a path. Passed
    # positionally (as `input`), a missing path that contains a space would
    # be executed as a shell command instead of raising an error.
    data.table::fread(file = covariates_file, sep = "\t")
  }, error = function(e) {
    message("Failed to load local file: ", conditionMessage(e))
    NULL
  })

  # If loading failed, use Google Cloud Storage
  if (is.null(df_covariates)) {
    message("Loading from Google Cloud Storage")
    df <- read_bq_export_from_workspace_bucket(gs_path)
  } else {
    message("Successfully loaded df_covariates from local file")
    df <- df_covariates
  }

  # Return both the dataframe and its summaries
  list(
    data = df,
    summaries = process_and_summarize(df)
  )
}



#' Read a sharded CSV export from the workspace bucket into one data frame
#'
#' Lists the objects matching `export_path` with `gsutil ls`, streams each one
#' through `gsutil cat` into `read_csv()`, and row-binds the chunks.
#'
#' @param export_path Bucket path of the export; may contain wildcards.
#' @return A single data frame with the rows of all matched CSV files.
#'   `gender`, `race`, `ethnicity` and `sex_at_birth` are always read as
#'   character; all other column types are guessed per file by `read_csv()`.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    gender = col_character(),
    race = col_character(),
    ethnicity = col_character(),
    sex_at_birth = col_character()
  )

  # Capture stdout only: with stderr merged in, gsutil warnings and error
  # messages would be mistaken for CSV paths further down.
  # shQuote() keeps the shell from touching wildcards or spaces in the path.
  csv_paths <- system2("gsutil", args = c("ls", shQuote(export_path)), stdout = TRUE)

  # system2() reports a non-zero exit code through the "status" attribute
  listing_status <- attr(csv_paths, "status")
  if (!is.null(listing_status) && listing_status != 0) {
    stop(
      "`gsutil ls` failed for ", paste(export_path, collapse = " "),
      " (exit status ", listing_status, ").",
      call. = FALSE
    )
  }

  csv_paths <- csv_paths[nzchar(csv_paths)]
  if (length(csv_paths) == 0) {
    stop(
      "No files found for ", paste(export_path, collapse = " "), ".",
      call. = FALSE
    )
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(
        pipe(str_glue("gsutil cat {shQuote(csv)}")),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}

In [None]:
# Load the covariates table directly from the local data directory and preview it.
# `file =` makes fread() treat the string strictly as a file path (a positional
# argument could be interpreted as a shell command if the path is missing and
# contains a space).
df <- data.table::fread(
  file = file.path(data_path, "dataframes", "df_covariates_1.2.txt"),
  sep = "\t"
)
head(df)

# Load and process data

In [None]:
# Bucket location (wildcard pattern) of the exported person table; used as
# fallback source by read_and_process_data()
person_03039562_path <- "gs://fc-secure-7ce90512-cbaf-4591-985d-a48ed28a7fda/bq_exports/janclusmann@researchallofus.org/20240730/person_03039562/person_03039562_*.csv"

# Load the main dataset together with its race / ethnicity summaries,
# then expose the participant identifier under the name `eid`
result <- read_and_process_data(data_path, person_03039562_path)
result$data <- rename(result$data, eid = person_id)

# Print each summary table below its heading
invisible(Map(
  function(heading, summary_table) {
    print(heading)
    print(summary_table)
  },
  c("Race Summary:", "Ethnicity Summary:", "Race-Ethnicity Summary:"),
  result$summaries[c("race", "ethnicity", "race_ethnicity")]
))

# Export the combined race/ethnicity counts of the full cohort
write_xlsx(
  result$summaries$race_ethnicity,
  paste0(data_path, "/dataframes/Ethnicity_counts_all.xlsx")
)

# Sort and rename

In [None]:
# Per-participant table with the combined race/ethnicity label plus a second
# column that recodes it into the UKB format (needed for the combined table
# with the TP/FN comparison).
df_ethnicity <- result$data %>%
  merge_categories() %>%
  select(eid, race_ethnicity) %>%
  mutate(
    ethnicity_ukb_aligned = fct_relevel(
      fct_recode(
        race_ethnicity,
        "Black" = "Black / African\nAmerican",
        "Other/Unknown" = "No answer",
        "Other/Unknown" = "Pacific Islander",
        "Other/Unknown" = "Middle Eastern",
        "Other/Unknown" = "More than one",
        "Caucasian" = "White"
      ),
      sort  # order the resulting levels alphabetically
    )
  )

write_csv(df_ethnicity, paste0(data_path, "/dataframes/df_ethnicity.csv"))






# Positive Cases (DOI)

In [None]:
# Statistics for positive cases only
# (fixed: the read_csv() call was missing its closing parenthesis, which made
# the whole cell fail to parse)
df_y <- read_csv(paste0(data_path, "/dataframes/y_outer_basic_all.csv"))

# Keep only participants whose `status` is 1 in the outcome table
filtered_df <- result$data %>% inner_join(df_y %>% filter(status == 1), by = "eid")
filtered_summaries <- process_and_summarize(filtered_df)
filtered_summaries$race_ethnicity

write_xlsx(filtered_summaries$race_ethnicity, paste0(data_path, "/dataframes/Ethnicity_counts_hcc.xlsx"))

# Binary split white/non-white

In [None]:
#' Reduce `race` to a White / Non-White indicator
#'
#' "White" stays "White", a missing `race` stays NA, and every other
#' non-missing value of `race` becomes "Non-White".
#'
#' @param df A data frame with the columns `eid` and `race`.
#' @return A data frame with exactly two columns: `eid` and `race_binary`.
create_binary_ethnicity_dataframe <- function(df) {
  with_binary <- mutate(
    df,
    # An NA condition yields NA, so missing race propagates unchanged
    race_binary = if_else(race == "White", "White", "Non-White")
  )
  select(with_binary, eid, race_binary)
}

# Usage: `result` is the output of read_and_process_data(); its `data` element
# holds the full table from which the binary indicator is derived.
df_ethnicity_boolean <- create_binary_ethnicity_dataframe(result$data)

# Preview the first rows
df_ethnicity_boolean %>%
  head() %>%
  print()

# Count and percentage share of each binary category
ethnicity_summary <- mutate(
  count(df_ethnicity_boolean, race_binary),
  percentage = n / sum(n) * 100
)
print(ethnicity_summary)

write_csv(
  df_ethnicity_boolean,
  paste0(data_path, "/dataframes/df_ethnicity_boolean.csv")
)