# Helpers

In [None]:
# Source "config.r" to obtain the shared utility functions used in this notebook.
# Sourcing it is also expected to trigger loading of:
    
    # user_config.JSON (including key for project_config)
    # project_config.JSON
    # preprocessing_visualizations.R
    # preprocessing_functions.R

# NOTE(review): `user` is presumably read by config.r to pick the matching
# entry of user_config.JSON, so it has to be set before source() — confirm.
user <- "Jan" 
source("config.r")



# If certain packages are not installed yet via requirements.txt, install them here via
# install.packages("package_name")

# Load raw diagnosis data and reduce its size
Because the unit of analysis is a single visit (including a blood draw) rather than a person, each diagnosis is masked or revealed depending on the year of the patient's blood draw. We therefore merge the blood data with the diagnosis data.

## Load head

In [None]:
# Read only the first five rows of df_diagnosis to inspect its columns
# without loading the whole file.
df5 <- fread(paste0(data_path, "/dataframes/df_diagnosis.txt"), nrows=5)
head(df5)


## Load all

In [None]:
# Load the complete (huge) df_diagnosis file into memory.
df_diagnosis <- fread(paste0(data_path, "/dataframes/df_diagnosis.txt"))

# Show the first rows as a quick check of the loaded table.
head(df_diagnosis)


## Reduce huge diagnosis dataset

In [None]:
# =============================================================================
# Reduce the ~12 GB diagnosis dataframe to the relevant columns and to the
# first report of each ICD code per patient (roughly 1 GB).
# The reduced dataframe can then be used for further processing, e.g. selecting
# the ICD codes listed in the ICD dictionary (Excel file), without heavy CPU load.
# =============================================================================

#' Keep only the earliest diagnosis per person and ICD code
#'
#' Rows with a missing `person_id`, `source_concept_code` or
#' `condition_start_datetime` are discarded. Of the remaining rows, the one
#' with the smallest `condition_start_datetime` is kept for every
#' (`person_id`, `source_concept_code`) pair; ties keep the row that comes
#' first in the input. The columns `condition_type`, `source_concept_name`
#' and `source_vocabulary` are dropped when present.
#'
#' The input is never modified: all by-reference updates are applied to an
#' internal copy, so a data.table passed by the caller keeps its column types.
#'
#' @param df A data.frame or data.table containing at least the columns
#'   `person_id`, `source_concept_code` and `condition_start_datetime`.
#'   A character `condition_start_datetime` is converted with `as.POSIXct()`.
#' @return A data.table with one row per (`person_id`, `source_concept_code`),
#'   the grouping columns first.
#' @details Stops with an error if a required column is missing. Progress
#'   information is written to the console with `cat()`.
filter_earliest_diagnoses <- function(df) {
  cat("Filtering for earliest diagnosis per person and ICD code...\n")
  cat("Input data dimensions:", nrow(df), "rows,", ncol(df), "columns\n")

  # Convert to data.table if not already
  if (!is.data.table(df)) {
    df <- as.data.table(df)
  }

  # Validate the schema before any of the columns is touched
  required_cols <- c("person_id", "source_concept_code", "condition_start_datetime")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
  }

  # Remove rows with missing key information. Subsetting yields a new
  # data.table, so the `:=` below cannot alter the caller's object.
  initial_rows <- nrow(df)
  df <- df[!is.na(person_id) & !is.na(source_concept_code) & !is.na(condition_start_datetime)]

  # Convert condition_start_datetime to proper datetime if it's character,
  # then drop values that could not be parsed (they become NA)
  if (is.character(df$condition_start_datetime)) {
    df[, condition_start_datetime := as.POSIXct(condition_start_datetime)]
    df <- df[!is.na(condition_start_datetime)]
  }

  rows_removed <- initial_rows - nrow(df)
  if (rows_removed > 0) {
    cat("Removed", rows_removed, "rows with missing person_id, source_concept_code, or condition_start_datetime\n")
  }

  # Earliest occurrence per person and ICD code: sort by time, then take the
  # first row of every group (data.table syntax for efficiency)
  df_earliest <- df[order(condition_start_datetime),
                    .SD[1L],
                    by = .(person_id, source_concept_code)]

  # Drop columns that are not needed downstream
  columns_to_drop <- c("condition_type", "source_concept_name", "source_vocabulary")
  existing_cols_to_drop <- intersect(columns_to_drop, names(df_earliest))

  if (length(existing_cols_to_drop) > 0) {
    df_earliest <- df_earliest[, !..existing_cols_to_drop]
    cat("Dropped columns:", paste(existing_cols_to_drop, collapse = ", "), "\n")
  } else {
    cat("Note: None of the specified columns to drop were found in the dataset\n")
  }

  cat("Output data dimensions:", nrow(df_earliest), "rows,", ncol(df_earliest), "columns\n")

  # Guard against 0 / 0 when no usable rows remain
  reduction_pct <- if (nrow(df) > 0) {
    round((1 - nrow(df_earliest) / nrow(df)) * 100, 1)
  } else {
    0
  }
  cat("Reduction:", reduction_pct, "% of rows removed\n")

  df_earliest
}


# Reduce the full diagnosis table to one row per person and ICD code.
df_earliest_dt <- filter_earliest_diagnoses(df_diagnosis)

# Inspect the reduced table and its dimensions.
head(df_earliest_dt)
dim(df_earliest_dt)

# Persist the reduced table as df_diagnosis_reduced.csv.
write_csv(df_earliest_dt, paste0(data_path, "/dataframes/df_diagnosis_reduced.csv"))

# Dataloader: 

Load the data that was previously reduced so that only the first instance of each ICD code per patient is retained.
In the section below, we further subset the dataframe to keep only the relevant ICD codes from ICD_Groups and ICD_Singles (the ones you want to explore further).

Two Options
-A: Subset prospectively: for every instance, only the diagnoses that were made before or during the time of assessment are kept; the rest are omitted (default for all prospective modelling)

-B: Subset all entries, for time-agnostic analysis

## Load metadata

In [None]:
# Path of the Excel workbook that lists the columns / ICD codes to investigate.
icd_dict_path <- paste0(data_path, "/Features_AOU.xlsx")

# Read the two ICD sheets and drop their "n" and "Summe" columns.
ICD_Groups <- icd_dict_path %>%
  read_excel(sheet = "ICD_Groups") %>%
  select(-c("n", "Summe"))

ICD_Singles <- icd_dict_path %>%
  read_excel(sheet = "ICD_Singles") %>%
  select(-c("n", "Summe"))

#Patients_at_risk <- read_excel(icd_dict_path, sheet = "Patients at risk") #relevant for PAR subsetting only

## Combine metadata to vector

In [None]:
# Distinct ICD codes listed on each metadata sheet.
icd_codes_groups <- unique(ICD_Groups[["ICD10"]])
icd_codes_singles <- unique(ICD_Singles[["ICD10"]])

# Union of both sources, without NA entries.
all_icd_codes <- union(icd_codes_groups, icd_codes_singles)
all_icd_codes <- all_icd_codes[!is.na(all_icd_codes)]

# Report how many codes each source contributes.
cat("ICD codes from Groups:", length(icd_codes_groups), "\n")
cat("ICD codes from Singles:", length(icd_codes_singles), "\n")
cat("Total unique ICD codes:", length(all_icd_codes), "\n")

# List every code of interest.
cat("\nAll ICD codes of interest:\n")
print(all_icd_codes)

## Load df_diagnosis (reduced to first instance per diag per patient)

In [None]:
# Read the reduced diagnosis table and remove the dot from the ICD codes
# (e.g. "K70.3" becomes "K703").
df_diagnosis <- read_csv(paste0(data_path, "/dataframes/df_diagnosis_reduced.csv")) %>%
  mutate(source_concept_code = gsub("\\.", "", source_concept_code))

head(df_diagnosis)
dim(df_diagnosis)

## Load Blood data

In [None]:


# Read the raw blood table, keeping only the person / year identifiers.
df_blood_dates <- paste0(data_path, "/dataframes/df_blood_raw.csv") %>%
  read_csv() %>%
  select("person_id", "year", "person_id_year")

df_blood_dates

# Merge diagnosis_data (prognostic)
Because the unit of analysis is a single visit (including a blood draw) rather than a person, each diagnosis is masked or revealed depending on the year of the patient's blood draw. We therefore merge the blood data with the diagnosis data.

## Filter for relevant ICD codes only

In [None]:
# =============================================================================
# STEP 1: FILTER DIAGNOSIS DATA FOR RELEVANT ICD CODES ONLY
# =============================================================================

cat("\nOriginal diagnosis data dimensions:", nrow(df_diagnosis), "rows\n")

# Normalise the codes of interest like the diagnosis codes (dots removed) and
# discard NA / empty entries. An empty alternative in the pattern would match
# every row and silently disable the filter.
icd_prefixes <- gsub(".", "", as.character(all_icd_codes), fixed = TRUE)
icd_prefixes <- unique(icd_prefixes[!is.na(icd_prefixes) & nzchar(icd_prefixes)])

if (length(icd_prefixes) == 0) {
  stop("No ICD codes of interest available; check the ICD_Groups and ICD_Singles sheets.")
}

# Create pattern for filtering (codes that START WITH any of our ICD codes)
pattern <- paste0("^(", paste(icd_prefixes, collapse = "|"), ")")

# Run the regex once per distinct code instead of once per row, then keep the
# rows whose code matched. NA codes never match and are therefore dropped.
codes_present <- unique(df_diagnosis$source_concept_code)
relevant_codes <- codes_present[grepl(pattern, codes_present, perl = TRUE)]

# Filter diagnosis data
df_diagnosis_filtered <- df_diagnosis %>%
  filter(source_concept_code %in% relevant_codes)

cat("Filtered diagnosis data dimensions:", nrow(df_diagnosis_filtered), "rows\n")

# Guard against 0 / 0 when the diagnosis table is empty
reduction_pct <- if (nrow(df_diagnosis) > 0) {
  round((1 - nrow(df_diagnosis_filtered) / nrow(df_diagnosis)) * 100, 1)
} else {
  0
}
cat("Reduction:", reduction_pct, "% removed\n")

# Show what ICD codes we actually have in the filtered data
filtered_codes <- unique(df_diagnosis_filtered$source_concept_code)
cat("Unique ICD codes found in filtered data:", length(filtered_codes), "\n")

## Merge diagnosis data with blood data

In [None]:
# =============================================================================
# STEP 2: SIMPLE MERGE WITH BLOOD DATA
# =============================================================================

# Rename the year column in the blood data to avoid confusion with the
# diagnosis year. The check keeps the cell re-runnable: on a second run the
# column is already called blood_year and rename() would fail on `year`.
if ("year" %in% names(df_blood_dates)) {
  df_blood_dates <- df_blood_dates %>%
    rename(blood_year = year)
}
stopifnot("blood_year" %in% names(df_blood_dates))

cat("\nMerging blood and diagnosis data...\n")
cat("Blood data dimensions:", nrow(df_blood_dates), "rows\n")
cat("Filtered diagnosis dimensions:", nrow(df_diagnosis_filtered), "rows\n")

# Simple merge by person_id (keep all blood measurements). Every blood visit
# of a person is paired with every filtered diagnosis of that person; visits
# without a diagnosis are kept with NA in the diagnosis columns.
# NOTE(review): allow.cartesian is an argument of data.table's merge method;
# it presumably has no effect if both inputs are tibbles/data.frames — confirm.
df_diag_blood <- merge(
  df_blood_dates,
  df_diagnosis_filtered,
  by = "person_id",
  all.x = TRUE,
  allow.cartesian = TRUE
)

cat("Merged data dimensions:", nrow(df_diag_blood), "rows\n")



## Calculate difftime (year of diag and year of blood measurement)

In [None]:
# =============================================================================
# STEP 3: CALCULATE TIME DIFFERENCES AND FILTER
# =============================================================================

# Add the diagnosis year and the signed distance to the blood measurement:
#   difftime > 0  -> diagnosis was made before the blood draw
#   difftime < 0  -> diagnosis was made after the blood draw
df_diag_blood <- df_diag_blood %>%
  mutate(
    diagnosis_year = year(condition_start_datetime),
    difftime = blood_year - diagnosis_year
  )

cat("\nCalculating time differences...\n")
cat("Rows before time filtering:", nrow(df_diag_blood), "\n")

# Timeframe in years (e.g. 5). It is expected to be defined already, by default
# from the project_config entry of the entity that is being explored
# (project_configs -> cca -> timeframe).
stopifnot(is.numeric(timeframe), length(timeframe) == 1L, !is.na(timeframe))

# Filter: difftime >= -timeframe  <=>  diagnosis_year <= blood_year + timeframe.
# This keeps every diagnosis made before the blood measurement and those made
# up to `timeframe` years after it; later diagnoses are removed.
# Rows whose difftime is NA (e.g. blood visits without any matching diagnosis)
# are dropped by filter() as well.
# NOTE(review): an earlier comment described this filter as keeping diagnoses
# from (blood_year - timeframe) to blood_year, which is not what the condition
# does — confirm which behaviour is intended.
df_merged <- df_diag_blood %>%
  filter(difftime >= -timeframe)

cat("Rows after time filtering (within", timeframe, "years):", nrow(df_merged), "\n")

# =============================================================================
# STEP 5: DATA QUALITY CHECKS
# =============================================================================

cat("\n=== DATA QUALITY CHECKS ===\n")

# Number of missing values in each key column (label -> column name).
# local() keeps the helper variables out of the notebook's global environment.
local({
  key_columns <- c(
    "Missing person_id:" = "person_id",
    "Missing person_id_year:" = "person_id_year",
    "Missing source_concept_code:" = "source_concept_code",
    "Missing blood_year:" = "blood_year",
    "Missing diagnosis_year:" = "diagnosis_year"
  )
  for (label in names(key_columns)) {
    column_values <- df_merged[[key_columns[[label]]]]
    cat(label, sum(is.na(column_values)), "\n")
  }
})

# Distribution of the distance between blood year and diagnosis year
cat("\nTime difference distribution (blood_year - diagnosis_year):\n")
print(summary(df_merged[["difftime"]]))

# Size of the merged data in visits, persons and ICD codes
cat("Unique person_id_year combinations:", n_distinct(df_merged[["person_id_year"]]), "\n")
cat("Unique persons:", n_distinct(df_merged[["person_id"]]), "\n")
cat("Unique ICD codes in final data:", n_distinct(df_merged[["source_concept_code"]]), "\n")

# =============================================================================
# STEP 6: PREVIEW RESULTS
# =============================================================================

# First ten rows of the merged table
cat("\n=== PREVIEW OF MERGED DATA ===\n")
df_merged %>%
  head(10) %>%
  print()

# Columns available for the next steps
cat("\nColumn names in final merged data:\n")
print(names(df_merged))

cat("\nData is ready for ICD Groups and ICD Singles processing!\n")

## Merge on individual ICD Codes

### ICD Groups (Multiple codes together for one final feature, n to 1)

In [None]:
# Join the time-filtered diagnoses (df_merged) onto the ICD_Groups metadata and
# build two summaries:
#   sum_groups_before - occurrences per individual Diagnosis label
#   sum_groups_after  - persons per Group, after the codes of a group have been
#                       collapsed into one 0/1 feature
#
# The join starts from the metadata, so an ICD code without any patient yields
# one row with person_id = NA. Those rows are kept until after the reshape (so
# every Diagnosis / Group still gets a column) and are then removed, so that
# they neither inflate the sums nor end up as an artificial person.
pat_icds_labelled <- ICD_Groups %>%
  left_join(df_merged, by = c("ICD10" = "source_concept_code")) %>%
  distinct()

# Wide table: one row per person, one column per Diagnosis, values = number of
# joined rows (NA where the person has no such diagnosis).
# NOTE(review): df_merged presumably holds one row per blood visit and
# diagnosis, so these counts are rows rather than persons — confirm.
df_icd_groups_before <- pat_icds_labelled %>%
  count(Diagnosis, person_id, name = "occurence") %>%
  pivot_wider(names_from = Diagnosis, values_from = occurence) %>%
  filter(!is.na(person_id)) %>%
  arrange(person_id)

# NOTE(review): counts below 20 are masked as "<20"; check whether the
# applicable data-dissemination policy also requires masking a count of
# exactly 20.
occurrence_before <- colSums(select(df_icd_groups_before, -person_id), na.rm = TRUE)
sum_groups_before <- data.frame(
  Diagnosis = names(occurrence_before),
  Occurrence = ifelse(occurrence_before < 20, "<20", as.character(occurrence_before)),
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Count per Group and person, reshape the long, narrow table into a wide one
# and turn the counts into 0/1 indicators. Only the feature columns are
# binarised; person_id is left untouched.
# NOTE(review): the result has one row per person, not per visit
# (person_id_year) — confirm this is the intended level for the prospective
# features.
df_icd_groups <- pat_icds_labelled %>%
  count(Group, person_id, name = "occurence") %>%
  pivot_wider(names_from = Group, values_from = occurence) %>%
  filter(!is.na(person_id)) %>%
  arrange(person_id) %>%
  mutate(across(-person_id, ~ as.integer(!is.na(.x) & .x > 0)))

persons_per_group <- colSums(select(df_icd_groups, -person_id))
sum_groups_after <- data.frame(
  Diagnosis = names(persons_per_group),
  Occurrence = ifelse(persons_per_group < 20, "<20", as.character(persons_per_group)),
  row.names = NULL,
  stringsAsFactors = FALSE
)


sum_groups_before 
sum_groups_after

write_xlsx(sum_groups_before, file.path(suppl_path, "/Sum_Group_Diagnosis_Distinct_AOU.xlsx"))
write_xlsx(sum_groups_after, file.path(suppl_path, "/Sum_Group_Diagnosis_Summarized_AOU.xlsx"))

### ICD Singles

In [None]:
# Join the time-filtered diagnoses (df_merged) onto the ICD_Singles metadata.
# The join starts from the metadata, so an ICD code without any patient yields
# one row with person_id = NA. Those rows are kept until after the reshape (so
# every Diagnosis still gets a column) and are then removed, so that they
# neither inflate the sums nor end up as an artificial person.
pat_icds_labelled <- ICD_Singles %>%
  left_join(df_merged, by = c("ICD10" = "source_concept_code")) %>%
  distinct()

# Count per Diagnosis and person, reshape the long, narrow table into a wide
# one and turn the counts into 0/1 indicators. Only the feature columns are
# binarised; person_id is left untouched.
# NOTE(review): the result has one row per person, not per visit
# (person_id_year) — confirm this is the intended level for the prospective
# features.
df_icd_singles <- pat_icds_labelled %>%
  count(Diagnosis, person_id, name = "occurence") %>%
  pivot_wider(names_from = Diagnosis, values_from = occurence) %>%
  filter(!is.na(person_id)) %>%
  arrange(person_id) %>%
  mutate(across(-person_id, ~ as.integer(!is.na(.x) & .x > 0)))

# Persons per single diagnosis.
# NOTE(review): counts below 20 are masked as "<20"; check whether the
# applicable data-dissemination policy also requires masking a count of
# exactly 20.
persons_per_single <- colSums(select(df_icd_singles, -person_id))
sum_singles <- data.frame(
  Diagnosis = names(persons_per_single),
  Occurrence = ifelse(persons_per_single < 20, "<20", as.character(persons_per_single)),
  row.names = NULL,
  stringsAsFactors = FALSE
)

sum_singles

write_xlsx(sum_singles, file.path(suppl_path, "/Sum_Single_Diagnosis_AOU.xlsx"))

### Merge Singles and Groups

In [None]:
# Uncomment to inspect the two input tables before joining:
#df_icd_singles
#df_icd_groups

# Combine single and grouped diagnosis features per person. A person missing
# from one of the tables gets 0 in that table's columns.
df_icd <- df_icd_singles %>%
  full_join(df_icd_groups, by = "person_id") %>%
  mutate(across(everything(), function(column) replace_na(column, 0)))
df_icd

## Export Prospective df_icd

In [None]:
# Write the combined diagnosis feature table (df_icd) to df_diagnosis_filtered.csv.
write_csv(df_icd, paste0(data_path, "/dataframes/df_diagnosis_filtered.csv"))

# Export df_diagnosis (time-agnostic)

