# Import libraries and data

## Libraries

In [None]:
# Load "config.R" for utility functions.
# Sourcing it is also expected to trigger loading of:
#
#     user_config.JSON (including key for project_config)
#     project_config.JSON
#     preprocessing_visualizations.R
#     preprocessing_functions.R
#
# NOTE(review): the file is sourced as "config.r" (lower-case extension) while
# this comment refers to "config.R" -- confirm the actual filename, since the
# difference matters on case-sensitive file systems.
# `user` is assigned before sourcing; presumably config.r reads it to select
# the user-specific settings -- TODO confirm.
user <- "Jan" 
source("config.r")



#If certain packages not installed yet via requirements.txt, install them here via
# install.packages("package_name")

In [None]:

# albumin <- data.table::fread("qc_labs_long/AllofUs_v7_labs-Albumin_wout_outliers_long_052724.txt", sep="\t")
# head(albumin)

In [None]:
# Test call: look up one aggregated lab table by name.
# NOTE(review): `aggregated_data_list` is created in the "Data load" cell
# further down, so on a fresh kernel this cell only works after that cell ran.
# `[[` matches list names exactly, so this yields NULL unless a lab is stored
# under the exact name "Alk".
aggregated_data_list[["Alk"]]

## Metadata load

In [None]:
# Read the column-mapping workbook located in `data_path` and list its columns.
metadata <- read_xlsx(file.path(data_path, "columnmapper_aou.xlsx"))
colnames(metadata)

# Extract the expected lab names from the metadata: the distinct
# `column_name_aou` values of all rows whose `source_df` equals "df_blood".
bloodvalues <- unique(metadata$column_name_aou[metadata$source_df == "df_blood"])
bloodvalues

## Data load

In [None]:
# Pre-loading check: list which lab files exist and which columns they carry.

# Folder holding one tab-separated .txt file per lab
folder_path <- paste0(data_path, "/dataframes/blood/")
files <- list.files(path = folder_path, pattern = "\\.txt$", full.names = TRUE)

# Loop through the files and print each file name followed by its column names
for (file in files) {
  cat("📄 Datei:", basename(file), "\n")
  dt <- fread(file, sep = "\t", nrows = 5)  # Read only the first 5 rows (faster)
  print(colnames(dt))
  cat("\n--------------------------\n\n")
}


In [None]:
# Read every lab file, aggregate it to one mean value per person and year, and
# collect both the aggregated tables and a per-lab summary.
summary_list <- list()
aggregated_data_list <- list()

# Only non-missing, non-empty lab names can serve as match patterns. An NA
# name would otherwise produce an NA match that slips through the length
# checks below.
lab_candidates <- bloodvalues[!is.na(bloodvalues) & nzchar(bloodvalues)]

# Columns the aggregation below relies on
required_cols <- c("person_id", "value_as_number")

for (file in files) {
  file_base <- basename(file)

  # Step 1: strong match — exact name after 'labs-' and before '_'.
  # fixed() has to wrap the complete pattern: paste0() drops the modifier, and
  # the lab name would then be interpreted as a regular expression.
  matched_lab <- lab_candidates[
    str_detect(file_base, fixed(paste0("labs-", lab_candidates, "_")))
  ]

  # Step 2: fallback to general substring match if nothing matched above
  if (length(matched_lab) == 0) {
    matched_lab <- lab_candidates[str_detect(file_base, fixed(lab_candidates))]
  }

  # Final check: exactly one lab name must remain
  if (length(matched_lab) == 0) {
    warning(paste("Skipping", file, "- no match with bloodvalue names."))
    next
  }

  if (length(matched_lab) > 1) {
    warning(paste("Multiple matches in", file, ":", paste(matched_lab, collapse = ", ")))
    next
  }

  lab_name <- matched_lab

  # Read data
  dt <- fread(file, sep = "\t")

  if (!"measurement_datetime" %in% colnames(dt)) {
    warning(paste("Skipping", file, "- 'measurement_datetime' not found."))
    next
  }

  # Skip (instead of aborting the whole loop) when a column needed for the
  # aggregation is absent
  missing_cols <- setdiff(required_cols, colnames(dt))
  if (length(missing_cols) > 0) {
    warning(paste("Skipping", file, "- missing column(s):", paste(missing_cols, collapse = ", ")))
    next
  }

  # Parse the timestamp and drop rows where parsing failed
  dt[, measurement_datetime := as.POSIXct(measurement_datetime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")]
  dt <- dt[!is.na(measurement_datetime)]

  # Time grouping: calendar year plus a combined person/year key
  dt[, year := year(measurement_datetime)]
  dt[, person_id_year := paste0(person_id, "_", year)]

  # Aggregate: mean lab value per person and year
  agg_dt <- dt[, .(mean_value = mean(value_as_number, na.rm = TRUE)), by = .(person_id_year, person_id, year)]
  aggregated_data_list[[lab_name]] <- agg_dt

  # Summary of the rows that survived timestamp parsing
  summary_list[[lab_name]] <- data.table(
    lab = lab_name,
    total_entries = nrow(dt),
    missing_values = sum(is.na(dt$value_as_number)),
    unique_persons = uniqueN(dt$person_id)
  )
}

# Compile summary
summary_table <- rbindlist(summary_list, use.names = TRUE)
print(summary_table)

In [None]:
# Inspect the aggregated per-person/year table stored under "Bilirubin-Total".
aggregated_data_list[["Bilirubin-Total"]]

# Merging

## Functions

In [None]:
#' Summarise missingness per lab column
#'
#' Counts the NAs in every column except the key columns `person_id`, `year`
#' and `person_id_year`.
#'
#' @param df A data.frame or data.table.
#' @return A data.table with columns `variable`, `NAs` and `NA_fraction`
#'   (rounded to 3 digits), sorted by decreasing `NA_fraction`. Has zero rows
#'   when `df` contains nothing but key columns.
na_summary <- function(df) {
  df <- as.data.table(df)  # ensure data.table structure

  lab_cols <- setdiff(names(df), c("person_id", "year", "person_id_year"))
  total <- nrow(df)

  # vapply() always yields an integer vector, also for zero lab columns, where
  # sapply() would return an empty list and break the division below.
  na_counts <- vapply(lab_cols, function(col) sum(is.na(df[[col]])), integer(1))
  na_fraction <- round(na_counts / total, 3)

  result <- data.table(
    variable = lab_cols,
    NAs = unname(na_counts),
    NA_fraction = unname(na_fraction)
  )
  result[order(-NA_fraction)]
}
                      
#' Create a lab matrix for a given timeframe, keeping the most recent year per person
#'
#' Restricts `df` to a year window and, for each person, keeps the single row
#' with the highest year inside that window. With only `end_year`, every year
#' <= `end_year` qualifies; with `start_year` as well, only years inside the
#' closed interval [`start_year`, `end_year`] qualify. Prints the number of
#' persons, the first person/year pairs and an NA summary as a side effect.
#'
#' @param df A data.table or data.frame with at least columns: person_id, year, person_id_year, and lab variables
#' @param end_year Integer. The last year to include in the filter (inclusive)
#' @param start_year Integer or NULL. The first year to include in the filter (inclusive). Default is NULL.
#' @return A data.table with one row per person.
create_timeframe_matrix <- function(df, end_year, start_year = NULL) {
  lab_dt <- as.data.table(df)  # ensure consistent behavior

  # Apply the year window
  in_window <- if (is.null(start_year)) {
    lab_dt[year <= end_year]
  } else {
    lab_dt[year >= start_year & year <= end_year]
  }

  # Newest year first within each person, so the first row per group is the
  # most recent one
  in_window <- in_window[order(person_id, -year)]
  latest_per_person <- in_window[, .SD[1], by = person_id]

  cat("✅ Timeframe created with", nrow(latest_per_person), "patients\n")
  print(head(latest_per_person[, .(person_id, year)]))
  print(na_summary(latest_per_person))

  return(latest_per_person)
}

## Merge blood values into yearwise df_blood and save
This step is only necessary once per workspace; afterwards you can load df_blood from the data folder.

In [None]:
# Build one wide table with a column per lab, keyed by person and year.
key_cols <- c("person_id", "year", "person_id_year")
df_blood <- NULL

for (name in names(aggregated_data_list)) {
  # Work on a copy so the stored list entry is not modified by reference
  df <- copy(aggregated_data_list[[name]])

  # The aggregated value column takes the name of its lab
  setnames(df, old = "mean_value", new = name)

  # Normalise the key types, then rebuild the combined key from them
  df[, `:=`(person_id = as.character(person_id), year = as.integer(year))]
  df[, person_id_year := paste0(person_id, "_", year)]

  # Keep the keys followed by the lab column
  df <- df[, c(key_cols, name), with = FALSE]

  # Full outer join: keep every person/year seen in any lab
  df_blood <- if (is.null(df_blood)) {
    df
  } else {
    merge(df_blood, df, by = key_cols, all = TRUE)
  }
}

# Provide an overview of df_blood and export it
head(df_blood)
dim(df_blood)
colnames(df_blood)
write_csv(df_blood, paste0(data_path, "/dataframes/df_blood_raw.csv"))

In [None]:
# List the columns of the merged lab table.
colnames(df_blood)

# Imputation

## Load covariates for imputation

In [None]:
# Read the tab-separated covariate table and keep only the person key plus the
# three covariates SEX, AGE and BMI.
df_covariates <- data.table::fread(paste0(data_path, "/dataframes/df_covariates_1.7.txt"), sep="\t") %>% select("person_id", "SEX", "AGE", "BMI")

head(df_covariates)

## Load blood for imputation

In [None]:
# Load labs from storage
df_blood <- read_csv(paste0(data_path, "/dataframes/df_blood_raw.csv"))
# Inner join on person_id: rows without a covariate entry are dropped
# (merge() is called without `all`).
df_blood <- merge(df_blood, df_covariates, by="person_id")
head(df_blood)
dim(df_blood)


# Check the fraction of NAs per column and export that as supplementary file
na_blood_summary <- na_summary(df_blood)
na_blood_summary
write_xlsx(na_blood_summary, paste0(suppl_path, "/NA_Blood_Summary.xlsx"))



## Optional: Remove rows with too many NAs

In [None]:
# df_nan <- read_xlsx("HCC/df_nan.xlsx") #DF extracted before with > 25 NAs per row on relevant data
# dim(df_nan)
# head(df_nan)

# df_blood <- semi_join(df_blood, df_nan, by = "person_id")

# dim(df_blood)
# head(df_blood)

# length(unique(df_blood$person_id))

## Remove unnecessary columns to increase imputation speed

This is optional, and mostly dependent on the columns you are actually looking to integrate into your model

In [None]:
# List the available columns before choosing which ones to keep.
colnames(df_blood)

In [None]:
# Lab columns that must be available for the model
mandatory_columns <- c(
  "ALT", "AST", "Platelet-counts", "Glucose", "Mean-Corpuscular-Volume",
  "Alk-Phos", "Albumin", "Hematocrit", "Leukocyte-counts", "Calcium",
  "GGT", "Triglycerides"
)

# Key columns and covariates that accompany the labs
metadata_columns <- c("person_id", "year", "person_id_year", "SEX", "AGE", "BMI")

columns_to_keep <- union(mandatory_columns, metadata_columns)

# Subset the dataframe. select() + all_of() behaves the same for data.frames,
# tibbles and data.tables, whereas `df_blood[, columns_to_keep]` returns the
# character vector itself when df_blood happens to be a data.table. A missing
# column still raises an error.
df_blood <- dplyr::select(df_blood, dplyr::all_of(columns_to_keep))

head(df_blood)

na_summary(df_blood)

## Timeframe collections

In [None]:
# Build one most-recent-row-per-person table for each time window.
# NOTE(review): create_timeframe_matrix() treats both bounds as inclusive, so
# the boundary years 2010, 2015 and 2020 each fall into two windows -- confirm
# that this overlap is intended.
df_blood_pre2010 <- create_timeframe_matrix(df_blood, 2010)
df_blood2010_2015 <-create_timeframe_matrix(df_blood, end_year= 2015, start_year=2010)
df_blood2015_2020 <-create_timeframe_matrix(df_blood, end_year= 2020, start_year=2015)
df_blood2020_2025 <-create_timeframe_matrix(df_blood, end_year= 2025, start_year=2020)

# Persist every window as CSV (without row names) so imputation can start
# from disk.
write.csv(df_blood_pre2010,  file = paste0(data_path, "/dataframes/df_blood_pre2010.csv"), row.names = FALSE)
write.csv(df_blood2010_2015, file = paste0(data_path, "/dataframes/df_blood2010_2015.csv"), row.names = FALSE)
write.csv(df_blood2015_2020, file = paste0(data_path, "/dataframes/df_blood2015_2020.csv"), row.names = FALSE)
write.csv(df_blood2020_2025, file = paste0(data_path, "/dataframes/df_blood2020_2025.csv"), row.names = FALSE)







In [None]:
# Rebuild only the 2015-2020 window (same call as in the cell above).
df_blood2015_2020 <-create_timeframe_matrix(df_blood, end_year= 2020, start_year=2015)

In [None]:
# Re-load the pre-2010 table from disk and inspect its missingness.
# NOTE(review): this reads the relative path "data/..." while the table is
# written to paste0(data_path, "/dataframes/...") elsewhere in this notebook --
# confirm both refer to the same file.
df_blood_pre2010 <- read_csv("data/df_blood_pre2010.csv")


na_summary(df_blood_pre2010)

In [None]:
# Load a stored 2010-2015 table and inspect its missingness.
# NOTE(review): the "_i" suffix presumably marks the imputed version -- TODO
# confirm. The relative path "data/..." also differs from the
# paste0(data_path, "/dataframes/...") location used elsewhere.
df_blood2010_2015_i <- read_csv("data/df_blood2010_2015_i.csv")


na_summary(df_blood2010_2015_i)

New heading

## Load dfs to be imputed

In [None]:
# Read the four timeframe tables back from disk; fread() returns data.tables.
df_blood_pre2010   <- fread(paste0(data_path, "/dataframes/df_blood_pre2010.csv"))
df_blood2010_2015 <- fread(paste0(data_path, "/dataframes/df_blood2010_2015.csv"))
df_blood2015_2020 <- fread(paste0(data_path, "/dataframes/df_blood2015_2020.csv"))
df_blood2020_2025 <- fread(paste0(data_path, "/dataframes/df_blood2020_2025.csv"))


In [None]:
# Debug: look for columns that can make the imputation model singular.

# Variance and correlation are only defined for numeric columns.
numeric_cols <- vapply(df_blood_pre2010, is.numeric, logical(1))

# Check for constant columns. isTRUE() maps an NA variance (fewer than two
# observed values) to FALSE so that it cannot produce NA entries in the
# logical index used below.
constant_check <- vapply(
  df_blood_pre2010[, numeric_cols, with = FALSE],
  function(x) isTRUE(var(x, na.rm = TRUE) == 0),
  logical(1)
)
print(names(constant_check)[constant_check])

# Check correlation matrix (rows with any NA are dropped before computing it)
cor_matrix <- cor(df_blood_pre2010[, numeric_cols, with = FALSE], use = "complete.obs")

## Start MICE Imputation

In [None]:
# OG Function, takes long


#' Run MICE imputation with optional test mode and compute time estimate
#'
#' Builds the imputation input from the metadata columns, the mandatory
#' columns, any explicitly requested imputation targets and every other
#' column whose NA fraction does not exceed `max_na_frac`. Imputes with
#' `mice::mice()` and returns the first completed data set.
#'
#' @param df The full input data.frame or data.table
#' @param max_na_frac Maximum tolerated NA fraction (default = 0.5)
#' @param mandatory_columns Columns to always retain regardless of NA content
#' @param impute_only_columns If NULL, every non-metadata column of the
#'   imputation input is an imputation target; otherwise the vector of columns
#'   used as imputation targets
#' @param metadata_columns Columns to retain but exclude from imputation
#' @param m Number of imputations (default = 5)
#' @param maxit Max number of iterations (default = 5)
#' @param method_type MICE method for imputation (default = "pmm")
#' @param seed Random seed for reproducibility
#' @param test_run Logical. If TRUE, imputes only the first (up to) 500 rows
#'   with m = 1 and maxit = 2 and prints an extrapolated full run time
#'
#' @return data.table with the first completed data set (or the completed
#'   sample if test_run = TRUE), carrying the original column names
run_mice_imputation <- function(df,
                                max_na_frac = 0.5,
                                mandatory_columns = c(),
                                impute_only_columns = NULL,
                                metadata_columns = c("person_id", "year", "person_id_year"),
                                m = 5,
                                maxit = 5,
                                method_type = "pmm",
                                seed = 42,
                                test_run = FALSE) {

  library(data.table)
  library(mice)

  df <- as.data.table(df)
  all_cols <- names(df)

  # NA fraction of every non-metadata column
  candidate_cols <- setdiff(all_cols, metadata_columns)
  na_frac <- vapply(candidate_cols, function(col) mean(is.na(df[[col]])), numeric(1))
  keep_cols <- candidate_cols[which(na_frac <= max_na_frac)]

  # Combine all relevant columns, but only include those that exist in df.
  # Mandatory columns are listed explicitly so that they survive the
  # missingness filter, as documented.
  final_cols <- intersect(
    unique(c(metadata_columns, mandatory_columns, impute_only_columns, keep_cols)),
    all_cols
  )

  cat("✅ Final imputation input will include", length(final_cols), "columns:\n")
  print(final_cols)

  # Prepare data
  mice_input <- as.data.frame(df[, final_cols, with = FALSE])

  # Imputation targets: everything but metadata, unless restricted explicitly
  imputable_cols <- if (is.null(impute_only_columns)) {
    setdiff(final_cols, metadata_columns)
  } else {
    intersect(impute_only_columns, final_cols)
  }

  # Method vector: "" marks columns that mice must not impute
  method_vec <- setNames(rep("", length(final_cols)), final_cols)
  method_vec[imputable_cols] <- method_type

  # Sanitize column names (e.g. dashes) and remember the mapping so the
  # original names can be restored on the result
  safe_names <- make.names(final_cols, unique = TRUE)
  name_map <- setNames(final_cols, safe_names)
  names(mice_input) <- safe_names
  names(method_vec) <- safe_names

  # Internal helper: map sanitized names back to the original names
  restore_original_names <- function(dt) {
    matched <- intersect(names(dt), names(name_map))
    setnames(dt, old = matched, new = unname(name_map[matched]))
    dt
  }

  if (test_run) {
    test_n <- min(500, nrow(mice_input))
    cat("🧪 Running test MICE on", test_n, "rows to estimate compute time...\n")

    start_time <- Sys.time()
    test_result <- mice::mice(mice_input[seq_len(test_n), , drop = FALSE],
                              m = 1, maxit = 2, method = method_vec, seed = seed)
    end_time <- Sys.time()

    # Linear extrapolation over rows, iterations and number of imputations
    elapsed <- as.numeric(difftime(end_time, start_time, units = "secs"))
    estimated_full <- (elapsed / test_n) * nrow(mice_input) * (maxit / 2) * (m / 1)

    cat(sprintf("⏱️ Test run took %.2f seconds.\n", elapsed))
    cat(sprintf("🧮 Estimated full run time: %.1f seconds (~%.1f minutes)\n", estimated_full, estimated_full / 60))

    # mice::complete is namespaced because tidyr exports a `complete` too
    df_test_imputed <- as.data.table(mice::complete(test_result, action = 1))
    return(restore_original_names(df_test_imputed))
  }

  cat("🚀 Running full MICE on", nrow(mice_input), "rows and", length(imputable_cols), "variables...\n")
  mice_model <- mice::mice(mice_input, m = m, maxit = maxit, method = method_vec, seed = seed)

  # Only the first of the m completed data sets is returned
  df_imputed <- as.data.table(mice::complete(mice_model, action = 1))
  restore_original_names(df_imputed)
}


# Column names are sanitized with make.names() before imputation (dashes become
# dots). This helper renames such sanitized columns of `df` back to the
# original spelling given in `reference_names`. Columns that match no
# sanitized reference name are left untouched. Renaming goes through
# data.table::setnames().
restore_names <- function(df, reference_names) {
  # Lookup: sanitized name -> original name
  sanitized <- make.names(reference_names, unique = TRUE)
  lookup <- setNames(reference_names, sanitized)

  # Rename only the columns that are present in the lookup
  to_rename <- intersect(names(df), names(lookup))
  setnames(df, old = to_rename, new = lookup[to_rename])
  return(df)
}
                      

In [None]:
#Claude function debug

#' Run MICE imputation with a missingness-dependent method per column
#'
#' Builds the imputation input from the metadata columns, the mandatory
#' columns, any explicitly requested imputation targets and every other
#' column whose NA fraction does not exceed `max_na_frac`. Each target is
#' assigned "cart" (> 30 % missing), "rf" (> 10 % missing) or "pmm"
#' (otherwise), then `mice::mice()` is run and the first completed data set
#' is returned.
#'
#' @param df The full input data.frame or data.table
#' @param max_na_frac Maximum tolerated NA fraction (default = 0.5)
#' @param mandatory_columns Columns to always retain regardless of NA content
#' @param impute_only_columns If NULL, every non-metadata column of the
#'   imputation input is a target; otherwise the vector of target columns
#' @param metadata_columns Columns to retain but exclude from imputation
#' @param m Number of imputations (default = 5)
#' @param maxit Max number of iterations (default = 5)
#' @param seed Random seed for reproducibility
#' @param test_run Logical. If TRUE, imputes only the first (up to) 1000 rows
#'   with m = 1 and maxit = 2 and prints an extrapolated full run time
#' @param optimize_cart Logical. If TRUE, sets the options `mice.cart.cp`,
#'   `mice.cart.minbucket` and `mice.cart.maxdepth` for the duration of the
#'   call
#'
#' @return data.table with the first completed data set (or the completed
#'   sample if test_run = TRUE), carrying the original column names
run_mice_imputation <- function(df,
                                      max_na_frac = 0.5,
                                      mandatory_columns = c(),
                                      impute_only_columns = NULL,
                                      metadata_columns = c("person_id", "year", "person_id_year"),
                                      m = 5,
                                      maxit = 5,
                                      seed = 42,
                                      test_run = FALSE,
                                      optimize_cart = TRUE) {

  library(data.table)
  library(mice)

  df <- as.data.table(df)
  all_cols <- names(df)

  # Filter columns based on missingness
  candidate_cols <- setdiff(all_cols, metadata_columns)
  na_frac <- vapply(candidate_cols, function(col) mean(is.na(df[[col]])), numeric(1))
  keep_cols <- candidate_cols[which(na_frac <= max_na_frac)]

  # Mandatory columns are listed explicitly so that they survive the
  # missingness filter.
  final_cols <- intersect(
    unique(c(metadata_columns, mandatory_columns, impute_only_columns, keep_cols)),
    all_cols
  )

  cat("✅ Final imputation input will include", length(final_cols), "columns\n")

  # Prepare data
  mice_input <- as.data.frame(df[, final_cols, with = FALSE])

  # Imputation targets: everything but metadata, unless restricted explicitly
  imputable_cols <- if (is.null(impute_only_columns)) {
    setdiff(final_cols, metadata_columns)
  } else {
    intersect(impute_only_columns, final_cols)
  }

  # Hybrid method assignment driven by each target's NA share;
  # "" marks columns that mice must not impute
  method_vec <- setNames(rep("", length(final_cols)), final_cols)
  for (col in imputable_cols) {
    na_prop <- mean(is.na(mice_input[[col]]))
    method_vec[col] <- if (na_prop > 0.3) {
      "cart"   # high missingness
    } else if (na_prop > 0.1) {
      "rf"     # medium missingness
    } else {
      "pmm"    # low missingness
    }
  }

  # CART tuning options. They are restored on exit so the call leaves no
  # global state behind.
  # NOTE(review): mice documents `cp` and `minbucket` as arguments of
  # mice.impute.cart(); confirm that these option names are actually read.
  if (optimize_cart) {
    previous_options <- options(
      mice.cart.cp = 0.01,
      mice.cart.minbucket = 5,
      mice.cart.maxdepth = 5
    )
    on.exit(options(previous_options), add = TRUE)
  }

  # Sanitize column names and remember the mapping for the result
  safe_names <- make.names(final_cols, unique = TRUE)
  name_map <- setNames(final_cols, safe_names)
  names(mice_input) <- safe_names
  names(method_vec) <- safe_names

  # Internal helper: map sanitized names back to the original names
  restore_original_names <- function(dt) {
    matched <- intersect(names(dt), names(name_map))
    setnames(dt, old = matched, new = unname(name_map[matched]))
    dt
  }

  if (test_run) {
    test_n <- min(1000, nrow(mice_input))
    cat("🧪 Running test MICE on", test_n, "rows to estimate compute time...\n")

    start_time <- Sys.time()
    test_result <- mice::mice(mice_input[seq_len(test_n), , drop = FALSE],
                              m = 1,
                              maxit = 2,
                              method = method_vec,
                              seed = seed,
                              printFlag = FALSE)
    end_time <- Sys.time()

    # Linear extrapolation over rows, iterations and number of imputations
    elapsed <- as.numeric(difftime(end_time, start_time, units = "secs"))
    estimated_full <- (elapsed / test_n) * nrow(mice_input) * (maxit / 2) * (m / 1)

    cat(sprintf("⏱️ Test run took %.2f seconds.\n", elapsed))
    cat(sprintf("🧮 Estimated full run time: %.1f seconds (~%.1f minutes)\n",
                estimated_full, estimated_full / 60))
    cat("🔍 Method selection:\n")
    print(table(method_vec[method_vec != ""]))

    # mice::complete is namespaced because tidyr exports a `complete` too
    df_test_imputed <- as.data.table(mice::complete(test_result, action = 1))
    return(restore_original_names(df_test_imputed))
  }

  cat("🚀 Running full MICE on", nrow(mice_input), "rows and", length(imputable_cols), "variables...\n")
  cat("🔍 Using methods:\n")
  print(table(method_vec[method_vec != ""]))

  mice_model <- mice::mice(mice_input,
                           m = m,
                           maxit = maxit,
                           method = method_vec,
                           seed = seed,
                           printFlag = FALSE)

  # Only the first of the m completed data sets is returned
  df_imputed <- as.data.table(mice::complete(mice_model, action = 1))
  restore_original_names(df_imputed)
}

In [None]:
# Fix for MICE singularity errors
#' Run MICE imputation with pre-cleaning and a fallback for singular fits
#'
#' Column roles:
#' * mandatory columns are always kept and imputed by default;
#' * metadata columns are always kept, never imputed and never modified;
#' * all remaining ("optional") columns are dropped when they are constant,
#'   redundant (|r| > 0.99 with an earlier optional column) or exceed
#'   `max_na_frac` missing values.
#'
#' Before imputation, extreme values (outside Q1/Q3 -/+ 3 * IQR) of
#' non-metadata numeric columns are set to NA, provided they make up less
#' than 5 % of the rows. If `mice::mice()` fails with a message containing
#' "singular", `run_mice_fallback()` is used instead.
#'
#' @param df The full input data.frame or data.table
#' @param max_na_frac Maximum tolerated NA fraction for optional columns
#' @param mandatory_columns Columns to always retain regardless of NA content
#' @param impute_only_columns If not NULL, these columns replace the filtered
#'   optional columns in the input and replace the mandatory columns as
#'   default imputation targets
#' @param metadata_columns Columns to retain but exclude from imputation
#' @param m Number of imputations (default = 5)
#' @param maxit Max number of iterations (default = 5)
#' @param method_type "cart" imputes every target with "cart"; any other
#'   value imputes numeric targets with "norm" and all others with "logreg"
#' @param seed Random seed for reproducibility
#' @param test_run Logical. If TRUE, imputes only the first (up to) 1000 rows
#'   with m = 1 and maxit = 2 and prints an extrapolated full run time
#'
#' @return data.table with the first completed data set (or the completed
#'   sample if test_run = TRUE), carrying the original column names
run_mice_imputation <- function(df,
                                      max_na_frac = 0.5,
                                      mandatory_columns = c(),
                                      impute_only_columns = NULL,
                                      metadata_columns = c("person_id", "year", "person_id_year"),
                                      m = 5,
                                      maxit = 5,
                                      method_type = "cart",
                                      seed = 42,
                                      test_run = FALSE) {

  library(data.table)
  library(mice)

  df <- as.data.table(df)
  all_cols <- names(df)

  # Step 1: filter the optional columns ------------------------------------
  optional_cols <- setdiff(all_cols, c(metadata_columns, mandatory_columns))

  # Constant columns carry no information (never applied to mandatory columns)
  if (length(optional_cols) > 0) {
    is_constant <- vapply(optional_cols, function(col) {
      x <- df[[col]]
      if (is.numeric(x)) {
        v <- var(x, na.rm = TRUE)
        is.na(v) || v == 0
      } else {
        length(unique(na.omit(x))) <= 1
      }
    }, logical(1))

    if (any(is_constant)) {
      remove_constant <- optional_cols[is_constant]
      cat("⚠️ Removing", length(remove_constant), "constant optional columns:", remove_constant, "\n")
      optional_cols <- setdiff(optional_cols, remove_constant)
    }
  }

  # Highly correlated columns (never applied to mandatory columns)
  if (length(optional_cols) > 1) {
    is_num <- vapply(optional_cols, function(col) is.numeric(df[[col]]), logical(1))
    numeric_optional <- optional_cols[is_num]
    if (length(numeric_optional) > 1) {
      cor_matrix <- cor(df[, numeric_optional, with = FALSE], use = "pairwise.complete.obs")
      cor_matrix[is.na(cor_matrix)] <- 0
      # The matrix is symmetric: looking at the upper triangle only visits each
      # pair once, so just the later column of a pair is dropped. Scanning the
      # full matrix would drop both members.
      cor_matrix[lower.tri(cor_matrix, diag = TRUE)] <- 0

      high_cor_pairs <- which(abs(cor_matrix) > 0.99, arr.ind = TRUE)
      if (nrow(high_cor_pairs) > 0) {
        remove_cols <- unique(colnames(cor_matrix)[high_cor_pairs[, 2]])
        cat("⚠️ Removing", length(remove_cols), "highly correlated optional columns:", remove_cols, "\n")
        optional_cols <- setdiff(optional_cols, remove_cols)
      }
    }
  }

  # Missingness filter
  keep_optional <- character(0)
  if (length(optional_cols) > 0) {
    na_frac <- vapply(optional_cols, function(col) mean(is.na(df[[col]])), numeric(1))
    keep_optional <- optional_cols[which(na_frac <= max_na_frac)]
  }

  # Combine final columns: mandatory + metadata + filtered optional
  final_cols <- unique(c(
    mandatory_columns,  # Always included
    metadata_columns,   # Always included
    if (!is.null(impute_only_columns)) impute_only_columns else keep_optional
  ))
  final_cols <- intersect(final_cols, all_cols)

  cat("📋 Column breakdown:\n")
  cat("  - Mandatory (biomarkers):", length(intersect(mandatory_columns, final_cols)), "columns\n")
  cat("  - Metadata (predictors):", length(intersect(metadata_columns, final_cols)), "columns\n")
  cat("  - Optional (filtered):", length(intersect(keep_optional, final_cols)), "columns\n")

  cat("✅ Final imputation input will include", length(final_cols), "columns\n")

  mice_input <- as.data.frame(df[, final_cols, with = FALSE])

  # Step 2: mask extreme outliers for numerical stability -------------------
  # Metadata columns are skipped: they are never imputed, so a masked
  # identifier or year would come back as NA in the result.
  for (col in setdiff(names(mice_input), metadata_columns)) {
    values <- mice_input[[col]]
    if (!is.numeric(values)) next

    q1 <- quantile(values, 0.25, na.rm = TRUE)
    q3 <- quantile(values, 0.75, na.rm = TRUE)
    iqr_width <- q3 - q1
    lower_bound <- q1 - 3 * iqr_width  # 3 * IQR instead of 1.5: less aggressive
    upper_bound <- q3 + 3 * iqr_width

    outliers <- which(values < lower_bound | values > upper_bound)
    # Only mask when the outliers are a small minority (< 5 % of rows)
    if (length(outliers) > 0 && length(outliers) < nrow(mice_input) * 0.05) {
      mice_input[[col]][outliers] <- NA
      cat("ℹ️ Set", length(outliers), "extreme outliers to NA in column", col, "\n")
    }
  }

  # Step 3: choose imputation targets and methods ---------------------------
  # Explicit targets win; otherwise the mandatory columns are imputed
  imputable_cols <- if (!is.null(impute_only_columns)) {
    intersect(impute_only_columns, names(mice_input))
  } else {
    intersect(mandatory_columns, names(mice_input))
  }

  # Optional columns are imputed as well whenever they contain NAs
  optional_in_final <- setdiff(names(mice_input), c(mandatory_columns, metadata_columns))
  has_missing <- vapply(optional_in_final, function(col) anyNA(mice_input[[col]]), logical(1))
  imputable_cols <- unique(c(imputable_cols, optional_in_final[has_missing]))

  cat("🎯 Will impute", length(imputable_cols), "columns:", paste(imputable_cols, collapse = ", "), "\n")
  cat("📌 Metadata columns (predictors only):", paste(intersect(metadata_columns, names(mice_input)), collapse = ", "), "\n")

  # "" marks columns that mice must not impute
  method_vec <- setNames(rep("", ncol(mice_input)), names(mice_input))
  for (col in imputable_cols) {
    method_vec[col] <- if (method_type == "cart") {
      "cart"
    } else if (is.numeric(mice_input[[col]])) {
      "norm"  # Normal model instead of PMM for numerical stability
    } else {
      "logreg"
    }
  }

  # Sanitize column names and remember the mapping for the result
  original_names <- names(mice_input)
  safe_names <- make.names(original_names, unique = TRUE)
  name_map <- setNames(original_names, safe_names)
  names(mice_input) <- safe_names
  names(method_vec) <- safe_names

  # Internal helper: map sanitized names back to the original names
  restore_original_names <- function(dt) {
    matched <- intersect(names(dt), names(name_map))
    setnames(dt, old = matched, new = unname(name_map[matched]))
    dt
  }

  # Step 4: impute, falling back when the model is singular -----------------
  tryCatch({
    if (test_run) {
      test_n <- min(1000, nrow(mice_input))
      cat("🧪 Running test MICE on", test_n, "rows to estimate compute time...\n")

      start_time <- Sys.time()
      test_result <- mice::mice(mice_input[seq_len(test_n), , drop = FALSE],
                                m = 1,
                                maxit = 2,
                                method = method_vec,
                                seed = seed,
                                printFlag = FALSE)
      end_time <- Sys.time()

      # Linear extrapolation over rows, iterations and number of imputations
      elapsed <- as.numeric(difftime(end_time, start_time, units = "secs"))
      estimated_full <- (elapsed / test_n) * nrow(mice_input) * (maxit / 2) * (m / 1)

      cat(sprintf("⏱️ Test run took %.2f seconds.\n", elapsed))
      cat(sprintf("🧮 Estimated full run time: %.1f seconds (~%.1f minutes)\n",
                  estimated_full, estimated_full / 60))

      # mice::complete is namespaced because tidyr exports a `complete` too
      restore_original_names(as.data.table(mice::complete(test_result, action = 1)))
    } else {
      cat("🚀 Running full MICE on", nrow(mice_input), "rows and", length(imputable_cols), "variables...\n")
      mice_model <- mice::mice(mice_input,
                               m = m,
                               maxit = maxit,
                               method = method_vec,
                               seed = seed,
                               printFlag = FALSE)

      # Only the first of the m completed data sets is returned
      restore_original_names(as.data.table(mice::complete(mice_model, action = 1)))
    }
  }, error = function(e) {
    if (grepl("singular", conditionMessage(e))) {
      cat("❌ Singularity error encountered. Trying fallback approach...\n")
      run_mice_fallback(mice_input, method_vec, m, maxit, seed, name_map, metadata_columns)
    } else {
      stop(e)
    }
  })
}

# Fallback function for singularity issues
#' Fallback imputation for singularity issues
#'
#' Re-runs mice with unconditional mean imputation for the numeric target
#' columns. Non-numeric targets are filled afterwards with their most
#' frequent observed value, because mice ships no "mode" method and
#' requesting one makes `mice()` fail. If this attempt errors as well,
#' `run_manual_imputation()` is used.
#'
#' @param mice_input data.frame with sanitized column names
#' @param method_vec Named method vector of the failed run; every entry
#'   other than "" marks an imputation target
#' @param m,maxit,seed Passed on to `mice::mice()`
#' @param name_map Named character vector (sanitized name -> original name)
#' @param metadata_columns Columns that must not be imputed
#' @return data.table with the original column names restored
run_mice_fallback <- function(mice_input, method_vec, m, maxit, seed, name_map, metadata_columns) {

  cat("🔄 Attempting fallback strategy...\n")

  # Split the previous imputation targets by type
  imputable_cols <- names(method_vec)[method_vec != ""]
  is_numeric_target <- vapply(
    imputable_cols,
    function(col) is.numeric(mice_input[[col]]),
    logical(1)
  )
  numeric_targets <- imputable_cols[is_numeric_target]
  categorical_targets <- imputable_cols[!is_numeric_target]

  # Strategy 1: simple mean imputation for numeric targets; "" = not imputed
  simple_methods <- setNames(rep("", ncol(mice_input)), names(mice_input))
  simple_methods[numeric_targets] <- "mean"

  tryCatch({
    mice_model <- mice::mice(mice_input,
                             m = m,
                             maxit = maxit,
                             method = simple_methods,
                             seed = seed,
                             printFlag = FALSE)

    # mice::complete is namespaced because tidyr exports a `complete` too
    completed <- mice::complete(mice_model, action = 1)

    # Mode imputation for the non-numeric targets
    for (col in categorical_targets) {
      missing <- is.na(completed[[col]])
      if (any(missing) && !all(missing)) {
        mode_val <- names(sort(table(completed[[col]]), decreasing = TRUE))[1]
        completed[[col]][missing] <- mode_val
      }
    }

    df_imputed <- as.data.table(completed)
    df_imputed <- restore_names(df_imputed, name_map)

    cat("✅ Fallback imputation completed successfully\n")
    df_imputed

  }, error = function(e2) {
    cat("❌ Fallback also failed. Trying manual imputation...\n")
    run_manual_imputation(mice_input, name_map, metadata_columns)
  })
}

# Manual imputation as last resort
#' Manual imputation as last resort
#'
#' Fills NAs column by column without mice: the median for numeric columns
#' and the most frequent observed value for all other columns. Metadata
#' columns are skipped. Columns without any observed value cannot be filled
#' and are left as they are, with a warning.
#'
#' @param mice_input data.frame with sanitized column names
#' @param name_map Named character vector (sanitized name -> original name)
#' @param metadata_columns Columns that must not be imputed.
#'   NOTE(review): these are compared with the sanitized names of
#'   `mice_input`, so a metadata name that make.names() alters would not be
#'   skipped -- confirm against the caller.
#' @return data.table with the original column names restored
run_manual_imputation <- function(mice_input, name_map, metadata_columns) {

  df_manual <- mice_input

  for (col in setdiff(names(df_manual), metadata_columns)) {
    missing <- is.na(df_manual[[col]])
    if (!any(missing)) next

    # Without observed values there is no median/mode. For non-numeric columns
    # the replacement would have length zero and abort the whole imputation.
    if (all(missing)) {
      warning("Column '", col, "' has no observed values and was left unimputed.",
              call. = FALSE)
      next
    }

    if (is.numeric(df_manual[[col]])) {
      # Use median for numeric columns
      df_manual[[col]][missing] <- median(df_manual[[col]], na.rm = TRUE)
    } else {
      # Use mode for categorical columns
      mode_val <- names(sort(table(df_manual[[col]]), decreasing = TRUE))[1]
      df_manual[[col]][missing] <- mode_val
    }
  }

  df_manual <- as.data.table(df_manual)
  df_manual <- restore_names(df_manual, name_map)

  cat("✅ Manual imputation completed\n")
  return(df_manual)
}

# Diagnostic function to identify problematic columns (respecting mandatory/metadata distinction)
#' Diagnose columns that may cause singularity errors during imputation
#'
#' Prints a report only; `df` is not modified. The report covers constant
#' optional columns, highly correlated numeric column pairs (|r| > 0.99, each
#' pair listed once), the share of missing values per mandatory column and
#' optional columns with more than 80 % missing values.
#'
#' @param df A data.frame, tibble or data.table
#' @param mandatory_columns Columns that must never be removed
#' @param metadata_columns Columns used as predictors only
#' @return NULL, invisibly
diagnose_singularity <- function(df,
                                mandatory_columns = c("ALT", "AST", "Platelet-counts", "Glucose", "Mean-Corpuscular-Volume", "Alk-Phos", "Albumin", "Hematocrit", "Leukocyte-counts", "Calcium", "GGT", "Triglycerides"),
                                metadata_columns = c("person_id", "year", "person_id_year", "SEX", "AGE", "BMI")) {

  # Plain data.frame indexing works for every tabular input, whereas the
  # data.table-only `..cols` syntax fails on data.frames and tibbles.
  df <- as.data.frame(df)

  optional_cols <- setdiff(names(df), c(mandatory_columns, metadata_columns))
  na_share <- function(cols) {
    vapply(cols, function(col) mean(is.na(df[[col]])), numeric(1))
  }

  cat("🔍 Diagnosing potential singularity issues...\n")
  cat("📊 Column breakdown:\n")
  cat("  - Mandatory (biomarkers):", length(intersect(mandatory_columns, names(df))), "columns\n")
  cat("  - Metadata (predictors):", length(intersect(metadata_columns, names(df))), "columns\n")
  cat("  - Optional:", length(optional_cols), "columns\n\n")

  # Check 1: Constant columns (only in optional - never remove mandatory!)
  if (length(optional_cols) > 0) {
    is_constant <- vapply(optional_cols, function(col) {
      x <- df[[col]]
      if (is.numeric(x)) {
        v <- var(x, na.rm = TRUE)
        is.na(v) || v == 0
      } else {
        length(unique(na.omit(x))) <= 1
      }
    }, logical(1))

    if (any(is_constant)) {
      cat("⚠️ Constant optional columns (can be removed):\n")
      print(optional_cols[is_constant])
      cat("\n")
    }
  }

  # Check 2: High correlations among all numeric columns (including mandatory)
  all_numeric <- names(df)[vapply(df, is.numeric, logical(1))]
  if (length(all_numeric) > 1) {
    cor_matrix <- cor(df[, all_numeric, drop = FALSE], use = "pairwise.complete.obs")
    cor_matrix[is.na(cor_matrix)] <- 0
    # Symmetric matrix: keep the upper triangle so every pair is reported once
    cor_matrix[lower.tri(cor_matrix, diag = TRUE)] <- 0

    high_cor_pairs <- which(abs(cor_matrix) > 0.99, arr.ind = TRUE)
    if (nrow(high_cor_pairs) > 0) {
      cat("⚠️ Highly correlated pairs (r > 0.99):\n")
      for (i in seq_len(nrow(high_cor_pairs))) {
        row_idx <- high_cor_pairs[i, 1]
        col_idx <- high_cor_pairs[i, 2]
        col1 <- rownames(cor_matrix)[row_idx]
        col2 <- colnames(cor_matrix)[col_idx]

        cat(sprintf("%s%s - %s%s: r = %.4f\n",
                    col1, if (col1 %in% mandatory_columns) " (MANDATORY)" else "",
                    col2, if (col2 %in% mandatory_columns) " (MANDATORY)" else "",
                    cor_matrix[row_idx, col_idx]))
      }
      cat("\n")
    }
  }

  # Check 3: Missing data patterns in mandatory columns
  mandatory_present <- intersect(mandatory_columns, names(df))
  if (length(mandatory_present) > 0) {
    na_summary_mandatory <- na_share(mandatory_present)
    cat("📋 Missing data in mandatory columns:\n")
    for (i in seq_along(na_summary_mandatory)) {
      cat(sprintf("  %s: %.1f%% missing\n", names(na_summary_mandatory)[i], na_summary_mandatory[i] * 100))
    }
    cat("\n")
  }

  # Check 4: Missing data in optional columns
  if (length(optional_cols) > 0) {
    na_summary_optional <- na_share(optional_cols)
    high_missing_optional <- na_summary_optional[which(na_summary_optional > 0.8)]

    if (length(high_missing_optional) > 0) {
      cat("⚠️ Optional columns with >80% missing data (candidates for removal):\n")
      print(high_missing_optional)
      cat("\n")
    }
  }

  cat("✅ Diagnosis complete\n")
  invisible(NULL)
}

In [None]:
# Biomarker column names, displayed for reference.
# NOTE(review): presumably the set passed as `mandatory_columns` to the
# imputation calls below — confirm against where `mandatory_columns` is defined.
c(
  "ALT",
  "AST",
  "Platelet-counts",
  "Glucose",
  "Mean-Corpuscular-Volume",
  "Alk-Phos",
  "Albumin",
  "Hematocrit",
  "Leukocyte-counts",
  "Calcium"
)

### Testrun

In [None]:
# Trial imputation on the pre-2010 timeframe (m = 5 imputations, 5 iterations).
# NOTE(review): `test_run = TRUE` presumably restricts the run to a quick
# check — confirm in run_mice_imputation().
# Timeframe tables available for this call:
#   df_blood_pre2010, df_blood2010_2015, df_blood2015_2020
df_blood_pre2010_test <- run_mice_imputation(
  df_blood_pre2010,
  mandatory_columns = mandatory_columns,
  m = 5,
  maxit = 5,
  test_run = TRUE
)

# Preview the first rows of the trial result.
head(df_blood_pre2010_test)

In [None]:
# Trial imputation on the 2010–2015 timeframe (m = 3 imputations, 3 iterations).
df_blood2010_2015_test <- run_mice_imputation(
  df_blood2010_2015,
  mandatory_columns = mandatory_columns,
  m = 3,
  maxit = 3,
  test_run = TRUE
)

# Preview the first rows of the trial result.
head(df_blood2010_2015_test)


### Run all

In [None]:
# Full (non-test) imputation of the pre-2010 timeframe, then export to CSV.
df_blood_pre2010_i <- run_mice_imputation(
  df_blood_pre2010,
  mandatory_columns = mandatory_columns,
  m = 3,
  maxit = 3,
  test_run = FALSE
)

# Quick sanity check: first rows and dimensions of the imputed table.
head(df_blood_pre2010_i)
dim(df_blood_pre2010_i)

# Persist the imputed table under <data_path>/dataframes/.
write_csv(
  df_blood_pre2010_i,
  paste0(data_path, "/dataframes/df_blood_pre2010_i.csv")
)

In [None]:
# Full (non-test) imputation of the 2010–2015 timeframe, then export to CSV.
df_blood2010_2015_i <- run_mice_imputation(
  df_blood2010_2015,
  mandatory_columns = mandatory_columns,
  m = 3,
  maxit = 3,
  test_run = FALSE
)

# Quick sanity check: first rows and dimensions of the imputed table.
head(df_blood2010_2015_i)
dim(df_blood2010_2015_i)

# Persist the imputed table, then drop it from the session to free memory.
write_csv(
  df_blood2010_2015_i,
  paste0(data_path, "/dataframes/df_blood2010_2015_i.csv")
)
rm(df_blood2010_2015_i)

In [None]:
# Full (non-test) imputation of the 2015–2020 timeframe, then export to CSV.
df_blood2015_2020_i <- run_mice_imputation(
  df_blood2015_2020,
  mandatory_columns = mandatory_columns,
  m = 3,
  maxit = 3,
  test_run = FALSE
)

# Quick sanity check: first rows and dimensions of the imputed table.
head(df_blood2015_2020_i)
dim(df_blood2015_2020_i)

# Persist the imputed table under <data_path>/dataframes/.
write_csv(
  df_blood2015_2020_i,
  paste0(data_path, "/dataframes/df_blood2015_2020i.csv")
)

In [None]:
# Full (non-test) imputation of the 2020–2025 timeframe, then export to CSV.
df_blood2020_2025_i <- run_mice_imputation(
  df_blood2020_2025,
  mandatory_columns = mandatory_columns,
  m = 3,
  maxit = 3,
  test_run = FALSE
)

# Preview and export the frame created above. These statements must use
# df_blood2020_2025_i; pointing them at the 2015–2020 result would write the
# wrong timeframe's data into the 2020–2025 file.
head(df_blood2020_2025_i)
dim(df_blood2020_2025_i)
write_csv(
  df_blood2020_2025_i,
  paste0(data_path, "/dataframes/df_blood2020_2025i.csv")
)

# Optional: Imputing single add-on columns
MICE imputation is compute-intensive. If you forgot to include a column in the earlier imputation, you can use this section to impute just that column and add it to the existing, already imputed dataframes.

In [None]:
# List the column names of df_blood, for reference when choosing columns.
colnames(df_blood)

In [None]:
# Load the pre-existing, already imputed dataframes
df_blood_pre2010  <- read_csv("data/df_blood_pre2010.csv")
df_blood2010_2015 <- read_csv("data/df_blood2010_2015.csv")
df_blood2015_2020 <- read_csv("data/df_blood2015_2020.csv")


# Define new mandatory columns
mandatory_columns = c("Bilirubin-Total" )
metadata_columns = c("person_id", "year", "person_id_year", "SEX", "AGE", "BMI")
columns_to_keep <- union(mandatory_columns, metadata_columns)

# Subset the dataframe
df_blood_add <- df_blood[, columns_to_keep]

# Create timeframe-matched versions of the new column(s)
df_blood_pre2010_add <- create_timeframe_matrix(df_blood_add, 2010)
df_blood2010_2015_add <-create_timeframe_matrix(df_blood, end_year= 2015, start_year=2010)
df_blood2015_2020_add <-create_timeframe_matrix(df_blood, end_year= 2020, start_year=2015)

# Reduce each *_add to only the columns needed for merging
df_blood_pre2010_add    <- df_blood_pre2010_add[, c("person_id_year", mandatory_columns), with = FALSE]
df_blood2010_2015_add   <- df_blood2010_2015_add[, c("person_id_year", mandatory_columns), with = FALSE]
df_blood2015_2020_add   <- df_blood2015_2020_add[, c("person_id_year", mandatory_columns), with = FALSE]



# Merge in the additional column for each timeframe
df_blood_pre2010  <- merge(df_blood_pre2010, df_blood_pre2010_add,  by = "person_id_year", all.x = TRUE)
df_blood2010_2015 <- merge(df_blood2010_2015, df_blood2010_2015_add, by = "person_id_year", all.x = TRUE)
df_blood2015_2020 <- merge(df_blood2015_2020, df_blood2015_2020_add, by = "person_id_year", all.x = TRUE)



In [None]:
# Test MICE on each
cat("🧪 Testing MICE on pre2010...\n")
run_mice_imputation(df_blood_pre2010,
               mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                                    impute_only_columns="Bilirubin-Total",
                                   test_run = TRUE,
                                    method_type="cart")

cat("🧪 Testing MICE on 2010–2015...\n")
run_mice_imputation(df_blood2010_2015,
                    mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                     impute_only_columns="Bilirubin-Total",
                                   test_run = TRUE,
                                    method_type="cart")

cat("🧪 Testing MICE on 2015–2020...\n")
run_mice_imputation(df_blood2015_2020,
                    mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                     impute_only_columns="Bilirubin-Total",
                                   test_run = TRUE,
                                    method_type="cart")

cat("🧪 Testing MICE on 2020–2025...\n")
run_mice_imputation(df_blood2020_2025,
                    mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                     impute_only_columns="Bilirubin-Total",
                                   test_run = TRUE,
                                    method_type="cart")

In [None]:
# Final full imputations
df_blood_pre2010_i <- run_mice_imputation(df_blood_pre2010,
                                          mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                                   test_run = FALSE,
                                    method_type="cart")

df_blood2010_2015_i <- run_mice_imputation(df_blood2010_2015,
                                           mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                                   test_run = FALSE,
                                    method_type="cart")

df_blood2015_2020_i <- run_mice_imputation(df_blood2015_2020,
                                           mandatory_columns = mandatory_columns,
                                   m = 5,
                                   maxit = 5,
                                   test_run = FALSE,
                                    method_type="cart")

# Display and summarize
cat("\n📊 Overview df_blood_pre2010:\n")
print(head(df_blood_pre2010_i))
print(dim(df_blood_pre2010_i))
print(na_summary(df_blood_pre2010_i))

cat("\n📊 Overview df_blood2010_2015:\n")
print(head(df_blood2010_2015_i))
print(dim(df_blood2010_2015_i))
print(na_summary(df_blood2010_2015_i))

cat("\n📊 Overview df_blood2015_2020:\n")
print(head(df_blood2015_2020_i))
print(dim(df_blood2015_2020_i))
print(na_summary(df_blood2015_2020_i))




In [None]:
# Export back to CSV
write_csv(df_blood_pre2010_i,  "data/df_blood_pre2010.csv")
write_csv(df_blood2010_2015_i, "data/df_blood2010_2015.csv")
write_csv(df_blood2015_2020_i, "data/df_blood2015_2020.csv")

In [None]:
# Display the imputed pre-2010 table for visual inspection.
df_blood_pre2010_i