## 1. Setup

In [None]:
install.packages("Rcpp")
library(Rcpp)

install.packages("miceRanger")
install.packages("dplyr")
install.packages("data.table")
install.packages("caret")

library(miceRanger)
library(dplyr)
library(data.table)
library(caret)

# Fix the random number generator state so stochastic steps are reproducible
seed <- 123
set.seed(seed)
                   

In [None]:
# Helper functions

#' Does a column vary once missing values are ignored?
#'
#' @param column A vector (e.g. one data.frame column).
#' @return `TRUE` if `column` holds more than one distinct non-NA value,
#'   otherwise `FALSE` (including all-NA and empty input).
has_multiple_unique_values <- function(column) {
  observed <- column[!is.na(column)]
  length(unique(observed)) > 1
}

## 2. Load Data

In [None]:
# Fetch all input tables from the DNAnexus project into the working directory.
# Each entry is run as-is through the shell, in the listed order.
download_cmds <- c(
    "dx download 'UKBRISK/to_event_Touchscreen_v3.tsv'",
    "dx download 'UKBRISK/Metabolites_v1_participant.tsv'",
    "dx download 'UKBRISK/Polygenic_risk_score_v1_participant.tsv'",
    "dx download 'UKBRISK/Past_medical_history_v2_participant.tsv'",
    "dx download 'UKBRISK/Touchscreen_v1_participant.tsv'",
    "dx download 'UKBRISK/Clinical_predictors_preprocessed_v2.tsv'",
    "dx download 'Risk score dataframes/NHSHC_preprocessed_v1.tsv'",
    "dx download 'Risk score dataframes/QRISK3_preprocessed_v1.tsv'",
    "dx download 'Risk score dataframes/PREVENT_preprocessed_v1.tsv'",
    "dx download 'Risk score dataframes/SCORE2_preprocessed_v1.tsv'"
)
for (download_cmd in download_cmds) {
    system(download_cmd)
}

# Read the endpoint table and list the endpoint abbreviations analysed below
endpoints <- read.delim("to_event_Touchscreen_v3.tsv", sep = "\t")
endpoint_names <- c(
    "AD", "CVD", "DM", "LD", "RD", "AF",
    "HF", "CAD", "VT", "ISS", "AAA", "PAD",
    "AS", "COPD", "LC", "MEL", "CRC", "PC",
    "BC", "PD", "OP", "CAT", "POAG", "HT"
)
cat("endpoint information read \n")

# Map each predictor-set name to its downloaded file, then read every file
# into a named list of data.frames (list names = predictor-set names).
predictor_files <- c(
    metabolomics = "Metabolites_v1_participant.tsv",
    prs = "Polygenic_risk_score_v1_participant.tsv",
    pmh = "Past_medical_history_v2_participant.tsv",
    ts = "Touchscreen_v1_participant.tsv",
    clinicalrisk = "Clinical_predictors_preprocessed_v2.tsv",
    nhc = "NHSHC_preprocessed_v1.tsv",
    qrisk = "QRISK3_preprocessed_v1.tsv",
    prevent = "PREVENT_preprocessed_v1.tsv",
    score = "SCORE2_preprocessed_v1.tsv"
)
predictors_list <- lapply(predictor_files, read.delim, sep = "\t")

# Log the number of rows and columns of every predictor table
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  n_rows <- nrow(file)
  n_cols <- ncol(file)
  cat(paste0(name," file read, dimenions: rows:", n_rows, ", cols:", n_cols, "\n"))
}

## 3. Reformatting

In [None]:
# Convert every logical column of every predictor table to character
predictors_list <- lapply(predictors_list, function(tbl) {
  is_lgl <- vapply(tbl, is.logical, logical(1))
  tbl[is_lgl] <- lapply(tbl[is_lgl], as.character)
  tbl
})

In [None]:
# Name the first column 'eid' and prefix all remaining columns with the
# predictor-set name (e.g. "prs_<original name>")
predictors_list <- Map(
  function(tbl, prefix) {
    names(tbl) <- c('eid', paste0(prefix, "_", names(tbl)[-1]))
    tbl
  },
  predictors_list,
  names(predictors_list)
)


In [None]:
# Insert FALSE for individuals without a record of a condition in pmh
# (there were some NAs). Columns are selected by name with single-bracket list
# indexing, so the selection stays a data.frame even if pmh has only one
# non-eid column (`df[, idx]` would collapse to a vector in that case).
pmh_cols <- setdiff(names(predictors_list[["pmh"]]), "eid")
predictors_list[["pmh"]][pmh_cols] <-
  lapply(predictors_list[["pmh"]][pmh_cols], function(x) replace(x, is.na(x), FALSE))


## 4. Exclusion based on missingness

In [None]:
# Exclude individuals who didn't have the assay (e.g. no metabolomics):
# drop rows in which every non-eid value is missing.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  # Select by name so the subset stays a data.frame even with a single
  # predictor column; rowSums(is.na()) replaces a per-row apply() call.
  value_cols <- file[setdiff(names(file), "eid")]
  na_percentage_rows <- rowSums(is.na(value_cols)) / ncol(value_cols)
  keep_rows <- na_percentage_rows < 1
  file <- file[keep_rows, ]
  # Count retained rows directly: table(...)[1] would report the number of
  # excluded rows whenever no row is retained.
  cat(paste0(name, ": retained n = ", sum(keep_rows), " of ", length(na_percentage_rows), " individuals due to criteria: missingness < 100% within each individual", "\n"))
  predictors_list[[name]] <- file
}

# Keep only participants (eids) that are present in every predictor table
eids_list <- lapply(predictors_list, `[[`, "eid")
common_eids <- Reduce(intersect, eids_list)
predictors_list <- lapply(predictors_list, function(dataset) {
  dataset[dataset$eid %in% common_eids, ]
})

In [None]:
# Apply variable-wise missingness criteria: keep columns with <= 20% missing
# values. The eid column is evaluated (and counted in the log) like any other.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  na_percentage_cols <- colSums(is.na(file)) / nrow(file)
  keep_cols <- na_percentage_cols <= 0.2
  # drop = FALSE keeps a data.frame even if only one column survives
  file <- file[, keep_cols, drop = FALSE]
  predictors_list[[name]] <- file
  # sum() counts retained columns directly; table(...)[1] would report the
  # excluded count whenever no column is retained.
  cat(paste0(name, ": retained n = ", sum(keep_cols), " of ", length(na_percentage_cols), " variables due to criteria: missingness <= 20% within each variable", "\n"))
}

# Apply individual-wise missingness criteria: keep rows with <= 20% missing
# values across the non-eid columns.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  # Select by name so the subset stays a data.frame even with a single
  # predictor column; rowSums(is.na()) replaces a per-row apply() call.
  value_cols <- file[setdiff(names(file), "eid")]
  na_percentage_rows_2 <- rowSums(is.na(value_cols)) / ncol(value_cols)
  keep_rows <- na_percentage_rows_2 <= 0.2
  file <- file[keep_rows, ]
  predictors_list[[name]] <- file
  # sum() counts retained rows directly; table(...)[1] would report the
  # excluded count whenever no row is retained.
  cat(paste0(name, ": retained n = ", sum(keep_rows), " of ", length(na_percentage_rows_2), " individuals due to criteria: missingness <= 20% within each individual", "\n"))
}

In [None]:
# Re-align the tables: keep only participants (eids) still present in all of them
eids_list <- lapply(predictors_list, `[[`, "eid")
common_eids <- Reduce(intersect, eids_list)
predictors_list <- lapply(predictors_list, function(dataset) {
  dataset[dataset$eid %in% common_eids, ]
})

## 5. Test-Train splitting & Imputation 

In [None]:
# Randomly assign half of the common participants to the training set;
# everyone else forms the test set.
set.seed(42)

n_common <- length(common_eids)
train_positions <- sample(seq_along(common_eids), size = n_common * 0.5)
eidstrain <- common_eids[train_positions]
eidstest <- setdiff(common_eids, eidstrain)

cat("Train set size:", length(eidstrain), "\n")
cat("Test set size:", length(eidstest), "\n")

In [None]:
# For every endpoint, print the status counts (NAs included when present)
# in the training set followed by the test set.
in_train <- endpoints$eid %in% eidstrain
in_test <- endpoints$eid %in% eidstest
for (endpoint in endpoint_names) {
    stat <- paste0(endpoint, "_status")

    counttrain <- table(endpoints[[stat]][in_train], useNA = "ifany")
    counttest <- table(endpoints[[stat]][in_test], useNA = "ifany")

    cat(paste0("\nEndpoint: ", endpoint, "\n"))
    print(counttrain)
    print(counttest)
}

In [None]:
# Remove columns that have only one unique non-NA value in the training set
# (i.e., no variability, not useful for prediction).
for (name in names(predictors_list)) {
    file <- predictors_list[[name]]

    before <- ncol(file)

    train_rows <- file[file$eid %in% eidstrain, , drop = FALSE]
    # vapply guarantees a logical vector with one entry per column
    is_variable <- vapply(train_rows, has_multiple_unique_values, logical(1))
    # eid is the join key for later steps, so it is never dropped
    is_variable[names(file) == "eid"] <- TRUE
    # drop = FALSE keeps a data.frame even if only one column survives
    file <- file[, is_variable, drop = FALSE]
    predictors_list[[name]] <- file

    after <- ncol(file)
    excluded <- before - after
    cat(paste0("For ", name, ": retained n=", after,
               " columns, excluded n=", excluded, " columns\n"))
}


In [None]:
# Imputation using miceRanger: miceRanger() is run on the training rows only,
# and the returned object is then passed to impute() together with the full
# table (train + test rows).
datasets_to_impute <- c("nhc", "qrisk", "prevent", "score", "clinicalrisk", "ts", "metabolomics")
for (name in datasets_to_impute) {
    file <- predictors_list[[name]]

    # For each non-eid variable, list every other non-eid variable
    all_vars <- setdiff(names(file), "eid")
    v <- lapply(all_vars, function(target) setdiff(all_vars, target))
    names(v) <- all_vars

    train_rows <- file[file$eid %in% eidstrain, ]
    miceObj <- miceRanger(
        train_rows,
        m = 1,
        returnModels = TRUE,
        maxiter = 3,
        max.depth = 8,
        num.trees = 8,
        verbose = TRUE,
        vars = v,
        save.memory = FALSE
    )

    # Replace the table with the single imputed dataset (m = 1)
    imputed <- impute(file, miceObj, verbose = FALSE)
    predictors_list[[name]] <- imputed$imputedData$Dataset_1

    cat(paste0("Imputation done for ", name, "\n"))
}


## 6. Generate final DF and save

In [None]:
# Join all predictor tables on eid, keeping rows from both sides of each join
merged_df <- Reduce(
    function(left, right) merge(left, right, by = "eid", all = TRUE),
    predictors_list
)

# Select the outcome columns (status, follow-up, at-baseline) that belong to
# the endpoints listed in endpoint_names
keep_listed <- function(cols, suffix) {
    cols[cols %in% paste0(endpoint_names, suffix)]
}
status_cols <- keep_listed(grep("_status$", names(endpoints), value = TRUE), "_status")
followup_cols <- keep_listed(grep("_followup$", names(endpoints), value = TRUE), "_followup")
at_base_cols <- keep_listed(grep("_at_base$", names(endpoints), value = TRUE), "_at_base")

outcome_cols <- c(status_cols, followup_cols, at_base_cols, "eid")

# Attach outcomes; every row of merged_df is kept (all.x = TRUE)
outcome_df <- endpoints[, outcome_cols]
merged_df <- merge(merged_df, outcome_df, by = "eid", all.x = TRUE)

# Label each row as "train" or "test" according to the earlier split
is_train <- merged_df$eid %in% eidstrain
merged_df$testtrain <- ifelse(is_train, "train", "test")

In [None]:
# Write the final table (imputed predictors, outcomes, train/test label) as an
# unquoted tab-separated file, then upload it to the project.
filename <- "imputed_data_22112024.tsv"
write.table(
    x = merged_df,
    file = filename,
    quote = FALSE,
    sep = "\t",
    row.names = FALSE
)

# intern = TRUE returns the command's output as a character vector
system(paste0("dx upload ", filename, " --path UKBRISK_Imputed/final/", filename), intern = TRUE)
