## 1. Setup

In [None]:
install.packages("Rcpp")
install.packages("miceRanger")
install.packages("dplyr")
install.packages("data.table")
install.packages("caret")

library(Rcpp)
library(miceRanger)
library(dplyr)
library(data.table)
library(caret)

set.seed(42)
                   

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘Deriv’, ‘modelr’, ‘microbenchmark’, ‘bit’, ‘doBy’, ‘minqa’, ‘nloptr’, ‘bit64’, ‘carData’, ‘Formula’, ‘nnet’, ‘pbkrtest’, ‘lme4’, ‘vroom’, ‘tzdb’, ‘broom’, ‘car’, ‘rootSolve’, ‘lmom’, ‘cellranger’, ‘progress’, ‘forcats’, ‘hms’, ‘readr’, ‘ggrepel’, ‘ggsci’, ‘cowplot’, ‘ggsignif’, ‘polynom’, ‘rstatix’, ‘boot’, ‘expm’, ‘Exact’, ‘gld’, ‘readxl’, ‘haven’, ‘ranger’, ‘FNN’, ‘corrplot’, ‘ggpubr’, ‘DescTools’




In [None]:
# Helper functions

#' Does a column hold more than one distinct non-missing value?
#'
#' @param column A vector (typically a single data-frame column).
#' @return `TRUE` when at least two distinct values remain after the `NA`s
#'   are discarded, `FALSE` otherwise (including all-`NA` or empty input).
has_multiple_unique_values <- function(column) {
  observed <- column[!is.na(column)]
  distinct_count <- length(unique(observed))
  distinct_count > 1
}

## 2. Load Data

In [None]:
# Fetch the endpoint table and the four predictor tables with the `dx`
# command-line client, then read them into memory.
input_files <- c(
  "to_event_Touchscreen_v3.tsv",
  "Metabolites_v1_participant.tsv",
  "Polygenic_risk_score_v1_participant.tsv",
  "Past_medical_history_v2_participant.tsv",
  "Clinical_predictors_preprocessed_v2.tsv"
)

for (input_file in input_files) {
  # system() returns the command's exit status. It is surfaced as a warning
  # (not an error) so that a copy already present locally can still be read.
  dl_status <- system(paste0("dx download 'UKBRISK/", input_file, "'"))
  if (dl_status != 0) {
    warning("dx download exited with status ", dl_status, " for ", input_file,
            call. = FALSE)
  }
}

# Time-to-event table and the endpoints that are carried through to the output.
endpoints <- read.delim("to_event_Touchscreen_v3.tsv", sep = "\t")
endpoint_names <- c("AD", "CVD", "BC")
cat("endpoint information read \n")

# One data frame per predictor set; the list names are later used as column
# prefixes and as keys when choosing which sets to impute.
predictors_list <- list(
    metabolomics = read.delim("Metabolites_v1_participant.tsv", sep = "\t"),
    prs = read.delim("Polygenic_risk_score_v1_participant.tsv", sep = "\t"),
    pmh = read.delim("Past_medical_history_v2_participant.tsv", sep = "\t"),
    clinicalrisk = read.delim("Clinical_predictors_preprocessed_v2.tsv", sep = "\t")
)

# Report the shape of every predictor table that was read.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  cat(paste0(name, " file read, dimensions: rows:", nrow(file), ", cols:", ncol(file), "\n"))
}

## 3. Reformatting

In [None]:
# Convert logical columns to character in every predictor table.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  # vapply() always yields a logical vector; sapply() would return an empty
  # list instead of a logical index for a table without columns.
  logical_cols <- vapply(file, is.logical, logical(1))
  file[logical_cols] <- lapply(file[logical_cols], as.character)
  predictors_list[[name]] <- file
}

In [None]:
# Label the first column of every table "eid" and prefix all remaining
# column names with "<table name>_".
# NOTE(review): assumes the first column of each table is the participant id;
# re-running this cell would add the prefix a second time.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  prefixed_names <- paste0(name, "_", names(file)[-1])
  names(file) <- c("eid", prefixed_names)
  predictors_list[[name]] <- file
}


In [None]:
# NAs in baseline pmh = no record: replace every NA outside `eid` with FALSE.
pmh_value_cols <- names(predictors_list[["pmh"]]) != "eid"
# List-style `[` indexing always returns a data frame. The `[, cols]` form
# collapses a single selected column to a vector, and lapply() would then
# iterate over its elements instead of over columns.
predictors_list[["pmh"]][pmh_value_cols] <-
  lapply(predictors_list[["pmh"]][pmh_value_cols], function(x) replace(x, is.na(x), FALSE))


## 4. Exclusion based on missingness

In [None]:
#exclude individuals who didn't have assay (e.g. no metabolomics)
# A row is dropped from a table when every non-eid value in it is NA.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  # Per-row NA fraction over the non-eid columns, computed on the logical NA
  # matrix instead of a row-wise apply(); drop = FALSE keeps a data frame even
  # if only one non-eid column exists.
  na_matrix <- is.na(file[, names(file) != "eid", drop = FALSE])
  na_percentage_rows <- rowSums(na_matrix) / ncol(na_matrix)
  keep_rows <- na_percentage_rows < 1
  file <- file[keep_rows, ]
  # Count retained rows directly: table(...)[1] is the count of the first
  # level present, which is the excluded count when no row is retained.
  cat(paste0(name, ": retained n = ", sum(keep_rows), " of ", length(keep_rows), " individuals due to criteria: missingness < 100% within each individual", "\n"))
  predictors_list[[name]] <- file
}

# Keep only participants whose eid occurs in every predictor table.
eids_list <- lapply(predictors_list, function(tbl) tbl$eid)
common_eids <- Reduce(intersect, eids_list)
predictors_list <- lapply(
  predictors_list,
  function(tbl) tbl[tbl$eid %in% common_eids, ]
)

In [None]:
#apply variable-wise missingness criteria
# A column is kept when at most 20% of its values are NA. The eid column is
# evaluated (and counted in the message) like any other column.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  # Per-column NA fraction from the logical NA matrix; apply() on a data frame
  # would first coerce the whole table to a single-type matrix.
  na_percentage_cols <- colSums(is.na(file)) / nrow(file)
  keep_cols <- na_percentage_cols <= 0.2
  # drop = FALSE: stay a data frame even if a single column survives.
  file <- file[, keep_cols, drop = FALSE]
  predictors_list[[name]] <- file
  # sum(keep_cols) is the retained count in every case; table(...)[1] is the
  # excluded count when no column passes the criterion.
  cat(paste0(name, ": retained n = ", sum(keep_cols), " of ", length(keep_cols), " variables due to criteria: missingness <= 20% within each variable", "\n"))
}

#apply individual-wise missingness criteria
# A row is kept when at most 20% of its non-eid values are NA.
for (name in names(predictors_list)) {
  file <- predictors_list[[name]]
  # Per-row NA fraction over the non-eid columns, computed on the logical NA
  # matrix instead of a row-wise apply(); drop = FALSE keeps a data frame even
  # if only one non-eid column exists.
  na_matrix <- is.na(file[, names(file) != "eid", drop = FALSE])
  na_percentage_rows_2 <- rowSums(na_matrix) / ncol(na_matrix)
  keep_rows <- na_percentage_rows_2 <= 0.2
  file <- file[keep_rows, ]
  predictors_list[[name]] <- file
  # sum(keep_rows) is the retained count in every case; table(...)[1] is the
  # excluded count when no row passes the criterion.
  cat(paste0(name, ": retained n = ", sum(keep_rows), " of ", length(keep_rows), " individuals due to criteria: missingness <= 20% within each individual", "\n"))
}

In [None]:
# Re-derive the set of participants present in every predictor table (rows
# were removed per table above) and subset each table to that set.
eids_list <- lapply(predictors_list, function(tbl) tbl$eid)
common_eids <- Reduce(intersect, eids_list)
predictors_list <- lapply(
  predictors_list,
  function(tbl) tbl[tbl$eid %in% common_eids, ]
)

## 5. Test-Train splitting, Imputation & Saving

In [None]:
# Randomly assign every common participant to one of `n_splits` folds.
# The labels 1..n_splits are recycled to the number of participants and then
# shuffled, so fold sizes differ by at most one.
set.seed(42)

n_splits <- 5

fold_labels <- rep(seq_len(n_splits), length.out = length(common_eids))
cv_splits <- sample(fold_labels)
split_info <- data.frame(eid = common_eids, cv_split = cv_splits)

In [None]:
#process each CV split
# For every fold: write the train/test assignment, drop predictors that are
# constant in the training rows, fit the imputation model on the training rows
# and apply it to all rows, merge predictors with outcomes, save and upload.

# The outcome table does not depend on the fold, so it is built once. It keeps
# the status / follow-up / at-baseline columns of the endpoints listed in
# `endpoint_names` that exist in `endpoints` (in their order there), plus eid.
status_cols <- intersect(names(endpoints), paste0(endpoint_names, "_status"))
followup_cols <- intersect(names(endpoints), paste0(endpoint_names, "_followup"))
at_base_cols <- intersect(names(endpoints), paste0(endpoint_names, "_at_base"))
outcome_cols <- c(status_cols, followup_cols, at_base_cols, "eid")
outcome_df <- endpoints[, outcome_cols]

for (cv in seq_len(n_splits)) {

  # Work on a per-fold copy so column filtering and imputation for one fold
  # do not leak into the next.
  predictors_list_copy <- predictors_list

  #identify/save test/train
  eidstest <- split_info$eid[split_info$cv_split == cv]
  eidstrain <- setdiff(common_eids, eidstest)
  split_filename <- paste0("cv_split_", cv, ".csv")
  write.csv(data.frame(eid = common_eids,
                       train_test_split = ifelse(common_eids %in% eidstrain, "train", "test"),
                       cv_split = split_info$cv_split),
            split_filename, row.names = FALSE)

  cat("Saved split info to:", split_filename, "\n")

  #filter for cols with multiple unique values in the train set
  for (name in names(predictors_list_copy)) {
    file <- predictors_list_copy[[name]]
    before <- ncol(file)
    train_rows <- file$eid %in% eidstrain
    # vapply() guarantees a logical index; drop = FALSE keeps a data frame
    # even if only one column survives.
    keep_cols <- vapply(file[train_rows, , drop = FALSE],
                        has_multiple_unique_values, logical(1))
    file <- file[, keep_cols, drop = FALSE]
    predictors_list_copy[[name]] <- file
    after <- ncol(file)
    excluded <- before - after
    cat(paste0("For ", name, ": retained n=", after,
               " columns, excluded n=", excluded, " columns\n"))
  }

  #impute missing values in train and apply to test
  for (name in c("clinicalrisk", "metabolomics")) {
    file <- predictors_list_copy[[name]]

    # Each non-eid variable is imputed from all other non-eid variables;
    # eid is neither imputed nor used as a predictor.
    all_vars <- setdiff(names(file), "eid")
    v <- setNames(lapply(all_vars, function(x) setdiff(all_vars, x)), all_vars)

    #train imp model (training rows only)
    miceObj <- miceRanger(
      file[file$eid %in% eidstrain, ],
      m = 1,
      returnModels = TRUE,
      maxiter = 3,
      max.depth = 8,
      num.trees = 8,
      verbose = TRUE,
      vars = v,
      save.memory = FALSE)

    #apply imp model to all rows (train and test)
    imputed <- impute(file, miceObj, verbose = FALSE)
    predictors_list_copy[[name]] <- imputed$imputedData$Dataset_1

    cat(paste0("Imputation done for ", name, " in split ", cv, "\n"))
  }

  ##generate final df for each CV split
  merged_df <- Reduce(function(x, y) merge(x, y, by = "eid", all = TRUE), predictors_list_copy)

  #merge with outcomes
  merged_df <- merge(merged_df, outcome_df, by = "eid", all.x = TRUE)

  #add train/test and cv split columns
  # split_info is merged exactly once. Merging it both before and after the
  # outcomes made merge() emit two suffixed copies (cv_split.x / cv_split.y)
  # instead of a single cv_split column.
  merged_df <- merge(merged_df, split_info, by = "eid", all.x = TRUE)
  merged_df$testtrain <- ifelse(merged_df$eid %in% eidstrain, "train", "test")

  #save
  filename <- paste0("imputed_data_split_", cv, "_22112024.tsv")
  write.table(merged_df, filename, sep = "\t", row.names = FALSE, quote = FALSE)
  upload_cmd <- paste0("dx upload ", filename, " --path Benchmarking/Imputed/", filename)
  system(upload_cmd, intern = TRUE)

  cat("Final merged dataframe saved and uploaded for split", cv, "\n")
}


Saved split info to: cv_split_1.csv 
For metabolomics: retained n=169 columns, excluded n=0 columns
For prs: retained n=18 columns, excluded n=0 columns
For pmh: retained n=435 columns, excluded n=0 columns
For clinicalrisk: retained n=46 columns, excluded n=0 columns


One or more of the specified variables to impute contains no missing values. These will remain as a predictor, however they will not be imputed. 

Converting characters to factors.



## 6. Generate final DF and save

## 7. Check that everything worked

In [18]:
# Print the total number of NA cells in each table of `predictors_list`.
# NOTE(review): imputed tables are stored in `predictors_list_copy` inside the
# CV loop; `predictors_list` itself is not modified by the imputation step —
# confirm this is the object that should be checked.
na_counts <- vapply(predictors_list, function(tbl) sum(is.na(tbl)), integer(1))
for (name in names(na_counts)) {
  print(paste0(name, ": ", na_counts[[name]], " NA values"))
}

[1] "metabolomics: 0 NA values"
[1] "prs: 0 NA values"
[1] "pmh: 0 NA values"
[1] "ts: 0 NA values"
[1] "clinicalrisk: 0 NA values"
[1] "nhc: 0 NA values"
[1] "qrisk: 0 NA values"
[1] "prevent: 0 NA values"
[1] "score: 0 NA values"


In [19]:
# Download a final imputed table (overwriting any local copy) and load it.
# NOTE(review): this path and file name differ from the per-split files the CV
# loop uploads (imputed_data_split_<cv>_22112024.tsv under
# Benchmarking/Imputed/) — confirm which file is meant to be inspected.
impfinal_name <- "imputed_data_02092024.tsv"
dl_cmd <- paste0("dx download 'UKBRISK_Imputed/final/", impfinal_name, "' --overwrite")
system(dl_cmd, intern = TRUE)
impfinal <- read.delim(impfinal_name, sep = "\t")

In [21]:
# Tabulate missing vs. non-missing cells, then list the columns with any NA.
table(is.na(impfinal))
names(which(colSums(is.na(impfinal)) > 0)) #only outcome cols - perfect


    FALSE 
466115256 

In [20]:
# Show the first rows of the loaded table as a visual sanity check.
head(impfinal)

Unnamed: 0_level_0,eid,metabolomics_Apolipoprotein.B,metabolomics_Apolipoprotein.A1,metabolomics_Alanine,metabolomics_Glutamine,metabolomics_Glycine,metabolomics_Histidine,metabolomics_Total.Concentration.of.Branched.Chain.Amino.Acids..Leucine...Isoleucine...Valine.,metabolomics_Isoleucine,metabolomics_Leucine,⋯,MEL_at_base,CRC_at_base,PC_at_base,BC_at_base,PD_at_base,OP_at_base,CAT_at_base,POAG_at_base,HT_at_base,testtrain
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>
1,1000044,0.76017,1.0988,0.25304,0.46639,0.17234,0.051149,0.30596,0.051907,0.085895,⋯,False,False,False,False,False,False,False,False,False,train
2,1000083,0.77423,1.9638,0.26335,0.56426,0.20824,0.073546,0.3682,0.047598,0.10619,⋯,False,False,False,False,False,False,False,False,False,train
3,1000121,0.82021,1.2126,0.2539,0.52869,0.089564,0.060435,0.31846,0.03521,0.078134,⋯,False,False,False,False,False,False,False,False,False,test
4,1000143,0.85863,1.4698,0.37789,0.59261,0.099752,0.070415,0.42862,0.061815,0.1187,⋯,False,False,False,False,False,False,False,False,False,test
5,1000150,0.5666,1.1732,0.31397,0.53431,0.13414,0.042231,0.37507,0.049463,0.12236,⋯,False,False,False,False,False,False,False,False,False,test
6,1000168,1.0344,1.5165,0.27074,0.56913,0.25387,0.053087,0.3551,0.046822,0.10043,⋯,False,False,False,False,False,False,False,False,False,train
