## 1. Setup

In [10]:
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




## 2. Load Data

### 2.1 Load data into memory

In [3]:
n_splits <- 5

split_data_list <- list()

for (cv in seq_len(n_splits)) {
  filename <- paste0("imputed_data_split_", cv, "_22112024.tsv")
  download_cmd <- paste0("dx download Benchmarking/Imputed/", filename, " -o ", filename)
  system(download_cmd, intern = TRUE)
  df <- read.table(filename, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  cat("Loaded data for split", cv, "with dimensions:", dim(df), "\n")
  split_data_list[[cv]] <- df
}

cat("All splits loaded into memory.\n")


Loaded data for split 1 with dimensions: 258966 677 
Loaded data for split 2 with dimensions: 258966 677 
Loaded data for split 3 with dimensions: 258966 677 
Loaded data for split 4 with dimensions: 258966 677 
Loaded data for split 5 with dimensions: 258966 677 
All splits loaded into memory.


### 2.2 Load mapping file for PANEL

In [4]:
system("dx download 'UKBRISK_Processed/Clinicalrisk_mapping_v2.tsv'")

mapping_clinicalrisk <- read.delim("Clinicalrisk_mapping_v2.tsv", sep = "\t")
mapping_clinicalrisk <- mapping_clinicalrisk[-1,]
mapping_clinicalrisk$Column.name <- paste0("clinicalrisk_",mapping_clinicalrisk$Column.name)
table(as.factor(mapping_clinicalrisk$Data.type))
mapping_clinicalrisk[mapping_clinicalrisk$Column.name == "clinicalrisk_Systolic.blood.pressure","Column.name"] = 'clinicalrisk_SBP_mean'
table(names(df)[grep("^clinicalrisk_", names(df))] %in% mapping_clinicalrisk$Column.name)


Continuous     Factor 
        32         13 


TRUE 
  45 

### 2.3 Adjusting data types and performing some sanity checks

In [13]:
processed_data_list <- list()

for (cv in seq_len(n_splits)) {
  df <- split_data_list[[cv]]
  
  #apply clinicalrisk transformations based on mapping_clinicalrisk
  for (i in 1:nrow(mapping_clinicalrisk)) {
    column_name <- mapping_clinicalrisk$Column.name[i]
    data_type <- mapping_clinicalrisk$Data.type[i]

    if (column_name %in% names(df)) {
      if (data_type == "Continuous") {
        df[[column_name]] <- as.numeric(df[[column_name]])
      } else {
        df[[column_name]] <- as.factor(df[[column_name]])
      }
    }
  }

  cat("Clinicalrisk transformations completed for split", cv, "\n")
  
  #check no factor cols with > 2 levels
  factor_cols <- names(df)[sapply(df, is.factor)]
  multi_level_factors <- factor_cols[sapply(df[factor_cols], function(col) nlevels(col) > 2)]
  
  if (length(multi_level_factors) > 0) {
    cat("Factor columns with more than 2 levels in split", cv, ":\n")
    print(multi_level_factors)
  } else {
    cat("No factor columns with more than 2 levels in split", cv, "\n")
  }
  
  #check no negative values in continous cols
  cols_to_check <- grep("^metabolomics_|ts_|clinicalrisk_", names(df), value = TRUE)
  cols_to_check <- cols_to_check[sapply(df[cols_to_check], is.numeric)]  # Keep only numeric columns

  if (length(cols_to_check) > 0) {
    neg_cols <- sapply(df[, cols_to_check, drop = FALSE], function(col) any(col < 0, na.rm = TRUE))
    
    if (any(neg_cols)) {
      cat("Split", cv, "contains negative values in columns: ", paste(names(df)[cols_to_check][neg_cols], collapse = ", "), "\n")
    } else {
      cat("No negative columns found in split", cv, "\n")
    }
  } else {
    cat("No columns matched for checking negatives in split", cv, "\n")
  }
  
  #store back
  processed_data_list[[cv]] <- df
}

cat("Data-type transformations for clinicalrisk, factor checks, and negative value checks completed for all splits.\n")


Clinicalrisk transformations completed for split 1 
No factor columns with more than 2 levels in split 1 
No negative columns found in split 1 
Clinicalrisk transformations completed for split 2 
No factor columns with more than 2 levels in split 2 
No negative columns found in split 2 
Clinicalrisk transformations completed for split 3 
No factor columns with more than 2 levels in split 3 
No negative columns found in split 3 
Clinicalrisk transformations completed for split 4 
No factor columns with more than 2 levels in split 4 
No negative columns found in split 4 
Clinicalrisk transformations completed for split 5 
No factor columns with more than 2 levels in split 5 
No negative columns found in split 5 
Data-type transformations for clinicalrisk, factor checks, and negative value checks completed for all splits.


## 3. Replace 0s with 1/10th of median (assume detection threshold)

In [14]:
transformed_data_list <- list()

for (cv in seq_len(n_splits)) {
  df <- processed_data_list[[cv]]  

  cols_to_transform <- names(df)[grepl("^(metabolomics_|clinicalrisk_)", names(df)) & 
                                   sapply(df, function(x) class(x) %in% c('numeric', 'integer'))]

  #replace 0  with median/10 
  df[cols_to_transform] <- lapply(df[cols_to_transform], function(x) {
    median_val <- median(x[x != 0], na.rm = TRUE) / 10
    x[x == 0] <- median_val
    return(x)
  })
  
  cat("Zero values replaced with 1/10th of median for continuous columns in split", cv, "\n")
  
  #store back
  transformed_data_list[[cv]] <- df
}

cat("Transformation of continuous columns completed for all splits.\n")


Zero values replaced with 1/10th of median for continuous columns in split 1 
Zero values replaced with 1/10th of median for continuous columns in split 2 
Zero values replaced with 1/10th of median for continuous columns in split 3 
Zero values replaced with 1/10th of median for continuous columns in split 4 
Zero values replaced with 1/10th of median for continuous columns in split 5 
Transformation of continuous columns completed for all splits.


## 4. Log-Scaling, removal of outlier blood measurements > 5 SD

In [15]:
for (cv in seq_len(n_splits)) {
  df <- transformed_data_list[[cv]]  # Assuming `transformed_data_list` contains data after zero replacement

  #log
  df <- df %>%
    mutate(across(all_of(cols_to_transform), log))

  cat("Log transformation completed for split", cv, "\n")
  
  #means and sd from train split
  means <- sapply(df[df$testtrain == 'train', cols_to_transform], mean, na.rm = TRUE)
  sds <- sapply(df[df$testtrain == 'train', cols_to_transform], sd, na.rm = TRUE)
  
  #center-scale
  center_scale <- function(x, mean, sd) {
    (x - mean) / sd
  }
  df[, cols_to_transform] <- mapply(center_scale, 
                                    df[, cols_to_transform], 
                                    means, 
                                    sds, 
                                    SIMPLIFY = FALSE)
  
  cat("Centering and scaling completed for split", cv, "\n")

  ##filter outliers from blood measurements (> 5 SD)
  bloods_cr <- mapping_clinicalrisk$Column.name[22:44]
  initial_row_count <- nrow(df)
  df <- df %>%
    filter(across(c(starts_with("metabolomics_"), any_of(bloods_cr)), ~ abs(.) <= 5 | is.na(.)))
  final_row_count <- nrow(df)
  rows_removed <- initial_row_count - final_row_count
  cat("Outlier filtering completed for split", cv, " - rows removed due to outliers:", rows_removed, "\n")

  #check for NaN or -Inf
  nan_exists <- suppressWarnings(sapply(df, function(col) any(is.nan(col))))
  inf_exists <- suppressWarnings(sapply(df, function(col) any(is.infinite(col) & col < 0)))
  
  if (any(nan_exists)) {
    cat("Split", cv, "contains NaN values in columns:", paste(names(df)[nan_exists], collapse = ", "), "\n")
  } else {
    cat("No NaN values found in split", cv, "\n")
  }
  
  if (any(inf_exists)) {
    cat("Split", cv, "contains -Inf values in columns:", paste(names(df)[inf_exists], collapse = ", "), "\n")
  } else {
    cat("No -Inf values found in split", cv, "\n")
  }
  
  #store back
  transformed_data_list[[cv]] <- df
}

cat("Log scaling, normalization, outlier filtering, and value checks completed for all splits.\n")


Log transformation completed for split 1 
Centering and scaling completed for split 1 


“[1m[22mUsing `across()` in `filter()` was deprecated in dplyr 1.0.8.
[36mℹ[39m Please use `if_any()` or `if_all()` instead.”


Outlier filtering completed for split 1  - rows removed due to outliers: 17799 
No NaN values found in split 1 
No -Inf values found in split 1 
Log transformation completed for split 2 
Centering and scaling completed for split 2 


“[1m[22mUsing `across()` in `filter()` was deprecated in dplyr 1.0.8.
[36mℹ[39m Please use `if_any()` or `if_all()` instead.”


Outlier filtering completed for split 2  - rows removed due to outliers: 17810 
No NaN values found in split 2 
No -Inf values found in split 2 
Log transformation completed for split 3 
Centering and scaling completed for split 3 


“[1m[22mUsing `across()` in `filter()` was deprecated in dplyr 1.0.8.
[36mℹ[39m Please use `if_any()` or `if_all()` instead.”


Outlier filtering completed for split 3  - rows removed due to outliers: 17762 
No NaN values found in split 3 
No -Inf values found in split 3 
Log transformation completed for split 4 
Centering and scaling completed for split 4 


“[1m[22mUsing `across()` in `filter()` was deprecated in dplyr 1.0.8.
[36mℹ[39m Please use `if_any()` or `if_all()` instead.”


Outlier filtering completed for split 4  - rows removed due to outliers: 17844 
No NaN values found in split 4 
No -Inf values found in split 4 
Log transformation completed for split 5 
Centering and scaling completed for split 5 


“[1m[22mUsing `across()` in `filter()` was deprecated in dplyr 1.0.8.
[36mℹ[39m Please use `if_any()` or `if_all()` instead.”


Outlier filtering completed for split 5  - rows removed due to outliers: 17799 
No NaN values found in split 5 
No -Inf values found in split 5 
Log scaling, normalization, outlier filtering, and value checks completed for all splits.


## 5. Save data

In [16]:
for (cv in seq_len(n_splits)) {
  df <- transformed_data_list[[cv]] 
  filename_save <- paste0("Processed_final_split_", cv, "_01122024.tsv")
  write.table(df, file = filename_save, sep = "\t", row.names = FALSE, quote = FALSE)
  cat("Data saved locally for split", cv, "with filename:", filename_save, "\n")
  
  upl_cmd <- paste0("dx upload ", filename_save, " --path Benchmarking/Processed/", filename_save)
  system(upl_cmd, intern = TRUE)
  
  cat("Data uploaded for split", cv, "to the cloud at Benchmarking/Processed/", filename_save, "\n")
}

cat("All splits saved and uploaded successfully.\n")


Data saved locally for split 1 with filename: Processed_final_split_1_01122024.tsv 
Data uploaded for split 1 to the cloud at Benchmarking/Processed/ Processed_final_split_1_01122024.tsv 
Data saved locally for split 2 with filename: Processed_final_split_2_01122024.tsv 
Data uploaded for split 2 to the cloud at Benchmarking/Processed/ Processed_final_split_2_01122024.tsv 
Data saved locally for split 3 with filename: Processed_final_split_3_01122024.tsv 
Data uploaded for split 3 to the cloud at Benchmarking/Processed/ Processed_final_split_3_01122024.tsv 
Data saved locally for split 4 with filename: Processed_final_split_4_01122024.tsv 
Data uploaded for split 4 to the cloud at Benchmarking/Processed/ Processed_final_split_4_01122024.tsv 
Data saved locally for split 5 with filename: Processed_final_split_5_01122024.tsv 
Data uploaded for split 5 to the cloud at Benchmarking/Processed/ Processed_final_split_5_01122024.tsv 
All splits saved and uploaded successfully.
