<a href="https://colab.research.google.com/github/squinton-gcu/Data-Science/blob/main/Processing_Module_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Processing Module via R
This module runs the processing and outlier detection steps of the pipeline.
Input is the four datasets.
Output is 4 processed csv files.
Graphs are generated to show the affects of normalization, and standardization.

In [42]:
# check and install all packages if needed
# if you do need to download anything, please restart the runtime
if("googledrive" %in% rownames(installed.packages()) == FALSE) {install.packages("googledrive")}
if("BiocManager" %in% rownames(installed.packages()) == FALSE) {
  install.packages("BiocManager")}
if("MAI" %in% rownames(installed.packages()) == FALSE) {BiocManager::install("MAI")}
# issue with rlang during instalation which requires it to be re-downloaded
remove.packages('rlang')
install.packages('rlang')


Removing package from ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [43]:
library('googledrive')
library('MAI')


In [44]:
if (file.exists("/usr/local/lib/python3.7/dist-packages/google/colab/_ipython.py")) { #may update python version  
                                       #occasionally
  install.packages("R.utils")
  library("R.utils")
  library("httr")
  my_check <- function() {return(TRUE)}
  reassignInPackage("is_interactive", pkgName = "httr", my_check) 
  options(rlang_interactive=TRUE)
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [45]:
drive_auth(use_oob = TRUE, cache = FALSE)

Please point your browser to the following url: 

https://accounts.google.com/o/oauth2/auth?client_id=603366585132-dpeg5tt0et3go5of2374d83ifevk5086.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code



Enter authorization code: 4/1AdQt8qj-Efe4H1mECy5uL9Wq3qoKqa7YZswwSLKWD3Nq8a5hDOL8QswqI-w


In [46]:
x <- drive_get('~/Thesis/ALZ_plasma.csv')
drive_download(x, overwrite = TRUE)
y <- drive_get('~/Thesis/ALZ_CSF2.csv')
drive_download(y, overwrite = TRUE)
z <- drive_get('~/Thesis/trauma_human.csv')
drive_download(z, overwrite = TRUE)
v <- drive_get('~/Thesis/rat_stress.csv')
drive_download(v, overwrite = TRUE)





[32m✔[39m The input `path` resolved to exactly 1 file.

File downloaded:

[36m•[39m [36mALZ_plasma.csv[39m [90m<id: 1U7njj5PBdw9bSX0J_e-Ri88Vfz2kQf87>[39m

Saved locally as:

[36m•[39m [34mALZ_plasma.csv[39m





[32m✔[39m The input `path` resolved to exactly 1 file.

File downloaded:

[36m•[39m [36mALZ_CSF2.csv[39m [90m<id: 1cRmB2cJ2BYwipnetgcTMMpxanRAVPxmC>[39m

Saved locally as:

[36m•[39m [34mALZ_CSF2.csv[39m





[32m✔[39m The input `path` resolved to exactly 1 file.

File downloaded:

[36m•[39m [36mtrauma_human.csv[39m [90m<id: 1ida0XZX27tw3PgNFiU9b81Mgr3PRtIZx>[39m

Saved locally as:

[36m•[39m [34mtrauma_human.csv[39m





[32m✔[39m The input `path` resolved to exactly 1 file.

File downloaded:

[36m•[39m [36mrat_stress.csv[39m [90m<id: 1gjdVA1DG0k0CAl30Edo4i5HVwHQz7TVk>[39m

Saved locally as:

[36m•[39m [34mrat_stress.csv[39m



In [47]:
ALZ_plasma <- read.table("ALZ_plasma.csv", header=TRUE, row.names = 1, sep =',')
ALZ_CSF <- read.table("ALZ_CSF2.csv", header=TRUE, row.names=1, sep=',')
trauma_human <- read.table("trauma_human.csv", header=TRUE, row.names=1, sep=',')
trauma_rat <- read.table("rat_stress.csv", header=TRUE, row.names=1, sep=',')

In [48]:
processing_step_impute <- function(Input_File) {
  #remove all metabolites with more than 70% missing values
  Input_File2 <- Input_File[, which(colMeans(!is.na(Input_File)) > 0.7)]
  # take care of missing values
  imputed_File <- MAI(Input_File2, 
					MCAR_algorithm = c("Multi_nsKNN"),
					MNAR_algorithm = c("nsKNN"),
					)
  imputed_File <- as.data.frame(imputed_File)
  row.names(imputed_File) = row.names(Input_File2)
  colnames(imputed_File) = colnames(Input_File2)
  if (ncol(Input_File2) != ncol(imputed_File)) {
    imputed_File2 <- imputed_File[1:(length(imputed_File)-(ncol(imputed_File) - ncol(Input_File2)))]
  }
  else {
    imputed_File2 = imputed_File
  }
  colnames(imputed_File2) = colnames(Input_File2)

  return(imputed_File2)
}

processing_step_normalize <- function(imputedFile) {
  # normalize with log2 and change NAs to zeros
  normalized_File <- as.data.frame(log2(imputedFile))
  normalized_File[is.na(normalized_File)] <- 0
  return(normalized_File)
}

processing_step_scale <- function(normalizedFile) {
  # Scale using the z scale function from R
  scale_File <- scale(t(normalizedFile))
  scale_File <- t(scale_File)
  return(scale_File)
}


In [49]:
# will generate a few random initial exploratory graphs to ensure processing was successful
graph_normalizeVSscale <- function(normalized_table, scale_table, table_name) {
  png(paste(table_name, "_scale1.png"))
  scale1_hist <- hist(as.numeric(scale_table[1,]), main = paste("Histogram of " , table_name, "scale1"))
  dev.off()
  png(paste(table_name, "_scale12.png"))
  scale12_hist <- hist(as.numeric(scale_table[15,]), main = paste("Histogram of " , table_name, "scale2"))
  dev.off()

  png(paste(table_name, "_normalized1.png"))
  normalized1_hist <- hist(as.numeric(normalized_table[1,]), main = paste("Histogram of " , table_name, "normalized1"))
  dev.off()
  png(paste(table_name, "_normalized2.png"))
  normalized2_hist <- hist(as.numeric(normalized_table[15,]), main = paste("Histogram of " , table_name, "normalized2"))
  dev.off()
}

In [50]:
#outlier checker using IQR
outlier_checker <- function(normalized, name) {
  outlier_list = NULL
  options(warn=-1)
  for (x in 1:nrow(normalized)) {
    ALZ_Q3 <- quantile(normalized[x,], 0.75)
    ALZ_Q1<- quantile(normalized[x,], 0.21)
    ALZ_IQR <- IQR(normalized[x,])
    ALZ_outliers <-subset(normalized[x,], normalized[x,] > (as.numeric(ALZ_Q1 - 1.5*ALZ_IQR)) & normalized[x,] < (as.numeric(ALZ_Q3 + 1.5*ALZ_IQR)))
    ALZ_num_outliers <- length(ALZ_outliers[1,]) - ncol(normalized)
    outlier_list <- append(outlier_list, ALZ_num_outliers)
  }
  print(paste(name, " outlier list"))
  subset(outlier_list, outlier_list > 0)
}

In [51]:
ALZ_plasma_imputed <- processing_step_impute(ALZ_plasma)
ALZ_plasma_normalized <- processing_step_normalize(ALZ_plasma_imputed)
ALZ_plasma_scaled <- processing_step_scale(ALZ_plasma_normalized)

Estimating pattern of missingness

Imposing missingness

Generating features

Training

Predicting

Imputing



In [52]:
ALZ_csf_imputed <- processing_step_impute(ALZ_CSF)
ALZ_csf_normalized <- processing_step_normalize(ALZ_csf_imputed)
ALZ_csf_scaled <- processing_step_scale(ALZ_csf_normalized)

Estimating pattern of missingness

Imposing missingness

Generating features

Training

Predicting

Imputing



In [53]:
#no missing values to impute
trauma_human_imputed <- trauma_human
trauma_human_normalized <- processing_step_normalize(trauma_human_imputed)
trauma_human_scaled <- processing_step_scale(trauma_human_normalized)

In [54]:
#no missing values to impute
trauma_rat_imputed <- trauma_rat
trauma_rat_normalized <- processing_step_normalize(trauma_rat_imputed)
trauma_rat_scaled <- processing_step_scale(trauma_rat_normalized)

In [55]:
outlier_checker(ALZ_plasma_normalized, "ALZ_plasma")
outlier_checker(ALZ_csf_normalized, "ALZ_csf")
outlier_checker(trauma_human_normalized, "trauma_human")
outlier_checker(trauma_rat_normalized, "trauma_rat")

[1] "ALZ_plasma  outlier list"


[1] "ALZ_csf  outlier list"


[1] "trauma_human  outlier list"


[1] "trauma_rat  outlier list"


In [56]:
graph_normalizeVSscale(ALZ_plasma_normalized, ALZ_plasma_scaled, "ALZ_plasma")
graph_normalizeVSscale(ALZ_csf_normalized, ALZ_csf_scaled, "ALZ_csf")
graph_normalizeVSscale(trauma_human_normalized, trauma_human_scaled, "trauma_human")
graph_normalizeVSscale(trauma_rat_normalized, trauma_rat_scaled, "trauma_rat")

In [57]:
# save processed verisons of data frames
write.csv(ALZ_plasma_scaled, "ALZ_plasma_processed.csv", row.names=TRUE)
write.csv(ALZ_csf_scaled, "ALZ_csf_processed.csv", row.names=TRUE)
write.csv(trauma_human_scaled, "trauma_human_processed.csv", row.names=TRUE)
write.csv(trauma_rat_scaled, "trauma_rat_processed.csv", row.names=TRUE)


## References

Dekermanjian, J., Shaddox, E., N, D., y, Ghosh, D., & Kechris, K. (2022). MAI: Mechanism-Aware Imputation (1.2.0) [Computer software]. Bioconductor version: Release (3.15). https://doi.org/10.18129/B9.bioc.MAI

Duca, A. L. (2020, December 30). Data Preprocessing with scikit-learn—Missing Values. Medium. https://towardsdatascience.com/data-preprocessing-with-scikit-learn-missing-values-8dff2c266db

How to Remove Outliers in R | R-bloggers. (n.d.). Retrieved July 4, 2022, from https://www.r-bloggers.com/2021/09/how-to-remove-outliers-in-r-3/

Lee, J. Y., & Styczynski, M. P. (2018). NS-kNN: A modified k-nearest neighbors approach for imputing metabolomics data. Metabolomics, 14(12), 153. https://doi.org/10.1007/s11306-018-1451-8

Team, T. A. (n.d.). How, When, and Why Should You Normalize / Standardize / Rescale… – Towards AI. Retrieved July 3, 2022, from https://towardsai.net/p/data-science/how-when-and-why-should-you-normalize-standardize-rescale-your-data-3f083def38ff, https://towardsai.net/p/data-science/how-when-and-why-should-you-normalize-standardize-rescale-your-data-3f083def38ff

van den Berg, R. A., Hoefsloot, H. C., Westerhuis, J. A., Smilde, A. K., & van der Werf, M. J. (2006). Centering, scaling, and transformations: Improving the biological information content of metabolomics data. BMC Genomics, 7, 142. https://doi.org/10.1186/1471-2164-7-142

Wall, K. (2021, July 2). Access Google Drive Using Google Colab Running an R Kernel. Medium. https://towardsdatascience.com/access-google-drive-using-google-colab-running-an-r-kernel-3736db7835

