In [1]:
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(data.table))
suppressPackageStartupMessages(require(glmnet))


In [2]:
# --- Software and data locations ---
snpnet_dir <- '/oak/stanford/groups/mrivas/software/snpnet'
genotype_dir <- '/scratch/users/ytanigaw/tmp/snpnet/geno/array_imp_combined'
data_dir_root <- '/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/public-resources/uk_biobank/biomarkers/snpnet/data_array_imp'

# --- Compute resources ---
cpu <- 6
mem <- 60000
# mem is divided by this factor to obtain the buffer size used in configs.
mem2bufferSizeDivisionFactor <- 4

# --- Phenotype and model ---
phenotype_file <- 'biomarkers_covar.phe'
phenotype_name <- 'Cystatin_C'
family <- 'gaussian'

# --- Iteration control ---
niter <- 100
prevIter <- 0


In [3]:
# Load snpnet from the source tree at snpnet_dir.
devtools::load_all(path = snpnet_dir)


Loading snpnet


In [4]:
# This notebook expects glmnet >= 2.0.20. Print the installed version (as
# before) and additionally warn when it is older, so the check does not depend
# on someone reading the cell output.
glmnet.version <- packageVersion("glmnet")
print(glmnet.version)
if (glmnet.version < "2.0.20") {
  warning(
    "glmnet version ", as.character(glmnet.version),
    " is older than the expected minimum 2.0.20.",
    call. = FALSE
  )
}


[1] ‘2.0.20’


In [5]:
# Configure parameters: map the notebook parameters onto the dotted names used
# by the cells below and derive the input/output locations.
data_dir_root <- data_dir_root  # self-assignment; value is unchanged
phenotype.name <- phenotype_name
covariates <- NULL  # no covariates (same value as c())

phenotype.file <- file.path(data_dir_root, phenotype_file)
results.dir <- file.path(data_dir_root, phenotype.name, 'results')

print(phenotype.name)


[1] "Cystatin_C"


In [11]:
# Directory from which train.bed / val.bed are opened below.
genotype.dir <- genotype_dir

In [6]:
# --- Model / solver options ---
phenotype <- phenotype.name
nlambda <- 100
lambda.min.ratio <- NULL
glmnet.thresh <- 1e-7
standardize.variant <- FALSE
use.glmnetPlus <- (family == "gaussian")

# --- Batch sizes ---
num.snps.batch <- 1000
increase.size <- num.snps.batch / 2

# --- Early stopping ---
early.stopping <- TRUE
stopping.lag <- 2

# --- Settings bundle handed to the snpnet helpers ---
configs <- list(
  missing.rate = 0.1,
  MAF.thresh = 0.001,
  nCores = cpu,
  bufferSize = mem / mem2bufferSizeDivisionFactor,
  meta.dir = "meta",
  nlams.init = 10,
  nlams.delta = 5
)

# --- Logging, validation and output switches ---
verbose <- TRUE
KKT.verbose <- TRUE
buffer.verbose <- TRUE
validation <- TRUE
save <- FALSE # we don't want to overwrite the original output during the debug


In [7]:
# Stop right away when the resume point is already at or beyond the iteration
# limit.
if (niter <= prevIter) {
  stop("prevIter is greater or equal to the total number of iterations.")
}

# Let snpnet finalise the configs list for this run.
configs <- setup_configs_directories(
  configs, covariates, standardize.variant, early.stopping,
  stopping.lag, save, results.dir
)

# Record and report the wall-clock start of the run.
start.time.tot <- Sys.time()
cat("Start snpnet:", as.character(start.time.tot), "\n")

Start snpnet: 2019-09-30 17:55:11 


In [8]:
### --- Process phenotypes --- ###
# Read the sample identifiers, any covariates and the phenotype column from the
# phenotype table. FID and IID are read as character.
cat("Preprocessing start:", as.character(Sys.time()), "\n")
phe.master <- data.table::fread(
  phenotype.file,
  colClasses = c("FID" = "character", "IID" = "character"),
  select = c("FID", "IID", covariates, phenotype)
)
# "FID_IID" keys; later cells match these against the genotype row names.
cat_ids <- paste(phe.master$FID, phe.master$IID, sep = "_")
# rownames(phe.master) <- phe.master$ID
if (is.null(family)) {
  # No family given: a phenotype made up only of 0/1/2/-9 is treated as binary,
  # anything else as quantitative.
  if (all(phe.master[[phenotype]] %in% c(0, 1, 2, -9))) {
    family <- "binomial"
  } else {
    family <- "gaussian"
  }
}
if (family == "binomial") {
  # Shift the binary coding down by one, but leave the missing code (-9)
  # untouched. Subtracting 1 from every value would turn -9 into -10, and the
  # later `!= -9` filters would then keep samples with a missing phenotype.
  pheno.raw <- phe.master[[phenotype]]
  phe.master[[phenotype]] <- ifelse(pheno.raw %in% -9, -9, pheno.raw - 1)
}

Preprocessing start: 2019-09-30 17:55:29 


In [9]:
### --- Check whether to use glmnet or glmnetPlus --- ###
  # Resolve which backend to use, then save that backend's glmnet.control()
  # values in glmnet.settings and set fdev = 0 and devmax = 1 for the fit.
  # checkGlmnetPlus() is not defined in this notebook — presumably it comes
  # from the snpnet code loaded above; confirm what it returns when glmnetPlus
  # is unavailable.
  use.glmnetPlus <- checkGlmnetPlus(use.glmnetPlus, family)
  if (use.glmnetPlus) {
    # glmnet.control() with no arguments is used to capture the settings that
    # are passed back through do.call() for restoration.
    glmnet.settings <- glmnetPlus::glmnet.control()
    # NOTE(review): on.exit() is documented for use inside functions. This cell
    # runs at notebook top level, so it is unclear when (or whether) the
    # restore expression fires here — confirm, or restore manually with
    # do.call(glmnetPlus::glmnet.control, glmnet.settings) when done.
    on.exit(do.call(glmnetPlus::glmnet.control, glmnet.settings))
    glmnetPlus::glmnet.control(fdev = 0, devmax = 1)
  } else {
    # Same save / restore / override sequence for plain glmnet.
    glmnet.settings <- glmnet::glmnet.control()
    # NOTE(review): same on.exit() caveat as in the glmnetPlus branch.
    on.exit(do.call(glmnet::glmnet.control, glmnet.settings))
    glmnet::glmnet.control(fdev = 0, devmax = 1)
  }

Loading required namespace: glmnetPlus


In [12]:
### --- Process genotypes --- ###
# Open the training genotypes and keep their row names and row count.
chr.train <- BEDMatrixPlus(file.path(genotype.dir, "train.bed"))
ids.chr.train <- rownames(chr.train)
n.chr.train <- nrow(chr.train)

# Do the same for the validation genotypes when validation is switched on.
if (validation) {
  chr.val <- BEDMatrixPlus(file.path(genotype.dir, "val.bed"))
  ids.chr.val <- rownames(chr.val)
  n.chr.val <- nrow(chr.val)
}

Extracting number of samples and rownames from train.fam...
Extracting number of variants and colnames from train.bim...
Extracting number of samples and rownames from val.fam...
Extracting number of variants and colnames from val.bim...


In [None]:
### --- Prepare the feature matrix --- ###
# Training rows whose phenotype is not the missing code (-9).
rowIdx.subset.train <- which(
  ids.chr.train %in% cat_ids[phe.master[[phenotype]] != -9]
)
n.subset.train <- length(rowIdx.subset.train)

# Per-variant statistics over the selected training rows.
stats <- computeStats(
  chr.train, rowIdx.subset.train,
  stat = c("pnas", "means", "sds"),
  path = file.path(results.dir, configs[["meta.dir"]]),
  save = save, configs = configs,
  verbose = verbose, buffer.verbose = buffer.verbose
)

# Phenotype rows arranged in the order of the training genotype rows.
phe.train <- phe.master[match(ids.chr.train, cat_ids), ]
features.train <- if (length(covariates) > 0) {
  phe.train[, covariates, with = FALSE][rowIdx.subset.train, ]
} else {
  NULL
}

if (validation) {
  # Validation rows whose phenotype is not the missing code (-9).
  rowIdx.subset.val <- which(
    ids.chr.val %in% cat_ids[phe.master[[phenotype]] != -9]
  )
  n.subset.val <- length(rowIdx.subset.val)

  # Phenotype rows arranged in the order of the validation genotype rows.
  phe.val <- phe.master[match(ids.chr.val, cat_ids), ]
  features.val <- if (length(covariates) > 0) {
    phe.val[, covariates, with = FALSE][rowIdx.subset.val, ]
  } else {
    NULL
  }
}
