## Reset env - import/install package

In [1]:
# Start from a clean workspace: remove every object that ls() reports.
# Names starting with "." are not listed by ls(), so hidden objects are kept.
rm(list = ls())
# Run the garbage collector, reset the "max used" counters and report memory usage.
gc(reset = TRUE)

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,595856,31.9,1271675,68,595856,31.9
Vcells,1066817,8.2,8388608,64,1066817,8.2


In [2]:
# sources from which this notebook was built:
#   R Packages Vignettes: 
#     https://bioconductor.org/packages/release/bioc/html/GEOquery.html
#     https://bioconductor.org/packages/release/bioc/html/minfi.html
#     https://bioconductor.org/packages/release/workflows/html/methylationArrayAnalysis.html
#   Others:
#     https://vbaliga.github.io/verify-that-r-packages-are-installed-and-loaded/
    


# Packages attached by this notebook.
packages <- c("BiocManager", "GEOquery", "minfi", "purrr", "strex")

# BiocManager is the only installer needed from CRAN directly. GEOquery, minfi
# and the 450k manifest are Bioconductor packages, which install.packages()
# cannot find in the default CRAN repository; BiocManager::install() handles
# both Bioconductor and CRAN packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

# Install each package only when it is missing, then attach it. library() is
# used instead of require() so that a package that is still unavailable stops
# the notebook here rather than failing later with a confusing error.
package.check <- lapply(
    packages,
    FUN = function(pkg) {
        if (!requireNamespace(pkg, quietly = TRUE)) {
            BiocManager::install(pkg, update = FALSE, ask = FALSE)
        }
        library(pkg, character.only = TRUE)
    }
)

# Array manifest attached later with library(); install it only when absent
# instead of re-installing it on every run.
if (!requireNamespace("IlluminaHumanMethylation450kmanifest", quietly = TRUE)) {
    BiocManager::install("IlluminaHumanMethylation450kmanifest",
                         update = FALSE, ask = FALSE)
}

Loading required package: BiocManager

"package 'BiocManager' was built under R version 4.2.0"
Bioconductor version '3.14' is out-of-date; the current release version '3.15'
  is available with R version '4.2'; see https://bioconductor.org/install

Loading required package: GEOquery

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory materia

---

## Data to load or download

In [3]:
# GEO series processed by this notebook. Accessions mentioned here:
#   CHARGE and Kabuki: GSE97362
#   Sotos            : GSE74432
GEO <- "GSE97362"

# FALSE: just load what is already on disk, TRUE: download and then load.
# Written as FALSE (not F) because F is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
DOWNLOAD_GEO <- FALSE

## Metadata - GEOquery

In [4]:
library(GEOquery)

# getGEO() stores / re-uses the series matrix file in `destdir`; create the
# directory first so that a first run without a local cache does not fail.
geo_cache_dir <- "./data/"
dir.create(geo_cache_dir, showWarnings = FALSE, recursive = TRUE)

# Fetch (or read from the cache) the series matrix without the platform
# annotation, take the first element of the result and keep its phenotype
# table, one row per sample.
geo_series <- getGEO(GEO, GSEMatrix = TRUE, getGPL = FALSE, destdir = geo_cache_dir)
metadata <- pData(geo_series[[1]])

Found 1 file(s)

GSE97362_series_matrix.txt.gz

Using locally cached version: ./data//GSE97362_series_matrix.txt.gz



In [5]:
# Show all metadata fields of a single subject (row 10), transposed so that
# the fields are listed vertically.
example_row <- 10
t(metadata[example_row, ])

Unnamed: 0,GSM2562708
title,CHD7-10 whole blood CHD7 LOF genomic DNA
geo_accession,GSM2562708
status,Public on May 03 2017
submission_date,Apr 03 2017
last_update_date,Oct 22 2020
type,genomic
channel_count,1
source_name_ch1,whole blood genomic DNA
organism_ch1,Homo sapiens
characteristics_ch1,gender: female


In [6]:
library(purrr)
library(strex)



# Reduce the raw GEO phenotype table to the columns kept for later steps and
# derive `disease_id` (plus `age` / `age_cat` for GSE97362). Column names are
# series-specific, so each supported accession has its own branch.
if (GEO == "GSE97362") {
    metadata <- metadata[, c("geo_accession", "description", "disease state:ch1", "gender:ch1", "age (years):ch1")]
    colnames(metadata) <- c("geo_accession", "description", "disease_state", "gender", "age")

    # - DISEASE ID
    # Part of `description` before the first "-". vapply() keeps a missing
    # piece as a real NA instead of the literal strings "NULL" / "NA".
    description_parts <- strsplit(metadata$description, split = "-", fixed = TRUE)
    metadata$disease_id <- vapply(description_parts, `[`, character(1), 1)
    # %in% never yields NA and assigning into the column (rather than into a
    # row subset) also works when no row matches.
    metadata$disease_id[metadata$disease_state %in% "Control"] <- "CTRL"

    # - AGE CATEGORY
    # str_extract_numbers() returns one numeric vector per sample; an entry is
    # usable only if it holds exactly one number, anything else becomes NA.
    age_numbers <- str_extract_numbers(metadata$age, decimals = TRUE)
    metadata$age <- vapply(
        age_numbers,
        function(found) if (length(found) == 1L) round(as.numeric(found)) else NA_real_,
        numeric(1)
    )

    # samples without a usable age are removed (and reported)
    bad_samples <- metadata$geo_accession[is.na(metadata$age)]
    if (length(bad_samples) > 0) {
        message("Dropping ", length(bad_samples), " sample(s) without a usable age: ",
                paste(bad_samples, collapse = ", "))
    }
    metadata <- metadata[!metadata$geo_accession %in% bad_samples, ]

    # cut by levels: (-1,14] child, (14,24] youth, (24,64] adults, (64,150] senior
    metadata$age_cat <- cut(metadata$age, breaks = c(-1, 14, 24, 64, 150), labels = c("child", "youth", "adults", "senior"))

    # just keep some columns
    metadata <- metadata[, c("geo_accession", "disease_state", "gender", "disease_id", "age_cat", "age")]

    cat("Metadata for GSE97362 ---> DONE")

} else if (GEO == "GSE74432") {
    metadata <- metadata[, c("geo_accession", "disease state:ch1", "gender:ch1", "characteristics_ch1.2")]
    colnames(metadata) <- c("geo_accession", "disease_state", "gender", "tissue")

    # - KEEP ONLY BLOOD TISSUE
    # Filter with a logical mask instead of indexing rows by accession, so the
    # result does not depend on the row names being the GEO accessions.
    metadata <- metadata[metadata$tissue %in% "tissue: whole blood", ]

    # - DISEASE ID
    # Disease states without an entry in the table get NA and are dropped.
    disease_codes <- c("Control" = "CTRL", "Sotos" = "NSD1", "NSD1 variant" = "NSD1_V")
    metadata$disease_id <- unname(disease_codes[as.character(metadata$disease_state)])
    metadata <- metadata[!is.na(metadata$disease_id), ]

    metadata <- metadata[, !colnames(metadata) %in% c("tissue")]
    cat("Metadata for GSE74432 ---> DONE")

} else {
    # No clean-up rules exist for other series: say so instead of silently
    # leaving the raw phenotype table in `metadata`.
    warning("No metadata clean-up defined for GEO series '", GEO,
            "'; metadata is left unchanged.", call. = FALSE)
}

Metadata for GSE97362 ---> DONE

In [7]:
# Sanity check: for every metadata column, report whether it still contains
# at least one missing value.

vapply(metadata, function(column) any(is.na(column)), logical(1))

---

## Raw data - minfi

#### we load the data

In [8]:
library(GEOquery)
library(minfi)
library(IlluminaHumanMethylation450kmanifest)


# Directory that holds the IDAT files of the series: "<GEO>/idat", relative to
# the working directory. Built with file.path() directly; the previous
# parse(text = ...) detour only worked because the path happened to parse and
# deparse as an R expression.
idat_dir <- file.path(GEO, "idat")

if (DOWNLOAD_GEO) {
    # get RAW_tar file (to download the raw); the untar() call below expects
    # it at "<GEO>/<GEO>_RAW.tar"
    getGEOSuppFiles(GEO)

    # untar and unzip everything
    untar(tarfile = file.path(GEO, paste0(GEO, "_RAW.tar")), exdir = idat_dir)

    # "\\.gz$" matches a literal ".gz" suffix; full.names is spelled out
    # instead of relying on partial argument matching.
    idatFiles <- list.files(idat_dir, pattern = "\\.gz$", full.names = TRUE)
    # gunzip is called for its side effect only, so the results are discarded
    invisible(lapply(idatFiles, gunzip, overwrite = TRUE))
}

# Load the Methylation Array
RGset <- read.metharray.exp(idat_dir) # R/G channel

"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with e

"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with e

"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with e

"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with e

"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with embedded nuls"
"truncating string with e

#### we reformat the colnames (samples ID)

In [9]:
# see here: https://kasperdanielhansen.github.io/genbioconductor/html/minfi.html

# Keep only the text before the first "_" of every column name, i.e. the GEO
# sample accession (from GSM2562699_9376525031_R05C01 to GSM2562699).
# Base sub() is used so this cell does not depend on purrr having been
# attached by an earlier cell, and a name can never turn into the string "NULL".
colnames(RGset) <- sub("_.*$", "", colnames(RGset))
RGset
# pd <- pData(RGset)
# pd # this phenotype data is not useful here: columns of size 0

class: RGChannelSet 
dim: 622399 235 
metadata(0):
assays(2): Green Red
rownames(622399): 10600313 10600322 ... 74810490 74810492
rowData names(0):
colnames(235): GSM2562699 GSM2562700 ... GSM2562934 GSM2562935
colData names(0):
Annotation
  array: IlluminaHumanMethylation450k
  annotation: ilmn12.hg19

#### we remove the bad samples 

In [10]:
# Keep only the array columns whose sample ID is still present in the cleaned
# metadata; samples dropped from the metadata are dropped here as well.
keep_sample <- colnames(RGset) %in% metadata$geo_accession
RGset <- RGset[, keep_sample]
RGset

class: RGChannelSet 
dim: 622399 234 
metadata(0):
assays(2): Green Red
rownames(622399): 10600313 10600322 ... 74810490 74810492
rowData names(0):
colnames(234): GSM2562699 GSM2562700 ... GSM2562934 GSM2562935
colData names(0):
Annotation
  array: IlluminaHumanMethylation450k
  annotation: ilmn12.hg19

## Save RGset & metadata

In [11]:
# Store the filtered RGset together with its metadata in one .RData file named
# after the series, e.g. "RAW_GSE97362_RGset_metadata.RData". The name is built
# with paste0() directly instead of parsing it as R code.
rdata_file <- paste0("RAW_", GEO, "_RGset_metadata.RData")
save(RGset, metadata, file = rdata_file)

---

In [12]:
# Record the R version, platform and attached package versions of this run.
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 22000)

Matrix products: default

locale:
[1] LC_COLLATE=English_Belgium.1252  LC_CTYPE=English_Belgium.1252   
[3] LC_MONETARY=English_Belgium.1252 LC_NUMERIC=C                    
[5] LC_TIME=English_Belgium.1252    

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] IlluminaHumanMethylation450kmanifest_0.4.0
 [2] strex_1.4.2                               
 [3] stringr_1.4.0                             
 [4] purrr_0.3.4                               
 [5] minfi_1.40.0                              
 [6] bumphunter_1.36.0                         
 [7] locfit_1.5-9.4                            
 [8] iterators_1.0.14                          
 [9] foreach_1.5.2                             
[10] Biostrings_2.62.0                         
[11] XVector_0.34.0                         