diff --git a/DESCRIPTION b/DESCRIPTION index f6aa0c415..005b55868 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: xcms -Version: 4.1.6 +Version: 4.1.7 Title: LC-MS and GC-MS Data Analysis Description: Framework for processing and visualization of chromatographically separated and single-spectra mass spectral data. Imports from AIA/ANDI NetCDF, @@ -46,7 +46,7 @@ Imports: methods, Biobase, BiocGenerics, - ProtGenerics (>= 1.29.1), + ProtGenerics (>= 1.35.2), lattice, RColorBrewer, plyr, @@ -62,7 +62,8 @@ Imports: Spectra (>= 1.13.2), progress, multtest, - jsonlite + jsonlite, + MetaboCoreUtils (>= 1.11.2) Suggests: BiocStyle, caTools, @@ -76,7 +77,6 @@ Suggests: MALDIquant, pheatmap, MsBackendMgf, - MetaboCoreUtils, signal Enhances: Rgraphviz, @@ -127,6 +127,7 @@ Collate: 'init.R' 'loadXcmsData.R' 'matchpeaks.R' + 'method-filterFeatures.R' 'methods-Chromatogram.R' 'methods-IO.R' 'methods-MChromatograms.R' diff --git a/NAMESPACE b/NAMESPACE old mode 100755 new mode 100644 index 134ef3f4a..aa3563bf3 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,7 @@ importFrom("utils", "capture.output", "data") import("methods") importMethodsFrom("ProtGenerics", "peaks", "chromatogram", "writeMSData", "polarity<-", "centroided", "isCentroided", "peaks<-", - "isolationWindowTargetMz", "quantify", "bin", "spectrapply") + "isolationWindowTargetMz", "quantify", "bin", "spectrapply", "filterFeatures") importClassesFrom("ProtGenerics", "Param") importFrom("BiocGenerics", "updateObject", "fileName", "subset", "dirname", "dirname<-") @@ -35,6 +35,9 @@ importFrom("S4Vectors", "split", "Rle", "DataFrame", "SimpleList", "List", importMethodsFrom("S4Vectors", "as.matrix", "mcols", "mcols<-", "extractROWS", "findMatches") importFrom("SummarizedExperiment", "SummarizedExperiment") +importFrom("SummarizedExperiment", "rowData") +importFrom("SummarizedExperiment", "rowData<-") +importFrom("SummarizedExperiment", "assay") importFrom("MsCoreUtils", "rbindFill", "closest", "i2index", "sumi", "between", "maxi", "breaks_ppm") @@ -594,3 +597,11 @@ importFrom("jsonlite", "serializeJSON", "write_json", "unserializeJSON", export("RDataParam") export("PlainTextParam") exportMethods("storeResults") + +## filtering features things +importFrom("MetaboCoreUtils", "rowRsd", "rowDratio", "rowPercentMissing", + "rowBlank") +export("RsdFilter") +export("DratioFilter") +export("PercentMissingFilter") +export("BlankFlag") diff --git a/R/DataClasses.R b/R/DataClasses.R index 5f465bccd..f24cf52b2 100644 --- a/R/DataClasses.R +++ b/R/DataClasses.R @@ -190,6 +190,7 @@ setClass("xcmsPeaks", contains = "matrix") .PROCSTEP.PEAK.FILLING <- "Missing peak filling" .PROCSTEP.CALIBRATION <- "Calibration" .PROCSTEP.FEATURE.GROUPING <- "Feature grouping" +.PROCSTEP.FEATURE.FILTERING <- "Feature filtering" .PROCSTEPS <- c( .PROCSTEP.UNKNOWN, .PROCSTEP.PEAK.DETECTION, @@ -198,7 +199,8 @@ setClass("xcmsPeaks", contains = "matrix") .PROCSTEP.RTIME.CORRECTION, .PROCSTEP.PEAK.FILLING, .PROCSTEP.CALIBRATION, - .PROCSTEP.FEATURE.GROUPING + .PROCSTEP.FEATURE.GROUPING, + .PROCSTEP.FEATURE.FILTERING ) ############################################################ diff --git a/R/method-filterFeatures.R b/R/method-filterFeatures.R new file mode 100644 index 000000000..8c708680b --- /dev/null +++ b/R/method-filterFeatures.R @@ -0,0 +1,622 @@ +#' @title Filtering of features based on conventional quality assessment +#' +#' @description +#' +#' When dealing with metabolomics results, it is often necessary to filter +#' features based on certain criteria. These criteria are typically derived +#' from statistical formulas applied to full rows of data, where each row +#' represents a feature and its abundance of signal in each samples. +#' The `filterFeatures` function filters features based on these conventional +#' quality assessment criteria. Multiple types of filtering are implemented and +#' can be defined by the `filter` argument. +#' +#' Supported `filter` arguments are: +#' +#' - [`RsdFilter`]: Calculates the relative standard deviation +#' (i.e. coefficient of variation) in abundance for each feature in QC +#' (Quality Control) samples and filters them in the input object according to +#' a provided threshold. +#' +#' - [`DratioFilter`]: Computes the D-ratio or *dispersion ratio*, defined as +#' the standard deviation in abundance for QC samples divided by the standard +#' deviation for biological test samples, for each feature and filters them +#' according to a provided threshold. +#' +#' - [`PercentMissingFilter`]: Determines the percentage of missing values for +#' each feature in the various sample groups and filters them according to a +#' provided threshold. +#' +#' - [`BlankFlag`]: Identifies features where the mean abundance in test samples +#' is lower than a specified multiple of the mean abundance of blank samples. +#' This can be used to flag features that result from contamination in the +#' solvent of the samples. A new column `possible_contaminants` is added to the +#' `featureDefinitions` (`XcmsExperiment` object) or `rowData` +#' (`SummarizedExperiment` object) reflecting this. +#' +#' For specific examples, see the help pages of the individual parameter classes +#' listed above. +#' +#' @param object `XcmsExperiment` or `SummarizedExperiment`. For an +#' `XcmsExperiment` object, the `featureValues(object)` will be evaluated, and +#' for `Summarizedesxperiment` the `assay(object, assay)`. The object will be +#' filtered. +#' +#' @param filter The parameter object selecting and configuring the type of +#' filtering. It can be one of the following classes: [`RsdFilter`], +#' [`DratioFilter`], [`PercentMissingFilter`] or [`BlankFlag`]. +#' +#' @param assay For filtering of `SummarizedExperiment` objects only. Indicates +#' which assay the filtering will be based on. Note that the features for the +#' entire object will be removed, but the computations are performed on a single +#' assay. Default is 1, which means the first assay of the `object` will +#' be evaluated. +#' +#' @param ... Optional parameters. For `object` being an `XcmsExperiment`: +#' parameters for the [featureValues()] call. +#' +#' @name filterFeatures +#' +#' @author Philippine Louail +#' +#' @importFrom MetaboCoreUtils rowRsd rowDratio rowPercentMissing rowBlank +#' +#' @importMethodsFrom ProtGenerics filterFeatures +#' +#' @references +#' +#' Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +#' Dunn WB. Guidelines and considerations for the use of system suitability +#' and quality control samples in mass spectrometry assays applied in +#' untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +#' doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +#' PMCID: PMC5960010. +#' +#' @examples +#' ## See the vignettes for more detailed examples +#' library(MsExperiment) +#' +#' ## Load a test data set with features defined. +#' test_xcms <- loadXcmsData() +#' ## Set up parameter to filter based on coefficient of variation. By setting +#' ## the filter such as below, features that have a coefficient of variation +#' ## superior to 0.3 in QC samples will be removed from the object `test_xcms` +#' ## when calling the `filterFeatures` function. +#' +#' rsd_filter <- RsdFilter(threshold = 0.3, +#' qcIndex = sampleData(test_xcms)$sample_type == "QC") +#' +#' filtered_data_rsd <- filterFeatures(object = test_xcms, filter = rsd_filter) +#' +#' ## Set up parameter to filter based on D-ratio. By setting the filter such +#' ## as below, features that have a D-ratio computed based on their abundance +#' ## between QC and study samples superior to 0.5 will be removed from the +#' ## object `test_xcms`. +#' +#' dratio_filter <- DratioFilter(threshold = 0.5, +#' qcIndex = sampleData(test_xcms)$sample_type == "QC", +#' studyIndex = sampleData(test_xcms)$sample_type == "study") +#' +#' filtered_data_dratio <- filterFeatures(object = test_xcms, +#' filter = dratio_filter) +#' +#' ## Set up parameter to filter based on the percent of missing data. +#' ## Parameter f should represent the sample group of samples, for which the +#' ## percentage of missing values will be evaluated. As the setting is defined +#' ## bellow, if a feature as less (or equal) to 30% missing values in one +#' ## sample group, it will be kept in the `test_xcms` object. +#' +#' missing_data_filter <- PercentMissingFilter(threshold = 30, +#' f = sampleData(test_xcms)$sample_type) +#' +#' filtered_data_missing <- filterFeatures(object = test_xcms, +#' filter = missing_data_filter) +#' +#' ## Set up parameter to flag possible contaminants based on blank samples' +#' ## abundance. By setting the filter such as below, features that have mean +#' ## abundance ratio between blank(here use study as an example) and QC +#' ## samples less than 2 will be marked as `TRUE` in an extra column named +#' ## `possible_contaminants` in the `featureDefinitions` table of the object +#' ## `test_xcms`. +#' +#' filter <- BlankFlag(threshold = 2, +#' qcIndex = sampleData(test_xcms)$sample_type == "QC", +#' blankIndex = sampleData(test_xcms)$sample_type == "study") +#' filtered_xmse <- filterFeatures(test_xcms, filter) +#' +#' @md +NULL + +#' @title Filter features based on their coefficient of variation +#' +#' @name RsdFilter +#' +#' @export +#' +#' @family Filter features in xcms +#' +#' @description +#' The `RsdFilter` class and methods enable users to filter features from an +#' `XcmsExperiment` or `SummarizedExperiment` object based on their relative +#' standard deviation (coefficient of variation) for a specified threshold. +#' +#' This `filter` is part of the possible dispatch of the generic function +#' `filterFeatures`. Features *above* (`>`) the user-input threshold will be +#' removed from the entire dataset. +#' +#' @param threshold `numeric` value representing the threshold. Features with a +#' coefficient of variation *strictly higher* (`>`) than this will be removed +#' from the entire dataset. +#' +#' @param qcIndex `integer` (or `logical`) vector corresponding to the indices +#' of QC samples. +#' +#' @param na.rm `logical` indicates whether missing values (`NA`) should be +#' removed prior to the calculations. +#' +#' @param mad `logical` indicates whether the *Median Absolute Deviation* (MAD) +#' should be used instead of the standard deviation. This is suggested for +#' non-gaussian distributed data. +#' +#' @inheritParams filterFeatures +#' +#' @return For `RsdFilter`: a `RsdFilter` class. `filterFeatures` return the +#' input object minus the features that did not met the user input threshold. +#' +#' @note +#' It is assumed that the abundance values are in natural scale. Abundances in +#' log scale should be first transformed to natural scale before calculating +#' the RSD. +#' +#' @author Philippine Louail +#' +#' @importFrom MetaboCoreUtils rowRsd +#' +NULL + +#' @noRd +setClass("RsdFilter", + slots = c(threshold = "numeric", + qcIndex = "integer", + na.rm = "logical", + mad = "logical"), + contains = "Param", + prototype = prototype( + threshold = numeric(), + qcIndex = integer(), + na.rm = logical(), + mad = logical()), + validity = function(object) { + msg <- NULL + if (length(object@threshold) != 1) + msg <- c("'threshold' has to be a numeric of length 1") + msg + }) + +#' @rdname RsdFilter +#' +#' @export +RsdFilter <- function(threshold = 0.3, qcIndex = integer(), + na.rm = TRUE, mad = FALSE) { + if (is.logical(qcIndex)) + qcIndex <- which(qcIndex) + if (is.numeric(qcIndex)) + qcIndex <- as.integer(qcIndex) + new("RsdFilter", threshold = threshold, qcIndex = qcIndex, na.rm = na.rm, + mad = mad) +} + +#' @rdname RsdFilter +setMethod("filterFeatures", + signature(object = "XcmsResult", + filter = "RsdFilter"), + function(object, filter, ...){ + .check_index_range(filter@qcIndex, length(object), name = "qcIndex") + vals <- featureValues(object, ...)[, filter@qcIndex] + vals <- rowRsd(x = vals, na.rm = filter@na.rm, mad = filter@mad) + fts_idx <- which(vals <= filter@threshold) + message(length(vals) - length(fts_idx), " features were removed") + ph <- XProcessHistory(param = filter, + date. = date(), + type. = .PROCSTEP.FEATURE.FILTERING, + fileIndex = seq_along(object)) + object <- addProcessHistory(object, ph) + filterFeatureDefinitions(object, features = fts_idx) + } +) + +#' @rdname RsdFilter +setMethod("filterFeatures", + signature(object = "SummarizedExperiment", + filter = "RsdFilter"), + function(object, filter, assay = 1){ + .check_index_range(filter@qcIndex, ncol(object), name = "qcIndex") + vals <- assay(object, assay)[, filter@qcIndex] + vals <- rowRsd(vals, na.rm = filter@na.rm, mad = filter@mad) + fts_idx <- which(vals <= filter@threshold) + message(length(vals) - length(fts_idx), " features were removed") + object[fts_idx] + } +) + +#' @title Filter features based on the dispersion ratio +#' +#' @name DratioFilter +#' +#' @export +#' +#' @family Filter features in xcms +#' +#' @description +#' The `DratioFilter` class and method enable users to filter features from an +#' `XcmsExperiment` or `SummarizedExperiment` object based on the D-ratio or +#' *dispersion ratio*. This is defined as the standard deviation for QC +#' samples divided by the standard deviation for biological test samples, for +#' each feature of the object (Broadhurst et al.). +#' +#' This `filter` is part of the possible dispatch of the generic function +#' `filterFeatures`. Features *above* (`>`) the user-input threshold will be +#' removed from the entire dataset. +#' +#' @param threshold `numeric` value representing the threshold. Features with a +#' D-ratio *strictly higher* (`>`) than this will be removed from the entire +#' dataset. +#' +#' @param qcIndex `integer` (or `logical`) vector corresponding to the indices +#' of QC samples. +#' +#' @param studyIndex `integer` (or `logical`) vector corresponding of the +#' indices of study samples. +#' +#' @param na.rm `logical` Indicates whether missing values (`NA`) should be +#' removed prior to the calculations. +#' +#' @param mad `logical` Indicates whether the *Median Absolute Deviation* +#' (MAD) should be used instead of the standard deviation. This is suggested +#' for non-gaussian distributed data. +#' +#' @inheritParams filterFeatures +#' +#' @return For `DratioFilter`: a `DratioFilter` class. `filterFeatures` return +#' the input object minus the features that did not met the user input threshold +#' +#' @author Philippine Louail +#' +#' @importFrom MetaboCoreUtils rowDratio +#' +#' @references +#' +#' Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +#' Dunn WB. Guidelines and considerations for the use of system suitability +#' and quality control samples in mass spectrometry assays applied in +#' untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +#' doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +#' PMCID: PMC5960010. +#' +NULL + +#' @noRd +setClass("DratioFilter", + slots = c(threshold = "numeric", + qcIndex = "integer", + studyIndex = "integer", + na.rm = "logical", + mad = "logical"), + contains = "Param", + prototype = prototype( + threshold = numeric(), + qcIndex = integer(), + studyIndex = integer(), + na.rm = logical(), + mad = logical()), + validity = function(object) { + msg <- NULL + if (length(object@threshold) != 1) + msg <- c("'threshold' has to be a numeric of length 1") + msg + }) + +#' @rdname DratioFilter +#' +#' @export +DratioFilter <- function(threshold = 0.5, + qcIndex = integer(), + studyIndex = integer(), + na.rm = TRUE, mad = FALSE) { + if (is.logical(qcIndex)) + qcIndex <- which(qcIndex) + if(is.numeric(qcIndex)) + qcIndex <- as.integer(qcIndex) + if (is.logical(studyIndex)) + studyIndex <- which(studyIndex) + if(is.numeric(studyIndex)) + studyIndex <- as.integer(studyIndex) + new("DratioFilter", threshold = threshold, qcIndex = qcIndex, + studyIndex = studyIndex, na.rm = na.rm, mad = mad) +} + +#' @rdname DratioFilter +setMethod("filterFeatures", + signature(object = "XcmsResult", + filter = "DratioFilter"), + function(object, filter, ...){ + .check_index_range(filter@studyIndex, length(object), + name = "studyIndex") + .check_index_range(filter@qcIndex, length(object), name = "qcIndex") + x <- featureValues(object, ...)[, filter@studyIndex] + y <- featureValues(object, ...)[, filter@qcIndex] + vals <- rowDratio(x = x, y = y, + na.rm = filter@na.rm, + mad = filter@mad) + fts_idx <- which(vals <= filter@threshold) + message(length(vals) - length(fts_idx), " features were removed") + ph <- XProcessHistory(param = filter, + date. = date(), + type. = .PROCSTEP.FEATURE.FILTERING, + fileIndex = seq_along(object)) + object <- addProcessHistory(object, ph) + filterFeatureDefinitions(object, features = fts_idx) + } +) + +#' @rdname DratioFilter +setMethod("filterFeatures", + signature(object = "SummarizedExperiment", + filter = "DratioFilter"), + function(object, filter, assay = 1){ + .check_index_range(filter@studyIndex, ncol(object), + name = "studyIndex") + .check_index_range(filter@qcIndex, ncol(object), name = "qcIndex") + x <- assay(object, assay)[, filter@studyIndex] + y <- assay(object, assay)[, filter@qcIndex] + vals <- rowDratio(x = x, y = y, + na.rm = filter@na.rm, + mad = filter@mad) + fts_idx <- which(vals <= filter@threshold) + message(length(vals) - length(fts_idx), " features were removed") + object[fts_idx] + } +) + +#' @title Filter features based on the percentage of missing data +#' +#' @name PercentMissingFilter +#' +#' @export +#' +#' @family Filter features in xcms +#' +#' @description +#' The `PercentMissingFilter` class and method enable users to filter features +#' from an `XcmsExperiment` or `SummarizedExperiment` object based on the +#' percentage (values from 1 to 100) of missing values for each features in +#' different sample groups and filters them according to a provided threshold. +#' +#' This `filter` is part of the possible dispatch of the generic function +#' `filterFeatures`. Features with a percentage of missing values *higher* (`>`) +#' than the user input threshold in all sample groups will be removed (i.e. +#' features for which the proportion of missing values is below (`<=`) the +#' threshold in at least one sample group will be retained). +#' +#' @param f `vector` of the same length as the `object`, specifying the sample +#' type for each sample in the dataset. The percentage of missing values per +#' feature will be computed within each of these sample groups. Parameter `f`, +#' if not already a `factor`, will be converted to one using the factor function. +#' Samples with an `NA` as their value in `f` will be excluded from calculation. +#' +#' @param threshold `numeric` percentage (between 0 and 100) of accepted missing +#' values for a feature in one sample group. +#' +#' @inheritParams filterFeatures +#' +#' @return For `PercentMissingFilter`: a `PercentMissingFilter` class. +#' `filterFeatures` return the input object minus the features that did not met +#' the user input threshold +#' +#' @author Philippine Louail +#' +#' @importFrom MetaboCoreUtils rowPercentMissing +#' +NULL + +#' @noRd +setClass("PercentMissingFilter", + slots = c(threshold = "numeric", + f = "factor"), + contains = "Param", + prototype = prototype( + threshold = numeric(), + f = factor()), + validity = function(object) { + msg <- NULL + if (length(object@threshold) != 1) + msg <- c("'threshold' has to be a numeric of length 1") + msg + }) + +#' @rdname PercentMissingFilter +#' +#' @export +PercentMissingFilter <- function(threshold = 30, f = factor()) { + if (!is.factor(f)) + f <- factor(f) + new("PercentMissingFilter", threshold = threshold, f = f) +} + +#' @rdname PercentMissingFilter +setMethod("filterFeatures", + signature(object = "XcmsResult", + filter = "PercentMissingFilter"), + function(object, filter, ...){ + if (length(filter@f) != length(object)) + stop("'f' must be same lenght as object") + fts_idx <- c() + for (i in levels(filter@f)){ + spl_idx <- which(filter@f == i) + vals <- rowPercentMissing(featureValues(object, ...)[, spl_idx]) + fts_idx <- c(fts_idx, which(vals <= filter@threshold)) + } + fts_idx <- order(unique(fts_idx)) + message(length(vals) - length(fts_idx), " features were removed") + ph <- XProcessHistory(param = filter, + date. = date(), + type. = .PROCSTEP.FEATURE.FILTERING, + fileIndex = seq_along(object)) + object <- addProcessHistory(object, ph) + filterFeatureDefinitions(object, features = fts_idx) + } +) + +#' @rdname PercentMissingFilter +setMethod("filterFeatures", + signature(object = "SummarizedExperiment", + filter = "PercentMissingFilter"), + function(object, filter, assay = 1){ + if (length(filter@f) != ncol(object)) + stop("'f' must be same lenght as object") + fts_idx <- c() + for (i in levels(filter@f)){ + spl_idx <- which(filter@f == i) + vals <- rowPercentMissing(assay(object, assay)[, spl_idx]) + fts_idx <- c(fts_idx, which(vals <= filter@threshold)) + } + fts_idx <- order(unique(fts_idx)) + message(length(vals) - length(fts_idx), " features were removed") + object[fts_idx] + } +) + +#' @title Flag features based on the intensity in blank samples +#' +#' @name BlankFlag +#' +#' @export +#' +#' @family Filter features in xcms +#' +#' @description +#' The `BlankFlag` class and method enable users to flag features of an +#' `XcmsExperiment` or `SummarizedExperiment` object based on the relationship +#' between the intensity of a feature in blanks compared to the intensity in the +#' samples. +#' +#' This class and method are part of the possible dispatch of the +#' generic function `filterFeatures`. Features *below* (`<`) the user-input +#' threshold will be flagged by calling the `filterFeatures` function. This +#' means that an extra column will be created in `featureDefinitions` or +#' `rowData` called `possible_contaminants` with a logical value for each +#' feature. +#' +#' @param threshold `numeric` indicates the minimum difference +#' required between the mean abundance of a feature in samples compared to the +#' mean abundance of the same feature in blanks for it to not be considered a +#' possible contaminant. For example, the default threshold of 2 signifies that +#' the mean abundance of the features in samples has to be at least twice the +#' mean abundance in blanks for it to not be flagged as a possible contaminant. +#' +#' @param blankIndex `integer` (or `logical`) vector corresponding to the +#' indices of blank samples. +#' +#' @param qcIndex `integer` (or `logical`) vector corresponding to the +#' indices of quality control (QC) samples. +#' +#' @param na.rm `logical` indicates whether missing values (`NA`) should be +#' removed prior to the calculations. +#' +#' @inheritParams filterFeatures +#' +#' @return For `BlankFlag`: a `BlankFlag` class. `filterFeatures` returns +#' the input object with an added column in the features metadata called +#' `possible_contaminants` with a logical value for each feature. This is added +#' to `featureDefinitions` for `XcmsExperiment` objects and `rowData` for +#' `SummarizedExperiment` objects. +#' +#' @author Philippine Louail +#' +#' @importFrom MetaboCoreUtils rowBlank +#' +NULL + +#' @noRd +setClass("BlankFlag", + slots = c(threshold = "numeric", blankIndex = "integer", + qcIndex = "integer", na.rm = "logical"), + contains = "Param", + prototype = prototype( + threshold = numeric(), + blankIndex = integer(), + qcIndex = integer(), + na.rm = logical()), + validity = function(object) { + msg <- NULL + if (length(object@threshold) != 1) + msg <- c("'threshold' has to be a numeric of length 1") + msg + }) + +#' @rdname BlankFlag +#' +#' @export +BlankFlag <- function(threshold = 2, blankIndex = integer(), + qcIndex = integer(), na.rm = TRUE) { + if (is.logical(blankIndex)) + blankIndex <- which(blankIndex) + if(is.numeric(blankIndex)) + blankIndex <- as.integer(blankIndex) + if (is.logical(qcIndex)) + qcIndex <- which(qcIndex) + if(is.numeric(qcIndex)) + qcIndex <- as.integer(qcIndex) + new("BlankFlag", threshold = threshold, blankIndex = blankIndex, + qcIndex = qcIndex, na.rm = na.rm) +} + +#' @rdname BlankFlag +setMethod("filterFeatures", + signature(object = "XcmsResult", + filter = "BlankFlag"), + function(object, filter, ...){ + .check_index_range(filter@blankIndex, length(object), name = "blankIndex") + .check_index_range(filter@qcIndex, length(object), name = "qcIndex") + x <- featureValues(object, ...)[, filter@qcIndex] + y <- featureValues(object, ...)[, filter@blankIndex] + vals <- rowBlank(x = x, y = y, + na.rm = filter@na.rm, + threshold = filter@threshold) + message(sum(vals, na.rm = TRUE) - nrow(featureValues(object)), + " features were flagged") + featureDefinitions(object)$possible_contaminants <- vals + ph <- XProcessHistory(param = filter, + date. = date(), + type. = .PROCSTEP.FEATURE.FILTERING, + fileIndex = seq_along(object)) + object <- addProcessHistory(object, ph) + validObject(object) + object + } +) + +#' @rdname BlankFlag +setMethod("filterFeatures", + signature(object = "SummarizedExperiment", + filter = "BlankFlag"), + function(object, filter, assay = 1){ + .check_index_range(filter@blankIndex, ncol(object), + name = "blankIndex") + .check_index_range(filter@qcIndex, ncol(object), name = "qcIndex") + x <- assay(object, assay)[, filter@qcIndex] + y <- assay(object, assay)[, filter@blankIndex] + vals <- rowBlank(x = x, y = y, + na.rm = filter@na.rm, + threshold = filter@threshold) + message(sum(vals, na.rm = TRUE) - length(object), + " features were flagged") + rowData(object)$possible_contaminants <- vals + object + } +) + +#' @noRd +.check_index_range <- function(x, l, name = "") { + if (length(x) == 0 || !all(x %in% seq_len(l))) + stop(name, " should be between 1 and ", l) +} diff --git a/inst/NEWS b/inst/NEWS index 00cf4c7d5..5fc15922b 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,11 @@ +Changes in version 4.1.7 +---------------------- + +- Implementation of `filterFeatures` function with `filter` parameters: + `RsdFilter`, `DratioFilter`, `PercentMissingFilter`, `BlankFlag`. They can be + used ot filter features from `XcmsResult` and `SummarizedExperiment` objects. +- Addition of a section in the main xcms vignette to describe how to use it. + Changes in version 4.1.6 ---------------------- diff --git a/man/BlankFlag.Rd b/man/BlankFlag.Rd new file mode 100644 index 000000000..578b47dc2 --- /dev/null +++ b/man/BlankFlag.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-filterFeatures.R +\name{BlankFlag} +\alias{BlankFlag} +\alias{filterFeatures,XcmsResult,BlankFlag-method} +\alias{filterFeatures,SummarizedExperiment,BlankFlag-method} +\title{Flag features based on the intensity in blank samples} +\usage{ +BlankFlag( + threshold = 2, + blankIndex = integer(), + qcIndex = integer(), + na.rm = TRUE +) + +\S4method{filterFeatures}{XcmsResult,BlankFlag}(object, filter, ...) + +\S4method{filterFeatures}{SummarizedExperiment,BlankFlag}(object, filter, assay = 1) +} +\arguments{ +\item{threshold}{`numeric` indicates the minimum difference +required between the mean abundance of a feature in samples compared to the +mean abundance of the same feature in blanks for it to not be considered a +possible contaminant. For example, the default threshold of 2 signifies that +the mean abundance of the features in samples has to be at least twice the +mean abundance in blanks for it to not be flagged as a possible contaminant.} + +\item{blankIndex}{`integer` (or `logical`) vector corresponding to the +indices of blank samples.} + +\item{qcIndex}{`integer` (or `logical`) vector corresponding to the +indices of quality control (QC) samples.} + +\item{na.rm}{`logical` indicates whether missing values (`NA`) should be +removed prior to the calculations.} + +\item{object}{\code{XcmsExperiment} or \code{SummarizedExperiment}. For an +\code{XcmsExperiment} object, the \code{featureValues(object)} will be evaluated, and +for \code{Summarizedesxperiment} the \code{assay(object, assay)}. The object will be +filtered.} + +\item{filter}{The parameter object selecting and configuring the type of +filtering. It can be one of the following classes: \code{\link{RsdFilter}}, +\code{\link{DratioFilter}}, \code{\link{PercentMissingFilter}} or \code{\link{BlankFlag}}.} + +\item{...}{Optional parameters. For \code{object} being an \code{XcmsExperiment}: +parameters for the \code{\link[=featureValues]{featureValues()}} call.} + +\item{assay}{For filtering of \code{SummarizedExperiment} objects only. Indicates +which assay the filtering will be based on. Note that the features for the +entire object will be removed, but the computations are performed on a single +assay. Default is 1, which means the first assay of the \code{object} will +be evaluated.} +} +\value{ +For `BlankFlag`: a `BlankFlag` class. `filterFeatures` returns +the input object with an added column in the features metadata called +`possible_contaminants` with a logical value for each feature. This is added +to `featureDefinitions` for `XcmsExperiment` objects and `rowData` for +`SummarizedExperiment` objects. +} +\description{ +The `BlankFlag` class and method enable users to flag features of an +`XcmsExperiment` or `SummarizedExperiment` object based on the relationship +between the intensity of a feature in blanks compared to the intensity in the +samples. + +This class and method are part of the possible dispatch of the +generic function `filterFeatures`. Features *below* (`<`) the user-input +threshold will be flagged by calling the `filterFeatures` function. This +means that an extra column will be created in `featureDefinitions` or +`rowData` called `possible_contaminants` with a logical value for each +feature. +} +\seealso{ +Other Filter features in xcms: +\code{\link{DratioFilter}}, +\code{\link{PercentMissingFilter}}, +\code{\link{RsdFilter}} +} +\author{ +Philippine Louail +} +\concept{Filter features in xcms} diff --git a/man/DratioFilter.Rd b/man/DratioFilter.Rd new file mode 100644 index 000000000..72fb90256 --- /dev/null +++ b/man/DratioFilter.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-filterFeatures.R +\name{DratioFilter} +\alias{DratioFilter} +\alias{filterFeatures,XcmsResult,DratioFilter-method} +\alias{filterFeatures,SummarizedExperiment,DratioFilter-method} +\title{Filter features based on the dispersion ratio} +\usage{ +DratioFilter( + threshold = 0.5, + qcIndex = integer(), + studyIndex = integer(), + na.rm = TRUE, + mad = FALSE +) + +\S4method{filterFeatures}{XcmsResult,DratioFilter}(object, filter, ...) + +\S4method{filterFeatures}{SummarizedExperiment,DratioFilter}(object, filter, assay = 1) +} +\arguments{ +\item{threshold}{`numeric` value representing the threshold. Features with a +D-ratio *strictly higher* (`>`) than this will be removed from the entire +dataset.} + +\item{qcIndex}{`integer` (or `logical`) vector corresponding to the indices +of QC samples.} + +\item{studyIndex}{`integer` (or `logical`) vector corresponding of the +indices of study samples.} + +\item{na.rm}{`logical` Indicates whether missing values (`NA`) should be +removed prior to the calculations.} + +\item{mad}{`logical` Indicates whether the *Median Absolute Deviation* +(MAD) should be used instead of the standard deviation. This is suggested +for non-gaussian distributed data.} + +\item{object}{\code{XcmsExperiment} or \code{SummarizedExperiment}. For an +\code{XcmsExperiment} object, the \code{featureValues(object)} will be evaluated, and +for \code{Summarizedesxperiment} the \code{assay(object, assay)}. The object will be +filtered.} + +\item{filter}{The parameter object selecting and configuring the type of +filtering. It can be one of the following classes: \code{\link{RsdFilter}}, +\code{\link{DratioFilter}}, \code{\link{PercentMissingFilter}} or \code{\link{BlankFlag}}.} + +\item{...}{Optional parameters. For \code{object} being an \code{XcmsExperiment}: +parameters for the \code{\link[=featureValues]{featureValues()}} call.} + +\item{assay}{For filtering of \code{SummarizedExperiment} objects only. Indicates +which assay the filtering will be based on. Note that the features for the +entire object will be removed, but the computations are performed on a single +assay. Default is 1, which means the first assay of the \code{object} will +be evaluated.} +} +\value{ +For `DratioFilter`: a `DratioFilter` class. `filterFeatures` return +the input object minus the features that did not met the user input threshold +} +\description{ +The `DratioFilter` class and method enable users to filter features from an +`XcmsExperiment` or `SummarizedExperiment` object based on the D-ratio or +*dispersion ratio*. This is defined as the standard deviation for QC +samples divided by the standard deviation for biological test samples, for +each feature of the object (Broadhurst et al.). + +This `filter` is part of the possible dispatch of the generic function +`filterFeatures`. Features *above* (`>`) the user-input threshold will be +removed from the entire dataset. +} +\references{ +Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +Dunn WB. Guidelines and considerations for the use of system suitability +and quality control samples in mass spectrometry assays applied in +untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +PMCID: PMC5960010. +} +\seealso{ +Other Filter features in xcms: +\code{\link{BlankFlag}}, +\code{\link{PercentMissingFilter}}, +\code{\link{RsdFilter}} +} +\author{ +Philippine Louail +} +\concept{Filter features in xcms} diff --git a/man/PercentMissingFilter.Rd b/man/PercentMissingFilter.Rd new file mode 100644 index 000000000..dc550fe32 --- /dev/null +++ b/man/PercentMissingFilter.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-filterFeatures.R +\name{PercentMissingFilter} +\alias{PercentMissingFilter} +\alias{filterFeatures,XcmsResult,PercentMissingFilter-method} +\alias{filterFeatures,SummarizedExperiment,PercentMissingFilter-method} +\title{Filter features based on the percentage of missing data} +\usage{ +PercentMissingFilter(threshold = 30, f = factor()) + +\S4method{filterFeatures}{XcmsResult,PercentMissingFilter}(object, filter, ...) + +\S4method{filterFeatures}{SummarizedExperiment,PercentMissingFilter}(object, filter, assay = 1) +} +\arguments{ +\item{threshold}{`numeric` percentage (between 0 and 100) of accepted missing +values for a feature in one sample group.} + +\item{f}{`vector` of the same length as the `object`, specifying the sample +type for each sample in the dataset. The percentage of missing values per +feature will be computed within each of these sample groups. Parameter `f`, +if not already a `factor`, will be converted to one using the factor function. +Samples with an `NA` as their value in `f` will be excluded from calculation.} + +\item{object}{\code{XcmsExperiment} or \code{SummarizedExperiment}. For an +\code{XcmsExperiment} object, the \code{featureValues(object)} will be evaluated, and +for \code{Summarizedesxperiment} the \code{assay(object, assay)}. The object will be +filtered.} + +\item{filter}{The parameter object selecting and configuring the type of +filtering. It can be one of the following classes: \code{\link{RsdFilter}}, +\code{\link{DratioFilter}}, \code{\link{PercentMissingFilter}} or \code{\link{BlankFlag}}.} + +\item{...}{Optional parameters. For \code{object} being an \code{XcmsExperiment}: +parameters for the \code{\link[=featureValues]{featureValues()}} call.} + +\item{assay}{For filtering of \code{SummarizedExperiment} objects only. Indicates +which assay the filtering will be based on. Note that the features for the +entire object will be removed, but the computations are performed on a single +assay. Default is 1, which means the first assay of the \code{object} will +be evaluated.} +} +\value{ +For `PercentMissingFilter`: a `PercentMissingFilter` class. +`filterFeatures` return the input object minus the features that did not met +the user input threshold +} +\description{ +The `PercentMissingFilter` class and method enable users to filter features +from an `XcmsExperiment` or `SummarizedExperiment` object based on the +percentage (values from 1 to 100) of missing values for each features in +different sample groups and filters them according to a provided threshold. + +This `filter` is part of the possible dispatch of the generic function +`filterFeatures`. Features with a percentage of missing values *higher* (`>`) +than the user input threshold in all sample groups will be removed (i.e. +features for which the proportion of missing values is below (`<=`) the +threshold in at least one sample group will be retained). +} +\seealso{ +Other Filter features in xcms: +\code{\link{BlankFlag}}, +\code{\link{DratioFilter}}, +\code{\link{RsdFilter}} +} +\author{ +Philippine Louail +} +\concept{Filter features in xcms} diff --git a/man/RsdFilter.Rd b/man/RsdFilter.Rd new file mode 100644 index 000000000..24cc3625e --- /dev/null +++ b/man/RsdFilter.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-filterFeatures.R +\name{RsdFilter} +\alias{RsdFilter} +\alias{filterFeatures,XcmsResult,RsdFilter-method} +\alias{filterFeatures,SummarizedExperiment,RsdFilter-method} +\title{Filter features based on their coefficient of variation} +\usage{ +RsdFilter(threshold = 0.3, qcIndex = integer(), na.rm = TRUE, mad = FALSE) + +\S4method{filterFeatures}{XcmsResult,RsdFilter}(object, filter, ...) + +\S4method{filterFeatures}{SummarizedExperiment,RsdFilter}(object, filter, assay = 1) +} +\arguments{ +\item{threshold}{`numeric` value representing the threshold. Features with a +coefficient of variation *strictly higher* (`>`) than this will be removed +from the entire dataset.} + +\item{qcIndex}{`integer` (or `logical`) vector corresponding to the indices +of QC samples.} + +\item{na.rm}{`logical` indicates whether missing values (`NA`) should be +removed prior to the calculations.} + +\item{mad}{`logical` indicates whether the *Median Absolute Deviation* (MAD) +should be used instead of the standard deviation. This is suggested for +non-gaussian distributed data.} + +\item{object}{\code{XcmsExperiment} or \code{SummarizedExperiment}. For an +\code{XcmsExperiment} object, the \code{featureValues(object)} will be evaluated, and +for \code{Summarizedesxperiment} the \code{assay(object, assay)}. The object will be +filtered.} + +\item{filter}{The parameter object selecting and configuring the type of +filtering. It can be one of the following classes: \code{\link{RsdFilter}}, +\code{\link{DratioFilter}}, \code{\link{PercentMissingFilter}} or \code{\link{BlankFlag}}.} + +\item{...}{Optional parameters. For \code{object} being an \code{XcmsExperiment}: +parameters for the \code{\link[=featureValues]{featureValues()}} call.} + +\item{assay}{For filtering of \code{SummarizedExperiment} objects only. Indicates +which assay the filtering will be based on. Note that the features for the +entire object will be removed, but the computations are performed on a single +assay. Default is 1, which means the first assay of the \code{object} will +be evaluated.} +} +\value{ +For `RsdFilter`: a `RsdFilter` class. `filterFeatures` return the +input object minus the features that did not met the user input threshold. +} +\description{ +The `RsdFilter` class and methods enable users to filter features from an +`XcmsExperiment` or `SummarizedExperiment` object based on their relative +standard deviation (coefficient of variation) for a specified threshold. + +This `filter` is part of the possible dispatch of the generic function +`filterFeatures`. Features *above* (`>`) the user-input threshold will be +removed from the entire dataset. +} +\note{ +It is assumed that the abundance values are in natural scale. Abundances in +log scale should be first transformed to natural scale before calculating +the RSD. +} +\seealso{ +Other Filter features in xcms: +\code{\link{BlankFlag}}, +\code{\link{DratioFilter}}, +\code{\link{PercentMissingFilter}} +} +\author{ +Philippine Louail +} +\concept{Filter features in xcms} diff --git a/man/filterFeatures.Rd b/man/filterFeatures.Rd new file mode 100644 index 000000000..d55c057e4 --- /dev/null +++ b/man/filterFeatures.Rd @@ -0,0 +1,121 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-filterFeatures.R +\name{filterFeatures} +\alias{filterFeatures} +\title{Filtering of features based on conventional quality assessment} +\arguments{ +\item{object}{\code{XcmsExperiment} or \code{SummarizedExperiment}. For an +\code{XcmsExperiment} object, the \code{featureValues(object)} will be evaluated, and +for \code{Summarizedesxperiment} the \code{assay(object, assay)}. The object will be +filtered.} + +\item{filter}{The parameter object selecting and configuring the type of +filtering. It can be one of the following classes: \code{\link{RsdFilter}}, +\code{\link{DratioFilter}}, \code{\link{PercentMissingFilter}} or \code{\link{BlankFlag}}.} + +\item{assay}{For filtering of \code{SummarizedExperiment} objects only. Indicates +which assay the filtering will be based on. Note that the features for the +entire object will be removed, but the computations are performed on a single +assay. Default is 1, which means the first assay of the \code{object} will +be evaluated.} + +\item{...}{Optional parameters. For \code{object} being an \code{XcmsExperiment}: +parameters for the \code{\link[=featureValues]{featureValues()}} call.} +} +\description{ +When dealing with metabolomics results, it is often necessary to filter +features based on certain criteria. These criteria are typically derived +from statistical formulas applied to full rows of data, where each row +represents a feature and its abundance of signal in each samples. +The \code{filterFeatures} function filters features based on these conventional +quality assessment criteria. Multiple types of filtering are implemented and +can be defined by the \code{filter} argument. + +Supported \code{filter} arguments are: +\itemize{ +\item \code{\link{RsdFilter}}: Calculates the relative standard deviation +(i.e. coefficient of variation) in abundance for each feature in QC +(Quality Control) samples and filters them in the input object according to +a provided threshold. +\item \code{\link{DratioFilter}}: Computes the D-ratio or \emph{dispersion ratio}, defined as +the standard deviation in abundance for QC samples divided by the standard +deviation for biological test samples, for each feature and filters them +according to a provided threshold. +\item \code{\link{PercentMissingFilter}}: Determines the percentage of missing values for +each feature in the various sample groups and filters them according to a +provided threshold. +\item \code{\link{BlankFlag}}: Identifies features where the mean abundance in test samples +is lower than a specified multiple of the mean abundance of blank samples. +This can be used to flag features that result from contamination in the +solvent of the samples. A new column \code{possible_contaminants} is added to the +\code{featureDefinitions} (\code{XcmsExperiment} object) or \code{rowData} +(\code{SummarizedExperiment} object) reflecting this. +} + +For specific examples, see the help pages of the individual parameter classes +listed above. +} +\examples{ +## See the vignettes for more detailed examples +library(MsExperiment) + +## Load a test data set with features defined. +test_xcms <- loadXcmsData() +## Set up parameter to filter based on coefficient of variation. By setting +## the filter such as below, features that have a coefficient of variation +## superior to 0.3 in QC samples will be removed from the object `test_xcms` +## when calling the `filterFeatures` function. + +rsd_filter <- RsdFilter(threshold = 0.3, + qcIndex = sampleData(test_xcms)$sample_type == "QC") + +filtered_data_rsd <- filterFeatures(object = test_xcms, filter = rsd_filter) + +## Set up parameter to filter based on D-ratio. By setting the filter such +## as below, features that have a D-ratio computed based on their abundance +## between QC and study samples superior to 0.5 will be removed from the +## object `test_xcms`. + +dratio_filter <- DratioFilter(threshold = 0.5, + qcIndex = sampleData(test_xcms)$sample_type == "QC", + studyIndex = sampleData(test_xcms)$sample_type == "study") + +filtered_data_dratio <- filterFeatures(object = test_xcms, + filter = dratio_filter) + +## Set up parameter to filter based on the percent of missing data. +## Parameter f should represent the sample group of samples, for which the +## percentage of missing values will be evaluated. As the setting is defined +## bellow, if a feature as less (or equal) to 30\% missing values in one +## sample group, it will be kept in the `test_xcms` object. + +missing_data_filter <- PercentMissingFilter(threshold = 30, + f = sampleData(test_xcms)$sample_type) + +filtered_data_missing <- filterFeatures(object = test_xcms, + filter = missing_data_filter) + +## Set up parameter to flag possible contaminants based on blank samples' +## abundance. By setting the filter such as below, features that have mean +## abundance ratio between blank(here use study as an example) and QC +## samples less than 2 will be marked as `TRUE` in an extra column named +## `possible_contaminants` in the `featureDefinitions` table of the object +## `test_xcms`. + +filter <- BlankFlag(threshold = 2, + qcIndex = sampleData(test_xcms)$sample_type == "QC", + blankIndex = sampleData(test_xcms)$sample_type == "study") +filtered_xmse <- filterFeatures(test_xcms, filter) + +} +\references{ +Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +Dunn WB. Guidelines and considerations for the use of system suitability +and quality control samples in mass spectrometry assays applied in +untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +PMCID: PMC5960010. +} +\author{ +Philippine Louail +} diff --git a/tests/testthat/test_function-filtering.R b/tests/testthat/test_function-filtering.R new file mode 100644 index 000000000..b7e0220b0 --- /dev/null +++ b/tests/testthat/test_function-filtering.R @@ -0,0 +1,119 @@ +xmse_full <- loadXcmsData("xmse") + +test_that("RsdFilter", { + # Test default parameters + filter <- RsdFilter() + expect_is(filter, "RsdFilter") + expect_equal(filter@threshold, 0.3) + expect_equal(filter@qcIndex, integer()) + expect_equal(filter@na.rm, TRUE) + expect_equal(filter@mad, FALSE) + + #test with XcmsExperiment object + filter <- RsdFilter(qcIndex = sampleData(xmse_full)$sample_type == "QC") + filtered_xmse <- filterFeatures(xmse_full, filter) + expect_lte(nrow(featureDefinitions(filtered_xmse)), + nrow(featureDefinitions(xmse_full))) + + #test error message when index too small of too large + filter_1 <- RsdFilter(qcIndex = integer()) + filter_2 <- RsdFilter(qcIndex = c(sampleData(xmse_full)$sample_type == "QC", + TRUE)) + filter_3 <- RsdFilter(qcIndex = c(2, 15)) + expect_error(filterFeatures(xmse_full, filter_1)) + expect_error(filterFeatures(xmse_full, filter_2)) + expect_error(filterFeatures(xmse_full, filter_3)) + + #test in SummarizedExperiment object + res <- quantify(xmse_full) + filter <- RsdFilter(qcIndex = res$sample_type == "QC") + filtered_res <- filterFeatures(res, filter) + expect_lte(nrow(filtered_res), nrow(res)) + + #test same amount of features in in both object type + expect_equal(nrow(featureDefinitions(filtered_xmse)), + nrow(filtered_res)) +}) + +test_that("DratioFilter", { + # Test default parameters + filter <- DratioFilter() + expect_is(filter, "DratioFilter") + expect_equal(filter@threshold, 0.5) + expect_equal(filter@qcIndex, integer()) + expect_equal(filter@studyIndex, integer()) + expect_equal(filter@na.rm, TRUE) + expect_equal(filter@mad, FALSE) + + #test with XcmsExperiment object + filter <- DratioFilter( + qcIndex = sampleData(xmse_full)$sample_type == "QC", + studyIndex = sampleData(xmse_full)$sample_type == "study") + filtered_xmse <- filterFeatures(xmse_full, filter) + expect_lte(nrow(featureDefinitions(filtered_xmse)), + nrow(featureDefinitions(xmse_full))) + + #test in SummarizedExperiment object + res <- quantify(xmse_full) + filter <- DratioFilter(qcIndex = res$sample_type == "QC", + studyIndex = res$sample_type == "study") + filtered_res <- filterFeatures(res, filter) + expect_lte(nrow(filtered_res), nrow(res)) + + #test same amount of features in in both object type + expect_equal(nrow(featureDefinitions(filtered_xmse)), + nrow(filtered_res)) +}) + +test_that("PercentMissingFilter", { + # Test default parameters + filter <- PercentMissingFilter() + expect_is(filter, "PercentMissingFilter") + expect_equal(filter@threshold, 30) + expect_equal(filter@f, factor()) + + #test with XcmsExperiment object + filter <- PercentMissingFilter(f = sampleData(xmse_full)$sample_type) + filtered_xmse <- filterFeatures(xmse_full, filter) + expect_lte(nrow(featureDefinitions(filtered_xmse)), + nrow(featureDefinitions(xmse_full))) + + #test in SummarizedExperiment object + res <- quantify(xmse_full) + filter <- PercentMissingFilter(f = res$sample_type) + filtered_res <- filterFeatures(res, filter) + expect_lte(nrow(filtered_res), nrow(res)) + + #test same amount of features in in both object type + expect_equal(nrow(featureDefinitions(filtered_xmse)), + nrow(filtered_res)) +}) + +test_that("BlankFlag", { + # Test default parameters + filter <- BlankFlag() + expect_is(filter, "BlankFlag") + expect_equal(filter@threshold, 2) + expect_equal(filter@na.rm, TRUE) + expect_equal(filter@qcIndex, integer()) + expect_equal(filter@blankIndex, integer()) + + #test with XcmsExperiment object - using study sample as blanks + filter <- BlankFlag( + qcIndex = sampleData(xmse_full)$sample_type == "QC", + blankIndex = sampleData(xmse_full)$sample_type == "study") + filtered_xmse <- filterFeatures(xmse_full, filter) + expect_true("possible_contaminants" %in% + colnames(featureDefinitions(filtered_xmse))) + + #test in SummarizedExperiment object - using study sample as blanks + res <- quantify(xmse_full) + filter <- BlankFlag(qcIndex = res$sample_type == "QC", + blankIndex = res$sample_type == "study") + filtered_res <- filterFeatures(res, filter) + expect_true("possible_contaminants" %in% colnames(rowData(filtered_res))) + + expect_equal(sum(featureDefinitions(filtered_xmse)$possible_contaminants, + na.rm = TRUE), + sum(rowData(filtered_res)$possible_contaminants, na.rm = TRUE)) +}) diff --git a/vignettes/xcms.Rmd b/vignettes/xcms.Rmd index e0c2bcbb7..29e76a95b 100644 --- a/vignettes/xcms.Rmd +++ b/vignettes/xcms.Rmd @@ -1153,6 +1153,121 @@ properties of the mice analyzed (sex, age, litter mates etc). # Further data processing and analysis +## Quality-based filtering of features + +When dealing with metabolomics results, it is often necessary to filter +features based on certain criteria. These criteria are typically derived from +statistical formulas applied to full rows of data, where each row represents a +feature. The `filterFeatures` function provides a robust solution for filtering +features based on these conventional quality assessment criteria. It supports +multiple types of filtering, allowing users to tailor the filtering process to +their specific needs, all controlled by the `filter` argument. This function +and its implementations are applicable to both `XcmsExperiment` results objects +and `SummarizedExperiment` objects. + +We will demonstrate how to use the `filterFeatures` function to perform quality +assessment and filtering on both the `faahko` and `res` variables defined above. +The `filter` argument can accommodate various types of input, each determining the +specific type of quality assessment and filtering to be performed. + +The `RsdFilter` enable users to filter features based on their relative +standard deviation (coefficient of variation) for a specified `threshold`. It +is recommended to base the computation on quality control (QC) samples, +as demonstrated below: + +```{r} +# Set up parameters for RsdFilter +rsd_filter <- RsdFilter(threshold = 0.3, + qcIndex = sampleData(faahko)$sample_type == "QC") + +# Apply the filter to faakho object +filtered_faahko <- filterFeatures(object = faahko, filter = rsd_filter) + +# Now apply the same strategy to the res object +rsd_filter <- RsdFilter(threshold = 0.3, qcIndex = res$sample_type == "QC") +filtered_res <- filterFeatures(object = res, filter = rsd_filter, assay = "raw") +``` + +All features with an RSD (CV) strictly larger than 0.3 in QC samples were thus +removed from the data set. + +The `DratioFilter` can be used to filter features based on the D-ratio or +*dispersion ratio*, which compares the standard deviation in QC samples to that +in study samples. + +```{r} +# Set up parameters for DratioFilter +dratio_filter <- DratioFilter( + threshold = 0.5, + qcIndex = sampleData(filtered_faahko)$sample_type == "QC", + studyIndex = sampleData(filtered_faahko)$sample_type == "study") + +# Apply the filter to faahko object +filtered_faakho <- filterFeatures(object = filtered_faahko, + filter = dratio_filter) + +# Now same but for the res object +dratio_filter <- DratioFilter( + threshold = 0.5, + qcIndex = filtered_res$sample_type == "QC", + studyIndex = filtered_res$sample_type == "study") + +filtered_res <- filterFeatures(object = filtered_res, + filter = dratio_filter) +``` + +All features with an D-ratio strictly larger than 0.5 were thus removed from +the data set. + +The `PercentMissingFilter` allows to filter features based on the percentage of +missing values for each feature. This function takes as an input the parameter +`f` which is supposed to be a vector of length equal to the length of the object +(i.e. number of samples) with the sample type for each. The function then +computes the percentage of missing values per sample groups and filters +features based on this. Features with a percent of missing values larger than +the threshold in all sample groups will be removed. Another option is to base +this quality assessment and filtering only on QC samples. + +Both examples are shown below: + +```{r} +# To set up parameter `f` to filter only based on QC samples +f <- sampleData(filtered_faakho)$sample_type +f[f != "QC"] <- NA + +# To set up parameter `f` to filter per sample type excluding QC samples +f <- sampleData(filtered_faakho)$sample_type +f[f == "QC"] <- NA + +missing_filter <- PercentMissingFilter(threshold = 30, + f = f) + +# Apply the filter to faakho object +filtered_faakho <- filterFeatures(object = filtered_faakho, + filter = missing_filter) + +# Apply the filter to res object +missing_filter <- PercentMissingFilter(threshold = 30, + f = f) +filtered_res <- filterFeatures(object = filtered_res, + filter = missing_filter) +``` + +Here, no feature was removed, meaning that all the features had less than 30% +of `NA` values in at least one of the sample type. + +Although not directly relevant to this experiment, the `BlankFlag` filter can be +used to flag features based on the intensity relationship between blank and QC +samples. More information can be found in the documentation of the filter: + +```{r} +# Retrieve documentation for the main function and the specific filter. +?filterFeatures +?BlankFlag +``` + +## Normalization + Normalizing features' signal intensities is required, but at present not (yet) supported in `xcms` (some methods might be added in near future). It is advised to use the `SummarizedExperiment` returned by the `quantify` method for any