Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: single ref level string splitting in tabulate viz #116

Merged
merged 12 commits on May 1, 2023
168 changes: 115 additions & 53 deletions q2_composition/_ancombc.py
Expand Up @@ -9,10 +9,11 @@
import tempfile
import pandas as pd
import os
import json
import formulaic

import qiime2
from qiime2.metadata import NumericMetadataColumn
from qiime2.metadata import NumericMetadataColumn, CategoricalMetadataColumn

from ._format import DataLoafPackageDirFmt

Expand Down Expand Up @@ -72,11 +73,37 @@ def _leaf_collector(term):
return _leaf_collector(term[1]) + _leaf_collector(term[2])


def _get_ref_level_defaults_from_formula_terms(metadata, term,
                                               reference_levels):
    """Append the default reference level for ``term`` to ``reference_levels``.

    The default level is the alphabetically (sort-order) first value in the
    metadata column named ``term``. The pair is encoded as 'column::value'.

    Parameters
    ----------
    metadata : qiime2.Metadata
        Sample metadata containing a column named ``term``.
    term : str
        Formula term (metadata column name) to pull the default level from.
    reference_levels : list of str
        Accumulated 'column::value' pairs; mutated in place.

    Returns
    -------
    list of str
        The same list object, with the new pair appended (returned for
        call-site clarity; the list is also mutated in place).
    """
    # .iloc[0] takes the first row positionally. Plain `[0]` on a Series
    # whose index is sample IDs (strings) relies on pandas' deprecated
    # positional-fallback indexing and breaks on newer pandas releases.
    term_alpha_value = (metadata.get_column(term)
                        .to_dataframe()
                        .sort_values(term)[term]
                        .iloc[0])
    ref_level_pair = term + '::' + str(term_alpha_value)
    reference_levels.append(ref_level_pair)

    return reference_levels


Comment on lines +76 to +86
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since reference_levels is a list, and lists are mutable, the value reference_levels has after appending on line 81 will also be its value outside of this function. Python passes object references by value (call-by-sharing), so a mutation made to a list inside a function is visible to the caller; returning the mutated list is therefore redundant, but it might be worth keeping for clarity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, thanks @Oddant1 - I think I'll leave it as-is for clarity's sake, unless @ebolyen thinks it's unnecessary?

def _ancombc(table, metadata, formula, p_adj_method, prv_cut, lib_cut,
reference_levels, neg_lb, tol, max_iter, conserve, alpha):

meta = metadata.to_dataframe()

md_column_types = {}
for name, attrs in metadata.columns.items():
# MetadataColumn type
if attrs[0] == 'numeric':
md_column_types[name] = 'numeric'
elif attrs[0] == 'categorical':
md_column_types[name] = 'categorical'
# deadman switch in case we ever add any other md column types
else:
raise TypeError('Unexpected MetadataColumn type: "%s"'
' Expected types are either "categorical" or'
' "numeric".' % attrs[0])

md_column_types_json = json.dumps(md_column_types)

# error on IDs found in table but not in metadata
missing_ids = table.index.difference(meta.index).values

Expand All @@ -93,59 +120,93 @@ def _ancombc(table, metadata, formula, p_adj_method, prv_cut, lib_cut,
for term in formula_terms:
metadata.get_column(term)

# Pulling default reference levels based on formula terms
if reference_levels is None:
reference_levels = []
for term in formula_terms:
if isinstance(metadata.get_column(term),
CategoricalMetadataColumn):
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
reference_levels = \
_get_ref_level_defaults_from_formula_terms(
metadata=metadata, term=term,
reference_levels=reference_levels)

if isinstance(reference_levels, str):
reference_levels = [reference_levels]

# column & level validation for the reference_levels parameter
if reference_levels is not None:
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
for i in reference_levels:
column = i.split('::')[0]
level_value = i.split('::')[1]

# check that reference_level columns are present in the metadata
ref_column = metadata.get_column(column)

# check that each chosen column contains discrete values
if isinstance(ref_column, NumericMetadataColumn):
raise TypeError('One of the `reference_levels` columns is not'
' a categorical Metadata column. Please make'
' sure that all chosen reference level columns'
' are categorical, and not numeric.'
' Non-categorical column selected:'
' %s' % column)

if level_value not in pd.unique(meta[column].values):
raise ValueError('Value provided in `reference_levels`'
' parameter not found in the associated'
' column within the metadata. Please make'
' sure each column::value pair is present'
' within the metadata file.'
' \n\n'
' column::value pair with a value that was'
' not found: "%s"' % i)

# check that reference_level columns are also in the formula
if column not in formula_terms:
raise ValueError('`reference_levels` column "%s" was not found'
' within the `formula` terms.' % column)

# check that IDs associated with chosen reference level(s) are
# present within the input table
level_value_idx = meta.index[meta[column] == level_value]
table_idx = table.index

if level_value_idx.intersection(table_idx).empty:
raise ValueError('Value provided in `reference_levels`'
' parameter not associated with any IDs'
' in the feature table. Please make sure'
' the value(s) selected in each'
' column::value pair are associated with'
' IDs present in the feature table.'
' \n\n'
' Value not associated with any IDs in'
' the table: "%s"' % level_value,
' IDs not found in table:'
' "%s"' % level_value_idx)

else:
reference_levels = ''
reference_level_columns = []
for level in reference_levels:
try:
column, level_value = level.split('::')
except Exception:
raise ValueError('Too many column-value pair separators found'
' (`::`) in the following `reference_level`:'
' "%s"' % level)
# check that multiple values for the same column aren't provided
if column in reference_level_columns:
raise ValueError('Multiple `reference_level` pairs with the same'
' column were provided. Please only include one'
' `reference_level` pair per column from the'
' terms in the formula. `reference_level` column'
' with multiple groups that was included:'
' "%s"' % term)
else:
reference_level_columns.append(column)

# check that reference_level columns are present in the metadata
ref_column = metadata.get_column(column)

# check that each chosen column contains discrete values
if isinstance(ref_column, NumericMetadataColumn):
raise TypeError('One of the `reference_levels` columns is not'
' a categorical Metadata column. Please make'
' sure that all chosen reference level columns'
' are categorical, and not numeric.'
' Non-categorical column selected:'
' %s' % column)

if level_value not in pd.unique(meta[column].values):
raise ValueError('Value provided in `reference_levels`'
' parameter not found in the associated'
' column within the metadata. Please make'
' sure each column::value pair is present'
' within the metadata file.'
' \n\n'
' column::value pair with a value that was'
' not found: "%s"' % level)

# check that reference_level columns are also in the formula
if column not in formula_terms:
raise ValueError('`reference_levels` column "%s" was not found'
' within the `formula` terms.' % column)

# check that IDs associated with chosen reference level(s) are
# present within the input table
level_value_idx = meta.index[meta[column] == level_value]
table_idx = table.index

if level_value_idx.intersection(table_idx).empty:
raise ValueError('Value provided in `reference_levels`'
' parameter not associated with any IDs'
' in the feature table. Please make sure'
' the value(s) selected in each'
' column::value pair are associated with'
' IDs present in the feature table.'
' \n\n'
' Value not associated with any IDs in'
' the table: "%s"' % level_value,
' IDs not found in table:'
' "%s"' % level_value_idx)

# Adding any column::value defaults from formula terms
# if not included in ref_levels param
for term in formula_terms:
if term not in reference_level_columns:
reference_levels = \
_get_ref_level_defaults_from_formula_terms(
metadata=metadata, term=term,
reference_levels=reference_levels)

with tempfile.TemporaryDirectory() as temp_dir_name:
biom_fp = os.path.join(temp_dir_name, 'input.biom.tsv')
Expand All @@ -159,6 +220,7 @@ def _ancombc(table, metadata, formula, p_adj_method, prv_cut, lib_cut,
cmd = ['run_ancombc.R',
'--inp_abundances_path', biom_fp,
'--inp_metadata_path', meta_fp,
'--md_column_types', md_column_types_json,
'--formula', str(formula),
'--p_adj_method', p_adj_method,
'--prv_cut', str(prv_cut),
Expand Down
18 changes: 14 additions & 4 deletions q2_composition/_dataloaf_tabulate/_visualizer.py
Expand Up @@ -45,11 +45,21 @@ def tabulate(output_dir: str, data: DataLoafPackageDirFmt):
slice_contents.append(slice_html)

slice_tables = zip(slice_names, slice_contents)

# Filling in the table that will appear on index.html
context = {
'intercept': slice_md_json['metadata']['intercept_groups'],
'tables': slice_tables
}
intercept = slice_md_json['metadata']['intercept_groups']
if isinstance(intercept, str):
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
intercept = [intercept]

if len(intercept) == 1:
context = {
'intercept_single': intercept[0],
'tables': slice_tables
}
else:
context = {
'intercept_multi': intercept,
'tables': slice_tables
}
# Render the results using q2templates
q2templates.render(index, output_dir, context=context)
12 changes: 9 additions & 3 deletions q2_composition/_dataloaf_tabulate/assets/index.html
Expand Up @@ -92,19 +92,25 @@
{% if name == "lfc" %}
<h1>Log-Fold Change (LFC)</h1>
<p>Positive numbers indicate enrichment relative to the intercept,
negative numbers indicate depletion relative to the intercept.</p>
negative numbers indicate depletion relative to the intercept.</p><br>
{% elif name == "p_val" %}
<h1><i>p</i>-Values</h1>
{% elif name == "q_val" %}
<h1><i>q</i>-Values</h1>
<h1><i>q</i>-Values: FDR Corrected <i>p</i>-Values</h1>
{% elif name == "se" %}
<h1>Standard Error</h1>
{% elif name == "w" %}
<h1><i>W</i>-Scores: Log-Fold Change divided by the Standard Error</h1>
{% else %}
<h1>{{ name }}</h1>
{% endif %}
<p>Groups used to define the intercept: {{ ", ".join(intercept) }} (and any numerical metadata columns)</p>
{% if intercept_single %}
<p><b>Groups used to define the intercept:</b></p>
<p>{{ intercept_single }} (and any numerical metadata columns)</p>
{% else %}
<p><b>Groups used to define the intercept:</b></p>
<p>{{ ", ".join(intercept_multi) }} (and any numerical metadata columns)</p>
{% endif %}
<table class="dataframe table table-striped table-hover" border="0">{{ table }}</table>
</div>
{% endfor %}
Expand Down
64 changes: 38 additions & 26 deletions q2_composition/assets/run_ancombc.R
Expand Up @@ -16,6 +16,7 @@ suppressWarnings(library(tidyverse))
suppressWarnings(library(ANCOMBC))
library("optparse")
library("frictionless")
library("jsonlite")

# load arguments -----------------
cat(R.version$version.string, "\n")
Expand All @@ -25,6 +26,8 @@ option_list <- list(
type = "character"),
make_option("--inp_metadata_path", action = "store", default = "NULL",
type = "character"),
make_option("--md_column_types", action = "store", default = "NULL",
type = "character"),
make_option("--formula", action = "store", default = "NULL",
type = "character"),
make_option("--p_adj_method", action = "store", default = "NULL",
Expand Down Expand Up @@ -52,6 +55,7 @@ opt <- parse_args(OptionParser(option_list = option_list))
# Assign each arg (in positional order) to an appropriately named R variable
inp_abundances_path <- opt$inp_abundances_path
inp_metadata_path <- opt$inp_metadata_path
md_column_types <- opt$md_column_types
formula <- opt$formula
p_adj_method <- opt$p_adj_method
prv_cut <- as.numeric(opt$prv_cut)
Expand All @@ -71,45 +75,53 @@ if (!file.exists(inp_abundances_path)) {
otu_file <- t(read.delim(inp_abundances_path, check.names = FALSE,
row.names = 1))
}

if (!file.exists(inp_metadata_path)) {
errQuit("Metadata file path does not exist.")
} else {
metadata_file <- read.delim(inp_metadata_path, check.names = FALSE,
row.names = 1)
}

otu <- otu_table(otu_file, taxa_are_rows = TRUE)
# convert column types to numeric/categorical as specified in metadata
md <- sample_data(metadata_file)
row.names(md) <- rownames(metadata_file)

if (reference_levels == "") {
reference_levels <- NULL
md_column_types <- fromJSON(md_column_types)

for (i in seq(1, length(md_column_types))) {
if (md_column_types[i] == "numeric") {
md[[names(md_column_types[i])]] <-
as.numeric(md[[names(md_column_types[i])]])
} else if (md_column_types[i] == "categorical") {
md[[names(md_column_types[i])]] <-
as.character(md[[names(md_column_types[i])]])
}
}

otu <- otu_table(otu_file, taxa_are_rows = TRUE)

intercept_groups <- c()
# split the reference_levels param into each column and associated level order
if (!is.null(reference_levels)) {
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
level_vectors <- unlist(strsplit(reference_levels, ", "))

for (i in level_vectors) {
column <- unlist(strsplit(i, "::"))[1]
column <- gsub("\\'", "", column)
column <- gsub("\\]", "", column)
column <- gsub("\\[", "", column)

intercept_vector <- unlist(strsplit(i, "::"))[2]
intercept_vector <- unlist(strsplit(intercept_vector, ","))
intercept_vector <- gsub("\\'", "", intercept_vector)
intercept_vector <- gsub("\\]", "", intercept_vector)
intercept_vector <- gsub("\\[", "", intercept_vector)

intercept_groups <- append(intercept_groups,
paste(column, intercept_vector, sep = "::"))

# handling formula input(s)
md[[column]] <- factor(md[[column]])
md[[column]] <- relevel(md[[column]], ref = intercept_vector)
}
level_vectors <- unlist(strsplit(reference_levels, ", "))

for (i in level_vectors) {
column <- unlist(strsplit(i, "::"))[1]
column <- gsub("\\'", "", column)
column <- gsub("\\]", "", column)
column <- gsub("\\[", "", column)

intercept_vector <- unlist(strsplit(i, "::"))[2]
intercept_vector <- unlist(strsplit(intercept_vector, ","))
intercept_vector <- gsub("\\'", "", intercept_vector)
intercept_vector <- gsub("\\]", "", intercept_vector)
intercept_vector <- gsub("\\[", "", intercept_vector)

intercept_groups <- append(intercept_groups,
paste(column, intercept_vector, sep = "::"))

# handling formula input(s)
md[[column]] <- factor(md[[column]])
md[[column]] <- relevel(md[[column]], ref = intercept_vector)
}

# create phyloseq object for use in ancombc
Expand Down