Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: single ref level string splitting in tabulate viz #116

Merged
merged 12 commits on May 1, 2023
168 changes: 115 additions & 53 deletions q2_composition/_ancombc.py
Expand Up @@ -9,10 +9,11 @@
import tempfile
import pandas as pd
import os
import json
import formulaic

import qiime2
from qiime2.metadata import NumericMetadataColumn
from qiime2.metadata import NumericMetadataColumn, CategoricalMetadataColumn

from ._format import DataLoafPackageDirFmt

Expand Down Expand Up @@ -72,11 +73,37 @@ def _leaf_collector(term):
return _leaf_collector(term[1]) + _leaf_collector(term[2])


def _get_ref_level_defaults_from_formula_terms(metadata, term,
                                               reference_levels):
    """Append the default reference level for ``term`` to ``reference_levels``.

    The default level is the alphabetically (sort-order) first value in the
    metadata column named ``term``. The pair is encoded as 'column::value'.

    Parameters
    ----------
    metadata : qiime2.Metadata
        Sample metadata containing a column named ``term``.
    term : str
        Formula term (metadata column name) to pull the default level from.
    reference_levels : list of str
        Accumulated 'column::value' pairs; mutated in place.

    Returns
    -------
    list of str
        The same list object, with the new pair appended (returned for
        call-site clarity; the list is also mutated in place).
    """
    # .iloc[0] takes the first row positionally. Plain `[0]` on a Series
    # whose index is sample IDs (strings) relies on pandas' deprecated
    # positional-fallback indexing and breaks on newer pandas releases.
    term_alpha_value = (metadata.get_column(term)
                        .to_dataframe()
                        .sort_values(term)[term]
                        .iloc[0])
    ref_level_pair = term + '::' + str(term_alpha_value)
    reference_levels.append(ref_level_pair)

    return reference_levels


Comment on lines +76 to +86
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since reference_levels is a list, and lists are mutable, the value reference_levels has after appending on line 81 will also be its value outside of this function. Python passes object references by value (call-by-sharing), so a mutation made to a list inside a function is visible to the caller; returning the mutated list is therefore redundant, but it might be worth keeping for clarity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, thanks @Oddant1 - I think I'll leave it as-is for clarity's sake, unless @ebolyen thinks it's unnecessary?

def _ancombc(table, metadata, formula, p_adj_method, prv_cut, lib_cut,
reference_levels, neg_lb, tol, max_iter, conserve, alpha):

meta = metadata.to_dataframe()

md_column_types = {}
for name, attrs in metadata.columns.items():
# MetadataColumn type
if attrs[0] == 'numeric':
md_column_types[name] = 'numeric'
elif attrs[0] == 'categorical':
md_column_types[name] = 'categorical'
# deadman switch in case we ever add any other md column types
else:
raise TypeError('Unexpected MetadataColumn type: "%s"'
' Expected types are either "categorical" or'
' "numeric".' % attrs[0])

md_column_types_json = json.dumps(md_column_types)

# error on IDs found in table but not in metadata
missing_ids = table.index.difference(meta.index).values

Expand All @@ -93,59 +120,93 @@ def _ancombc(table, metadata, formula, p_adj_method, prv_cut, lib_cut,
for term in formula_terms:
metadata.get_column(term)

# Pulling default reference levels based on formula terms
if reference_levels is None:
reference_levels = []
for term in formula_terms:
if isinstance(metadata.get_column(term),
CategoricalMetadataColumn):
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
reference_levels = \
_get_ref_level_defaults_from_formula_terms(
metadata=metadata, term=term,
reference_levels=reference_levels)

if isinstance(reference_levels, str):
reference_levels = [reference_levels]

# column & level validation for the reference_levels parameter
if reference_levels is not None:
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
for i in reference_levels:
column = i.split('::')[0]
level_value = i.split('::')[1]

# check that reference_level columns are present in the metadata
ref_column = metadata.get_column(column)

# check that each chosen column contains discrete values
if isinstance(ref_column, NumericMetadataColumn):
raise TypeError('One of the `reference_levels` columns is not'
' a categorical Metadata column. Please make'
' sure that all chosen reference level columns'
' are categorical, and not numeric.'
' Non-categorical column selected:'
' %s' % column)

if level_value not in pd.unique(meta[column].values):
raise ValueError('Value provided in `reference_levels`'
' parameter not found in the associated'
' column within the metadata. Please make'
' sure each column::value pair is present'
' within the metadata file.'
' \n\n'
' column::value pair with a value that was'
' not found: "%s"' % i)

# check that reference_level columns are also in the formula
if column not in formula_terms:
raise ValueError('`reference_levels` column "%s" was not found'
' within the `formula` terms.' % column)

# check that IDs associated with chosen reference level(s) are
# present within the input table
level_value_idx = meta.index[meta[column] == level_value]
table_idx = table.index

if level_value_idx.intersection(table_idx).empty:
raise ValueError('Value provided in `reference_levels`'
' parameter not associated with any IDs'
' in the feature table. Please make sure'
' the value(s) selected in each'
' column::value pair are associated with'
' IDs present in the feature table.'
' \n\n'
' Value not associated with any IDs in'
' the table: "%s"' % level_value,
' IDs not found in table:'
' "%s"' % level_value_idx)

else:
reference_levels = ''
reference_level_columns = []
for level in reference_levels:
try:
column, level_value = level.split('::')
except Exception:
raise ValueError('Too many column-value pair separators found'
' (`::`) in the following `reference_level`:'
' "%s"' % level)
# check that multiple values for the same column aren't provided
if column in reference_level_columns:
raise ValueError('Multiple `reference_level` pairs with the same'
' column were provided. Please only include one'
' `reference_level` pair per column from the'
' terms in the formula. `reference_level` column'
' with multiple groups that was included:'
' "%s"' % term)
else:
reference_level_columns.append(column)

# check that reference_level columns are present in the metadata
ref_column = metadata.get_column(column)

# check that each chosen column contains discrete values
if isinstance(ref_column, NumericMetadataColumn):
raise TypeError('One of the `reference_levels` columns is not'
' a categorical Metadata column. Please make'
' sure that all chosen reference level columns'
' are categorical, and not numeric.'
' Non-categorical column selected:'
' %s' % column)

if level_value not in pd.unique(meta[column].values):
raise ValueError('Value provided in `reference_levels`'
' parameter not found in the associated'
' column within the metadata. Please make'
' sure each column::value pair is present'
' within the metadata file.'
' \n\n'
' column::value pair with a value that was'
' not found: "%s"' % level)

# check that reference_level columns are also in the formula
if column not in formula_terms:
raise ValueError('`reference_levels` column "%s" was not found'
' within the `formula` terms.' % column)

# check that IDs associated with chosen reference level(s) are
# present within the input table
level_value_idx = meta.index[meta[column] == level_value]
table_idx = table.index

if level_value_idx.intersection(table_idx).empty:
raise ValueError('Value provided in `reference_levels`'
' parameter not associated with any IDs'
' in the feature table. Please make sure'
' the value(s) selected in each'
' column::value pair are associated with'
' IDs present in the feature table.'
' \n\n'
' Value not associated with any IDs in'
' the table: "%s"' % level_value,
' IDs not found in table:'
' "%s"' % level_value_idx)

# Adding any column::value defaults from formula terms
# if not included in ref_levels param
for term in formula_terms:
if term not in reference_level_columns:
reference_levels = \
_get_ref_level_defaults_from_formula_terms(
metadata=metadata, term=term,
reference_levels=reference_levels)

with tempfile.TemporaryDirectory() as temp_dir_name:
biom_fp = os.path.join(temp_dir_name, 'input.biom.tsv')
Expand All @@ -159,6 +220,7 @@ def _ancombc(table, metadata, formula, p_adj_method, prv_cut, lib_cut,
cmd = ['run_ancombc.R',
'--inp_abundances_path', biom_fp,
'--inp_metadata_path', meta_fp,
'--md_column_types', md_column_types_json,
'--formula', str(formula),
'--p_adj_method', p_adj_method,
'--prv_cut', str(prv_cut),
Expand Down
18 changes: 14 additions & 4 deletions q2_composition/_dataloaf_tabulate/_visualizer.py
Expand Up @@ -45,11 +45,21 @@ def tabulate(output_dir: str, data: DataLoafPackageDirFmt):
slice_contents.append(slice_html)

slice_tables = zip(slice_names, slice_contents)

# Filling in the table that will appear on index.html
context = {
'intercept': slice_md_json['metadata']['intercept_groups'],
'tables': slice_tables
}
intercept = slice_md_json['metadata']['intercept_groups']
if isinstance(intercept, str):
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
intercept = [intercept]

if len(intercept) == 1:
context = {
'intercept_single': intercept[0],
'tables': slice_tables
}
else:
context = {
'intercept_multi': intercept,
'tables': slice_tables
}
# Render the results using q2templates
q2templates.render(index, output_dir, context=context)
12 changes: 9 additions & 3 deletions q2_composition/_dataloaf_tabulate/assets/index.html
Expand Up @@ -92,19 +92,25 @@
{% if name == "lfc" %}
<h1>Log-Fold Change (LFC)</h1>
<p>Positive numbers indicate enrichment relative to the intercept,
negative numbers indicate depletion relative to the intercept.</p>
negative numbers indicate depletion relative to the intercept.</p><br>
{% elif name == "p_val" %}
<h1><i>p</i>-Values</h1>
{% elif name == "q_val" %}
<h1><i>q</i>-Values</h1>
<h1><i>q</i>-Values: FDR Corrected <i>p</i>-Values</h1>
{% elif name == "se" %}
<h1>Standard Error</h1>
{% elif name == "w" %}
<h1><i>W</i>-Scores: Log-Fold Change divided by the Standard Error</h1>
{% else %}
<h1>{{ name }}</h1>
{% endif %}
<p>Groups used to define the intercept: {{ ", ".join(intercept) }} (and any numerical metadata columns)</p>
{% if intercept_single %}
<p><b>Groups used to define the intercept:</b></p>
<p>{{ intercept_single }} (and any numerical metadata columns)</p>
{% else %}
<p><b>Groups used to define the intercept:</b></p>
<p>{{ ", ".join(intercept_multi) }} (and any numerical metadata columns)</p>
{% endif %}
<table class="dataframe table table-striped table-hover" border="0">{{ table }}</table>
</div>
{% endfor %}
Expand Down
64 changes: 38 additions & 26 deletions q2_composition/assets/run_ancombc.R
Expand Up @@ -16,6 +16,7 @@ suppressWarnings(library(tidyverse))
suppressWarnings(library(ANCOMBC))
library("optparse")
library("frictionless")
library("jsonlite")

# load arguments -----------------
cat(R.version$version.string, "\n")
Expand All @@ -25,6 +26,8 @@ option_list <- list(
type = "character"),
make_option("--inp_metadata_path", action = "store", default = "NULL",
type = "character"),
make_option("--md_column_types", action = "store", default = "NULL",
type = "character"),
make_option("--formula", action = "store", default = "NULL",
type = "character"),
make_option("--p_adj_method", action = "store", default = "NULL",
Expand Down Expand Up @@ -52,6 +55,7 @@ opt <- parse_args(OptionParser(option_list = option_list))
# Assign each arg (in positional order) to an appropriately named R variable
inp_abundances_path <- opt$inp_abundances_path
inp_metadata_path <- opt$inp_metadata_path
md_column_types <- opt$md_column_types
formula <- opt$formula
p_adj_method <- opt$p_adj_method
prv_cut <- as.numeric(opt$prv_cut)
Expand All @@ -71,45 +75,53 @@ if (!file.exists(inp_abundances_path)) {
otu_file <- t(read.delim(inp_abundances_path, check.names = FALSE,
row.names = 1))
}

if (!file.exists(inp_metadata_path)) {
errQuit("Metadata file path does not exist.")
} else {
metadata_file <- read.delim(inp_metadata_path, check.names = FALSE,
row.names = 1)
}

otu <- otu_table(otu_file, taxa_are_rows = TRUE)
# convert column types to numeric/categorical as specified in metadata
md <- sample_data(metadata_file)
row.names(md) <- rownames(metadata_file)

if (reference_levels == "") {
reference_levels <- NULL
md_column_types <- fromJSON(md_column_types)

for (i in seq(1, length(md_column_types))) {
if (md_column_types[i] == "numeric") {
md[[names(md_column_types[i])]] <-
as.numeric(md[[names(md_column_types[i])]])
} else if (md_column_types[i] == "categorical") {
md[[names(md_column_types[i])]] <-
as.character(md[[names(md_column_types[i])]])
}
}

otu <- otu_table(otu_file, taxa_are_rows = TRUE)

intercept_groups <- c()
# split the reference_levels param into each column and associated level order
if (!is.null(reference_levels)) {
lizgehret marked this conversation as resolved.
Show resolved Hide resolved
level_vectors <- unlist(strsplit(reference_levels, ", "))

for (i in level_vectors) {
column <- unlist(strsplit(i, "::"))[1]
column <- gsub("\\'", "", column)
column <- gsub("\\]", "", column)
column <- gsub("\\[", "", column)

intercept_vector <- unlist(strsplit(i, "::"))[2]
intercept_vector <- unlist(strsplit(intercept_vector, ","))
intercept_vector <- gsub("\\'", "", intercept_vector)
intercept_vector <- gsub("\\]", "", intercept_vector)
intercept_vector <- gsub("\\[", "", intercept_vector)

intercept_groups <- append(intercept_groups,
paste(column, intercept_vector, sep = "::"))

# handling formula input(s)
md[[column]] <- factor(md[[column]])
md[[column]] <- relevel(md[[column]], ref = intercept_vector)
}
level_vectors <- unlist(strsplit(reference_levels, ", "))

for (i in level_vectors) {
column <- unlist(strsplit(i, "::"))[1]
column <- gsub("\\'", "", column)
column <- gsub("\\]", "", column)
column <- gsub("\\[", "", column)

intercept_vector <- unlist(strsplit(i, "::"))[2]
intercept_vector <- unlist(strsplit(intercept_vector, ","))
intercept_vector <- gsub("\\'", "", intercept_vector)
intercept_vector <- gsub("\\]", "", intercept_vector)
intercept_vector <- gsub("\\[", "", intercept_vector)

intercept_groups <- append(intercept_groups,
paste(column, intercept_vector, sep = "::"))

# handling formula input(s)
md[[column]] <- factor(md[[column]])
md[[column]] <- relevel(md[[column]], ref = intercept_vector)
}

# create phyloseq object for use in ancombc
Expand Down