Merge pull request #132 from qiyunzhu/tools

made map as rank default
qiyunzhu · Aug 3, 2021 · 8b24732 · 8b24732
2 parents ac0823f + b547dcc
commit 8b24732
Show file tree

Hide file tree

Showing 14 changed files with 67 additions and 33 deletions.
diff --git a/README.md b/README.md
@@ -118,7 +118,6 @@ woltka classify \
   --coords function/coords.txt.xz \
   --map function/uniref.map.xz \
   --map function/go/process.tsv.xz \
-  --map-as-rank \
   --rank uniref,process \
   -o output_dir
 ```

diff --git a/doc/cli.md b/doc/cli.md
@@ -37,7 +37,7 @@ Option | Description
 `--lineage` | Lineage strings. Can accept Greengenes-style rank prefix.
 `--columns` | Table of classification units per rank (column).
 `--map`, `-m` | Mapping of lower classification units to higher ones.
-`--map-as-rank` | Extract rank name from mapping filename.
+`--map-as-rank/--map-no-rank` | Extract rank name from mapping filename. On by default when classifying with only mapping files.
 `--names`, `-n` | Names of classification units as defined by NCBI names.dmp or a plain map.
 
 ### Assignment

diff --git a/doc/hierarchy.md b/doc/hierarchy.md
@@ -37,7 +37,9 @@ Wolkta supports various types and formats of classification systems, as listed b
 
 5. `--map` or `-m`: Simple map of lower taxon \<tab\> higher taxon.
 
-   Flag `--map-as-rank` is to instruct the program to treat the map filename as rank. For example, with this flag, taxa in the 2nd column of `uniref.map.gz` will be given the rank "uniref".
+   Flag `--map-as-rank` (or `--map-no-rank`) instructs the program to treat (or not to treat) the map filename as a rank. For example, with `--map-as-rank`, taxa in the second column of `uniref.map.gz` will be given the rank "uniref".
+
+   If neither flag is specified, the program will automatically turn this behavior on when only simple map(s) and no other hierarchy files are provided.
 
 Compressed files are supported and automatically recognized. For example, reading the gzipped Greengenes taxonomy file is as simple as:
 
@@ -222,6 +224,31 @@ With flag `--map-as-rank`, Woltka will extract a **rank** name from the filename
 - `reaction_to_pathway.tsv` => `pathway`
 - `apple-to-orange` => `orange`
 
+If the user provides only simple map(s), this flag will be automatically turned on. For example:
+
+```bash
+woltka classify \
+  -i indir \
+  --map taxon2species.map \
+  --map species2genus.map \
+  --rank species,genus \
+  -o outdir
+```
+
+But the following won't work unless `--map-as-rank` is explicitly added, because another hierarchy file (`nodes.dmp`) is present and the flag is therefore not turned on automatically:
+
+```bash
+woltka classify \
+  -i indir \
+  --map taxid.map \
+  --nodes taxdump/nodes.dmp \
+  --names taxdump/names.dmp \
+  --rank taxid,genus \
+  -o outdir
+```
+
+One can force it off with the flag `--map-no-rank`, even when there are only mapping files.
+
 
 ## Multiple mapping
 

diff --git a/doc/kegg.md b/doc/kegg.md
@@ -12,7 +12,6 @@ woltka classify \
   --coords coords.txt.xz \
   --map    uniref/uniref.map.xz \
   --map    kegg/ko.map.xz \
-  --map-as-rank \
   --rank   ko \
   --output ko.tsv
 ```

diff --git a/doc/metacyc.md b/doc/metacyc.md
@@ -30,7 +30,6 @@ woltka classify \
   --coords coords.txt.xz \
   --map    metacyc/protein.map.xz \
   --names  metacyc/protein_name.txt \
-  --map-as-rank \
   --rank   protein \
   --output protein.biom
 ```
@@ -115,5 +114,5 @@ woltka classify -i input_dir --lineage lineages.txt -r genus -o genus.biom -u ma
 Second, perform functional classification. This command is identical to the first command in this document, except for the addition of `--stratify` or `-t` parameter pointing to the genus maps, which will be incorporated into the functional classes ([see details](stratify.md)).
 
 ```bash
-woltka classify -i input_dir -c coords.txt.xz -m metacyc/protein.map.xz --map-as-rank -r protein -t map_dir -o protein.biom
+woltka classify -i input_dir -c coords.txt.xz -m metacyc/protein.map.xz -r protein -t map_dir -o protein.biom
 ```
diff --git a/doc/normalize.md b/doc/normalize.md
@@ -101,7 +101,6 @@ woltka classify \
   --coords coords.txt.xz \
   --map    metacyc/protein.map.xz \
   --names  metacyc/protein_name.txt \
-  --map-as-rank \
   --rank   protein \
   --size   . \
   --scale  1k \

diff --git a/doc/stratify.md b/doc/stratify.md
@@ -39,7 +39,6 @@ woltka classify \
   --coords function/coords.txt.xz \
   --map function/uniref.map.xz \
   --map function/go/process.tsv.xz \
-  --map-as-rank \
   --rank process \
   --stratify mapdir \
   -o taxfunc.biom
@@ -89,7 +88,6 @@ woltka classify \
   -i align/diamond \
   --map function/uniref.map.xz \
   --map function/go/process.tsv.xz \
-  --map-as-rank \
   --rank process \
   --stratify mapdir \
   -o taxfunc.biom

diff --git a/doc/wol.md b/doc/wol.md
@@ -163,7 +163,6 @@ woltka classify \
   --names  function/uniref/uniref.name.xz \
   --map    function/kegg/ko.map.xz \
   --names  function/kegg/ko.name \
-  --map-as-rank \
   --rank   uniref,ko \
   --to-tsv \
   --output .
@@ -250,7 +249,6 @@ woltka classify \
   --coords   proteins/coords.txt.xz \
   --map      function/uniref/uniref.map.xz \
   --map      function/kegg/ko.map.xz \
-  --map-as-rank \
   --rank     ko \
   --stratify mapdir \
   --output   ko_by_genus.biom

diff --git a/doc/wolsop.sh b/doc/wolsop.sh
@@ -95,7 +95,6 @@ woltka classify \
   --coords $db/proteins/coords.txt.xz \
   --map    $db/function/uniref/uniref.map.xz \
   --names  $db/function/uniref/uniref.name.xz \
-  --map-as-rank \
   --rank   none,uniref \
   $filext \
   $altfmt \

diff --git a/woltka/cli.py b/woltka/cli.py
@@ -79,7 +79,7 @@ def cli():
     '--map', '-m', 'map_fps', type=click.Path(exists=True), multiple=True,
     help='Mapping of lower classification units to higher ones.')
 @click.option(
-    '--map-as-rank', is_flag=True,
+    '--map-as-rank/--map-no-rank', 'map_rank', default=None,
     help='Extract rank name from map filename.')
 @click.option(
     '--names', '-n', 'names_fps', type=click.Path(exists=True), multiple=True,

diff --git a/woltka/tests/data/README.md b/woltka/tests/data/README.md
@@ -133,7 +133,6 @@ woltka classify \
   --coords function/coords.txt.xz \
   --map function/uniref/uniref.map.xz \
   --map function/go/process.tsv.xz \
-  --map-as-rank \
   --rank process \
   --output burst.process.tsv
 ```
@@ -165,7 +164,6 @@ woltka classify \
   --input align/burst/split \
   --map function/nucl/uniref.map.xz \
   --map function/go/process.tsv.xz \
-  --map-as-rank \
   --rank process \
   --output split.process.tsv
 ```
@@ -198,7 +196,6 @@ woltka classify \
   --input align/diamond \
   --map function/uniref/uniref.map.xz \
   --map function/go/function.tsv.xz \
-  --map-as-rank \
   --rank function \
   --output diamond.function.tsv
 ```
@@ -211,7 +208,6 @@ woltka classify \
   --coords function/coords.txt.xz \
   --map function/uniref/uniref.map.xz \
   --map function/go/component.tsv.xz \
-  --map-as-rank \
   --rank component \
   --sizes . \
   --scale 1k \
@@ -236,7 +232,6 @@ woltka classify \
   --coords function/nucl/coords.txt.xz \
   --map function/nucl/uniref.map.xz \
   --names function/uniref/uniref.names.xz \
-  --map-as-rank \
   --rank uniref \
   --output truth.uniref.tsv
 ```

diff --git a/woltka/tests/test_cli.py b/woltka/tests/test_cli.py
@@ -129,8 +129,7 @@ def _test_params(params, exp):
                   '--rank',   'process',
                   '--coords', join(self.fundir, 'coords.txt.xz'),
                   '--map',    join(self.fundir, 'uniref', 'uniref.map.xz'),
-                  '--map',    join(self.fundir, 'go', 'process.tsv.xz'),
-                  '--map-as-rank']
+                  '--map',    join(self.fundir, 'go', 'process.tsv.xz')]
         _test_params(params, 'burst.process.tsv')
 
         # burst, stratified genus/process classification
@@ -140,7 +139,6 @@ def _test_params(params, exp):
                   '--coords', join(self.fundir, 'coords.txt.xz'),
                   '--map',    join(self.fundir, 'uniref', 'uniref.map.xz'),
                   '--map',    join(self.fundir, 'go', 'process.tsv.xz'),
-                  '--map-as-rank',
                   '--stratify', join(self.outdir, 'burst.genus.map')]
         _test_params(params, 'burst.genus.process.tsv')
 
@@ -152,7 +150,6 @@ def _test_params(params, exp):
                   '--coords', join(self.fundir, 'coords.txt.xz'),
                   '--map',    join(self.fundir, 'uniref', 'uniref.map.xz'),
                   '--map',    join(self.fundir, 'go', 'component.tsv.xz'),
-                  '--map-as-rank',
                   '--sizes',  '.',
                   '--scale',  '1k',
                   '--digits', 3]
@@ -174,8 +171,7 @@ def _test_params(params, exp):
                   '--output', output_fp,
                   '--rank',   'process',
                   '--map',    join(self.fundir, 'nucl', 'uniref.map.xz'),
-                  '--map',    join(self.fundir, 'go', 'process.tsv.xz'),
-                  '--map-as-rank']
+                  '--map',    join(self.fundir, 'go', 'process.tsv.xz')]
         _test_params(params, 'split.process.tsv')
 
         remove(output_fp)

diff --git a/woltka/tests/test_workflow.py b/woltka/tests/test_workflow.py
@@ -67,7 +67,7 @@ def test_classify(self):
         strata_dir = join(self.datdir, 'output', 'burst.genus.map')
         samples, files, demux = parse_samples(input_fp)
         tree, rankdic, namedic, root = build_hierarchy(
-            map_fps=map_fps, map_as_rank=True)
+            map_fps=map_fps, map_rank=True)
         mapper, chunk = build_mapper(coords_fp=coords_fp, overlap=80)
         stratmap = parse_strata(strata_dir, samples)
         obs = classify(
@@ -400,13 +400,31 @@ def test_build_hierarchy(self):
         self.assertDictEqual(obs[0], {
             'a': 'Bac', 'b': 'Arc', 'c': 'Bac', 'Bac': '1', 'Arc': '1',
             '1': '1'})
+        # map rank turned on
+        self.assertDictEqual(obs[1], {'Bac': 'map', 'Arc': 'map'})
         self.assertEqual(obs[3], '1')
 
-        # map as rank
-        obs = build_hierarchy(map_fps=[fp], map_as_rank=True)
-        self.assertDictEqual(obs[1], {'Bac': 'map', 'Arc': 'map'})
+        # map rank off
+        obs = build_hierarchy(map_fps=[fp], map_rank=False)
+        self.assertDictEqual(obs[1], {})
         remove(fp)
 
+        # newick tree plus simple map
+        fp1 = join(self.tmpdir, 'tree.nwk')
+        with open(fp1, 'w') as f:
+            f.write('((a,c)d,b)e;')
+        fp2 = join(self.tmpdir, 'map.txt')
+        with open(fp2, 'w') as f:
+            f.write('G1\ta\nG2\tb\nG3\tc\n')
+        obs = build_hierarchy(newick_fps=[fp1], map_fps=[fp2])
+        self.assertDictEqual(obs[0], {
+            'G1': 'a', 'G2': 'b', 'G3': 'c',
+            'a': 'd', 'b': 'e', 'c': 'd', 'd': 'e', 'e': 'e'})
+        self.assertDictEqual(obs[1], {})  # map rank turned off
+        self.assertEqual(obs[3], 'e')
+        remove(fp1)
+        remove(fp2)
+
     def test_strip_suffix(self):
         subs = [{'G1_1', 'G1_2', 'G2_3', 'G3'},
                 {'G1_1', 'G1.3', 'G4_5', 'G4_x'}]

diff --git a/woltka/workflow.py b/woltka/workflow.py
@@ -54,7 +54,7 @@ def workflow(input_fp:     str,
              lineage_fps: list = [],
              columns_fps: list = [],
              map_fps:     list = [],
-             map_as_rank: bool = False,
+             map_rank:    bool = False,
              names_fps:   list = [],
              # assignment
              ranks:        str = None,
@@ -115,7 +115,7 @@ def workflow(input_fp:     str,
     # build classification system
     tree, rankdic, namedic, root = build_hierarchy(
         names_fps, nodes_fps, newick_fps, lineage_fps, columns_fps, map_fps,
-        map_as_rank, zippers)
+        map_rank, zippers)
 
     # build mapping module
     mapper, chunk = build_mapper(coords_fp, overlap, chunk, zippers)
@@ -614,7 +614,7 @@ def build_hierarchy(names_fps:   list = [],
                     lineage_fps: list = [],
                     columns_fps: list = [],
                     map_fps:     list = [],
-                    map_as_rank: bool = False,
+                    map_rank:    bool = None,
                     zippers:     dict = None) -> Tuple[dict, dict, dict, str]:
     """Construct hierarchical classification system.
 
@@ -632,7 +632,7 @@ def build_hierarchy(names_fps:   list = [],
         Rank-per-column file.
     map_fps : list of str, optional
         Mapping file(s).
-    map_as_rank : bool, optional
+    map_rank : bool, optional
         Treat mapping filename stem as rank.
     zippers : dict, optional
         External compression programs.
@@ -698,6 +698,13 @@ def build_hierarchy(names_fps:   list = [],
             update_dict(rankdic, rankdic_)
         click.echo(' Done.')
 
+    # whether to extract rank from filename
+    if map_rank is None:
+        map_rank = bool(map_fps) and not any([
+            nodes_fps, newick_fps, lineage_fps, columns_fps])
+    if map_rank:
+        click.echo('  Will extract rank name from map filename.')
+
     # plain mapping files
     for fp in map_fps:
         click.echo(f'  Parsing simple map file: {basename(fp)}...', nl=False)
@@ -706,7 +713,7 @@ def build_hierarchy(names_fps:   list = [],
         update_dict(tree, map_)
 
         # filename stem as rank
-        if map_as_rank:
+        if map_rank:
             rank = stem2rank(path2stem(fp))
             update_dict(rankdic, {k: rank for k in set(map_.values())})
         click.echo(' Done.')