MRG: fix benchmark code & a few other small issues from pyOpenSci review (#2920)

Address issues: * #2919 * #2913 * #2906 * #2914 Fixes #2919 Fixes #2913 Fixes #2906 Fixes #2914 Remaining TODO: * test `benchmarks/benchmarks.py`
sourmash-bio · Jan 15, 2024 · 4f32abc · 4f32abc
1 parent 0adc8aa
commit 4f32abc
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 33 deletions.
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,9 @@
+# benchmarks for asv ([airspeed velocity](https://asv.readthedocs.io/en/stable/index.html))
+
+The code in here is run by GitHub Actions during continuous integration.
+
+To test quickly, run:
+
+```
+asv run --show-stderr --quick  
+```
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
@@ -1,28 +1,46 @@
-import os
 import random
-from pathlib import Path
 from tempfile import NamedTemporaryFile
 
-
 from sourmash.sbt_storage import ZipStorage
 from sourmash.minhash import MinHash
 
+RANDOM_SEQ_SIZE=3000
+RANDOM_SEQ_NUMBER=300
+
+MINHASH_NUM=500
+MINHASH_K=21
+
+GET_MINS_RANGE=500
+ADD_HASH_RANGE=10_000
+ADD_MANY_RANGE=1000
+SIMILARITY_TIMES=500
+COUNT_COMMON_TIMES=500
+MERGE_TIMES=500
+COPY_TIMES=500
+CONCAT_TIMES=500
+SET_ABUNDANCES_RANGE=500
+ZIP_STORAGE_WRITE=100_000
+ZIP_STORAGE_LOAD=20
+
 
 def load_sequences():
     sequences = []
     for i in range(10):
-        random_seq = random.sample("A,C,G,T".split(",") * 3000, 300)
+        random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE,
+                                   RANDOM_SEQ_NUMBER)
         sequences.append("".join(random_seq))
     return sequences
 
 
 class TimeMinHashSuite:
     def setup(self):
-        self.mh = MinHash(500, 21, track_abundance=False)
-        self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=False)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+        self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True,
+                                  track_abundance=False)
         self.sequences = load_sequences()
 
-        self.populated_mh = MinHash(500, 21, track_abundance=False)
+        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K,
+                                    track_abundance=False)
         for seq in self.sequences:
             self.populated_mh.add_sequence(seq)
 
@@ -40,52 +58,53 @@ def time_add_protein(self):
 
     def time_get_mins(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(GET_MINS_RANGE):
             mh.get_mins()
 
     def time_add_hash(self):
         mh = self.mh
-        for i in range(10000):
+        for i in range(ADD_HASH_RANGE):
             mh.add_hash(i)
 
     def time_add_many(self):
         mh = self.mh
-        mh.add_many(list(range(1000)))
+        mh.add_many(list(range(ADD_MANY_RANGE)))
 
     def time_similarity(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(SIMILARITY_TIMES):
             mh.similarity(other_mh)
 
     def time_count_common(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(COUNT_COMMON_TIMES):
             mh.count_common(other_mh)
 
     def time_merge(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(MERGE_TIMES):
             mh.merge(other_mh)
 
     def time_copy(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(COPY_TIMES):
             mh.__copy__()
 
     def time_concat(self):
         mh = self.mh
         other_mh = self.populated_mh
-        for i in range(500):
+        for i in range(CONCAT_TIMES):
             mh += other_mh
 
 
 class PeakmemMinHashSuite:
     def setup(self):
-        self.mh = MinHash(500, 21, track_abundance=True)
-        self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+        self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K,
+                                  is_protein=True, track_abundance=True)
         self.sequences = load_sequences()
 
     def peakmem_add_sequence(self):
@@ -102,12 +121,12 @@ def peakmem_add_protein(self):
 
     def peakmem_add_hash(self):
         mh = self.mh
-        for i in range(10000):
+        for i in range(ADD_HASH_RANGE):
             mh.add_hash(i)
 
     def peakmem_add_many(self):
         mh = self.mh
-        mh.add_many(list(range(1000)))
+        mh.add_many(list(range(ADD_MANY_RANGE)))
 
 
 ####################
@@ -116,33 +135,33 @@ def peakmem_add_many(self):
 class TimeMinAbundanceSuite(TimeMinHashSuite):
     def setup(self):
         TimeMinHashSuite.setup(self)
-        self.mh = MinHash(500, 21, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
 
-        self.populated_mh = MinHash(500, 21, track_abundance=True)
+        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
         for seq in self.sequences:
             self.populated_mh.add_sequence(seq)
 
     def time_get_mins_abundance(self):
         mh = self.populated_mh
-        for i in range(500):
+        for i in range(GET_MINS_RANGE):
             mh.get_mins(with_abundance=True)
 
     def time_set_abundances(self):
         mh = self.mh
         mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(500):
+        for i in range(SET_ABUNDANCES_RANGE):
             mh.set_abundances(mins)
 
     def time_set_abundances_noclear(self):
         mh = self.mh
         mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(500):
+        for i in range(SET_ABUNDANCES_RANGE):
             mh.set_abundances(mins, clear=False)
 
 class PeakmemMinAbundanceSuite(PeakmemMinHashSuite):
     def setup(self):
         PeakmemMinHashSuite.setup(self)
-        self.mh = MinHash(500, 21, track_abundance=True)
+        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
 
 ####################
 
@@ -154,20 +173,20 @@ def setup(self):
 
         with zipfile.ZipFile(self.zipfile, mode='w',
                           compression=zipfile.ZIP_STORED) as storage:
-            for i in range(100_000):
+            for i in range(ZIP_STORAGE_WRITE):
                 # just so we have lots of entries
                 storage.writestr(str(i), b"0")
             # one big-ish entry
             storage.writestr("sig1", b"9" * 1_000_000)
 
     def time_load_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("sig1")
 
     def time_load_small_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("99999")
 
     def teardown(self):
@@ -181,7 +200,7 @@ def setup(self):
 
         with zipfile.ZipFile(self.zipfile, mode='w',
                           compression=zipfile.ZIP_STORED) as storage:
-            for i in range(100_000):
+            for i in range(ZIP_STORAGE_WRITE):
                 # just so we have lots of entries
                 storage.writestr(str(i), b"0")
             # one big-ish entry
@@ -190,12 +209,12 @@ def setup(self):
 
     def peakmem_load_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("sig1")
 
     def peakmem_load_small_from_zipstorage(self):
         with ZipStorage(self.zipfile.name) as storage:
-            for i in range(20):
+            for i in range(ZIP_STORAGE_LOAD):
                 storage.load("99999")
 
     def teardown(self):

diff --git a/doc/tutorials-lca.md b/doc/tutorials-lca.md
@@ -126,7 +126,7 @@ on the command line; separate them with `--db` or `--query`.
 Download some pre-calculated signatures:
 
 ```
-curl -L https://osf.io/bw8d7/download?version=1 -o delmont-subsample-sigs.tar.gz
+curl -L https://osf.io/bw8d7/download -o delmont-subsample-sigs.tar.gz
 tar xzf delmont-subsample-sigs.tar.gz
 ```
 

diff --git a/src/sourmash/cli/lca/classify.py b/src/sourmash/cli/lca/classify.py
@@ -9,7 +9,8 @@ def subparser(subparsers):
                            help='query signatures to classify')
     subparser.add_argument('--query-from-file',
                            help='file containing list of signature files to query')
-    subparser.add_argument('--threshold', metavar='T', type=int, default=5)
+    subparser.add_argument('--threshold', metavar='T', type=int, default=5,
+                           help="minimum number of hashes needed for a taxonomic classification (default: 5)")
     subparser.add_argument(
         '--majority', action='store_true',
         help='use majority vote classification instead of lca'