Skip to content

Commit

Permalink
MRG: fix benchmark code & a few other small issues from pyOpenSci review (#2920)
Browse files Browse the repository at this point in the history

Address issues:
* #2919
* #2913
* #2906
* #2914

Fixes #2919
Fixes #2913
Fixes #2906
Fixes #2914

Remaining TODO:
* test `benchmarks/benchmarks.py`
  • Loading branch information
ctb committed Jan 15, 2024
1 parent 0adc8aa commit 4f32abc
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 33 deletions.
9 changes: 9 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# benchmarks for asv ([airspeed velocity](https://asv.readthedocs.io/en/stable/index.html))

The code in here is run by GitHub Actions during continuous integration.

To test quickly, run:

```
asv run --show-stderr --quick
```
81 changes: 50 additions & 31 deletions benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,46 @@
import os
import random
from pathlib import Path
from tempfile import NamedTemporaryFile


from sourmash.sbt_storage import ZipStorage
from sourmash.minhash import MinHash

RANDOM_SEQ_SIZE=3000
RANDOM_SEQ_NUMBER=300

MINHASH_NUM=500
MINHASH_K=21

GET_MINS_RANGE=500
ADD_HASH_RANGE=10_000
ADD_MANY_RANGE=1000
SIMILARITY_TIMES=500
COUNT_COMMON_TIMES=500
MERGE_TIMES=500
COPY_TIMES=500
CONCAT_TIMES=500
SET_ABUNDANCES_RANGE=500
ZIP_STORAGE_WRITE=100_000
ZIP_STORAGE_LOAD=20


def load_sequences():
sequences = []
for i in range(10):
random_seq = random.sample("A,C,G,T".split(",") * 3000, 300)
random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE,
RANDOM_SEQ_NUMBER)
sequences.append("".join(random_seq))
return sequences


class TimeMinHashSuite:
def setup(self):
self.mh = MinHash(500, 21, track_abundance=False)
self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=False)
self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True,
track_abundance=False)
self.sequences = load_sequences()

self.populated_mh = MinHash(500, 21, track_abundance=False)
self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K,
track_abundance=False)
for seq in self.sequences:
self.populated_mh.add_sequence(seq)

Expand All @@ -40,52 +58,53 @@ def time_add_protein(self):

def time_get_mins(self):
mh = self.populated_mh
for i in range(500):
for i in range(GET_MINS_RANGE):
mh.get_mins()

def time_add_hash(self):
mh = self.mh
for i in range(10000):
for i in range(ADD_HASH_RANGE):
mh.add_hash(i)

def time_add_many(self):
mh = self.mh
mh.add_many(list(range(1000)))
mh.add_many(list(range(ADD_MANY_RANGE)))

def time_similarity(self):
mh = self.mh
other_mh = self.populated_mh
for i in range(500):
for i in range(SIMILARITY_TIMES):
mh.similarity(other_mh)

def time_count_common(self):
mh = self.mh
other_mh = self.populated_mh
for i in range(500):
for i in range(COUNT_COMMON_TIMES):
mh.count_common(other_mh)

def time_merge(self):
mh = self.mh
other_mh = self.populated_mh
for i in range(500):
for i in range(MERGE_TIMES):
mh.merge(other_mh)

def time_copy(self):
mh = self.populated_mh
for i in range(500):
for i in range(COPY_TIMES):
mh.__copy__()

def time_concat(self):
mh = self.mh
other_mh = self.populated_mh
for i in range(500):
for i in range(CONCAT_TIMES):
mh += other_mh


class PeakmemMinHashSuite:
def setup(self):
self.mh = MinHash(500, 21, track_abundance=True)
self.protein_mh = MinHash(500, 21, is_protein=True, track_abundance=True)
self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K,
is_protein=True, track_abundance=True)
self.sequences = load_sequences()

def peakmem_add_sequence(self):
Expand All @@ -102,12 +121,12 @@ def peakmem_add_protein(self):

def peakmem_add_hash(self):
mh = self.mh
for i in range(10000):
for i in range(ADD_HASH_RANGE):
mh.add_hash(i)

def peakmem_add_many(self):
mh = self.mh
mh.add_many(list(range(1000)))
mh.add_many(list(range(ADD_MANY_RANGE)))


####################
Expand All @@ -116,33 +135,33 @@ def peakmem_add_many(self):
class TimeMinAbundanceSuite(TimeMinHashSuite):
def setup(self):
TimeMinHashSuite.setup(self)
self.mh = MinHash(500, 21, track_abundance=True)
self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)

self.populated_mh = MinHash(500, 21, track_abundance=True)
self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
for seq in self.sequences:
self.populated_mh.add_sequence(seq)

def time_get_mins_abundance(self):
mh = self.populated_mh
for i in range(500):
for i in range(GET_MINS_RANGE):
mh.get_mins(with_abundance=True)

def time_set_abundances(self):
mh = self.mh
mins = self.populated_mh.get_mins(with_abundance=True)
for i in range(500):
for i in range(SET_ABUNDANCES_RANGE):
mh.set_abundances(mins)

def time_set_abundances_noclear(self):
mh = self.mh
mins = self.populated_mh.get_mins(with_abundance=True)
for i in range(500):
for i in range(SET_ABUNDANCES_RANGE):
mh.set_abundances(mins, clear=False)

class PeakmemMinAbundanceSuite(PeakmemMinHashSuite):
def setup(self):
PeakmemMinHashSuite.setup(self)
self.mh = MinHash(500, 21, track_abundance=True)
self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)

####################

Expand All @@ -154,20 +173,20 @@ def setup(self):

with zipfile.ZipFile(self.zipfile, mode='w',
compression=zipfile.ZIP_STORED) as storage:
for i in range(100_000):
for i in range(ZIP_STORAGE_WRITE):
# just so we have lots of entries
storage.writestr(str(i), b"0")
# one big-ish entry
storage.writestr("sig1", b"9" * 1_000_000)

def time_load_from_zipstorage(self):
with ZipStorage(self.zipfile.name) as storage:
for i in range(20):
for i in range(ZIP_STORAGE_LOAD):
storage.load("sig1")

def time_load_small_from_zipstorage(self):
with ZipStorage(self.zipfile.name) as storage:
for i in range(20):
for i in range(ZIP_STORAGE_LOAD):
storage.load("99999")

def teardown(self):
Expand All @@ -181,7 +200,7 @@ def setup(self):

with zipfile.ZipFile(self.zipfile, mode='w',
compression=zipfile.ZIP_STORED) as storage:
for i in range(100_000):
for i in range(ZIP_STORAGE_WRITE):
# just so we have lots of entries
storage.writestr(str(i), b"0")
# one big-ish entry
Expand All @@ -190,12 +209,12 @@ def setup(self):

def peakmem_load_from_zipstorage(self):
with ZipStorage(self.zipfile.name) as storage:
for i in range(20):
for i in range(ZIP_STORAGE_LOAD):
storage.load("sig1")

def peakmem_load_small_from_zipstorage(self):
with ZipStorage(self.zipfile.name) as storage:
for i in range(20):
for i in range(ZIP_STORAGE_LOAD):
storage.load("99999")

def teardown(self):
Expand Down
2 changes: 1 addition & 1 deletion doc/tutorials-lca.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ on the command line; separate them with `--db` or `--query`.
Download some pre-calculated signatures:

```
curl -L https://osf.io/bw8d7/download?version=1 -o delmont-subsample-sigs.tar.gz
curl -L https://osf.io/bw8d7/download -o delmont-subsample-sigs.tar.gz
tar xzf delmont-subsample-sigs.tar.gz
```

Expand Down
3 changes: 2 additions & 1 deletion src/sourmash/cli/lca/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def subparser(subparsers):
help='query signatures to classify')
subparser.add_argument('--query-from-file',
help='file containing list of signature files to query')
subparser.add_argument('--threshold', metavar='T', type=int, default=5)
subparser.add_argument('--threshold', metavar='T', type=int, default=5,
help="minimum number of hashes needed for a taxonomic classification (default: 5)")
subparser.add_argument(
'--majority', action='store_true',
help='use majority vote classification instead of lca'
Expand Down

0 comments on commit 4f32abc

Please sign in to comment.