sourmash-bio · luizirber · Nov 22, 2019 · Oct 10, 2019 · Oct 10, 2019 · Oct 11, 2019
diff --git a/requirements.txt b/requirements.txt
@@ -12,3 +12,4 @@ sphinxcontrib-napoleon
 setuptools_scm
 setuptools_scm_git_archive
 nbsphinx
+bam2fasta
diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd
@@ -26,17 +26,19 @@ cdef extern from "kmer_min_hash.hh":
         const unsigned int ksize;
         const bool is_protein;
         const bool dayhoff;
+        const bool hp;
         const HashIntoType max_hash;
         CMinHashType mins;
 
-        KmerMinHash(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType)
+        KmerMinHash(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType)
         void add_hash(HashIntoType) except +ValueError
         void remove_hash(HashIntoType) except +ValueError
         void add_word(const string& word) except +ValueError
         void add_word(const char * word) except +ValueError
         void add_sequence(const string&, bool) except +ValueError
         void merge(const KmerMinHash&) except +ValueError
         string aa_to_dayhoff(string aa) except +ValueError
+        string aa_to_hp(string aa) except +ValueError
         string translate_codon(string codon) except +ValueError
         unsigned int count_common(const KmerMinHash&) except +ValueError
         unsigned long size()
@@ -45,7 +47,7 @@ cdef extern from "kmer_min_hash.hh":
     cdef cppclass KmerMinAbundance(KmerMinHash):
         CMinHashType abunds;
 
-        KmerMinAbundance(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType)
+        KmerMinAbundance(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType)
         void add_hash(HashIntoType) except +ValueError
         void remove_hash(HashIntoType) except +ValueError
         void add_word(string word) except +ValueError
@@ -54,6 +56,7 @@ cdef extern from "kmer_min_hash.hh":
         void merge(const KmerMinAbundance&) except +ValueError
         void merge(const KmerMinHash&) except +ValueError
         string aa_to_dayhoff(string aa) except +ValueError
+        string aa_to_hp(string aa) except +ValueError
         string translate_codon(string codon) except +ValueError
         unsigned int count_common(const KmerMinAbundance&) except +ValueError
         unsigned long size()

diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx
@@ -93,6 +93,7 @@ cdef class MinHash(object):
     def __init__(self, unsigned int n, unsigned int ksize,
                        bool is_protein=False,
                        bool dayhoff=False,
+                       bool hp=False,
                        bool track_abundance=False,
                        uint32_t seed=MINHASH_DEFAULT_SEED,
                        HashIntoType max_hash=0,
@@ -112,9 +113,9 @@ cdef class MinHash(object):
 
         cdef KmerMinHash *mh = NULL
         if track_abundance:
-            mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash)
+            mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
         else:
-            mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash)
+            mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
 
         self._this.reset(mh)
 
@@ -128,6 +129,7 @@ cdef class MinHash(object):
     def __copy__(self):
         a = MinHash(deref(self._this).num, deref(self._this).ksize,
                     deref(self._this).is_protein, deref(self._this).dayhoff,
+                    deref(self._this).hp,
                     self.track_abundance,
                     deref(self._this).seed, deref(self._this).max_hash)
         a.merge(self)
@@ -142,23 +144,24 @@ cdef class MinHash(object):
                 deref(self._this).ksize,
                 deref(self._this).is_protein,
                 deref(self._this).dayhoff,
+                deref(self._this).hp,
                 self.get_mins(with_abundance=with_abundance),
                 None, self.track_abundance, deref(self._this).max_hash,
                 deref(self._this).seed)
 
     def __setstate__(self, tup):
-        (n, ksize, is_protein, dayhoff, mins, _, track_abundance, max_hash, seed) =\
+        (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, max_hash, seed) =\
           tup
 
         self._track_abundance = track_abundance
 
         cdef KmerMinHash *mh = NULL
         if track_abundance:
-            mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash)
+            mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
             self._this.reset(mh)
             self.set_abundances(mins)
         else:
-            mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash)
+            mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
             self._this.reset(mh)
             self.add_many(mins)
 
@@ -168,6 +171,7 @@ cdef class MinHash(object):
                 deref(self._this).ksize,
                 deref(self._this).is_protein,
                 deref(self._this).dayhoff,
+                deref(self._this).hp,
                 self.track_abundance,
                 deref(self._this).seed,
                 deref(self._this).max_hash,
@@ -182,7 +186,7 @@ cdef class MinHash(object):
     def copy_and_clear(self):
         a = MinHash(deref(self._this).num, deref(self._this).ksize,
                     deref(self._this).is_protein, deref(self._this).dayhoff,
-                    self.track_abundance,
+                    deref(self._this).hp, self.track_abundance,
                     deref(self._this).seed, deref(self._this).max_hash)
         return a
 
@@ -247,6 +251,10 @@ cdef class MinHash(object):
     def dayhoff(self):
         return deref(self._this).dayhoff
 
+    @property
+    def hp(self):
+        return deref(self._this).hp
+
     @property
     def ksize(self):
         return deref(self._this).ksize
@@ -271,7 +279,7 @@ cdef class MinHash(object):
 
         if v:
             mh = new KmerMinAbundance(self.num, self.ksize, self.is_protein,
-                                      self.dayhoff, self.seed, self.max_hash)
+                                      self.dayhoff, self.hp, self.seed, self.max_hash)
             self._this.reset(mh)
 
         # At this point, if we are changing from track_abundance=True to False,
@@ -294,6 +302,7 @@ cdef class MinHash(object):
 
         a = MinHash(new_num, deref(self._this).ksize,
                     deref(self._this).is_protein, deref(self._this).dayhoff,
+                    deref(self._this).hp,
                     self.track_abundance,
                     deref(self._this).seed, 0)
         if self.track_abundance:
@@ -326,6 +335,7 @@ cdef class MinHash(object):
 
         a = MinHash(0, deref(self._this).ksize,
                     deref(self._this).is_protein, deref(self._this).dayhoff,
+                    deref(self._this).hp,
                     self.track_abundance,
                     deref(self._this).seed, new_max_hash)
         if self.track_abundance:
@@ -348,6 +358,7 @@ cdef class MinHash(object):
                                           deref(self._this).ksize,
                                           deref(self._this).is_protein,
                                           deref(self._this).dayhoff,
+                                          deref(self._this).hp,
                                           deref(self._this).seed,
                                           deref(self._this).max_hash)
 
@@ -356,6 +367,7 @@ cdef class MinHash(object):
                                           deref(self._this).ksize,
                                           deref(self._this).is_protein,
                                           deref(self._this).dayhoff,
+                                          deref(self._this).hp,
                                           deref(self._this).seed,
                                           deref(self._this).max_hash)
 
@@ -467,17 +479,25 @@ cdef class MinHash(object):
             raise ValueError("cannot add amino acid sequence to DNA MinHash!")
 
         aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1))
-        if not self.dayhoff:
+        if not self.dayhoff and not self.hp:
             for aa_kmer in aa_kmers:
                 deref(self._this).add_word(to_bytes(aa_kmer))
-        else:
+        elif self.dayhoff:
             for aa_kmer in aa_kmers:
                 dayhoff_kmer = ''
                 for aa in aa_kmer:
                     dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa))
                     dayhoff_kmer += dayhoff_letter
                 # dayhoff_kmer = ''.join( for aa in aa_kmer)
                 deref(self._this).add_word(to_bytes(dayhoff_kmer))
+        else:
+            for aa_kmer in aa_kmers:
+                hp_kmer = ''
+                for aa in aa_kmer:
+                    hp_letter = deref(self._this).aa_to_hp(to_bytes(aa))
+                    hp_kmer += hp_letter
+                # hp_kmer = ''.join( for aa in aa_kmer)
+                deref(self._this).add_word(to_bytes(hp_kmer))
 
     def is_molecule_type(self, molecule):
         if molecule.upper() == 'DNA' and not self.is_protein:
@@ -489,4 +509,9 @@ cdef class MinHash(object):
+            # hp-encoded sketches answer only to 'hp'; this has to be an
+            # `elif` so they never reach the plain-protein `else` below.
+            elif self.hp:
+                if molecule == 'hp':
+                    return True
             else:
                 if molecule == 'protein':
                     return True
         return False
diff --git a/sourmash/command_compute.py b/sourmash/command_compute.py
@@ -152,6 +152,10 @@ def compute(args):
         notify('Computing both nucleotide and Dayhoff-encoded protein '
                'signatures.')
         num_sigs = 2*len(ksizes)
+    elif args.dna and args.hp:
+        notify('Computing both nucleotide and Hp-encoded protein '
+               'signatures.')
+        num_sigs = 2*len(ksizes)
     elif args.dna:
         notify('Computing only nucleotide (and not protein) signatures.')
         num_sigs = len(ksizes)
@@ -162,8 +166,12 @@ def compute(args):
         notify('Computing only Dayhoff-encoded protein (and not nucleotide) '
                'signatures.')
         num_sigs = len(ksizes)
+    elif args.hp:
+        notify('Computing only hp-encoded protein (and not nucleotide) '
+               'signatures.')
+        num_sigs = len(ksizes)
 
-    if (args.protein or args.dayhoff) and not args.input_is_protein:
+    if (args.protein or args.dayhoff or args.hp) and not args.input_is_protein:
         bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ]
         if bad_ksizes:
             error('protein ksizes must be divisible by 3, sorry!')
@@ -190,6 +198,7 @@ def make_minhashes():
                 E = MinHash(ksize=k, n=args.num_hashes,
                             is_protein=True,
                             dayhoff=False,
+                            hp=False,
                             track_abundance=args.track_abundance,
                             scaled=args.scaled,
                             seed=seed)
@@ -198,6 +207,16 @@ def make_minhashes():
                 E = MinHash(ksize=k, n=args.num_hashes,
                             is_protein=True,
                             dayhoff=True,
+                            hp=False,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+            if args.hp:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=True,
+                            dayhoff=False,
+                            hp=True,
                             track_abundance=args.track_abundance,
                             scaled=args.scaled,
                             seed=seed)
@@ -206,6 +225,7 @@ def make_minhashes():
                 E = MinHash(ksize=k, n=args.num_hashes,
                             is_protein=False,
                             dayhoff=False,
+                            hp=False,
                             track_abundance=args.track_abundance,
                             scaled=args.scaled,
                             seed=seed)
@@ -342,6 +362,7 @@ def save_siglist(siglist, output_fp, filename=None):
         # make minhashes for the whole file
         Elist = make_minhashes()
 
+        n = 0
         total_seq = 0
         for filename in args.filenames:
             # consume & calculate signatures

diff --git a/sourmash/commands.py b/sourmash/commands.py
@@ -1002,14 +1002,22 @@ def watch(args):
         moltype = 'DNA'
         is_protein = False
         dayhoff = False
+        hp = False
     elif args.protein:
         moltype = 'protein'
         is_protein = True
         dayhoff = False
-    else:
+        hp = False
+    elif args.dayhoff:
         moltype = 'dayhoff'
         is_protein = True
         dayhoff = True
+        hp = False
+    else:
+        moltype = 'hp'
+        is_protein = True
+        dayhoff = False
+        hp = True
 
     tree = load_sbt_index(args.sbt_name)
 
@@ -1020,7 +1028,7 @@ def watch(args):
         tree_mh = leaf.data.minhash
         ksize = tree_mh.ksize
 
-    E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff)
+    E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff, hp=hp)
     streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)
 
     notify('Computing signature for k={}, {} from stdin', ksize, moltype)