Skip to content

Commit

Permalink
fix kmeriterator, minimizer sketch. add protein minimizer sketch
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Apr 23, 2021
1 parent a5e7877 commit 26edfd0
Show file tree
Hide file tree
Showing 9 changed files with 391 additions and 50 deletions.
42 changes: 25 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,28 +49,36 @@ and also provides serialization methods.

CPU: AMD Ryzen 7 2700X Eight-Core Processor, 3.7 GHz

$ go test . -bench=Bench* -benchmem
$ go test . -bench=Bench* -benchmem \
| grep Bench \
| perl -pe 's/\s\s+/\t/g' \
| csvtk cut -Ht -f 1,3-5 \
| csvtk add-header -t -n test,time,memory,allocs \
| csvtk pretty -t -r
goos: linux
goarch: amd64
pkg: github.com/shenwei356/unikmer
cpu: AMD Ryzen 7 2700X Eight-Core Processor

BenchmarkHashIterator/1.00_KB-16 93423 11409 ns/op 232 B/op 3 allocs/op
BenchmarkKmerIterator/1.00_KB-16 64036 17668 ns/op 160 B/op 1 allocs/op
BenchmarkMinimizerIterator/1.00_KB-16 18207 61748 ns/op 688 B/op 6 allocs/op
BenchmarkSyncmerIterator/1.00_KB-16 9482 116342 ns/op 1456 B/op 8 allocs/op

BenchmarkEncodeK32-16 57040299 20.55 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeFromFormerKmerK32-16 127810105 9.035 ns/op 0 B/op 0 allocs/op
BenchmarkMustEncodeFromFormerKmerK32-16 525103304 2.004 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeK32-16 16725636 84.25 ns/op 32 B/op 1 allocs/op
BenchmarkMustDecodeK32-16 14893524 84.81 ns/op 32 B/op 1 allocs/op
BenchmarkRevK32-16 76235223 15.36 ns/op 0 B/op 0 allocs/op
BenchmarkCompK32-16 1000000000 0.7635 ns/op 0 B/op 0 allocs/op
BenchmarkRevCompK32-16 75180781 17.34 ns/op 0 B/op 0 allocs/op
BenchmarkCannonalK32-16 71694920 16.59 ns/op 0 B/op 0 allocs/op


test time memory allocs
------------------------------------------ ------------ --------- -----------
BenchmarkEncodeK32-16 20.86 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeFromFormerKmerK32-16 9.232 ns/op 0 B/op 0 allocs/op
BenchmarkMustEncodeFromFormerKmerK32-16 2.045 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeK32-16 79.30 ns/op 32 B/op 1 allocs/op
BenchmarkMustDecodeK32-16 72.23 ns/op 32 B/op 1 allocs/op
BenchmarkRevK32-16 21.82 ns/op 0 B/op 0 allocs/op
BenchmarkCompK32-16 0.8296 ns/op 0 B/op 0 allocs/op
BenchmarkRevCompK32-16 17.60 ns/op 0 B/op 0 allocs/op
BenchmarkCannonalK32-16 17.13 ns/op 0 B/op 0 allocs/op

BenchmarkKmerIterator/1.00_KB-16 15562 ns/op 160 B/op 1 allocs/op
BenchmarkHashIterator/1.00_KB-16 10263 ns/op 232 B/op 3 allocs/op
BenchmarkProteinIterator/1.00_KB-16 17396 ns/op 480 B/op 3 allocs/op

BenchmarkMinimizerSketch/1.00_KB-16 67238 ns/op 688 B/op 6 allocs/op
BenchmarkSyncmerSketch/1.00_KB-16 108619 ns/op 1456 B/op 8 allocs/op
BenchmarkProteinMinimizerSketch/1.00_KB-16 29166 ns/op 728 B/op 5 allocs/op

## The toolkit

Expand Down
12 changes: 5 additions & 7 deletions iterator-protein.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,11 @@ type ProteinIterator struct {
k int
finished bool
end int
kmer []byte
hash uint64
idx int
}

// NewProteinIterator returns a ProteinIterator.
func NewProteinIterator(s *seq.Seq, k int, table int, frame int) (*ProteinIterator, error) {
func NewProteinIterator(s *seq.Seq, k int, codonTable int, frame int) (*ProteinIterator, error) {
if k < 1 {
return nil, ErrInvalidK
}
Expand All @@ -51,7 +49,7 @@ func NewProteinIterator(s *seq.Seq, k int, table int, frame int) (*ProteinIterat

var err error
if s.Alphabet != seq.Protein {
iter.s, err = s.Translate(table, frame, false, false, true, false)
iter.s, err = s.Translate(codonTable, frame, false, false, true, false)
if err != nil {
return nil, err
}
Expand All @@ -64,7 +62,7 @@ func NewProteinIterator(s *seq.Seq, k int, table int, frame int) (*ProteinIterat
}

// Next returns a hash.
func (iter *ProteinIterator) Next() (uint64, bool) {
func (iter *ProteinIterator) Next() (code uint64, ok bool) {
if iter.finished {
return 0, false
}
Expand All @@ -74,9 +72,9 @@ func (iter *ProteinIterator) Next() (uint64, bool) {
return 0, false
}

iter.hash = wyhash.Hash(iter.s.Seq[iter.idx:iter.idx+iter.k], 1)
code = wyhash.Hash(iter.s.Seq[iter.idx:iter.idx+iter.k], 1)
iter.idx++
return iter.hash, true
return code, true
}

// Index returns the current 0-based index.
Expand Down
6 changes: 5 additions & 1 deletion iterator-protein_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
"github.com/shenwei356/bio/seq"
)

func TestAAIterator(t *testing.T) {
func TestProteinIterator(t *testing.T) {
_s := "AAGTTTGAATCATTCAACTATCTAGTTTTCAGAGAACAATGTTCTCTAAAGAATAGAAAAGAGTCATTGTGCGGTGATGATGGCGGGAAGGATCCACCTG"
sequence, err := seq.NewSeq(seq.DNA, []byte(_s))
if err != nil {
Expand Down Expand Up @@ -55,4 +55,8 @@ func TestAAIterator(t *testing.T) {
codes = append(codes, code)
}

if len(codes) != len(_s)/3-k+1 {
t.Errorf("k-mer hashes number error")
}

}
3 changes: 2 additions & 1 deletion iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ func NewKmerIterator(s *seq.Seq, k int, canonical bool, circular bool) (*Iterato
}

iter := &Iterator{s: s2, k: k, canonical: canonical, circular: circular}
iter.end = iter.length - k
iter.length = len(s2.Seq)
iter.end = iter.length - k + 1
iter.kUint = uint(k)
iter.kP1 = k - 1
iter.kP1Uint = uint(k - 1)
Expand Down
53 changes: 41 additions & 12 deletions iterator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,21 @@ func TestKmerIterator(t *testing.T) {
codes := make([]uint64, 0, 1024)
for {
code, ok, err = iter.Next()
if err != nil {
t.Error(err)
}
if !ok {
break
}

// idx = iter.Index()
// fmt.Printf("aa: %d-%s, %d\n", idx, iter.s.Seq[idx:idx+k], code)
// fmt.Printf("kmer: %d-%s, %d\n", idx, iter.s.Seq[idx:idx+k], code)

codes = append(codes, code)
}

if len(codes) != len(_s)-k+1 {
t.Errorf("kmer number error")
t.Errorf("k-mers number error")
}
}

Expand Down Expand Up @@ -86,13 +89,13 @@ func TestHashIterator(t *testing.T) {
}

// idx = iter.Index()
// fmt.Printf("aa: %d-%s, %d\n", idx, iter.s.Seq[idx:idx+k], code)
// fmt.Printf("kmer: %d-%s, %d\n", idx, iter.s.Seq[idx:idx+k], code)

codes = append(codes, code)
}

if len(codes) != len(_s)-k+1 {
t.Errorf("kmer hash number error")
t.Errorf("k-mer hashes number error")
}
}

Expand Down Expand Up @@ -121,6 +124,36 @@ func init() {
// fmt.Printf("%d DNA sequences generated\n", len(sizes))
}

// BenchmarkKmerIterator measures a complete pass of the k-mer iterator
// (k = 31, canonical, non-circular) over every sequence in benchSeqs.
// One sub-benchmark is run per sequence, named after the sequence's byte size.
func BenchmarkKmerIterator(b *testing.B) {
	for i := range benchSeqs {
		size := len(benchSeqs[i].Seq)
		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
			var code uint64
			var ok bool

			for j := 0; j < b.N; j++ {
				iter, err := NewKmerIterator(benchSeqs[i], 31, true, false)
				if err != nil {
					// Abort the sub-benchmark: the iterator returned together
					// with an error must not be used (it is presumably nil),
					// so continuing would dereference it in NextKmer below.
					b.Fatalf("fail to create k-mer iterator. seq length: %d", size)
				}

				for {
					code, ok, err = iter.NextKmer()
					if err != nil {
						// A failing iterator makes the timing meaningless;
						// stop instead of reporting the same error repeatedly.
						b.Fatalf("fail to get kmer code: %d-%s", iter.Index(),
							benchSeqs[i].Seq[iter.Index():iter.Index()+31])
					}

					if !ok {
						break
					}

					// Sink the result into a package-level variable so the
					// compiler cannot eliminate the loop body.
					_code = code
				}
			}
		})
	}
}

func BenchmarkHashIterator(b *testing.B) {
for i := range benchSeqs {
size := len(benchSeqs[i].Seq)
Expand All @@ -147,25 +180,21 @@ func BenchmarkHashIterator(b *testing.B) {
}
}

func BenchmarkKmerIterator(b *testing.B) {
func BenchmarkProteinIterator(b *testing.B) {
for i := range benchSeqs {
size := len(benchSeqs[i].Seq)
b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
var code uint64
var ok bool

for j := 0; j < b.N; j++ {
iter, err := NewKmerIterator(benchSeqs[i], 31, true, false)
iter, err := NewProteinIterator(benchSeqs[i], 10, 1, 1)
if err != nil {
b.Errorf("fail to create hash iterator. seq length: %d", size)
}
for {
code, ok, err = iter.NextKmer()
if err != nil {
b.Errorf("fail to get kmer code: %d-%s", iter.Index(),
benchSeqs[i].Seq[iter.Index():iter.Index()+31])
}

for {
code, ok = iter.Next()
if !ok {
break
}
Expand Down
Loading

0 comments on commit 26edfd0

Please sign in to comment.