In [8]:
# Problem 1: Build a simple, list based k-mer index of a string to be searched

# Example adapted from Ben Langmead (thanks!)

import bisect
import sys

class Index(object):
    ''' Sorted-list k-mer index of a text, supporting exact pattern lookup.

    self.index holds one (k-mer, offset) tuple for every k-mer of t, sorted
    lexicographically, so entries for the same k-mer are adjacent and
    ordered by increasing offset.
    '''

    def __init__(self, t, k):
      ''' Create index from all substrings of t of size k

      t -- text to be searched
      k -- k-mer length
      '''
      self.t = t
      self.k = k  # k-mer length (k)

      # Slice with k (not a fixed width) so the index is valid for any k.
      # A text shorter than k yields an empty index.
      self.index = sorted((t[i:i+k], i) for i in range(len(t) - k + 1))

    def queryKmer(self, kmer):
      ''' Return offsets of kmer in t, in increasing order

      Raises AssertionError if len(kmer) differs from the index's k.
      '''
      assert len(kmer) == self.k

      hits = []

      # Offsets are never negative, so (kmer, -1) sorts immediately before
      # the first real entry for kmer; bisect_left therefore lands on it.
      pos = bisect.bisect_left(self.index, (kmer, -1))

      # Entries for the same k-mer are contiguous and already offset-sorted.
      while pos < len(self.index) and self.index[pos][0] == kmer:
        hits.append(self.index[pos][1])
        pos += 1
      return hits

    def query(self, p):
      ''' Return offsets of all occurrences of pattern p in t, in increasing order

      The first k characters of p are looked up in the index and every hit
      is then verified against the rest of p.
      '''
      if len(p) < self.k:
        # p has no complete k-mer to look up, so scan the text directly.
        return [i for i in range(len(self.t) - len(p) + 1)
                if self.t.startswith(p, i)]

      suffix = p[self.k:]
      occurrences = []
      for offset in self.queryKmer(p[:self.k]):
        # The k-mer prefix already matches; only the remainder needs checking.
        # A hit too close to the end of t yields a shorter slice and fails.
        if self.t[offset + self.k:offset + len(p)] == suffix:
          occurrences.append(offset)
      return occurrences
     
# Text to be searched and the pattern to look for in it
text = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
pattern = 'GGTATTCGGGA'
K = 3  # k-mer length used to build the index

# Build the k-mer index over text; reused by the test cells that follow
index = Index(text, K)

In [9]:
# Test queryKmer method: the 3-mer "GGT" is expected at offsets 21, 64 and 68 of text
index.queryKmer("GGT") == [21, 64, 68]

True

In [10]:
# Test query method: the full pattern is expected at offsets 21 and 68
# (the "GGT" hit at offset 64 does not extend to a full match)
index.query(pattern) == [21, 68]

True

In [11]:
# Report index specificity: number of full-pattern occurrences divided by
# the number of index hits for the pattern's first K characters
float(len(index.query(pattern)))/len(index.queryKmer(pattern[:K]))

0.6666666666666666

In [12]:
# Problem 2: Build a simple suffix array


class SuffixArray(object):
    ''' Suffix array of a text, searchable by binary search.

    The text is terminated with a "$" sentinel before the suffixes are
    sorted; self.index lists the start offsets of the suffixes of t + "$"
    in lexicographic order.
    '''

    def __init__(self, t):
      ''' Create suffix array representing suffixes in t

      t -- text to be searched
      '''
      self.t = t
      self.td = t + "$"
      # Array of integers representing lexicographically sorted suffixes of t
      # e.g. for t$ = ATA$
      # have suffixes
      # 0 = ATA$
      # 1 = TA$
      # 2 = A$
      # 3 = $
      # such that self.index == [ 3, 2, 0, 1 ]

      # Sort the start offsets directly, keyed by the suffix each one begins.
      self.index = sorted(range(len(self.td)), key=lambda i: self.td[i:])

    def query(self, p):
      ''' Return occurrences of pattern p in t

      Offsets are returned in suffix-array (lexicographic) order, not in
      increasing numeric order. Matches that would run into the "$"
      sentinel are not occurrences in t and are left out.
      '''
      width = len(p)
      size = len(self.index)

      # Lower bound: first suffix whose leading len(p) characters are >= p.
      # Comparing only that prefix orders suffixes relative to p exactly as
      # comparing whole suffixes would, without copying each suffix.
      lo, hi = 0, size
      while lo < hi:
        mid = (lo + hi) // 2
        offset = self.index[mid]
        if self.td[offset:offset + width] < p:
          lo = mid + 1
        else:
          hi = mid
      start = lo

      # Upper bound: suffixes beginning with p form one contiguous run that
      # starts at `start`; find the first suffix past the end of that run.
      hi = size
      while lo < hi:
        mid = (lo + hi) // 2
        if self.td.startswith(p, self.index[mid]):
          lo = mid + 1
        else:
          hi = mid
      end = lo

      # Keep only matches lying entirely inside t (excludes the sentinel).
      limit = len(self.t)
      return [offset for offset in self.index[start:end]
              if offset + width <= limit]
      

In [13]:
# Test suffix array construction: for "ATA" the sorted suffixes of "ATA$"
# are "$", "A$", "ATA$", "TA$", i.e. start offsets 3, 2, 0, 1
sa = SuffixArray("ATA")
sa.index == [ 3, 2, 0, 1 ]

True

In [14]:
# Test suffix array search: the result is sorted before comparing, so the
# order in which query returns offsets does not affect the check
sa = SuffixArray(text)
sorted(sa.query(pattern)) == [21, 68]

True