### Implementing a K-mer Index

In [1]:
import bisect # allows you to do binary search on a list

### Create an Index object
##### Incorporates two methods:
##### 1. Initialization method (preprocesses the string)
##### 2. Query function

In [4]:
class Index(object):
    """Sorted k-mer index over a text, searched with binary search.

    Attributes:
        k: length of the substrings (k-mers) stored in the index.
        index: list of ``(kmer, offset)`` tuples in ascending order, one
            entry for every length-k window of the indexed text.
    """

    def __init__(self, t, k):
        """Preprocess text ``t`` by recording every k-mer with its offset.

        Args:
            t: the text to index.
            k: the k-mer length.
        """
        self.k = k
        # One (substring, start offset) pair per window of length k; keeping
        # the pairs sorted is what lets query() use bisect for the look-up.
        self.index = sorted(
            (t[start:start + k], start) for start in range(len(t) - k + 1)
        )

    def query(self, p):
        """Return the text offsets whose k-mer equals the first k items of ``p``.

        Only the leading k-mer of ``p`` is compared, so the result is a list
        of candidate positions; the remainder of ``p`` is not checked here.

        Args:
            p: the pattern to look up.

        Returns:
            List of offsets taken from the matching index entries, in index
            order. Empty when the leading k-mer is not in the index.
        """
        kmer = p[:self.k]
        entries = self.index

        # Stored offsets start at 0, so (kmer, -1) sorts just before the
        # first real entry for this k-mer and bisect_left lands on it.
        first = bisect.bisect_left(entries, (kmer, -1))

        hits = []
        for pos in range(first, len(entries)):
            found, offset = entries[pos]
            if found != kmer:
                # Entries are sorted, so the run of equal k-mers has ended.
                break
            hits.append(offset)
        return hits









### Use the Index object to match a pattern p against a text t with the queryIndex function

In [5]:
def queryIndex(p, t, index):
    """Return every offset in ``t`` at which pattern ``p`` occurs exactly.

    The index supplies candidate offsets where the first ``index.k``
    characters of ``p`` occur; each candidate is then verified by comparing
    the rest of ``p`` against ``t``.

    Args:
        p: pattern to search for.
        t: text to search in; expected to be the text ``index`` was built from.
        index: object exposing ``k`` (the k-mer length) and ``query(p)``
            (candidate offsets for the first ``k`` characters of ``p``).

    Returns:
        List of offsets ``i`` such that ``t[i:i + len(p)] == p``. An empty
        pattern yields whatever the index-based path produces for it.
    """
    k = index.k
    p_len = len(p)

    if 0 < p_len < k:
        # A pattern shorter than k can never equal a stored k-mer, so the
        # index look-up would silently report no matches. Scan t directly.
        return [i for i in range(len(t) - p_len + 1) if t[i:i + p_len] == p]

    # The index already matched p[:k]; only the remainder needs checking.
    # A candidate too close to the end of t yields a shorter slice and is
    # rejected by the comparison.
    return [i for i in index.query(p) if p[k:] == t[i + k:i + p_len]]



### Matching a pattern against text

In [8]:
# Example text to search in.
t = 'GCTACGATCTAGAATCTA'
# Example pattern to look for within t.
p = 'TCTA'

In [21]:
# Build an index over every 2-mer of t.
index = Index(t, 2)
print("Returns a list of tuples alphabetically sorted kmer[0], index[1]")
# Show only the first five (kmer, offset) entries of the sorted index.
print(index.index[:5], "etc...")
# Look up p through the index and report the offsets where it occurs in t.
print("Returns positions of matches of p within t", queryIndex(p, t, index))

Returns a list of tuples alphabetically sorted kmer[0], index[1]
[('AA', 12), ('AC', 3), ('AG', 10), ('AT', 6), ('AT', 13)] etc...
Returns positions of matches of p within t [7, 14]


### Verify that the results above are correct

In [10]:
# Slice of len(p) characters starting at the first reported offset (7); should equal p.
t[7:11]

'TCTA'

In [11]:
# Slice of len(p) characters starting at the second reported offset (14); should equal p.
t[14:18]

'TCTA'