In [3]:
%run BurrowsWheelerTransform.ipynb

In [48]:
class FMIndex():
    """FM-index style substring search built on top of ``BWTViaSA(seq)``.

    Attributes:
        bwt: value returned by ``BWTViaSA(seq)``; that helper is defined
            outside this class (presumably the Burrows-Wheeler transform of
            ``seq`` computed via a suffix array -- confirm against its source).
        step: distance, in bwt positions, between two stored checkpoints.
        first: maps each symbol of ``bwt`` to the number of symbols in ``bwt``
            that sort strictly before it.
        checkpoints: ``checkpoints[k]`` holds the per-symbol counts of
            ``bwt[:k * step]``.
        offset: memo of ``Resolve`` results, keyed by the row that was resolved.
    """

    def __init__(self, seq, step=32):
        """Build the index for ``seq``, appending '$' when it is missing.

        Args:
            seq: sequence to index; an empty sequence is treated as just '$'.
            step: checkpoint spacing used by ``CreateCheckpoints``.
        """
        # safety in case seq is passed without the terminating symbol '$';
        # the emptiness test avoids an IndexError on seq[-1] for empty input
        if not seq or seq[-1] != '$':
            seq += '$'
        self.bwt = BWTViaSA(seq)
        self.offset = {}
        self.step = step

        # counter for all elements in BWT
        elemCount = {}
        for val in self.bwt:
            elemCount[val] = elemCount.get(val, 0) + 1

        # walking the symbols in sorted order, the running total is the number
        # of symbols sorting before the current one
        firstOccurence = {}
        idx = 0
        for c in sorted(elemCount):
            firstOccurence[c] = idx
            idx += elemCount[c]

        # create checkpoints
        self.first = firstOccurence
        self.CreateCheckpoints()

    def CreateCheckpoints(self):
        """Store a snapshot of the running symbol counts every ``step`` positions.

        The snapshot taken at position ``idx`` is made before ``bwt[idx]`` is
        counted, so it describes ``bwt[:idx]``.
        """
        occurenceCounter = {}
        checkpoints = []
        # idx - index of element in bwt
        # val - exact letter that is on idx position in bwt
        for idx, val in enumerate(self.bwt):
            if idx % self.step == 0:
                checkpoints.append(occurenceCounter.copy())
            occurenceCounter[val] = occurenceCounter.get(val, 0) + 1
        self.checkpoints = checkpoints

    def CountLetterWithCheckpoints(self, idx, letter):
        """Return how many times ``letter`` occurs in ``bwt[:idx]``.

        Starts from the checkpoint closest to ``idx`` and corrects the stored
        count by scanning only the positions between that checkpoint and
        ``idx``.
        """
        step = self.step
        # nearest checkpoint by integer rounding, clamped to the last one stored
        check = min((idx + step // 2) // step, len(self.checkpoints) - 1)
        pos = check * step  # checkpoint position

        # number of times this letter appeared up to this checkpoint
        count = self.checkpoints[check].get(letter, 0)

        if pos < idx:
            # checkpoint lies before idx: add what was seen after it
            count += sum(1 for i in range(pos, idx) if self.bwt[i] == letter)
        else:
            # checkpoint lies at/after idx: remove what it counted beyond idx
            count -= sum(1 for i in range(idx, pos) if self.bwt[i] == letter)

        return count

    def Rank(self, idx, letter):
        """Return ``first[letter]`` plus the count of ``letter`` in ``bwt[:idx]``.

        A letter that never occurs in ``bwt`` contributes a base of 0.
        """
        base = self.first.get(letter, 0)
        return base + self.CountLetterWithCheckpoints(idx, letter)

    def Resolve(self, idx):
        """Return the number of ``Rank`` steps from row ``idx`` to the '$' row.

        ``Search`` reports this value as the match position. Results are
        memoised in ``self.offset`` under the row that was asked for, so later
        walks that reach an already resolved row stop there.
        """
        num = 0
        i = idx
        while self.bwt[i] != '$':
            if i in self.offset:
                # remaining distance from row i is already known
                num += self.offset[i]
                break
            num += 1
            i = self.Rank(i, self.bwt[i])

        # remember the answer for the row we were asked about (not the row the
        # walk ended on), otherwise the memo could never be hit
        self.offset[idx] = num
        return num

    def Range(self, pattern):
        """Return the half-open row range ``(left, right)`` matching ``pattern``.

        The pattern is consumed from its last symbol to its first, narrowing
        the range with ``Rank`` at every step. Returns ``(-1, -1)`` as soon as
        the range becomes empty; an empty pattern yields ``(0, len(bwt))``.
        """
        left = 0
        right = len(self.bwt)
        for val in reversed(pattern):
            left = self.Rank(left, val)
            right = self.Rank(right, val)
            # an empty range means the pattern doesn't appear in the sequence
            if left == right:
                return (-1, -1)
        return (left, right)

    def Search(self, pattern):
        """Return the resolved position of every row matching ``pattern``.

        Positions are listed in row order, not sorted; no match gives ``[]``.
        """
        left, right = self.Range(pattern)
        return [self.Resolve(i) for i in range(left, right)]

    def HasSubstring(self, pattern):
        """Return True if ``Range`` finds at least one row for ``pattern``."""
        left, right = self.Range(pattern)
        return right > left

    def HasSuffix(self, pattern):
        """Return True if the indexed sequence ends with ``pattern`` before '$'."""
        left, right = self.Range(pattern)
        # no match: bail out before Resolve would be called with row -1
        if right <= left:
            return False
        # the first matching row must end exactly at the terminating '$'
        return self.Resolve(left) + len(pattern) == len(self.bwt) - 1

In [5]:
# Tally every character of the sample string, reading the running count with
# a default of zero for characters not seen yet.
elemCount = {}
for val in "Test String":
    elemCount[val] = elemCount.get(val, 0) + 1

print(elemCount)

# Same tally again, this time seeding unseen characters before incrementing.
elemCount2 = {}
for i in "Test String":
    elemCount2.setdefault(i, 0)
    elemCount2[i] += 1

print(elemCount2)

{'T': 1, 'e': 1, 's': 1, 't': 2, ' ': 1, 'S': 1, 'r': 1, 'i': 1, 'n': 1, 'g': 1}
{'T': 1, 'e': 1, 's': 1, 't': 2, ' ': 1, 'S': 1, 'r': 1, 'i': 1, 'n': 1, 'g': 1}


In [71]:
# Sample sequence, already terminated with '$'; adjacent literals are joined
# by the parser into one string.
test4 = (
    "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTTGATTTGG"
    "GGTTCAAAGCAGTAATTTGGGGTTCAAAGCAGTATCGACAAATAGTAAATCCATTTGTTCATTCAAAGCAGTAATT"
    "TGGGGTTATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT$"
)

# Query patterns used by the scratch calls below.
patternInString = "AGTATCGAC"
patternNotInString = "GATTACA"
suffixPattern = "TTT"

# step=1 stores a checkpoint at every bwt position.
fm = FMIndex(test4, step=1)

#print(fm.bwt)

#print(fm.offset)

#print(fm.step)

#print(fm.first)

#print(fm.checkpoints)

#print(fm.CountLetterWithCheckpoints(50, 'A'))

#print(fm.Rank(50, 'A'))

#print(fm.Resolve(50))

#print(fm.Range(patternInString))
#print(fm.Range(patternNotInString))

#print(fm.Search(patternInString))
#print(fm.Search(patternNotInString))

#pos = fm.Search(patternInString)
#for p in pos:
#    print(test4[int(p):int(p) + len(patternInString)])

#print(fm.HasSubstring(patternInString))
#print(fm.HasSubstring(patternNotInString))

#print(fm.HasSuffix(patternInString))
#print(fm.HasSuffix(patternNotInString))
#print(fm.HasSuffix(suffixPattern))