In [None]:
#!python -m pip install pydivsufsort
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# This can go in the README: the first command installs pydivsufsort; the second is for when Jupyter has trouble loading the FASTA file.

In [None]:
from pydivsufsort import divsufsort

In [None]:
""" Return list of rotations of input string t """
def Rotations(t):
    """Return every rotation of t, ordered by starting offset.

    Rotation i begins with t[i] and wraps around to finish with t[i - 1].
    An empty t yields an empty list.
    """
    # Moving the first i items to the back gives the i-th left rotation.
    return [t[i:] + t[:i] for i in range(len(t))]

In [None]:
""" Return lexicographically sorted list of t's rotations """
def BWM(t):
    """Return the rotations of t as a list in ascending sorted order.

    These are the rows of the Burrows-Wheeler matrix of t.
    """
    rows = Rotations(t)
    rows.sort()
    return rows

In [None]:
""" Given T, returns BWT(T) (last column) by creating BWM """
def BWTViaBWM(t):
    """Return BWT(t): the last character of each sorted rotation, joined.

    Builds the full rotation matrix through BWM, so memory use grows with
    the square of len(t).
    """
    return ''.join(row[-1] for row in BWM(t))

In [None]:
""" Given T return suffix array SA(T) """
def DeprecatedSuffixArray(s):
    """Return the suffix array of s as a list of start offsets.

    Naive reference implementation: every suffix is sliced out and compared
    in full while sorting.
    """
    # All suffixes have different lengths, so no two compare equal; ordering
    # the offsets by their suffix matches sorting (suffix, offset) pairs.
    return sorted(range(len(s)), key=lambda start: s[start:])

In [None]:
""" Given text, return suffix array SA(text) """
def SuffixArray(text):
    """Return the suffix array of text as a list.

    The ordering is delegated to pydivsufsort's divsufsort; its result is
    only converted to a list here.
    """
    offsets = divsufsort(text)
    return list(offsets)

In [None]:
""" Given T, returns BWT(T) (last column) by way of the suffix array """
def DeprecatedBWTViaSA(t):
    """Return BWT(t) as a string, computing the suffix array internally.

    For each suffix, in suffix-array order, the output holds the character
    that precedes it in t. The suffix starting at offset 0 has no
    predecessor and contributes '$'.
    """
    return ''.join(
        '$' if start == 0 else t[start - 1]
        for start in SuffixArray(t)
    )

In [None]:
""" Given Suffix Array s and text t, return BWT(t) (last column)"""
def BWTViaSA(text, suffixArray):
    """Return BWT(text) as a string, given the suffix array of text.

    For each offset in suffixArray, in order, the output holds the character
    of text immediately before that offset. Offset 0 has no predecessor and
    contributes '$'.
    """
    return ''.join(
        '$' if start == 0 else text[start - 1]
        for start in suffixArray
    )

In [None]:
""" Given BWT string bw, return parallel list of B-ranks. Also
    return tots: map from character to # times it appears. """
def RankBWT(bw):
    """Return (ranks, tots) for the BWT string bw.

    ranks[i] is the number of earlier occurrences of bw[i] in bw, and tots
    maps each character to its total number of occurrences.
    """
    seen = {}
    ranks = []
    for ch in bw:
        occurrence = seen.get(ch, 0)
        ranks.append(occurrence)
        seen[ch] = occurrence + 1
    return ranks, seen

In [None]:
""" Return map from character to the range of rows prefixed by 
    the character. """
def FirstColumn(tots):
    """Return a map from each character to its (start, end) row range.

    Characters are laid out in sorted order, each taking as many consecutive
    rows as its count in tots; start is inclusive and end is exclusive.
    """
    ranges = {}
    start = 0
    for ch in sorted(tots):
        end = start + tots[ch]
        ranges[ch] = (start, end)
        start = end
    return ranges

In [None]:
""" Make T from BWT(T) """
def ReverseBWT(bw):
    """Reconstruct the original text T from bw = BWT(T).

    Starting at row 0, the text is recovered right to left: each step takes
    the character in the last column and jumps to the first-column row
    holding the same character with the same rank. The walk stops at the row
    whose last-column character is '$'.

    Returns the text including its trailing '$'.

    Raises ValueError if bw contains no '$', or if the walk does not reach
    '$' within len(bw) steps (bw is not a valid BWT). Without this bound
    such input would loop forever.
    """
    if '$' not in bw:
        raise ValueError("BWT string must contain the '$' terminator")

    ranks, tots = RankBWT(bw)
    first = FirstColumn(tots)

    rowi = 0                   # first row
    collected = ['$']          # text characters, rightmost first
    # The row-to-row jump is a permutation of the rows, so a walk that ever
    # reaches '$' does so within len(bw) steps.
    for _ in range(len(bw)):
        c = bw[rowi]
        if c == '$':
            break
        collected.append(c)
        # jump to row that starts with c of same rank
        rowi = first[c][0] + ranks[rowi]
    else:
        raise ValueError("malformed BWT: walk from row 0 never reaches '$'")

    # Characters were gathered right to left; one join avoids the quadratic
    # cost of repeatedly prepending to a string.
    return ''.join(reversed(collected))

In [None]:
"""Return the min and max ranks of a specified caracter from the BWT"""
def setRank(ranks, lColumn, char, lowerIndex, higherIndex):
    """Return the ranks of the first and last occurrence of char in a row range.

    Scans lColumn[lowerIndex:higherIndex] (lowerIndex inclusive, higherIndex
    exclusive) and returns (ranks[first_hit], ranks[last_hit]).

    Raises ValueError (from min()) when char does not occur in that range.
    """
    hits = [
        row
        for row in range(lowerIndex, higherIndex)
        if lColumn[row] == char
    ]
    first_hit, last_hit = min(hits), max(hits)
    return (ranks[first_hit], ranks[last_hit])

In [None]:
"""Return the range of indexes for a specified caracter with given ranks from the First Column of BWM"""
def setIndex(fColumn, char, lowerRank, higherRank):
    """Return the first-column row range for char with ranks lowerRank..higherRank.

    fColumn maps each character to a tuple whose first item is the first row
    starting with that character. The returned (lowerIndex, higherIndex)
    pair is inclusive at the low end and exclusive at the high end.
    """
    base = fColumn[char][0]
    # +1 turns the inclusive higherRank into an exclusive upper bound.
    return (base + lowerRank, base + higherRank + 1)

In [None]:
"""Return positions where the pattern matches the sequence, otherwise throw exception"""
def DeprecatedSearch(seqence, pattern):
    """Return the offsets in seqence where pattern occurs (backward search).

    Older variant that builds the last column from the full sorted rotation
    matrix (BWTViaBWM); the offsets come from the suffix array.

    seqence is the text to search; a trailing '$' sentinel is appended if
    missing. pattern is consumed from its last character to its first.

    Returns the matching offsets in suffix-array order. An empty pattern
    returns an empty list.

    Raises KeyError if the last character of pattern does not occur in the
    text, and ValueError (from setRank) if the pattern cannot be extended to
    the left, i.e. it has no occurrence.
    """
    # Without the sentinel the rotation matrix matches patterns that wrap
    # around the end of the text, and its rows need not line up with the
    # suffix array. Offsets are unaffected because '$' is added at the end.
    if not seqence.endswith('$'):
        seqence = seqence + '$'

    lColumn = BWTViaBWM(seqence)
    ranks, tots = RankBWT(lColumn)
    fColumn = FirstColumn(tots)
    positions = SuffixArray(seqence)

    lowerIndex = higherIndex = 0
    for step, char in enumerate(reversed(pattern)):
        if step == 0:
            # The last pattern character selects its whole first-column range.
            lowerIndex, higherIndex = fColumn[char]
            continue
        # Narrow to the rows preceded by char, then map them to the first column.
        lowerRank, higherRank = setRank(ranks, lColumn, char, lowerIndex, higherIndex)
        lowerIndex, higherIndex = setIndex(fColumn, char, lowerRank, higherRank)

    return [positions[i] for i in range(lowerIndex, higherIndex)]

In [None]:
"""Return positions where the pattern matches the sequence, otherwise throw exception"""
def Search(seqence, pattern):
    """Return the offsets in seqence where pattern occurs (backward search).

    The suffix array is computed once and reused both to build the last
    column (BWTViaSA) and to translate matching rows into text offsets.

    seqence is the text to search; a trailing '$' sentinel is appended if
    missing. pattern is consumed from its last character to its first.

    Returns the matching offsets in suffix-array order. An empty pattern
    returns an empty list.

    Raises KeyError if the last character of pattern does not occur in the
    text, and ValueError (from setRank) if the pattern cannot be extended to
    the left, i.e. it has no occurrence.
    """
    # BWTViaSA writes '$' for the suffix at offset 0. If the text has no
    # sentinel, that '$' replaces the real last character in the counts, so
    # the first-column ranges no longer match the suffix array and matches
    # are lost. Offsets are unaffected because '$' is added at the end.
    if not seqence.endswith('$'):
        seqence = seqence + '$'

    positions = SuffixArray(seqence)
    lColumn = BWTViaSA(seqence, positions)
    ranks, tots = RankBWT(lColumn)
    fColumn = FirstColumn(tots)

    lowerIndex = higherIndex = 0
    for step, char in enumerate(reversed(pattern)):
        if step == 0:
            # The last pattern character selects its whole first-column range.
            lowerIndex, higherIndex = fColumn[char]
            continue
        # Narrow to the rows preceded by char, then map them to the first column.
        lowerRank, higherRank = setRank(ranks, lColumn, char, lowerIndex, higherIndex)
        lowerIndex, higherIndex = setIndex(fColumn, char, lowerRank, higherRank)

    return [positions[i] for i in range(lowerIndex, higherIndex)]