In [32]:
#!python -m pip install pydivsufsort
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# These two commands can go in the README: the first installs pydivsufsort, the second is for when Jupyter has trouble while a large FASTA file is being loaded.

In [33]:
from collections import defaultdict
from Bio import SeqIO
from pydivsufsort import divsufsort

In [34]:
""" Improved suffix array algorithm using Python's defaultdict """
def SuffixArrayImprovedDict(seq):
    """Build the suffix array of ``seq`` by recursive prefix bucketing.

    Suffix start positions are grouped by their first ``width`` characters.
    Groups are visited in sorted prefix order; a group holding a single
    suffix is final, while a larger group is refined again with the prefix
    width doubled.

    Args:
        seq: Sliceable sequence whose slices are hashable and orderable
            (e.g. ``str``).

    Returns:
        List of start positions of the suffixes of ``seq`` in sorted order.
    """
    def _refine(starts, width):
        # Group the candidate suffixes by their leading `width` characters.
        groups = defaultdict(list)
        for start in starts:
            groups[seq[start : start + width]].append(start)

        ordered = []
        for prefix in sorted(groups):
            members = groups[prefix]
            if len(members) == 1:
                ordered.append(members[0])
            else:
                # Still tied on this prefix: compare twice as many characters.
                # Distinct suffixes differ in length, so ties always resolve.
                ordered.extend(_refine(members, width * 2))
        return ordered

    return _refine(range(len(seq)), 1)

In [35]:
""" Improved suffix array algorithm using library pydivsufsort """
def SuffixArrayImprovedSort(seq):
    """Return the suffix array of ``seq`` as a list, computed by ``divsufsort``.

    Args:
        seq: Input handed unchanged to ``pydivsufsort.divsufsort``.
            NOTE(review): confirm which input types (str / bytes / array)
            the installed pydivsufsort version accepts.

    Returns:
        ``list`` built from whatever ``divsufsort`` returns.
    """
    suffix_array = divsufsort(seq)
    return list(suffix_array)

In [36]:
def BWTViaSAImprovedDict(seq, suffixArray = None):
    """Build the Burrows-Wheeler transform of ``seq`` from its suffix array.

    For every suffix start ``si`` in suffix-array order, the output holds the
    character preceding it, ``seq[si - 1]``; for ``si == 0`` it holds ``'$'``.

    Args:
        seq: Text to transform.
            NOTE(review): because ``'$'`` is emitted for the suffix starting
            at 0, ``seq`` is presumably expected to already end with ``'$'``
            — confirm against callers.
        suffixArray: Optional precomputed suffix array (any iterable of
            integer positions, including a NumPy array). When omitted it is
            computed with ``SuffixArrayImprovedDict``.

    Returns:
        The BWT as a ``str``.
    """
    # Identity check, not `== None`: comparing a NumPy array with `==` is
    # element-wise and its truth value raises ValueError.
    if suffixArray is None:
        suffixArray = SuffixArrayImprovedDict(seq)
    return ''.join('$' if si == 0 else seq[si - 1] for si in suffixArray)

In [37]:
def BWTViaSAImprovedSort(seq, suffixArray):
    """Build the Burrows-Wheeler transform of ``seq`` from a given suffix array.

    Args:
        seq: Text to transform.
        suffixArray: Iterable of suffix start positions in sorted-suffix order.

    Returns:
        String whose i-th character is ``seq[suffixArray[i] - 1]``, or ``'$'``
        where ``suffixArray[i]`` is 0.
    """
    return ''.join(
        '$' if start == 0 else seq[start - 1]
        for start in suffixArray
    )

In [38]:
""" Given BWT string bw, return parallel list of B-ranks. Also
    return tots: map from character to # times it appears. """
def RankBWT(bw):
    """Compute per-position ranks and per-character totals for a BWT string.

    Args:
        bw: Iterable of characters (the BWT / last column).

    Returns:
        ``(ranks, tots)`` where ``ranks[i]`` is the number of earlier
        occurrences of ``bw[i]`` in ``bw``, and ``tots`` maps each character
        to its total number of occurrences.
    """
    tots = {}
    ranks = []
    for symbol in bw:
        seen_so_far = tots.get(symbol, 0)
        ranks.append(seen_so_far)
        tots[symbol] = seen_so_far + 1
    return ranks, tots

In [39]:
""" Return map from character to the range of rows prefixed by 
    the character. """
def FirstColumn(tots):
    """Map each character to the row range it occupies in the first column.

    Characters are laid out in sorted order, each taking as many consecutive
    rows as its count.

    Args:
        tots: Mapping from character to number of occurrences.

    Returns:
        Dict mapping character to ``(start, end)``: start inclusive, end
        exclusive.
    """
    ranges = {}
    start = 0
    for symbol in sorted(tots):
        end = start + tots[symbol]
        ranges[symbol] = (start, end)
        start = end
    return ranges

In [40]:
"""Return the min and max ranks of a specified caracter from the BWT"""
def SetRank(ranks, lColumn, char, lowerIndex, higherIndex):
    """Return the ranks of the first and last ``char`` within a last-column range.

    Args:
        ranks: Rank of each row's character, parallel to ``lColumn``.
        lColumn: Last column (BWT) indexed by row.
        char: Character to look for.
        lowerIndex: First row of the range (inclusive).
        higherIndex: End of the range (exclusive).

    Returns:
        ``(rank_of_first_hit, rank_of_last_hit)``, or ``(-1, -1)`` when
        ``char`` does not occur in the range.
    """
    # Rows are visited in ascending order, so the first and last hits are
    # the minimum and maximum matching rows.
    hits = [row for row in range(lowerIndex, higherIndex) if lColumn[row] == char]
    if not hits:
        return (-1, -1)
    return (ranks[hits[0]], ranks[hits[-1]])

In [41]:
"""Return the range of indexes for a specified caracter with given ranks from the First Column of BWM"""
def SetIndex(fColumn, char, lowerRank, higherRank):
    """Convert a rank interval of ``char`` into a first-column row range.

    Args:
        fColumn: Mapping from character to its ``(start, end)`` row range.
        char: Character whose rows are addressed.
        lowerRank: Smallest rank in the interval.
        higherRank: Largest rank in the interval (inclusive).

    Returns:
        ``(lowerIndex, higherIndex)`` with ``higherIndex`` exclusive.
    """
    blockStart = fColumn[char][0]
    # +1 turns the inclusive highest rank into an exclusive upper bound.
    return (blockStart + lowerRank, blockStart + higherRank + 1)

In [1]:
""" Returns positions where the pattern matches the sequence, otherwise throw exception """
def SearchViaImprovedSort(sequnce, pattern):
    """Find all occurrences of ``pattern`` in ``sequnce`` by backward search.

    The suffix array comes from ``SuffixArrayImprovedSort``. The pattern is
    consumed from its last character to its first, narrowing a row range of
    the sorted suffixes at every step.

    Args:
        sequnce: Text to search.
            NOTE(review): the BWT helper emits ``'$'`` for the suffix at
            position 0, so the text is presumably expected to end with
            ``'$'`` — confirm.
        pattern: Pattern to look for.

    Returns:
        List of start positions of the matches, in suffix-array order (not
        sorted by position). ``[-1]`` when the pattern does not occur,
        including when it contains a character absent from the text.
        An empty pattern yields ``[]``.
    """
    positions = SuffixArrayImprovedSort(sequnce)
    lColumn = BWTViaSAImprovedSort(sequnce, positions)
    ranks, tots = RankBWT(lColumn)
    fColumn = FirstColumn(tots)

    # Empty range by default, so an empty pattern returns [].
    lowerIndex, higherIndex = 0, 0

    for step, char in enumerate(reversed(pattern)):
        # A character missing from the text can never match. Checking here
        # avoids a KeyError on the first step and keeps the [-1] sentinel
        # consistent for every kind of mismatch.
        if char not in fColumn:
            return [-1]
        if step == 0:
            # Last pattern character: start from all rows beginning with it.
            (lowerIndex, higherIndex) = fColumn[char]
            continue
        (lowerRank, higherRank) = SetRank(ranks, lColumn, char, lowerIndex, higherIndex)
        if lowerRank == -1 or higherRank == -1:
            return [-1]
        (lowerIndex, higherIndex) = SetIndex(fColumn, char, lowerRank, higherRank)

    return [positions[i] for i in range(lowerIndex, higherIndex)]

In [None]:
""" Returns positions where the pattern matches the sequence, otherwise throw exception """
def SearchViaImprovedDict(sequnce, pattern):
    """Find all occurrences of ``pattern`` in ``sequnce`` by backward search.

    The suffix array comes from ``SuffixArrayImprovedDict``. The pattern is
    consumed from its last character to its first, narrowing a row range of
    the sorted suffixes at every step.

    Args:
        sequnce: Text to search.
            NOTE(review): the BWT helper emits ``'$'`` for the suffix at
            position 0, so the text is presumably expected to end with
            ``'$'`` — confirm.
        pattern: Pattern to look for.

    Returns:
        List of start positions of the matches, in suffix-array order (not
        sorted by position). ``[-1]`` when the pattern does not occur,
        including when it contains a character absent from the text.
        An empty pattern yields ``[]``.
    """
    positions = SuffixArrayImprovedDict(sequnce)
    lColumn = BWTViaSAImprovedDict(sequnce, positions)
    ranks, tots = RankBWT(lColumn)
    fColumn = FirstColumn(tots)

    # Empty range by default, so an empty pattern returns [].
    lowerIndex, higherIndex = 0, 0

    for step, char in enumerate(reversed(pattern)):
        # A character missing from the text can never match. Checking here
        # avoids a KeyError on the first step and keeps the [-1] sentinel
        # consistent for every kind of mismatch.
        if char not in fColumn:
            return [-1]
        if step == 0:
            # Last pattern character: start from all rows beginning with it.
            (lowerIndex, higherIndex) = fColumn[char]
            continue
        (lowerRank, higherRank) = SetRank(ranks, lColumn, char, lowerIndex, higherIndex)
        if lowerRank == -1 or higherRank == -1:
            return [-1]
        (lowerIndex, higherIndex) = SetIndex(fColumn, char, lowerRank, higherRank)

    return [positions[i] for i in range(lowerIndex, higherIndex)]

In [7]:
""" Get whole genome sequence from fasta file, concatenates all reads """
def GetWholeGenomeFromFile(file):
    """Read a FASTA file and return all its sequences joined into one string.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        The sequences of every record, concatenated in file order, followed
        by a terminating ``"$"``.
    """
    # The context manager closes the handle even if parsing fails; the
    # records must be consumed inside it because SeqIO.parse reads lazily.
    with open(file) as handle:
        chunks = [str(record.seq) for record in SeqIO.parse(handle, 'fasta')]
    # One join instead of repeated string concatenation.
    return ''.join(chunks) + "$"

In [44]:
# Small smoke-test input; it already carries the trailing "$" terminator.
test1 = "Tomorrow_and_tomorrow_and_tomorrow$"
# Uncomment to load a full genome instead (requires the FASTA file at this path).
#seq = GetWholeGenomeFromFile("./data/144034_ref_Pbar_UMD_V03_chrUn.fa")

In [45]:
#print(BWTViaSAImprovedDict(test1))
#BWTViaSAImprovedDict(seq)

In [46]:
#BWTViaSAImprovedDict(seq)
#BWTViaSAImprovedSort(seq, SuffixArrayImprovedSort(seq))

