In [1]:
import time
import psutil
import os

In [2]:
%run BurrowsWheelerTransformImproved.ipynb

In [3]:
class FMIndexImproved():
    """FM index over a '$'-terminated text.

    Attributes set by the constructor:
      bwt            -- the transformed text returned by BWTViaSAImprovedDict
      length         -- len(bwt)
      sampledSA      -- {row: text offset} for the sampled suffix-array rows
      checkpoints    -- {char: [running counts]} sampled every checkpointStep rows
      checkpointStep -- spacing between rank checkpoints
      first          -- {char: number of characters in bwt that sort before char}
    """

    @staticmethod
    def SampleSuffixArray(suffixArray, step = 32):
        """Keep only suffix-array entries whose text offset is a multiple of step.

        Returns a dict mapping the row (position in suffixArray) to the offset
        stored at that row.
        """
        return {row: offset
                for row, offset in enumerate(suffixArray)
                if offset % step == 0}

    def __init__(self, seq, suffixArray = None, checkpointStep = 128, sampledSAStep = 32):
        """Build the index for seq.

        seq            -- text to index; a trailing '$' is appended when missing
        suffixArray    -- suffix array of the '$'-terminated text; computed with
                          SuffixArrayImprovedSort when omitted
        checkpointStep -- spacing of the rank checkpoints
        sampledSAStep  -- only offsets divisible by this value are kept in sampledSA
        """
        # endswith() also handles an empty seq, where seq[-1] would raise IndexError.
        if not seq.endswith('$'):
            seq += '$'
        # Identity test: `== None` is ambiguous (or raises) for array-like inputs.
        if suffixArray is None:
            suffixArray = SuffixArrayImprovedSort(seq)
        # NOTE(review): presumably the Burrows-Wheeler transform of seq derived
        # from suffixArray -- the helper lives in another notebook; confirm there.
        self.bwt = BWTViaSAImprovedDict(seq, suffixArray)
        self.sampledSA = self.SampleSuffixArray(suffixArray, sampledSAStep)
        self.length = len(self.bwt)

        self.CreateCheckpoints(checkpointStep)

        # Occurrences of each character in the transformed text.
        tots = {}
        for c in self.bwt:
            tots[c] = tots.get(c, 0) + 1

        # first[c] = how many characters sort strictly before c.
        self.first = {}
        totc = 0
        for c, count in sorted(tots.items()):
            self.first[c] = totc
            totc += count

    def CreateCheckpoints(self, checkpointStep = 128):
        """Record running character counts of self.bwt every checkpointStep rows.

        After the call, checkpoints[c][k] is the number of occurrences of c in
        bwt[0 .. k * checkpointStep] (inclusive).

        Raises ValueError when checkpointStep is not positive: zero would fail
        with ZeroDivisionError, and a negative step would make Rank read the
        wrong checkpoint.
        """
        if checkpointStep < 1:
            raise ValueError("checkpointStep must be a positive integer")

        self.checkpoints = {}
        self.checkpointStep = checkpointStep

        # One zeroed counter and one empty checkpoint list per distinct character.
        tally = dict.fromkeys(self.bwt, 0)
        for char in tally:
            self.checkpoints[char] = []

        for index, char in enumerate(self.bwt):
            # Count first, then snapshot, so a checkpoint includes its own row.
            tally[char] += 1
            if index % checkpointStep == 0:
                for c, seen in tally.items():
                    self.checkpoints[c].append(seen)

    def Rank(self, bwt, char, row):
        """Return how many times char occurs in bwt[0 .. row] (inclusive).

        bwt is expected to be the text the checkpoints were built from
        (callers in this class always pass self.bwt). Returns 0 for a negative
        row or for a character that has no checkpoints.
        """
        if row < 0 or char not in self.checkpoints:
            return 0
        index, numOccurrences = row, 0

        # Walk back to the nearest checkpoint, counting matches on the way.
        while index % self.checkpointStep != 0:
            if bwt[index] == char:
                numOccurrences += 1
            index -= 1
        return self.checkpoints[char][index // self.checkpointStep] + numOccurrences

    def Range(self, pattern):
        """Backward-search pattern and return the matching rows as (left, right).

        The interval is half-open: rows left .. right - 1 match. It is empty
        (right <= left) when the pattern does not occur.
        """
        left, right = 0, self.length - 1
        for i in range(len(pattern) - 1, -1, -1):
            left = self.Rank(self.bwt, pattern[i], left - 1) + self.Count(pattern[i])
            right = self.Rank(self.bwt, pattern[i], right) + self.Count(pattern[i]) - 1
            if right < left:
                break
        return left, right + 1

    def Resolve(self, row):
        """Return the text offset for row.

        Steps from row to row via Rank/Count until a row present in sampledSA
        is reached, then adds the number of steps taken to the sampled offset.
        """
        def StepLeft(row):
            char = self.bwt[row]
            return self.Rank(self.bwt, char, row - 1) + self.Count(char)

        numSteps = 0
        while row not in self.sampledSA:
            row = StepLeft(row)
            numSteps += 1
        return self.sampledSA[row] + numSteps

    def Count(self, char):
        """Return the number of characters in the text that sort before char.

        For a character that does not occur, this is the offset of the smallest
        occurring character greater than it, or the text length when every
        occurring character is smaller.
        """
        if char in self.first:
            return self.first[char]
        for cc in sorted(self.first):
            if char < cc:
                return self.first[cc]
        # char sorts after everything in the text, so every character precedes it.
        return self.length

    def HasSubstring(self, pattern):
        """Return True when pattern occurs somewhere in the text."""
        left, right = self.Range(pattern)
        return right > left

    def HasSuffix(self, pattern):
        """Return True when the text (ignoring the trailing '$') ends with pattern."""
        left, right = self.Range(pattern)
        # Skip the offset lookup entirely when nothing matched.
        if right <= left:
            return False
        # Only row `left` is checked: an occurrence followed directly by '$'
        # sorts first in the range because '$' is the smallest character.
        offset = self.Resolve(left)
        return offset + len(pattern) == self.length - 1

    def Search(self, pattern):
        """Return the text offsets of every occurrence of pattern (in row order)."""
        left, right = self.Range(pattern)
        return [self.Resolve(x) for x in range(left, right)]

In [4]:
# Benchmark inputs: one entry per genome file, each with the patterns to search for.
dataSet = [
    {"file": genomePath, "patterns": list(genomePatterns)}
    for genomePath, genomePatterns in (
        ("./data/13443_ref_Cara_1.0_chr1c.fa",
         ("ATGCATG", "TCTCTCTA", "TTCACTACTCTCA")),
        ("./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa",
         ("ATGATG", "CTCTCTA", "TCACTACTCTCA")),
        ("./data/144034_ref_Pbar_UMD_V03_chrUn.fa",
         ("CGCGAG", "GTCGAAT", "GGGCGTCATCGCGCG")),
    )
]

In [None]:
# Benchmark every genome/pattern pair. Each measurement covers building the
# index from scratch plus one Search call, so the numbers include construction.
for data in dataSet:
    file = data.get("file")
    genome = GetWholeGenomeFromFile(file)
    patterns = data.get("patterns")
    
    for pattern in patterns:
        # perf_counter() is monotonic; time.time() can jump if the system
        # clock is adjusted during these minutes-long runs.
        startTime = time.perf_counter()
        fm = FMIndexImproved(genome)
        fm.Search(pattern)
        endTime = time.perf_counter()
        duration = endTime - startTime
        print(f"{file} : {pattern} executed in {duration}")
        # Drop the index before the next build so two are never alive at once.
        del fm
    
    # Release the genome before the next file is loaded.
    del file
    del genome
    del patterns



./data/13443_ref_Cara_1.0_chr1c.fa : ATGCATG executed in 69.1028802394867
./data/13443_ref_Cara_1.0_chr1c.fa : TCTCTCTA executed in 67.58903789520264
./data/13443_ref_Cara_1.0_chr1c.fa : TTCACTACTCTCA executed in 67.73388338088989
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : ATGATG executed in 385.99411153793335
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : CTCTCTA executed in 205.94877338409424
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : TCACTACTCTCA executed in 214.81392812728882


In [6]:
# Single run: first genome, second pattern. Timing covers index build + search.
file = dataSet[0].get("file")
genome = GetWholeGenomeFromFile(file)
pattern = dataSet[0].get("patterns")[1]
    
# perf_counter() is monotonic, so the duration is immune to clock adjustments.
startTime = time.perf_counter()
fm = FMIndexImproved(genome)
fm.Search(pattern)
endTime = time.perf_counter()
duration = endTime - startTime
# Show the first-column offsets. The previous loop printed one blank line per
# key and ran inside the timed region; print the mapping itself, after timing.
print(fm.first)
print(f"{file} : {pattern} executed in {duration}")
del fm
    
del file
del genome
del pattern



{'$': 0, 'A': 1, 'C': 15903823, 'G': 25169900, 'N': 34468427, 'T': 34714096}
./data/13443_ref_Cara_1.0_chr1c.fa : TCTCTCTA executed in 78.05802965164185


NameError: name 'patterns' is not defined

In [None]:
seq = GetWholeGenomeFromFile(dataSet[0].get("file"))

# Tally how many times each character appears in the sequence.
tots = {}
for symbol in seq:
    if symbol in tots:
        tots[symbol] += 1
    else:
        tots[symbol] = 1

# Assemble the last column bucket by bucket: for each character (in sorted
# order) gather the rotations that start with it, sort them, and keep the
# final character of each.
# (An earlier draft appended each chunk to "./data/bwt1.txt" instead of printing.)
bwt = ""
for char, count in sorted(tots.items()):
    rotations = []
    for start, symbol in enumerate(seq):
        # Stop scanning once every occurrence of this character has been found.
        if len(rotations) == count:
            break
        if symbol == char:
            rotations.append(seq[start:] + seq[:start])
    rotations.sort()
    lastColumn = ''.join(rotation[-1] for rotation in rotations)
    bwt += lastColumn
    print(lastColumn)
    # Free the bucket before building the next one; each rotation is a full copy of seq.
    del lastColumn
    del rotations
del tots

T
