In [None]:
import time
import psutil
import os

In [None]:
# Execute the two companion notebooks so that the names they define become
# available in this kernel.
# NOTE(review): BWTViaBWM, BWTViaSA, ReverseBWT, GetWholeGenomeFromFile,
# SearchViaImprovedSort, SearchViaImprovedDict and BWTViaSAImprovedDict are used
# below but not defined in this notebook; presumably they come from these two
# notebooks — confirm.
%run BurrowsWheelerTransform.ipynb
%run BurrowsWheelerTransformImproved.ipynb

In [None]:
"""Test samples"""

# Three short underscore-separated text samples; every sample ends with "$".
test1 = "Tomorrow_and_tomorrow_and_tomorrow$"
test2 = "It_was_the_best_of_times_it_was_the_worst_of_times$"
test3 = "in_the_jingle_jangle_morning_Ill_come_following_you$"

# A longer sample over the alphabet A/C/G/T. Adjacent literals are joined at
# compile time, so the resulting string contains no line breaks.
test4 = (
    "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTTGATTTGG"
    "GGTTCAAAGCAGTAATTTGGGGTTCAAAGCAGTATCGACAAATAGTAAATCCATTTGTTCATTCAAAGCAGTAATT"
    "TGGGGTTATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT$"
)

In [None]:
""" Performance measurement and benchmarking """

# Time BWTViaBWM on each of the four samples.
%timeit BWTViaBWM(test1)
%timeit BWTViaBWM(test2)
%timeit BWTViaBWM(test3)
%timeit BWTViaBWM(test4)

# Blank line to separate the groups of timings in the cell output.
print()

# Time BWTViaSA on the same four samples.
%timeit BWTViaSA(test1)
%timeit BWTViaSA(test2)
%timeit BWTViaSA(test3)
%timeit BWTViaSA(test4)

print()

# Time ReverseBWT on the four transformed strings.
# NOTE(review): bwt1..bwt4 are never assigned in this notebook; presumably they
# are created by one of the notebooks loaded with %run — confirm, otherwise
# these lines raise NameError on a fresh kernel.
%timeit ReverseBWT(bwt1)
%timeit ReverseBWT(bwt2)
%timeit ReverseBWT(bwt3)
%timeit ReverseBWT(bwt4)

In [None]:
# Run the external script ./memTest/bwmTest.py once per sample in a separate
# Python process. IPython replaces $testN with the value of the notebook
# variable before handing the command line to the shell.
# NOTE(review): presumably a memory-usage measurement (directory is named
# memTest) — confirm against the script.
!python "./memTest/bwmTest.py" $test1
!python "./memTest/bwmTest.py" $test2
!python "./memTest/bwmTest.py" $test3
!python "./memTest/bwmTest.py" $test4

In [None]:
# Run the external script ./memTest/saTest.py once per sample in a separate
# Python process; $testN is expanded by IPython to the notebook variable's value.
# NOTE(review): presumably the suffix-array counterpart of bwmTest.py — confirm.
!python "./memTest/saTest.py" $test1
!python "./memTest/saTest.py" $test2
!python "./memTest/saTest.py" $test3
!python "./memTest/saTest.py" $test4

In [None]:
# Run the external script ./memTest/reverseTest.py once per transformed string.
# NOTE(review): bwt1..bwt4 are not assigned in this notebook; presumably they
# come from a notebook loaded with %run — confirm.
# NOTE(review): the interpolated value is passed to the shell unquoted. If a
# transformed string contains "$" followed by other characters, a POSIX shell
# would presumably treat that as a variable reference and alter the argument —
# verify what the script actually receives.
!python "./memTest/reverseTest.py" $bwt1
!python "./memTest/reverseTest.py" $bwt2
!python "./memTest/reverseTest.py" $bwt3
!python "./memTest/reverseTest.py" $bwt4

In [None]:
# Run the external script ./memTest/bwmTestFiles.py three times with the
# arguments 1, 2 and 3.
# NOTE(review): the argument presumably selects which data file to process —
# confirm against the script.
!python "./memTest/bwmTestFiles.py" 1
!python "./memTest/bwmTestFiles.py" 2
!python "./memTest/bwmTestFiles.py" 3

In [None]:
# Run the external script ./memTest/saTestFiles.py three times with the
# arguments 1, 2 and 3.
# NOTE(review): the argument presumably selects which data file to process —
# confirm against the script.
!python "./memTest/saTestFiles.py" 1
!python "./memTest/saTestFiles.py" 2
!python "./memTest/saTestFiles.py" 3

In [None]:
"""Benchmarking Data"""

# One entry per genome file used by the benchmark functions:
#   "file"     - path passed to GetWholeGenomeFromFile
#   "patterns" - query strings searched for in that genome
dataSet = [
    {
        "file": "./data/13443_ref_Cara_1.0_chr1c.fa",
        "patterns": ["ATGCATG", "TCTCTCTA", "TTCACTACTCTCA"],
    },
    {
        "file": "./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa",
        "patterns": ["ATGATG", "CTCTCTA", "TCACTACTCTCA"],
    },
    {
        "file": "./data/144034_ref_Pbar_UMD_V03_chrUn.fa",
        "patterns": ["CGCGAG", "GTCGAAT", "GGGCGTCATCGCGCG"],
    },
]

In [None]:
def BenchmarkSearchViaImprovedSort():
    """Run the chunked search for every (file, pattern) pair in ``dataSet``.

    Each genome is read once with ``GetWholeGenomeFromFile`` and then searched
    for each of its patterns with ``BWToverGenomeWithImprovedSort`` using a
    step size of 100000. A header line is printed before every search; the
    value returned by the search is discarded.
    """
    for entry in dataSet:
        genomePath = entry.get("file")
        wholeGenome = GetWholeGenomeFromFile(genomePath)

        for query in entry.get("patterns"):
            print(f"{genomePath} : {query} executed in:")
            BWToverGenomeWithImprovedSort(wholeGenome, query, 100000)
            

In [None]:
def BenchmarkSearchViaImprovedDict():
    """Time ``SearchViaImprovedDict`` for every (file, pattern) pair in ``dataSet``.

    For each entry the genome is read with ``GetWholeGenomeFromFile`` and its
    transform is built once with ``BWTViaSAImprovedDict`` (bracketed by the
    "StartBWT"/"EndBWT" progress lines). Only the subsequent per-pattern search
    is timed; one line with the elapsed seconds is printed per pattern.
    """
    for data in dataSet:
        file = data.get("file")
        genome = GetWholeGenomeFromFile(file)
        patterns = data.get("patterns")
        print("StartBWT")
        bwt = BWTViaSAImprovedDict(genome)
        print("EndBWT")

        for pattern in patterns:
            # perf_counter is the high-resolution clock intended for measuring
            # short durations and is unaffected by system clock adjustments.
            startTime = time.perf_counter()
            SearchViaImprovedDict(genome, bwt, pattern)
            duration = time.perf_counter() - startTime
            print(f"{file} : {pattern} executed in {duration}")

In [None]:
# Run the chunked-search benchmark over every entry of dataSet.
# NOTE(review): BenchmarkSearchViaImprovedSort calls
# BWToverGenomeWithImprovedSort, which this notebook defines in a later cell.
# On a fresh kernel executed top to bottom this presumably raises NameError
# unless the name is also provided by a notebook loaded with %run — confirm.
BenchmarkSearchViaImprovedSort()

In [None]:
# Run the per-pattern search benchmark over every entry of dataSet; the
# function prints its progress lines and timings itself.
BenchmarkSearchViaImprovedDict()

In [None]:
"stara verzija"
# "stara verzija" = "old version": this cell is marked by its author as an
# earlier variant that searches the whole genome in a single call.
def BenchmarkFileSearchViaImprovedSort(file, pattern):
    """Time one ``SearchViaImprovedSort`` call over a whole genome file.

    Args:
        file: Path handed to ``GetWholeGenomeFromFile``.
        pattern: Query string to search for.

    Reading the file is not included in the measurement. The elapsed seconds
    are printed; nothing is returned.
    """
    genome = GetWholeGenomeFromFile(file)

    # perf_counter is the high-resolution clock intended for measuring short
    # durations and is unaffected by system clock adjustments.
    startTime = time.perf_counter()
    SearchViaImprovedSort(genome, pattern)
    duration = time.perf_counter() - startTime

    print(f"{file} : {pattern} executed in {duration}")

In [None]:
"stara verzija"
# "stara verzija" = "old version". Times a single whole-genome search using the
# second dataSet entry and its second pattern.
BenchmarkFileSearchViaImprovedSort(dataSet[1].get("file"), dataSet[1].get("patterns")[1])

In [None]:
#for i in range(1, len(sequences) + 1):
#    with open("./data/" + "sequence" + str(i) + ".txt", "w") as f:
#        f.write(sequences[i - 1])
#        f.close()

In [None]:
# Load the genome of the first dataSet entry; a later cell reads `genome`.
genome = GetWholeGenomeFromFile(dataSet[0].get("file"))
# Display the last five characters. A negative slice also yields the whole
# string for inputs shorter than five characters, whereas computing the start as
# len(genome)-5 goes negative there and returns the wrong tail.
genome[-5:]

In [None]:
"stara verzija"
# "stara verzija" = "old version": an earlier chunk-by-chunk search over the
# `genome` loaded in a previous cell.
# Limitations visible in this loop: only complete `size`-character chunks are
# visited (a trailing partial chunk is skipped), and chunks do not overlap, so
# an occurrence spanning two chunks is not seen.
indexes = []
size = 100000

# perf_counter is the high-resolution clock intended for measuring durations.
startTime = time.perf_counter()
for i in range(0, len(genome)//size):
    string = genome[i*size:i*size+size]

    tempIndex = SearchViaImprovedSort(string, "CGCGAG")

    # Drop the -1 "not found" marker BEFORE adding the chunk offset; otherwise
    # a miss in chunk i > 0 becomes the bogus position i*size-1 and can no
    # longer be recognised as a miss.
    indexes.append([x + i*size for x in tempIndex if x > -1])

endTime = time.perf_counter()
# Keep only the chunks that produced at least one hit.
novo = [chunkHits for chunkHits in indexes if chunkHits]
print(novo)
print(endTime-startTime)

In [None]:
def BWToverGenomeWithImprovedSort(genome, pattern, stepSize):
    """Search ``pattern`` in ``genome`` chunk by chunk and return all positions.

    The genome is cut into consecutive chunks of ``stepSize`` characters. Each
    chunk, with "$" appended, is searched with ``SearchViaImprovedSort``. To
    catch occurrences that straddle two chunks, a window of
    ``len(pattern) - 1`` characters on either side of every chunk boundary is
    searched as well. Chunk-local hits are shifted by the chunk's start so the
    returned values are positions in ``genome``.

    Args:
        genome: Text to search.
        pattern: Query string.
        stepSize: Chunk length in characters; must be positive.

    Returns:
        A flat list of match positions in the order the chunks and boundary
        windows were searched. The -1 "not found" marker returned by
        ``SearchViaImprovedSort`` is never included, and a position found by
        more than one window is reported once.

    Raises:
        ValueError: If ``stepSize`` is not positive.

    Side effects:
        Prints the accumulated search time in seconds (slicing excluded).
    """
    if stepSize <= 0:
        raise ValueError("stepSize must be a positive integer")

    # Number of characters a straddling match can reach on each side of a
    # chunk boundary; computed once instead of on every iteration.
    overlap = len(pattern) - 1
    totalTime = 0
    indexes = []

    def searchSlice(start, end):
        """Search genome[start:end] and return hits as genome positions."""
        nonlocal totalTime
        subString = genome[start:end] + "$"

        startTime = time.perf_counter()
        tempIndexes = SearchViaImprovedSort(subString, pattern)
        totalTime += time.perf_counter() - startTime

        # -1 marks "no occurrence" and must be dropped before the offset is
        # added, otherwise it would turn into a bogus position.
        return [start + x for x in tempIndexes if x > -1]

    indexes.extend(searchSlice(0, stepSize))

    for i in range(1, (len(genome) // stepSize) + 1):
        boundary = i * stepSize

        # Window around the boundary. The start is clamped at 0 because a
        # negative slice start would wrap around to the end of the genome.
        indexes.extend(searchSlice(max(0, boundary - overlap), boundary + overlap))

        # The chunk that begins at this boundary.
        indexes.extend(searchSlice(boundary, boundary + stepSize))

    print(str(totalTime))

    # When the pattern is longer than stepSize + 1, neighbouring boundary
    # windows overlap and can report the same position twice; keep the first
    # occurrence only, preserving order.
    return list(dict.fromkeys(indexes))
    

In [None]:
# Small check of the chunked search with a step of 10 characters.
# NOTE(review): this rebinds `test2`, replacing the text sample assigned in the
# "Test samples" cell; re-running earlier cells afterwards will see this value.
# The following cell reads the rebound `test2`, so rename both together.
test2 = (
    "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTTGATTTGG"
    "GGTTCAAAGCAGTAATTTGGGGTTCAAAGCAGTATCGACAAATAGTAAATCCATTTGTTCATTCAAAGCAGTAATT"
    "TGGGGTTATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT$"
)
pattern = "TTTG"
BWToverGenomeWithImprovedSort(test2, pattern, 10)

In [None]:
# Search the same input in a single call, for comparison with the chunked
# result displayed by the previous cell.
SearchViaImprovedSort(test2,pattern)