In [None]:
import os
import subprocess
import time

import psutil

In [None]:
%run BurrowsWheelerTransform.ipynb
%run BurrowsWheelerTransformImproved.ipynb

In [None]:
"""Test samples"""

# Short natural-language samples; each one ends with the "$" terminator.
test1 = "Tomorrow_and_tomorrow_and_tomorrow$"
test2 = "It_was_the_best_of_times_it_was_the_worst_of_times$"
test3 = "in_the_jingle_jangle_morning_Ill_come_following_you$"

# Longer DNA-alphabet sample, split across adjacent literals for readability.
test4 = (
    "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTTGATTTGG"
    "GGTTCAAAGCAGTAATTTGGGGTTCAAAGCAGTATCGACAAATAGTAAATCCATTTGTTCATTCAAAGCAGTAATT"
    "TGGGGTTATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT$"
)

In [None]:
""" Performance measurement and benchmarking """

%timeit BWTViaBWM(test1)
%timeit BWTViaBWM(test2)
%timeit BWTViaBWM(test3)
%timeit BWTViaBWM(test4)

print()

%timeit BWTViaSA(test1)
%timeit BWTViaSA(test2)
%timeit BWTViaSA(test3)
%timeit BWTViaSA(test4)

print()

%timeit ReverseBWT(bwt1)
%timeit ReverseBWT(bwt2)
%timeit ReverseBWT(bwt3)
%timeit ReverseBWT(bwt4)

In [None]:
!python "./memTest/bwmTest.py" $test1
!python "./memTest/bwmTest.py" $test2
!python "./memTest/bwmTest.py" $test3
!python "./memTest/bwmTest.py" $test4

In [None]:
!python "./memTest/saTest.py" $test1
!python "./memTest/saTest.py" $test2
!python "./memTest/saTest.py" $test3
!python "./memTest/saTest.py" $test4

In [None]:
# Run ./memTest/reverseTest.py once per BWT string.
#
# The strings are handed to the child process as argv entries rather than through
# `!`: a transform of "$"-terminated text contains a "$" that can be followed by
# letters or underscores, and a POSIX shell would expand that as a variable
# reference and silently corrupt the argument.
# NOTE(review): assumes bwt1..bwt4 are plain strings - confirm where they are defined.
for bwt in (bwt1, bwt2, bwt3, bwt4):
    completed = subprocess.run(
        ["python", "./memTest/reverseTest.py", bwt],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,   # show errors in the cell output, like `!` does
        universal_newlines=True,    # decode output as text
    )
    print(completed.stdout, end="")

In [None]:
!python "./memTest/bwmTestFiles.py" 1
!python "./memTest/bwmTestFiles.py" 2
!python "./memTest/bwmTestFiles.py" 3

In [None]:
!python "./memTest/saTestFiles.py" 1
!python "./memTest/saTestFiles.py" 2
!python "./memTest/saTestFiles.py" 3

In [None]:
"""Benchmarking Data"""

# One entry per genome file: the path under "file" and the search patterns
# to benchmark against it under "patterns".
dataSet = [
    dict(
        file="./data/13443_ref_Cara_1.0_chr1c.fa",
        patterns=["ATGCATG", "TCTCTCTA", "TTCACTACTCTCA"],
    ),
    dict(
        file="./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa",
        patterns=["ATGATG", "CTCTCTA", "TCACTACTCTCA"],
    ),
    dict(
        file="./data/144034_ref_Pbar_UMD_V03_chrUn.fa",
        patterns=["CGCGAG", "GTCGAAT", "GGGCGTCATCGCGCG"],
    ),
]

In [None]:
def BenchmarkSearchViaImprovedSort():
    """Time SearchViaImprovedSort for every file/pattern pair in dataSet.

    Each genome is loaded once with GetWholeGenomeFromFile; only the search call
    itself is timed. One line is printed per pattern with the elapsed seconds.
    """
    for data in dataSet:
        file = data["file"]
        genome = GetWholeGenomeFromFile(file)

        for pattern in data["patterns"]:
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring intervals; time.time() follows the system wall clock.
            startTime = time.perf_counter()
            SearchViaImprovedSort(genome, pattern)
            duration = time.perf_counter() - startTime
            print(f"{file} : {pattern} executed in {duration}")

In [None]:
def BenchmarkSearchViaImprovedDict():
    """Time SearchViaImprovedDict for every file/pattern pair in dataSet.

    For each file the genome is loaded and its transform is built once with
    BWTViaSAImprovedDict (bracketed by "StartBWT"/"EndBWT" progress prints);
    that construction is not included in the timings. Only each search call is
    timed, and one line is printed per pattern with the elapsed seconds.
    """
    for data in dataSet:
        file = data["file"]
        genome = GetWholeGenomeFromFile(file)

        print("StartBWT")
        bwt = BWTViaSAImprovedDict(genome)
        print("EndBWT")

        for pattern in data["patterns"]:
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring intervals; time.time() follows the system wall clock.
            startTime = time.perf_counter()
            SearchViaImprovedDict(genome, bwt, pattern)
            duration = time.perf_counter() - startTime
            print(f"{file} : {pattern} executed in {duration}")

In [None]:
# Run the sort-based search benchmark over every entry of dataSet.
BenchmarkSearchViaImprovedSort()

In [None]:
# Run the dict-based search benchmark over every entry of dataSet.
BenchmarkSearchViaImprovedDict()

In [None]:
def BenchmarkFileSearchViaImprovedSort(file, pattern):
    """Time one SearchViaImprovedSort call for a single file and pattern.

    Args:
        file: Path handed to GetWholeGenomeFromFile to load the genome.
        pattern: Pattern to search for in the loaded genome.

    Loading the genome is not part of the measurement. Prints one line with
    the file, the pattern and the elapsed seconds.
    """
    genome = GetWholeGenomeFromFile(file)

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the system wall clock.
    startTime = time.perf_counter()
    SearchViaImprovedSort(genome, pattern)
    duration = time.perf_counter() - startTime

    print(f"{file} : {pattern} executed in {duration}")

In [None]:
# Time a single pair: the second dataSet entry's file with its second pattern.
BenchmarkFileSearchViaImprovedSort(dataSet[1].get("file"), dataSet[1].get("patterns")[1])

In [None]:
#for i in range(1, len(sequences) + 1):
#    with open("./data/" + "sequence" + str(i) + ".txt", "w") as f:
#        f.write(sequences[i - 1])
#        f.close()

In [None]:
# Load the first benchmark genome and display its last five characters.
genome = GetWholeGenomeFromFile(dataSet[0].get("file"))
# A negative start is clamped by slicing, so this also returns the whole
# sequence when it is shorter than five characters.
genome[-5:]

In [None]:
"""Pokreni ovaj blok iznad sa GetWholeGenome i slobodno stavi dataSet[2] za ovaj treci fajl i onda on uzima po step karaktera
i radi bwt i onda nikad memoriju ne prepuni i onda nikad ne mora da pristupa disku sto je mnogo sporo al ovo nije konacno
fali samo jedna stvar a to je sto ti uzmes jedan komad od size karaktera i nadjes pozicije, onda uzmes drugi komad od size
karaktera i nadjes pozicije ali ako je pattern pola na prvom komadu, pola na drugom komadu to nece da nadje tako da ovo
treba da se nadogradi tako sto bi uzeo poslednjih len(pattern) karaktera prvog komada i prvih len(pattern) karaktera drugog
komada, onda konkatenacija pa opet bwt
ispod ovog cell-a sam stavio cist search da bi video dal su to ti indexi i onda uradis ctrl+f i ako nadje 2 to je to al 
videces da za neke nadje 1 to su ti sto su na prelazu"""
# English summary of the author's note above: run the cell that loads `genome`
# first (any dataSet entry, e.g. dataSet[2], may be used there). The genome is then
# searched in pieces of `size` characters so that each search works on a small
# input; the note describes matches lying across two pieces as still being missed,
# and points to the full-genome search in the next cell for cross-checking.
#
# This version closes that gap: every window is extended by len(pattern) - 1
# characters past its nominal end, so a match that starts in one piece and ends in
# the next is seen in full by the window where it starts (and by no other window).
# The final, shorter piece of the genome is searched as well.
pattern = "CGCGAG"
indexes = []
size = 100000
overlap = len(pattern) - 1

startTime = time.perf_counter()
for start in range(0, len(genome), size):
    window = genome[start:start + size + overlap]

    if len(window) < len(pattern):
        # Remaining tail is too short to contain the pattern at all.
        indexes.append([])
        continue

    tempIndex = SearchViaImprovedSort(window, pattern)

    # Drop negative results BEFORE shifting to genome coordinates; once offset,
    # a "not found" marker would look like a real position.
    # NOTE(review): assumes "not found" is reported as -1, as the original
    # `!= [-1]` filter implied - confirm against SearchViaImprovedSort.
    indexes.append([hit + start for hit in tempIndex if hit >= 0])

endTime = time.perf_counter()
# Keep only the windows that produced at least one match.
novo = [hits for hits in indexes if hits]
print(novo)
print(endTime - startTime)

In [None]:
# Search the whole genome in a single call, for comparison with the chunked search above.
SearchViaImprovedSort(genome, "CGCGAG")