In [1]:
import time
import psutil
import os
from Bio import SeqIO

In [2]:
# Execute the three companion notebooks inside this kernel so the names they
# define become available to the cells below.
# NOTE(review): BWTViaBWM, BWTViaSA, ReverseBWT, GetWholeGenomeFromFile,
# SearchViaImprovedSort and the SearchOverGenomeWith* helpers are not defined in
# this notebook; presumably they come from these notebooks -- confirm.
%run BurrowsWheelerTransform.ipynb
%run BurrowsWheelerTransformImproved.ipynb
%run BurrowsWheelerTransformSearchOverGenome.ipynb

In [35]:
"""Test samples"""

# Three short text samples; each one ends with the "$" character.
test1 = "Tomorrow_and_tomorrow_and_tomorrow$"
test2 = "It_was_the_best_of_times_it_was_the_worst_of_times$"
test3 = "in_the_jingle_jangle_morning_Ill_come_following_you$"

# Longer DNA-alphabet sample. Adjacent string literals are joined by the parser,
# so the value can be wrapped across lines without backslash continuations
# inside the literal.
test4 = (
    "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTTGATTTGG"
    "GGTTCAAAGCAGTAATTTGGGGTTCAAAGCAGTATCGACAAATAGTAAATCCATTTGTTCATTCAAAGCAGTAATT"
    "TGGGGTTATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT$"
)

In [None]:
""" Performance measurement and benchmarking """

# Time BWTViaBWM on each of the four samples.
%timeit BWTViaBWM(test1)
%timeit BWTViaBWM(test2)
%timeit BWTViaBWM(test3)
%timeit BWTViaBWM(test4)

# Emit an empty line so the groups of timings are separated in the output.
print()

# Time BWTViaSA on the same four samples.
%timeit BWTViaSA(test1)
%timeit BWTViaSA(test2)
%timeit BWTViaSA(test3)
%timeit BWTViaSA(test4)

print()

# Time ReverseBWT.
# NOTE(review): bwt1..bwt4 are never assigned in this notebook. Presumably they
# hold the transforms of test1..test4 and are created by one of the %run
# notebooks -- confirm, otherwise these lines raise NameError on a fresh kernel.
%timeit ReverseBWT(bwt1)
%timeit ReverseBWT(bwt2)
%timeit ReverseBWT(bwt3)
%timeit ReverseBWT(bwt4)

In [None]:
# Run ./memTest/bwmTest.py once per sample, each in a separate Python process.
# IPython substitutes $test1..$test4 with the values of the notebook variables
# before the command line is handed to the shell.
# NOTE(review): presumably the script reports memory use of the BWM-based
# transform -- confirm against the script.
!python "./memTest/bwmTest.py" $test1
!python "./memTest/bwmTest.py" $test2
!python "./memTest/bwmTest.py" $test3
!python "./memTest/bwmTest.py" $test4

In [None]:
# Run ./memTest/saTest.py once per sample, each in a separate Python process.
# IPython substitutes $test1..$test4 with the values of the notebook variables
# before the command line is handed to the shell.
# NOTE(review): presumably the script reports memory use of the suffix-array
# based transform -- confirm against the script.
!python "./memTest/saTest.py" $test1
!python "./memTest/saTest.py" $test2
!python "./memTest/saTest.py" $test3
!python "./memTest/saTest.py" $test4

In [None]:
# Run ./memTest/reverseTest.py once per value of bwt1..bwt4.
# NOTE(review): bwt1..bwt4 are never assigned in this notebook; presumably they
# are created by one of the %run notebooks -- confirm.
# NOTE(review): the substituted values reach the shell unquoted. If a value
# contains "$" followed by other characters, a POSIX shell would try to expand
# it as a shell variable -- verify the script receives the intended string.
!python "./memTest/reverseTest.py" $bwt1
!python "./memTest/reverseTest.py" $bwt2
!python "./memTest/reverseTest.py" $bwt3
!python "./memTest/reverseTest.py" $bwt4

In [None]:
# Run ./memTest/bwmTestFiles.py three times with the arguments 1, 2 and 3.
# NOTE(review): the argument presumably selects one of three input files --
# confirm against the script.
!python "./memTest/bwmTestFiles.py" 1
!python "./memTest/bwmTestFiles.py" 2
!python "./memTest/bwmTestFiles.py" 3

In [None]:
# Run ./memTest/saTestFiles.py three times with the arguments 1, 2 and 3.
# NOTE(review): the argument presumably selects one of three input files --
# confirm against the script.
!python "./memTest/saTestFiles.py" 1
!python "./memTest/saTestFiles.py" 2
!python "./memTest/saTestFiles.py" 3

In [3]:
"""Benchmarking Data"""

# One entry per genome file: "file" is the path handed to GetWholeGenomeFromFile
# by the benchmark functions, "patterns" lists the strings searched for in it.
dataSet = [
    {
        "file": "./data/13443_ref_Cara_1.0_chr1c.fa",
        "patterns": ["ATGCATG", "TCTCTCTA", "TTCACTACTCTCA"],
    },
    {
        "file": "./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa",
        "patterns": ["ATGATG", "CTCTCTA", "TCACTACTCTCA"],
    },
    {
        "file": "./data/144034_ref_Pbar_UMD_V03_chrUn.fa",
        "patterns": ["CGCGAG", "GTCGAAT", "GGGCGTCATCGCGCG"],
    },
]

In [4]:
def BenchmarkSearchOverGenomeWithImprovedSort(stepSize):
    """Run SearchOverGenomeWithImprovedSort for every data set / pattern pair.

    Each genome listed in ``dataSet`` is loaded once with
    ``GetWholeGenomeFromFile`` and then searched for each of its patterns.
    One line is printed per search, reporting element ``[1]`` of the search
    result as the duration (presumably seconds -- confirm against the helper).

    Args:
        stepSize: Forwarded unchanged as the third argument of
            ``SearchOverGenomeWithImprovedSort``.
    """
    for entry in dataSet:
        file = entry.get("file")
        wholeGenome = GetWholeGenomeFromFile(file)

        for pattern in entry.get("patterns"):
            searchResult = SearchOverGenomeWithImprovedSort(wholeGenome, pattern, stepSize)
            duration = searchResult[1]
            print(f"{file} : {pattern} executed in: {duration}")

In [5]:
def BenchmarkSearchOverGenomeWithImprovedDict(stepSize):
    """Run SearchOverGenomeWithImprovedDict for every data set / pattern pair.

    Each genome listed in ``dataSet`` is loaded once with
    ``GetWholeGenomeFromFile`` and then searched for each of its patterns.
    One line is printed per search, reporting element ``[1]`` of the search
    result as the duration (presumably seconds -- confirm against the helper).

    Args:
        stepSize: Forwarded unchanged as the third argument of
            ``SearchOverGenomeWithImprovedDict``.
    """
    for entry in dataSet:
        file = entry.get("file")
        wholeGenome = GetWholeGenomeFromFile(file)

        for pattern in entry.get("patterns"):
            searchResult = SearchOverGenomeWithImprovedDict(wholeGenome, pattern, stepSize)
            duration = searchResult[1]
            print(f"{file} : {pattern} executed in: {duration}")

In [6]:
def BenchmarkSearchOverGenomeWithOldSA(stepSize):
    """Run SearchOverGenomeWithOldSA for every data set / pattern pair.

    Each genome listed in ``dataSet`` is loaded once with
    ``GetWholeGenomeFromFile`` and then searched for each of its patterns.
    One line is printed per search, reporting element ``[1]`` of the search
    result as the duration (presumably seconds -- confirm against the helper).

    Args:
        stepSize: Forwarded unchanged as the third argument of
            ``SearchOverGenomeWithOldSA``.
    """
    for entry in dataSet:
        file = entry.get("file")
        wholeGenome = GetWholeGenomeFromFile(file)

        for pattern in entry.get("patterns"):
            searchResult = SearchOverGenomeWithOldSA(wholeGenome, pattern, stepSize)
            duration = searchResult[1]
            print(f"{file} : {pattern} executed in: {duration}")

In [7]:
def BenchmarkSearchOverGenomeWithBWM(stepSize):
    """Run SearchOverGenomeWithBWM for every data set / pattern pair.

    Each genome listed in ``dataSet`` is loaded once with
    ``GetWholeGenomeFromFile`` and then searched for each of its patterns.
    One line is printed per search, reporting element ``[1]`` of the search
    result as the duration (presumably seconds -- confirm against the helper).

    Args:
        stepSize: Forwarded unchanged as the third argument of
            ``SearchOverGenomeWithBWM``.
    """
    for entry in dataSet:
        file = entry.get("file")
        wholeGenome = GetWholeGenomeFromFile(file)

        for pattern in entry.get("patterns"):
            searchResult = SearchOverGenomeWithBWM(wholeGenome, pattern, stepSize)
            duration = searchResult[1]
            print(f"{file} : {pattern} executed in: {duration}")

In [8]:
def BenchmarkSearchViaImprovedSortOverWholeFile(file, pattern):
    """Time a single SearchViaImprovedSort call over a whole genome file.

    The genome is loaded with ``GetWholeGenomeFromFile`` before the clock
    starts, so loading is not part of the measurement. The elapsed time in
    seconds is printed; nothing is returned.

    Args:
        file: Path handed to ``GetWholeGenomeFromFile``; also used as the
            label in the printed line.
        pattern: Pattern handed to ``SearchViaImprovedSort``.
    """
    genome = GetWholeGenomeFromFile(file)

    # perf_counter() is the clock meant for measuring intervals: it never goes
    # backwards and uses the highest available resolution, whereas time.time()
    # follows the system clock and can jump while the search is running.
    startTime = time.perf_counter()
    SearchViaImprovedSort(genome, pattern)
    duration = time.perf_counter() - startTime

    print(f"{file} : {pattern} executed in: {duration}")

In [25]:
# Run the improved-sort benchmark over every entry of dataSet with stepSize=100000.
BenchmarkSearchOverGenomeWithImprovedSort(100000)

./data/13443_ref_Cara_1.0_chr1c.fa : ATGCATG executed in: 72.19099640846252
./data/13443_ref_Cara_1.0_chr1c.fa : TCTCTCTA executed in: 72.15397882461548
./data/13443_ref_Cara_1.0_chr1c.fa : TTCACTACTCTCA executed in: 72.28167748451233
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : ATGATG executed in: 197.65716290473938
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : CTCTCTA executed in: 201.04176306724548
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : TCACTACTCTCA executed in: 196.55902981758118
./data/144034_ref_Pbar_UMD_V03_chrUn.fa : CGCGAG executed in: 331.7564721107483
./data/144034_ref_Pbar_UMD_V03_chrUn.fa : GTCGAAT executed in: 334.61165046691895
./data/144034_ref_Pbar_UMD_V03_chrUn.fa : GGGCGTCATCGCGCG executed in: 335.06171774864197


In [11]:
def CompareBenchmarks(stepSize):
    """Print a delimited benchmark report for a single step size.

    Only the improved-sort benchmark is currently enabled; the other variants
    are kept below as comments so they can be switched back on.

    Args:
        stepSize: Shown in the report header and forwarded to the benchmark.
    """
    separator = "==================================================="

    print("Step size is: ", stepSize, sep="")
    print(separator)
    print("\nBenchmarkSearchOverGenomeWithImprovedSort\n")
    BenchmarkSearchOverGenomeWithImprovedSort(stepSize)
    print(separator)

    # Disabled variants -- uncomment a group to include it in the report:
    # print("\nBenchmarkSearchOverGenomeWithImprovedDict\n")
    # BenchmarkSearchOverGenomeWithImprovedDict(stepSize)
    # print(separator)
    # print("\nBenchmarkSearchOverGenomeWithOldSA\n")
    # BenchmarkSearchOverGenomeWithOldSA(stepSize)
    # print(separator)
    # print("\nBenchmarkSearchOverGenomeWithBWM\n")
    # BenchmarkSearchOverGenomeWithBWM(stepSize)
    # print(separator)

In [12]:
# Produce the benchmark report for several step sizes. The smaller step sizes
# are commented out and therefore skipped.
# NOTE(review): the recorded output of this cell has no "=====" separator lines,
# although CompareBenchmarks prints them; it appears to come from an earlier
# version of that function -- re-run to refresh.
#CompareBenchmarks(1000)
#CompareBenchmarks(5000)
#CompareBenchmarks(10000)
#CompareBenchmarks(50000)
CompareBenchmarks(100000)
CompareBenchmarks(500000)
CompareBenchmarks(1000000)

Step size is: 100000

BenchmarkSearchOverGenomeWithImprovedSort

./data/13443_ref_Cara_1.0_chr1c.fa : ATGCATG executed in: 73.57338404655457
./data/13443_ref_Cara_1.0_chr1c.fa : TCTCTCTA executed in: 78.20743083953857
./data/13443_ref_Cara_1.0_chr1c.fa : TTCACTACTCTCA executed in: 78.1246395111084
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : ATGATG executed in: 213.05156016349792
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : CTCTCTA executed in: 214.63302779197693
./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa : TCACTACTCTCA executed in: 216.29544401168823
./data/144034_ref_Pbar_UMD_V03_chrUn.fa : CGCGAG executed in: 365.4286677837372
./data/144034_ref_Pbar_UMD_V03_chrUn.fa : GTCGAAT executed in: 371.5078568458557
./data/144034_ref_Pbar_UMD_V03_chrUn.fa : GGGCGTCATCGCGCG executed in: 348.6569027900696
Step size is: 500000

BenchmarkSearchOverGenomeWithImprovedSort

./data/13443_ref_Cara_1.0_chr1c.fa : ATGCATG executed in: 48.143019676208496
./data/13443_ref_Cara_1.0_chr1c.fa : TCTCTCTA executed i

In [16]:
# Time the whole-file improved-sort search for every pattern of the first entry
# in dataSet. BenchmarkSearchViaImprovedSortOverWholeFile loads the genome
# itself, so the file is read again for each pattern.
file = dataSet[0].get("file")
patterns = dataSet[0].get("patterns")
for pattern in patterns:
    BenchmarkSearchViaImprovedSortOverWholeFile(file, pattern)

./data/13443_ref_Cara_1.0_chr1c.fa : ATGCATG executed in: 51.47582387924194
./data/13443_ref_Cara_1.0_chr1c.fa : TCTCTCTA executed in: 56.944053411483765
./data/13443_ref_Cara_1.0_chr1c.fa : TTCACTACTCTCA executed in: 57.87396454811096


In [26]:
# Run ./memTest/searchOverGenomeFirstFile.py for the pattern "ATGCATG" with a
# second argument growing from 10000 to 10000000; per the recorded output each
# run prints a "Used this much memory" line.
# NOTE(review): the second argument is presumably the step size -- confirm
# against the script.
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 10000
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 50000
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 100000
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 500000
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 1000000
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 5000000
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 10000000

Used this much memory: 87.0625 Mb




Used this much memory: 85.96484375 Mb




Used this much memory: 85.171875 Mb




Used this much memory: 84.890625 Mb




Used this much memory: 85.875 Mb




Used this much memory: 85.68359375 Mb




Used this much memory: 85.28125 Mb




In [25]:
# Run ./memTest/searchOverGenomeFirstFile.py for three different patterns with
# the second argument fixed at 100000; per the recorded output each run prints
# a "Used this much memory" line.
# NOTE(review): the second argument is presumably the step size -- confirm
# against the script.
!python "./memTest/searchOverGenomeFirstFile.py" "ATGCATG" 100000
!python "./memTest/searchOverGenomeFirstFile.py" "TCTCTCTA" 100000
!python "./memTest/searchOverGenomeFirstFile.py" "TTCACTACTCTCA" 100000

Used this much memory: 85.57421875 Mb




Used this much memory: 85.26953125 Mb




Used this much memory: 84.9921875 Mb




