In [1]:
import subprocess
import sys
import resource
import time
import random

In [2]:
rusage_denom = 1024.
if sys.platform == "darwin":
    rusage_denom = rusage_denom * rusage_denom

In [3]:
def runglsearch(executable, seq1_file, seq2_file):
    '''
    use POpen to run process in background, get its pid for checking for memory
    poll frequently for memory usage --> can take its average for final result
    get final output of process for "Total scan time:"
    '''
    command = executable + ' -z -1 -f 0 -g -1 -r +1/-1 ' + seq1_file + ' ' + seq2_file
    args = command.split()
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    proc_pid = proc.pid
    mem_running_avg = 0.0
    num_readings = 0
    while proc.poll() == None:
        total = (mem_running_avg * num_readings) + (resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / rusage_denom)
        num_readings += 1
        mem_running_avg = total / num_readings
    return (mem_running_avg, proc.communicate())

In [4]:
def glsearchCreateInputFilesAndRun(seq1, seq2):
    glsearch_exec = "/Users/pooja/Desktop/fasta-36.3.8g/bin/glsearch36"
    glsearch_file_seq1 = "/Users/pooja/Desktop/fasta-36.3.8g/test/input_seq1"
    glsearch_file_seq2 = "/Users/pooja/Desktop/fasta-36.3.8g/test/input_seq2"
    # create input file 1 for glsearch
    with open(glsearch_file_seq1, "w") as in1:
        in1.write(">input_seq1\n")
        in1.write(seq1)
    # create input file 2 for glsearch
    with open(glsearch_file_seq2, "w") as in2:
        in2.write(">input_seq2\n")
        in2.write(seq2)
    start_time = time.time()
    print(start_time)
    mem_avg, output = runglsearch(glsearch_exec, glsearch_file_seq1, glsearch_file_seq2)
    time_str = output[0][output[0].find(b'Total Scan time:'):]
    print("for length: " + str(len(seq1)))
    print('average mem usage (MB): ' + str(mem_avg))
    print('time taken: ' + str(float(time_str.split()[3])))
    time_taken = time.time() - start_time
    print('time measured: ' + str(time_taken))

In [5]:
def shortenSequence(seq1, seq2, to_length):
    if len(seq1) <= to_length or len(seq2) <= to_length:
        return (seq1, seq2)
    seq1_perm = list(seq1)
    seq2_perm = list(seq2)
    random.shuffle(seq1_perm)
    random.shuffle(seq2_perm)
    seq1_perm = ''.join(seq1_perm[:to_length])
    seq2_perm = ''.join(seq2_perm[:to_length])
    return (seq1_perm, seq2_perm)

In [None]:
with open("input_sequences.txt", "r") as fp:
    seq1 = ""
    seq2 = ""
    while True:
        # expected file format: line1: seq1, line2: seq2, line3: blank, and this continues for as many pairs of
        # sequences as need to be aligned
        line = fp.readline()
        if not line:
            break
        seq1 = line.strip()
        seq2 = fp.readline().strip()
        fp.readline()
        if len(seq1) >= 2000:
            for length in range(2000, len(seq1), 1000):
                short_seq1, short_seq2 = shortenSequence(seq1, seq2, length)
                glsearchCreateInputFilesAndRun(short_seq1, short_seq2)
        else:
            glsearchCreateInputFilesAndRun(seq1, seq2)

1544988371.2984462
for length: 100
average mem usage (MB): 0.78515625
time taken: 0.0
time measured: 0.03107285499572754
1544988371.3308392
for length: 500
average mem usage (MB): 6.72265625
time taken: 0.0
time measured: 0.03380775451660156
1544988371.366832
for length: 1000
average mem usage (MB): 9.546875
time taken: 0.01
time measured: 0.05371880531311035
1544988371.431369
for length: 2000
average mem usage (MB): 9.546875
time taken: 0.01
time measured: 0.15164995193481445
1544988371.5946329
for length: 3000
average mem usage (MB): 13.78125
time taken: 0.02
time measured: 0.3058791160583496
1544988371.912939
for length: 4000
average mem usage (MB): 18.9375
time taken: 0.04
time measured: 0.5473599433898926
1544988372.480695
for length: 2000
average mem usage (MB): 24.14453125
time taken: 0.01
time measured: 0.15391302108764648
1544988372.664143
for length: 3000
average mem usage (MB): 24.14453125
time taken: 0.03
time measured: 0.3813819885253906
1544988373.065696
for length: 4000


References:  
- https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_percent
- http://www.people.virginia.edu/~wrp/fasta/CURRENT/
- http://fa.bianp.net/blog/2013/different-ways-to-get-memory-consumption-or-lessons-learned-from-memory_profiler/
- https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode
- https://docs.python.org/2/library/resource.html#resource.getrusage
- https://eli.thegreenplace.net/2017/interacting-with-a-long-running-child-process-in-python/