## MUSIAL2.0 Benchmark

#### `Author: Simon Hackl`
#### `Project: The OMPeome of Treponema pallidum`
#### `Contact: simon.hackl@uni-tuebingen.de`
#### `Date: 15.02.2022`

This _Python_ Notebook was used to generate benchmark data and run `MUSIAL2.0` on it with a concomitant runtime and maximum resident set size measurement.

_Note: In order to run this notebook `MUSIAL2.0` (https://github.com/Integrative-Transcriptomics/MUSIAL) and `AmberTools TLeap` (https://ambermd.org/AmberTools.php) have to be installed locally. The paths to the two tools can be set in the cell below. The notebook is intended to be run with Windows OS and the WSL enabled._

### 1. Benchmark Data Generation

In [None]:
import random
import string
import os
import shutil
import subprocess

import numpy as np

# Path to the AmberTools tleap executable. It is handed to `wsl` when .pdb files are generated,
# so it has to be a path that is valid inside the WSL.
tleapPath = "/mnt/c/users/siha/desktop/tools/AmberTools21/amber20/bin/tleap"
# Path to the MUSIAL .jar file that is handed to `java -jar` in the benchmark runs.
MUSIALpath = "./MUSIAL-v2.0.jar"

In [None]:
def generateRandomNucleotideSequence( sequenceLength: int ) :
    ''' Generates a random string over the alphabet { A, C, G, T } of the specified length.
    In addition global variant positions are assigned with the following probabilities:
    Position substitution probability: 1.03*10**-2
    Position indel probability: 5.27*10**-4
    
    Indel positions drawn within the last 20 positions of the sequence are not registered to avoid
    out of index variants (with high probability); the base itself is still part of the sequence.
    
    :param sequenceLength: The length of the sequence to be generated.
    :return: Returns a random nucleotide sequence of exactly the specified length and two dictionaries
             mapping 1-based substitution and indel positions to the reference base at that position.
    '''
    nucleotideBases = [ 'A', 'C', 'G', 'T' ]
    indelProbability = 5.27 * ( 10 ** -4 )
    substitutionProbability = 1.03 * ( 10 ** -2 )
    sequenceBases = [ ]
    substitutionPositions = { }
    indelPositions = { }
    for i in range( 0, sequenceLength ) :
        # Two random draws per position (base, variant type); the order is kept fixed for seeded runs.
        referenceBase = nucleotideBases[ random.randint( 0, len( nucleotideBases ) - 1 ) ]
        rand = random.random( )
        if rand <= indelProbability :
            # Declare position as indel position unless it lies within the last 20 positions.
            if i + 21 <= sequenceLength :
                indelPositions[ i + 1 ] = referenceBase
        elif rand <= ( indelProbability + substitutionProbability ) :
            # Declare position as substitution position.
            substitutionPositions[ i + 1 ] = referenceBase
        # Always append the base, otherwise the sequence gets shorter than requested and all
        # subsequently registered positions no longer match the sequence content.
        sequenceBases.append( referenceBase )
    return "".join( sequenceBases ), substitutionPositions, indelPositions

def generateRandomName( randomNames: set = None, prefix: str = "", length: int = 6 ) :
    ''' Generates a random name intended to be used as a sequence or sample name.
        To avoid duplicates a set of already used names can be specified.
        
    The name consists of the prefix followed by `length` random characters (upper case letters and digits).
    Note that the function does not terminate if all possible names for the prefix are already in use.
        
    :param randomNames: Set of already used names; the generated name is added to it.
                        If not passed no duplication check is conducted.
    :param prefix: A prefix to add to the random name.
    :param length: The number of random characters to append to the prefix.
    :return: String of length len( prefix ) + length representing a random name.
    '''
    # A fresh set per call; a mutable default would silently be shared across all calls.
    if randomNames is None :
        randomNames = set( )
    alphabet = string.ascii_uppercase + string.digits
    def getRandomName( ) :
        # The prefix is part of every candidate, including those drawn after a collision.
        return prefix + ''.join( random.choice( alphabet ) for _ in range( length ) )
    randomName = getRandomName( )
    while randomName in randomNames :
        randomName = getRandomName( )
    randomNames.add( randomName )
    return randomName

def generateRandomFasta( sequence: str, sequenceId: str, outdir: str ) :
    ''' Stores a sequence as <outdir>/<sequenceId>.fasta.
    
    The file consists of the header line ">sequenceId" followed by the sequence wrapped
    to 80 characters per line; no newline is written after the last sequence line.
    
    :param sequence: The nucleotide sequence to be used.
    :param sequenceId: The nucleotide sequences identifier to be used in the fasta header and file name.
    :param outdir: The output directory to which the .fasta file is written to.
    '''
    lineWidth = 80
    targetDir = outdir if outdir.endswith( '/' ) else outdir + '/'
    fastaLines = [ ]
    for offset in range( 0, len( sequence ), lineWidth ) :
        fastaLines.append( sequence[ offset : offset + lineWidth ] )
    with open( targetDir + sequenceId + ".fasta", "w" ) as fastaFile :
        fastaFile.write( ">" + sequenceId + "\n" + "\n".join( fastaLines ) )
        
def generateRandomVcfForSequence( sequence: str, substitutionPositions: dict, indelPositions: dict, sequenceId: str, sampleId: str, outdir: str ) :
    ''' Writes a random .vcf file yielding variants at the specified substitutionPositions and
    indelPositions dictionaries and sequenceId to the specified outdir.
    
    The specified dictionaries have to map (1-based) reference sequence positions to reference sequence
    nucleotide contents at these positions.
    
    For each substitution and indel position the .vcf file will have a probability of 0.5 to obtain a variant.
    In case of substitutions a non-reference base nucleotide with uniform probability 1/3 is chosen.
    In case of indels an additional decision for deletion or insertion, each with a probability of 0.5, is made
    and a length k indel is chosen with probability 0.65**(k-1)*0.35 (geometric distribution).
    For insertions an arbitrary nucleotide sequence of the chosen length is inserted.
    Deletions are truncated at the end of the reference sequence; a deletion for which no base
    is left to delete is not written.
        
    The generated .vcf file will be gziped (bgzip) and indexed using tabix; both tools are called
    via `wsl` and their exit status is not evaluated.
        
    :param sequence: The reference sequence.
    :param substitutionPositions: Dictionary mapping integers to single-letter strings.
                                  Represents positions at which a substitution is allowed wrt. reference.
    :param indelPositions: Dictionary mapping integers to single-letter strings.
                           Represents positions at which an indel is allowed wrt. reference.
    :param sequenceId: The reference sequence identifier.
    :param sampleId: The sample identifier.
    :param outdir: The directory to which the .vcf file should be written to.
    '''
    nucleotideBases = [ 'A', 'C', 'G', 'T' ]
    vcfHeader = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleId + "\n"
    # QUAL, FILTER, INFO, FORMAT and sample columns are identical for every generated record.
    recordSuffix = "\t100\t.\tAC=2;AF=1.00;AN=2;DP=30;MQ=100.00;MQ0=0\tGT:AD:DP:GQ\t1/1:0,30:30:100\n"
    if not outdir.endswith( '/' ) :
        outdir += '/'
    vcfPath = outdir + sampleId + ".vcf"
    sortedVariantPositions = sorted( list( substitutionPositions ) + list( indelPositions ) )
    with open( vcfPath, "w" ) as sampleFile :
        # Write Header
        sampleFile.write( vcfHeader )
        for pos in sortedVariantPositions :
            # Include variant with probability 0.5.
            if random.random( ) < 0.5 :
                continue
            if pos in substitutionPositions :
                # Assign substitution.
                referenceBase = substitutionPositions[ pos ]
                possibleAlternatives = list( nucleotideBases )
                possibleAlternatives.remove( referenceBase )
                alternativeBase = possibleAlternatives[ random.randint( 0, len( possibleAlternatives ) - 1 ) ]
            else :
                # Assign indel; the order of the random draws is kept fixed for seeded runs.
                rand = random.random( )
                # Simulate indel length.
                indelLength = np.random.geometric( 0.35 )
                if rand >= 0.5 :
                    # Assign deletion with probability 0.5.
                    # Slicing truncates the deleted stretch at the sequence end instead of raising an IndexError.
                    referenceBase = sequence[ pos - 1 : pos + indelLength ]
                    alternativeBase = indelPositions[ pos ]
                    if len( referenceBase ) < 2 :
                        # Nothing left to delete behind the anchor base, i.e. no variant to report.
                        continue
                else :
                    # Assign insertion with probability 0.5.
                    referenceBase = indelPositions[ pos ]
                    alternativeBase = referenceBase + "".join(
                        nucleotideBases[ random.randint( 0, 3 ) ] for _ in range( indelLength )
                    )
            sampleFile.write( sequenceId + "\t" + str( pos ) + "\t.\t" + referenceBase + "\t" + alternativeBase + recordSuffix )
    # Gzip .vcf file.
    subprocess.run(
        [ 'wsl', 'bgzip', vcfPath ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    # Generate index for .vcf file.
    subprocess.run(
        [ 'wsl', 'tabix', vcfPath + ".gz" ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    
def generateRandomGenesForSequence( sequence: str, sequenceId: str, numGenes: int, outdir: str, generatePdb: bool = False ) :
    ''' Writes a genome annotation .gff file for a specified sequence containing numGenes
    non-overlapping genes each with a length of 999 bp. In addition a flag can be set to true that induces
    the generation of a respective protein structure file in .pdb format using AmberTools tleap.
    
    Gene start positions are drawn uniformly at random and re-drawn until the gene fits into the
    sequence and does not overlap any previously placed gene. If the genes are packed very densely
    this rejection sampling may take long (or, in the worst case, not terminate).
        
    :param sequence: The reference genome sequence.
    :param sequenceId: The reference genome identifier.
    :param numGenes: Number of genes to be generated; values below one yield an annotation without genes.
    :param outdir: The directory to which the output files should be written to.
    :param generatePdb: If a .pdb file for the gene should be generated.
    :return: The set of generated gene names.
    :raises ValueError: If the sequence is too short to hold numGenes non-overlapping genes.
    '''
    geneLength = 999
    # Fail early instead of sampling forever for a request that can never be satisfied.
    if numGenes > 0 and numGenes * geneLength > len( sequence ) :
        raise ValueError(
            "A sequence of length " + str( len( sequence ) ) + " can not hold " + str( numGenes )
            + " non-overlapping genes of length " + str( geneLength ) + "."
        )
    gffHeader = "##sequence-region " + sequenceId + " 1 " +  str( len( sequence ) ) + "\n"
    geneNames = set( )
    geneStarts = set( )
    if not outdir.endswith( '/' ) :
        outdir += '/'
    with open( outdir + sequenceId + ".gff", "w" ) as gffFile :
        # Write Header
        gffFile.write( gffHeader )
        for _ in range( max( numGenes, 0 ) ) :
            # Draw start positions until the gene fits into the sequence and overlaps no other gene.
            while True :
                geneStart = random.randint( 1, len( sequence ) )
                geneEnd = geneStart + geneLength - 1
                if geneEnd > len( sequence ) :
                    continue
                # Two genes of equal length overlap iff their starts differ by less than the gene length.
                if any( abs( geneStart - gs ) <= geneLength - 1 for gs in geneStarts ) :
                    continue
                break
            geneName = generateRandomName( geneNames, "GENE_", 5 )
            geneNames.add( geneName )
            geneStarts.add( geneStart )
            strandedness = "+" if random.random( ) <= 0.5 else "-"
            gffFile.write( sequenceId + "\tRefSeq\tgene\t" + str( geneStart ) + "\t" + str( geneEnd ) + "\t.\t" + strandedness + "\t.\tName=" + geneName + "\n" )
            if generatePdb :
                generateGenesPdb( sequence[ geneStart - 1 : geneEnd ], geneName, outdir )
    return geneNames

def generateGenesPdb( geneSequence: str, geneId: str, outdir: str ) :
    ''' Generates a dummy .pdb file for the protein encoded by a nucleotide sequence.
    
    The nucleotide sequence is translated codon by codon into three-letter amino-acid codes; stop
    codons are skipped. If the sequence length is not divisible by three the translation is empty.
    The translated sequence is written into a tleap configuration file <outdir>/<geneId>.tleap
    which instructs tleap to store the built structure as <outdir>/<geneId>.pdb. Finally
    AmberTools tleap is called via `wsl` on this configuration; its output is not evaluated.
    
    :param geneSequence: The nucleotide sequence of the gene.
    :param geneId: The identifier of the gene, i.e. used for file name.
    :param outdir: The directory to which the .tleap and .pdb files should be written to.
    '''
    def translate( seq ) :
        # Three-letter residue code mapped to its codons; the empty code marks stop codons.
        codonsPerResidue = {
            'ILE': 'ATA ATC ATT',
            'MET': 'ATG',
            'THR': 'ACA ACC ACG ACT',
            'ASN': 'AAC AAT',
            'LYS': 'AAA AAG',
            'SER': 'AGC AGT TCA TCC TCG TCT',
            'ARG': 'AGA AGG CGA CGC CGG CGT',
            'LEU': 'CTA CTC CTG CTT TTA TTG',
            'PRO': 'CCA CCC CCG CCT',
            'HIS': 'CAC CAT',
            'GLN': 'CAA CAG',
            'VAL': 'GTA GTC GTG GTT',
            'ALA': 'GCA GCC GCG GCT',
            'ASP': 'GAC GAT',
            'GLU': 'GAA GAG',
            'GLY': 'GGA GGC GGG GGT',
            'PHE': 'TTC TTT',
            'TYR': 'TAC TAT',
            'CYS': 'TGC TGT',
            'TRP': 'TGG',
            '': 'TAA TAG TGA',
        }
        codonTable = { }
        for residue, codons in codonsPerResidue.items( ) :
            for codon in codons.split( ) :
                codonTable[ codon ] = residue
        if len( seq ) % 3 != 0 :
            return ""
        residues = [ codonTable[ seq[ start : start + 3 ] ] for start in range( 0, len( seq ), 3 ) ]
        return " ".join( residue for residue in residues if residue != '' )
    targetDir = outdir if outdir.endswith( '/' ) else outdir + '/'
    residueString = translate( geneSequence )
    # 400 characters correspond to 100 residues (three letters plus one separating blank each).
    chunkSize = 400
    residueLines = [ ]
    for offset in range( 0, len( residueString ), chunkSize ) :
        residueLines.append( residueString[ offset : offset + chunkSize ] )
    tleapSequence = "{" + "\n".join( residueLines ) + "}\n"
    configurationPath = targetDir + geneId + ".tleap"
    with open( configurationPath, "w", newline="\n" ) as tleapConf :
        tleapConf.write( "source oldff/leaprc.ff99\n" )
        tleapConf.write( "protein = sequence " + tleapSequence )
        tleapConf.write( "savepdb protein " + targetDir + geneId + ".pdb\n" )
        tleapConf.write( "quit\n" )
    tleapProcess = subprocess.Popen(
        [ 'wsl', tleapPath, '-f', configurationPath ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    tleapProcess.communicate( )

Running the cell below will start the benchmark data generation. Please keep the random seed that is set in the cell (cf. comment (2.)) unchanged in order to obtain the same benchmark data for each execution.

In [None]:
# (1.) Create directory to store generated data.
benchmarkDataDir = "./D1_TPOMPeome_Hackl2022_BenchmarkData"
if not os.path.isdir( benchmarkDataDir ) :
    os.mkdir( benchmarkDataDir )
# (2.) Set random seed.
random.seed( 12345 )
np.random.seed( 12345 )
# (3.) Simulate 13 genomes with a length ranging from 1Mbp to 13Mbp in 1Mbp steps.
genomeNames = set( )
for i in range( 1, 14 ) :
    genomeSequence, substitutionPositions, indelPositions = generateRandomNucleotideSequence( i * 10**6 )
    genomeName = "GENOME_" + str ( i ) + "Mbp"
    genomeNames.add( genomeName )
    genomeDir = benchmarkDataDir + "/" + genomeName
    # Deliberately fails if the directory exists, i.e. data of an earlier run is never mixed with new data.
    os.mkdir( genomeDir )
    # (3.1) Write simulated genome to .fasta.
    generateRandomFasta( genomeSequence, genomeName, genomeDir )
    # (3.2) Generate 300 samples for simulated genome.
    sampleNames = set( )
    for c in range( 0, 300 ) :
        sampleName = generateRandomName( sampleNames, prefix = "SAMPLE_", length = 4 )
        generateRandomVcfForSequence(
            genomeSequence,
            substitutionPositions,
            indelPositions,
            genomeName,
            sampleName,
            genomeDir
        )
    if i == 5 :
        # (3.3.1) Simulate 1000 genes and genome annotation for the genome with 5Mbp length.
        geneNames = generateRandomGenesForSequence( 
            genomeSequence,
            genomeName,
            1000,
            genomeDir,
            True
        )
        # (3.3.2) Prepare MUSIAL gene input configuration .txt file for genome of length 5Mbp.
        # The iteration order of a set of strings changes between interpreter sessions (hash randomisation);
        # sorting makes the gene subsets reproducible.
        orderedGeneNames = sorted( geneNames )
        for genesSize in range( 50, 1050, 50 ) :
            with open( genomeDir + "/" + "genes_" + str( genesSize ) + ".txt", "w", newline="\n"  ) as gf :
                for gn in orderedGeneNames[ 0 : genesSize ] :
                    gf.write( ",".join( [ gn, gn, genomeDir + "/" + gn + ".pdb" ] ) + "\n" )
    # (4.) Prepare MUSIAL sample input configuration .txt files.
    # Sorted for the same reason as the gene names: the subsets must not depend on set iteration order.
    orderedSampleNames = sorted( sampleNames )
    for sampleSize in range( 30, 330, 30 ) :
        with open( genomeDir + "/" + "sample_" + str( sampleSize ) + ".txt", "w", newline="\n"  ) as sf :
            for sn in orderedSampleNames[ 0 : sampleSize ] :
                sf.write( genomeDir + "/" + sn + ".vcf.gz," + sn + "\n" )

### 2. Run MUSIAL2.0 on the Dataset
The two cells below will start running `MUSIAL2.0` on the generated data and measure the runtime and peak resident set size usage with the `linux time` command.

#### 2.1 Increasing Sample Set Size and Genome Length, Number of Threads: 1

In [None]:
# Locations of the benchmark data, the results and the report written by `/usr/bin/time -v`.
dataDir = "./D1_TPOMPeome_Hackl2022_BenchmarkData"
evalPath = dataDir + "/eval.txt"
resultsDir = "./R1_TPOMPeome_Hackl2022_Benchmark"
timeResultsPath = resultsDir + "/IncSamplesAndGenomesTimeNT1.txt"
memoryResultsPath = resultsDir + "/IncSamplesAndGenomesMemoryNT1.txt"
# Line prefixes of the two values of interest in the `/usr/bin/time -v` report.
rssPrefix = "\tMaximum resident set size (kbytes):"
elapsedPrefix = "\tElapsed (wall clock) time (h:mm:ss or m:ss):"

def parseMaxRssMb( line ) :
    ''' Extracts the maximum resident set size in MB (kbytes / 1000) from the respective report line. '''
    return int( line[ len( rssPrefix ) : ].strip( ) ) / ( 10**3 )

def parseElapsedSeconds( line ) :
    ''' Extracts the wall clock time in seconds from the respective report line (h:mm:ss or m:ss). '''
    seconds = 0.0
    # Each further field is worth 1/60 of the previous one, i.e. hours count 3600 seconds.
    for field in line[ len( elapsedPrefix ) : ].strip( ).split( ":" ) :
        seconds = 60 * seconds + float( field )
    return seconds

# Create files to store results.
if not os.path.isdir( resultsDir ) :
    os.mkdir( resultsDir )
for resultsPath in ( timeResultsPath, memoryResultsPath ) :
    if not os.path.isfile( resultsPath ) :
        with open( resultsPath, 'w' ) :
            pass
# Iterate over each pair of genome and sample subset and run MUSIAL.
for i in range( 1, 14 ) :
    genomeName = "GENOME_" + str ( i ) + "Mbp"
    genomeDir = dataDir + "/" + genomeName
    for sampleSize in range( 30, 330, 30 ) :
        if not os.path.isdir( "./temp" ) :
            os.mkdir( "./temp" )
        try :
            # Remove the report of the previous run so that it can never be parsed a second time.
            if os.path.isfile( evalPath ) :
                os.remove( evalPath )
            process = subprocess.run(
                [ 'wsl',
                  '/usr/bin/time',
                  '-v',
                  '--output=' + evalPath,
                  'java',
                  '-Xmx30G',
                  '-jar', MUSIALpath,
                  '-o', 'temp/',
                  '-r', genomeDir + '/' + genomeName + '.fasta',
                  '-s', genomeDir + '/' + 'sample_' + str( sampleSize ) + '.txt'
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            if process.returncode != 0 :
                # Measurements of a failed run are meaningless and therefore not recorded.
                print( "Skipping " + genomeName + ", " + str( sampleSize ) + " samples: exit status " + str( process.returncode ) )
                continue
            # Parse and store the measured time and maximum resident set size.
            memoryMb = None
            timeS = None
            with open( evalPath, "r" ) as evaluation :
                for line in evaluation :
                    if line.startswith( rssPrefix ) :
                        memoryMb = parseMaxRssMb( line )
                    elif line.startswith( elapsedPrefix ) :
                        timeS = parseElapsedSeconds( line )
            if memoryMb is not None :
                with open( memoryResultsPath, 'a' ) as resultsMem :
                    resultsMem.write( ",".join( [ str( i ), str( sampleSize ), str( memoryMb ) ] ) + "\n" )
            if timeS is not None :
                with open( timeResultsPath, 'a' ) as resultsTime :
                    resultsTime.write( ",".join( [ str( i ), str( sampleSize ), str( timeS ) ] ) + "\n" )
        except Exception as e :
            # Best effort: report the failed configuration and proceed with the next one.
            print( "Skipping " + genomeName + ", " + str( sampleSize ) + " samples: " + repr( e ) )
        finally :
            # Always discard the MUSIAL output, also if the run failed.
            shutil.rmtree( "./temp/", ignore_errors = True )

#### 2.2 Increasing Sample Set and Gene Set Size, Number of Threads: 1

In [None]:
# Locations of the benchmark data, the results and the report written by `/usr/bin/time -v`.
dataDir = "./D1_TPOMPeome_Hackl2022_BenchmarkData"
evalPath = dataDir + "/eval.txt"
resultsDir = "./R1_TPOMPeome_Hackl2022_Benchmark"
timeResultsPath = resultsDir + "/IncSamplesAndGenes5MbpGenomeTimeNT1.txt"
memoryResultsPath = resultsDir + "/IncSamplesAndGenes5MbpGenomeMemoryNT1.txt"
# Line prefixes of the two values of interest in the `/usr/bin/time -v` report.
rssPrefix = "\tMaximum resident set size (kbytes):"
elapsedPrefix = "\tElapsed (wall clock) time (h:mm:ss or m:ss):"

def parseMaxRssMb( line ) :
    ''' Extracts the maximum resident set size in MB (kbytes / 1000) from the respective report line. '''
    return int( line[ len( rssPrefix ) : ].strip( ) ) / ( 10**3 )

def parseElapsedSeconds( line ) :
    ''' Extracts the wall clock time in seconds from the respective report line (h:mm:ss or m:ss). '''
    seconds = 0.0
    # Each further field is worth 1/60 of the previous one, i.e. hours count 3600 seconds.
    for field in line[ len( elapsedPrefix ) : ].strip( ).split( ":" ) :
        seconds = 60 * seconds + float( field )
    return seconds

# Create files to store results.
if not os.path.isdir( resultsDir ) :
    os.mkdir( resultsDir )
for resultsPath in ( timeResultsPath, memoryResultsPath ) :
    if not os.path.isfile( resultsPath ) :
        with open( resultsPath, 'w' ) :
            pass
# Iterate over each pair of gene subset and sample subset and run MUSIAL.
genomeName = "GENOME_5Mbp"
genomeDir = dataDir + "/" + genomeName
for genesSize in range( 50, 1050, 50 ) :
    for sampleSize in range( 30, 330, 30 ) :
        if not os.path.isdir( "./temp" ) :
            os.mkdir( "./temp" )
        try :
            # Remove the report of the previous run so that it can never be parsed a second time.
            if os.path.isfile( evalPath ) :
                os.remove( evalPath )
            process = subprocess.run(
                [ 'wsl',
                  '/usr/bin/time',
                  '-v',
                  '--output=' + evalPath,
                  'java',
                  '-Xmx30G',
                  '-jar', MUSIALpath,
                  '-o', 'temp/',
                  '-r', genomeDir + '/' + genomeName + '.fasta',
                  '-s', genomeDir + '/' + 'sample_' + str( sampleSize ) + '.txt',
                  '-a', genomeDir + '/' + genomeName + '.gff',
                  '-gf', genomeDir + '/' + 'genes_' + str( genesSize ) + '.txt'
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            if process.returncode != 0 :
                # Measurements of a failed run are meaningless and therefore not recorded.
                print( "Skipping " + str( genesSize ) + " genes, " + str( sampleSize ) + " samples: exit status " + str( process.returncode ) )
                continue
            # Parse and store the measured time and maximum resident set size.
            memoryMb = None
            timeS = None
            with open( evalPath, "r" ) as evaluation :
                for line in evaluation :
                    if line.startswith( rssPrefix ) :
                        memoryMb = parseMaxRssMb( line )
                    elif line.startswith( elapsedPrefix ) :
                        timeS = parseElapsedSeconds( line )
            if memoryMb is not None :
                with open( memoryResultsPath, 'a' ) as resultsMem :
                    resultsMem.write( ",".join( [ str( genesSize ), str( sampleSize ), str( memoryMb ) ] ) + "\n" )
            if timeS is not None :
                with open( timeResultsPath, 'a' ) as resultsTime :
                    resultsTime.write( ",".join( [ str( genesSize ), str( sampleSize ), str( timeS ) ] ) + "\n" )
        except Exception as e :
            # Best effort: report the failed configuration and proceed with the next one.
            print( "Skipping " + str( genesSize ) + " genes, " + str( sampleSize ) + " samples: " + repr( e ) )
        finally :
            # Always discard the MUSIAL output, also if the run failed.
            shutil.rmtree( "./temp/", ignore_errors = True )

### 3. Visualize Results
Executing the cell below will generate visualizations for the two benchmark runs.

In [None]:
"""
Imports
"""
from mpl_toolkits.mplot3d import Axes3D
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import seaborn as sns
import warnings
import numpy as np

# Global figure style: serif fonts and no LaTeX based text rendering.
plt.rcParams.update( { "text.usetex": False, "font.family": "serif" } )

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    # Parse genomesRSS measurement (columns: genome size [Mbp], number of samples, maximal RSS [MB]).
    # If a configuration was measured more than once only its first measurement is used.
    dfGenomesRSS = pd.read_csv( './R1_TPOMPeome_Hackl2022_Benchmark/IncSamplesAndGenomesMemoryNT1.txt', header=None )
    dfGenomesRSS = dfGenomesRSS.drop_duplicates( subset = [ 0, 1 ] ).sort_values( by = [ 0, 1 ] )
    xGenomesRSS = { }
    yGenomesRSS = { }
    # Grouping (instead of the product of all observed values) tolerates missing configurations.
    for genomeSize, measurements in dfGenomesRSS.groupby( dfGenomesRSS[ 0 ] ) :
        xGenomesRSS[ genomeSize ] = measurements[ 1 ].tolist( )
        yGenomesRSS[ genomeSize ] = ( measurements[ 2 ] / 1000 ).tolist( )
    
    # Parse genomesTime measurement (columns: genome size [Mbp], number of samples, runtime [s]).
    dfGenomesTime = pd.read_csv( './R1_TPOMPeome_Hackl2022_Benchmark/IncSamplesAndGenomesTimeNT1.txt', header=None )
    dfGenomesTime = dfGenomesTime.drop_duplicates( subset = [ 0, 1 ] ).sort_values( by = [ 0, 1 ] )
    xGenomesTime = { }
    yGenomesTime = { }
    for genomeSize, measurements in dfGenomesTime.groupby( dfGenomesTime[ 0 ] ) :
        xGenomesTime[ genomeSize ] = measurements[ 1 ].tolist( )
        yGenomesTime[ genomeSize ] = measurements[ 2 ].tolist( )
    
    # Generate plot.
    fig, ( ax1, ax2 ) = plt.subplots( nrows = 2, ncols = 1, figsize = ( 13, 13 ), sharex = True )
    colorMap = plt.get_cmap( 'Spectral' )
    # Both panels share one normalisation so that a genome size has the same color in each of them.
    numGenomes = len( xGenomesRSS )
    
    # Plot genomesRSS values.
    for gS in xGenomesRSS :
        ax1.plot(
            xGenomesRSS[ gS ],
            yGenomesRSS[ gS ],
            color = colorMap( gS / numGenomes ),
            marker='o'
        )
    ax1.set_ylabel( "Maximal RSS [GB]", size = 16, labelpad = 10 )
    ax1.tick_params( labelsize = 14 )
    ax1.set_title( "Maximal RSS", size = 18 )
    # Switch the grid on explicitly; grid( ) without arguments toggles its visibility.
    ax1.grid( True )
    
    # Plot genomesTime values.
    for gS in xGenomesTime :
        ax2.plot(
            xGenomesTime[ gS ],
            yGenomesTime[ gS ],
            color = colorMap( gS / numGenomes ),
            marker='o',
            label = "Genome Size: " + str( gS ) + " Mbp"
        )
    ax2.set_xlabel( "Number of Samples", size = 16, labelpad = 10 )
    ax2.set_xticks( np.arange( 30, 330, 30 ) )
    ax2.set_ylabel( "Runtime [s]", size = 16, labelpad = 10 )
    ax2.tick_params( labelsize = 14 )
    ax2.set_title( "Runtime", size = 18 )
    ax2.grid( True )
    
    plt.suptitle( "Benchmark Results of MUSIAL2.0\nIncreasing Genome and Sample Set Size", size = 20, y = 1.01, horizontalalignment='center', verticalalignment='top' )
    plt.tight_layout( )
    # The labeled lines live in the lower panel, the legend is placed below it.
    ax2.legend( ncol = 4, loc = 'center', bbox_to_anchor = ( 0.5, -0.3 ), fontsize = 14 )
    plt.show( )

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    # Parse genesRSS measurement (columns: gene set size, number of samples, maximal RSS [MB]).
    # If a configuration was measured more than once only its first measurement is used.
    dfGenesRSS = pd.read_csv( './R1_TPOMPeome_Hackl2022_Benchmark/IncSamplesAndGenes5MbpGenomeMemoryNT1.txt', header=None )
    dfGenesRSS = dfGenesRSS.drop_duplicates( subset = [ 0, 1 ] ).sort_values( by = [ 0, 1 ] )
    xGenesRSS = { }
    yGenesRSS = { }
    # Grouping (instead of the product of all observed values) tolerates missing configurations.
    for genesSize, measurements in dfGenesRSS.groupby( dfGenesRSS[ 0 ] ) :
        # Only every second gene set size (multiples of 100) is displayed.
        if genesSize % 100 != 0 :
            continue
        xGenesRSS[ genesSize ] = measurements[ 1 ].tolist( )
        yGenesRSS[ genesSize ] = ( measurements[ 2 ] / 1000 ).tolist( )
    
    # Parse genesTime measurement (columns: gene set size, number of samples, runtime [s]).
    dfGenesTime = pd.read_csv( './R1_TPOMPeome_Hackl2022_Benchmark/IncSamplesAndGenes5MbpGenomeTimeNT1.txt', header=None )
    dfGenesTime = dfGenesTime.drop_duplicates( subset = [ 0, 1 ] ).sort_values( by = [ 0, 1 ] )
    xGenesTime = { }
    yGenesTime = { }
    for genesSize, measurements in dfGenesTime.groupby( dfGenesTime[ 0 ] ) :
        if genesSize % 100 != 0 :
            continue
        xGenesTime[ genesSize ] = measurements[ 1 ].tolist( )
        yGenesTime[ genesSize ] = measurements[ 2 ].tolist( )
        
    # Generate plot.
    fig, ( ax1, ax2 ) = plt.subplots( nrows = 2, ncols = 1, figsize = ( 13, 13 ), sharex = True )
    colorMap = plt.get_cmap( 'Spectral' )
    
    # Plot genesRSS values.
    for gS in xGenesRSS :
        ax1.plot(
            xGenesRSS[ gS ],
            yGenesRSS[ gS ],
            color = colorMap( gS / 1000 ),
            marker='o'
        )
    ax1.set_ylabel( "Maximal RSS [GB]", size = 16, labelpad = 10 )
    ax1.tick_params( labelsize = 14 )
    ax1.set_title( "Maximal RSS", size = 18 )
    # Passed positionally: the keyword `b` is not accepted by recent Matplotlib versions.
    ax1.grid( True )
    
    # Plot genesTime values.
    for gS in xGenesTime :
        ax2.plot(
            xGenesTime[ gS ],
            yGenesTime[ gS ],
            color = colorMap( gS / 1000 ),
            marker='o',
            label = "Gene Set Size: " + str( gS )
        )
    ax2.set_xlabel( "Number of Samples", size = 16, labelpad = 10 )
    ax2.set_xticks( np.arange( 30, 330, 30 ) )
    ax2.set_ylabel( "Runtime [s]", size = 16, labelpad = 10 )
    ax2.tick_params( labelsize = 14 )
    ax2.set_title( "Runtime", size = 18 )
    ax2.grid( True )
    
    plt.suptitle( "Benchmark Results of MUSIAL2.0\nIncreasing Gene Set and Sample Set Size, Genome Size 5 Mbp", size = 20, y = 1.01, horizontalalignment='center', verticalalignment='top' )
    plt.tight_layout( )
    # The labeled lines live in the lower panel, the legend is placed below it.
    ax2.legend( ncol = 4, loc = 'center', bbox_to_anchor = ( 0.5, -0.3 ), fontsize = 14 )
    plt.show( )