# Getting simulation runtimes

Samuel Barnett

### Introduction

I need to collect the runtimes for all of the simulations. I don't really want to do this by hand, so I'll use the magic of computers to read all the simulation logs.



## 1) Initialization

First I need to import the Python modules I'll use, set some variables, initiate R magic, and create/get into the working directory.

In [5]:
import os
# Root directory of the study; the simulation logs read later in this notebook
# are located in sub-directories of this path.
baseDir = '/home/sam/data/SIPSim2_data/RealWorld_study3/'

In [6]:
import sys
import pandas as pd
import numpy as np


In [7]:
## Base directory
if not os.path.isdir(baseDir):
    print("Base directory does not exist!!!")
else:
    %cd $baseDir


/home/sam/data/SIPSim2_data/RealWorld_study3


## 2) Get runtimes for simulation

These data come from the simulation logs.

### Start with individual libraries
Find the mean runtime per library, averaged by experiment type and sequencing depth.

In [68]:
# Collect the per-library runtimes from every simulation log and average them
# by experiment type and sequencing depth.
#
# Matching log lines are expected to look like
#   "It took <seconds> seconds to run the whole library <library>"
lib_marker = ' seconds to run the whole library '

# Gather one frame per log and concatenate once at the end; DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
lib_frames = []

for genome_set in ['low_GC_skew', 'medium_GC', 'high_GC_skew']:
    for depth in ['depth5MM', 'depth10MM']:
        for exp_type in ['SIP', 'nonSIP']:
            log_file = os.path.join(baseDir, genome_set, depth,
                                    exp_type + '_simulation.log')
            with open(log_file, 'r') as log:
                liblines = [line for line in log
                            if 'seconds to run the whole library' in line]
            if not liblines:
                # Nothing to parse; say so instead of silently averaging
                # over fewer simulations.
                print('No library runtimes found in ' + log_file)
                continue

            sub_df = pd.DataFrame(liblines, columns=['line'])
            # Split each line once: text before the marker holds the seconds,
            # text after it holds the library number.
            parts = sub_df['line'].str.split(lib_marker, n=1, expand=True)
            sub_df['seconds'] = (parts[0]
                                 .str.replace('It took ', '', regex=False)
                                 .astype('float'))
            sub_df['library'] = parts[1].str.strip().astype('int')
            sub_df['exp_type'] = exp_type
            sub_df['depth'] = depth
            sub_df['genome_set'] = genome_set
            lib_frames.append(sub_df)

libtime_df = pd.concat(lib_frames)

# Average only the numeric columns; the raw 'line' and 'genome_set' strings
# cannot be averaged.
libtime_sum = libtime_df.groupby(['exp_type', 'depth'])[['seconds', 'library']].mean()
libtime_sum['minutes'] = libtime_sum['seconds'] / 60.0
libtime_sum


Unnamed: 0_level_0,Unnamed: 1_level_0,seconds,library,minutes
exp_type,depth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SIP,depth10MM,1105.097944,3.5,18.418299
SIP,depth5MM,873.704778,3.5,14.561746
nonSIP,depth10MM,455.568111,3.5,7.592802
nonSIP,depth5MM,279.614556,3.5,4.660243


In [69]:
# Collect the fragment-building runtimes from every simulation log and average
# them by experiment type and sequencing depth.
#
# Matching log lines are expected to start like
#   "It took <seconds> seconds to build the fragments"
frag_marker = ' seconds to build the fragments'

# Gather one frame per log and concatenate once at the end; DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
frag_frames = []

for genome_set in ['low_GC_skew', 'medium_GC', 'high_GC_skew']:
    for depth in ['depth5MM', 'depth10MM']:
        for exp_type in ['SIP', 'nonSIP']:
            log_file = os.path.join(baseDir, genome_set, depth,
                                    exp_type + '_simulation.log')
            with open(log_file, 'r') as log:
                liblines = [line for line in log
                            if 'seconds to build the fragments' in line]
            if not liblines:
                # Nothing to parse; say so instead of silently averaging
                # over fewer simulations.
                print('No fragment runtimes found in ' + log_file)
                continue

            sub_df = pd.DataFrame(liblines, columns=['line'])
            # Keep the text before the marker and strip the leading phrase,
            # leaving only the number of seconds.
            parts = sub_df['line'].str.split(frag_marker, n=1, expand=True)
            sub_df['seconds'] = (parts[0]
                                 .str.replace('It took ', '', regex=False)
                                 .astype('float'))
            sub_df['exp_type'] = exp_type
            sub_df['depth'] = depth
            sub_df['genome_set'] = genome_set
            frag_frames.append(sub_df)

fragtime_df = pd.concat(frag_frames)

# Average only the numeric column; the raw 'line' and 'genome_set' strings
# cannot be averaged. As before, fragtime_df ends up holding the summary table.
fragtime_df = fragtime_df.groupby(['exp_type', 'depth'])[['seconds']].mean()
fragtime_df['minutes'] = fragtime_df['seconds'] / 60.0
fragtime_df

Unnamed: 0_level_0,Unnamed: 1_level_0,seconds,minutes
exp_type,depth,Unnamed: 2_level_1,Unnamed: 3_level_1
SIP,depth10MM,620.532333,10.342206
SIP,depth5MM,611.302667,10.188378
nonSIP,depth10MM,634.038667,10.567311
nonSIP,depth5MM,618.499667,10.308328


### Now for the entire simulation
Find the mean runtime of the entire simulation, averaged by experiment type and sequencing depth.

In [67]:
# Collect the whole-simulation runtimes from every simulation log and average
# them by experiment type and sequencing depth.
#
# Matching log lines are expected to start like
#   "It took <seconds> seconds to run the entire simulation"

# Gather one frame per log and concatenate once at the end; DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
sim_frames = []

for genome_set in ['low_GC_skew', 'medium_GC', 'high_GC_skew']:
    for depth in ['depth5MM', 'depth10MM']:
        for exp_type in ['SIP', 'nonSIP']:
            log_file = os.path.join(baseDir, genome_set, depth,
                                    exp_type + '_simulation.log')
            with open(log_file, 'r') as log:
                liblines = [line for line in log
                            if 'seconds to run the entire simulation' in line]
            if not liblines:
                # Nothing to parse; say so instead of silently averaging
                # over fewer simulations.
                print('No simulation runtime found in ' + log_file)
                continue

            sub_df = pd.DataFrame(liblines, columns=['line'])
            # Keep everything before the first ' seconds' and strip the
            # leading phrase; this also works when the line has no trailing
            # newline.
            sub_df['seconds'] = (sub_df['line']
                                 .str.split(' seconds', n=1).str[0]
                                 .str.replace('It took ', '', regex=False)
                                 .astype('float'))
            sub_df['exp_type'] = exp_type
            sub_df['depth'] = depth
            sub_df['genome_set'] = genome_set
            sim_frames.append(sub_df)

simtime_df = pd.concat(sim_frames)

# Average only the numeric column; the raw 'line' and 'genome_set' strings
# cannot be averaged.
simtime_sum = simtime_df.groupby(['exp_type', 'depth'])[['seconds']].mean()

simtime_sum['minutes'] = simtime_sum['seconds'] / 60.0
simtime_sum['hours'] = simtime_sum['minutes'] / 60.0
simtime_sum


Unnamed: 0_level_0,Unnamed: 1_level_0,seconds,minutes,hours
exp_type,depth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SIP,depth10MM,7278.964333,121.316072,2.021935
SIP,depth5MM,5859.781,97.663017,1.627717
nonSIP,depth10MM,3381.460667,56.357678,0.939295
nonSIP,depth5MM,2296.874667,38.281244,0.638021
