# Resource usage (memory, total CPU time, and cpu scaling with threads)
Using Slurm and pyseff, we can get the runtime and memory usage of each tool.  


## Part 1: Real data (full set and fractions of HQ IMG/VR4 contigs with iPHoP spacers)
This is only for the fractions ['0.001', '0.005', '0.01', '0.05', '0.1'] of the HQ IMG/VR4 contigs with the iPHoP spacers, when testing tools with either no restriction on the maximum distance or a maximum of 3/5 (edit or Hamming distance; the actual values are in Prepare_all_jobs.ipynb).

**Reminder** - for blastn fraction_1, resource tracking for the 64-thread job was only done up to spacer 1689771 out of 3826979 total spacers. 

**Implementation**: The code below automatically extrapolates blastn's fraction_1 runtime by multiplying the observed runtime by 3826979/1689771 ≈ 2.266 to estimate the total runtime. This is valid because blastn scales linearly with the number of input spacers.

<!-- :>MGYG000018353:MGYG000018353_21:1:pilercr14 -->

In [1]:
import os
os.chdir('/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_bench/')
from bench import *
from bench.utils.functions import *
from bench.utils.pyseff import *
import polars as pl
pl.Config(tbl_rows=110)

base_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_bench/results/real_data/subsamples/fraction_0.001/" # just need one to get the configs
threads = 12
tools_versions = pl.read_csv("tool_configs/tool_versions.csv")
tool_configs = load_tool_configs(base_dir)
tool_names = list(tool_configs.keys())
tool_names

['blastn',
 'bowtie1',
 'bowtie2',
 'indelfree_bruteforce',
 'indelfree_indexed',
 'minimap2',
 'mmseqs2',
 'mummer4',
 'sassy',
 'strobealign',
 'x_mapper']

In [2]:
sacct_df = pyseff(remove_cancelled=False,remove_failed=False,calculate_cpu_efficiency=False,calculate_memory_efficiency=False)
sacct_df= sacct_df.with_columns(pl.col("JobName").str.replace_all("_long","").str.replace_all("mmseqs","mmseqs2"))
sacct_df = sacct_df.filter(~pl.col("JobName").str.contains_any(["damn","biofaster"])).filter(pl.col("JobName").str.contains_any(tool_names))
sacct_df

BaseJobID,JobName,AllocCPUS,State,ExitCode,MaxRSS,ReqMem,Elapsed_Seconds,TotalCPU_Seconds,MaxRSS_MB,ReqMem_MB,Elapsed,TotalCPU
str,str,i64,list[str],list[str],str,str,u32,u32,f64,f64,str,str
"""20258257""","""sassy""",16,"[""COMPLETED""]","[""0:0""]","""68244K""","""168G""",8011,83850,66.644531,172032.0,"""02:13:31""","""23:17:30"""
"""19675946""","""mmseqs2""",16,"[""COMPLETED""]","[""0:0""]","""21234896K""","""168G""",625,0,20737.203125,172032.0,"""00:10:25""","""00:00:00"""
"""20668452""","""sassy""",0,"[""CANCELLED by 91591""]","[""0:0""]",,"""168G""",0,0,,172032.0,"""00:00:00""","""00:00:00"""
"""20668597""","""bowtie1""",38,"[""FAILED"", ""COMPLETED""]","[""1:0"", ""0:0""]","""174940K""","""168G""",39,0,170.839844,172032.0,"""00:00:39""","""00:00:00"""
"""20668621""","""bowtie1""",38,"[""COMPLETED""]","[""0:0""]","""868176K""","""168G""",331,0,847.828125,172032.0,"""00:05:31""","""00:00:00"""
"""19675794""","""strobealign""",0,"[""CANCELLED by 91591""]","[""0:0""]",,"""168G""",0,0,,172032.0,"""00:00:00""","""00:00:00"""
"""20697913""","""mummer4""",38,"[""COMPLETED""]","[""0:0""]","""289444K""","""250G""",573,0,282.660156,256000.0,"""00:09:33""","""00:00:00"""
"""20896450""","""indelfree_bruteforce""",0,"[""CANCELLED by 91591""]","[""0:0""]",,"""168G""",0,0,,172032.0,"""00:00:00""","""00:00:00"""
"""19675862""","""indelfree_bruteforce""",16,"[""CANCELLED by 91591"", ""CANCELLED"", ""COMPLETED""]","[""0:0"", ""0:15""]","""14928652K""","""168G""",78,0,14578.761719,172032.0,"""00:01:18""","""00:00:00"""
"""19675921""","""minimap2""",16,"[""COMPLETED""]","[""0:0""]","""1258640K""","""168G""",1644,8578,1229.140625,172032.0,"""00:27:24""","""02:22:58"""


Get all jobids that are actually needed

In [3]:
import glob
# fraction_dirs = glob.glob(os.path.join("results/real_data/subsamples/fraction_*"))
log_files = glob.glob("results/real_data/subsamples/fraction_*/slurm_logs/*.out")
# next - into a df with jobid, fraction, and tool name
log_files_df = pl.DataFrame({
    "log_file": log_files
}).with_columns(
    pl.col("log_file").str.extract(r"-(\d+)\.out").alias("BaseJobID"),
    pl.col("log_file").str.extract(r"fraction_([\d.]+)").alias("fraction"),
    pl.col("log_file").str.extract(r"slurm_logs/(.+)-\d+\.out").str.replace_all("_long","").str.replace_all("mmseqs","mmseqs2").alias("tool_name")
)
log_files_df = log_files_df.join(sacct_df, on="BaseJobID", how="inner")
log_files_df
# sacct_df.filter(~pl.col("JobName").is_in(log_files_df["tool_name"]))
# log_files_df.filter(~pl.col("tool_name").is_in(sacct_df["JobName"]))

log_file,BaseJobID,fraction,tool_name,JobName,AllocCPUS,State,ExitCode,MaxRSS,ReqMem,Elapsed_Seconds,TotalCPU_Seconds,MaxRSS_MB,ReqMem_MB,Elapsed,TotalCPU
str,str,str,str,str,i64,list[str],list[str],str,str,u32,u32,f64,f64,str,str
"""results/real_data/subsamples/fraction_0.001/slurm_logs/bowtie2-20668610.out""","""20668610""","""0.001""","""bowtie2""","""bowtie2""",38,"[""COMPLETED""]","[""0:0""]","""816692K""","""168G""",225,0,797.550781,172032.0,"""00:03:45""","""00:00:00"""
"""results/real_data/subsamples/fraction_0.001/slurm_logs/blastn-20697880.out""","""20697880""","""0.001""","""blastn""","""blastn""",38,"[""COMPLETED""]","[""0:0""]","""662220K""","""168G""",1971,32472,646.699219,172032.0,"""00:32:51""","""09:01:12"""
"""results/real_data/subsamples/fraction_0.01/slurm_logs/bowtie2-20668634.out""","""20668634""","""0.01""","""bowtie2""","""bowtie2""",38,"[""COMPLETED""]","[""0:0""]","""1928272K""","""168G""",1408,10380,1883.078125,172032.0,"""00:23:28""","""02:53:00"""
"""results/real_data/subsamples/fraction_0.001/slurm_logs/mummer4-20668616.out""","""20668616""","""0.001""","""mummer4""","""mummer4""",38,"[""COMPLETED""]","[""0:0""]","""127156K""","""168G""",30,0,124.175781,172032.0,"""00:00:30""","""00:00:00"""
"""results/real_data/subsamples/fraction_0.01/slurm_logs/minimap2-20668638.out""","""20668638""","""0.01""","""minimap2""","""minimap2""",38,"[""COMPLETED""]","[""0:0""]","""1730348K""","""168G""",2260,9386,1689.792969,172032.0,"""00:37:40""","""02:36:26"""
"""results/real_data/subsamples/fraction_0.1/slurm_logs/mmseqs-20668660.out""","""20668660""","""0.1""","""mmseqs2""","""mmseqs2""",38,"[""COMPLETED""]","[""0:0""]","""23944384K""","""168G""",481,0,23383.1875,172032.0,"""00:08:01""","""00:00:00"""
"""results/real_data/subsamples/fraction_0.005/slurm_logs/bowtie1-20668621.out""","""20668621""","""0.005""","""bowtie1""","""bowtie1""",38,"[""COMPLETED""]","[""0:0""]","""868176K""","""168G""",331,0,847.828125,172032.0,"""00:05:31""","""00:00:00"""
"""results/real_data/subsamples/fraction_0.0005/slurm_logs/strobealign-20668606.out""","""20668606""","""0.0005""","""strobealign""","""strobealign""",38,"[""COMPLETED""]","[""0:0""]","""664144K""","""168G""",19,0,648.578125,172032.0,"""00:00:19""","""00:00:00"""
"""results/real_data/subsamples/fraction_0.0005/slurm_logs/indelfree_bruteforce-20668599.out""","""20668599""","""0.0005""","""indelfree_bruteforce""","""indelfree_bruteforce""",38,"[""COMPLETED""]","[""0:0""]","""23106652K""","""168G""",96310,711960,22565.089844,172032.0,"""1-02:45:10""","""8-05:46:00"""
"""results/real_data/subsamples/fraction_0.05/slurm_logs/x_mapper-20668653.out""","""20668653""","""0.05""","""x_mapper""","""x_mapper""",38,"[""COMPLETED""]","[""0:0""]","""11270352K""","""168G""",1540,0,11006.203125,172032.0,"""00:25:40""","""00:00:00"""


In [36]:
# Get contig counts and total sizes for each fraction
fraction_dirs = glob.glob("results/real_data/subsamples/fraction_*")
contig_counts = []

for fdir in sorted(fraction_dirs):
    fraction = fdir.split("fraction_")[-1]
    metadata_file = os.path.join(fdir, "subsampled_data/subsampled_metadata.tsv")
    
    if os.path.exists(metadata_file):
        # Read metadata to get counts and sizes
        metadata_df = pl.read_csv(metadata_file, separator='\t')
        num_contigs = len(metadata_df)
        total_size_bp = metadata_df["length"].sum()
        
        contig_counts.append({
            "fraction": fraction,
            "fraction_float": float(fraction),
            "num_contigs": num_contigs,
            "total_size_bp": total_size_bp,
            "total_size_Mbp": total_size_bp / 1_000_000
        })

contig_counts_df = pl.DataFrame(contig_counts).sort("fraction_float")
print("Contig counts and sizes by fraction:")
print(contig_counts_df)

Contig counts and sizes by fraction:
shape: (7, 5)
┌──────────┬────────────────┬─────────────┬───────────────┬────────────────┐
│ fraction ┆ fraction_float ┆ num_contigs ┆ total_size_bp ┆ total_size_Mbp │
│ ---      ┆ ---            ┆ ---         ┆ ---           ┆ ---            │
│ str      ┆ f64            ┆ i64         ┆ i64           ┆ f64            │
╞══════════╪════════════════╪═════════════╪═══════════════╪════════════════╡
│ 0.0005   ┆ 0.0005         ┆ 279         ┆ 7036591       ┆ 7.036591       │
│ 0.001    ┆ 0.001          ┆ 421         ┆ 9745671       ┆ 9.745671       │
│ 0.005    ┆ 0.005          ┆ 2107        ┆ 57059412      ┆ 57.059412      │
│ 0.01     ┆ 0.01           ┆ 4214        ┆ 123668270     ┆ 123.66827      │
│ 0.05     ┆ 0.05           ┆ 21071       ┆ 715073062     ┆ 715.073062     │
│ 0.1      ┆ 0.1            ┆ 42143       ┆ 1504536616    ┆ 1504.536616    │
│ 1        ┆ 1.0            ┆ 421431      ┆ 18870643188   ┆ 18870.643188   │
└──────────┴─────────────

In [5]:
# Get spacer counts and total size (same for all fractions)
spacer_file = "imgvr4_data/spacers/iphop_filtered_spacers.fna"

spacer_df = pl.DataFrame(read_fasta_needletail(spacer_file), schema={'seqid': pl.Utf8, 'seq': pl.Utf8},)
spacer_df = spacer_df.with_columns(
    pl.col("seq").str.len_chars().alias("length")
).drop("seq")
num_spacers = spacer_df.height
total_spacer_bp = spacer_df["length"].sum()
total_spacer_Mbp = total_spacer_bp / 1_000_000

print("Spacer dataset (constant across all fractions):")
print(f"  Number of spacers: {num_spacers:,}")
print(f"  Total length: {total_spacer_bp:,} bp ({total_spacer_Mbp:.2f} Mbp)")

Spacer dataset (constant across all fractions):
  Number of spacers: 3,826,979
  Total length: 129,494,053 bp (129.49 Mbp)


In [6]:
# Check the relationship between Elapsed and TotalCPU to understand the metrics
# Elapsed_Seconds should be wall-clock time, TotalCPU_Seconds is actual CPU time used
sample_check = log_files_df.select([
    "tool_name", "fraction", "AllocCPUS", "Elapsed_Seconds", "TotalCPU_Seconds"
]).head(20)

print("Sample of timing data:")
print(sample_check)

# Check if we have TotalCPU data
if "TotalCPU_Seconds" in log_files_df.columns:
    print("\n✓ TotalCPU_Seconds is available - this is the actual CPU time used")
    print("✓ Elapsed_Seconds is wall-clock time")
else:
    print("\n⚠ Only Elapsed_Seconds available")

# Check CPU allocation variations
cpu_check = log_files_df.with_columns([
    pl.col("log_file").str.contains("_long").alias("is_long")
]).group_by(["tool_name", "is_long"]).agg([
    pl.col("AllocCPUS").unique().alias("CPU_values"),
    pl.len().alias("count")
]).sort("tool_name")

print("\nCPU allocation by tool and retry status:")
print(cpu_check)

Sample of timing data:
shape: (20, 5)
┌──────────────────────┬──────────┬───────────┬─────────────────┬──────────────────┐
│ tool_name            ┆ fraction ┆ AllocCPUS ┆ Elapsed_Seconds ┆ TotalCPU_Seconds │
│ ---                  ┆ ---      ┆ ---       ┆ ---             ┆ ---              │
│ str                  ┆ str      ┆ i64       ┆ u32             ┆ u32              │
╞══════════════════════╪══════════╪═══════════╪═════════════════╪══════════════════╡
│ bowtie2              ┆ 0.001    ┆ 38        ┆ 225             ┆ 0                │
│ blastn               ┆ 0.001    ┆ 38        ┆ 1971            ┆ 32472            │
│ bowtie2              ┆ 0.01     ┆ 38        ┆ 1408            ┆ 10380            │
│ mummer4              ┆ 0.001    ┆ 38        ┆ 30              ┆ 0                │
│ minimap2             ┆ 0.01     ┆ 38        ┆ 2260            ┆ 9386             │
│ mmseqs2              ┆ 0.1      ┆ 38        ┆ 481             ┆ 0                │
│ bowtie1              ┆ 0.

In [7]:
# Add max_distance information to log_files_df
def get_max_distance(tool_name, fraction_float):
    """
    Determine the max_distance setting for a given tool and fraction.
    
    Returns a string describing the max_distance setting:
    - For tools with fixed edit distance (indelfree, sassy): returns the numeric value
    - For bowtie1: always returns "3" (max supported)
    - For blastn/mmseqs2: returns descriptive text about their similarity thresholds
    - For other tools: returns "N/A"
    """
    # Determine the configured max_distance based on fraction size
    if fraction_float in [0.0005, 0.001, 0.005, 0.001]:
        configured_max_dist = 5
    elif fraction_float in [0.05, 0.1]:
        configured_max_dist = 3
    elif fraction_float == 1.0:
        configured_max_dist = 3
    else:
        configured_max_dist = None
    
    # Tool-specific handling
    if tool_name in ['indelfree_bruteforce', 'indelfree_indexed', 'sassy']:
        # These tools use the configured max_distance parameter
        return str(configured_max_dist) if configured_max_dist is not None else "N/A"
    elif tool_name == 'bowtie1':
        # bowtie1 always uses max_distance=3 (maximum supported with -v flag)
        return "3"
    elif tool_name == 'blastn':
        # blastn uses perc_identity and qcov_hsp_perc, not fixed edit distance
        return "perc_id=84%"
    elif tool_name == 'mmseqs2':
        # mmseqs2 uses min-seq-id, not fixed edit distance
        return "min-seq-id=85%"
    else:
        # Other tools (bowtie2, minimap2, strobealign, mummer4, x_mapper) don't use max_distance
        return "N/A"

# Apply the function to add max_distance column
log_files_df = log_files_df.with_columns([
    pl.struct(["tool_name", "fraction"]).map_elements(
        lambda x: get_max_distance(x["tool_name"], x["fraction"]),
        return_dtype=pl.Utf8
    ).alias("max_distance_setting")
])

print("Max distance settings by tool and fraction:")
max_dist_summary = log_files_df.group_by(["tool_name", "fraction", "max_distance_setting"]).agg([
    pl.len().alias("count")
]).sort(["tool_name", "fraction"])
print(max_dist_summary)

Max distance settings by tool and fraction:
shape: (71, 4)
┌──────────────────────┬──────────┬──────────────────────┬───────┐
│ tool_name            ┆ fraction ┆ max_distance_setting ┆ count │
│ ---                  ┆ ---      ┆ ---                  ┆ ---   │
│ str                  ┆ str      ┆ str                  ┆ u64   │
╞══════════════════════╪══════════╪══════════════════════╪═══════╡
│ blastn               ┆ 0.0005   ┆ perc_id=84%          ┆ 2     │
│ blastn               ┆ 0.001    ┆ perc_id=84%          ┆ 2     │
│ blastn               ┆ 0.005    ┆ perc_id=84%          ┆ 2     │
│ blastn               ┆ 0.01     ┆ perc_id=84%          ┆ 1     │
│ blastn               ┆ 0.05     ┆ perc_id=84%          ┆ 1     │
│ blastn               ┆ 0.1      ┆ perc_id=84%          ┆ 1     │
│ blastn               ┆ 1        ┆ perc_id=84%          ┆ 1     │
│ bowtie1              ┆ 0.0005   ┆ 3                    ┆ 1     │
│ bowtie1              ┆ 0.001    ┆ 3                    ┆ 1     │
│ b

## Determine max_distance settings for each tool

Based on the Prepare_all_jobs.ipynb notebook:
- Smaller fractions (0.0005, 0.001, 0.005, 0.01): max_distance=5
- Larger fractions (0.05, 0.1): max_distance=3
- Full dataset (1): max_distance=3

Tool-specific notes:
- **bowtie1**: Always uses max_distance=3 (maximum it supports with -v flag)
- **blastn**: Uses perc_identity=84 and qcov_hsp_perc=80, not a fixed edit distance
- **mmseqs2**: Uses min-seq-id=0.85, not a fixed edit distance
- **indelfree_bruteforce/indexed**: Use the configured max_distance (subs parameter)
- **sassy**: Uses the configured max_distance (-k parameter)

# Process jobs accounting for retries and fraction sizes

Key considerations:
- Multiple fraction sizes tested (0.001, 0.01, 0.1, etc.)
- Some jobs timed out and were resubmitted with "_long" suffix (may have different QOS/CPU allocation)
- For each tool-fraction combination, prefer completed jobs (prioritize _long if both exist)
- Track CPU allocations as they may differ between regular and _long jobs
- Report TIMEOUT only if all attempts failed for that tool-fraction

In [8]:
# Process log_files_df to handle regular and _long jobs
# First, identify which jobs are regular vs _long retries
log_files_df = log_files_df.with_columns([
    pl.col("log_file").str.contains("_long").alias("is_long_retry"),
    pl.col("tool_name").alias("base_tool_name"),  # already has _long stripped
    pl.col("fraction").cast(pl.Float64).alias("fraction_float")  # for sorting/plotting
])

# Check job states - categorize them
log_files_df = log_files_df.with_columns([
    pl.col("State").list.get(0).alias("state_first"),  # Get first state from list
])

# Mark jobs to exclude (OOM, CANCELLED, PREEMPTED) and timeout
exclude_states = ['OUT_OF_MEMORY', 'CANCELLED', 'PREEMPTED',"RUNNING"]
log_files_df = log_files_df.with_columns([
    pl.col("state_first").is_in(exclude_states).alias("should_exclude"),
    pl.col("state_first").str.contains("TIMEOUT").fill_null(False).alias("is_timeout"),
    pl.col("state_first").str.contains("COMPLETED").fill_null(False).alias("is_completed"),
    pl.col("state_first").str.contains("FAILED").fill_null(False).alias("is_failed"),
])

# For each (tool, fraction) pair, select the best job
# Priority: _long completed > regular completed > mark failed/timeout
job_selection = log_files_df.group_by(["base_tool_name", "fraction"]).agg([
    pl.all(),
])

def select_best_job(group_df):
    """Select the best job from regular/_long pair for each tool-fraction combination"""
    results = []
    
    for row in group_df.iter_rows(named=True):
        tool = row["base_tool_name"]
        fraction = row["fraction"]
        
        # Get all jobs for this tool-fraction pair
        is_long = row["is_long_retry"]
        should_exclude = row["should_exclude"]
        is_completed = row["is_completed"]
        is_timeout = row["is_timeout"]
        is_failed = row["is_failed"]
        jobids = row["BaseJobID"]
        elapsed = row["Elapsed_Seconds"]
        total_cpu = row.get("TotalCPU_Seconds", elapsed)  # fallback to elapsed if no TotalCPU
        maxrss = row["MaxRSS_MB"]
        reqmem = row["ReqMem_MB"]
        alloccpus = row["AllocCPUS"]
        max_distance = row["max_distance_setting"]
        
        # Filter out excluded jobs (OOM, CANCELLED, PREEMPTED)
        valid_indices = [i for i, excl in enumerate(should_exclude) if not excl]
        
        if not valid_indices:
            continue  # All jobs excluded, skip
        
        # Separate regular and long jobs
        regular_idx = [i for i in valid_indices if not is_long[i]]
        long_idx = [i for i in valid_indices if is_long[i]]
        
        # Selection priority:
        # 1. _long completed
        # 2. regular completed  
        # 3. Mark as FAILED/TIMEOUT if all attempts failed
        
        selected_idx = None
        status = None
        replaced_failed_job = False
        
        # Check _long jobs first
        if long_idx:
            long_i = long_idx[0]
            if is_completed[long_i]:
                selected_idx = long_i
                status = "COMPLETED"
                # Check if this replaced a failed regular job
                if regular_idx and (is_timeout[regular_idx[0]] or is_failed[regular_idx[0]]):
                    replaced_failed_job = True
            elif is_timeout[long_i] or is_failed[long_i]:
                # _long also failed
                selected_idx = long_i
                status = "FAILED" if is_failed[long_i] else "TIMEOUT"
        
        # If no _long or _long didn't work, try regular
        if selected_idx is None and regular_idx:
            reg_i = regular_idx[0]
            if is_completed[reg_i]:
                selected_idx = reg_i
                status = "COMPLETED"
            elif is_timeout[reg_i] or is_failed[reg_i]:
                if not long_idx:  # Only report failure if no _long exists
                    selected_idx = reg_i
                    status = "FAILED" if is_failed[reg_i] else "TIMEOUT"
        
        if selected_idx is not None:
            # Use TotalCPU_Seconds if available and non-zero, otherwise Elapsed_Seconds
            # For very fast jobs, Slurm reports TotalCPU as 0, so we need to use wall time
            cpu_time_raw = total_cpu[selected_idx] if isinstance(total_cpu, list) else elapsed[selected_idx]
            elapsed_time = elapsed[selected_idx]
            
            # If TotalCPU is 0 or very small (< 1 second), use Elapsed instead
            # This handles fast jobs where Slurm doesn't capture CPU time accurately
            if cpu_time_raw < 1:
                cpu_time = elapsed_time
            else:
                cpu_time = cpu_time_raw
            
            # Special case: blastn fraction_1 - extrapolate runtime
            # Resource tracking only captured up to spacer 1689771 of 3826979 total
            # Multiply observed runtime by 3826979/1689771 ≈ 2.266 to get full estimate
            blastn_extrapolation_factor = 3826979 / 1689771
            if tool == "blastn" and fraction == "1":
                cpu_time = cpu_time * blastn_extrapolation_factor
                elapsed_time = elapsed_time * blastn_extrapolation_factor
                # Mark as completed since we have valid extrapolated data
                if status in ["TIMEOUT", "FAILED"]:
                    status = "COMPLETED"
            
            results.append({
                "tool": tool,
                "fraction": fraction,
                "fraction_float": float(fraction) if fraction else 0,
                "BaseJobID": jobids[selected_idx],
                "status": status,
                "is_long_retry": is_long[selected_idx],
                "replaced_failed_job": replaced_failed_job,
                "Elapsed_Seconds": elapsed_time,
                "CPU_Time_Seconds": cpu_time,
                "MaxRSS_MB": maxrss[selected_idx],
                "ReqMem_MB": reqmem[selected_idx],
                "AllocCPUS": alloccpus[selected_idx],
                "Peak_Memory_Gb": maxrss[selected_idx] / 1024 if maxrss[selected_idx] else None,
                "CPU_Time_Hours": cpu_time / 3600,
                "Walltime_Hours": elapsed_time / 3600,
                "max_distance": max_distance[selected_idx] if isinstance(max_distance, list) else max_distance,
            })
    
    return pl.DataFrame(results) if results else pl.DataFrame()

# Apply selection
processed_jobs = select_best_job(job_selection)
processed_jobs = processed_jobs.sort(["tool", "fraction_float"])

print(f"Total jobs processed: {len(processed_jobs)}")
print(f"Unique tools: {processed_jobs['tool'].n_unique()}")
# print(f"Unique fractions: {sorted(processed_jobs['fraction'].unique().to_list())}")
print(f"\nJobs using _long retry: {processed_jobs['is_long_retry'].sum()}")
print(f"Jobs that replaced failed attempts: {processed_jobs['replaced_failed_job'].sum()}")

processed_jobs

Total jobs processed: 70
Unique tools: 11

Jobs using _long retry: 0
Jobs that replaced failed attempts: 0


tool,fraction,fraction_float,BaseJobID,status,is_long_retry,replaced_failed_job,Elapsed_Seconds,CPU_Time_Seconds,MaxRSS_MB,ReqMem_MB,AllocCPUS,Peak_Memory_Gb,CPU_Time_Hours,Walltime_Hours,max_distance
str,str,f64,str,str,bool,bool,i64,i64,f64,f64,i64,f64,f64,f64,str
"""blastn""","""0.0005""",0.0005,"""20668596""","""COMPLETED""",False,False,127207,329398,0.0,172032.0,38,,91.499444,35.335278,"""perc_id=84%"""
"""blastn""","""0.001""",0.001,"""20697880""","""COMPLETED""",False,False,1971,32472,646.699219,172032.0,38,0.631542,9.02,0.5475,"""perc_id=84%"""
"""blastn""","""0.005""",0.005,"""20697881""","""COMPLETED""",False,False,11469,161246,696.730469,172032.0,38,0.680401,44.790556,3.185833,"""perc_id=84%"""
"""blastn""","""0.01""",0.01,"""20697882""","""COMPLETED""",False,False,32403,341006,712.613281,172032.0,38,0.695911,94.723889,9.000833,"""perc_id=84%"""
"""blastn""","""0.05""",0.05,"""20697883""","""COMPLETED""",False,False,103560,2003888,1340.75,172032.0,38,1.309326,556.635556,28.766667,"""perc_id=84%"""
"""blastn""","""0.1""",0.1,"""20697884""","""COMPLETED""",False,False,528481,4089534,2888.320312,172032.0,38,2.820625,1135.981667,146.800278,"""perc_id=84%"""
"""bowtie1""","""0.0005""",0.0005,"""20668861""","""COMPLETED""",False,False,150,150,183.382812,172032.0,38,0.179085,0.041667,0.041667,"""3"""
"""bowtie1""","""0.001""",0.001,"""20668609""","""COMPLETED""",False,False,153,153,516.863281,172032.0,38,0.504749,0.0425,0.0425,"""3"""
"""bowtie1""","""0.005""",0.005,"""20668621""","""COMPLETED""",False,False,331,331,847.828125,172032.0,38,0.827957,0.091944,0.091944,"""3"""
"""bowtie1""","""0.01""",0.01,"""20668633""","""COMPLETED""",False,False,382,382,1254.332031,172032.0,38,1.224934,0.106111,0.106111,"""3"""


In [9]:
# Filter to only completed jobs for main analysis
completed_jobs = processed_jobs.filter(pl.col("status") == "COMPLETED")

# Add tool_display column that includes max_distance suffix for tools that use it
def get_tool_display_name(tool, max_dist):
    """Add suffix to tool names that have variable max_distance settings"""
    if tool in ['indelfree_bruteforce', 'indelfree_indexed', 'sassy']:
        # Extract numeric value if present
        if max_dist.isdigit():
            return f"{tool}_d{max_dist}"
    return tool

completed_jobs = completed_jobs.with_columns([
    pl.struct(["tool", "max_distance"]).map_elements(
        lambda x: get_tool_display_name(x["tool"], x["max_distance"]),
        return_dtype=pl.Utf8
    ).alias("tool_display")   
])

# Show summary of job statuses
print("Job Status Summary:")
status_summary = processed_jobs.group_by(["tool", "status"]).agg([
    pl.len().alias("count")
]).sort(["tool", "status"])
print(status_summary)

print(f"\nTotal completed jobs: {len(completed_jobs)}")
print(f"Total jobs with issues: {len(processed_jobs) - len(completed_jobs)}")

Job Status Summary:
shape: (13, 3)
┌──────────────────────┬───────────┬───────┐
│ tool                 ┆ status    ┆ count │
│ ---                  ┆ ---       ┆ ---   │
│ str                  ┆ str       ┆ u64   │
╞══════════════════════╪═══════════╪═══════╡
│ blastn               ┆ COMPLETED ┆ 6     │
│ bowtie1              ┆ COMPLETED ┆ 7     │
│ bowtie2              ┆ COMPLETED ┆ 7     │
│ indelfree_bruteforce ┆ COMPLETED ┆ 2     │
│ indelfree_bruteforce ┆ TIMEOUT   ┆ 2     │
│ indelfree_indexed    ┆ COMPLETED ┆ 6     │
│ indelfree_indexed    ┆ TIMEOUT   ┆ 1     │
│ minimap2             ┆ COMPLETED ┆ 7     │
│ mmseqs2              ┆ COMPLETED ┆ 7     │
│ mummer4              ┆ COMPLETED ┆ 7     │
│ sassy                ┆ COMPLETED ┆ 4     │
│ strobealign          ┆ COMPLETED ┆ 7     │
│ x_mapper             ┆ COMPLETED ┆ 7     │
└──────────────────────┴───────────┴───────┘

Total completed jobs: 67
Total jobs with issues: 3


In [10]:
# Display max_distance settings summary
print("Max Distance Settings Summary:")
max_dist_by_tool = completed_jobs.group_by(["tool", "tool_display", "max_distance"]).agg([
    pl.col("fraction").unique().sort().alias("fractions"),
    pl.len().alias("count")
]).sort("tool_display")
print(max_dist_by_tool)


Max Distance Settings Summary:
shape: (11, 5)
┌──────────────────────┬──────────────────────┬────────────────┬───────────────────────────────┬───────┐
│ tool                 ┆ tool_display         ┆ max_distance   ┆ fractions                     ┆ count │
│ ---                  ┆ ---                  ┆ ---            ┆ ---                           ┆ ---   │
│ str                  ┆ str                  ┆ str            ┆ list[str]                     ┆ u64   │
╞══════════════════════╪══════════════════════╪════════════════╪═══════════════════════════════╪═══════╡
│ blastn               ┆ blastn               ┆ perc_id=84%    ┆ ["0.0005", "0.001", … "0.1"]  ┆ 6     │
│ bowtie1              ┆ bowtie1              ┆ 3              ┆ ["0.0005", "0.001", … "1"]    ┆ 7     │
│ bowtie2              ┆ bowtie2              ┆ N/A            ┆ ["0.0005", "0.001", … "1"]    ┆ 7     │
│ indelfree_bruteforce ┆ indelfree_bruteforce ┆ N/A            ┆ ["0.0005", "0.001"]           ┆ 2     │
│ indelfr

Notes:
- indelfree_bruteforce, indelfree_indexed, sassy: Use configured edit/hamming distance
- bowtie1: Fixed at max_distance=3 (maximum supported by -v flag)
- blastn: Uses perc_identity=84% and qcov_hsp_perc=80% (not fixed edit distance)
- mmseqs2: Uses min-seq-id=85% (not fixed edit distance)
- Other tools: Do not use max_distance constraints


In [11]:
# Summary statistics by tool across all fractions
# Using Total CPU Time (TotalCPU) which already accounts for CPU allocation
tool_summary = completed_jobs.group_by("tool_display").agg([
    pl.col("tool").first().alias("base_tool"),
    pl.col("max_distance").unique().alias("max_distances"),
    pl.col("CPU_Time_Hours").mean().alias("Avg_CPU_Time_Hours"),
    pl.col("CPU_Time_Hours").min().alias("Min_CPU_Time_Hours"),
    pl.col("CPU_Time_Hours").max().alias("Max_CPU_Time_Hours"),
    pl.col("CPU_Time_Hours").sum().alias("Total_CPU_Time_Hours"),
    pl.col("Peak_Memory_Gb").mean().alias("Avg_Peak_Memory_Gb"),
    pl.col("Peak_Memory_Gb").max().alias("Max_Peak_Memory_Gb"),
    pl.col("AllocCPUS").unique().alias("CPU_Allocations_Used"),
    pl.col("is_long_retry").sum().alias("Num_Long_Retries"),
    pl.col("replaced_failed_job").sum().alias("Num_Replaced_Failures"),
    pl.len().alias("Num_Fractions")
]).sort("Avg_CPU_Time_Hours")

# Add note about varying CPU allocations
tool_summary = tool_summary.with_columns([
    (pl.col("CPU_Allocations_Used").list.len() > 1).alias("Variable_CPUs")
])


print("Note: CPU Time is total CPU time (from Slurm TotalCPU) - already accounts for # of CPUs used")

print(tool_summary)

# Show which tools had varying CPU allocations
varying_cpus = tool_summary.filter(pl.col("Variable_CPUs"))
if len(varying_cpus) > 0:
    print("\nTools with varying CPU allocations (likely due to _long retries):")
    for row in varying_cpus.iter_rows(named=True):
        print(f"  {row['tool']}: CPUs = {row['CPU_Allocations_Used']}")

tool_summary

Note: CPU Time is total CPU time (from Slurm TotalCPU) - already accounts for # of CPUs used
shape: (11, 14)
┌──────────────────────┬──────────────────────┬────────────────────┬────────────────────┬────────────────────┬────────────────────┬──────────────────────┬────────────────────┬────────────────────┬──────────────────────┬──────────────────┬───────────────────────┬───────────────┬───────────────┐
│ tool_display         ┆ base_tool            ┆ max_distances      ┆ Avg_CPU_Time_Hours ┆ Min_CPU_Time_Hours ┆ Max_CPU_Time_Hours ┆ Total_CPU_Time_Hours ┆ Avg_Peak_Memory_Gb ┆ Max_Peak_Memory_Gb ┆ CPU_Allocations_Used ┆ Num_Long_Retries ┆ Num_Replaced_Failures ┆ Num_Fractions ┆ Variable_CPUs │
│ ---                  ┆ ---                  ┆ ---                ┆ ---                ┆ ---                ┆ ---                ┆ ---                  ┆ ---                ┆ ---                ┆ ---                  ┆ ---              ┆ ---                   ┆ ---           ┆ ---           │
│ str 

tool_display,base_tool,max_distances,Avg_CPU_Time_Hours,Min_CPU_Time_Hours,Max_CPU_Time_Hours,Total_CPU_Time_Hours,Avg_Peak_Memory_Gb,Max_Peak_Memory_Gb,CPU_Allocations_Used,Num_Long_Retries,Num_Replaced_Failures,Num_Fractions,Variable_CPUs
str,str,list[str],f64,f64,f64,f64,f64,f64,list[i64],u64,u64,u64,bool
"""strobealign""","""strobealign""","[""N/A""]",0.067103,0.005,0.320833,0.469722,22.364977,116.668949,[38],0,0,7,False
"""mmseqs2""","""mmseqs2""","[""min-seq-id=85%""]",0.835595,0.065833,5.366667,5.849167,31.56556,141.762463,[38],0,0,7,False
"""x_mapper""","""x_mapper""","[""N/A""]",7.066429,0.025,45.495556,49.465,41.36002,244.277416,[38],0,0,7,False
"""mummer4""","""mummer4""","[""N/A""]",7.558492,0.006667,47.875556,52.909444,1.830369,9.023922,[38],0,0,7,False
"""bowtie1""","""bowtie1""","[""3""]",18.845238,0.041667,117.29,131.916667,23.040318,146.369663,[38],0,0,7,False
"""minimap2""","""minimap2""","[""N/A""]",81.428452,0.029167,517.35,569.999167,15.711505,90.691395,[38],0,0,7,False
"""indelfree_indexed""","""indelfree_indexed""","[""N/A""]",112.678611,2.877222,418.195556,676.071667,29.981812,40.75721,[38],0,0,6,False
"""bowtie2""","""bowtie2""","[""N/A""]",148.91381,0.0525,994.867222,1042.396667,24.839762,151.482643,[38],0,0,7,False
"""sassy""","""sassy""","[""N/A""]",178.843194,26.244444,439.597778,715.372778,11.115348,27.940441,[38],0,0,4,False
"""indelfree_bruteforce""","""indelfree_bruteforce""","[""N/A""]",284.449167,197.766667,371.131667,568.898333,24.926435,27.81665,[38],0,0,2,False


In [12]:
# Collect every job whose final status is anything other than COMPLETED
# (failed or timed out).
# NOTE(review): jobs still RUNNING are expected to be absent here; that would
# have to happen where `processed_jobs` is built — confirm.
not_completed = pl.col("status") != "COMPLETED"
failed_jobs = processed_jobs.filter(not_completed).sort(["tool", "fraction"])

n_failed = failed_jobs.height
if n_failed == 0:
    print("\n✓ All jobs completed successfully!")
else:
    print(f"FAILED/TIMEOUT JOBS: {n_failed} total")

    # Wall-clock seconds -> hours (2 decimals) so the table is easier to read
    elapsed_hours = (pl.col("Elapsed_Seconds") / 3600).round(2).alias("Time_Hours")
    failed_jobs_display = failed_jobs.with_columns([elapsed_hours]).select([
        "tool", "fraction", "status", "Time_Hours", "AllocCPUS", "BaseJobID"
    ])
    print(failed_jobs_display)

    # Number of non-completed jobs per tool, worst offender first
    failure_summary = (
        failed_jobs
        .group_by(["tool"])
        .agg([pl.len().alias("num_failures")])
        .sort("num_failures", descending=True)
    )
    print("\nFailure Summary by Tool:")
    print(failure_summary)

failed_jobs

FAILED/TIMEOUT JOBS: 3 total
shape: (3, 6)
┌──────────────────────┬──────────┬─────────┬────────────┬───────────┬───────────┐
│ tool                 ┆ fraction ┆ status  ┆ Time_Hours ┆ AllocCPUS ┆ BaseJobID │
│ ---                  ┆ ---      ┆ ---     ┆ ---        ┆ ---       ┆ ---       │
│ str                  ┆ str      ┆ str     ┆ f64        ┆ i64       ┆ str       │
╞══════════════════════╪══════════╪═════════╪════════════╪═══════════╪═══════════╡
│ indelfree_bruteforce ┆ 0.005    ┆ TIMEOUT ┆ 216.01     ┆ 38        ┆ 20668623  │
│ indelfree_bruteforce ┆ 0.01     ┆ TIMEOUT ┆ 216.03     ┆ 38        ┆ 20668635  │
│ indelfree_indexed    ┆ 1        ┆ TIMEOUT ┆ 216.0      ┆ 38        ┆ 20668667  │
└──────────────────────┴──────────┴─────────┴────────────┴───────────┴───────────┘

Failure Summary by Tool:
shape: (2, 2)
┌──────────────────────┬──────────────┐
│ tool                 ┆ num_failures │
│ ---                  ┆ ---          │
│ str                  ┆ u64          │
╞═════════

tool,fraction,fraction_float,BaseJobID,status,is_long_retry,replaced_failed_job,Elapsed_Seconds,CPU_Time_Seconds,MaxRSS_MB,ReqMem_MB,AllocCPUS,Peak_Memory_Gb,CPU_Time_Hours,Walltime_Hours,max_distance
str,str,f64,str,str,bool,bool,i64,i64,f64,f64,i64,f64,f64,f64,str
"""indelfree_bruteforce""","""0.005""",0.005,"""20668623""","""TIMEOUT""",False,False,777636,6498920,28696.960938,172032.0,38,28.024376,1805.255556,216.01,"""N/A"""
"""indelfree_bruteforce""","""0.01""",0.01,"""20668635""","""TIMEOUT""",False,False,777693,8055988,30156.121094,172032.0,38,29.449337,2237.774444,216.025833,"""N/A"""
"""indelfree_indexed""","""1""",1.0,"""20668667""","""TIMEOUT""",False,False,777612,11044956,92536.476562,256000.0,38,90.367653,3068.043333,216.003333,"""N/A"""


# Failed/Timeout Jobs Report
List all jobs that timed out or failed

In [13]:
# Per-job resource table: one row per completed job, ordered by tool label and
# then by the numeric fraction.
detail_columns = [
    "tool_display", "tool", "fraction", "fraction_float", "max_distance", "AllocCPUS",
    "CPU_Time_Hours", "Peak_Memory_Gb",
    "is_long_retry", "replaced_failed_job",
]
performance_by_fraction = (
    completed_jobs
    .select(detail_columns)
    .sort(["tool_display", "fraction_float"])
)

print("DETAILED PERFORMANCE: CPU Time × Memory by Tool and Fraction")
print("Note: CPU Time = Total CPU time from Slurm (already accounts for # CPUs)")
print(performance_by_fraction)

# Compact summary of how resource usage changes with fraction size.
# Each output column is the first value of the source column within a
# (tool_display, fraction) group; keys are source names, values are output names.
summary_columns = {
    "tool": "base_tool",
    "max_distance": "max_dist",
    "CPU_Time_Hours": "CPU_Time_Hrs",
    "Peak_Memory_Gb": "Memory_GB",
    "AllocCPUS": "CPUs",
    "replaced_failed_job": "Replaced_Failed",
}
scaling_summary = (
    completed_jobs
    .group_by(["tool_display", "fraction"])
    .agg([pl.col(source).first().alias(target) for source, target in summary_columns.items()])
    .sort(["tool_display", "fraction"])
)

print("RESOURCE SCALING BY FRACTION SIZE")
print(scaling_summary)

# Jobs flagged as having replaced an earlier failed attempt
replaced = (
    completed_jobs
    .filter(pl.col("replaced_failed_job"))
    .select(["tool", "fraction", "AllocCPUS", "is_long_retry"])
)
if replaced.height > 0:
    print(f"\n{replaced.height} jobs successfully completed after initial failure:")
    print(replaced)

performance_by_fraction

DETAILED PERFORMANCE: CPU Time × Memory by Tool and Fraction
Note: CPU Time = Total CPU time from Slurm (already accounts for # CPUs)
shape: (67, 10)
┌──────────────────────┬──────────────────────┬──────────┬────────────────┬────────────────┬───────────┬────────────────┬────────────────┬───────────────┬─────────────────────┐
│ tool_display         ┆ tool                 ┆ fraction ┆ fraction_float ┆ max_distance   ┆ AllocCPUS ┆ CPU_Time_Hours ┆ Peak_Memory_Gb ┆ is_long_retry ┆ replaced_failed_job │
│ ---                  ┆ ---                  ┆ ---      ┆ ---            ┆ ---            ┆ ---       ┆ ---            ┆ ---            ┆ ---           ┆ ---                 │
│ str                  ┆ str                  ┆ str      ┆ f64            ┆ str            ┆ i64       ┆ f64            ┆ f64            ┆ bool          ┆ bool                │
╞══════════════════════╪══════════════════════╪══════════╪════════════════╪════════════════╪═══════════╪════════════════╪════════════════╪════

tool_display,tool,fraction,fraction_float,max_distance,AllocCPUS,CPU_Time_Hours,Peak_Memory_Gb,is_long_retry,replaced_failed_job
str,str,str,f64,str,i64,f64,f64,bool,bool
"""blastn""","""blastn""","""0.0005""",0.0005,"""perc_id=84%""",38,91.499444,,False,False
"""blastn""","""blastn""","""0.001""",0.001,"""perc_id=84%""",38,9.02,0.631542,False,False
"""blastn""","""blastn""","""0.005""",0.005,"""perc_id=84%""",38,44.790556,0.680401,False,False
"""blastn""","""blastn""","""0.01""",0.01,"""perc_id=84%""",38,94.723889,0.695911,False,False
"""blastn""","""blastn""","""0.05""",0.05,"""perc_id=84%""",38,556.635556,1.309326,False,False
"""blastn""","""blastn""","""0.1""",0.1,"""perc_id=84%""",38,1135.981667,2.820625,False,False
"""bowtie1""","""bowtie1""","""0.0005""",0.0005,"""3""",38,0.041667,0.179085,False,False
"""bowtie1""","""bowtie1""","""0.001""",0.001,"""3""",38,0.0425,0.504749,False,False
"""bowtie1""","""bowtie1""","""0.005""",0.005,"""3""",38,0.091944,0.827957,False,False
"""bowtie1""","""bowtie1""","""0.01""",0.01,"""3""",38,0.106111,1.224934,False,False


In [None]:
from lets_plot import *
import json

# Read the per-tool plot styling (colour and lets-plot shape number) from JSON
with open("notebooks/tool_styles.json", "r") as styles_file:
    tool_styles = json.load(styles_file)

# Render lets-plot figures as HTML
LetsPlot.setup_html()

tool_colors = {name: spec['color'] for name, spec in tool_styles.items()}
tool_shapes = {name: spec['letsplot_shape_num'] for name, spec in tool_styles.items()}

# Sorted tool labels and numeric fractions present in the completed jobs
tools_list = sorted(completed_jobs["tool_display"].unique().to_list())
fractions_list = sorted(completed_jobs["fraction_float"].unique().to_list())

# fraction_float -> (num_contigs, total_size_Mbp); used to attach dataset size to each job
contig_info = {
    record['fraction_float']: (record['num_contigs'], record['total_size_Mbp'])
    for record in contig_counts_df.iter_rows(named=True)
}


def _contig_info_field(position):
    """Build a row-wise lookup returning item `position` of contig_info[fraction_float] (None if absent)."""
    def lookup(row):
        return contig_info.get(row['fraction_float'], (None, None))[position]
    return lookup


def _text_before_last_d_suffix(label):
    """Return the text before the last '_d' in `label`, or `label` itself when '_d' is absent."""
    return label.rsplit('_d', 1)[0] if '_d' in label else label


# Dataset size / contig count per job, then the base tool name used for colours and shapes
plot_data = (
    completed_jobs
    .with_columns([
        pl.struct(['fraction_float'])
          .map_elements(_contig_info_field(1), return_dtype=pl.Float64)
          .alias("total_size_Mbp"),
        pl.struct(['fraction_float'])
          .map_elements(_contig_info_field(0), return_dtype=pl.Int64)
          .alias("num_contigs_val"),
    ])
    .with_columns([
        pl.col("tool_display")
          .map_elements(_text_before_last_d_suffix, return_dtype=pl.Utf8)
          .alias("base_tool"),
    ])
)

# Plot 1: total CPU time against dataset size, both axes on a log10 scale
plot1 = (
    ggplot(
        plot_data,
        aes(x='total_size_Mbp', y='CPU_Time_Hours', color='base_tool', group='tool_display'),
    )
    + geom_line(size=1.5)
    + geom_point(aes(shape='base_tool'), size=4)
    + scale_color_manual(values=tool_colors, name='Tool')
    + scale_shape_manual(values=tool_shapes, name='Tool')
    + scale_x_log10()
    + scale_y_log10()
    + labs(x='Total Dataset Size (Mbp)', y='Total CPU Time (hours)', title='CPU Time Scaling')
    + ggsize(600, 450)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold'),
    )
)

plot1


In [15]:
# Plot 2: peak memory against dataset size (log-log).
# Jobs without a recorded peak memory are dropped before plotting.
plot2 = (
    ggplot(
        plot_data.filter(pl.col('Peak_Memory_Gb').is_not_null()),
        aes(x='total_size_Mbp', y='Peak_Memory_Gb', color='base_tool', group='tool_display'),
    )
    + geom_line(size=1.5)
    + geom_point(aes(shape='base_tool'), size=4)
    + scale_color_manual(values=tool_colors, name='Tool')
    + scale_shape_manual(values=tool_shapes, name='Tool')
    + scale_x_log10()
    + scale_y_log10()
    + labs(x='Total Dataset Size (Mbp)', y='Peak Memory (GB)', title='Memory Scaling')
    + ggsize(600, 450)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold'),
    )
)

plot2

In [45]:
# Plot 3: peak memory against CPU time, with marker size encoding dataset size
plot_data_scatter = plot_data.filter(pl.col('Peak_Memory_Gb').is_not_null())

# Rank-based sizing for a more balanced look: each distinct dataset size is
# mapped to its 1-based rank so the point sizes are evenly spaced.
actual_sizes_sorted = sorted(plot_data_scatter['total_size_Mbp'].unique().to_list())
size_to_rank = dict(zip(actual_sizes_sorted, range(1, len(actual_sizes_sorted) + 1)))


def _rank_for_size(size_mbp):
    """Return the 1-based rank of a dataset size (1 when the size is not in the table)."""
    return size_to_rank.get(size_mbp, 1)


plot_data_scatter = plot_data_scatter.with_columns([
    pl.col('total_size_Mbp')
      .map_elements(_rank_for_size, return_dtype=pl.Int64)
      .alias('size_rank')
])

plot3 = (
    ggplot(plot_data_scatter, aes(x='CPU_Time_Hours', y='Peak_Memory_Gb', color='base_tool'))
    + geom_point(aes(shape='base_tool', size='size_rank'), alpha=0.4)
    + scale_color_manual(values=tool_colors, name='Tool')
    + scale_shape_manual(values=tool_shapes, name='Tool')
    # Legend breaks are the ranks; their labels show the real sizes in Mbp
    + scale_size(
        range=[4, 12],
        breaks=list(size_to_rank.values()),
        labels=[f'{s:.1f}' for s in actual_sizes_sorted],
        name='Dataset Size (Mbp)',
    )
    + scale_x_log10()
    + scale_y_log10()
    + labs(
        x='Total CPU Time (hours)',
        y='Peak Memory (GB)',
        title='Memory vs CPU Time (marker size ∝ dataset size)',
    )
    + ggsize(800, 650)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold'),
    )
)

# Export the figure as SVG and HTML, then display it
plot3.to_svg("./results/real_data/plots/resource_usage_real_cputime_vs_memory_with_size.svg")
plot3.to_html("./results/real_data/plots/resource_usage_real_cputime_vs_memory_with_size.html")
plot3

In [17]:
# Create enhanced figure with max_distance dimension
# Simplify max_distance to just N/A, 3, and 5

def simplify_max_distance(max_dist):
    """Collapse a max_distance label to one of 'N/A', '3' or '5'.

    The strings '3' and '5' are returned unchanged; every other value
    (descriptive thresholds, None, non-string values, ...) becomes 'N/A'.
    """
    return max_dist if max_dist in ('3', '5') else 'N/A'

# Attach the collapsed max-distance label ('N/A', '3' or '5') to every completed job
completed_jobs_plot = completed_jobs.with_columns([
    pl.col("max_distance")
      .map_elements(simplify_max_distance, return_dtype=pl.Utf8)
      .alias("max_dist_simple")
])


def _dataset_size_mbp(row):
    """Look up total_size_Mbp for the row's fraction_float in contig_info (None if absent)."""
    return contig_info.get(row['fraction_float'], (None, None))[1]


def _dataset_num_contigs(row):
    """Look up num_contigs for the row's fraction_float in contig_info (None if absent)."""
    return contig_info.get(row['fraction_float'], (None, None))[0]


def _tool_before_d_suffix(label):
    """Return the text before the last '_d' in `label`, or `label` itself when '_d' is absent."""
    return label.rsplit('_d', 1)[0] if '_d' in label else label


# Dataset size, contig count and base tool per job, plus a combined
# "<tool> (d=<max distance>)" label that groups the lines in the plots.
plot_data_maxdist = (
    completed_jobs_plot
    .with_columns([
        pl.struct(['fraction_float'])
          .map_elements(_dataset_size_mbp, return_dtype=pl.Float64)
          .alias("total_size_Mbp"),
        pl.struct(['fraction_float'])
          .map_elements(_dataset_num_contigs, return_dtype=pl.Int64)
          .alias("num_contigs_val"),
        pl.col("tool_display")
          .map_elements(_tool_before_d_suffix, return_dtype=pl.Utf8)
          .alias("base_tool"),
    ])
    .with_columns([
        (pl.col('tool_display') + ' (d=' + pl.col('max_dist_simple') + ')').alias('tool_maxdist_label')
    ])
)

# Plot 1: CPU time against dataset size; colour = tool, point shape = max distance
plot_maxdist_1 = (
    ggplot(
        plot_data_maxdist,
        aes(x='total_size_Mbp', y='CPU_Time_Hours', color='base_tool', group='tool_maxdist_label'),
    )
    + geom_line(size=1, alpha=0.6)
    + geom_point(aes(shape='max_dist_simple'), size=4, alpha=0.9)
    + scale_color_manual(values=tool_colors, name='Tool')
    + scale_shape_manual(
        values={'N/A': 19, '3': 17, '5': 15},
        name='Max Distance',
        labels={'N/A': 'N/A', '3': '3', '5': '5'},
    )
    + scale_x_log10()
    + scale_y_log10()
    + labs(
        x='Total Dataset Size (Mbp)',
        y='Total CPU Time (hours)',
        title='CPU Time Scaling by Max Distance',
    )
    + ggsize(700, 500)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold'),
    )
)

plot_maxdist_1

In [18]:
# Plot 2: peak memory against dataset size; colour = tool, point shape = max distance.
# Jobs without a recorded peak memory are dropped before plotting.
plot_maxdist_2 = (
    ggplot(
        plot_data_maxdist.filter(pl.col('Peak_Memory_Gb').is_not_null()),
        aes(x='total_size_Mbp', y='Peak_Memory_Gb', color='base_tool', group='tool_maxdist_label'),
    )
    + geom_line(size=1, alpha=0.6)
    + geom_point(aes(shape='max_dist_simple'), size=4, alpha=0.9)
    + scale_color_manual(values=tool_colors, name='Tool')
    + scale_shape_manual(
        values={'N/A': 19, '3': 17, '5': 15},
        name='Max Distance',
        labels={'N/A': 'N/A', '3': '3', '5': '5'},
    )
    + scale_x_log10()
    + scale_y_log10()
    + labs(
        x='Total Dataset Size (Mbp)',
        y='Peak Memory (GB)',
        title='Memory Scaling by Max Distance',
    )
    + ggsize(700, 500)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold'),
    )
)

plot_maxdist_2

In [None]:
# Plot 3: Memory vs CPU Time (max_distance by shape, dataset size by marker size)
plot_data_maxdist_scatter = plot_data_maxdist.filter(pl.col('Peak_Memory_Gb').is_not_null())

# Normalize dataset size for marker sizing: log10(size) is mapped linearly onto
# marker sizes 3 (smallest dataset) .. 10 (largest dataset).
min_size_mbp = plot_data_maxdist_scatter['total_size_Mbp'].min()
max_size_mbp = plot_data_maxdist_scatter['total_size_Mbp'].max()

log_min_size = pl.lit(min_size_mbp).log10()
log_size_span = pl.lit(max_size_mbp).log10() - log_min_size

plot_data_maxdist_scatter = plot_data_maxdist_scatter.with_columns([
    pl.when(log_size_span > 0)
      .then(3 + (pl.col('total_size_Mbp').log10() - log_min_size) / log_size_span * 7)
      # With a single distinct dataset size the span is 0 and the formula above
      # would evaluate 0/0 (NaN marker sizes); use the middle of the 3..10 range.
      .when(pl.col('total_size_Mbp').is_not_null())
      .then(pl.lit(6.5))
      # Rows without a known dataset size keep a null marker size, as before.
      .otherwise(None)
      .alias('size_normalized')
])

plot_maxdist_3 = (
    ggplot(plot_data_maxdist_scatter, 
           aes(x='CPU_Time_Hours', y='Peak_Memory_Gb', color='base_tool')) +
    geom_point(aes(shape='max_dist_simple', size='size_normalized'), alpha=0.7) +
    scale_color_manual(values=tool_colors, name='Tool') +
    scale_shape_manual(values={'N/A': 19, '3': 17, '5': 15}, 
                      name='Max Distance',
                      labels={'N/A': 'N/A', '3': '3', '5': '5'}) +
    # size_normalized already holds the final marker sizes, so use them as-is
    scale_size_identity() +
    scale_x_log10() +
    scale_y_log10() +
    labs(
        x='Total CPU Time (hours)',
        y='Peak Memory (GB)',
        title='Memory vs CPU Time (size ∝ dataset, shape = max_dist)'
    ) +
    ggsize(700, 500) +
    theme_minimal() +
    theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold')
    )
)

# Export the figure as SVG and HTML, then display it
plot_maxdist_3.to_svg("./results/real_data/plots/resource_usage_real_cputime_vs_memory_with_maxdist.svg")
plot_maxdist_3.to_html("./results/real_data/plots/resource_usage_real_cputime_vs_memory_with_maxdist.html")
plot_maxdist_3
### TODO: Remove timing in the above plot for jobs that timed out (TIMEOUT)
### TODO: match the marker sizes to the shapes shown in the legend
### TODO: Add another dimension (so 3D) with the max (edit/hamming) distance set for the tools at that run (if set, so only for indelfrees and sassy really)

### NOTE ABOUT MEMORY:
The peak memory reported by Slurm is not necessarily the minimum memory required to run the job: the Java tools (indelfree) will use whatever memory is available and only free it when needed (according to Brian).

In [20]:
# # Debug: Check which tools have data for which fraction sizes
# print("Tools and fraction sizes in completed_jobs:")
# tool_fraction_matrix = completed_jobs.group_by(["tool", "fraction"]).agg([
#     pl.len().alias("count")
# ]).sort(["tool", "fraction"])
# print(tool_fraction_matrix)

# # Pivot to see which combinations are missing
# pivot = completed_jobs.pivot(values="CPU_Time_Hours", index="tool", on="fraction")
# print("\nPivot view (showing which tool-fraction combinations exist):")
# print(pivot)

## Part 2: of the simulated runs (doesn't include all tools, for the fully simulated and semi-synthetic datasets)
Actual job preparation is in Prepare_all_jobs.ipynb

For simulated runs, we analyze resource usage across different simulation scales (various combinations of number of spacers and contigs). Each simulation was run with multiple tools on Slurm.

**Simulated datasets:**
- `ns_50000_nc_5000`: 50k spacers × 5k contigs
- `ns_75000_nc_5000`: 75k spacers × 5k contigs  
- `ns_100000_nc_10000`: 100k spacers × 10k contigs
- `ns_100000_nc_20000`: 100k spacers × 20k contigs
- `ns_500000_nc_100000`: 500k spacers × 100k contigs
- `ns_500_nc_5000_HIGH_INSERTION_RATE`: 500 spacers × 5k contigs (high insertion rate)
- `ns_100_nc_50000`: 100 spacers × 50k contigs
- `ns_3826979_nc_421431_real_baseline`: Real baseline (3.8M spacers × 421k contigs)

Each simulation directory contains slurm logs in `slurm_logs/` subdirectory.

In [21]:
# Get all simulated run directories
import glob
import os
os.chdir('/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_bench/')
from bench import *
from bench.utils.functions import *
from bench.utils.pyseff import *
import polars as pl
pl.Config(tbl_rows=110)

simulated_base_dir = "results/simulated"

# Every directory (not plain file) named ns_* under the base directory, sorted by path
simulation_dirs = sorted(
    filter(os.path.isdir, glob.glob(os.path.join(simulated_base_dir, "ns_*")))
)

# Report what was found, one directory name per line
print(f"Found {len(simulation_dirs)} simulation directories:")
for sim_dir in simulation_dirs:
    sim_name = os.path.basename(sim_dir)
    print(f"  - {sim_name}")

Found 8 simulation directories:
  - ns_100000_nc_10000
  - ns_100000_nc_20000
  - ns_100_nc_50000
  - ns_3826979_nc_421431_real_baseline
  - ns_500000_nc_100000
  - ns_50000_nc_5000
  - ns_500_nc_5000_HIGH_INSERTION_RATE
  - ns_75000_nc_5000


In [22]:
# Collect all slurm log files from simulated runs
log_files_simulated = []
for sim_dir in simulation_dirs:
    slurm_log_dir = os.path.join(sim_dir, "slurm_logs")
    if os.path.exists(slurm_log_dir):
        # glob returns files in filesystem order; sort so that the row order of
        # the resulting table is reproducible between runs.
        log_files = sorted(glob.glob(os.path.join(slurm_log_dir, "*.out")))
        log_files_simulated.extend(log_files)

print(f"Total slurm log files found: {len(log_files_simulated)}")

# Parse log files to extract simulation name, tool name, and job ID.
# Log files are named <tool>-<jobid>.out inside <simulation>/slurm_logs/.
# The explicit schema keeps `log_file` a string column even when no log files
# were found, so the string expressions below still apply to an empty table.
log_files_sim_df = pl.DataFrame(
    {"log_file": log_files_simulated},
    schema={"log_file": pl.Utf8},
).with_columns([
    pl.col("log_file").str.extract(r"-(\d+)\.out").alias("BaseJobID"),
    pl.col("log_file").str.extract(r"/(ns_[^/]+)/").alias("simulation"),
    pl.col("log_file").str.extract(r"slurm_logs/(.+)-\d+\.out")
        # Normalise "mmseqs" to "mmseqs2". The optional trailing 2 in the
        # pattern keeps a name that already reads "mmseqs2" from becoming "mmseqs22".
        .str.replace_all(r"mmseqs2?", "mmseqs2")
        .alias("tool_name")
])

print(f"\nParsed {len(log_files_sim_df)} log files")
print(f"Unique simulations: {log_files_sim_df['simulation'].n_unique()}")
print(f"Unique tools: {log_files_sim_df['tool_name'].n_unique()}")
print("\nSample of parsed data:")
log_files_sim_df.head(10)

Total slurm log files found: 79

Parsed 79 log files
Unique simulations: 7
Unique tools: 11

Sample of parsed data:


log_file,BaseJobID,simulation,tool_name
str,str,str,str
"""results/simulated/ns_100000_nc_10000/slurm_logs/blastn-20697887.out""","""20697887""","""ns_100000_nc_10000""","""blastn"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/bowtie1-20697889.out""","""20697889""","""ns_100000_nc_10000""","""bowtie1"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/bowtie1-20704670.out""","""20704670""","""ns_100000_nc_10000""","""bowtie1"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/bowtie2-20697890.out""","""20697890""","""ns_100000_nc_10000""","""bowtie2"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/indelfree_bruteforce-20697892.out""","""20697892""","""ns_100000_nc_10000""","""indelfree_bruteforce"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/indelfree_indexed-20697894.out""","""20697894""","""ns_100000_nc_10000""","""indelfree_indexed"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/minimap2-20697895.out""","""20697895""","""ns_100000_nc_10000""","""minimap2"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/mmseqs-20697897.out""","""20697897""","""ns_100000_nc_10000""","""mmseqs2"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/mummer4-20697898.out""","""20697898""","""ns_100000_nc_10000""","""mummer4"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/sassy-20697900.out""","""20697900""","""ns_100000_nc_10000""","""sassy"""


In [23]:
# Get sacct data for simulated runs (no filtering or efficiency calculation)
sacct_df_sim = pyseff(remove_cancelled=False, remove_failed=False, 
                      calculate_cpu_efficiency=False, calculate_memory_efficiency=False)

# Clean tool names (mirrors Part 1): drop the "_long" retry suffix and normalise
# "mmseqs" to "mmseqs2". The optional trailing 2 in the pattern keeps a job
# already named "mmseqs2" from turning into "mmseqs22".
sacct_df_sim = sacct_df_sim.with_columns(
    pl.col("JobName").str.replace_all("_long", "").str.replace_all(r"mmseqs2?", "mmseqs2")
)

# Join with log files to get simulation context.
# The inner join keeps only log files whose job ID is present in the sacct table.
# NOTE: this overwrites log_files_sim_df, so re-running this cell without
# re-running the previous one joins the sacct columns a second time.
log_files_sim_df = log_files_sim_df.join(sacct_df_sim, on="BaseJobID", how="inner")

print(f"Matched {len(log_files_sim_df)} jobs with sacct data")
log_files_sim_df

Matched 79 jobs with sacct data


log_file,BaseJobID,simulation,tool_name,JobName,AllocCPUS,State,ExitCode,MaxRSS,ReqMem,Elapsed_Seconds,TotalCPU_Seconds,MaxRSS_MB,ReqMem_MB,Elapsed,TotalCPU
str,str,str,str,str,i64,list[str],list[str],str,str,u32,u32,f64,f64,str,str
"""results/simulated/ns_500000_nc_100000/slurm_logs/minimap2-20697921.out""","""20697921""","""ns_500000_nc_100000""","""minimap2""","""minimap2""",38,"[""COMPLETED""]","[""0:0""]","""93945328K""","""250G""",11595,253236,91743.484375,256000.0,"""03:13:15""","""2-22:20:36"""
"""results/simulated/ns_75000_nc_5000/slurm_logs/mummer4-20697944.out""","""20697944""","""ns_75000_nc_5000""","""mummer4""","""mummer4""",38,"[""COMPLETED""]","[""0:0""]","""276840K""","""168G""",162,0,270.351562,172032.0,"""00:02:42""","""00:00:00"""
"""results/simulated/ns_500000_nc_100000/slurm_logs/mummer4-20697923.out""","""20697923""","""ns_500000_nc_100000""","""mummer4""","""mummer4""",38,"[""COMPLETED""]","[""0:0""]","""1986828K""","""250G""",11754,52526,1940.261719,256000.0,"""03:15:54""","""14:35:26"""
"""results/simulated/ns_500_nc_5000_HIGH_INSERTION_RATE/slurm_logs/minimap2-20697873.out""","""20697873""","""ns_500_nc_5000_HIGH_INSERTION_RATE""","""minimap2""","""minimap2""",38,"[""COMPLETED""]","[""0:0""]","""2816332K""","""168G""",48,0,2750.324219,172032.0,"""00:00:48""","""00:00:00"""
"""results/simulated/ns_50000_nc_5000/slurm_logs/sassy-20697934.out""","""20697934""","""ns_50000_nc_5000""","""sassy""","""sassy""",38,"[""COMPLETED""]","[""0:0""]","""62544K""","""168G""",3813,65444,61.078125,172032.0,"""01:03:33""","""18:10:44"""
"""results/simulated/ns_500_nc_5000_HIGH_INSERTION_RATE/slurm_logs/indelfree_bruteforce-20697871.out""","""20697871""","""ns_500_nc_5000_HIGH_INSERTION_RATE""","""indelfree_bruteforce""","""indelfree_bruteforce""",38,"[""COMPLETED""]","[""0:0""]","""28804280K""","""168G""",648,0,28129.179688,172032.0,"""00:10:48""","""00:00:00"""
"""results/simulated/ns_50000_nc_5000/slurm_logs/mmseqs-20697932.out""","""20697932""","""ns_50000_nc_5000""","""mmseqs2""","""mmseqs2""",38,"[""COMPLETED""]","[""0:0""]","""11834740K""","""168G""",87,0,11557.363281,172032.0,"""00:01:27""","""00:00:00"""
"""results/simulated/ns_3826979_nc_421431_real_baseline/slurm_logs/mmseqs-20697705.out""","""20697705""","""ns_3826979_nc_421431_real_baseline""","""mmseqs2""","""mmseqs2""",38,"[""COMPLETED""]","[""0:0""]","""180455348K""","""250G""",2590,23564,176225.925781,256000.0,"""00:43:10""","""06:32:44"""
"""results/simulated/ns_75000_nc_5000/slurm_logs/blastn-20697937.out""","""20697937""","""ns_75000_nc_5000""","""blastn""","""blastn""",38,"[""COMPLETED""]","[""0:0""]","""724744K""","""168G""",1086,17576,707.757812,172032.0,"""00:18:06""","""04:52:56"""
"""results/simulated/ns_100000_nc_10000/slurm_logs/x_mapper-20697903.out""","""20697903""","""ns_100000_nc_10000""","""x_mapper""","""x_mapper""",38,"[""COMPLETED""]","[""0:0""]","""23013868K""","""250G""",294,0,22474.480469,256000.0,"""00:04:54""","""00:00:00"""


In [24]:
# Get dataset sizes for each simulation
def _count_sequences_and_bases(fasta_path):
    """Return ``(num_records, total_bp)`` for a FASTA file.

    A missing file is counted as empty, ``(0, 0)``, but a warning is emitted so
    that a wrong path does not silently show up as a zero-sized dataset.
    Records are read with ``read_fasta_needletail`` into ``seqid``/``seq``
    string columns; only the sequence lengths are kept.
    """
    if not os.path.exists(fasta_path):
        import warnings

        warnings.warn(
            f"FASTA file not found, counting it as empty: {fasta_path}",
            stacklevel=3,  # attribute the warning to the caller of get_simulation_stats
        )
        return 0, 0

    lengths = (
        pl.DataFrame(read_fasta_needletail(fasta_path), schema={'seqid': pl.Utf8, 'seq': pl.Utf8})
        .with_columns(pl.col("seq").str.len_chars().alias("length"))
        .drop("seq")  # sequences themselves are not needed once measured
    )
    return lengths.height, lengths["length"].sum()


def get_simulation_stats(sim_dir, 
                         spacers_filename=None, 
                         contigs_filename=None):
    """Extract number and total size of spacers and contigs for one simulation.

    Args:
        sim_dir: Simulation directory; by default the FASTA files are read from
            ``<sim_dir>/simulated_data/simulated_spacers.fa`` and
            ``<sim_dir>/simulated_data/simulated_contigs.fa``.
        spacers_filename: Optional path to use instead of the default spacer FASTA.
        contigs_filename: Optional path to use instead of the default contig FASTA.

    Returns:
        dict with ``num_spacers``, ``total_spacer_bp``, ``total_spacer_Mbp``,
        ``num_contigs``, ``total_contig_bp``, ``total_contig_Mbp`` and
        ``search_space_size`` (``num_spacers * num_contigs``). A missing file
        contributes zeros (and triggers a warning).
    """
    sim_data_dir = os.path.join(sim_dir, "simulated_data")

    spacer_file = spacers_filename if spacers_filename is not None else os.path.join(sim_data_dir, "simulated_spacers.fa")
    contig_file = contigs_filename if contigs_filename is not None else os.path.join(sim_data_dir, "simulated_contigs.fa")

    num_spacers, total_spacer_bp = _count_sequences_and_bases(spacer_file)
    num_contigs, total_contig_bp = _count_sequences_and_bases(contig_file)

    return {
        "num_spacers": num_spacers,
        "total_spacer_bp": total_spacer_bp,
        "total_spacer_Mbp": total_spacer_bp / 1_000_000,
        "num_contigs": num_contigs,
        "total_contig_bp": total_contig_bp,
        "total_contig_Mbp": total_contig_bp / 1_000_000,
        "search_space_size": num_spacers * num_contigs
    }

# Keyword overrides for get_simulation_stats, keyed by simulation name
stats_overrides = {
    "ns_3826979_nc_421431_real_baseline": {
        # Special case: different spacer filename. Probably not necessary, as
        # the subsample would have made a copy of this file into simulated_data.
        "spacers_filename": "imgvr4_data/spacers/iphop_filtered_spacers.fna",
    },
}

# One stats record per simulation directory, tagged with the simulation name
sim_stats_list = []
for sim_dir in simulation_dirs:
    sim_name = os.path.basename(sim_dir)
    stats = get_simulation_stats(sim_dir, **stats_overrides.get(sim_name, {}))
    stats["simulation"] = sim_name
    sim_stats_list.append(stats)

# Smallest search space (num_spacers * num_contigs) first
sim_stats_df = pl.DataFrame(sim_stats_list).sort("search_space_size")

print("Simulation Dataset Statistics:")
sim_stats_df

Simulation Dataset Statistics:


num_spacers,total_spacer_bp,total_spacer_Mbp,num_contigs,total_contig_bp,total_contig_Mbp,search_space_size,simulation
i64,i64,f64,i64,i64,f64,i64,str
500,16007,0.016007,5000,400779688,400.779688,2500000,"""ns_500_nc_5000_HIGH_INSERTION_RATE"""
100,3568,0.003568,50000,4137896557,4137.896557,5000000,"""ns_100_nc_50000"""
50000,1599652,1.599652,5000,402606570,402.60657,250000000,"""ns_50000_nc_5000"""
75000,2401592,2.401592,5000,401074812,401.074812,375000000,"""ns_75000_nc_5000"""
100000,3201222,3.201222,10000,802832723,802.832723,1000000000,"""ns_100000_nc_10000"""
100000,3200186,3.200186,20000,1604143975,1604.143975,2000000000,"""ns_100000_nc_20000"""
500000,16002956,16.002956,100000,2232675503,2232.675503,50000000000,"""ns_500000_nc_100000"""
3826979,129494053,129.494053,421431,4291112041,4291.112041,1612807586949,"""ns_3826979_nc_421431_real_baseline"""


In [None]:
# Save the per-simulation dataset statistics table as a CSV file under results/simulated/
sim_stats_df.write_csv("results/simulated/simulation_dataset_stats.csv")

In [25]:
# Process simulated jobs similar to Part 1
# Mark job states and handle retries
log_files_sim_df = log_files_sim_df.with_columns([
    # State is a list column; only its first entry is used to classify the job
    pl.col("State").list.get(0).alias("state_first"),
])

# Categorize job states
exclude_states = ['OUT_OF_MEMORY', 'CANCELLED', 'PREEMPTED', 'RUNNING']
log_files_sim_df = log_files_sim_df.with_columns([
    pl.col("state_first").is_in(exclude_states).alias("should_exclude"),
    pl.col("state_first").str.contains("TIMEOUT").fill_null(False).alias("is_timeout"),
    pl.col("state_first").str.contains("COMPLETED").fill_null(False).alias("is_completed"),
    pl.col("state_first").str.contains("FAILED").fill_null(False).alias("is_failed"),
])

# For each (simulation, tool) pair, select one completed job.
# Several completed jobs can exist for the same pair (e.g. a re-submitted run).
# A plain .first() would pick whichever row happens to come first, so the rows
# of each group are ordered by numeric job ID and the highest ID is kept.
# NOTE(review): this assumes the highest job ID is the most recent submission
# and therefore the run to report — confirm.
job_id_numeric = pl.col("BaseJobID").cast(pl.Int64)
kept_job_columns = [
    "BaseJobID",
    "Elapsed_Seconds",
    "TotalCPU_Seconds",
    "MaxRSS_MB",
    "ReqMem_MB",
    "AllocCPUS",
]
processed_jobs_sim = log_files_sim_df.filter(
    pl.col("is_completed") & ~pl.col("should_exclude")
).group_by(["simulation", "tool_name"], maintain_order=True).agg([
    pl.col(column).sort_by(job_id_numeric, descending=True).first()
    for column in kept_job_columns
])

# Add derived metrics
processed_jobs_sim = processed_jobs_sim.with_columns([
    # Use TotalCPU when it is at least 1 second; otherwise (sacct reported 0)
    # fall back to the elapsed wall-clock time.
    # NOTE(review): for such jobs CPU_Time equals Walltime, which is only a
    # proxy for the CPU time actually consumed.
    pl.when(pl.col("TotalCPU_Seconds") >= 1)
        .then(pl.col("TotalCPU_Seconds"))
        .otherwise(pl.col("Elapsed_Seconds"))
        .alias("CPU_Time_Seconds"),
    # MaxRSS_MB / 1024 -> peak memory in GB
    (pl.col("MaxRSS_MB") / 1024).alias("Peak_Memory_Gb"),
])

processed_jobs_sim = processed_jobs_sim.with_columns([
    (pl.col("CPU_Time_Seconds") / 3600).alias("CPU_Time_Hours"),
    (pl.col("Elapsed_Seconds") / 3600).alias("Walltime_Hours"),
])

# Merge with simulation stats (left join: every job row is kept)
processed_jobs_sim = processed_jobs_sim.join(sim_stats_df, on="simulation", how="left")

print(f"Processed {len(processed_jobs_sim)} completed jobs")
print(f"Simulations: {processed_jobs_sim['simulation'].n_unique()}")
print(f"Tools: {processed_jobs_sim['tool_name'].n_unique()}")
processed_jobs_sim

Processed 70 completed jobs
Simulations: 7
Tools: 11


simulation,tool_name,BaseJobID,Elapsed_Seconds,TotalCPU_Seconds,MaxRSS_MB,ReqMem_MB,AllocCPUS,CPU_Time_Seconds,Peak_Memory_Gb,CPU_Time_Hours,Walltime_Hours,num_spacers,total_spacer_bp,total_spacer_Mbp,num_contigs,total_contig_bp,total_contig_Mbp,search_space_size
str,str,str,u32,u32,f64,f64,i64,u32,f64,f64,f64,i64,i64,f64,i64,i64,f64,i64
"""ns_50000_nc_5000""","""sassy""","""20697934""",3813,65444,61.078125,172032.0,38,65444,0.059647,18.178889,1.059167,50000,1599652,1.599652,5000,402606570,402.60657,250000000
"""ns_3826979_nc_421431_real_baseline""","""strobealign""","""20697709""",838,0,95524.339844,256000.0,38,838,93.285488,0.232778,0.232778,3826979,129494053,129.494053,421431,4291112041,4291.112041,1612807586949
"""ns_100000_nc_10000""","""indelfree_indexed""","""20697894""",936,22452,40246.910156,256000.0,38,22452,39.303623,6.236667,0.26,100000,3201222,3.201222,10000,802832723,802.832723,1000000000
"""ns_500000_nc_100000""","""bowtie1""","""20704672""",86544,472526,211975.921875,256000.0,38,472526,207.007736,131.257222,24.04,500000,16002956,16.002956,100000,2232675503,2232.675503,50000000000
"""ns_75000_nc_5000""","""x_mapper""","""20697947""",201,0,14671.882812,172032.0,38,201,14.328011,0.055833,0.055833,75000,2401592,2.401592,5000,401074812,401.074812,375000000
"""ns_100000_nc_20000""","""indelfree_indexed""","""20697910""",5586,46856,30215.472656,256000.0,38,46856,29.507298,13.015556,1.551667,100000,3200186,3.200186,20000,1604143975,1604.143975,2000000000
"""ns_500000_nc_100000""","""blastn""","""20697917""",750438,9281248,33874.339844,256000.0,38,9281248,33.08041,2578.124444,208.455,500000,16002956,16.002956,100000,2232675503,2232.675503,50000000000
"""ns_3826979_nc_421431_real_baseline""","""bowtie1""","""20693675""",37549,394174,186419.28125,256000.0,38,394174,182.050079,109.492778,10.430278,3826979,129494053,129.494053,421431,4291112041,4291.112041,1612807586949
"""ns_3826979_nc_421431_real_baseline""","""mmseqs2""","""20697705""",2590,23564,176225.925781,256000.0,38,23564,172.095631,6.545556,0.719444,3826979,129494053,129.494053,421431,4291112041,4291.112041,1612807586949
"""ns_500000_nc_100000""","""mmseqs2""","""20697922""",4444,29876,222375.441406,256000.0,38,29876,217.163517,8.298889,1.234444,500000,16002956,16.002956,100000,2232675503,2232.675503,50000000000


In [None]:
# Add max_distance information based on simulation configuration
# Reference: Prepare_all_jobs.ipynb configurations

def get_max_distance_simulated(simulation_name, tool_name):
    """
    Return the max_distance label used for ``tool_name`` in ``simulation_name``.

    The values mirror the job configurations in Prepare_all_jobs.ipynb.

    The result is always a string:
    - indelfree_bruteforce / indelfree_indexed / sassy: the distance
      configured for the simulation (e.g. "5"), or "N/A" when the simulation
      is not listed below
    - bowtie1: always "3" (max supported)
    - blastn / mmseqs2: a description of their identity threshold
    - any other tool: "N/A"
    """
    # Configured max_distance per simulation (from Prepare_all_jobs.ipynb).
    distance_by_simulation = {
        # Smaller simulated datasets: max_distance=5
        "ns_50000_nc_5000": 5,
        "ns_75000_nc_5000": 5,
        "ns_500_nc_5000_HIGH_INSERTION_RATE": 5,
        # Larger simulated datasets with varying max_distance
        "ns_100000_nc_10000": 5,
        "ns_100000_nc_20000": 5,
        "ns_500000_nc_100000": 3,  # Reduced for compute efficiency
        "ns_3826979_nc_421431_real_baseline": 3,  # Semi-synthetic baseline
    }

    # Tools whose label does not depend on the simulation.
    simulation_independent_labels = {
        "bowtie1": "3",  # Always max supported
        "blastn": "perc_id=84%",
        "mmseqs2": "min-seq-id=85%",
    }

    # Only these tools take the per-simulation distance setting.
    if tool_name in ("indelfree_bruteforce", "indelfree_indexed", "sassy"):
        configured = distance_by_simulation.get(simulation_name)
        return "N/A" if configured is None else str(configured)

    return simulation_independent_labels.get(tool_name, "N/A")

# Annotate every processed job with its max_distance label, then derive the
# display name from (tool_name, max_distance_setting). The second step reads
# the column created by the first, so the two with_columns calls stay separate.
processed_jobs_sim = (
    processed_jobs_sim
    .with_columns(
        pl.struct(["simulation", "tool_name"])
        .map_elements(
            lambda row: get_max_distance_simulated(row["simulation"], row["tool_name"]),
            return_dtype=pl.Utf8,
        )
        .alias("max_distance_setting")
    )
    .with_columns(
        pl.struct(["tool_name", "max_distance_setting"])
        .map_elements(
            lambda row: get_tool_display_name(row["tool_name"], row["max_distance_setting"]),
            return_dtype=pl.Utf8,
        )
        .alias("tool_display")
    )
)

# One row per (simulation, tool, display name, setting) with its job count
print("Max distance settings by simulation and tool:")
max_dist_summary_sim = (
    processed_jobs_sim
    .group_by("simulation", "tool_name", "tool_display", "max_distance_setting")
    .agg(pl.len().alias("count"))
    .sort("simulation", "tool_display")
)
print(max_dist_summary_sim)

Max distance settings by simulation and tool:
shape: (70, 5)
┌────────────────────────────────────┬──────────────────────┬─────────────────────────┬──────────────────────┬───────┐
│ simulation                         ┆ tool_name            ┆ tool_display            ┆ max_distance_setting ┆ count │
│ ---                                ┆ ---                  ┆ ---                     ┆ ---                  ┆ ---   │
│ str                                ┆ str                  ┆ str                     ┆ str                  ┆ u64   │
╞════════════════════════════════════╪══════════════════════╪═════════════════════════╪══════════════════════╪═══════╡
│ ns_100000_nc_10000                 ┆ blastn               ┆ blastn                  ┆ perc_id=84%          ┆ 1     │
│ ns_100000_nc_10000                 ┆ bowtie1              ┆ bowtie1                 ┆ 3                    ┆ 1     │
│ ns_100000_nc_10000                 ┆ bowtie2              ┆ bowtie2                 ┆ N/A               

## Max Distance Settings for Simulated Runs

Based on Prepare_all_jobs.ipynb configurations:

**Smaller simulations (max_distance=5):**
- ns_50000_nc_5000
- ns_75000_nc_5000
- ns_500_nc_5000_HIGH_INSERTION_RATE

**Larger simulations:**
- ns_100000_nc_10000: max_distance=5
- ns_100000_nc_20000: max_distance=5  
- ns_500000_nc_100000: max_distance=3 (reduced for compute efficiency, excludes sassy/indelfree_bruteforce)
- ns_3826979_nc_421431_real_baseline: max_distance=3 (semi-synthetic baseline, excludes sassy/indelfree_bruteforce)

**Tool-specific notes:**
- **indelfree_bruteforce/indexed, sassy**: Use the configured max_distance (edit/hamming distance)
- **bowtie1**: Fixed at max_distance=3 (maximum supported by -v flag)
- **blastn**: Uses perc_identity=84% and qcov_hsp_perc=80% (not fixed edit distance)
- **mmseqs2**: Uses min-seq-id=85% (not fixed edit distance)
- **Other tools** (bowtie2, minimap2, strobealign, mummer4, x_mapper): Do not use max_distance constraints

In [27]:
# Report simulated jobs that are neither completed nor flagged for exclusion
failed_jobs_sim = (
    log_files_sim_df
    # De Morgan form of: not completed AND not excluded
    .filter(~(pl.col("is_completed") | pl.col("should_exclude")))
    .select(
        "simulation", "tool_name", "BaseJobID", "state_first",
        "Elapsed_Seconds", "AllocCPUS",
    )
    .with_columns(
        (pl.col("Elapsed_Seconds") / 3600).round(2).alias("Time_Hours")
    )
)

if failed_jobs_sim.is_empty():
    print("✓ All simulated jobs completed successfully!")
else:
    print(f"FAILED/TIMEOUT JOBS: {failed_jobs_sim.height} total")
    print(failed_jobs_sim)

    # Number of such jobs per tool, most affected tool first
    failure_summary_sim = (
        failed_jobs_sim
        .group_by("tool_name")
        .agg(pl.len().alias("num_failures"))
        .sort("num_failures", descending=True)
    )
    print("\nFailure Summary by Tool:")
    print(failure_summary_sim)

FAILED/TIMEOUT JOBS: 8 total
shape: (8, 7)
┌────────────────────────────────────┬───────────────────┬───────────┬─────────────┬─────────────────┬───────────┬────────────┐
│ simulation                         ┆ tool_name         ┆ BaseJobID ┆ state_first ┆ Elapsed_Seconds ┆ AllocCPUS ┆ Time_Hours │
│ ---                                ┆ ---               ┆ ---       ┆ ---         ┆ ---             ┆ ---       ┆ ---        │
│ str                                ┆ str               ┆ str       ┆ str         ┆ u32             ┆ i64       ┆ f64        │
╞════════════════════════════════════╪═══════════════════╪═══════════╪═════════════╪═════════════════╪═══════════╪════════════╡
│ ns_100000_nc_20000                 ┆ bowtie1           ┆ 20697906  ┆ FAILED      ┆ 3022            ┆ 38        ┆ 0.84       │
│ ns_3826979_nc_421431_real_baseline ┆ indelfree_indexed ┆ 20695519  ┆ TIMEOUT     ┆ 777618          ┆ 38        ┆ 216.0      │
│ ns_3826979_nc_421431_real_baseline ┆ blastn            ┆ 20

In [28]:
# Aggregate CPU time and peak memory per displayed tool over all simulations,
# ordered from the lowest to the highest mean CPU time.
tool_summary_sim = (
    processed_jobs_sim
    .group_by("tool_display")
    .agg(
        base_tool=pl.col("tool_name").first(),
        max_distances=pl.col("max_distance_setting").unique(),
        Avg_CPU_Time_Hours=pl.col("CPU_Time_Hours").mean(),
        Min_CPU_Time_Hours=pl.col("CPU_Time_Hours").min(),
        Max_CPU_Time_Hours=pl.col("CPU_Time_Hours").max(),
        Total_CPU_Time_Hours=pl.col("CPU_Time_Hours").sum(),
        Avg_Peak_Memory_Gb=pl.col("Peak_Memory_Gb").mean(),
        Max_Peak_Memory_Gb=pl.col("Peak_Memory_Gb").max(),
        CPU_Allocations_Used=pl.col("AllocCPUS").unique(),
        Num_Simulations=pl.len(),
    )
    .sort("Avg_CPU_Time_Hours")
)

print("Tool Summary Statistics (Simulated Runs)")
print("Note: CPU Time is total CPU time from Slurm TotalCPU")
tool_summary_sim

Tool Summary Statistics (Simulated Runs)
Note: CPU Time is total CPU time from Slurm TotalCPU


tool_display,base_tool,max_distances,Avg_CPU_Time_Hours,Min_CPU_Time_Hours,Max_CPU_Time_Hours,Total_CPU_Time_Hours,Avg_Peak_Memory_Gb,Max_Peak_Memory_Gb,CPU_Allocations_Used,Num_Simulations
str,str,list[str],f64,f64,f64,f64,f64,f64,list[i64],u64
"""strobealign""","""strobealign""","[""N/A""]",0.071151,0.004167,0.232778,0.498056,36.130708,141.825691,[38],7
"""mmseqs2""","""mmseqs2""","[""min-seq-id=85%""]",2.147222,0.024167,8.298889,15.030556,65.354014,217.163517,[38],7
"""indelfree_indexed_d5""","""indelfree_indexed""","[""5""]",4.381611,0.015833,13.015556,21.908056,45.538423,85.125195,[38],5
"""x_mapper""","""x_mapper""","[""N/A""]",5.751111,0.04,32.103889,34.506667,54.057505,229.822166,[38],6
"""mummer4""","""mummer4""","[""N/A""]",12.444325,0.040833,72.123889,87.110278,3.398307,20.543667,[38],7
"""bowtie1""","""bowtie1""","[""3""]",37.78254,2.991667,131.257222,264.477778,59.067429,207.007736,[38],7
"""sassy_d5""","""sassy""","[""5""]",50.738333,0.026667,141.300556,253.691667,0.098599,0.150276,[38],5
"""minimap2""","""minimap2""","[""N/A""]",74.506508,0.013333,451.039444,521.545556,27.182028,89.593246,[38],7
"""bowtie2""","""bowtie2""","[""N/A""]",120.035873,0.18,618.215,840.251111,57.53741,208.148857,[38],7
"""indelfree_indexed_d3""","""indelfree_indexed""","[""3""]",414.513889,414.513889,414.513889,414.513889,70.632778,70.632778,[38],1


DETAILED PERFORMANCE: CPU Time × Memory by Simulation and Tool  
**Note**: max_distance_setting shows the configured distance threshold for each tool


In [29]:
# Per-job view of the resource columns, ordered by search space size and
# then by displayed tool name.
performance_by_simulation = (
    processed_jobs_sim
    .select(
        "simulation", "tool_display", "tool_name", "max_distance_setting",
        "search_space_size",
        "num_spacers", "num_contigs",
        "total_spacer_Mbp", "total_contig_Mbp",
        "AllocCPUS", "CPU_Time_Hours", "Peak_Memory_Gb",
    )
    .sort("search_space_size", "tool_display")
)

performance_by_simulation

simulation,tool_display,tool_name,max_distance_setting,search_space_size,num_spacers,num_contigs,total_spacer_Mbp,total_contig_Mbp,AllocCPUS,CPU_Time_Hours,Peak_Memory_Gb
str,str,str,str,i64,i64,i64,f64,f64,i64,f64,f64
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""blastn""","""blastn""","""perc_id=84%""",2500000,500,5000,0.016007,400.779688,38,0.025,1.460548
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""bowtie1""","""bowtie1""","""3""",2500000,500,5000,0.016007,400.779688,38,2.991667,3.639755
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""bowtie2""","""bowtie2""","""N/A""",2500000,500,5000,0.016007,400.779688,38,0.184444,4.088547
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""indelfree_bruteforce_d5""","""indelfree_bruteforce""","""5""",2500000,500,5000,0.016007,400.779688,38,0.18,27.469902
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""indelfree_indexed_d5""","""indelfree_indexed""","""5""",2500000,500,5000,0.016007,400.779688,38,0.015833,7.555111
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""minimap2""","""minimap2""","""N/A""",2500000,500,5000,0.016007,400.779688,38,0.013333,2.685863
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""mmseqs2""","""mmseqs2""","""min-seq-id=85%""",2500000,500,5000,0.016007,400.779688,38,0.035833,11.205273
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""mummer4""","""mummer4""","""N/A""",2500000,500,5000,0.016007,400.779688,38,0.068333,0.275127
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""sassy_d5""","""sassy""","""5""",2500000,500,5000,0.016007,400.779688,38,0.026667,0.09034
"""ns_500_nc_5000_HIGH_INSERTION_RATE""","""strobealign""","""strobealign""","""N/A""",2500000,500,5000,0.016007,400.779688,38,0.004444,1.849068


In [None]:
# Visualization: resource usage scaling with simulation size

# Add a base_tool column for colour/shape mapping: everything before the last
# "_d" in tool_display. str.rsplit returns the whole string when "_d" is
# absent, so display names without that marker pass through unchanged.
processed_jobs_sim_plot = processed_jobs_sim.with_columns(
    pl.col("tool_display")
    .map_elements(lambda display: display.rsplit("_d", 1)[0], return_dtype=pl.Utf8)
    .alias("base_tool")
)

# Plot 1: CPU time vs search space size (log-log), one line per tool_display
plot_sim_1 = (
    ggplot(
        processed_jobs_sim_plot,
        aes(
            x="search_space_size",
            y="CPU_Time_Hours",
            color="base_tool",
            group="tool_display",
        ),
    )
    + geom_line(size=1.5)
    + geom_point(aes(shape="base_tool"), size=4)
    + scale_color_manual(values=tool_colors, name="Tool")
    + scale_shape_manual(values=tool_shapes, name="Tool")
    + scale_x_log10()
    + scale_y_log10()
    + labs(
        x="Search Space Size (spacers × contigs)",
        y="Total CPU Time (hours)",
        title="CPU Time Scaling with Search Space",
    )
    + ggsize(700, 500)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face="bold"),
        axis_title=element_text(size=11, face="bold"),
    )
)

# Write the figure to disk as both SVG and HTML
plot_sim_1.to_svg("./results/simulated/plots/cpu_time_scaling_with_search_space.svg")
plot_sim_1.to_html("./results/simulated/plots/cpu_time_scaling_with_search_space.html")

plot_sim_1

In [46]:
# Plot 2: Memory vs Search Space Size
# Rows without a Peak_Memory_Gb value are dropped before plotting.
plot_sim_2 = (
    ggplot(processed_jobs_sim_plot.filter(pl.col('Peak_Memory_Gb').is_not_null()), 
           aes(x='search_space_size', y='Peak_Memory_Gb', 
               color='base_tool', group='tool_display')) +
    geom_line(size=1.5) +
    geom_point(aes(shape='base_tool'), size=4) +
    scale_color_manual(values=tool_colors, name='Tool') +
    scale_shape_manual(values=tool_shapes, name='Tool') +
    scale_x_log10() +
    scale_y_log10() +
    labs(
        x='Search Space Size (spacers × contigs)',
        y='Peak Memory (GB)',
        title='Memory Scaling with Search Space'
    ) +
    ggsize(700, 500) +
    theme_minimal() +
    theme(
        plot_title=element_text(size=14, face='bold'),
        axis_title=element_text(size=11, face='bold')
    )
)
# Save the memory plot itself. This cell previously re-saved plot_sim_1 to the
# cpu_time_* files (copy-paste slip), so plot_sim_2 was never written to disk.
plot_sim_2.to_svg("./results/simulated/plots/memory_scaling_with_search_space.svg")
plot_sim_2.to_html("./results/simulated/plots/memory_scaling_with_search_space.html")
plot_sim_2

In [49]:
# Plot 3: peak memory vs CPU time, marker size scaled by search space size
plot_data_sim_scatter = processed_jobs_sim_plot.filter(
    pl.col("Peak_Memory_Gb").is_not_null()
)

# Bounds of the search space, used to rescale marker sizes on a log10 scale
min_search = plot_data_sim_scatter["search_space_size"].min()
max_search = plot_data_sim_scatter["search_space_size"].max()

# size_normalized = 3 + 7 * (position of log10(search space) between the
# log10 bounds), i.e. 3 at min_search and 10 at max_search
plot_data_sim_scatter = plot_data_sim_scatter.with_columns(
    (
        (pl.col("search_space_size").log10() - pl.lit(min_search).log10())
        / (pl.lit(max_search).log10() - pl.lit(min_search).log10())
        * 7
        + 3
    ).alias("size_normalized")
)

plot_sim_3 = (
    ggplot(
        plot_data_sim_scatter,
        aes(x="CPU_Time_Hours", y="Peak_Memory_Gb", color="base_tool"),
    )
    + geom_point(aes(shape="base_tool", size="size_normalized"), alpha=0.7)
    + scale_color_manual(values=tool_colors, name="Tool")
    + scale_shape_manual(values=tool_shapes, name="Tool")
    + scale_size_identity()
    + scale_x_log10()
    + scale_y_log10()
    + labs(
        x="Total CPU Time (hours)",
        y="Peak Memory (GB)",
        title="Memory vs CPU Time (marker size ∝ search space) - Simulated Runs",
    )
    + ggsize(700, 500)
    + theme_minimal()
    + theme(
        plot_title=element_text(size=14, face="bold"),
        axis_title=element_text(size=11, face="bold"),
    )
)

# Write the figure to disk as both SVG and HTML
plot_sim_3.to_svg("results/simulated/plots/resource_usage.svg")
plot_sim_3.to_html("results/simulated/plots/resource_usage.html")
plot_sim_3
# TODO: add max_distance like in the plot for the real data
# TODO: only keep 1 legend
# TODO: Decide if the connecting line is really meaningful or just adds clutter.

### TODO: add max_distance like in the plot for the real data
### TODO: Decide if the connecting line is really meaningful or just adds clutter.

### Summary by Simulation:  
Note: max_distances_used shows all unique distance settings across tools in that simulation


In [33]:
# One row per simulation: dataset dimensions, the distance settings seen,
# and CPU-time / peak-memory aggregates over the tools that ran on it.
sim_summary = (
    processed_jobs_sim
    .group_by("simulation")
    .agg(
        # Dataset descriptors: take the first value within each simulation
        pl.col("num_spacers").first(),
        pl.col("num_contigs").first(),
        pl.col("search_space_size").first(),
        pl.col("total_spacer_Mbp").first().round(2),
        pl.col("total_contig_Mbp").first().round(2),
        # Aggregates over the jobs of the simulation
        max_distances_used=pl.col("max_distance_setting").unique(),
        Avg_CPU_Time_Hours=pl.col("CPU_Time_Hours").mean(),
        Total_CPU_Time_Hours=pl.col("CPU_Time_Hours").sum(),
        Avg_Peak_Memory_Gb=pl.col("Peak_Memory_Gb").mean(),
        Max_Peak_Memory_Gb=pl.col("Peak_Memory_Gb").max(),
        Num_Tools=pl.col("tool_name").n_unique(),
    )
    .sort("search_space_size")
)

sim_summary

simulation,num_spacers,num_contigs,search_space_size,total_spacer_Mbp,total_contig_Mbp,max_distances_used,Avg_CPU_Time_Hours,Total_CPU_Time_Hours,Avg_Peak_Memory_Gb,Max_Peak_Memory_Gb,Num_Tools
str,i64,i64,i64,f64,f64,list[str],f64,f64,f64,f64,u64
"""ns_500_nc_5000_HIGH_INSERTION_RATE""",500,5000,2500000,0.02,400.78,"[""3"", ""5"", … ""min-seq-id=85%""]",0.328561,3.614167,6.679746,27.469902,11
"""ns_50000_nc_5000""",50000,5000,250000000,1.6,402.61,"[""5"", ""N/A"", … ""3""]",25.179394,276.973333,17.426079,85.46278,11
"""ns_75000_nc_5000""",75000,5000,375000000,2.4,401.07,"[""N/A"", ""5"", … ""perc_id=84%""]",40.325758,443.583333,19.064709,85.466595,11
"""ns_100000_nc_10000""",100000,10000,1000000000,3.2,802.83,"[""5"", ""N/A"", … ""3""]",68.433232,752.765556,18.508971,105.351448,11
"""ns_100000_nc_20000""",100000,20000,2000000000,3.2,1604.14,"[""5"", ""N/A"", … ""3""]",285.946944,3145.416389,13.803971,32.379257,11
"""ns_500000_nc_100000""",500000,100000,50000000000,16.0,2232.68,"[""3"", ""perc_id=84%"", … ""N/A""]",428.449757,3427.598056,121.168378,217.163517,8
"""ns_3826979_nc_421431_real_baseline""",3826979,421431,1612807586949,129.49,4291.11,"[""N/A"", ""3"", ""min-seq-id=85%""]",184.250476,1289.753333,134.216832,229.822166,7
