# Performance Analysis: IMG/VR4 Stratified Subsamples

This notebook analyzes tool performance across stratified subsamples of the IMG/VR4 dataset. The subsamples were created using taxonomy-aware, GC-content, and length-stratified sampling to preserve diversity while reducing computational requirements.

**Subsamples analyzed**: fractions 0.001, 0.005, 0.01, 0.05, 0.1  
**Approach**: Load tool results from each subsample, analyze performance metrics, and compare across scales.

Note: Unlike simulated data, we don't have ground truth here, so we focus on:
- Number of unique spacer-contig matches found
The main goal here:  
Are the tools' results on the subsamples indicative of what a comparison at a larger sample size (or with no subsampling at all) would show? In other words, can we trust interpretations made using the largest subsample?


Note 2: The actual performance comparisons for the 5% and 10% samples are in Performence_imgvr4.ipynb


## Setup and Imports

In [None]:
# Notebook setup: imports, warning filter, working directory and display options.
# (Uncomment the two magics below to auto-reload edited project modules.)
# %load_ext autoreload
# %autoreload 2
import os
import glob
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Relative paths used later in this notebook (e.g. 'notebooks/...' and
# 'results/real_data/...') are resolved against this directory.
os.chdir('/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_bench/')

import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Polars table display limits (rows / columns).
pl.Config(tbl_rows=50, tbl_cols=15)

# NOTE(review): read_fasta, read_results, populate_pldf_withseqs_needletail and
# test_alignment_polars are used below but not defined in this notebook;
# presumably they come from this star import -- confirm.
from bench.utils.functions import *
from bench.utils.tool_commands import load_tool_configs
# Analysis parameters
MAX_MISMATCHES = 3
base_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_bench/results/real_data/subsamples"
spacers_file = "/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_bench/imgvr4_data/spacers/All_CRISPR_spacers_nr_clean.fna"
fractions = [0.001, 0.005, 0.01, 0.05, 0.1]

# Tool color and marker configuration
import json

# Colour palette is read from a JSON file with a 'hex_colors' list.
TOOL_COLORS_FILE = "notebooks/antonio_14_colors.json"
with open(TOOL_COLORS_FILE, 'r') as f:
    color_config = json.load(f)

# Fixed plotting order so every figure assigns the same style to a tool.
TOOL_ORDER = [
    'blastn', 'bowtie1', 'bowtie2', 'indelfree_bruteforce', 'indelfree_indexed',
    'lexicmap', 'minimap2', 'mmseqs2', 'mummer4', 'sassy', 'strobealign', 'x_mapper'
]
MARKERS = ['o', 's', '^', 'v', 'D', 'P', '*', 'X', 'h', 'p', '<', '>']

# Map each tool to a {'color', 'marker'} pair; both lists wrap around (modulo)
# if there are more tools than colours or markers.
TOOL_STYLES = {
    tool: {
        'color': color_config['hex_colors'][i % len(color_config['hex_colors'])],
        'marker': MARKERS[i % len(MARKERS)],
    }
    for i, tool in enumerate(TOOL_ORDER)
}

print(f"  Max mismatches: {MAX_MISMATCHES}")
print(f"  Tool styles configured for {len(TOOL_STYLES)} tools")

# Persist the mapping in the working directory.
with open('tool_styles.json', 'w') as f:
    f.write(json.dumps(TOOL_STYLES, indent=4))

## Scan Subsample Directories and Identify Completed Tools

First, we scan each subsample fraction directory and check the SLURM logs to identify which tools completed successfully vs timed out.

In [None]:
def check_tool_completion(fraction):
    """Classify the tools run on one subsample fraction using its SLURM logs.

    Looks at every ``*.out`` file in ``{base_dir}/fraction_{fraction}/slurm_logs``
    that has a sibling ``.err`` file. The tool name is the part of the log file
    name before the first ``-`` (after collapsing ``_long-`` to ``-``).

    A tool is "timed out" when its ``.err`` log contains ``TIME LIMIT``.
    Otherwise it is "completed" when its expected file in ``raw_outputs``
    exists and is non-empty.

    Args:
        fraction: Subsample fraction; interpolated into the directory name.

    Returns:
        dict with two sorted lists of tool names: ``"completed"`` and
        ``"timed_out"``. A tool that completed in any log is not reported as
        timed out. Both lists are empty when the log directory is missing.
    """
    frac_dir = f"{base_dir}/fraction_{fraction}"
    log_dir = f"{frac_dir}/slurm_logs"
    raw_dir = f"{frac_dir}/raw_outputs"

    if not os.path.exists(log_dir):
        return {"completed": [], "timed_out": []}

    # Tools whose raw output is tabular rather than SAM.
    tsv_tools = ("blastn", "lexicmap", "mmseqs")

    completed = set()
    timed_out = set()

    for out_file in glob.glob(f"{log_dir}/*.out"):
        tool_name = os.path.basename(out_file).replace("_long-", "-").split("-")[0]
        # Swap only the extension; a plain str.replace would also rewrite any
        # ".out" that happens to occur in a parent directory name.
        err_file = os.path.splitext(out_file)[0] + ".err"
        if not os.path.exists(err_file):
            continue

        # Logs may contain arbitrary bytes emitted by the tools; never let a
        # decoding error abort the scan.
        with open(err_file, "r", errors="replace") as f:
            err_content = f.read()

        # "DUE TO TIME LIMIT" is covered by this broader substring.
        if "TIME LIMIT" in err_content:
            timed_out.add(tool_name)
            continue

        if tool_name == "sassy":
            candidates = [f"{raw_dir}/sassy.tsv"]
        else:
            ext = "tsv" if tool_name in tsv_tools else "sam"
            candidates = [f"{raw_dir}/{tool_name}_output.{ext}"]
            if tool_name == "mmseqs2":
                # NOTE(review): TOOL_ORDER names this tool 'mmseqs2' while the
                # TSV list says 'mmseqs'; accept either extension until the
                # actual output name is confirmed.
                candidates.append(f"{raw_dir}/{tool_name}_output.tsv")

        if any(os.path.exists(p) and os.path.getsize(p) > 0 for p in candidates):
            completed.add(tool_name)

    return {
        "completed": sorted(completed),
        # Drop tools that timed out once but completed on a resubmission.
        "timed_out": sorted(timed_out - completed),
    }

# Check completion status for all fractions and print a short report for each.
completion_status = {}
for frac in fractions:
    status = check_tool_completion(frac)
    completion_status[frac] = status

    finished = status['completed']
    report_lines = [
        f"Fraction {frac}:",
        f"  Completed: {len(finished)} tools - {', '.join(finished)}",
    ]
    # Only mention time-outs when there are any.
    if status['timed_out']:
        report_lines.append(f"  Timed out: {', '.join(status['timed_out'])}")
    # Trailing empty entry yields the blank separator line between fractions.
    report_lines.append("")
    print("\n".join(report_lines))

## Load Tool Results from Each Subsample

For each fraction, load the tool results using the `read_results` function with proper filtering.

In [None]:
# Build a spacer_id -> length table from the spacer FASTA (used for filtering).
spacers = read_fasta(spacers_file)
spacer_lendf = pl.DataFrame(
    {
        "spacer_id": list(spacers.keys()),
        "length": list(map(len, spacers.values())),
    }
)

shortest = spacer_lendf['length'].min()
longest = spacer_lendf['length'].max()
print(f"Loaded {len(spacers)} spacers")
print(f"Length range: {shortest} - {longest} bp")

In [None]:
%%time
# Load results from all fractions
# For each fraction: read the outputs of the tools that completed, tag them
# with the fraction, cache them as parquet and keep them in `all_results`.
all_results = {}
# fractions = [0.001, 0.005, 0.01, 0.05, 0.1]
fractions = [0.001]#, 0.005] #, 0.01, 0.05, 0.1] # starting small

# The parquet cache is written here. Create the directory up front: otherwise
# a missing directory makes write_parquet fail inside the try block below and
# the fraction is silently dropped with only an "Error loading results" line.
analysis_dir = 'results/real_data/subsamples_analysis'
os.makedirs(analysis_dir, exist_ok=True)

for frac in fractions:
    print(f"\n=== Loading fraction {frac} ===")
    frac_dir = f"{base_dir}/fraction_{frac}"
    contigs_file = f"{frac_dir}/subsampled_data/subsampled_contigs.fa"

    # Only load completed tools
    completed_tools = completion_status[frac]['completed']
    if not completed_tools:
        print(f"  No completed tools for fraction {frac}, skipping")
        continue

    # Load tool configurations
    tools = load_tool_configs(
        results_dir=frac_dir,
        threads=8,
        contigs_file=contigs_file,
        spacers_file=spacers_file
    )

    # Filter to only completed tools
    tools_to_load = {k: v for k, v in tools.items() if k in completed_tools}
    print(f"  Loading {len(tools_to_load)} tools: {', '.join(tools_to_load.keys())}")

    # Best effort per fraction: report the failure and move on to the next one.
    try:
        results_df = read_results(
            tools_to_load,
            # Threshold on tool-reported (not yet validated) mismatch counts;
            # presumably the +2 slack exists because counts are recalculated
            # and re-filtered later -- confirm.
            max_mismatches=MAX_MISMATCHES+2,
            spacer_lendf=spacer_lendf,
            ref_file=contigs_file,
            threads=8,
        )

        # Add fraction column
        results_df = results_df.with_columns(pl.lit(frac).alias('fraction'))
        results_df.write_parquet(f'{analysis_dir}/alignments_fraction_{frac}.parquet')

        all_results[frac] = results_df
        print(f"  Loaded {results_df.height:,} alignments from {results_df['tool'].n_unique()} tools")
        print(f"  Unique spacers: {results_df['spacer_id'].n_unique():,}, contigs: {results_df['contig_id'].n_unique():,}")
    except Exception as e:
        print(f"  Error loading results: {e}")
        continue

print(f"\n✓ Loaded results from {len(all_results)} fractions")

In [None]:
%%time
# Reload the cached alignments written by the loading cell instead of
# re-parsing the raw tool outputs (only fraction 0.001 for now).
# fractions = [0.001, 0.005, 0.01, 0.05, 0.1]
all_results = {
    0.001: pl.read_parquet(
        'results/real_data/subsamples_analysis/alignments_fraction_0.001.parquet'
    ),
}


In [None]:
# Bare expression: the notebook renders the fraction -> DataFrame dict as the cell output.
all_results 

In [None]:
# Load the next fraction(s) into the existing `all_results` dict; same steps
# as the first loading cell (completed tools only, tagged, cached as parquet).
fractions = [0.005] #, 0.01, 0.05, 0.1] # starting small

# Make sure the parquet cache directory exists so write_parquet cannot fail on
# a missing directory and be swallowed by the except below.
analysis_dir = 'results/real_data/subsamples_analysis'
os.makedirs(analysis_dir, exist_ok=True)

for frac in fractions:
    print(f"\n=== Loading fraction {frac} ===")
    frac_dir = f"{base_dir}/fraction_{frac}"
    contigs_file = f"{frac_dir}/subsampled_data/subsampled_contigs.fa"

    # Only load completed tools
    completed_tools = completion_status[frac]['completed']
    if not completed_tools:
        print(f"  No completed tools for fraction {frac}, skipping")
        continue

    # Load tool configurations
    tools = load_tool_configs(
        results_dir=frac_dir,
        threads=8,
        contigs_file=contigs_file,
        spacers_file=spacers_file
    )

    # Filter to only completed tools
    tools_to_load = {k: v for k, v in tools.items() if k in completed_tools}
    print(f"  Loading {len(tools_to_load)} tools: {', '.join(tools_to_load.keys())}")

    # Best effort per fraction: report the failure and move on to the next one.
    try:
        results_df = read_results(
            tools_to_load,
            # Threshold on tool-reported (not yet validated) mismatch counts;
            # presumably the +2 slack exists because counts are recalculated
            # and re-filtered later -- confirm.
            max_mismatches=MAX_MISMATCHES+2,
            spacer_lendf=spacer_lendf,
            ref_file=contigs_file,
            threads=8,
        )

        # Add fraction column
        results_df = results_df.with_columns(pl.lit(frac).alias('fraction'))
        results_df.write_parquet(f'{analysis_dir}/alignments_fraction_{frac}.parquet')

        all_results[frac] = results_df
        print(f"  Loaded {results_df.height:,} alignments from {results_df['tool'].n_unique()} tools")
        print(f"  Unique spacers: {results_df['spacer_id'].n_unique():,}, contigs: {results_df['contig_id'].n_unique():,}")
    except Exception as e:
        print(f"  Error loading results: {e}")
        continue

print(f"\n✓ Loaded results from {len(all_results)} fractions")

## Per-Fraction Analysis (No Aggregation)

Analyze each fraction separately since they're stratified samples from the same dataset and are not independent.

In [None]:
# Summarise every fraction on its own (the fractions are NOT independent
# samples, so nothing is pooled across them).
per_fraction_stats = {}

for frac, results_df in all_results.items():
    print(f"\n=== Fraction {frac} Statistics ===")

    # Headline numbers for the whole fraction
    n_spacers = results_df['spacer_id'].n_unique()
    n_contigs = results_df['contig_id'].n_unique()
    n_tools = results_df['tool'].n_unique()
    print(f"Total alignments: {results_df.height:,}")
    print(f"Unique spacers: {n_spacers:,}")
    print(f"Unique contigs: {n_contigs:,}")
    print(f"Tools: {n_tools}")

    # One row per tool, best spacer coverage first
    mismatch_col = pl.col('mismatches')
    by_tool = results_df.group_by('tool')
    tool_summary = by_tool.agg([
        pl.col('spacer_id').n_unique().alias('n_unique_spacers'),
        pl.col('contig_id').n_unique().alias('n_unique_contigs'),
        pl.len().alias('n_total_alignments'),
        mismatch_col.mean().alias('mean_mismatches'),
        mismatch_col.median().alias('median_mismatches'),
        (mismatch_col == 0).sum().alias('n_perfect_matches'),
    ])
    tool_summary = tool_summary.sort('n_unique_spacers', descending=True)

    per_fraction_stats[frac] = tool_summary
    print("\nTop 5 tools by unique spacers:")
    print(tool_summary.head(5))

# Full tables, smallest fraction first
print("\n\n=== Summary Table: All Fractions ===")
for frac in sorted(per_fraction_stats):
    print(f"\n--- Fraction {frac} ---")
    print(per_fraction_stats[frac])

## Recalculate Alignments with Parasail for Deterministic Mismatch Counts

The tool-reported mismatches can vary. We'll recalculate them deterministically using parasail.
This follows the same workflow as the original full-dataset notebook.

In [None]:
%%time
# For each fraction, recalculate alignments
# Steps per fraction:
#   1. collect the distinct (spacer, contig, strand, start, end) regions,
#   2. attach the spacer sequence and the matching contig sequence,
#   3. recompute the mismatch count for each region,
#   4. join the recomputed count back onto every alignment and re-filter.
recalculated_results = {}

for frac, results_df in all_results.items():
    print(f"\n=== Recalculating alignments for fraction {frac} ===")
    frac_dir = f"{base_dir}/fraction_{frac}"
    contigs_file = f"{frac_dir}/subsampled_data/subsampled_contigs.fa"

    # Get unique regions (spacer-contig pairs with coordinates) so each region
    # is verified once even when several tools report it.
    unique_regions = results_df.select([
        "spacer_id", "contig_id", "strand", "start", "end"
    ]).unique()

    print(f"Unique regions to verify: {unique_regions.height:,}")

    # Populate with spacer sequences (full sequence, no strand handling)
    print("  Loading spacer sequences...")
    unique_regions = populate_pldf_withseqs_needletail(
        seqfile=spacers_file,
        pldf=unique_regions,
        chunk_size=2000000,
        reverse_by_strand_col=False,
        trim_to_region=False,
        idcol="spacer_id",
        seqcol="spacer_seq"
    )

    # Populate with contig sequences (trimmed to region)
    print("  Loading contig sequences...")
    unique_regions = populate_pldf_withseqs_needletail(
        seqfile=contigs_file,
        trim_to_region=True,
        reverse_by_strand_col=True,
        chunk_size=200000,
        pldf=unique_regions,
        idcol="contig_id",
        start_col="start",
        end_col="end",
        strand_col="strand",
        seqcol="contig_seq"
    )

    # Recalculate mismatches using parasail
    print("  Recalculating mismatches with parasail...")
    recalced = test_alignment_polars(
        results=unique_regions,
        return_deviations=False,
        ignore_region_strands=True
    )

    # Rename columns for clarity
    recalced = recalced.rename({"alignment_test": "recalced_mismatches"})

    # Join back with original results; the left join keeps every alignment.
    results_with_recalc = results_df.join(
        recalced.select([
            "spacer_id", "contig_id", "strand", "start", "end",
            "spacer_seq", "contig_seq", "recalced_mismatches",
        ]),
        on=["spacer_id", "contig_id", "strand", "start", "end"],
        how="left"
    )

    # Keep the tool's own count under a new name and make the recalculated
    # count the canonical "mismatches" column (two steps, in this order).
    results_with_recalc = results_with_recalc.rename({
        "mismatches": "tool_reported_mismatches"
    }).rename({
        "recalced_mismatches": "mismatches"
    })

    # Check for deviations (recalculated minus tool-reported)
    results_with_recalc = results_with_recalc.with_columns(
        (pl.col("mismatches") - pl.col("tool_reported_mismatches")).alias("deviation")
    )

    n_deviations = results_with_recalc.filter(pl.col("deviation") != 0).height
    n_alignments = results_with_recalc.height
    # Guard the percentage: a fraction without any alignment would otherwise
    # raise ZeroDivisionError.
    pct_deviations = 100 * n_deviations / n_alignments if n_alignments else 0.0
    print(f"  Alignments with deviations: {n_deviations:,} ({pct_deviations:.2f}%)")

    # Filter to max_mismatches after recalculation
    results_with_recalc = results_with_recalc.filter(pl.col("mismatches") <= MAX_MISMATCHES)
    print(f"  Alignments after filtering (≤{MAX_MISMATCHES} mismatches): {results_with_recalc.height:,}")

    recalculated_results[frac] = results_with_recalc

print(f"\n✓ Recalculated alignments for {len(recalculated_results)} fractions")

## Updated Per-Fraction Statistics (with Recalculated Mismatches)

Now compute statistics using the recalculated mismatches from parasail.

In [None]:
# Per-tool summary for every fraction, now based on the recalculated
# mismatch counts and including how far they deviate from the tool's own.
updated_per_fraction_stats = {}

for frac, results_df in recalculated_results.items():
    print(f"\n=== Fraction {frac} (Recalculated Mismatches) ===")

    mismatch_col = pl.col('mismatches')
    deviation_col = pl.col('deviation')

    # One row per tool, best spacer coverage first
    by_tool = results_df.group_by('tool')
    tool_summary = by_tool.agg([
        pl.col('spacer_id').n_unique().alias('n_unique_spacers'),
        pl.col('contig_id').n_unique().alias('n_unique_contigs'),
        pl.len().alias('n_total_alignments'),
        mismatch_col.mean().alias('mean_mismatches'),
        mismatch_col.median().alias('median_mismatches'),
        (mismatch_col == 0).sum().alias('n_perfect_matches'),
        deviation_col.mean().alias('mean_deviation_from_tool'),
        (deviation_col != 0).sum().alias('n_with_deviation'),
    ])
    tool_summary = tool_summary.sort('n_unique_spacers', descending=True)

    updated_per_fraction_stats[frac] = tool_summary
    display(tool_summary)

# Save updated stats
print("\n✓ Updated statistics computed")

## Visualizations: Comparing Tools Across Fractions

Plot tool performance metrics across subsample sizes using the recalculated mismatches.

In [None]:
# Plot 1: Number of unique spacers found per tool across fractions.
# Order matters: the figure is fully styled first, then saved, and only then
# shown. Calling plt.show() earlier can leave savefig with an unfinished or
# empty figure.
os.makedirs('results/real_data/subsamples_analysis', exist_ok=True)

# Combine stats from all fractions for plotting (one 'fraction' column added)
plot_data_list = [
    updated_per_fraction_stats[frac].with_columns(pl.lit(frac).alias('fraction'))
    for frac in sorted(updated_per_fraction_stats.keys())
]
combined_stats = pl.concat(plot_data_list)

fig, ax = plt.subplots(figsize=(14, 7))

# Plot tools in consistent order with assigned colors/markers
for tool in TOOL_ORDER:
    tool_data = combined_stats.filter(pl.col('tool') == tool)
    if tool_data.height == 0:
        continue
    tool_data = tool_data.sort('fraction')
    style = TOOL_STYLES.get(tool, {'color': 'gray', 'marker': 'o'})

    ax.plot(tool_data['fraction'].to_list(),
            tool_data['n_unique_spacers'].to_list(),
            marker=style['marker'],
            color=style['color'],
            label=tool,
            linewidth=2,
            markersize=8,
            alpha=0.85)

# Labels, title and scales
ax.set_xlabel('Subsample Fraction', fontsize=13)
ax.set_ylabel('Number of Unique Spacers Found', fontsize=13)
ax.set_title(f'Tool Performance: Unique Spacers vs Subsample Size (Recalculated ≤{MAX_MISMATCHES} mismatches)',
             fontsize=14, fontweight='bold', pad=15)
ax.set_xscale('log')
ax.set_yscale('log')

# Legend outside the axes, grid on major and minor ticks
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, framealpha=0.9)
ax.grid(True, alpha=0.3, which='both')
plt.tight_layout()

# Save the finished figure in both formats, then display it
fig.savefig('results/real_data/subsamples_analysis/unique_spacers_vs_fraction.png', dpi=300, bbox_inches='tight')
fig.savefig('results/real_data/subsamples_analysis/unique_spacers_vs_fraction.pdf', bbox_inches='tight')
print(f"✓ Saved unique spacers plot (≤{MAX_MISMATCHES} mismatches)")

plt.show()

In [None]:
# Plot 2: Number of unique contigs matched per tool across fractions.
# Order matters: the figure is fully styled first, then saved, and only then
# shown. Calling plt.show() earlier can leave savefig with an unfinished or
# empty figure.
os.makedirs('results/real_data/subsamples_analysis', exist_ok=True)

fig, ax = plt.subplots(figsize=(14, 7))

# Plot tools in consistent order with assigned colors/markers
for tool in TOOL_ORDER:
    tool_data = combined_stats.filter(pl.col('tool') == tool)
    if tool_data.height == 0:
        continue
    tool_data = tool_data.sort('fraction')
    style = TOOL_STYLES.get(tool, {'color': 'gray', 'marker': 'o'})

    ax.plot(tool_data['fraction'].to_list(),
            tool_data['n_unique_contigs'].to_list(),
            marker=style['marker'],
            color=style['color'],
            label=tool,
            linewidth=2,
            markersize=8,
            alpha=0.85)

# Labels, title and scales
ax.set_xlabel('Subsample Fraction', fontsize=12)
ax.set_ylabel('Number of Unique Contigs Matched', fontsize=12)
ax.set_title(f'Tool Performance: Unique Contigs Across Subsample Sizes (≤{MAX_MISMATCHES} mismatches)',
             fontsize=14, fontweight='bold')
ax.set_xscale('log')
ax.set_yscale('log')

# Legend outside the axes, light grid
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()

# Save the finished figure in both formats, then display it
fig.savefig('results/real_data/subsamples_analysis/contigs_by_fraction.png', dpi=150, bbox_inches='tight')
fig.savefig('results/real_data/subsamples_analysis/contigs_by_fraction.pdf', dpi=300, bbox_inches='tight')
print(f"✓ Saved contigs plot (≤{MAX_MISMATCHES} mismatches)")

plt.show()


## Conclusion
In these selected subsample sizes, the results of each tool appear consistent (similar ratios compared to the same tool at the other size fractions).
This suggests that, had we the CPU time to run all tools on the entire dataset (no subsampling), the results would be qualitatively similar (from a tool-vs-tool comparison perspective).