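Clean up the tool-versions text file (`tool_versions.txt`), merge it with the JSON tool configs (`all_configs.json`) on the lower-cased tool name, and write the result to `utils/tool_versions.tsv`.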

In [149]:
import json
import polars as pl
from pathlib import Path
import re
from typing import Dict, List, Union


In [150]:
# Load the per-tool JSON configs. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the garbage collector.
with open("all_configs.json") as configs_file:
    all_tools = json.load(configs_file)

# Keep only the first entry of each tool's "command" list and the lower-cased
# tool name ("tool_name"), built in a single expression so re-running the cell
# never operates on a half-transformed frame.
all_tool_df = pl.DataFrame(all_tools).select(
    pl.col("command").list.first().alias("command"),
    pl.col("name").str.to_lowercase().alias("tool_name"),
)

In [151]:
# Display the (command, tool_name) table as a quick sanity check of the load step.
all_tool_df

command,tool_name
str,str
"""bbmapskimmer.sh sam=1.4 maxind…","""bbmapskimmer"""
"""bbmapskimmer.sh sam=1.4 maxind…","""bbmapskimmermod"""
"""makeblastdb -in {contigs_file}…","""blastn"""
"""bowtie-build --threads {thread…","""bowtie1"""
"""bowtie2-build --large-index --…","""bowtie2"""
…,…
"""mkdir -p {output_dir}/tmp_spac…","""mmseqs2"""
"""mkdir -p {output_dir}/tmp_spac…","""mmseqs2map"""
"""nucmer --maxmatch --nosimplify…","""mummer4"""
"""spacer-containment --n-threads…","""spacer_containment"""


In [152]:
# Parse tool_versions.txt into a {tool name: version string} mapping.
# The file is split into sections on "#####"; within a section the first line
# is taken as the tool name and all remaining lines as its version output.
HOME_DIR_PREFIX = "/clusterfs/jgi/groups/science/homes/uneri/"

# Read inside a context manager so the file handle is closed deterministically.
with open("tool_versions.txt") as versions_file:
    version_sections = versions_file.read().split("#####")

tools_versions_raw = {}
for section in version_sections:
    section = section.strip()
    if not section:
        # skip empty chunks (e.g. before the first / after the last separator)
        continue
    # remove absolute path (on each line, anything up to the last "bin/")
    section = re.sub(r".+/bin/", "", section)
    # the home-directory prefix is a plain literal, so no regex is needed
    section = section.replace(HOME_DIR_PREFIX, "")
    # get tool name from first line; partition() yields an empty version for a
    # section that has no version lines instead of raising IndexError
    tool, _newline, version = section.partition("\n")
    # replace newlines with commas so each version fits on one TSV row
    tools_versions_raw[tool] = version.replace("\n", ", ")

tools_versions_raw_df = pl.DataFrame(
    {
        "tool": list(tools_versions_raw.keys()),
        "version": list(tools_versions_raw.values()),
    }
).select(
    # polars' str.replace only touches the FIRST match; replace_all is needed so
    # names with several hyphens are fully normalized to underscores
    pl.col("tool")
    .str.to_lowercase()
    .str.replace_all("-", "_", literal=True)
    .alias("tool_name"),
    # collapse every run of two or more spaces (not just the first) to one space
    pl.col("version").str.replace_all(r" {2,}", " ").alias("version"),
)
tools_versions_raw_df


tool_name,version
str,str
"""bowtie1""","""bowtie-align-s version 1.3.1, …"
"""bowtie2""","""bowtie2-align-s version 2.5.4,…"
"""minimap2""","""2.28-r1209"""
"""bbmapskimmer""","""39.13"""
"""strobealign""","""0.15.0"""
…,…
"""bwa""","""0.7.19-r1273"""
"""hisat2""","""hisat2-align-s version 2.2.1, …"
"""spacer_containment""","""0.1.0"""
"""lexicmap""","""v0.5.0 (06741c8)"""


In [154]:
# Attach the version string to each configured tool. The inner join keeps only
# tools whose normalized name appears in both tables.
versions_tsv_path = "utils/tool_versions.tsv"
merged_df = all_tool_df.join(
    tools_versions_raw_df,
    how="inner",
    on="tool_name",
)

# Persist as a tab-separated file, then show the merged table as the cell output.
merged_df.write_csv(versions_tsv_path, separator="\t")
merged_df

command,tool_name,version
str,str,str
"""bbmapskimmer.sh sam=1.4 maxind…","""bbmapskimmer""","""39.13"""
"""makeblastdb -in {contigs_file}…","""blastn""","""2.16.0+ Package: blast 2.16.0,…"
"""bowtie-build --threads {thread…","""bowtie1""","""bowtie-align-s version 1.3.1, …"
"""bowtie2-build --large-index --…","""bowtie2""","""bowtie2-align-s version 2.5.4,…"
"""hisat2-build -p {threads} {con…","""hisat2""","""hisat2-align-s version 2.2.1, …"
…,…,…
"""minimap2 -N 100 --eqx -t {thre…","""minimap2""","""2.28-r1209"""
"""mkdir -p {output_dir}/tmp_spac…","""mmseqs2""","""db8ad2d14d0a285ce0ad62bbefd0dc…"
"""nucmer --maxmatch --nosimplify…","""mummer4""","""4.10.0-r1304 // 4.0.1"""
"""spacer-containment --n-threads…","""spacer_containment""","""0.1.0"""
