# Imports and setup

### Auto-re-import Python modules, useful for editing local files

In [1]:
# Enable IPython's autoreload extension so edits to local modules
# (e.g. scop_utils) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import itertools

import matplotlib as mpl
import pandas as pd
import seaborn as sns

mpl.rcParams["figure.max_open_warning"] = 0

# Handwritten local modules
import scop_utils

# Read in data

In [3]:
# S3 prefix under which this notebook writes its cleaned/annotated outputs
analysis_outdir = (
    "s3://seanome-kmerseek/scope-benchmark/"
    "analysis-outputs/2024-10-09__protein_k5-20"
)

In [4]:
# S3 prefix holding the pipeline outputs that are listed and read below
pipeline_outdir = (
    "s3://seanome-kmerseek/scope-benchmark/"
    "pipeline-outputs/2024-10-08__protein_k5-20"
)
# List the top-level contents of the pipeline output prefix on S3
! aws s3 ls $pipeline_outdir/

                           PRE multiqc/
                           PRE pipeline_info/
                           PRE seqkit/
                           PRE sourmash/


In [5]:
# List the contents of the sourmash/ prefix inside the pipeline outputs
! aws s3 ls $pipeline_outdir/sourmash/

                           PRE multisearch/
                           PRE sigs/


In [6]:
# List the multisearch result files, with sizes shown in human-readable units
! aws s3 ls --human-readable $pipeline_outdir/sourmash/multisearch/

2024-10-09 13:29:03    0 Bytes 
2024-10-09 03:28:12    6.0 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.10.multisearch.csv
2024-10-09 06:17:15    5.7 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.11.multisearch.csv
2024-10-09 04:22:24    5.5 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.12.multisearch.csv
2024-10-09 08:07:15    5.5 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.13.multisearch.csv
2024-10-09 07:10:35    5.4 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.14.multisearch.csv
2024-10-09 12:35:34    5.4 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_00

In [7]:
def make_multisearch_csv(
    ksize,
    outdir,
    moltype,
    query="astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa",
    against="astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa",
):
    """Build the path of a sourmash multisearch result CSV.

    The file name follows the pattern
    ``{query}--in--{against}.{moltype}.{ksize}.multisearch.csv`` and is placed
    under ``{outdir}/sourmash/multisearch/``.

    Parameters
    ----------
    ksize
        K-mer size embedded in the file name (this notebook passes ints).
    outdir
        Root of the pipeline outputs, e.g. an ``s3://`` prefix. Trailing
        slashes are ignored.
    moltype
        Molecule type embedded in the file name (this notebook passes
        ``"protein"``).
    query, against
        File names of the query and target sequence sets.

    Returns
    -------
    str
        Full path of the multisearch CSV.
    """
    basename = f"{query}--in--{against}.{moltype}.{ksize}.multisearch.csv"
    # A trailing slash on outdir would otherwise yield ".../<outdir>//sourmash/...",
    # which on S3 is a different (non-existent) key rather than an equivalent path.
    root = str(outdir).rstrip("/")
    return f"{root}/sourmash/multisearch/{basename}"


# Headerless, tab-separated lookup table with two columns: SCOP domain id
# and its SCOP classification string
scop_lookup_url = "https://raw.githubusercontent.com/steineggerlab/foldseek-analysis/refs/heads/main/scopbenchmark/data/scop_lookup.fix.tsv"
scop_lookup = pd.read_csv(
    scop_lookup_url,
    sep="\t",
    header=None,
    names=["scop_id", "scop_class"],
    index_col=0,
)
# Collapse the single remaining column into a Series indexed by scop_id
scop_fixed = scop_lookup.squeeze()
print(scop_fixed.shape)
scop_fixed.head()

(11211,)


scop_id
d1dlwa_    a.1.1.1
d2gkma_    a.1.1.1
d2qrwa_    a.1.1.1
d1s69a_    a.1.1.1
d2bkma_    a.1.1.1
Name: scop_class, dtype: object

In [14]:
# Skip ksizes 5 and 6 for now because those files are enormous, 2.3 GiB for k=5 and 175 MiB for k=6
# -> Figure out how to use polars later
ksizes = range(7, 21)

# For each k-mer size (7 through 20): read the multisearch CSV from the
# pipeline outputs, attach metadata for the query and match names, and write
# the combined table as parquet under analysis_outdir.
for ksize in ksizes:
    print(f"\n\n--- ksize: {ksize} --")
    csv = make_multisearch_csv(ksize, pipeline_outdir, "protein")
    print(f"\nReading {csv} ...")
    # %time is an IPython line magic that reports how long the statement took
    %time multisearch = pd.read_csv(csv)
    print("\tDone")

    # NOTE(review): extract_scop_info_from_name is defined in the local
    # scop_utils module (not shown here). The third argument ("query"/"match")
    # presumably labels the returned columns -- confirm in scop_utils.
    query_metadata = scop_utils.extract_scop_info_from_name(
        multisearch.query_name, scop_fixed, "query", verbose=False
    )

    match_metadata = scop_utils.extract_scop_info_from_name(
        multisearch.match_name, scop_fixed, "match", verbose=False
    )

    # DataFrame.join(other, on=col) matches the caller's column against the
    # index of `other`, so this assumes the metadata objects are indexed by
    # the query/match name -- TODO confirm against scop_utils.
    multisearch_metadata = multisearch.join(query_metadata, on="query_name").join(
        match_metadata, on="match_name"
    )

    # One parquet file per ksize
    pq = f"{analysis_outdir}/00_cleaned_multisearch_results/scope40.multisearch.protein.k{ksize}.pq"
    print(f"\nWriting {pq} ...")
    %time multisearch_metadata.to_parquet(pq)
    print(f"\tDone.")



--- ksize: 7 --

Reading s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.7.multisearch.csv ...
CPU times: user 140 ms, sys: 40.9 ms, total: 181 ms
Wall time: 2.25 s
	Done

Writing s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20/00_cleaned_multisearch_results/scope40.multisearch.protein.k7.pq ...
CPU times: user 114 ms, sys: 28.1 ms, total: 142 ms
Wall time: 410 ms
	Done.


--- ksize: 8 --

Reading s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.8.multisearch.csv ...
CPU times: user 88.1 ms, sys: 586 μs, total: 88.7 ms
Wall time: 554 ms
	Done

Writing s3://seanome-kmerseek/scope-benchmark/analysis