# Imports and setup

### Auto-re-import Python modules, useful for editing local files

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to local modules
# (e.g. the handwritten module imported below) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## Imports

In [15]:
import itertools

import matplotlib as mpl
import polars as pl
import seaborn as sns

mpl.rcParams["figure.max_open_warning"] = 0

# Handwritten local modules
import process_scop_sourmash_multisearch_polars

# Read in data

In [16]:
# S3 prefix under which this notebook's analysis outputs are stored.
analysis_outdir = "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20"

In [17]:
# S3 prefix holding the pipeline outputs that the cells below list and read.
pipeline_outdir = "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20"
! aws s3 ls $pipeline_outdir/

                           PRE multiqc/
                           PRE pipeline_info/
                           PRE seqkit/
                           PRE sourmash/


In [18]:
! aws s3 ls $pipeline_outdir/sourmash/

                           PRE multisearch/
                           PRE sigs/


In [19]:
! aws s3 ls --human-readable $pipeline_outdir/sourmash/multisearch/

2024-10-09 13:29:03    0 Bytes 
2024-10-09 03:28:12    6.0 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.10.multisearch.csv
2024-10-09 06:17:15    5.7 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.11.multisearch.csv
2024-10-09 04:22:24    5.5 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.12.multisearch.csv
2024-10-09 08:07:15    5.5 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.13.multisearch.csv
2024-10-09 07:10:35    5.4 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.14.multisearch.csv
2024-10-09 12:35:34    5.4 MiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_00

In [20]:
# Echo the analysis output prefix currently bound to `analysis_outdir`.
analysis_outdir

's3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20'

## Read metadata

### Read Query Metadata

In [21]:
# Lazily scan the query-side metadata parquet from S3; ending the cell on the
# name displays the resulting object.
query_metadata = pl.scan_parquet(
    source="s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.pq",
)
query_metadata


  return method()


### Read match metadata

In [22]:
# Lazily scan the match-side metadata parquet from S3; ending the cell on the
# name displays the resulting object.
match_metadata = pl.scan_parquet(
    source="s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.match_metadata.pq",
)
match_metadata

In [None]:
# Process sourmash multisearch results for every configured (moltype, ksize) pair.
#
# Only "hp" is currently enabled. The "protein" and "dayhoff" entries are kept
# commented out so they can be switched back on. An earlier note recorded that
# the protein multisearch files for k=5 (2.3 GiB) and k=6 (175 MiB) are enormous.
#
# NOTE(review): the hp inputs are far larger still -- the `aws s3 ls` output in
# this notebook reports 87.9 GiB for hp k=10 -- and the recorded run output
# suggests the parser fetches the CSV (`fs.get(csv, f.name)`). Expect this cell
# to be very slow for hp; confirm before running it end to end.
moltype_info = {
    # "protein": dict(
    #     ksizes=range(5, 21),
    #     pipeline_outdir="s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20",
    #     analysis_outdir="s3://seanome-kmerseek/scope-benchmark/analysis-outputs/protein",
    # ),
    # "dayhoff": dict(
    #     ksizes=range(5, 21),
    #     pipeline_outdir="s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__dayhoff_k5-20",
    #     analysis_outdir="s3://seanome-kmerseek/scope-benchmark/analysis-outputs/dayhoff",
    # ),
    "hp": dict(
        ksizes=range(10, 20),
        pipeline_outdir="s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60",
        analysis_outdir="s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/",
    ),
}

# Every successfully processed result, keyed by (moltype, ksize), so that earlier
# iterations are not lost when `lf` is rebound by the next one.
multisearch_results = {}
# (moltype, ksize) pairs whose input file was missing, recorded instead of being
# silently ignored.
multisearch_skipped = []

for moltype, info in moltype_info.items():
    # NOTE: these rebind the notebook-level `analysis_outdir` / `pipeline_outdir`
    # set in earlier cells; kept as-is because later cells may read them.
    ksizes = info["ksizes"]
    analysis_outdir = info["analysis_outdir"]
    pipeline_outdir = info["pipeline_outdir"]
    for ksize in ksizes:
        try:
            parser = process_scop_sourmash_multisearch_polars.MultisearchParser(
                query_metadata=query_metadata,
                match_metadata=match_metadata,
                pipeline_outdir=pipeline_outdir,
                moltype=moltype,
                ksize=ksize,
                analysis_outdir=analysis_outdir,
                verbose=True,
            )
            lf = parser.process_multisearch_scop_results()
        except FileNotFoundError as error:
            # Not every ksize has a multisearch CSV; report and move on rather
            # than hiding the gap.
            multisearch_skipped.append((moltype, ksize))
            print(f"Skipping moltype={moltype}, ksize={ksize}: {error}")
            continue
        multisearch_results[(moltype, ksize)] = lf

# Summarize what was processed and what was skipped. The previous final line read
# `multisearch_metadata_filtered`, which this notebook never assigns, so it raised
# NameError on a fresh kernel.
{"processed": list(multisearch_results), "skipped": multisearch_skipped}



--- moltype: hp, ksize: 10 --

Reading s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.csv ...


  fs.get(csv, f.name)


In [32]:
# Check the size of the hp k=10 multisearch CSV that the processing loop tries to read.
! aws s3 ls --human-readable \
    s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.csv

2024-10-11 23:54:22   87.9 GiB astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.csv


In [26]:
# List every hp multisearch CSV with its size in bytes.
! aws s3 ls s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/

2024-10-12 00:55:22          0 
2024-10-11 23:54:22 94359260819 astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.csv
2024-10-11 23:35:42 89741856710 astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.11.multisearch.csv
2024-10-12 00:10:32 80305344491 astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.12.multisearch.csv
2024-10-12 00:24:12 65519595388 astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.13.multisearch.csv
2024-10-12 00:45:02 47899918437 astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.14.multisearch.csv
2024-10-12 00:35:02 31404785257 astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-sc

In [None]:
# NOTE(review): `multisearch_metadata_filtered` is not assigned anywhere in the
# visible part of this notebook, so this cell raises NameError on a fresh kernel.
# Confirm which object's dtypes were meant to be inspected here.
multisearch_metadata_filtered.dtypes