# Imports and setup

### Auto-re-import python modules, useful for editing local files

In [2]:
%load_ext autoreload
%autoreload 2

## Imports

In [3]:
import itertools

import matplotlib as mpl
import polars as pl
import seaborn as sns

# pl.Config.set_verbose(True)
# pl.Config.()
mpl.rcParams["figure.max_open_warning"] = 0

# Handwritten local modules
# import process_scop_sourmash_multisearch_polars
from process_scop_sourmash_multisearch_polars_mem_optimized import MultisearchParser

In [4]:
# Show which polars release this notebook was executed with.
pl.__version__

'1.12.0'

In [5]:
# Interactive IPython help lookup: shows the signature and docstring of
# pl.read_parquet_schema (IPython-only syntax, not valid in a plain .py script).
pl.read_parquet_schema?

Signature: pl.read_parquet_schema(source: 'str | Path | IO[bytes] | bytes') -> 'dict[str, DataType]'
Docstring:
Get the schema of a Parquet file without reading data.

Parameters
----------
source
    Path to a file or a file-like object (by "file-like object" we refer to objects
    that have a `read()` method, such as a file handler like the builtin `open`
    function, or a `BytesIO` instance).
    For file-like objects,
    stream position may not be updated accordingly after reading.

Returns
-------
dict
    Dictionary mapping column names to datatypes
File:      ~/miniconda3/envs/2024-kmerseek-analysis-polars/lib/python3.12/site-packages/polars/io/parquet/functions.py
Type:      function

# Read in data

In [6]:
# S3 prefixes for the protein k=5-20 benchmark run.
# Both names are rebound later by the per-moltype processing loop, so these
# values only hold until that cell runs.
analysis_outdir = "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20"
pipeline_outdir = "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20"

# Echo the analysis prefix as the cell output.
analysis_outdir

's3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20'

## Read in one example file for schema

In [7]:
# One multisearch result file (hp alphabet, k=10) used only to inspect the
# column layout of the pipeline's parquet outputs.
pq = "/home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq"

# Lazy scan limited to 10 rows; `df` is a LazyFrame, not an eager DataFrame.
df = pl.scan_parquet(pq, n_rows=10)

# Resolve the schema explicitly. Accessing `LazyFrame.schema` emits a
# PerformanceWarning in polars 1.x; `collect_schema()` returns the same Schema
# without the warning.
df.collect_schema()

  df.schema


Schema([('query_name', String),
        ('query_md5', String),
        ('match_name', String),
        ('match_md5', String),
        ('containment', Float64),
        ('max_containment', Float64),
        ('jaccard', Float64),
        ('intersect_hashes', Float64),
        ('prob_overlap', Float64),
        ('prob_overlap_adjusted', Float64),
        ('containment_adjusted', Float64),
        ('containment_adjusted_log10', Float64),
        ('tf_idf_score', Float64)])

In [8]:
# Independent copy of the example file's schema, so it can be edited without
# touching the LazyFrame. `collect_schema()` is used instead of the `.schema`
# property, which emits a PerformanceWarning on LazyFrames in polars 1.x.
schema = df.collect_schema().copy()
# Disabled override of the `intersect_hashes` dtype (read as Float64 above):
# schema["intersect_hashes"] = int
schema

  schema = df.schema.copy()


Schema([('query_name', String),
        ('query_md5', String),
        ('match_name', String),
        ('match_md5', String),
        ('containment', Float64),
        ('max_containment', Float64),
        ('jaccard', Float64),
        ('intersect_hashes', Float64),
        ('prob_overlap', Float64),
        ('prob_overlap_adjusted', Float64),
        ('containment_adjusted', Float64),
        ('containment_adjusted_log10', Float64),
        ('tf_idf_score', Float64)])

## Read metadata

### Read Query Metadata

In [9]:
# Lazily scan the SCOP query-side metadata table from S3. `low_memory=True`
# asks polars to trade speed for lower memory use when the scan is executed.
query_metadata = pl.scan_parquet(
    source="s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.pq",
    low_memory=True,
)

# Display the LazyFrame (query plan), not the data itself.
query_metadata


  return method()


In [10]:
# print(query_metadata.head().collect().write_csv())

### Read match metadata

In [11]:
# Lazily scan the SCOP match-side metadata table from S3. `low_memory=True`
# asks polars to trade speed for lower memory use when the scan is executed.
match_metadata = pl.scan_parquet(
    source="s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.match_metadata.pq",
    low_memory=True,
)

# Display the LazyFrame (query plan), not the data itself.
match_metadata

In [12]:
# Materialise only the first rows of the match metadata and print them as CSV
# text, which is easy to copy out of the notebook.
match_metadata_head_csv = match_metadata.head().collect().write_csv()
print(match_metadata_head_csv)

match_name,match_family,match_superfamily,match_fold,match_class,match_scop_id
d1x3ka_ a.1.1.0 (A:) automated matches {Tokunagayusurika akamusi [TaxId: 28383]},a.1.1.0,a.1.1,a.1,a,d1x3ka_
d1x46a_ a.1.1.0 (A:) automated matches {Tokunagayusurika akamusi [TaxId: 28383]},a.1.1.0,a.1.1,a.1,a,d1x46a_
d2bk9a_ a.1.1.0 (A:) automated matches {Fruit fly (Drosophila melanogaster) [TaxId: 7227]},a.1.1.0,a.1.1,a.1,a,d2bk9a_
d2c0ka_ a.1.1.0 (A:) automated matches {Gasterophilus intestinalis [TaxId: 84525]},a.1.1.0,a.1.1,a.1,a,d2c0ka_
d2ig3a_ a.1.1.0 (A:) automated matches {Campylobacter jejuni [TaxId: 197]},a.1.1.0,a.1.1,a.1,a,d2ig3a_



In [None]:
# Per-moltype run configuration: which k-mer sizes to process, where the
# pipeline's multisearch outputs live, and where analysis outputs are written.
#
# Only "hp" is enabled. The protein and dayhoff entries are kept, disabled, for
# reference. Earlier note for protein: the k=5 and k=6 files are enormous
# (2.3 GiB for k=5 and 175 MiB for k=6), so those were skipped with
# ksizes = range(7, 21).
#
# NOTE(review): the hp pipeline directory is named "hp_k20-60" while the ksizes
# processed here are 10..19 -- confirm this range is intended.
moltype_info = {
    # "protein": {
    #     "ksizes": range(5, 21),
    #     "pipeline_outdir": "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20",
    #     "analysis_outdir": "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/protein",
    # },
    # "dayhoff": {
    #     "ksizes": range(5, 21),
    #     "pipeline_outdir": "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__dayhoff_k5-20",
    #     "analysis_outdir": "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/dayhoff",
    # },
    "hp": {
        "ksizes": range(10, 20),
        # Local copies are used instead of the S3 locations:
        #   s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60
        #   s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp
        "pipeline_outdir": "/home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60",
        "analysis_outdir": "/home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp",
    },
}

# Constructor arguments that are the same for every (moltype, ksize) pair.
# chunk_size is set to 10000 here; the original comment notes 100000 as the
# default. Passing `schema=schema` was tried and is currently disabled.
common_parser_kwargs = {
    "query_metadata": query_metadata,
    "match_metadata": match_metadata,
    "verbose": True,
    "input_filetype": "pq",
    "chunk_size": 10000,
}

for moltype, info in moltype_info.items():
    # These rebind the notebook-level `analysis_outdir` / `pipeline_outdir`.
    ksizes, pipeline_outdir, analysis_outdir = (
        info["ksizes"],
        info["pipeline_outdir"],
        info["analysis_outdir"],
    )
    for ksize in ksizes:
        parser = MultisearchParser(
            moltype=moltype,
            ksize=ksize,
            pipeline_outdir=pipeline_outdir,
            analysis_outdir=analysis_outdir,
            **common_parser_kwargs,
        )
        # Each iteration overwrites `lf`; only the last result stays bound.
        lf = parser.process_multisearch_scop_results()

# Show the result for the final (moltype, ksize) combination processed.
lf

2024-10-31 23:28 - DEBUG - 
--- moltype: hp, ksize: 10 --

--- moltype: hp, ksize: 10 --

--- moltype: hp, ksize: 10 --
2024-10-31 23:28 - DEBUG - Processing /home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq ...
Processing /home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq ...
Processing /home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq ...
2024-10-31 23:28 - DEBUG - Processing chunk 1
Pro

In [None]:
# %debug