# Imports and setup

### Auto-re-import python modules, useful for editing local files

In [1]:
# Load IPython's autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

## Imports

In [2]:
import itertools

import matplotlib as mpl
import polars as pl
import seaborn as sns

# pl.Config.set_verbose(True)
# pl.Config.()
mpl.rcParams["figure.max_open_warning"] = 0

# Handwritten local modules
# import process_scop_sourmash_multisearch_polars
from process_scop_sourmash_multisearch_polars_mem_optimized import MultisearchParser

In [3]:
# Record the polars version this notebook was executed with.
pl.__version__

'1.12.0'

# Read in data

In [4]:
# S3 locations for the protein k=5-20 run: where analysis outputs go and where
# the pipeline wrote its results.
# NOTE(review): both names are reassigned inside the per-moltype loops further
# down, so these values only hold until one of those cells runs.
# NOTE(review): the two paths carry different date prefixes (2024-10-09 vs
# 2024-10-08) -- confirm this is intentional.
analysis_outdir = (
    "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20"
)

pipeline_outdir = (
    "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20"
)
analysis_outdir

's3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20'

## Read in one example file for schema

In [5]:
# One local multisearch result file (the filename indicates moltype hp, k=10),
# used only to discover the column names and dtypes.
pq = "/home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq"

# Lazy scan limited to the first 10 rows; `df` is a LazyFrame, not a DataFrame.
df = pl.scan_parquet(pq, n_rows=10)

# `LazyFrame.schema` emits a PerformanceWarning in polars >= 1.0 because resolving
# the schema can be expensive; `collect_schema()` returns the same Schema explicitly
# and without the warning.
df.collect_schema()

  df.schema


Schema([('query_name', String),
        ('query_md5', String),
        ('match_name', String),
        ('match_md5', String),
        ('containment', Float64),
        ('max_containment', Float64),
        ('jaccard', Float64),
        ('intersect_hashes', Float64),
        ('prob_overlap', Float64),
        ('prob_overlap_adjusted', Float64),
        ('containment_adjusted', Float64),
        ('containment_adjusted_log10', Float64),
        ('tf_idf_score', Float64)])

In [6]:
# Keep an independent copy of the file's schema so it can be adjusted (e.g. the
# commented-out `intersect_hashes` override below) without touching the original.
# `collect_schema()` is used instead of the `.schema` property, which raises a
# PerformanceWarning on LazyFrames in polars >= 1.0.
schema = df.collect_schema().copy()
# schema["intersect_hashes"] = int
schema

  schema = df.schema.copy()


Schema([('query_name', String),
        ('query_md5', String),
        ('match_name', String),
        ('match_md5', String),
        ('containment', Float64),
        ('max_containment', Float64),
        ('jaccard', Float64),
        ('intersect_hashes', Float64),
        ('prob_overlap', Float64),
        ('prob_overlap_adjusted', Float64),
        ('containment_adjusted', Float64),
        ('containment_adjusted_log10', Float64),
        ('tf_idf_score', Float64)])

## Read metadata

### Read Query Metadata

In [7]:
# Lazily scan the query-side metadata parquet from S3. `low_memory=True` asks
# polars to reduce memory pressure at the expense of performance.
# The resulting LazyFrame is passed to MultisearchParser in the processing loops.
query_metadata = pl.scan_parquet(
    "s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.pq",
    low_memory=True,
)
query_metadata


  return method()


In [8]:
# print(query_metadata.head().collect().write_csv())

### Read match metadata

In [9]:
# Lazily scan the match-side metadata parquet from S3. `low_memory=True` asks
# polars to reduce memory pressure at the expense of performance.
# The resulting LazyFrame is passed to MultisearchParser in the processing loops.
match_metadata = pl.scan_parquet(
    "s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.match_metadata.pq",
    low_memory=True,
)
match_metadata

In [15]:
# Materialize the first rows of the match metadata and print them as CSV text
# (`write_csv()` called without a target returns the CSV as a string).
print(match_metadata.head().collect().write_csv())

match_name,match_family,match_superfamily,match_fold,match_class,match_scop_id
d1x3ka_ a.1.1.0 (A:) automated matches {Tokunagayusurika akamusi [TaxId: 28383]},a.1.1.0,a.1.1,a.1,a,d1x3ka_
d1x46a_ a.1.1.0 (A:) automated matches {Tokunagayusurika akamusi [TaxId: 28383]},a.1.1.0,a.1.1,a.1,a,d1x46a_
d2bk9a_ a.1.1.0 (A:) automated matches {Fruit fly (Drosophila melanogaster) [TaxId: 7227]},a.1.1.0,a.1.1,a.1,a,d2bk9a_
d2c0ka_ a.1.1.0 (A:) automated matches {Gasterophilus intestinalis [TaxId: 84525]},a.1.1.0,a.1.1,a.1,a,d2c0ka_
d2ig3a_ a.1.1.0 (A:) automated matches {Campylobacter jejuni [TaxId: 197]},a.1.1.0,a.1.1,a.1,a,d2ig3a_



## Do HP

In [16]:
# Per-moltype settings: the k-mer sizes to process, the directory holding the
# multisearch pipeline results, and the directory for the cleaned outputs.
# Only "hp" is processed in this cell, reading and writing local copies; the S3
# equivalents are
#   s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60
#   s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp
# Protein and dayhoff are handled by a separate cell further down.
moltype_info = {
    "hp": {
        "ksizes": range(10, 20),
        "pipeline_outdir": "/home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60",
        "analysis_outdir": "/home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp",
    },
}

for moltype, info in moltype_info.items():
    ksizes = info["ksizes"]
    analysis_outdir = info["analysis_outdir"]
    pipeline_outdir = info["pipeline_outdir"]

    for ksize in ksizes:
        print(f"--- moltype: {moltype}, ksize: {ksize} --")

        # Inputs for hp are parquet files; chunk_size is raised from the
        # parser's default of 100000.
        parser = MultisearchParser(
            query_metadata=query_metadata,
            match_metadata=match_metadata,
            pipeline_outdir=pipeline_outdir,
            moltype=moltype,
            ksize=ksize,
            analysis_outdir=analysis_outdir,
            verbose=False,
            input_filetype="pq",
            chunk_size=1_000_000,
        )
        lf = parser.process_multisearch_scop_results()

# Show the result of the last (moltype, ksize) combination processed.
lf

--- moltype: dayhoff, ksize: 5 --


ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [17]:
# Leftover post-mortem debugging of the ClientError raised by the previous cell.
# NOTE(review): %debug opens an interactive ipdb prompt and waits for input, so
# this cell should be removed before a Restart & Run All.
%debug

> [0;32m/home/ec2-user/miniconda3/envs/2024-kmerseek-analysis-polars/lib/python3.12/site-packages/botocore/client.py[0m(1023)[0;36m_make_api_call[0;34m()[0m
[0;32m   1021 [0;31m            [0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1022 [0;31m            [0merror_class[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mexceptions[0m[0;34m.[0m[0mfrom_code[0m[0;34m([0m[0merror_code[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1023 [0;31m            [0;32mraise[0m [0merror_class[0m[0;34m([0m[0mparsed_response[0m[0;34m,[0m [0moperation_name[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1024 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1025 [0;31m            [0;32mreturn[0m [0mparsed_response[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  a


self = <botocore.client.S3 object at 0x7f85c00edfa0>
operation_name = 'HeadObject'
api_params = {'Bucket': 'seanome-kmerseek', 'Key': 'scope-benchmark/pipeline-outputs/2024-10-09__dayhoff_k5-20/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.dayhoff.5.multisearch.pq'}


ipdb>  q


In [2]:
# List the cleaned hp multisearch outputs written to the local analysis directory.
! ls /home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results

scope40.multisearch.hp.k10.filtered.pq	scope40.multisearch.hp.k15.filtered.pq
scope40.multisearch.hp.k11.filtered.pq	scope40.multisearch.hp.k16.filtered.pq
scope40.multisearch.hp.k12.filtered.pq	scope40.multisearch.hp.k17.filtered.pq
scope40.multisearch.hp.k13.filtered.pq	scope40.multisearch.hp.k18.filtered.pq
scope40.multisearch.hp.k14.filtered.pq	scope40.multisearch.hp.k19.filtered.pq


In [3]:
# Dry run: show what `aws s3 sync` would upload from the local hp analysis
# directory to S3, without actually copying anything.
! aws s3 sync --dryrun /home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp

(dryrun) upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k10.filtered.pq to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k10.filtered.pq
(dryrun) upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k11.filtered.pq to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k11.filtered.pq
(dryrun) upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k12.filtered.pq to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k12.filtered.pq
(dryrun) upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k13.filtered.pq to 

## Copy outputs to S3 for safekeeping

In [4]:
# Upload the local hp analysis outputs to the matching S3 prefix (the same
# command as the dry run, without --dryrun).
! aws s3 sync /home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp

upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k14.filtered.pq to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k14.filtered.pq
upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k13.filtered.pq to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k13.filtered.pq
upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k15.filtered.pq to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k15.filtered.pq
upload: ../../data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k17.filtered.pq to s3://seanome-kmerseek/scope-benchmar

## Do protein and dayhoff, so everything is in the same format

Dayhoff never finished: the k=5 multisearch file was SO HUGE (93.0 GB, per the download progress bar in the output below) that it could never be processed.

In [None]:
# Skip ksizes 5 and 6 for now because those files are enormous, 2.3 GiB for k=5 and 175 MiB for k=6
# # -> Figure out how to use polars later
# ksizes = range(7, 21)
# moltype = "protein"

moltype_info = {
    "protein": dict(
        ksizes=range(5, 21),
        pipeline_outdir="s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20",
        analysis_outdir="s3://seanome-kmerseek/scope-benchmark/analysis-outputs/protein",
    ),
    "dayhoff": dict(
        ksizes=range(5, 21),
        pipeline_outdir="s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__dayhoff_k5-20",
        analysis_outdir="s3://seanome-kmerseek/scope-benchmark/analysis-outputs/dayhoff",
    ),
    # "hp": dict(
    #     ksizes=range(10, 20),
    #     # pipeline_outdir="s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60",
    #     pipeline_outdir="/home/ec2-user/data/seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60",
    #     # analysis_outdir="s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp",
    #     analysis_outdir="/home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/hp",
    # ),
}

for moltype, info in moltype_info.items():
    ksizes = info["ksizes"]
    pipeline_outdir = info["pipeline_outdir"]
    analysis_outdir = info["analysis_outdir"]
    ! mkdir -p $analysis_outdir/00_cleaned_multisearch_results
    for ksize in ksizes:
        print(f"--- moltype: {moltype}, ksize: {ksize} --")

        # try:
        parser = MultisearchParser(
            query_metadata=query_metadata,
            match_metadata=match_metadata,
            pipeline_outdir=pipeline_outdir,
            moltype=moltype,
            ksize=ksize,
            analysis_outdir=analysis_outdir,
            verbose=False,
            input_filetype="csv",
            chunk_size=1000000,
            # schema=schema,
            # chunk_size=100000, # Default
        )
        lf = parser.process_multisearch_scop_results()
    #     break
    # break
    # lf.head().collect()
    # except FileNotFoundError:
    #     pass

    # break
lf

--- moltype: protein, ksize: 5 --


/home/ec2-user/tmp/fmokozol: 100%|██████████████████████████████████████████████████████████████████████████████████████| 2.52G/2.52G [00:09<00:00, 272MB/s]
6it [00:06,  1.03s/it]
/tmp/tmp8bqvv18d: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 86.2M/86.2M [00:00<00:00, 93.9MB/s]


	Done.
--- moltype: protein, ksize: 6 --


/home/ec2-user/tmp/79sci593: 100%|████████████████████████████████████████████████████████████████████████████████████████| 183M/183M [00:00<00:00, 223MB/s]
1it [00:00,  1.92it/s]
/tmp/tmpyk8128ts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 6.19M/6.19M [00:00<00:00, 53.3MB/s]


	Done.
--- moltype: protein, ksize: 7 --


/home/ec2-user/tmp/ervtygnz: 100%|█████████████████████████████████████████████████████████████████████████████████████| 21.7M/21.7M [00:00<00:00, 23.6MB/s]
1it [00:00,  8.48it/s]
/tmp/tmpe5lctqdz: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1.20M/1.20M [00:00<00:00, 5.46MB/s]


	Done.
--- moltype: protein, ksize: 8 --


/home/ec2-user/tmp/3tev8zs0: 100%|█████████████████████████████████████████████████████████████████████████████████████| 8.79M/8.79M [00:00<00:00, 37.9MB/s]
1it [00:00, 13.61it/s]
/tmp/tmp3m71pc1p: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 470k/470k [00:00<00:00, 5.70MB/s]


	Done.
--- moltype: protein, ksize: 9 --


/home/ec2-user/tmp/mqn4d1zx: 100%|█████████████████████████████████████████████████████████████████████████████████████| 6.90M/6.90M [00:00<00:00, 11.7MB/s]
1it [00:00,  9.36it/s]
/tmp/tmpfo_ylyso: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 249k/249k [00:00<00:00, 1.70MB/s]


	Done.
--- moltype: protein, ksize: 10 --


/home/ec2-user/tmp/1d1vvqzh: 100%|█████████████████████████████████████████████████████████████████████████████████████| 6.26M/6.26M [00:00<00:00, 19.0MB/s]
1it [00:00, 12.57it/s]
/tmp/tmprw20j89f: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 164k/164k [00:00<00:00, 1.05MB/s]


	Done.
--- moltype: protein, ksize: 11 --


/home/ec2-user/tmp/eprq4dm9: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.97M/5.97M [00:00<00:00, 21.1MB/s]
1it [00:00, 13.89it/s]
/tmp/tmpqet597y6: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 116k/116k [00:00<00:00, 850kB/s]


	Done.
--- moltype: protein, ksize: 12 --


/home/ec2-user/tmp/ar_d4tf1: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.81M/5.81M [00:00<00:00, 12.1MB/s]
1it [00:00,  9.93it/s]
/tmp/tmpl6a288t0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 92.7k/92.7k [00:00<00:00, 953kB/s]


	Done.
--- moltype: protein, ksize: 13 --


/home/ec2-user/tmp/u6yzxgnx: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.73M/5.73M [00:00<00:00, 22.6MB/s]
1it [00:00,  7.95it/s]
/tmp/tmp1011dhjj: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 77.9k/77.9k [00:00<00:00, 871kB/s]


	Done.
--- moltype: protein, ksize: 14 --


/home/ec2-user/tmp/awblsl6y: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.70M/5.70M [00:00<00:00, 16.8MB/s]
1it [00:00, 15.30it/s]
/tmp/tmp2lcxybhe: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 67.3k/67.3k [00:00<00:00, 557kB/s]


	Done.
--- moltype: protein, ksize: 15 --


/home/ec2-user/tmp/fybed5f2: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.67M/5.67M [00:00<00:00, 11.1MB/s]
1it [00:00, 14.80it/s]
/tmp/tmpdaoba41d: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 61.0k/61.0k [00:00<00:00, 454kB/s]


	Done.
--- moltype: protein, ksize: 16 --


/home/ec2-user/tmp/71ymv0cc: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.65M/5.65M [00:00<00:00, 18.7MB/s]
1it [00:00,  8.14it/s]
/tmp/tmpnvxd3op7: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 59.0k/59.0k [00:00<00:00, 681kB/s]


	Done.
--- moltype: protein, ksize: 17 --


/home/ec2-user/tmp/lwpbkaj5: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.65M/5.65M [00:00<00:00, 18.4MB/s]
1it [00:00, 17.12it/s]
/tmp/tmpo3a_zs3e: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 56.5k/56.5k [00:00<00:00, 484kB/s]


	Done.
--- moltype: protein, ksize: 18 --


/home/ec2-user/tmp/lay3xyds: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.64M/5.64M [00:00<00:00, 17.1MB/s]
1it [00:00, 16.20it/s]
/tmp/tmpw7mz8aa1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 54.4k/54.4k [00:00<00:00, 770kB/s]


	Done.
--- moltype: protein, ksize: 19 --


/home/ec2-user/tmp/gzflw26p: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.64M/5.64M [00:00<00:00, 9.26MB/s]
1it [00:00, 21.23it/s]
/tmp/tmpzh8j2dj8: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 53.5k/53.5k [00:00<00:00, 681kB/s]


	Done.
--- moltype: protein, ksize: 20 --


/home/ec2-user/tmp/zlu3heq2: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5.64M/5.64M [00:00<00:00, 43.8MB/s]
1it [00:00, 15.76it/s]
/tmp/tmpvoohm9u4: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 51.7k/51.7k [00:00<00:00, 462kB/s]


	Done.
--- moltype: dayhoff, ksize: 5 --


/home/ec2-user/tmp/80wh67qc:  10%|████████▏                                                                          | 9.12G/93.0G [04:50<6:43:47, 3.46MB/s]