# Imports and setup

### Automatically reload Python modules (useful while editing local files)

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to the local .py files imported below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [37]:
from typing import get_args

import polars as pl

# Local modules
import process_scop_sourmash_multisearch_polars
from scop_constants import SCOP_LINEAGES

In [76]:
# Show the installed polars version so the run's environment is recorded.
pl.__version__

'1.9.0'

# Read in data

In [6]:
# List the run directories available under the pipeline-outputs prefix.
! aws s3 ls s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/

                           PRE 2024-10-08__dayhoff_k5-20/
                           PRE 2024-10-08__protein_k5-20/
                           PRE 2024-10-09__dayhoff_k5-20/
                           PRE 2024-10-09__hp_k20-60/
                           PRE dayhoff_k5-20/
                           PRE hp_k10-60/
                           PRE protein/
                           PRE protein_k5-20/


In [8]:
# Inspect the top-level layout of one pipeline run (the undated protein_k5-20 prefix).
! aws s3 ls s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/protein_k5-20/

                           PRE multiqc/
                           PRE pipeline_info/
                           PRE seqkit/
                           PRE sourmash/


In [4]:
# S3 prefix of the pipeline run analysed in this notebook. It is reused below
# to build the multisearch CSV path and the analysis output prefix, and is
# passed to the parser as `pipeline_outdir`.
outdir = (
    "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20"
)
# IPython substitutes the Python variable for `$outdir` before running the shell command.
! aws s3 ls $outdir/

                           PRE multiqc/
                           PRE pipeline_info/
                           PRE seqkit/
                           PRE sourmash/


In [5]:
# List the sourmash sub-directories of the selected run.
! aws s3 ls $outdir/sourmash/

                           PRE multisearch/
                           PRE sigs/


In [15]:
# Mirror the pipeline run's prefix under "analysis-outputs".
# Only the "/pipeline-outputs/" path component is swapped (and only once), so
# a bucket or run-directory name that happens to contain "pipeline" is left
# untouched.
analysis_outdir = outdir.replace("/pipeline-outputs/", "/analysis-outputs/", 1)
# Guard: if the expected component is missing, fail loudly instead of silently
# pointing analysis output at the pipeline directory itself.
assert analysis_outdir != outdir, f"'/pipeline-outputs/' not found in {outdir!r}"
analysis_outdir

's3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-08__protein_k5-20'

In [16]:
import polars as pl

In [17]:
# Lazily scan the protein k=10 multisearch CSV for this run; nothing is read
# from S3 until the LazyFrame is collected.
multisearch = pl.scan_csv(
    source=f"{outdir}/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.10.multisearch.csv",
)
multisearch


  return method()


## Test the pipeline

In [52]:
# Lazily scan the per-query SCOP metadata parquet (schema is shown in the next cell).
query_metadata = pl.scan_parquet(
    source="s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.pq",
)
query_metadata

In [53]:
# Resolve the schema explicitly: `LazyFrame.schema` emits a PerformanceWarning
# in polars 1.x, while `collect_schema()` returns the same Schema without it.
query_metadata.collect_schema()

  query_metadata.schema


Schema([('query_name', String),
        ('query_family', Categorical(ordering='physical')),
        ('query_superfamily', Categorical(ordering='physical')),
        ('query_fold', Categorical(ordering='physical')),
        ('query_class', Categorical(ordering='physical')),
        ('n_family', Int64),
        ('n_superfamily', Int64),
        ('n_fold', Int64),
        ('n_class', Int64),
        ('query_scop_id', String)])

### Load match metadata and join it to the multisearch results

In [54]:
# Lazily scan the per-match SCOP metadata parquet, the counterpart of `query_metadata`.
match_metadata = pl.scan_parquet(
    source="s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.match_metadata.pq",
)
match_metadata

In [55]:
# Attach SCOP lineage metadata to each multisearch hit: first for the query
# sequence, then for the matched sequence. Both joins are inner joins (the
# polars default, spelled out here), so hits lacking metadata are dropped.
multisearch_metadata = (
    multisearch
    .join(query_metadata, on="query_name", how="inner")
    .join(match_metadata, on="match_name", how="inner")
)
multisearch_metadata

In [56]:
# Unpack the type arguments of SCOP_LINEAGES into a tuple of lineage levels.
# NOTE(review): SCOP_LINEAGES is defined in scop_constants (not shown here);
# presumably a typing.Literal. The same_* columns produced below suggest the
# levels are family, superfamily, fold and class.
check_same_cols = get_args(SCOP_LINEAGES)

In [57]:
# For every lineage level, add a boolean `same_<level>` column that is True
# when the query and the match share that level. All flags are added in one
# projection; none of them depends on another.
multisearch_metadata = multisearch_metadata.with_columns(
    [
        (pl.col(f"query_{level}") == pl.col(f"match_{level}")).alias(f"same_{level}")
        for level in check_same_cols
    ]
)
multisearch_metadata

In [58]:
# List column names via the resolved schema: `LazyFrame.columns` emits a
# PerformanceWarning in polars 1.x, while `collect_schema().names()` returns
# the same list without it.
multisearch_metadata.collect_schema().names()

  multisearch_metadata.columns


['query_name',
 'query_md5',
 'match_name',
 'match_md5',
 'containment',
 'max_containment',
 'jaccard',
 'intersect_hashes',
 'prob_overlap',
 'prob_overlap_adjusted',
 'containment_adjusted',
 'containment_adjusted_log10',
 'tf_idf_score',
 'query_family',
 'query_superfamily',
 'query_fold',
 'query_class',
 'n_family',
 'n_superfamily',
 'n_fold',
 'n_class',
 'query_scop_id',
 'match_family',
 'match_superfamily',
 'match_fold',
 'match_class',
 'match_scop_id',
 'same_family',
 'same_superfamily',
 'same_fold',
 'same_class']

In [59]:
# Count rows by collecting a single `pl.len()` value and extracting it as a Python scalar.
multisearch_metadata.select(pl.len()).collect().item()

16725

In [45]:
# Number of columns via the resolved schema: `LazyFrame.width` emits a
# PerformanceWarning in polars 1.x, while `collect_schema().len()` returns
# the same count without it.
multisearch_metadata.collect_schema().len()

  multisearch_metadata.width


31

In [46]:
# Resolve the joined frame's schema explicitly: `LazyFrame.schema` emits a
# PerformanceWarning in polars 1.x, while `collect_schema()` returns the same
# Schema without it.
multisearch_metadata.collect_schema()

  multisearch_metadata.schema


Schema([('query_name', String),
        ('query_md5', String),
        ('match_name', String),
        ('match_md5', String),
        ('containment', Float64),
        ('max_containment', Float64),
        ('jaccard', Float64),
        ('intersect_hashes', Float64),
        ('prob_overlap', Float64),
        ('prob_overlap_adjusted', Float64),
        ('containment_adjusted', Float64),
        ('containment_adjusted_log10', Float64),
        ('tf_idf_score', Float64),
        ('query_family', Categorical(ordering='physical')),
        ('query_superfamily', Categorical(ordering='physical')),
        ('query_fold', Categorical(ordering='physical')),
        ('query_class', Categorical(ordering='physical')),
        ('n_family', Int64),
        ('n_superfamily', Int64),
        ('n_fold', Int64),
        ('n_class', Int64),
        ('query_scop_id', String),
        ('match_family', Categorical(ordering='physical')),
        ('match_superfamily', Categorical(ordering='physical')),
       

## Test end-to-end parser run

In [82]:
# End-to-end run of the project's MultisearchParser for protein, k=10, using
# the lazily scanned query/match metadata loaded above.
# Per the verbose log below, this reads the multisearch CSV under `outdir` and
# writes a cleaned and a filtered parquet file under `analysis_outdir`, so
# re-running this cell writes to S3.
parser = process_scop_sourmash_multisearch_polars.MultisearchParser(
    query_metadata=query_metadata,
    match_metadata=match_metadata,
    pipeline_outdir=outdir,
    moltype="protein",
    ksize=10,
    analysis_outdir=analysis_outdir,
    verbose=True,
)
# NOTE(review): the result is used as a lazy frame (`.head().collect()`);
# confirm the return type in process_scop_sourmash_multisearch_polars.
lf = parser.process_multisearch_scop_results()
lf.head().collect()



--- moltype: protein, ksize: 10 --

Reading s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.protein.10.multisearch.csv ...
	Done

Writing 16725 rows and 33 columns to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-08__protein_k5-20/00_cleaned_multisearch_results/scope40.multisearch.protein.k10.pq ...


  f"\nWriting {df.select(pl.len()).collect().item()} rows and {len(df.columns)} columns to {pq} ..."


	Done.

Writing 832 rows and 33 columns to s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-08__protein_k5-20/00_cleaned_multisearch_results/scope40.multisearch.protein.k10.filtered.pq ...
	Done.


query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes,prob_overlap,prob_overlap_adjusted,containment_adjusted,containment_adjusted_log10,tf_idf_score,query_family,query_superfamily,query_fold,query_class,n_family,n_superfamily,n_fold,n_class,query_scop_id,match_family,match_superfamily,match_fold,match_class,match_scop_id,same_family,same_superfamily,same_fold,same_class,ksize,moltype
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,cat,cat,cat,cat,i64,i64,i64,i64,str,cat,cat,cat,cat,str,bool,bool,bool,bool,i32,str
"""d1fp3a_ a.102.1.3 (A:) N-acyl-…","""8358aea911022b5d3d233702fff0a4…","""d2gz6a_ a.102.1.0 (A:) automat…","""aada4480cca78dcc9ea2f35d1fa6f2…",0.010178,0.01061,0.005222,4.0,2.247e-12,0.000518,19.664533,1.293684,0.096987,"""a.102.1.3""","""a.102.1""","""a.102""","""a""",2,28,55,2644,"""d1fp3a_""","""a.102.1.0""","""a.102.1""","""a.102""","""a""","""d2gz6a_""",False,True,True,True,10,"""protein"""
"""d1ks8a1 a.102.1.2 (A:2-433) En…","""7eb43ac76d78fa95e5f567cad82cbc…","""d1ia6a_ a.102.1.2 (A:) Nonproc…","""fd71810b383181e09539c279a3a540…",0.004728,0.004739,0.002372,2.0,1.1235e-12,0.000259,18.269885,1.261736,0.045054,"""a.102.1.2""","""a.102.1""","""a.102""","""a""",10,28,55,2644,"""d1ks8a1""","""a.102.1.2""","""a.102.1""","""a.102""","""a""","""d1ia6a_""",True,True,True,True,10,"""protein"""
"""d1ia6a_ a.102.1.2 (A:) Nonproc…","""fd71810b383181e09539c279a3a540…","""d1ks8a1 a.102.1.2 (A:2-433) En…","""7eb43ac76d78fa95e5f567cad82cbc…",0.004739,0.004739,0.002372,2.0,1.1235e-12,0.000259,18.313179,1.262764,0.045161,"""a.102.1.2""","""a.102.1""","""a.102""","""a""",10,28,55,2644,"""d1ia6a_""","""a.102.1.2""","""a.102.1""","""a.102""","""a""","""d1ks8a1""",True,True,True,True,10,"""protein"""
"""d1xwta1 a.102.1.2 (A:1-404) En…","""3f99b13f5f5a92491e7c5285e5bac6…","""d1wu4a1 a.102.1.2 (A:6-381) Xy…","""70d077485c5d0efa9f4372c21db317…",0.005063,0.00545,0.002632,2.0,1.1235e-12,0.000259,19.564966,1.291479,0.048248,"""a.102.1.2""","""a.102.1""","""a.102""","""a""",10,28,55,2644,"""d1xwta1""","""a.102.1.2""","""a.102.1""","""a.102""","""a""","""d1wu4a1""",True,True,True,True,10,"""protein"""
"""d1wu4a1 a.102.1.2 (A:6-381) Xy…","""70d077485c5d0efa9f4372c21db317…","""d1xwta1 a.102.1.2 (A:1-404) En…","""3f99b13f5f5a92491e7c5285e5bac6…",0.00545,0.00545,0.002632,2.0,1.1235e-12,0.000259,21.057661,1.32341,0.051929,"""a.102.1.2""","""a.102.1""","""a.102""","""a""",10,28,55,2644,"""d1wu4a1""","""a.102.1.2""","""a.102.1""","""a.102""","""a""","""d1xwta1""",True,True,True,True,10,"""protein"""


In [78]:
# Confirm that the filtered parquet written by the parser run exists on S3.
! aws s3 ls s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-08__protein_k5-20/00_cleaned_multisearch_results/scope40.multisearch.protein.k10.filtered.pq

2024-10-18 18:29:36     149184 scope40.multisearch.protein.k10.filtered.pq


In [71]:
# NOTE(review): leftover post-mortem debugging session. `%debug` opens an
# interactive ipdb prompt, which stalls a non-interactive "Restart & Run All";
# remove this cell before sharing the notebook.
%debug

> [0;32m<frozen posixpath>[0m(259)[0;36mexpanduser[0;34m()[0m



ipdb>  ll


*** could not get source code


ipdb>  u


> [0;32m/home/ec2-user/miniconda3/envs/2024-kmerseek-analysis-polars/lib/python3.12/site-packages/polars/_utils/various.py[0m(220)[0;36mnormalize_filepath[0;34m()[0m
[0;32m    218 [0;31m    [0;34m"""Create a string path, expanding the home directory if present."""[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    219 [0;31m    [0;31m# don't use pathlib here as it modifies slashes (s3:// -> s3:/)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 220 [0;31m    [0mpath[0m [0;34m=[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mexpanduser[0m[0;34m([0m[0mpath[0m[0;34m)[0m  [0;31m# noqa: PTH111[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    221 [0;31m    if (
[0m[0;32m    222 [0;31m        [0mcheck_not_directory[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  ll


[1;32m    217 [0m[0;32mdef[0m [0mnormalize_filepath[0m[0;34m([0m[0mpath[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0mPath[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mcheck_not_directory[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    218 [0m    [0;34m"""Create a string path, expanding the home directory if present."""[0m[0;34m[0m[0;34m[0m[0m
[1;32m    219 [0m    [0;31m# don't use pathlib here as it modifies slashes (s3:// -> s3:/)[0m[0;34m[0m[0;34m[0m[0m
[0;32m--> 220 [0;31m    [0mpath[0m [0;34m=[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mexpanduser[0m[0;34m([0m[0mpath[0m[0;34m)[0m  [0;31m# noqa: PTH111[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    221 [0m    if (
[1;32m    222 [0m        [0mcheck_not_directory[0m[0;34m[0m[0;34m[0m[0m
[1;32m    223 [0m        [0;32mand[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mexists

ipdb>  os.path.expanduser('s3://seanome-kmerseek/scope-benchmark/analysis-outputs/')


's3://seanome-kmerseek/scope-benchmark/analysis-outputs/'


ipdb>  q


In [18]:
# Select only the query-name column from the raw multisearch results.
# NOTE(review): despite the name, this is a one-column LazyFrame, not a
# Series; it is materialised with `.collect()` in the next cell.
name_series = multisearch.select("query_name")
name_series

In [20]:
# Materialise the distinct query names found in the multisearch results.
name_series.unique().collect()

query_name
str
"""d1fnfa2 b.1.2.1 (A:1236-1326) …"
"""d6jbra_ c.87.1.0 (A:) automate…"
"""d1yuda1 b.82.1.16 (A:1-158) Hy…"
"""d3equa1 d.175.1.0 (A:63-237) a…"
"""d1wdkc1 c.95.1.1 (C:2-263) Fat…"
…
"""d2zkmx3 b.55.1.1 (X:11-141) Ph…"
"""d1svia_ c.37.1.8 (A:) Probable…"
"""d1td6a_ a.234.1.1 (A:) Hypothe…"
"""d2wy4a_ a.1.1.0 (A:) automated…"
