# Imports and setup

### Automatically re-import Python modules (useful when editing local files)

In [2]:
# Load IPython's autoreload extension and use mode 2, so imported modules are
# reloaded before each cell executes and local edits are picked up without a restart.
%load_ext autoreload
%autoreload 2

## Imports

In [3]:
import itertools
import warnings
from collections import Counter

import matplotlib as mpl
import numpy as np
import pandas as pd
import polars as pl
import s3fs
import seaborn as sns
from tqdm import tqdm

# Let pandas display up to 100 columns before truncating wide frames.
pd.set_option("display.max_columns", 100)

# A limit of 0 turns off matplotlib's "too many open figures" warning.
mpl.rcParams["figure.max_open_warning"] = 0

In [4]:
# This notebook was run against one specific polars release; fail early, with a
# readable message, when the environment differs.
EXPECTED_POLARS_VERSION = "1.9.0"
assert (
    pl.__version__ == EXPECTED_POLARS_VERSION
), f"Expected polars {EXPECTED_POLARS_VERSION}, found {pl.__version__}"

# Read in data

In [5]:
# Names of the lineage levels (presumably the SCOP hierarchy, most specific first -- TODO confirm).
lineage_cols = [
    "family",
    "superfamily",
    "fold",
    "class",
]

# Iterate over all protein benchmarks, randomly sampling 1000 queries to compare across moltypes and ksizes

### Read one example file to get query names

In [6]:
# One representative result file (hp, k=40) is used only as the source of query names.
pq = "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__hp_k20-60/00_cleaned_multisearch_results/scope40.multisearch.hp.k40.pq"
multisearch = pl.read_parquet(pq)

# `unique()` does not promise a stable ordering, so a fixed seed alone does not
# give the same sample on every run. Sorting first makes the sample reproducible.
randomly_chosen_queries = (
    multisearch["query_name"].unique().sort().sample(1000, seed=0)
)
randomly_chosen_queries


  multisearch = pl.read_parquet(pq)


query_name
str
"""d3mgka_ c.23.16.0 (A:) automat…"
"""d5aooc_ b.121.4.0 (C:) automat…"
"""d5jgya_ c.1.7.0 (A:) automated…"
"""d1mw7a_ e.39.1.1 (A:) Hypothet…"
"""d2nvwa2 d.81.1.5 (A:155-373) G…"
…
"""d1jpdx2 d.54.1.1 (X:-2-113) L-…"
"""d1h3za_ b.34.9.2 (A:) Hypothet…"
"""d6oa6a1 a.40.1.0 (A:47-160) au…"
"""d4dnda_ a.47.2.1 (A:) automate…"


In [7]:
def make_multisearch_pq(
    analysis_outdir,
    moltype,
    ksize,
):
    """Build the path of the cleaned multisearch parquet for one moltype/ksize.

    Args:
        analysis_outdir: Root of the analysis outputs (no trailing slash expected).
        moltype: Molecule type label embedded in the file name.
        ksize: k-mer size embedded in the file name.

    Returns:
        ``<analysis_outdir>/00_cleaned_multisearch_results/scope40.multisearch.<moltype>.k<ksize>.pq``
    """
    return (
        f"{analysis_outdir}/00_cleaned_multisearch_results/"
        f"scope40.multisearch.{moltype}.k{ksize}.pq"
    )


# Per-moltype benchmark settings: the k-mer sizes to iterate over and the S3
# prefixes holding the pipeline and analysis outputs for that moltype.
moltype_info = {
    "protein": {
        "ksizes": range(5, 21),
        "pipeline_outdir": "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-08__protein_k5-20",
        "analysis_outdir": "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__protein_k5-20",
    },
    "dayhoff": {
        "ksizes": range(5, 21),
        "pipeline_outdir": "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__dayhoff_k5-20",
        "analysis_outdir": "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__dayhoff_k5-20",
    },
    "hp": {
        "ksizes": range(20, 61),
        "pipeline_outdir": "s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60",
        "analysis_outdir": "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__hp_k20-60",
    },
}


def add_log10_col(df, col):
    """Return ``df`` with an extra ``<col>_log10`` column holding log10 of ``col``.

    Args:
        df: polars frame containing the column ``col``.
        col: Name of the column to transform.
    """
    log10_expr = pl.col(col).log10().alias(f"{col}_log10")
    return df.with_columns(log10_expr)


def process_multisearch_results(analysis_outidr, moltype, ksize):
    """Load one multisearch result, keep the sampled queries, and melt it to long form.

    Args:
        analysis_outidr: Root of the analysis outputs for this moltype. The
            parameter name is misspelled (sic) and kept as-is so existing
            callers are unaffected.
        moltype: Molecule type label; also stored in the ``moltype`` column.
        ksize: k-mer size; also stored in the ``ksize`` column.

    Returns:
        The long-form frame produced by ``melt_sourmash_scop`` with constant
        ``ksize`` and ``moltype`` columns appended.
    """
    # Use the argument that was passed in. The body previously read a global
    # named `analysis_outdir`, which only worked when the caller happened to
    # define that exact global before calling.
    pq = make_multisearch_pq(analysis_outidr, moltype, ksize)

    # Lazy scan so the query filter is part of the read plan rather than
    # being applied after loading the whole file.
    df = (
        pl.scan_parquet(pq)
        .filter(pl.col("query_name").is_in(randomly_chosen_queries))
        .collect()
    )

    # Add a `<col>_log10` column for each of these score columns.
    for score_col in (
        "prob_overlap_adjusted",
        "containment",
        "max_containment",
        "tf_idf_score",
        "jaccard",
    ):
        df = add_log10_col(df, score_col)

    melted = melt_sourmash_scop(df)

    # Tag every row with the parameters that produced it so results from
    # different files can be concatenated and still told apart.
    return melted.with_columns(
        pl.lit(ksize).alias("ksize"),
        pl.lit(moltype).alias("moltype"),
    )


def melt_sourmash_scop(
    df,
    same_lineage_cols=["same_family", "same_superfamily", "same_fold", "same_class"],
    sourmash_score_cols=[
        "containment_log10",
        "max_containment_log10",
        "jaccard_log10",
        "intersect_hashes",
        "containment_adjusted_log10",
        "tf_idf_score_log10",
        "prob_overlap_adjusted_log10",
    ],
):
    """Reshape a wide multisearch frame into long form over scores and lineage levels.

    The score columns and the ``same_*`` lineage columns are unpivoted
    separately, each keyed by ``query_name`` and ``match_name``, and the two
    long frames are then joined on those keys. Every query/match pair therefore
    appears once per (score, lineage level) combination.

    Args:
        df: polars frame with ``query_name``, ``match_name`` and the listed columns.
        same_lineage_cols: Columns unpivoted into ``scop_level`` / ``is_same``.
        sourmash_score_cols: Columns unpivoted into ``sourmash_score`` / ``score_value``.

    Returns:
        Joined long frame with the score columns first, then the lineage columns.
    """
    pair_keys = ["query_name", "match_name"]

    scores_long = df.unpivot(
        index=pair_keys,
        on=sourmash_score_cols,
        variable_name="sourmash_score",
        value_name="score_value",
    )

    lineage_long = df.unpivot(
        index=pair_keys,
        on=same_lineage_cols,
        variable_name="scop_level",
        value_name="is_same",
    ).with_columns(
        # Keep only the text after the last underscore, e.g. "same_family" -> "family".
        pl.col("scop_level").str.split("_").list.last().alias("scop_level")
    )

    return scores_long.join(lineage_long, on=pair_keys)


# Collect the melted results for every (moltype, ksize) combination.
dfs = []
# Combinations whose file could not be processed, kept so gaps in the result are visible.
skipped = []

for moltype, info in moltype_info.items():
    ksizes = info["ksizes"]
    analysis_outdir = info["analysis_outdir"]
    print(f"moltype: {moltype}")
    for ksize in ksizes:
        try:
            df = process_multisearch_results(analysis_outdir, moltype, ksize)
        except pl.exceptions.ComputeError as error:
            # Best-effort: keep going, but report what was dropped instead of
            # silently producing a table with missing moltype/ksize combinations.
            skipped.append((moltype, ksize))
            warnings.warn(f"Skipping moltype={moltype} ksize={ksize}: {error}")
            continue
        dfs.append(df)

moltype: protein
moltype: dayhoff
moltype: hp


In [8]:
# Stack the per-moltype/per-ksize frames row-wise into one long table and show it.
multisearch_moltype_ksize = pl.concat(dfs, how="vertical")
multisearch_moltype_ksize

query_name,match_name,sourmash_score,score_value,scop_level,is_same,ksize,moltype
str,str,str,f64,str,bool,i32,str
"""d2cqka1 a.4.5.46 (A:43-130) La…","""d4j42a_ a.25.3.0 (A:) automate…","""containment_log10""",-1.924279,"""family""",false,5,"""protein"""
"""d2cqka1 a.4.5.46 (A:43-130) La…","""d4j42a_ a.25.3.0 (A:) automate…","""containment_log10""",-1.924279,"""superfamily""",false,5,"""protein"""
"""d2cqka1 a.4.5.46 (A:43-130) La…","""d4j42a_ a.25.3.0 (A:) automate…","""containment_log10""",-1.924279,"""fold""",false,5,"""protein"""
"""d2cqka1 a.4.5.46 (A:43-130) La…","""d4j42a_ a.25.3.0 (A:) automate…","""containment_log10""",-1.924279,"""class""",true,5,"""protein"""
"""d6vzda_ a.64.1.0 (A:) automate…","""d4j42a_ a.25.3.0 (A:) automate…","""containment_log10""",-1.875061,"""family""",false,5,"""protein"""
…,…,…,…,…,…,…,…
"""d2q6ka1 c.132.1.0 (A:1-163) au…","""d2q6ka1 c.132.1.0 (A:1-163) au…","""prob_overlap_adjusted_log10""",-2.187956,"""class""",true,60,"""hp"""
"""d2wcua_ c.133.1.0 (A:) automat…","""d2wcua_ c.133.1.0 (A:) automat…","""prob_overlap_adjusted_log10""",-2.250747,"""family""",true,60,"""hp"""
"""d2wcua_ c.133.1.0 (A:) automat…","""d2wcua_ c.133.1.0 (A:) automat…","""prob_overlap_adjusted_log10""",-2.250747,"""superfamily""",true,60,"""hp"""
"""d2wcua_ c.133.1.0 (A:) automat…","""d2wcua_ c.133.1.0 (A:) automat…","""prob_overlap_adjusted_log10""",-2.250747,"""fold""",true,60,"""hp"""


In [16]:
# Write the combined, subsampled table to S3 as a single parquet file.
# `s3fs` is imported with the rest of the notebook's imports in the top cell.
fs = s3fs.S3FileSystem()

pq = "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-11__subsample_1000_queries/multisearch.subsampled.1000.protein.dayhoff.hp.pq"

with fs.open(pq, mode="wb") as f:
    multisearch_moltype_ksize.write_parquet(f)

  multisearch_moltype_ksize.write_parquet(f)
