# Imports and setup

### Auto-re-import python modules, useful for editing local files

In [1]:
# Mode 2 reloads every imported module before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import itertools
import warnings
from collections import Counter

import matplotlib as mpl
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from tqdm import tqdm

# Show up to 100 columns when rendering wide pandas DataFrames.
pd.set_option("display.max_columns", 100)

# A threshold below one disables matplotlib's "too many open figures" warning.
mpl.rcParams["figure.max_open_warning"] = 0

In [3]:
# Fail fast if the installed polars is not the exact version pinned here, and
# report which version was found so the mismatch is easy to diagnose.
assert pl.__version__ == "1.9.0", (
    f"Expected polars 1.9.0 but found {pl.__version__}"
)

# Read in data

## Read in filtered multisearch results

In [4]:
# S3 path of the multisearch results; the path places the file under
# "00_cleaned_multisearch_results" and the file name marks it as ".filtered".
pq = "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__hp_k20-60/00_cleaned_multisearch_results/scope40.multisearch.hp.k20.filtered.pq"
# Read the whole parquet file into a polars DataFrame.
multisearch = pl.read_parquet(pq)


  multisearch = pl.read_parquet(pq)


### Set SCOP lineage column names

In [5]:
# SCOP lineage levels, listed from the most specific (family) to the broadest (class).
lineage_cols = ["family", "superfamily", "fold", "class"]

# Column names holding each level for the query and for the match.
query_scop_cols = ["query_" + level for level in lineage_cols]
match_scop_cols = ["match_" + level for level in lineage_cols]

# Column names of the per-level "same_<level>" columns.
same_scop_cols = ["same_" + level for level in lineage_cols]

### read query metadata

In [21]:
# Per-query metadata table. The preview below shows the query lineage columns,
# the n_<level> columns (presumably group sizes per lineage level -- confirm)
# and `query_scop_id`.
query_metadata = pl.read_parquet(
    "s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.pq",
    # index_col=0,  # leftover (pandas-style) argument, intentionally disabled
)
query_metadata.head()

query_family,query_superfamily,query_fold,query_class,n_family,n_superfamily,n_fold,n_class,query_scop_id
cat,cat,cat,cat,i64,i64,i64,i64,str
"""d.104.1.1""","""d.104.1""","""d.104""","""d""",15,29,29,3653,"""d12asa_"""
"""d.180.1.1""","""d.180.1""","""d.180""","""d""",1,1,1,3653,"""d16vpa_"""
"""d.49.1.1""","""d.49.1""","""d.49""","""d""",2,3,3,3653,"""d1914a1"""
"""d.49.1.1""","""d.49.1""","""d.49""","""d""",2,3,3,3653,"""d1914a2"""
"""a.4.6.2""","""a.4.6""","""a.4""","""a""",5,30,425,2644,"""d1a04a1"""


In [23]:
# Attach the n_<level> columns from the query metadata to every search hit by
# matching on the query id together with its lineage columns.
multisearch_with_n_groups = multisearch.join(
    query_metadata,
    on=["query_scop_id", *query_scop_cols],
    how="inner",
)
multisearch_with_n_groups

query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes,prob_overlap,prob_overlap_adjusted,containment_adjusted,containment_adjusted_log10,tf_idf_score,query_scop_id,query_scop_lineage,query_scop_lineage_fixed,query_family,query_superfamily,query_fold,query_class,match_scop_id,match_scop_lineage,match_scop_lineage_fixed,match_family,match_superfamily,match_fold,match_class,same_family,same_superfamily,same_fold,same_class,__index_level_0__,n_family,n_superfamily,n_fold,n_class
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,cat,cat,cat,cat,str,str,str,cat,cat,cat,cat,bool,bool,bool,bool,i64,i64,i64,i64,i64
"""d2fcwa1 a.13.1.1 (A:216-320) a…","""e6ac707cdd74e7c0bdffe2ec104b1c…","""d4j42a_ a.25.3.0 (A:) automate…","""429b50d612d737b387bab434a8ff53…",0.034884,0.04918,0.020833,3.0,1.7368e-11,0.004001,8.71974,0.940504,0.303089,"""d2fcwa1""","""a.13.1.1""","""a.13.1.1""","""a.13.1.1""","""a.13.1""","""a.13""","""a""","""d4j42a_""","""a.25.3.0""","""a.25.3.0""","""a.25.3.0""","""a.25.3""","""a.25""","""a""",false,false,false,true,2,2,2,2,2644
"""d6d5xa_ a.25.2.0 (A:) automate…","""ad230364a72799163d9e8eb34d3f41…","""d4j42a_ a.25.3.0 (A:) automate…","""429b50d612d737b387bab434a8ff53…",0.028169,0.065574,0.020101,4.0,1.9894e-11,0.004582,6.14716,0.788675,0.247069,"""d6d5xa_""","""a.25.2.0""","""a.25.2.0""","""a.25.2.0""","""a.25.2""","""a.25""","""a""","""d4j42a_""","""a.25.3.0""","""a.25.3.0""","""a.25.3.0""","""a.25.3""","""a.25""","""a""",false,false,true,true,3,4,6,69,2644
"""d4dlla2 a.100.1.0 (A:191-316) …","""892f25011eb07e4a98cf99b6a5785b…","""d4j42a_ a.25.3.0 (A:) automate…","""429b50d612d737b387bab434a8ff53…",0.018692,0.032787,0.012048,2.0,7.1051e-12,0.001637,11.421079,1.057707,0.167505,"""d4dlla2""","""a.100.1.0""","""a.100.1.0""","""a.100.1.0""","""a.100.1""","""a.100""","""a""","""d4j42a_""","""a.25.3.0""","""a.25.3.0""","""a.25.3.0""","""a.25.3""","""a.25""","""a""",false,false,false,true,8,18,40,40,2644
"""d1n4ka1 a.118.22.1 (A:436-602)…","""5a73d8ef1d497161dda842a31cb621…","""d4j42a_ a.25.3.0 (A:) automate…","""429b50d612d737b387bab434a8ff53…",0.027027,0.065574,0.019512,4.0,2.1315e-11,0.00491,5.504754,0.740738,0.235821,"""d1n4ka1""","""a.118.22.1""","""a.118.22.1""","""a.118.22.1""","""a.118.22""","""a.118""","""a""","""d4j42a_""","""a.25.3.0""","""a.25.3.0""","""a.25.3.0""","""a.25.3""","""a.25""","""a""",false,false,false,true,12,1,1,179,2644
"""d6s7ja_ a.127.1.0 (A:) automat…","""63a7a4cd3cb81a25bf7cd92c5674fc…","""d4j42a_ a.25.3.0 (A:) automate…","""429b50d612d737b387bab434a8ff53…",0.004184,0.032787,0.003724,2.0,8.2103e-12,0.001891,2.212443,0.344872,0.037029,"""d6s7ja_""","""a.127.1.0""","""a.127.1.0""","""a.127.1.0""","""a.127.1""","""a.127""","""a""","""d4j42a_""","""a.25.3.0""","""a.25.3.0""","""a.25.3.0""","""a.25.3""","""a.25""","""a""",false,false,false,true,15,5,13,13,2644
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""d3idaa2 b.18.1.0 (A:352-574) a…","""9674e7d8c01c18bf970a5ae3b70cd3…","""d3ryca1 c.32.1.1 (A:1-245) aut…","""e070d6043f22b2f99edae145e20eab…",0.009804,0.009804,0.004673,2.0,2.0526e-12,0.000473,20.736235,1.31673,0.092011,"""d3idaa2""","""b.18.1.0""","""b.18.1.0""","""b.18.1.0""","""b.18.1""","""b.18""","""b""","""d3ryca1""","""c.32.1.1""","""c.32.1.1""","""c.32.1.1""","""c.32.1""","""c.32""","""c""",false,false,false,false,3471660,23,77,77,3059
"""d1z45a1 b.30.5.4 (A:358-699) G…","""a5ae62efa8a9c39af7629615f7c865…","""d3ryca1 c.32.1.1 (A:1-245) aut…","""e070d6043f22b2f99edae145e20eab…",0.006192,0.00885,0.003656,2.0,1.1368e-11,0.002619,2.364658,0.373768,0.053757,"""d1z45a1""","""b.30.5.4""","""b.30.5.4""","""b.30.5.4""","""b.30.5""","""b.30""","""b""","""d3ryca1""","""c.32.1.1""","""c.32.1.1""","""c.32.1.1""","""c.32.1""","""c.32""","""c""",false,false,false,false,3471664,3,27,33,3059
"""d2h2ba1 b.36.1.1 (A:18-110) au…","""b1307accde1388146147d61e29c440…","""d3ryca1 c.32.1.1 (A:1-245) aut…","""e070d6043f22b2f99edae145e20eab…",0.027027,0.027027,0.006711,2.0,7.8945e-12,0.001818,14.862836,1.172102,0.238807,"""d2h2ba1""","""b.36.1.1""","""b.36.1.1""","""b.36.1.1""","""b.36.1""","""b.36""","""b""","""d3ryca1""","""c.32.1.1""","""c.32.1.1""","""c.32.1.1""","""c.32.1""","""c.32""","""c""",false,false,false,false,3471665,40,88,88,3059
"""d3ja861 b.40.4.11 (6:103-463) …","""54aab68f2860c895f7e933898b9d6b…","""d3ryca1 c.32.1.1 (A:1-245) aut…","""e070d6043f22b2f99edae145e20eab…",0.011696,0.017699,0.007092,4.0,2.4315e-11,0.005601,2.08827,0.319787,0.10185,"""d3ja861""","""b.40.4.11""","""b.40.4.11""","""b.40.4.11""","""b.40.4""","""b.40""","""b""","""d3ryca1""","""c.32.1.1""","""c.32.1.1""","""c.32.1.1""","""c.32.1""","""c.32""","""c""",false,false,false,false,3471667,7,130,197,3059



## Compute sensitivity up to the first false positive

In [24]:
# Similarity score used to rank each query's matches.
sourmash_col = "containment"

# Sort keys: the score first, then the same_<level> flags from the broadest
# level (class) down to the most specific one (family).
cols = [sourmash_col, *reversed(same_scop_cols)]
cols

['containment', 'same_class', 'same_fold', 'same_superfamily', 'same_family']

In [25]:
# Order hits by descending score. `descending=True` applies to every key in
# `cols`, so among equal scores the rows whose same_<level> flags are True come first.
multisearch_with_n_groups = multisearch_with_n_groups.sort(by=cols, descending=True)

In [26]:
# groupby = ["query_scop_id"] + query_scop_cols

# for query, df in multisearch.group_by(groupby):
#     print("query:", query, df.shape)
#     # display(df.head())

#     # Sort the values so the biggest one is first
#     # df = df.sort_values(sourmash_col, ascending=False)

#     # display(df)
#     break

In [27]:
# Keep only the hits of one example query (d1k91a1) to prototype the metric on.
df = multisearch_with_n_groups.filter(pl.col("query_scop_id") == "d1k91a1")
df

query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes,prob_overlap,prob_overlap_adjusted,containment_adjusted,containment_adjusted_log10,tf_idf_score,query_scop_id,query_scop_lineage,query_scop_lineage_fixed,query_family,query_superfamily,query_fold,query_class,match_scop_id,match_scop_lineage,match_scop_lineage_fixed,match_family,match_superfamily,match_fold,match_class,same_family,same_superfamily,same_fold,same_class,__index_level_0__,n_family,n_superfamily,n_fold,n_class
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,cat,cat,cat,cat,str,str,str,cat,cat,cat,cat,bool,bool,bool,bool,i64,i64,i64,i64,i64
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d1jhna3 b.104.1.1 (A:270-411) …","""a7fa935828069ca043b43ee240f20b…",0.235294,0.235294,0.029412,4.0,1.7842e-11,0.00411,57.254029,1.757806,2.071611,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d1jhna3""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""",true,true,true,true,1922048,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d1roca1 b.1.22.1 (A:2-154) Ant…","""e68cc86e6f9293b8042b6b2b2a6695…",0.235294,0.235294,0.027211,4.0,2.0999e-11,0.004837,48.644401,1.687033,2.051819,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d1roca1""","""b.1.22.1""","""b.1.22.1""","""b.1.22.1""","""b.1.22""","""b.1""","""b""",false,false,false,true,1143499,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d6tt2a_ b.55.1.1 (A:) Bruton's…","""221d6dfe734100c788a9fe3e2df19b…",0.176471,0.176471,0.018405,3.0,8.9997e-12,0.002073,85.127701,1.930071,1.580716,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d6tt2a_""","""b.55.1.1""","""b.55.1.1""","""b.55.1.1""","""b.55.1""","""b.55""","""b""",false,false,false,true,824939,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d5dgja1 b.47.1.4 (A:1-173) aut…","""630090fb88f1876c8e82988a519a23…",0.176471,0.176471,0.017857,3.0,5.9998e-12,0.001382,127.691552,2.106162,1.623891,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d5dgja1""","""b.47.1.4""","""b.47.1.4""","""b.47.1.4""","""b.47.1""","""b.47""","""b""",false,false,false,true,696669,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d2y24a1 b.71.1.2 (A:31-43,A:32…","""7e9378a1133aaff014e4aca4a3815e…",0.176471,0.176471,0.029412,3.0,1.7052e-11,0.003928,44.928509,1.652522,1.532063,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d2y24a1""","""b.71.1.2""","""b.71.1.2""","""b.71.1.2""","""b.71.1""","""b.71""","""b""",false,false,false,true,1310429,2,2,2,3059
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d6df4a_ a.29.2.0 (A:) automate…","""288d2db345dd1fef6e6db2f0d20c30…",0.117647,0.117647,0.015385,2.0,9.6313e-12,0.002218,53.030371,1.724525,1.030443,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d6df4a_""","""a.29.2.0""","""a.29.2.0""","""a.29.2.0""","""a.29.2""","""a.29""","""a""",false,false,false,false,100906,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d5d9ab_ e.8.1.9 (B:) Polymeras…","""549c1a9e28a56345597eb7593f8dde…",0.117647,0.117647,0.00267,2.0,3.1578e-12,0.000727,161.742632,2.208825,1.091009,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d5d9ab_""","""e.8.1.9""","""e.8.1.9""","""e.8.1.9""","""e.8.1""","""e.8""","""e""",false,false,false,false,2029618,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d6pwsa_ d.169.1.1 (A:) automat…","""77a10b0970756a06af0f62d94a9932…",0.117647,0.117647,0.016,2.0,5.3683e-12,0.001237,95.142725,1.978376,1.063362,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d6pwsa_""","""d.169.1.1""","""d.169.1.1""","""d.169.1.1""","""d.169.1""","""d.169""","""d""",false,false,false,false,2207150,2,2,2,3059
"""d1k91a1 b.104.1.1 (A:221-256) …","""b285a9116d3e80bdf28d50c1026309…","""d2b0ta_ c.77.1.2 (A:) automate…","""7bd3c8a6815d5b63dc8dbe11a102af…",0.117647,0.117647,0.002736,2.0,1.1368e-11,0.002619,44.928509,1.652522,1.021376,"""d1k91a1""","""b.104.1.1""","""b.104.1.1""","""b.104.1.1""","""b.104.1""","""b.104""","""b""","""d2b0ta_""","""c.77.1.2""","""c.77.1.2""","""c.77.1.2""","""c.77.1""","""c.77""","""c""",false,false,false,false,2623792,2,2,2,3059


In [28]:
# First ten hits of the example query, restricted to the same_<level> flags.
df.head(10).select(same_scop_cols)

same_family,same_superfamily,same_fold,same_class
bool,bool,bool,bool
True,True,True,True
False,False,False,True
False,False,False,True
False,False,False,True
False,False,False,True
False,False,False,False
False,False,False,False
False,False,False,False
False,False,False,False
False,False,False,False


In [29]:
# Same ten rows as tab-separated text: `write_csv` called without a target
# returns the serialized table as a string, which is then printed.
print(df.select(pl.col(same_scop_cols)).head(10).write_csv(separator="\t"))

same_family	same_superfamily	same_fold	same_class
true	true	true	true
false	false	false	true
false	false	false	true
false	false	false	true
false	false	false	true
false	false	false	false
false	false	false	false
false	false	false	false
false	false	false	false
false	false	false	false



In [166]:
import polars as pl


def sum_until_first_false(df):
    """Count the leading run of ``True`` values in every column of ``df``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame whose columns are all boolean (or 0/1). Row order matters: the
        count stops at the first ``False`` in each column.

    Returns
    -------
    pl.DataFrame
        A single-row frame with the same column names, each holding the number
        of rows before the first ``False`` as ``UInt32``. A column that starts
        with ``False`` (or an empty frame) yields 0; a column that is entirely
        ``True`` yields the number of rows.
    """
    return df.select(
        [
            pl.col(col)
            .cast(pl.UInt32)
            # The running minimum stays 1 while only True has been seen and
            # drops to 0 for good at the first False, so summing it gives the
            # length of the leading True run.
            .cum_min()
            .sum()
            .alias(col)
            for col in df.columns
        ]
    )


# Try the helper on the same_<level> flags of the example query held in `df`.
result = sum_until_first_false(df.select(same_scop_cols))
result

same_family,same_superfamily,same_fold,same_class
u32,u32,u32,u32
1,1,1,5


In [31]:
# Shorter alternative: `arg_min` returns the row index of the first minimum,
# i.e. of the first False, which equals the number of leading True rows.
# NOTE(review): if a column is all True, the minimum is True itself and
# `arg_min` is 0 rather than the column length -- confirm this is acceptable.
first_zeros = df.select(pl.col(same_scop_cols).arg_min())
first_zeros

same_family,same_superfamily,same_fold,same_class
u32,u32,u32,u32
1,1,1,5


In [32]:
# Score next to the same_<level> flags for the ten best-ranked hits.
df.select([sourmash_col, *same_scop_cols]).head(10)

containment,same_family,same_superfamily,same_fold,same_class
f64,bool,bool,bool,bool
0.235294,True,True,True,True
0.235294,False,False,False,True
0.176471,False,False,False,True
0.176471,False,False,False,True
0.176471,False,False,False,True
0.176471,False,False,False,False
0.117647,False,False,False,False
0.117647,False,False,False,False
0.117647,False,False,False,False
0.117647,False,False,False,False


### Write function for sensitivity up to the first false positive (FP)

In [33]:
# n_<level> column names, in the same family -> class order as the same_<level> columns.
n_scop_cols = ["n_family", "n_superfamily", "n_fold", "n_class"]
n_scop_cols

['n_family', 'n_superfamily', 'n_fold', 'n_class']

In [66]:
def sensitivity_until_first_false_positive(same_scop_cols, n_scop_cols):
    """Build one "sensitivity up to the first false positive" expression per level.

    The expressions are meant to be evaluated per query, e.g. inside
    ``group_by("query_scop_id").agg(...)``, on rows that are already ordered
    from best to worst score. For each level the value is::

        (# of leading rows where same_<level> is True) / (n_<level> - 1)

    The numerator is the length of the leading ``True`` run. Unlike
    ``arg_min()``, this is also correct when the group contains no ``False``
    at all (every hit is a true positive): ``arg_min()`` would return 0 there.

    Parameters
    ----------
    same_scop_cols : iterable of str
        Names of the boolean ``same_<level>`` columns.
    n_scop_cols : iterable of str
        Names of the matching ``n_<level>`` columns, in the same order. The
        denominator subtracts 1 from this value (presumably to exclude the
        query itself -- confirm against how ``n_<level>`` was computed).

    Returns
    -------
    list of pl.Expr
        One expression per level, aliased ``sensitivity_<level>``. Null and
        NaN results (e.g. 0 / 0 when ``n_<level>`` is 1) are reported as 0.

    Raises
    ------
    ValueError
        If the two lists of column names differ in length.
    """
    same_scop_cols = list(same_scop_cols)
    n_scop_cols = list(n_scop_cols)
    if len(same_scop_cols) != len(n_scop_cols):
        raise ValueError(
            "same_scop_cols and n_scop_cols must have the same length, got "
            f"{len(same_scop_cols)} and {len(n_scop_cols)}"
        )

    expressions = []
    for same_col, n_col in zip(same_scop_cols, n_scop_cols):
        # Running minimum is 1 until the first False and 0 afterwards, so its
        # sum is the number of true positives ranked above the first false one.
        n_leading_true = pl.col(same_col).cast(pl.UInt32).cum_min().sum()
        n_other_members = pl.col(n_col).first() - 1
        expressions.append(
            (n_leading_true / n_other_members)
            .fill_nan(0)
            .fill_null(0)
            .alias(same_col.replace("same", "sensitivity"))
        )
    return expressions


# Rank each query's hits from best to worst score before aggregating. Ties in
# the score are broken explicitly by the same_<level> flags (True first, broadest
# level first): sorting on the score alone does not guarantee any particular
# order among equal scores, which would make the position of the first false
# positive -- and hence the sensitivity -- ill-defined for tied hits.
sensitivity = (
    multisearch_with_n_groups.sort(
        by=[sourmash_col, *reversed(same_scop_cols)], descending=True
    )
    .group_by("query_scop_id")
    .agg(sensitivity_until_first_false_positive(same_scop_cols, n_scop_cols))
    # 0 / 0 (a level with no other members and no hits) yields NaN; report it as 0.
).fill_nan(0)
sensitivity

query_scop_id,sensitivity_family,sensitivity_superfamily,sensitivity_fold,sensitivity_class
str,f64,f64,f64,f64
"""d1yb1a_""",0.0,0.0,0.0,0.000672
"""d2qzsa_""",0.0,0.0,0.0,0.0
"""d3mjja1""",0.0,0.0,0.0,0.0
"""d2ihya1""",0.0,0.0,0.0,0.001121
"""d1k8ma1""",0.0,0.0,0.0,0.0
…,…,…,…,…
"""d6xp1a2""",0.0,0.0,0.0,0.0
"""d1jz8a5""",0.0,0.0,0.00188,0.000224
"""d1th7a1""",0.0,0.0,0.0,0.0
"""d1k66a1""",0.0,0.0,0.0,0.000224


In [68]:
# Queries with family sensitivity above 0.9 and superfamily sensitivity above 0.5,
# listed in query id order.
sensitivity.filter(
    pl.col("sensitivity_family") > 0.9,
    pl.col("sensitivity_superfamily") > 0.5,
).sort("query_scop_id")

query_scop_id,sensitivity_family,sensitivity_superfamily,sensitivity_fold,sensitivity_class
str,f64,f64,f64,f64
"""d1k91a1""",1.0,1.0,1.0,0.001635
"""d1klxa_""",1.0,1.0,0.005618,0.000378
"""d1ouva1""",1.0,1.0,0.005618,0.000378
"""d2ekna_""",1.0,1.0,0.002132,0.000548
"""d2i8da1""",1.0,1.0,0.05,0.000274
"""d3jqka_""",1.0,1.0,0.002132,0.000274
"""d7dkzl_""",1.0,1.0,1.0,0.012085
"""d7lx0l_""",1.0,1.0,1.0,0.003021


## Count number of same scop per column

In [15]:
# For every query (id plus lineage columns), count its hits per lineage level:
# summing a boolean same_<level> column counts the rows where it is True.
same_scop_counts = multisearch.group_by(["query_scop_id"] + query_scop_cols).agg(
    pl.col(same_scop_cols).sum()
)

# NOTE: everything below is disabled legacy (pandas-style) code kept for
# reference; no self-match correction is currently applied to the counts.
# NOTE(review): the example query shown earlier has no self-match among its
# hits, so the subtraction may no longer be needed -- confirm.
# Subtract 1 for self-matches
# Doing this after the fact makes sure that we have ALL 15,177 samples for
# each analyses, otherwise they'd be unobserved
# same_scop_counts -= 1

# same_scop_bool = same_scop_counts > 0
# same_scop_bool.columns = same_scop_bool.columns + "_bool"
# same_scop_counts = pd.concat([same_scop_counts, same_scop_bool], axis=1)
same_scop_counts.head()

query_scop_id,query_family,query_superfamily,query_fold,query_class,same_family,same_superfamily,same_fold,same_class
str,cat,cat,cat,cat,u64,u64,u64,u64
"""d2e9ja1""","""b.1.18.10""","""b.1.18""","""b.1""","""b""",0,0,2,7
"""d2qxza_""","""b.80.1.0""","""b.80.1""","""b.80""","""b""",0,2,2,33
"""d6pzda1""","""b.68.1.1""","""b.68.1""","""b.68""","""b""",0,0,0,29
"""d2cw9a1""","""d.17.4.13""","""d.17.4""","""d.17""","""d""",0,0,0,30
"""d1vj7a1""","""a.211.1.1""","""a.211.1""","""a.211""","""a""",0,0,0,10
