In [2]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

## Imports

In [3]:
import itertools
import warnings
from collections import Counter

import matplotlib as mpl
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from tqdm import tqdm

# Let pandas render up to 100 columns before truncating wide frames.
pd.set_option("display.max_columns", 100)

# Threshold for matplotlib's "too many open figures" warning, set to 0.
mpl.rcParams["figure.max_open_warning"] = 0

In [4]:
# Pin the polars release this notebook expects; report the installed version on
# mismatch so the failure is actionable instead of a bare AssertionError.
assert pl.__version__ == "1.9.0", (
    f"This notebook expects polars 1.9.0 but found {pl.__version__}"
)

# Read in data

## Read in unfiltered data

The unfiltered data uses the fixed SCOP lookups provided by the FoldSeek authors.

In [5]:
# S3 location of the cleaned multisearch results table
# (presumably the hp alphabet at k=20, going by the file name — TODO confirm).
pq = "s3://seanome-kmerseek/scope-benchmark/analysis-outputs/2024-10-09__hp_k20-60/00_cleaned_multisearch_results/scope40.multisearch.hp.k20.pq"
# Load the full table into a pandas DataFrame.
multisearch_unfiltered = pd.read_parquet(pq)

### Set SCOP lineage column names

In [6]:
# SCOP lineage levels, listed from most specific (family) to most general (class).
lineage_cols = ["family", "superfamily", "fold", "class"]

# Column names carrying each lineage level for the query and for the match.
query_scop_cols = ["query_" + level for level in lineage_cols]
match_scop_cols = ["match_" + level for level in lineage_cols]

# Column names of the form same_<level>, one per lineage level.
same_scop_cols = ["same_" + level for level in lineage_cols]

### Make query metadata

In [7]:
# Build a lineage table indexed by query SCOP id. At this point there is one row
# per row of the search table, so the same query id appears many times.
query_metadata = pd.DataFrame(
    multisearch_unfiltered[query_scop_cols].values,
    index=multisearch_unfiltered["query_scop_id"].values,
    columns=query_scop_cols,
)
print(query_metadata.shape)

# Keep the first row seen for each query id, THEN sort. Deduplicating first means
# only the unique ids are sorted rather than every row of the search table, and
# "first" no longer depends on how the sort happens to order tied index values.
query_metadata = query_metadata.loc[~query_metadata.index.duplicated()].sort_index()
print(query_metadata.shape)
query_metadata.head()

(3471675, 4)
(15177, 4)


Unnamed: 0,query_family,query_superfamily,query_fold,query_class
d12asa_,d.104.1.1,d.104.1,d.104,d
d16vpa_,d.180.1.1,d.180.1,d.180,d
d1914a1,d.49.1.1,d.49.1,d.49,d
d1914a2,d.49.1.1,d.49.1,d.49,d
d1a04a1,a.4.6.2,a.4.6,a.4,a


### Count number of queries per SCOP lineage group

In [8]:
def count_scop_lineage(df, col):
    """Tally how many times each value occurs in ``df[col]``.

    Parameters
    ----------
    df : mapping of column name to an iterable of values (e.g. a DataFrame)
    col : key of the column to count

    Returns
    -------
    collections.Counter
        Maps each distinct value to its number of occurrences, with keys in
        order of first appearance.
    """
    tally = Counter()
    tally.update(df[col])
    return tally


# For every lineage level, a Series named n_<level> mapping each lineage value
# to the number of rows of query_metadata that carry it.
n_groups_per_scop_lineage = dict(
    (
        level,
        pd.Series(
            count_scop_lineage(query_metadata, f"query_{level}"),
            name=f"n_{level}",
        ),
    )
    for level in lineage_cols
)
n_groups_per_scop_lineage.keys()

dict_keys(['family', 'superfamily', 'fold', 'class'])

In [9]:
# Spot-check one level: number of query_metadata rows in each SCOP class.
n_groups_per_scop_lineage["class"]

d    3653
a    2644
c    4463
b    3059
f     332
g     722
e     304
Name: n_class, dtype: int64

In [11]:
# Work on a copy so query_metadata itself is left untouched.
query_metadata_with_n_groups = query_metadata.copy()

# Attach one n_<level> column per lineage level: each count Series is indexed by
# lineage value, so joining on the matching query_<level> column looks up the
# size of the group every query belongs to.
for lineage, series in n_groups_per_scop_lineage.items():
    query_metadata_with_n_groups = query_metadata_with_n_groups.join(
        series, on=f"query_{lineage}"
    )

query_metadata_with_n_groups.index.name = "query_scop_id"
query_metadata_with_n_groups.head()

Unnamed: 0_level_0,query_family,query_superfamily,query_fold,query_class,n_family,n_superfamily,n_fold,n_class
query_scop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
d12asa_,d.104.1.1,d.104.1,d.104,d,15,29,29,3653
d16vpa_,d.180.1.1,d.180.1,d.180,d,1,1,1,3653
d1914a1,d.49.1.1,d.49.1,d.49,d,2,3,3,3653
d1914a2,d.49.1.1,d.49.1,d.49,d,2,3,3,3653
d1a04a1,a.4.6.2,a.4.6,a.4,a,5,30,425,2644


In [12]:
# Save the per-query lineage and group-size table as CSV on S3.
query_metadata_with_n_groups.to_csv(
    "s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.csv"
)

### Add categories to make polars happy

In [14]:
# Turn every lineage column into an ordered categorical whose categories are the
# column's distinct values in sorted order.
for col in query_scop_cols:
    categories = sorted(set(query_metadata_with_n_groups[col]))
    query_metadata_with_n_groups[col] = pd.Categorical(
        query_metadata_with_n_groups[col],
        categories=categories,
        ordered=True,
    )
query_metadata_with_n_groups.dtypes

query_family         category
query_superfamily    category
query_fold           category
query_class          category
n_family                int64
n_superfamily           int64
n_fold                  int64
n_class                 int64
dtype: object

In [16]:
# Save the same table as Parquet on S3, now with categorical lineage columns.
query_metadata_with_n_groups.to_parquet(
    "s3://seanome-kmerseek/scope-benchmark/reference_files/scop.e.2.08.query_metadata.pq"
)