Each row of the features DataFrame holds the features of one protein (1711-dim or 211-dim) together with an "id" column.  
10 binary predictors, each telling whether a protein serves one particular function or not: [
    "lysis",
    "tail",
    "connector",
    "dna_rna_and_nucleotide_metabolism",
    "head_and_packaging",
    "other",
    "transcription_regulation",
    "moron_auxiliary_metabolic_gene_and_host_takeover",
    "unknown_function",
    "integration_and_excision",
]

predictor paths: "../models/best_configs/{function_name}_predictor.joblib" (one file per function name listed above)

best thresholds (function name -> probability cutoff):
{
    "lysis": 0.06,
    "tail": 0.03,
    "connector": 0.03,
    "dna_rna_and_nucleotide_metabolism": 0.04,
    "head_and_packaging": 0.01,
    "other": 0.01,
    "transcription_regulation": 0.01,
    "moron_auxiliary_metabolic_gene_and_host_takeover": 0.01,
    "unknown_function": 0.29,
    "integration_and_excision": 0.23,
}


Use the 10 predictors to predict on every protein and output the result for each protein. Note that "unknown_function" uses "features_211dim", while the other 9 predictors use "features", which is 1711-dim.

output: dataframe, where columns are: id, hit_number, hit_function;  
hit_number is the number of predictors that give a positive answer (e.g. 0 or 1), 
hit_function is the comma-separated names of the predictors that give a positive answer (e.g. "connector")

goal: write functions in script as much as possible so that it is easier to implement

In [1]:
import pandas as pd
import numpy as np
from joblib import load
from typing import List, Dict, Tuple, Any
from sklearn.preprocessing import StandardScaler

In [2]:
# Probability cutoff per function predictor: a protein counts as a hit when
# its positive-class probability is strictly greater than this value.
BEST_THRESHOLDS = {
    "lysis": 0.06,
    "tail": 0.03,
    "connector": 0.03,
    "dna_rna_and_nucleotide_metabolism": 0.04,
    "head_and_packaging": 0.01,
    "other": 0.01,
    "transcription_regulation": 0.01,
    "moron_auxiliary_metabolic_gene_and_host_takeover": 0.01,
    "unknown_function": 0.29,
    "integration_and_excision": 0.23,
}

# Predictor names in evaluation order. Dicts preserve insertion order, so
# this is exactly the key order written above.
FUNCTION_NAMES = list(BEST_THRESHOLDS)

In [3]:
def load_predictor(function_name: str) -> Tuple[Any, StandardScaler]:
    """Read the saved predictor bundle for one function from disk.

    Args:
        function_name: Name used to build the file path
            "../models/best_configs/{function_name}_predictor.joblib".

    Returns:
        The (model, scaler) pair stored under the "model" and "scaler"
        keys of the loaded bundle.

    Raises:
        KeyError: If the bundle lacks the "model" or "scaler" key.
    """
    # NOTE: joblib.load unpickles the file, so only load trusted bundles.
    bundle = load(f"../models/best_configs/{function_name}_predictor.joblib")
    return bundle["model"], bundle["scaler"]


def predict_single_protein(
    protein_features: np.ndarray,
    function_name: str,
    model: Any,
    scaler: StandardScaler,
    threshold: float,
) -> bool:
    """Decide whether one protein is a hit for one function predictor.

    Args:
        protein_features: 1-D feature vector of a single protein. It is cast
            to float, so object-dtype vectors taken from a mixed-type
            DataFrame row are accepted.
        function_name: Name of the function being predicted. It does not
            influence the decision and is kept for interface compatibility.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the positive-class probability.
        scaler: Fitted scaler applied to the features before prediction, or
            ``None`` to feed the features to the model unscaled.
        threshold: Probability cutoff for a positive call.

    Returns:
        ``True`` when the positive-class probability is strictly greater
        than ``threshold``, otherwise ``False`` (a plain Python ``bool``).

    Raises:
        ValueError: If ``protein_features`` cannot be converted to floats.
    """
    # Cast up front so the scaled and unscaled paths both hand the model a
    # numeric (1, n_features) array rather than an object array.
    sample = np.asarray(protein_features, dtype=float).reshape(1, -1)

    if scaler is not None:
        sample = scaler.transform(sample)

    positive_probability = model.predict_proba(sample)[0, 1]

    # The comparison yields numpy.bool_; convert so the result matches the
    # annotated return type (identity checks, JSON serialisation).
    return bool(positive_probability > threshold)


def predict_all_functions(features_df: pd.DataFrame) -> pd.DataFrame:
    """Run every function predictor over every protein.

    Each predictor is loaded once and applied to the whole feature matrix in
    a single batch. The "unknown_function" predictor receives the reduced
    feature set (all feature columns whose name contains neither "DIPEP" nor
    "TRIPEP"); every other predictor receives the full feature set.

    Args:
        features_df: One row per protein, with an "id" column plus the
            numeric feature columns.

    Returns:
        DataFrame with one row per input row, in input order, and columns:
        ``id``; ``hit_number``, the count of predictors whose positive-class
        probability is strictly above their threshold; and ``hit_function``,
        the comma-joined names of those predictors in ``FUNCTION_NAMES``
        order ("" when none). An empty input yields an empty frame with the
        same three columns.

    Raises:
        KeyError: If ``features_df`` has rows but no "id" column.
    """
    output_columns = ["id", "hit_number", "hit_function"]

    # Nothing to score: return the documented schema without touching disk.
    if len(features_df) == 0:
        return pd.DataFrame(columns=output_columns)

    protein_ids = features_df["id"].to_numpy()

    # Build both feature matrices once. Rows stay aligned with protein_ids
    # by position, so duplicate ids cannot pick up another row's features.
    full_frame = features_df.drop(columns=["id"])
    reduced_frame = full_frame.drop(
        columns=[
            col for col in full_frame.columns if "DIPEP" in col or "TRIPEP" in col
        ]
    )
    full_matrix = full_frame.to_numpy(dtype=float)
    reduced_matrix = reduced_frame.to_numpy(dtype=float)

    # hit_matrix[i, j] is True when protein i is a hit for FUNCTION_NAMES[j].
    hit_matrix = np.zeros((len(features_df), len(FUNCTION_NAMES)), dtype=bool)

    for column_index, function_name in enumerate(FUNCTION_NAMES):
        # Load each predictor exactly once instead of once per protein.
        model, scaler = load_predictor(function_name)

        if function_name == "unknown_function":
            matrix = reduced_matrix
        else:
            matrix = full_matrix

        if scaler is not None:
            matrix = scaler.transform(matrix)

        positive_probabilities = model.predict_proba(matrix)[:, 1]
        hit_matrix[:, column_index] = (
            positive_probabilities > BEST_THRESHOLDS[function_name]
        )

    hit_function = [
        ",".join(name for name, is_hit in zip(FUNCTION_NAMES, row_hits) if is_hit)
        for row_hits in hit_matrix
    ]

    return pd.DataFrame(
        {
            "id": protein_ids,
            "hit_number": hit_matrix.sum(axis=1, dtype=np.int64),
            "hit_function": hit_function,
        },
        columns=output_columns,
    )

In [4]:
# Load the feature table for the demonstration sample. It is passed to
# predict_all_functions, which reads an "id" column plus the feature columns.
features = pd.read_parquet(
    "../dataset/demonstration_samples/Escherichia_coli_O157_H7_str_FRIK2000/features.pa"
)


In [5]:
# Score every protein in the loaded feature table with all predictors.
results_df = predict_all_functions(features)

In [6]:
# Display the per-protein result table (id, hit_number, hit_function).
results_df

Unnamed: 0,id,hit_number,hit_function
0,WP_000002304.1 MULTISPECIES: alpha-D-ribose 1-...,0,
1,WP_000002542.1 MULTISPECIES: signal peptidase ...,2,"tail,head_and_packaging"
2,WP_000002701.1 hypothetical protein [Escherich...,5,"tail,dna_rna_and_nucleotide_metabolism,transcr..."
3,WP_000002907.1 MULTISPECIES: diacylglycerol ki...,2,"lysis,moron_auxiliary_metabolic_gene_and_host_..."
4,WP_000002953.1 MULTISPECIES: ribonuclease E in...,1,other
...,...,...,...
5090,WP_255087022.1 A24 family peptidase [Escherich...,1,moron_auxiliary_metabolic_gene_and_host_takeover
5091,WP_272481064.1 hypothetical protein [Escherich...,3,"other,moron_auxiliary_metabolic_gene_and_host_..."
5092,WP_306256364.1 murein hydrolase activator NlpD...,1,tail
5093,WP_323670517.1 DUF4942 domain-containing prote...,3,"other,transcription_regulation,unknown_function"


In [None]:
# NOTE(review): same call as cell In [5]; this cell has no execution count,
# so it looks like an unexecuted duplicate. Confirm whether it can be removed.
results_df = predict_all_functions(features)

In [7]:
# Count how many proteins received each number of positive predictors.
results_df["hit_number"].value_counts()

hit_number
2    2132
1    1385
3    1151
4     256
0     120
5      46
6       5
Name: count, dtype: int64