## Protein sequences and GFF

## protein -> features

In [None]:
# from pici_predictor.utilities import feature_generation

# feature_df = feature_generation(fasta_dir, out_dir)

## features -> functional categories

In [3]:
import pandas as pd
import joblib
import numpy as np
from pici_predictor.phrog_function import (
    function_names_formatted,
    function_name_formatted_to_num,
    thresholds_0_5,
    mu_neg_dict,
    sigma_neg_dict,
)

# Every formatted function name except the "no_hit" placeholder; the functions
# below iterate over this list.
function_list = [fn for fn in function_names_formatted if fn != "no_hit"]


def predict_function(feature_df, out_dir, model_dir="../models/best_configs"):
    """Score every row of ``feature_df`` with each per-function predictor.

    For each name in the module-level ``function_list`` a bundle is loaded from
    ``{model_dir}/{name}_predictor.joblib``. The bundle must provide a
    ``"model"`` entry exposing ``predict_proba`` and a ``"feature_cols"`` entry
    listing the columns of ``feature_df`` to feed it.

    Args:
        feature_df: DataFrame with an ``"id"`` column plus the feature columns
            referenced by each bundle's ``feature_cols``. It is not modified.
        out_dir: Destination passed to ``DataFrame.to_csv``; despite the name
            this is the path of the output CSV file, not a directory.
        model_dir: Directory holding the ``*_predictor.joblib`` bundles.

    Returns:
        DataFrame with the ``"id"`` column and one column per function name
        holding column 1 of that model's ``predict_proba`` output (presumably
        the positive-class probability; confirm against how the models were
        trained).
    """
    predicted_probs = pd.DataFrame({"id": feature_df["id"]})

    for function_name in function_list:
        # NOTE: joblib.load unpickles arbitrary Python objects; only load
        # bundles from a trusted location.
        model_bundle = joblib.load(f"{model_dir}/{function_name}_predictor.joblib")
        model = model_bundle["model"]
        feature_cols = model_bundle["feature_cols"]

        # Use a per-model view of the features. Rebinding ``feature_df`` here
        # would strip the DIPEP/TRIPEP columns for every model loaded after
        # "unknown_function" as well.
        model_features = feature_df
        if function_name == "unknown_function":
            peptide_cols = [
                col
                for col in feature_df.columns
                if "DIPEP" in col or "TRIPEP" in col
            ]
            model_features = feature_df.drop(columns=peptide_cols)

        probs = model.predict_proba(model_features[feature_cols].values)[:, 1]
        predicted_probs[function_name] = probs

    predicted_probs.to_csv(out_dir, index=False)
    return predicted_probs

In [None]:
# Output location for the per-function probabilities (a CSV file path).
out_dir = "../results/demonstration/GCF_000175755.1/predicted_function_prob.csv"
# Pre-computed feature table for the demonstration genome.
feature_df = pd.read_csv("../results/demonstration/GCF_000175755.1/features.csv")
predicted_probs = predict_function(feature_df=feature_df, out_dir=out_dir)

In [16]:
def assign_functions(
    predicted_probs,
    threshold_dict,
    mu_neg_dict,
    sigma_neg_dict,
    function_to_num,
    out_dir,
):
    """Pick one function label per row from per-function probabilities.

    For every row, each function in the module-level ``function_list`` whose
    probability is at or above its threshold is a candidate. With no
    candidates the row is labelled ``"no_hit"``. Otherwise the candidate with
    the largest z-score ``(prob - mu_neg) / sigma_neg`` wins; ties go to the
    candidate that comes first in ``function_list``.

    Args:
        predicted_probs: DataFrame with an ``"id"`` column and one probability
            column per name in ``function_list``. It is not modified.
        threshold_dict: Maps function name to its probability cut-off.
        mu_neg_dict: Maps function name to the mean used in the z-score.
        sigma_neg_dict: Maps function name to the standard deviation used in
            the z-score.
        function_to_num: Mapping applied to the chosen label to produce
            ``"function_num"``; labels missing from it become NaN.
        out_dir: Destination passed to ``DataFrame.to_csv``; despite the name
            this is the path of the output CSV file.

    Returns:
        A new DataFrame with columns ``"id"``, ``"function"`` and
        ``"function_num"``; the same table is written to ``out_dir``.
    """

    def assign_function_row(row):
        above = []
        z_scores = []
        for fn in function_list:
            prob = row[fn]
            if prob >= threshold_dict[fn]:
                above.append(fn)
                z_scores.append((prob - mu_neg_dict[fn]) / sigma_neg_dict[fn])
        if not above:
            return "no_hit"
        # argmax also covers the single-candidate case (index 0).
        return above[int(np.argmax(z_scores))]

    # Build the labels row by row instead of DataFrame.apply(axis=1): this
    # also works for an empty frame and leaves the caller's DataFrame untouched.
    labels = [assign_function_row(row) for _, row in predicted_probs.iterrows()]
    assigned = pd.DataFrame({"id": predicted_probs["id"], "function": labels})
    assigned["function_num"] = assigned["function"].map(function_to_num)
    assigned.to_csv(out_dir, index=False)
    return assigned

In [None]:
# Lookup tables imported from pici_predictor.phrog_function.
function_to_num = function_name_formatted_to_num
threshold_dict = thresholds_0_5

# Output CSV for the thresholded function calls.
out_dir = "../results/demonstration/GCF_000175755.1/predicted_function_thresh05.csv"

# Reload the probabilities written by the previous step.
predicted_probs = pd.read_csv(
    "../results/demonstration/GCF_000175755.1/predicted_function_prob.csv"
)

predicted_functions = assign_functions(
    predicted_probs=predicted_probs,
    threshold_dict=threshold_dict,
    mu_neg_dict=mu_neg_dict,
    sigma_neg_dict=sigma_neg_dict,
    function_to_num=function_to_num,
    out_dir=out_dir,
)

## windowing

In [69]:
import pandas as pd

# Load predicted functions and GFF
# GFF table, with its protein identifier column renamed to match the predictions.
gff_df = pd.read_csv(
    "../dataset/demonstration_samples/GCF_000175755.1/gff_df.csv"
).rename(columns={"protein_id": "id"})

# Predicted functions; keep only the part of each id before the first space.
predicted_df = pd.read_csv(
    "../results/demonstration/GCF_000175755.1/predicted_function_thresh05.csv"
)
predicted_df["id"] = predicted_df["id"].str.split(" ").str[0]

# Inner join keeps only proteins present in both tables, then orders them by
# contig and start coordinate.
merged = (
    gff_df.merge(predicted_df[["id", "function_num"]], on="id", how="inner")
    .sort_values(["contig", "start"])
    .reset_index(drop=True)
)

# NOTE(review): this single vector concatenates all contigs in sorted order, so
# anything that slides over it will also span contig boundaries.
function_vector = merged["function_num"].values

In [72]:
import numpy as np


def window_vector(vec, window_size=30, step_size=1):
    """Cut ``vec`` into overlapping fixed-length windows.

    Args:
        vec: Sequence or array to slide over (along its first axis).
        window_size: Number of consecutive elements per window; must be >= 1.
        step_size: Offset between the starts of successive windows; must be
            >= 1.

    Returns:
        A ``(windows, indices)`` tuple. ``windows`` has shape
        ``(n_windows, window_size)`` and ``indices`` holds the start position
        of each window in ``vec``. When ``vec`` is shorter than
        ``window_size`` the result is an empty ``(0, window_size)`` array, so
        callers that expect 2-D input keep working.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")

    values = np.asarray(vec)
    starts = np.arange(0, len(values) - window_size + 1, step_size)
    # Broadcasting start positions against in-window offsets yields an
    # (n_windows, window_size) index grid; indexing with it copies every
    # window in one step and keeps the 2-D shape even when there are none.
    windows = values[starts[:, None] + np.arange(window_size)]
    return windows, starts


# The same function vector read in the opposite direction.
reverse_vector = function_vector[::-1]

# Windows over the vector in its original order.
forward_windows, forward_indices = window_vector(function_vector, 30, 1)

# Windows over the reversed vector; reverse_indices are positions in
# reverse_vector, not in function_vector.
reverse_windows, reverse_indices = window_vector(reverse_vector, 30, 1)

## pici classification

In [76]:
import numpy as np
import joblib
import pandas as pd
import xgboost as xgb


def predict_pici_segments(windows, model_path, threshold=0.4, class_names=None):
    """Classify each window with a saved XGBoost classifier.

    Args:
        windows: Input passed directly to the classifier's ``predict_proba``.
        model_path: Path handed to ``XGBClassifier.load_model``.
        threshold: Minimum top-class probability needed to keep that class;
            windows below it are assigned class 0.
        class_names: Optional sequence indexed by class number. When truthy, a
            ``predicted_class_name`` column is added.

    Returns:
        DataFrame with one ``prob_class_<i>`` column per class, plus
        ``max_probability``, ``predicted_class`` and, if requested,
        ``predicted_class_name``.
    """
    classifier = xgb.XGBClassifier()
    classifier.load_model(model_path)

    class_probs = classifier.predict_proba(windows)
    best_class = np.argmax(class_probs, axis=1)
    best_prob = np.max(class_probs, axis=1)
    # Calls whose top probability is below the threshold fall back to class 0.
    final_class = np.where(best_prob >= threshold, best_class, 0)

    column_labels = [f"prob_class_{k}" for k in range(class_probs.shape[1])]
    result = pd.DataFrame(class_probs, columns=column_labels)
    result["max_probability"] = best_prob
    result["predicted_class"] = final_class
    if class_names:
        result["predicted_class_name"] = [class_names[k] for k in final_class]
    return result

In [77]:
model_path = "../models/pici_classifier_model.json"
# Label for each class index produced by the classifier.
class_names = ["none", "PICI", "CFPICI", "P4"]

# Score the forward windows first, then the reverse windows, with identical settings.
forward_results, reverse_results = (
    predict_pici_segments(
        window_set, model_path, threshold=0.4, class_names=class_names
    )
    for window_set in (forward_windows, reverse_windows)
)

In [None]:
# Write the forward and reverse window classifications to their own CSV files.
for _frame, _csv_path in (
    (
        forward_results,
        "../results/demonstration/GCF_000175755.1/predicted_pici_forward.csv",
    ),
    (
        reverse_results,
        "../results/demonstration/GCF_000175755.1/predicted_pici_reverse.csv",
    ),
):
    _frame.to_csv(_csv_path, index=False)

In [82]:
# Show only the forward windows that were labelled "PICI".
forward_results.loc[forward_results["predicted_class_name"] == "PICI"]

Unnamed: 0,prob_class_0,prob_class_1,prob_class_2,prob_class_3,max_probability,predicted_class,predicted_class_name
2585,0.087787,0.89375,0.017478,0.000985,0.89375,1,PICI
3936,0.442614,0.550461,0.001739,0.005187,0.550461,1,PICI
4020,0.22131,0.778677,4e-06,9e-06,0.778677,1,PICI
4071,0.38075,0.611834,0.001944,0.005471,0.611834,1,PICI
4079,0.216074,0.783903,1.2e-05,1.2e-05,0.783903,1,PICI
4082,0.477615,0.522329,3.8e-05,1.8e-05,0.522329,1,PICI
4084,0.429963,0.569649,0.000229,0.000159,0.569649,1,PICI
4085,0.214228,0.785734,3.1e-05,7e-06,0.785734,1,PICI
4086,0.170062,0.828025,0.000769,0.001144,0.828025,1,PICI
4087,0.247925,0.751792,0.000178,0.000106,0.751792,1,PICI
