In [1]:
import pandas as pd
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser
from feature_generation import seq2features, create_genomic_features
import warnings
from Bio import BiopythonDeprecationWarning

warnings.filterwarnings("ignore", category=BiopythonDeprecationWarning)

# Feature generation

In [2]:
def load_sequences(fasta_file):
    """Read a FASTA file into a list of ``(identifier, sequence)`` tuples.

    The file is read twice: a first pass counts the header lines (those
    starting with ``>``) so the progress bar can show a total, and a second
    pass parses the records with ``SimpleFastaParser``.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    list of tuple
        One ``(identifier, sequence)`` pair per record, in file order. The
        identifier is the first whitespace-delimited token of the header line.
    """
    # Pass 1: count records so tqdm can display progress against a known total.
    header_count = 0
    with open(fasta_file, "r") as handle:
        for raw_line in handle:
            if raw_line.startswith(">"):
                header_count += 1

    # Pass 2: parse the records, keeping only the leading token of each title.
    records = []
    with open(fasta_file, "r") as handle:
        progress = tqdm(
            SimpleFastaParser(handle), total=header_count, desc="Loading sequences"
        )
        for title, sequence in progress:
            identifier = title.split()[0]
            records.append((identifier, sequence))
    return records


def generate_features(
    entries,
    is_dna=False,
    chunk_size=None,
    dna_fasta="your_dna.fasta",
    min_length=10,
    scaling=True,
):
    """Compute the feature table for a set of sequences.

    Parameters
    ----------
    entries : sequence of (id, sequence) tuples
        Protein records, e.g. as returned by ``load_sequences``. Must support
        ``len()`` and slicing when ``chunk_size`` is given. Ignored when
        ``is_dna`` is true, because the genomic extractor reads from a file.
    is_dna : bool, default False
        When true, features come from ``create_genomic_features`` applied to
        ``dna_fasta``; otherwise from ``seq2features`` applied to ``entries``.
    chunk_size : int or None, default None
        When given, ``entries`` are passed to ``seq2features`` in consecutive
        slices of this many records and the per-slice frames are concatenated.
        Has no effect when ``is_dna`` is true.
    dna_fasta : str, default "your_dna.fasta"
        FASTA path handed to ``create_genomic_features`` in DNA mode. The
        default is the placeholder the original code used; pass a real path.
    min_length : int, default 10
        Forwarded to ``seq2features``.
    scaling : bool, default True
        Forwarded to ``seq2features``.

    Returns
    -------
    pandas.DataFrame
        The feature table produced by the selected extractor.

    Raises
    ------
    ValueError
        If ``chunk_size`` is given but is not positive.
    """
    if is_dna:
        # The genomic extractor works on a file, not on `entries`, so chunking
        # does not apply. Previously a non-None chunk_size silently bypassed
        # this branch and computed protein features instead.
        return create_genomic_features(dna_fasta, file_format="fasta")

    if chunk_size is None:
        return seq2features(entries, min_length=min_length, scaling=scaling)

    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer or None, got {chunk_size!r}"
        )

    if len(entries) == 0:
        # pd.concat([]) raises "No objects to concatenate"; defer to the
        # unchunked call so empty input behaves the same in both modes.
        return seq2features(entries, min_length=min_length, scaling=scaling)

    # NOTE(review): scaling is applied per chunk. If seq2features scales using
    # statistics of the whole batch, chunked output may differ from a single
    # call — confirm against the seq2features implementation.
    features_list = [
        seq2features(
            entries[start : start + chunk_size],
            min_length=min_length,
            scaling=scaling,
        )
        for start in tqdm(
            range(0, len(entries), chunk_size), desc="Feature extraction"
        )
    ]
    return pd.concat(features_list)


def save_results(features_df, output_file):
    """Write a feature table to ``output_file`` in Parquet format.

    The index is first turned into an ordinary column (an unnamed index comes
    out of ``reset_index`` as ``index`` and is renamed to ``id``), the ``md5``
    column is dropped, and a confirmation line is printed after writing.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Feature table; must contain an ``md5`` column. Not modified.
    output_file : str or path-like
        Destination passed to ``DataFrame.to_parquet``.
    """
    table = (
        features_df.reset_index()
        .rename(columns={"index": "id"})
        .drop(columns=["md5"])
    )
    table.to_parquet(output_file)
    print(f"Features saved to {output_file}")

In [4]:
# End-to-end run for one demonstration genome: read the protein FASTA, compute
# features, and write them next to the input as features_notebook.pa.
# NOTE: when executed as a notebook cell __name__ is "__main__", so this guard
# only has an effect if the code is exported to a module and imported.
if __name__ == "__main__":
    # Load sequences
    entries = load_sequences(
        "../dataset/demonstration_samples/GCF_000009645.1/protein.faa"
    )

    # Generate features
    # chunk_size=100 feeds the extractor 100 records at a time and shows a
    # progress bar over the chunks.
    features_df = generate_features(entries, chunk_size=100)

    # Save results
    # The output is written with DataFrame.to_parquet (Parquet content, ".pa" extension).
    save_results(
        features_df,
        "../dataset/demonstration_samples/GCF_000009645.1/features_notebook.pa",
    )

Loading sequences: 100%|██████████| 2621/2621 [00:00<00:00, 667350.86it/s]
Feature extraction: 100%|██████████| 27/27 [00:00<00:00, 37.92it/s]


Features saved to ../dataset/demonstration_samples/GCF_000009645.1/features_notebook.pa


# Loading and comparing the generated features

In [8]:
import pandas as pd

In [5]:
# Load the two Parquet feature tables that the cells below compare:
# features_notebook.pa is the file written by this notebook's generation step;
# features.pa is presumably the regular pipeline's output for the same genome
# (inferred from the variable name — confirm).
features_notebook = pd.read_parquet(
    "../dataset/demonstration_samples/GCF_000009645.1/features_notebook.pa"
)
features_pipeline = pd.read_parquet(
    "../dataset/demonstration_samples/GCF_000009645.1/features.pa"
)

In [7]:
# Show the column labels of the notebook-generated feature table.
print(features_notebook.columns)

Index(['id', 'PROSITE:ASX_HYDROXYL', 'PROSITE:PHOSPHOPANTETHEINE',
       'PROSITE:EF_HAND_1', 'PROSITE:EGF_1', 'PROSITE:HOMEOBOX_1',
       'PROSITE:ZINC_FINGER_C2H2_1', 'PROSITE:DEAD_ATP_HELICASE',
       'PROSITE:RIBOSOMAL_S12', 'PROSITE:CYTOCHROME_P450',
       ...
       'RED_TRIPEP:SSA', 'RED_TRIPEP:SSC', 'RED_TRIPEP:SSE', 'RED_TRIPEP:SSF',
       'RED_TRIPEP:SSG', 'RED_TRIPEP:SSH', 'RED_TRIPEP:SSK', 'RED_TRIPEP:SSL',
       'RED_TRIPEP:SSP', 'RED_TRIPEP:SSS'],
      dtype='object', length=1712)


In [11]:
# Element-wise comparison of the summary statistics (count, mean, std, min,
# quartiles, max) of the notebook-generated table against the pipeline table.
# NOTE(review): this is a weak equivalence check — `==` on floats is exact,
# NaN never compares equal, and matching summaries do not prove the frames
# match row for row. Consider pd.testing.assert_frame_equal for a strict check.
features_notebook.describe() == features_pipeline.describe()

Unnamed: 0,PROSITE:ASX_HYDROXYL,PROSITE:PHOSPHOPANTETHEINE,PROSITE:EF_HAND_1,PROSITE:EGF_1,PROSITE:HOMEOBOX_1,PROSITE:ZINC_FINGER_C2H2_1,PROSITE:DEAD_ATP_HELICASE,PROSITE:RIBOSOMAL_S12,PROSITE:CYTOCHROME_P450,PROSITE:CARBAMOYLTRANSFERASE,...,RED_TRIPEP:SSA,RED_TRIPEP:SSC,RED_TRIPEP:SSE,RED_TRIPEP:SSF,RED_TRIPEP:SSG,RED_TRIPEP:SSH,RED_TRIPEP:SSK,RED_TRIPEP:SSL,RED_TRIPEP:SSP,RED_TRIPEP:SSS
count,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
mean,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
std,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
min,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
25%,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
50%,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
75%,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
max,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [10]:
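# Optional (disabled): write all feature column names to a file.
# NOTE(review): `features` is not defined in the visible cells — presumably
# `features_notebook` is meant; confirm before re-enabling.
# with open("../document/feature_names.txt", "w") as f:
#     for col in features.columns:
#         f.write(col + "\n")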

In [16]:
from pici_predictor.utilities import predict_function

# Score a random subset of 1000 rows from the de-duplicated protein feature
# table and summarise the predicted per-class probabilities.
# random_state pins the draw so the cell reproduces the same summary on every
# run; without it each execution sampled different rows.
feature_df = pd.read_parquet("../dataset/protein_features_unique.pa").sample(
    n=1000, random_state=42
)
# NOTE(review): despite its name, out_dir holds a CSV file path; presumably
# predict_function writes the predictions there — confirm.
out_dir = "../results/demonstration/GCF_000009645.1/predicted_prob.csv"
model_path = "../models/best_configs"
predicted_prob = predict_function(feature_df, out_dir, model_path)
predicted_prob.describe()

Unnamed: 0,lysis,tail,connector,dna_rna_and_nucleotide_metabolism,head_and_packaging,other,transcription_regulation,moron_auxiliary_metabolic_gene_and_host_takeover,unknown_function,integration_and_excision
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.000639,7.1e-05,5.119628e-07,0.858891,0.000345,6e-05,4.71315e-10,0.040504,0.001123,2.6e-05
std,0.000227,2.1e-05,1.772102e-07,0.045859,0.000167,3.1e-05,2.363847e-10,0.010618,0.000625,2.2e-05
min,0.00011,3.1e-05,5.832449e-08,0.667213,2.5e-05,1.2e-05,1.817702e-10,0.025386,0.000208,7e-06
25%,0.000482,5.8e-05,4.039036e-07,0.831356,0.000234,4.1e-05,2.818803e-10,0.034654,0.000714,1.4e-05
50%,0.000663,6.8e-05,5.011168e-07,0.86387,0.000326,5.5e-05,4.160402e-10,0.038846,0.001013,2e-05
75%,0.000802,8e-05,6.282454e-07,0.890974,0.000425,7.2e-05,6.113164e-10,0.044248,0.001379,2.6e-05
max,0.001259,0.000256,1.206345e-06,0.961023,0.001633,0.000335,2.245561e-09,0.24292,0.007014,0.000239
