In [9]:
import pandas as pd
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser
from feature_generation import seq2features, create_genomic_features
import warnings
from Bio import BiopythonDeprecationWarning

# Silence Biopython deprecation warnings for the rest of the kernel session.
# The filter is installed after the imports above, so it only affects warnings
# raised from this point on, not any that were raised while importing.
warnings.filterwarnings("ignore", category=BiopythonDeprecationWarning)

In [6]:
def load_sequences(fasta_file):
    """Read every record of a FASTA file into an in-memory list.

    The file is opened twice: once to count header lines (those starting with
    ``>``) so the progress bar can show a total, and once to parse the records.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(name, seq)`` tuples, one per record produced by
        ``SimpleFastaParser``, in file order.
    """
    # Pass 1: tally the header lines; this is only used as the progress total.
    header_count = 0
    with open(fasta_file, "r") as handle:
        for raw_line in handle:
            if raw_line.startswith(">"):
                header_count += 1

    # Pass 2: parse the records while reporting progress.
    records = []
    with open(fasta_file, "r") as handle:
        progress = tqdm(
            SimpleFastaParser(handle), total=header_count, desc="Loading sequences"
        )
        for title, sequence in progress:
            records.append((title, sequence))
    return records


def generate_features(
    entries, is_dna=False, chunk_size=None, dna_fasta="your_dna.fasta"
):
    """Build a feature table for a collection of sequences.

    Args:
        entries: List-like of ``(name, sequence)`` pairs (the shape returned by
            ``load_sequences``). It is measured with ``len`` and sliced, so a
            generator will not work. Not used when ``is_dna`` is true.
        is_dna: When true, features are produced by
            ``create_genomic_features(dna_fasta, file_format="fasta")``, which
            reads from a file rather than from ``entries``.
        chunk_size: If ``None``, ``seq2features`` is called once on all of
            ``entries``. Otherwise ``entries`` is processed in consecutive
            slices of this many items and the per-slice results are combined
            with ``pd.concat``. Ignored (with a warning) when ``is_dna`` is
            true, because the DNA path does not consume ``entries``.
        dna_fasta: Path handed to ``create_genomic_features`` on the DNA path.
            The default keeps the placeholder the function previously had
            hard-coded; pass the real file when using ``is_dna=True``.

    Returns:
        Whatever ``seq2features`` / ``create_genomic_features`` return
        (presumably a pandas DataFrame -- confirm in ``feature_generation``).
        In chunked mode, the concatenation of the per-chunk results, or an
        empty ``pd.DataFrame`` when ``entries`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is given but is not a positive number.
    """
    if is_dna:
        # Previously ``is_dna`` was silently ignored whenever ``chunk_size`` was
        # set, so DNA input was run through the protein feature extractor.
        if chunk_size is not None:
            import warnings

            warnings.warn(
                "chunk_size is ignored when is_dna=True; features are read "
                "from dna_fasta in a single call",
                stacklevel=2,
            )
        return create_genomic_features(dna_fasta, file_format="fasta")

    if chunk_size is None:
        return seq2features(entries, min_length=10, scaling=True)

    if chunk_size <= 0:
        # range() would reject 0 and yield nothing for negatives; fail early
        # with a message that names the actual problem.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # NOTE(review): ``scaling=True`` is applied per chunk. If the scaling in
    # seq2features depends on the other sequences in the call, chunked output
    # will differ from a single full call -- confirm in feature_generation.
    features_list = [
        seq2features(entries[start : start + chunk_size], min_length=10, scaling=True)
        for start in tqdm(
            range(0, len(entries), chunk_size), desc="Feature extraction"
        )
    ]

    if not features_list:
        # pd.concat raises ValueError on an empty list; an empty input simply
        # has no features.
        return pd.DataFrame()
    return pd.concat(features_list)


def save_results(features_df, output_file):
    """Write the feature table to ``output_file`` as Parquet and report it.

    Args:
        features_df: Object exposing ``to_parquet`` (a pandas DataFrame here).
        output_file: Destination path passed straight to ``to_parquet``.
    """
    features_df.to_parquet(output_file)
    confirmation = f"Features saved to {output_file}"
    print(confirmation)

In [None]:
if __name__ == "__main__":
    # End-to-end feature build: FASTA -> feature table -> Parquet file.
    # Inside a notebook kernel ``__name__`` is "__main__", so this guard always
    # passes and the full (expensive) computation runs whenever the cell does.

    # Load sequences
    # Reads every record of the combined protein FASTA into memory.
    entries = load_sequences("../dataset/combined_proteins.fa")

    # Generate features
    # chunk_size=100 -> seq2features is called on 100 entries at a time and the
    # per-chunk results are concatenated.
    features_df = generate_features(entries, chunk_size=100)

    # Save results
    # This is the same path that later cells read back and then overwrite.
    save_results(features_df, "../dataset/protein_features.pa")

In [10]:
# Reload the feature table from the Parquet file written by save_results,
# so the following cells can run without recomputing the features.
features = pd.read_parquet("../dataset/protein_features.pa")
# (rows, columns) of the loaded table.
features.shape

(927040, 1712)

In [12]:
# Inspect the column labels of the loaded feature table.
features.columns

Index(['md5', 'PROSITE:ASX_HYDROXYL', 'PROSITE:PHOSPHOPANTETHEINE',
       'PROSITE:EF_HAND_1', 'PROSITE:EGF_1', 'PROSITE:HOMEOBOX_1',
       'PROSITE:ZINC_FINGER_C2H2_1', 'PROSITE:DEAD_ATP_HELICASE',
       'PROSITE:RIBOSOMAL_S12', 'PROSITE:CYTOCHROME_P450',
       ...
       'RED_TRIPEP:SSA', 'RED_TRIPEP:SSC', 'RED_TRIPEP:SSE', 'RED_TRIPEP:SSF',
       'RED_TRIPEP:SSG', 'RED_TRIPEP:SSH', 'RED_TRIPEP:SSK', 'RED_TRIPEP:SSL',
       'RED_TRIPEP:SSP', 'RED_TRIPEP:SSS'],
      dtype='object', length=1712)

In [24]:
# Move the (unnamed) index into a regular column called "id".
# Guarded so the cell is idempotent: a later cell writes this frame back to the
# same Parquet file it was loaded from, so on a re-run (or after reloading the
# rewritten file) the "id" column already exists. Resetting again would create
# a second "id" column and break the `features["id"].str` step further down.
if "id" not in features.columns:
    features = features.reset_index().rename(columns={"index": "id"})
features

Unnamed: 0,id,md5,PROSITE:ASX_HYDROXYL,PROSITE:PHOSPHOPANTETHEINE,PROSITE:EF_HAND_1,PROSITE:EGF_1,PROSITE:HOMEOBOX_1,PROSITE:ZINC_FINGER_C2H2_1,PROSITE:DEAD_ATP_HELICASE,PROSITE:RIBOSOMAL_S12,...,RED_TRIPEP:SSA,RED_TRIPEP:SSC,RED_TRIPEP:SSE,RED_TRIPEP:SSF,RED_TRIPEP:SSG,RED_TRIPEP:SSH,RED_TRIPEP:SSK,RED_TRIPEP:SSL,RED_TRIPEP:SSP,RED_TRIPEP:SSS
0,MH669004_00012 head-tail_adaptor_Ad1,2ff0e79a1f5ff426b7825da909d5a0a6,0,0,0,0,0,0,0,0,...,0.012903,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,MH669004_00013 head_closure_Hc1,06b55e692677f30e9023c95c4a3d876f,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,MH669004_00014 tail_completion_or_Neck1_protein,7af629d61338fc3f1d72497af8b4338c,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
3,MH669004_00015 tail_terminator,1133055ee6cbdb96e3272a6d1ec452ea,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,MK562502_00076 tail_completion_or_Neck1_protein,49b382ff261cf081f267d23078ae3c13,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.000000,0.0,0.006250,0.000000,0.006250,0.006250,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
927035,AWHKGCFP_CDS_0011 hypothetical protein,058a25b24f8f77330d3d3b52d8e50041,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
927036,AWHKGCFP_CDS_0013 hypothetical protein,9fcda9da931e195cc06c274449ee56fc,0,0,0,0,0,0,0,0,...,0.009009,0.0,0.009009,0.0,0.009009,0.000000,0.000000,0.000000,0.0,0.0
927037,AWHKGCFP_CDS_0014 hypothetical protein,a63c3f265e92f5dc73bda2e79d1b0a96,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.005882,0.0,0.000000,0.005882,0.000000,0.005882,0.0,0.0
927038,AWHKGCFP_CDS_0021 hypothetical protein,fdbec28bf93d8146074822fc1e7f54b8,0,0,0,0,0,0,0,0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0


In [29]:
# Count the distinct values in the md5 column (presumably a checksum of each
# sequence -- confirm in feature_generation). A count lower than the number of
# rows means several rows share the same md5.
features["md5"].nunique()

360244

In [25]:
# Keep only the first whitespace-delimited token of each id, dropping the
# description that follows it
# (e.g. "MH669004_00012 head-tail_adaptor_Ad1" -> "MH669004_00012").
features["id"] = features["id"].str.split().str[0]

In [30]:
# Persist the frame with its new "id" column. This writes to the same path the
# table was loaded from, so the original feature file is overwritten and any
# later reload will already contain the "id" column.
features.to_parquet("../dataset/protein_features.pa")