In [3]:
import pandas as pd
from tqdm import tqdm
from Bio.SeqIO.FastaIO import SimpleFastaParser
from feature_generation import seq2features, create_genomic_features
import warnings
from Bio import BiopythonDeprecationWarning

# Ignore warnings of category BiopythonDeprecationWarning from here on.
warnings.filterwarnings("ignore", category=BiopythonDeprecationWarning)

# generation

In [6]:
def load_sequences(fasta_file):
    """Read every record of a FASTA file into a list of ``(name, seq)`` tuples.

    The file is opened twice: once to count the lines starting with ">" so
    the progress bar can show a total, and once to parse the records with
    ``SimpleFastaParser``.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list with one ``(name, seq)`` tuple per parsed record.
    """
    # Pass 1: tally header lines; only used as the progress-bar total.
    header_count = 0
    with open(fasta_file, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                header_count += 1

    # Pass 2: parse the records while reporting progress.
    with open(fasta_file, "r") as handle:
        records = tqdm(
            SimpleFastaParser(handle), total=header_count, desc="Loading sequences"
        )
        parsed = [(title, sequence) for title, sequence in records]
    return parsed


def generate_features(
    entries, is_dna=False, chunk_size=None, dna_fasta_path="your_dna.fasta"
):
    """Build the feature table for a set of sequences.

    Args:
        entries: Sequence records handed to ``seq2features``
            (``load_sequences`` produces ``(name, seq)`` tuples). Ignored
            when ``is_dna`` is true.
        is_dna: When true, features come from
            ``create_genomic_features(dna_fasta_path, file_format="fasta")``.
            That helper is given a file path, so ``entries`` and
            ``chunk_size`` play no role in this mode.
        chunk_size: When ``None``, ``seq2features`` is called once on all
            entries. Otherwise entries are processed in consecutive slices
            of this many records and the per-slice results are concatenated.
        dna_fasta_path: FASTA file read in DNA mode. The default keeps the
            path that was previously hard-coded in the function body.

    Returns:
        Whatever ``create_genomic_features`` / ``seq2features`` return in the
        unchunked modes (presumably a DataFrame -- verify against
        feature_generation); in chunked mode the ``pd.concat`` of the
        per-chunk results, or an empty DataFrame when ``entries`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is given but smaller than 1.
    """
    # DNA mode is checked first so the flag is honoured even when a
    # chunk_size is passed (it used to be silently ignored in that case).
    if is_dna:
        return create_genomic_features(dna_fasta_path, file_format="fasta")

    if chunk_size is None:
        return seq2features(entries, min_length=10, scaling=True)

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    if len(entries) == 0:
        # pd.concat refuses an empty list, so return an empty table instead.
        return pd.DataFrame()

    # NOTE(review): scaling=True is applied to each chunk separately; if the
    # scaling in seq2features depends on the whole input, chunked and
    # unchunked results may differ -- confirm against feature_generation.
    features_list = [
        seq2features(entries[start : start + chunk_size], min_length=10, scaling=True)
        for start in tqdm(
            range(0, len(entries), chunk_size), desc="Feature extraction"
        )
    ]
    # Concatenated without ignore_index, so each chunk keeps its own index labels.
    return pd.concat(features_list)


def save_results(features_df, output_file):
    """Write ``features_df`` to ``output_file`` via ``to_parquet`` and print the path.

    Args:
        features_df: Object exposing ``to_parquet`` (the feature table).
        output_file: Destination path passed to ``to_parquet``.
    """
    features_df.to_parquet(output_file)
    confirmation = f"Features saved to {output_file}"
    print(confirmation)

In [None]:
if __name__ == "__main__":
    # Step 1: read all records from the protein FASTA file.
    entries = load_sequences(fasta_file="../dataset/combined_proteins.fa")

    # Step 2: compute features in slices of 100 entries.
    features_df = generate_features(entries=entries, chunk_size=100)

    # Step 3: write the feature table to disk.
    save_results(
        features_df=features_df,
        output_file="../dataset/protein_features.pa",
    )

# loading

In [1]:
import pandas as pd

In [13]:
# Load the feature table from the path the generation section's save_results call writes to.
features = pd.read_parquet("../dataset/protein_features.pa")

In [None]:
# Show the column labels of the loaded feature table.
print(features.columns)

In [None]:
# Display the feature table as the cell output.
features

In [None]:
# Table dimensions, then the number of distinct "id" and "md5" values.
print(features.shape)
print(features.loc[:, "id"].nunique())
print(features.loc[:, "md5"].nunique())

In [None]:
# Read ../dataset/pcat/all_unique_proteins.csv and keep its "id" column;
# the cell's output is the number of entries in that column.
all_unique_ids = pd.read_csv("../dataset/pcat/all_unique_proteins.csv")
unique_ids = all_unique_ids.loc[:, "id"]
len(unique_ids.index)

In [17]:
# Restrict the feature table to rows whose "id" appears in unique_ids.
features = features.loc[lambda frame: frame["id"].isin(unique_ids)]

In [None]:
# Same summary as before the id filter: dimensions, distinct "id" count,
# distinct "md5" count.
print(features.shape)
print(features.loc[:, "id"].nunique())
print(features.loc[:, "md5"].nunique())

In [None]:
# Drop the md5 column. errors="ignore" keeps this cell re-runnable: without
# it a second execution raises KeyError because md5 is already gone.
features = features.drop(columns=["md5"], errors="ignore")

In [21]:
# Save the current feature table (after the id filter and md5 drop above) to the
# path that the "unique proteins" section reads back.
features.to_parquet("../dataset/protein_features_unique.pa")

# unique proteins

In [3]:
# Reload the table saved to this path at the end of the loading section.
protein_features_unique = pd.read_parquet("../dataset/protein_features_unique.pa")

In [None]:
# Write the table's column labels to a one-column CSV ("feature_name"),
# without the row index.
column_names = pd.DataFrame({"feature_name": protein_features_unique.columns})
column_names.to_csv("../dataset/feature_names.csv", index=False)

In [5]:
# Drop every column whose name contains "DIPEP" or "TRIPEP", then save the
# reduced table under its own file name.
protein_features_unique = protein_features_unique.drop(
    columns=[
        name
        for name in protein_features_unique.columns
        if any(token in name for token in ("DIPEP", "TRIPEP"))
    ]
)
protein_features_unique.to_parquet(
    "../dataset/protein_features_unique_no_dipep_tripep.pa"
)

In [6]:
# (rows, columns) of protein_features_unique; when run after the cell above,
# this is the table without the DIPEP/TRIPEP columns.
protein_features_unique.shape

(360413, 212)