In [4]:
import requests
import pandas as pd
import json
import os
import numpy as np

# ✅ Location of the partially annotated training set
partial_file = "/Users/carmenshero/Desktop/Datasets/Partial_Training.csv"

# ✅ Read the dataset, forcing the two text columns to be parsed as strings
text_columns = {"ec_numbers": str, "sequence": str}
df_partial = pd.read_csv(partial_file, dtype=text_columns, low_memory=False)

# ✅ Normalise missing text values (the literal "nan" or a real NaN) to ""
for text_column in text_columns:
    df_partial[text_column] = df_partial[text_column].replace(["nan", np.nan], "")

# ✅ Add the progress-tracking flag when the file does not carry it yet
if "Processed_Motifs" not in df_partial.columns:
    df_partial["Processed_Motifs"] = False  # every row starts as unprocessed

print(f"✔ Loaded Partial_Training.csv with {len(df_partial)} entries.")


✔ Loaded Partial_Training.csv with 5453 entries.


In [2]:
# ✅ Function to fetch motifs for a given protein sequence
def get_protein_motifs(sequence, timeout=30):
    """Retrieve motifs (PROSITE signatures) based on the protein sequence.

    Parameters
    ----------
    sequence : str
        Protein sequence submitted to the PROSITE scan endpoint. Any value
        that is not a non-blank string (None, NaN, "", whitespace only, or
        the placeholder "N/A") is treated as missing.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The "signature_ac" value of every entry in the response's "matchset",
        in response order (duplicates are kept). An empty list is returned
        when the sequence is missing, the request fails, the response is not
        valid JSON, or nothing matched. This function never raises for
        network or decoding problems; it prints a warning instead.
    """

    # ✅ Rows read from CSV can hold NaN (a float) or None; calling .strip()
    #    on those would raise, so reject every non-string up front.
    if not isinstance(sequence, str) or sequence == "N/A" or sequence.strip() == "":
        print("⚠️ No sequence available, skipping motif search.")
        return []

    # ✅ Define PROSITE Scan API URL
    url = "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"

    # ✅ Set API request parameters
    params = {
        "seq": sequence,  # Submit the actual protein sequence
        "output": "json"  # Request JSON output
    }

    # ✅ Send request; the timeout keeps one stalled connection from hanging
    #    a loop that issues thousands of requests.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as error:
        print(f"⚠️ Failed to retrieve motifs for sequence ({error})")
        return []

    if response.status_code != 200:
        print(f"⚠️ Failed to retrieve motifs for sequence (Error {response.status_code})")
        return []

    # ✅ A 200 response is not guaranteed to carry JSON (e.g. an HTML error page)
    try:
        data = response.json()
    except ValueError:
        print("⚠️ Failed to retrieve motifs for sequence (response was not valid JSON)")
        return []

    # ✅ Extract motif signatures (e.g., PS00010), dropping entries without one
    #    so callers never receive None mixed in with accession strings.
    matches = (data.get("matchset") or []) if isinstance(data, dict) else []
    motifs = []
    for match in matches:
        signature = match.get("signature_ac")
        if signature:
            motifs.append(signature)

    return motifs


In [None]:
# ✅ Process each row individually: look up the motifs of every unprocessed
#    sequence, flag the matching motif columns with 1, and persist progress.
# NOTE(review): motifs are only recorded when a column with that name already
# exists in df_partial, yet the row is still marked as processed afterwards.
# Make sure the motif columns have been created before running this cell.
for index, row in df_partial.iterrows():
    # ✅ Skip already processed rows
    if row["Processed_Motifs"]:
        continue

    sequence = row["sequence"]  # ✅ Use sequence instead of UniProt ID

    # ✅ Skip missing sequences (NaN/None are not strings; blanks carry no data)
    if not isinstance(sequence, str) or sequence.strip() == "":
        continue

    try:
        # ✅ Fetch motifs using sequence
        motifs = get_protein_motifs(sequence)

        if not motifs:
            print(f"⚠️ No motifs found for sequence at index {index}. Skipping.")
            continue  # Skip if no motifs were detected

        # ✅ Print motifs found for debugging
        print(f"✔ Found motifs for index {index}: {motifs}")

        # ✅ Ensure motifs are properly encoded
        for motif in motifs:
            if motif in df_partial.columns:
                df_partial.at[index, motif] = 1  # Mark presence
            else:
                print(f"⚠️ Motif '{motif}' not found in columns! Skipping.")

        # ✅ Mark row as processed
        df_partial.at[index, "Processed_Motifs"] = True

        # ✅ Save after processing each row (ensures progress is retained).
        #    Write to a sibling temp file and swap it in, so an interruption
        #    in the middle of a write cannot truncate the only copy of the
        #    dataset.
        tmp_file = f"{partial_file}.tmp"
        df_partial.to_csv(tmp_file, index=False)
        os.replace(tmp_file, partial_file)

        print(f"✔ Processed sequence {index} and saved progress.")

    except Exception as e:
        # Best-effort run: report the failure and move on to the next row.
        print(f"⚠️ Error processing sequence {index}: {e}")
        continue


⚠️ No motifs found for sequence at index 0. Skipping.
⚠️ No motifs found for sequence at index 1. Skipping.
⚠️ No motifs found for sequence at index 2. Skipping.
⚠️ No motifs found for sequence at index 3. Skipping.
⚠️ No motifs found for sequence at index 4. Skipping.
⚠️ No motifs found for sequence at index 5. Skipping.
✔ Found motifs for index 6: ['PS50404', 'PS50405']
✔ Processed sequence 6 and saved progress.
✔ Found motifs for index 7: ['PS51679', 'PS00094', 'PS00095']
✔ Processed sequence 7 and saved progress.
⚠️ No motifs found for sequence at index 8. Skipping.
⚠️ No motifs found for sequence at index 9. Skipping.
⚠️ No motifs found for sequence at index 10. Skipping.
⚠️ No motifs found for sequence at index 11. Skipping.
⚠️ No motifs found for sequence at index 12. Skipping.
⚠️ No motifs found for sequence at index 13. Skipping.
✔ Found motifs for index 14: ['PS00387']
✔ Processed sequence 14 and saved progress.
⚠️ No motifs found for sequence at index 15. Skipping.
⚠️ No mot

In [3]:
import pandas as pd

# ✅ Path of the dataset whose progress flags are being cleared
partial_file = "/Users/carmenshero/Desktop/Datasets/Partial_Training.csv"

# ✅ Re-read the dataset from disk
df_partial = pd.read_csv(partial_file, dtype={"ec_numbers": str, "sequence": str}, low_memory=False)

# ✅ A plain assignment covers both situations: it creates "Processed_Motifs"
#    when the column is absent and overwrites every value when it is present.
df_partial["Processed_Motifs"] = False

# ✅ Write the dataset back to the same file
df_partial.to_csv(partial_file, index=False)

print("✔ Reset all values in 'Processed_Motifs' to False.")


✔ Reset all values in 'Processed_Motifs' to False.


In [None]:
# ✅ Step 1: Collect all unique motifs across Partial_Training.csv
all_unique_motifs = set()

for index, row in df_partial.iterrows():
    # ✅ Skip rows that are already processed
    if row["Processed_Motifs"]:
        continue

    sequence = row["sequence"]  # ✅ Use sequence instead of UniProt ID

    # ✅ Skip missing sequences: a freshly loaded frame holds NaN (a float)
    #    for empty cells, which must not reach the motif lookup.
    if not isinstance(sequence, str) or sequence.strip() == "":
        continue

    # ✅ Query motifs based on sequence; a single failing request must not
    #    throw away the motifs gathered from all previous rows.
    try:
        motifs = get_protein_motifs(sequence)
    except Exception as e:
        print(f"⚠️ Error processing sequence {index}: {e}")
        continue

    # Drop empty/None entries so the sort below only ever compares strings.
    all_unique_motifs.update(motif for motif in motifs if motif)

# ✅ Convert motifs into a sorted list for consistency
all_unique_motifs = sorted(all_unique_motifs)

# ✅ Step 2: Add new motif columns to the DataFrame (initialize to 0).
#    All missing columns are appended in one concat instead of one insert per
#    motif, which avoids fragmenting the frame when there are many motifs.
new_motifs = [motif for motif in all_unique_motifs if motif not in df_partial.columns]
if new_motifs:
    zero_block = pd.DataFrame(0, index=df_partial.index, columns=new_motifs)
    df_partial = pd.concat([df_partial, zero_block], axis=1)

# ✅ Save updated Partial_Training.csv
df_partial.to_csv(partial_file, index=False)
print(f"✔ Extracted {len(all_unique_motifs)} unique motifs. Added missing columns and saved dataset.")
