In [1]:
# üì¶ Install Biopython
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [2]:
# 📚 Import necessary modules
import xml.etree.ElementTree as ET
from Bio import ExPASy, SeqIO
from Bio.ExPASy import ScanProsite

In [3]:
# 1. Read the TP53 protein record
# The FASTA file lives in the attached Kaggle dataset. SeqIO.read is used
# (rather than SeqIO.parse) because the file is expected to hold one record.
fasta_path = "/kaggle/input/tp53-protein/TP53_Protein.txt"
prot_record = SeqIO.read(fasta_path, "fasta")
print(f"Sequence ID: {prot_record.id}")
print(f"Sequence Length: {len(prot_record.seq)}")

Sequence ID: sp|P04637|P53_HUMAN
Sequence Length: 393


In [4]:
# 2. Perform the ScanProsite search
# The sequence is sent as a plain string to the official ExPASy PROSITE server.
# The mirror is given WITHOUT a trailing slash: ScanProsite.scan appends
# "/cgi-bin/..." to it when building the request URL, so a trailing slash
# would produce "//" in the request path.
handle = ScanProsite.scan(seq=str(prot_record.seq), mirror="https://prosite.expasy.org")

In [5]:
# 3. Parse the XML Response
# The result from ScanProsite is in XML format. We read the raw data once and
# always close the response handle afterwards so the network connection is
# released even if the read fails. Because the handle is consumed here,
# re-running this cell requires re-running the search cell first.
try:
    result_data = handle.read()
finally:
    handle.close()
root = ET.fromstring(result_data)

In [6]:
# 4. Namespace mapping
# Tags in the ScanProsite XML are qualified with the urn:expasy:scanprosite
# namespace. Binding it to the 'sp' prefix lets ElementTree path queries
# address those tags.
ns = dict(sp="urn:expasy:scanprosite")

In [7]:
# 5. Report the ScanProsite hits
# Collect every <match> element anywhere below the root, resolving the 'sp'
# prefix through the namespace mapping.
matches = root.findall(".//sp:match", namespaces=ns)

print(f"\nTotal motifs/signatures found: {len(matches)}")
print("-" * 30)

if not matches:
    print("No matches found in the Prosite database.")
else:
    # Only the first match is detailed: its signature accession and the
    # start/stop positions of the matched region.
    first = matches[0]
    sig_ac = first.find("sp:signature_ac", namespaces=ns).text
    start = first.find("sp:start", namespaces=ns).text
    stop = first.find("sp:stop", namespaces=ns).text

    print(f"Primary Signature ID: {sig_ac}")
    print(f"Sequence Region:      [{start} to {stop}]")


Total motifs/signatures found: 1
------------------------------
Primary Signature ID: PS00348
Sequence Region:      [237 to 249]
