In [2]:
!pip install biopython
import Bio
from Bio import Entrez
from Bio import SeqIO

# Always tell NCBI who you are (e.g. your email address)
Entrez.email = "your.email@example.com" # Replace with your email address

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


First, we'll define a keyword to search for in the NCBI Protein database (which includes GenBank translations as well as RefSeq records). Then, we'll use `Entrez.esearch` to find the IDs of matching entries and `Entrez.efetch` to retrieve their sequences in FASTA format.

In [3]:
# Define your search keyword.
# NOTE: this is a free-text term, so hits are not restricted to human
# hemoglobin chains (the run recorded below returned Staphylococcus aureus
# proteins). Entrez field tags, e.g.
# 'hemoglobin[Protein Name] AND "Homo sapiens"[Organism]', narrow the search.
search_keyword = "hemoglobin human"

# Search the protein database for the keyword (retrieve up to 10 record IDs)
handle = Entrez.esearch(db="protein", term=search_keyword, retmax=10)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even if parsing the response fails
    handle.close()

# Get the IDs of the found entries
protein_ids = record["IdList"]
print(f"Found {len(protein_ids)} protein entries for '{search_keyword}'.")

if protein_ids:
    # Fetch the protein sequences in FASTA format (one request for all IDs)
    handle = Entrez.efetch(db="protein", id=protein_ids, rettype="fasta", retmode="text")
    try:
        fasta_records = handle.read()
    finally:
        # Release the connection even if the download is interrupted
        handle.close()

    # Display the fetched FASTA records
    print("\n--- Fetched FASTA Records ---")
    print(fasta_records)
else:
    print("No protein entries found for the given keyword.")

Found 10 protein entries for 'hemoglobin human'.

--- Fetched FASTA Records ---
>WP_458507907.1 haptoglobin-binding heme uptake protein HarA [Staphylococcus aureus]
MNKHHPKLRSFYSIRKSTLGVASVIVSTLFLITSQHQAQAAENTNTSDKISENQNNNATTTQPPKDTNQT
QPATQPTNTAKTYPAADESLKDAIKNPAVENKEHDIGPREQVNFQLLDKNNETQYYHFFSIKDPADVYYT
KKKAEVELDINTASTWKKFEVYENNQKLPVRLVSYSPVPEDHAYIRFPVSDGTQELKIVSSTQIDDGAET
NYDYTKLVFAKTIYNDPSLVKSDTNDAVASNVQSSSDASNQTNTNTSNQNTSTTNNASDQPQATTNMSQP
AKPVSSANADQASSQPAHETNSNGNTNDKTNASSNQSNGNQQYPPADESLQDAIKNPAIIDKEHTADNWR
PIDFQMKNDKGERQFYHYASTVEPATVIFTKTGPIIELGLKTASTWKKFEVYEGDKKLPVELVSYDSDKD
YAYIRFPVSNGAREVKIVSSIEYGENFHEDYDYTLMVFAQPITNNPDDYVDEETYNLQKLLAPYHKAKTL
ERQVYELEKLQDKLPEKYKAEYKKKLDQTRVELADQVKSAVTEFENVTPTNDQLTDLQEAHFVVFESEEN
SESVMDGFVEHPFYTATLNGQKYVVMKTKDDSYWKDLIVEGKRVTTVSKDPKNNSRTLIFPYVSDKAIYN
AIVKVVVANIGYEGQYHVRIVNQDIKTKDDDTSQNNTSEPLNVQTGQENKVSATDTAENSSTATNPKDAS
DKADVIEPESDVVKVTDSNIDKDAHHDVDHLSDMSDNTHLDKYDLKEMDTQIAKDTDKGVDKDADNSVGM
SSNVDTEKDINKNEGKVIQLAHNTDKNNHTGKAAKLDGVRQNYNNIDKVTDKKT

Now, let's save the retrieved sequences to a local FASTA file, then read the file back with `SeqIO.parse` and display each record's ID, description, and the first 50 residues of its sequence.

In [4]:
if protein_ids:
    # Save the FASTA records to a local file
    fasta_filename = "genbank_proteins.fasta"
    with open(fasta_filename, "w") as out_handle:
        out_handle.write(fasta_records)

    # Parse the file back first, so the count reported is the number of
    # sequences actually written rather than the number of IDs requested.
    saved_records = list(SeqIO.parse(fasta_filename, "fasta"))
    print(f"\nSaved {len(saved_records)} sequences to '{fasta_filename}'.")

    # Display each saved sequence's ID, description and a short preview
    print("\n--- Content of the Saved FASTA File ---")
    preview_length = 50  # number of leading residues shown per sequence
    # NOTE: the loop variable `record` rebinds the name that holds the esearch
    # result in the search cell; rerun that cell if the search result is needed.
    for record in saved_records:
        print(f"ID: {record.id}")
        print(f"Description: {record.description}")
        # Only mark the preview as truncated when residues were cut off
        truncation_mark = "..." if len(record.seq) > preview_length else ""
        print(f"Sequence: {record.seq[:preview_length]}{truncation_mark}")
        print("-" * 20)
else:
    print("No sequences to save or display.")


Saved 10 sequences to 'genbank_proteins.fasta'.

--- Content of the Saved FASTA File ---
ID: WP_458507907.1
Description: WP_458507907.1 haptoglobin-binding heme uptake protein HarA [Staphylococcus aureus]
Sequence: MNKHHPKLRSFYSIRKSTLGVASVIVSTLFLITSQHQAQAAENTNTSDKI...
--------------------
ID: WP_458507524.1
Description: WP_458507524.1 iron-regulated surface determinant protein IsdD [Staphylococcus aureus]
Sequence: MRNVKQIATKSIIAIISLGILTYTTMIGSVLADEIKYPSAKFNQPEAKDK...
--------------------
ID: WP_458507358.1
Description: WP_458507358.1 haptoglobin-binding heme uptake protein HarA [Staphylococcus aureus]
Sequence: MNKHHPKLRSFYSIRKSTLGVASVIVSTLFLITSQHQAQAAENTNTSDKI...
--------------------
ID: WP_458507299.1
Description: WP_458507299.1 haptoglobin-binding heme uptake protein HarA [Staphylococcus aureus]
Sequence: MNNHHPKLRSFYSIRKSTLGVASVIVSTLFLITSQHQAQTAENTNTSDKI...
--------------------
ID: WP_458507197.1
Description: WP_458507197.1 haptoglobin-binding heme uptake protein HarA [Staphyloco