In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [2]:
import os

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO, Entrez

In [6]:
# Simple way to retrieve a protein sequence, save it locally and BLAST it.

# Setting up your email to be able to use Entrez
Entrez.email = 'riccardo.tedoldi@studenti.unitn.it'

# Download the record "2KIU_A" from the NCBI protein database in FASTA format.
# The network handle is closed in a finally-clause so it is released even if
# the response cannot be parsed as a single FASTA record.
temp = Entrez.efetch(db="protein",rettype="fasta",id="2KIU_A")
try:
    aaseq = SeqIO.read(temp, format="fasta")
finally:
    temp.close()

# Keep a local FASTA copy of the downloaded record; the with-block closes the
# file even if writing fails.
with open("2KIU_A.fasta",'w') as aaseq_out:
    SeqIO.write(aaseq,aaseq_out,"fasta")

# Offline alternative: build the query by hand instead of downloading it.
# from Bio.Seq import Seq
# my_seq = Seq("AEVRPPFTYASLIRQAILESPEKQLTLNEIYNWFTRMFPYFRRNAATWKNAVRHNLSLHKYFVRVENVKGAVWTVDEVEFQKRRPQK")

from Bio.Blast import NCBIWWW

# Submit the sequence to NCBI BLAST (blastp against the "pdb" database),
# restricting hits to human entries. `result_handle` holds the XML reply and
# is parsed and closed by the next cell.
result_handle = NCBIWWW.qblast("blastp", "pdb", aaseq.seq, entrez_query='human[organism]')

In [7]:
from Bio.Blast import NCBIXML

# Parse the BLAST XML reply into a single record. The handle is closed in a
# finally-clause so it is released even when the reply cannot be parsed.
try:
    blast_record = NCBIXML.read(result_handle)
finally:
    result_handle.close()

In [8]:
# Print a short report for every HSP of every hit in the parsed BLAST record:
# hit title, hit length, e-value and the first 75 columns of the aligned
# query / match / subject rows.
hit_hsp_pairs = (
    (hit, segment)
    for hit in blast_record.alignments
    for segment in hit.hsps
)
for hit, segment in hit_hsp_pairs:
    print('****Alignment****')
    summary_rows = (
        ('sequence:', hit.title),
        ('length:', hit.length),
        ('e value:', segment.expect),
    )
    for label, value in summary_rows:
        print(label, value)
    for row in (segment.query, segment.match, segment.sbjct):
        print(row[:75] + '...')

****Alignment****
sequence: pdb|2KIU|A Chain A, Forkhead box protein P1 [Homo sapiens]
length: 87
e value: 2.07572e-61
AEVRPPFTYASLIRQAILESPEKQLTLNEIYNWFTRMFPYFRRNAATWKNAVRHNLSLHKYFVRVENVKGAVWTV...
AEVRPPFTYASLIRQAILESPEKQLTLNEIYNWFTRMFPYFRRNAATWKNAVRHNLSLHKYFVRVENVKGAVWTV...
AEVRPPFTYASLIRQAILESPEKQLTLNEIYNWFTRMFPYFRRNAATWKNAVRHNLSLHKYFVRVENVKGAVWTV...
****Alignment****
sequence: pdb|6XAT|A Chain A, FOXP4 protein [Homo sapiens]
length: 110
e value: 1.10651e-54
AEVRPPFTYASLIRQAILESPEKQLTLNEIYNWFTRMFPYFRRNAATWKNAVRHNLSLHKYFVRVENVKGAVWTV...
A+VRPPFTYASLIRQAILE+P++QLTLNEIYNWFTRMF YFRRN ATWKNAVRHNLSLHK FVRVENVKGAVWTV...
ADVRPPFTYASLIRQAILETPDRQLTLNEIYNWFTRMFAYFRRNTATWKNAVRHNLSLHKCFVRVENVKGAVWTV...
****Alignment****
sequence: pdb|2A07|F Chain F, Forkhead box protein P2 [Homo sapiens] >pdb|2A07|G Chain G, Forkhead box protein P2 [Homo sapiens] >pdb|2A07|H Chain H, Forkhead box protein P2 [Homo sapiens] >pdb|2A07|I Chain I, Forkhead box protein P2 [Homo sapiens] >pdb|2A07|J Chain J, Forkhead 

In [14]:
"""
    Here I list proteins with different functions:
        
        Hemoglobin - transports oxygen in the blood
        Insulin - regulates glucose metabolism
        Myosin - facilitates muscle contraction
        Collagen - provides structural support to tissues
        Immunoglobulin - recognizes and neutralizes foreign substances (antigens)
        Amylase - breaks down carbohydrates in the digestive system
        Trypsin - breaks down proteins in the digestive system
        Keratin - forms the structural components of skin, hair, and nails
        Fibrinogen - plays a key role in blood clotting
        Erythropoietin - stimulates the production of red blood cells
        Albumin - maintains the osmotic pressure of blood and transports various molecules
        Elastin - provides elasticity to tissues such as skin, arteries, and lungs
        Catalase - breaks down hydrogen peroxide in cells
        Protein kinase - regulates various cellular processes by modifying other proteins through phosphorylation
        Tubulin - forms the structural components of microtubules, which are important for cell division and intracellular transport.


        Hemoglobin: NP_000509.1 (beta subunit)
        Insulin: NP_000198.1 (preproinsulin)
        Actin: NP_001092.2 (actin, alpha cardiac muscle 1)
        Collagen: NP_000079.3 (collagen alpha-1(I) chain)
        Immunoglobulin: NP_001138834.1 (immunoglobulin heavy constant gamma 1)
        Amylase: NP_000681.2 (amylase, alpha 1A)
        Trypsin: NP_001239255.1 (trypsinogen 1)
        Keratin: NP_006116.2 (keratin, type II cytoskeletal 1)
        Fibrinogen: NP_000500.3 (fibrinogen alpha chain)
        Erythropoietin: NP_000790.1 (erythropoietin precursor)
        Albumin: NP_000468.2 (serum albumin)
        Elastin: NP_000088.1 (elastin)
        Catalase: NP_001752.1 (catalase)
        Protein kinase: NP_001158288.1 (protein kinase C alpha type)
        Tubulin: NP_001191.1 (tubulin beta chain)


    I will try to retrieve sequences whose function is expected
    to be similar, given the similarity of their sequences.
        
"""


def get_protein_id(protein_name, mail: str = 'riccardo.tedoldi@studenti.unitn.it'):
    """ Search the NCBI Protein database for a name and return the first hit's ID.

    Args:
        protein_name (str): The search term sent to Entrez (e.g. "hemoglobin").
        mail (str): The email address to use for the Entrez API.

    Returns:
        str: The first entry of the search result's ``IdList``. This is the
            identifier ESearch lists first for the term (a numeric ID in the
            recorded runs of this notebook), not an accession.version string
            such as "NP_000509.1".

    Raises:
        ValueError: If the search reports zero matches for ``protein_name``.

    """
    # Set up the Entrez API
    Entrez.email = mail
    db = "protein"

    # Send a search request to the NCBI Protein database. The handle is closed
    # in a finally-clause so it is released even if the reply cannot be parsed.
    handle = Entrez.esearch(db=db, term=protein_name)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    # "Count" is the total number of matches reported by the search.
    if int(record["Count"]) <= 0:
        raise ValueError(f"No results found for {protein_name}")

    # Only the top-ranked hit is used.
    protein_id = record["IdList"][0]
    print(f"The NCBI Protein accession ID for {protein_name} is {protein_id}")
    return protein_id

# Proteins studied in this notebook.
#
# Each row pairs the search term used for the Entrez lookup with a hard-coded
# accession for that protein. Transposing the rows with zip() yields the two
# parallel lists used by the rest of the notebook: `proteins_name` and
# `proteins_accession` (same order, same length).
proteins_name, proteins_accession = (
    list(column)
    for column in zip(
        ("hemoglobin", "NP_000509.1"),
        ("insulin", "NP_000198.1"),
        ("myosin", "CAA86293.1"),
        ("collagen", "NP_000079.2"),
        ("immunoglobulin", "CAA41852.1"),
        ("amylase", "NP_000681.2"),
        ("trypsin", "KGI35851.1"),
        ("keratin", "CAA73943.1"),
        ("fibrinogen", "NP_000500.2"),
        ("erythropoietin", "NP_000790.1"),
        ("albumin", "NP_000468.1"),
        ("elastin", "NP_000088.1"),
        ("catalase", "NP_001743.1"),
        ("protein kinase", "AAA58363.1"),
        ("tubulin", "NP_001280141.1"),
    )
)

# Look up, for every name, the first ID returned by an Entrez search.
proteins_id = list(map(get_protein_id, proteins_name))


The NCBI Protein accession ID for hemoglobin is 2487155142
The NCBI Protein accession ID for insulin is 2487199811
The NCBI Protein accession ID for myosin is 2487201619
The NCBI Protein accession ID for collagen is 2487200458
The NCBI Protein accession ID for immunoglobulin is 2487201646
The NCBI Protein accession ID for amylase is 2487158369
The NCBI Protein accession ID for trypsin is 2487199163
The NCBI Protein accession ID for keratin is 2487199691
The NCBI Protein accession ID for fibrinogen is 2487193546
The NCBI Protein accession ID for erythropoietin is 2487161332
The NCBI Protein accession ID for albumin is 2487193311
The NCBI Protein accession ID for elastin is 2487202389
The NCBI Protein accession ID for catalase is 2487169621
The NCBI Protein accession ID for protein kinase is 2487202646
The NCBI Protein accession ID for tubulin is 2487202509


In [12]:

def fetch_protein(proteins_name, protein_id, mail: str = 'riccardo.tedoldi@studenti.unitn.it'):
    """ Fetch a protein sequence from NCBI Protein database and save it in FASTA format

    Args:
        proteins_name (str): name of the protein (used as the file-name prefix)
        protein_id (str): NCBI Protein identifier passed to ``Entrez.efetch``
        mail (str): email address to use for the Entrez API

    Returns:
        None

    The record is written to ``./proteins/<proteins_name>.<protein_id>.fasta``;
    the ``./proteins/`` directory is created if it does not exist yet.

    """

    # Set up the Entrez API
    Entrez.email = mail
    db = "protein"
    path = './proteins/'

    # Download the record. The network handle is closed in a finally-clause so
    # it is released even if the reply is not a single valid FASTA record.
    temp = Entrez.efetch(db=db,rettype="fasta",id=protein_id)
    try:
        seq = SeqIO.read(temp, format="fasta")
    finally:
        temp.close()

    # The output directory is not guaranteed to exist on a fresh checkout;
    # without this, open() below fails with FileNotFoundError.
    os.makedirs(path, exist_ok=True)

    # Write the record; the with-block closes the file even if writing fails.
    with open(f"{path}{proteins_name}.{protein_id}.fasta",'w') as seq_out:
        SeqIO.write(seq,seq_out,"fasta")



# Download every protein: the name at position k in `proteins_name` is saved
# together with the accession at the same position in `proteins_accession`.
for position, name in enumerate(proteins_name):
    accession = proteins_accession[position]
    fetch_protein(name, accession)

In [None]:
"""Some useful parameters for the BLAST search are:

        program blastn, blastp, blastx, tblastn, or tblastx (lower case)

        database Which database to search against (e.g. “nr”).

        sequence The sequence to search.

        ncbi_gi TRUE/FALSE whether to give ‘gi’ identifier.

        descriptions Number of descriptions to show. Def 500.

        alignments Number of alignments to show. Def 500.

        expect An expect value cutoff. Def 10.0.

        matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).

        filter “none” turns off filtering. Default no filtering

        format_type “HTML”, “Text”, “ASN.1”, or “XML”. Def. “XML”.

        entrez_query Entrez query to limit Blast search

        hitlist_size Number of hits to return. Default 50

        megablast TRUE/FALSE whether to use the MegaBLAST algorithm (blastn only)

        short_query TRUE/FALSE whether to adjust the search parameters for a
            short query sequence. Note that this will override manually set 
            parameters like word size and e value. Turns off when sequence 
            length is > 30 residues. Default: None.

        service plain, psi, phi, rpsblast, megablast (lower case)

"""

# Here we run blastp to retrieve similar sequences
# that might have the same function as the human protein
# found. Specifically, with blastp we can retrieve similar
# sequences in the same organism or in other organisms;
# this allows us to explore how evolution affected
# different species.

import glob
import os
from Bio.Blast import NCBIWWW




def blastp(protein_name, db = 'nr'):
    """ Perform a blastp search for the downloaded FASTA file(s) of the given protein(s).

    The function scans ``./proteins/*.fasta`` (files named
    ``<name>.<accession>.fasta``) and submits every file whose ``<name>``
    was requested to NCBI BLAST.

    Args:
        protein_name (str | Iterable[str] | None): A single protein name, a
            collection of names, or None to process every FASTA file found.
        db (str): The database to use for the blastp search.

    Returns:
        None

    Save the blastp results in XML format in
    ./proteins/<name>/blast_results<name>.<db>.xml

    """
    database = db

    # Normalise the selection into a set of names (None means "no filter").
    if protein_name is None:
        wanted = None
    elif isinstance(protein_name, str):
        wanted = {protein_name}
    else:
        wanted = set(protein_name)

    # Sorted so that the processing order is deterministic across runs.
    for path in sorted(glob.glob('./proteins/*.fasta')):

        # The protein name is everything before the first dot of the file
        # name; basename() keeps this independent of the path separator.
        name = os.path.basename(path).split('.')[0]
        if wanted is not None and name not in wanted:
            continue

        # create a folder for each protein if not already exists
        out_blast = f'./proteins/{name}/'
        os.makedirs(out_blast, exist_ok=True)

        # read the fasta file
        seq = SeqIO.read(path, format='fasta')

        print(f"Performing blastp search for {name} in {database} database")

        result_handle = NCBIWWW.qblast("blastp", database, seq.seq)
        try:
            print(f"Saving the results for {name} in {database} database")

            # Save the results to a file
            with open(f"{out_blast}blast_results{name}.{database}.xml", "w") as out_handle:
                out_handle.write(result_handle.read())
        finally:
            # Release the BLAST reply even if writing the file failed.
            result_handle.close()

        print(f"Results saved for {name} in {database} database")
        print()


# Candidate list of alternative databases (currently unused):
# database = ['pdb', 'nt']

# Launch the blastp searches against the default database ('nr').
blastp(proteins_name)



In [33]:
# Parse a saved BLAST XML report for keratin and print the title of every hit.
# NOTE(review): this path does not follow the file name that blastp() writes
# ("./proteins/<name>/blast_results<name>.<db>.xml") — confirm the file exists
# under this name before running on a fresh checkout.
with open("./proteins/keratin/keratin-blastp.xml") as in_handle:
    # The records are iterated inside the with-block, while the file is open.
    blast_records = NCBIXML.parse(in_handle)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            print(f"Alignment: {alignment.title}")