In [18]:
# parse the blastp output file

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO, Entrez
from io import StringIO

sequences = []

# Parse the results from the files
with open("./proteins/keratin/keratin-blastp.xml") as in_handle:
    blast_records = NCBIXML.parse(in_handle)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < 0.0001:
                    print("****Alignment****")
                    print("sequence:", alignment.title)
                    print("length:", alignment.length)
                    print("e value:", hsp.expect)
                    print(hsp.query[0:75] + "...")
                    print(hsp.match[0:75] + "...")
                    print(hsp.sbjct[0:75] + "...")
        

****Alignment****
sequence: gb|EAW58243.1| hCG1997648 [Homo sapiens] >gb|KAI4066085.1| keratin 81 [Homo sapiens] >dbj|BAI45776.1| keratin 81, partial [synthetic construct] >emb|CAA73943.1| keratin [Homo sapiens]
length: 505
e value: 0.0
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
****Alignment****
sequence: ref|XP_018893534.2| keratin, type II cuticular Hb1 [Gorilla gorilla gorilla]
length: 505
e value: 0.0
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
****Alignment****
sequence: ref|XP_024112725.1| keratin, type II cuticular Hb1 [Pongo abelii] >gb|PNJ25503.1| KRT86 isoform 1 [Pongo abelii]
length: 505
e valu

In [43]:
import glob

def from_xml_to_protein_sequences():
    """Extract the protein sequences from the BLAST XML output file and write to FASTA file.

    Args:
        None

    Returns:    
        None
    
    """

    # Open the BLAST XML output file
    blast_xml_files = glob.glob("./proteins/*/*-blastp.xml")

    for blast_xml_file in blast_xml_files:

        # Extract the protein name from the file path
        prot = blast_xml_file.split("/")[2]

        # Parse the BLAST output file
        with open(blast_xml_file) as blast_xml:
            blast_record = NCBIXML.read(blast_xml)

        # Extract the sequences from the BLAST output file and write to FASTA file
        sequences = []

        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:

                # Format the sbjct sequence as a FASTA record
                accession = alignment.hit_id.split("|")[1]
                sbjct_record = ">{} {}_{}\n{}\n".format(accession,alignment.hit_def, hsp.sbjct_start, hsp.sbjct)

                # Parse the FASTA record using SeqIO.read()
                seq_record = SeqIO.read(StringIO(sbjct_record), "fasta")
                sequences.append(seq_record)

                # Write the sequences to a FASTA file
                SeqIO.write(seq_record, f"./proteins/{prot}/{accession}.fasta", "fasta")

        # Write the sequences to a FASTA file
        SeqIO.write(sequences, f"./proteins/{prot}/sequences.fasta", "fasta")


from_xml_to_protein_sequences()


# Amino acid tokens

I produce a vocabulary (size == number of aminoacids). I map each aminoacid to a unique integer. In this way, I can represent each protein sequence as a sequence of integers.

In [79]:
import torch
import torch.nn as nn


seq = SeqIO.read("./proteins/keratin/AAA39273.1.fasta", "fasta")

# start counting from 0
# 20 == 21
number_of_amino_acids = 20
dimensions = 1

token_list = [hash(char) % number_of_amino_acids for char in str(seq.seq)]


In [81]:
print(token_list)

[0, 19, 14, 19, 0, 0, 12, 7, 8, 12, 8, 7, 19, 10, 2, 10, 19, 2, 7, 7, 6, 12, 8, 8, 10, 10, 2, 19, 6, 12, 7, 10, 2, 19, 8, 12, 19, 19, 0, 19, 12, 15, 12, 16, 7, 19, 8, 0, 2, 12, 19, 12, 7, 19, 2, 12, 0, 19, 10, 2, 12, 19, 19, 6, 7, 19, 10, 12, 10, 10, 7, 6, 14, 14, 16, 12, 16, 0, 6, 12, 8, 8, 14, 10, 8, 0, 8, 6, 6, 5, 10, 0, 8, 15, 7, 16, 16, 0, 6, 6, 16, 6, 15, 6, 16, 7, 8, 0, 12, 2, 0, 8, 8, 0, 6, 5, 16, 16, 2, 0, 8, 6, 15, 15, 0, 16, 8, 8, 6, 14, 16, 1, 15, 0, 10, 15, 0, 2, 16, 7, 7, 6, 12, 0, 19, 6, 10, 8, 0, 6, 19, 10, 6, 6, 14, 8, 2, 2, 6, 8, 6, 7, 16, 6, 8, 5, 12, 19, 2, 8, 8, 8, 6, 8, 0, 0, 8, 15, 6, 12, 19, 6, 19, 10, 16, 16, 2, 10, 6, 6, 6, 16, 8, 8, 2, 8, 14, 8, 6, 0, 6, 0, 16, 8, 8, 16, 16, 5, 16, 5, 7, 8, 10, 8, 2, 16, 12, 5, 8, 6, 8, 0, 8, 6, 8, 8, 14, 15, 6, 14, 5, 0, 8, 2, 2, 19, 10, 5, 6, 6, 14, 2, 6, 8, 0, 12, 0, 6, 12, 5, 14, 12, 16, 6, 16, 16, 19, 5, 0, 12, 2, 5, 8, 0, 19, 5, 7, 16, 16, 8, 6, 6, 16, 8, 15, 10, 5, 5, 6, 8, 12, 2, 12, 2, 8, 6, 8, 6, 12, 1, 10, 10, 14, 

In [85]:
number_of_amino_acids = 20
dimensions = 10

emb = nn.Embedding(number_of_amino_acids, dimensions)

embedding = emb(torch.tensor(token_list))
# (seq_len, dimensions)
embedding.shape

torch.Size([482, 10])