In [2]:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO, Entrez
from io import StringIO

In [18]:
# parse the blastp output file

sequences = []

# Parse the results from the files
with open("./proteins/keratin/keratin-blastp.xml") as in_handle:
    blast_records = NCBIXML.parse(in_handle)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < 0.0001:
                    print("****Alignment****")
                    print("sequence:", alignment.title)
                    print("length:", alignment.length)
                    print("e value:", hsp.expect)
                    print(hsp.query[0:75] + "...")
                    print(hsp.match[0:75] + "...")
                    print(hsp.sbjct[0:75] + "...")
        

****Alignment****
sequence: gb|EAW58243.1| hCG1997648 [Homo sapiens] >gb|KAI4066085.1| keratin 81 [Homo sapiens] >dbj|BAI45776.1| keratin 81, partial [synthetic construct] >emb|CAA73943.1| keratin [Homo sapiens]
length: 505
e value: 0.0
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
****Alignment****
sequence: ref|XP_018893534.2| keratin, type II cuticular Hb1 [Gorilla gorilla gorilla]
length: 505
e value: 0.0
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
MTCGSGFGGRAFSCISACGPRPGRCCITAAPYRGISCYRGLTGGFGSHSVCGGFRAGSCGRSFGYRSGGVCGPSP...
****Alignment****
sequence: ref|XP_024112725.1| keratin, type II cuticular Hb1 [Pongo abelii] >gb|PNJ25503.1| KRT86 isoform 1 [Pongo abelii]
length: 505
e valu

In [89]:
import glob
import pickle


def from_xml_to_protein_sequences():
    """Extract the protein sequences from the BLAST XML output file and write to FASTA file.

    Args:
        None

    Returns:    
        None
    
    """

    # Open the BLAST XML output file
    blast_xml_files = glob.glob("./proteins/*/*-blastp.xml")

    for blast_xml_file in blast_xml_files:

        # Extract the protein name from the file path
        prot = blast_xml_file.split("/")[2]

        # Parse the BLAST output file
        with open(blast_xml_file) as blast_xml:
            blast_record = NCBIXML.read(blast_xml)

        # Extract the sequences from the BLAST output file and write to FASTA file
        sequences = []
        sequences_pck = []

        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:

                # Format the sbjct sequence as a FASTA record
                accession = alignment.hit_id.split("|")[1]
                sbjct_record = ">{} {}_{}\n{}\n".format(accession,alignment.hit_def, hsp.sbjct_start, hsp.sbjct)

                # Parse the FASTA record using SeqIO.read()
                seq_record = SeqIO.read(StringIO(sbjct_record), "fasta")
                sequences.append(seq_record)
                sequences_pck.append((accession, seq_record.seq))

                # Write the sequences to a FASTA file
                SeqIO.write(seq_record, f"./proteins/{prot}/{accession}.fasta", "fasta")

        with open(f"./proteins/{prot}/sequences/sequences_pck.pkl", "wb") as f:
            pickle.dump(dict(sequences_pck), f)
        # Write the sequences to a FASTA file
        SeqIO.write(sequences, f"./proteins/{prot}/sequences.fasta", "fasta")


from_xml_to_protein_sequences()


In [92]:
import glob

def from_xml_to_txt_protein_sequences():
    """Extract the protein sequences from the BLAST XML output them 
    i a txt file.

    Args:
        None

    Returns:    
        None
    
    """

    # Open the BLAST XML output file
    blast_xml_files = glob.glob("./proteins/*/*-blastp.xml")

    for blast_xml_file in blast_xml_files:

        # Extract the protein name from the file path
        prot = blast_xml_file.split("/")[2]

        # Parse the BLAST output file
        with open(blast_xml_file) as blast_xml:
            blast_record = NCBIXML.read(blast_xml)

        # Extract the sequences from the BLAST output file and write to FASTA file
        sequences = []

        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:

                # Format the sbjct sequence as a FASTA record
                accession = alignment.hit_id.split("|")[1]
                sbjct_record = ">{} {}_{}\n{}\n".format(accession,alignment.hit_def, hsp.sbjct_start, hsp.sbjct)

                # Parse the FASTA record using SeqIO.read()
                seq_record = SeqIO.read(StringIO(sbjct_record), "fasta")

                with open(f"./proteins/{prot}/sequences.txt", "a") as f:
                    f.write(str(seq_record.seq)+"\n")


from_xml_to_txt_protein_sequences()


In [16]:
import os
import glob

# get the name of the proteins
proteins = [i.split('/')[2] for i in glob.glob("./proteins/*/")]

# create directory for sequence for each protein
for protein in proteins:
    os.makedirs(f"./proteins/{protein}/sequences", exist_ok=True)
    # move the file sequences.fasta to the directory sequences
    os.rename(f"./proteins/{protein}/sequences.fasta", f"./proteins/{protein}/sequences/sequences.fasta")
    os.rename(f"./proteins/{protein}/sequences.txt", f"./proteins/{protein}/sequences/sequences.txt")
    


In [29]:
import os
import glob

# get the name of the proteins
proteins = [i.split('/')[2] for i in glob.glob("./proteins/*/")]

# create directory for sequence for each protein
protein = 'albumin'
# os.makedirs(f"./proteins/{protein}/sequences", exist_ok=True)
# move the file sequences.fasta to the directory sequences


# Byte Pair Encoding (BPE)


Introduced by the paper [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909). Consists in bring small pieces of the sequence and tokenize them reling on the sequence frequency. The intuition behind this approach encode the contex of each sub-component, avoiding the issue of out-of-word vocabulary.

In [8]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.98


In [75]:
import sentencepiece as spm
import glob
import os
import pickle

def from_xml_to_BPE_protein_tokenization():
    """Encode the sequences using BPE.

    Args:
        None

    Returns:    
        None
    
    """

    # Open the BLAST XML output file
    blast_xml_files = glob.glob("./proteins/*/*-blastp.xml")

    # vocab size
    # vocab_s = dict([(file.split('/')[2],max([len(seq) for seq in open(file, 'r').readlines()])*15) for file in glob.glob("./proteins/*/sequences/sequences.txt")])
    vocab_s = dict([(file.split('/')[2],max([len(seq) for seq in open(file, 'r').readlines()])*15) for file in glob.glob("./proteins/*/sequences/sequences.txt")])

    vocab_s['insulin'] = 6000
    vocab_s['hemoglobin'] = 6722
    vocab_s['erythropoietin'] = 8000
    vocab_s['collagen'] = int(vocab_s['collagen']/3)
    vocab_s['myosin'] = int(vocab_s['myosin']/3)
    vocab_s['trypsin'] = int(vocab_s['trypsin']/2)
    vocab_s['elastin'] = 8000
    vocab_s['tubulin'] = 3421

    for blast_xml_file in blast_xml_files:

        # Extract the protein name from the file path
        prot = blast_xml_file.split("/")[2]

        # https://github.com/google/sentencepiece/blob/master/doc/options.md
        spm.SentencePieceTrainer.train(
            input=f'./proteins/{prot}/sequences/sequences.txt', 
            model_type='bpe', 
            shuffle_input_sentence=False,
            split_by_whitespace=False,
            max_sentencepiece_length=16,
            allow_whitespace_only_pieces=True,
            model_prefix=f'BPE_model_{prot}', 
            vocab_size=vocab_s[prot]
        )

        sm = spm.SentencePieceProcessor()
        # load the model
        sm.load(f'BPE_model_{prot}.model')

        # Parse the BLAST output file
        with open(blast_xml_file) as blast_xml:
            blast_record = NCBIXML.read(blast_xml)

        # Extract the sequences from the BLAST output file and write to FASTA file
        sequences = []
        sequences_str = []

        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:

                # Format the sbjct sequence as a FASTA record
                accession = alignment.hit_id.split("|")[1]
                sbjct_record = ">{} {}_{}\n{}\n".format(accession,alignment.hit_def, hsp.sbjct_start, hsp.sbjct)

                # Parse the FASTA record using SeqIO.read()
                seq_record = SeqIO.read(StringIO(sbjct_record), "fasta")
                
                # encode the sequence
                sequences_str.append((accession,sm.encode(str(seq_record.seq), out_type=str)))
                sequences.append((accession,sm.encode(str(seq_record.seq))))

        # Write the encoded sequences into a pickle file
        with open(f"./proteins/{prot}/sequences/sequences_BPE.pkl", "wb") as f:
            pickle.dump(sequences, f)
        with open(f"./proteins/{prot}/sequences/sequences_BPE_str.pkl", "wb") as f:
            pickle.dump(sequences_str, f)

        # move the model+vocab to the right folder
        os.rename(f"./BPE_model_{prot}.model", f"./proteins/{prot}/sequences/BPE_model_{prot}.model")
        os.rename(f"./BPE_model_{prot}.vocab", f"./proteins/{prot}/sequences/BPE_model_{prot}.vocab")



from_xml_to_BPE_protein_tokenization()


In [64]:
# dictionary with the dimension of the vocabulary for each protein
d = dict([(file.split('/')[2],max([len(seq) for seq in open(file, 'r').readlines()])*15) for file in glob.glob("./proteins/*/sequences/sequences.txt")])

d['insulin'] = 6000
d['hemoglobin'] = 6792
d['erythropoietin'] = 8000
d['collagen'] = int(d['collagen']/3)
d['myosin'] = int(d['myosin']/3)
d['trypsin'] = int(d['trypsin']/2)
d['elastin'] = 8000
d['tubulin'] = 3421

d

{'aldehyde': 8400,
 'protein_kinase': 8040,
 'collagen': 7780,
 'fibrinogen': 7035,
 'albumin': 9300,
 'insulin': 6000,
 'hemoglobin': 6792,
 'erythropoietin': 8000,
 'myosin': 9835,
 'immunoglobulin': 2145,
 'tubulin': 3421,
 'keratin': 8025,
 'catalase': 8565,
 'trypsin': 9142,
 'elastin': 8000}

In [68]:
with open('./proteins/insulin/sequences/sequences_BPE.pkl', 'rb') as f:
    insulin = pickle.load(f)

insulin = dict(insulin)

insulin

{'QMS45324.1': [106, 164, 264, 49, 195, 171, 157, 50, 82],
 'AAP36446.1': [106, 164, 264, 49, 195, 171, 157, 50, 82],
 'NP_000198.1': [106, 164, 264, 49, 195, 171, 157, 50, 82],
 'XP_024110665.1': [106, 164, 153, 49, 195, 171, 157, 50, 82],
 'AKI70564.1': [106, 164, 264, 2294, 195, 171, 157, 50, 82],
 'QMS45321.1': [106, 164, 264, 49, 195, 171, 157, 50, 82],
 'NP_001008996.1': [211, 1189, 698, 49, 195, 171, 157, 50, 82],
 'AKI70566.1': [106, 164, 264, 49, 195, 171, 1847, 167, 82],
 'XP_034787832.1': [106, 164, 698, 49, 195, 171, 157, 50, 82],
 'AKI70567.1': [106, 164, 264, 49, 195, 171, 157, 1887, 82],
 'XP_050613945.1': [106, 164, 249, 49, 213, 171, 157, 50, 82],
 'AKI70565.1': [106, 164, 264, 49, 2113, 171, 157, 50, 82],
 'XP_016775240.1': [211, 1189, 698, 49, 195, 171, 157, 50, 82],
 'XP_003281399.1': [106, 164, 249, 49, 213, 171, 157, 50, 82],
 'XP_032009711.1': [656, 164, 249, 49, 213, 171, 157, 50, 82],
 'XP_003909425.2': [106, 164, 380, 49, 213, 171, 157, 50, 82],
 'XP_008002825

In [71]:
with open('./proteins/hemoglobin/sequences/sequences_BPE.pkl', 'rb') as f:
    hemoglobin = pickle.load(f)

hemoglobin = dict(hemoglobin)

for idx in hemoglobin.keys():
    print(len(hemoglobin[idx]))

13
13
13
13
12
13
13
13
13
12
13
13
13
12
13
13
12
13
13
13
13
13
13
13
13
13
13
13
13
13
14
12
13
12
13
13
13
13
12
13
12
13
13
13
12
13
12
13
14
13
13
13
13
13
13
13
13
13
12
14
13
13
13
13
13
12
13
13
12
13
12
12
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
12
13
13
13
13
13
13
13
13
12
12
12
12
13
13
13
13
13
13
14
13
12
13
12
12
13
13
13
12
12
13
12
15
13
12
13
13
13
13
13
13
13
13
13
13
13
13
13
13
24
13
12
12
13
13
13
13
13
13
13
13
11
13
13
13
13
13
13
13
13
12
13
12
13
12
12
13
13
13
13
13
13
13
12
12
13
13
13
12
13
13
12
10
13
13
12
12
12
11
11
12
12
11
13
13
13
12
12
11
12
12
12
12
12
12
12
12
12
12
13
12
12
12
12
12
12
13
12
11
12
12
13
12
13
12
12
11
12
10
12
12
12
12
12
12
12
11
13
13
12
12
12
12
12
12
13
13
12
12
12
13
12
13
13
13
12
12
12
12
12
12
12
12
13
13
13
12
12
12
13
12
12
13
12
13
12
12
13
13
12
12
12
13
13
12
12
12
12
13
13
12
13
13
12
12
13
12
12
12
12
12
13
13
13
12
13
12
12
13
12
13
13
13
13
12
13
12
13
13
12
12
13
13
13
12
1

In [77]:
files = glob.glob("./proteins/*/sequences/sequences.txt")

# load the sequences in a single file
for file in files:
    with open(file, 'rb') as f:
        with open('./sequences.txt', 'ab') as f2:
            f2.write(f.read())
            f2.write(b'\n')

In [165]:
spm.SentencePieceTrainer.train(
            input=f'./sequences.txt', 
            model_type='bpe', 
            shuffle_input_sentence=False,
            split_by_whitespace=False,
            max_sentencepiece_length=16,
            allow_whitespace_only_pieces=True,
            model_prefix=f'BPE_model', 
            vocab_size=3000
        )

In [171]:
files = glob.glob("./proteins/*/sequences/sequences_pck.pkl")

import numpy as np

# load the sequence
with open(files[0], 'rb') as f:
    seq = pickle.load(f)


sm = spm.SentencePieceProcessor()
# load the model
sm.load(f'BPE_model.model')

# list with all the encoded sequences
encoded_seq = [np.array(sm.encode(str(seq[key]))) for key in seq.keys()]

# extract maximum dimension
max_dim = np.array([len(seq) for seq in encoded_seq]).max()

# how much padding is needed
padding = [max_dim-a.shape[0] for a in encoded_seq]

# pad the sequences + 1 to avoid to use -1 as padding
encoded_seq = [np.pad(a,(0,p), constant_values=(-1, -1)) for a, p in zip(encoded_seq,padding)]

In [185]:
zero_like = sum((encoded_seq[0]-encoded_seq[1]) == 0) 

zero_like

182

# Amino acid tokens

I produce a vocabulary (size == number of aminoacids). I map each aminoacid to a unique integer. In this way, I can represent each protein sequence as a sequence of integers.

In [79]:
import torch
import torch.nn as nn


seq = SeqIO.read("./proteins/keratin/AAA39273.1.fasta", "fasta")

# start counting from 0
# 20 == 21
number_of_amino_acids = 20
dimensions = 1

token_list = [hash(char) % number_of_amino_acids for char in str(seq.seq)]


In [81]:
print(token_list)

[0, 19, 14, 19, 0, 0, 12, 7, 8, 12, 8, 7, 19, 10, 2, 10, 19, 2, 7, 7, 6, 12, 8, 8, 10, 10, 2, 19, 6, 12, 7, 10, 2, 19, 8, 12, 19, 19, 0, 19, 12, 15, 12, 16, 7, 19, 8, 0, 2, 12, 19, 12, 7, 19, 2, 12, 0, 19, 10, 2, 12, 19, 19, 6, 7, 19, 10, 12, 10, 10, 7, 6, 14, 14, 16, 12, 16, 0, 6, 12, 8, 8, 14, 10, 8, 0, 8, 6, 6, 5, 10, 0, 8, 15, 7, 16, 16, 0, 6, 6, 16, 6, 15, 6, 16, 7, 8, 0, 12, 2, 0, 8, 8, 0, 6, 5, 16, 16, 2, 0, 8, 6, 15, 15, 0, 16, 8, 8, 6, 14, 16, 1, 15, 0, 10, 15, 0, 2, 16, 7, 7, 6, 12, 0, 19, 6, 10, 8, 0, 6, 19, 10, 6, 6, 14, 8, 2, 2, 6, 8, 6, 7, 16, 6, 8, 5, 12, 19, 2, 8, 8, 8, 6, 8, 0, 0, 8, 15, 6, 12, 19, 6, 19, 10, 16, 16, 2, 10, 6, 6, 6, 16, 8, 8, 2, 8, 14, 8, 6, 0, 6, 0, 16, 8, 8, 16, 16, 5, 16, 5, 7, 8, 10, 8, 2, 16, 12, 5, 8, 6, 8, 0, 8, 6, 8, 8, 14, 15, 6, 14, 5, 0, 8, 2, 2, 19, 10, 5, 6, 6, 14, 2, 6, 8, 0, 12, 0, 6, 12, 5, 14, 12, 16, 6, 16, 16, 19, 5, 0, 12, 2, 5, 8, 0, 19, 5, 7, 16, 16, 8, 6, 6, 16, 8, 15, 10, 5, 5, 6, 8, 12, 2, 12, 2, 8, 6, 8, 6, 12, 1, 10, 10, 14, 

In [85]:
number_of_amino_acids = 20
dimensions = 10

emb = nn.Embedding(number_of_amino_acids, dimensions)

embedding = emb(torch.tensor(token_list))
# (seq_len, dimensions)
embedding.shape

torch.Size([482, 10])

# ESM to extract the embeddings of the sequences

The Meta Fundamental AI Research Protein Team (FAIR) already made available a series of pre-trained models for working with biological data. The ESM-2 language model is able to extract representations directly relying solely on the single sequence and produce accurate structure prediction. Recently, facebook research made available the [ESM metagenomic atlas](https://esmatlas.com/) an open database made of 617 million predicted metagenomic protein structures.

Actually, from the sequences we aim to extract the embeddings using ESM-2.

In [186]:
# !pip install git+https://github.com/facebookresearch/esm.git

Collecting git+https://github.com/facebookresearch/esm.git
  Cloning https://github.com/facebookresearch/esm.git to /tmp/pip-req-build-uftfbug8
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/esm.git /tmp/pip-req-build-uftfbug8
  Resolved https://github.com/facebookresearch/esm.git to commit c9c7d4f0fec964ce10c3e11dccec6c16edaa5144
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: fair-esm
  Building wheel for fair-esm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for fair-esm: filename=fair_esm-2.0.1-py3-none-any.whl size=105311 sha256=1c64850705d1296536ad3369af9a41e2829834f8693ba90ed481eb94841cf435
  Stored in directory: /tmp/pip-ephem-wheel-cache-o4de2_du/wheels/f3/b2/ec/4db0b108f6367c7563f99b2445e1137d486003fb2f9bfd2f53
Successfully built fair-esm
Installing collected packa

In [1]:
import esm
import torch
import numpy as np
import glob
import pickle

In [2]:
# Load ESM-2 model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D() #esm2_t36_3B_UR50D()#esm2_t48_15B_UR50D()
batch_converter = alphabet.get_batch_converter()
# half precision float16
model = model.half()
model.eval()  # disables dropout for deterministic results

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

In [3]:
# allocate the model on the GPU
model = model.cuda()

In [4]:
# load the sequences and cast them
files = glob.glob("./proteins/*/sequences/sequences_pck.pkl")

# list with all the sequences
sequences = []

for file in files:
    # load the sequence
    with open(file, 'rb') as f:
        seq = pickle.load(f)

    # cast all the sequences as a string 
    for key in seq.keys():
        seq[key] = str(seq[key])

    sequences.append(seq)

In [17]:
print('number of protein: ',len(sequences))
print()
[print(f'number of sequences protein {idx+1}: ',len(s)) for idx, s in enumerate(sequences)]
print()
print('total number of sequences: ',sum([len(s) for s in sequences]))

number of protein:  15

number of sequences protein 1:  500
number of sequences protein 2:  500
number of sequences protein 3:  500
number of sequences protein 4:  500
number of sequences protein 5:  500
number of sequences protein 6:  498
number of sequences protein 7:  489
number of sequences protein 8:  500
number of sequences protein 9:  500
number of sequences protein 10:  500
number of sequences protein 11:  500
number of sequences protein 12:  500
number of sequences protein 13:  500
number of sequences protein 14:  500
number of sequences protein 15:  500

total number of sequences:  7487


In [4]:
import gc

def get_batch(sequences_token, idx):
    """ Function to get the batch of sequences

    Returns:
        list: list with all the sequences
    """
    if idx == 5:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 488, 60)[:59].astype(int),np.linspace(0, 488, 60)[1:].astype(int))]
        
    elif idx == 6:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 498, 60)[:59].astype(int),np.linspace(0, 498, 60)[1:].astype(int))]
    elif idx == 7:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 489, 60)[:59].astype(int),np.linspace(0, 489, 60)[1:].astype(int))]
        
    else:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 500, 80)[:79].astype(int),np.linspace(0, 500, 80)[1:].astype(int))]


def get_esm_encoding(batch_tokens, model, device):
    # allocate the token on the GPU
    batch_tokens = batch_tokens.to(device)

    # Extract per-residue representations
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33], return_contacts=False)

    yield results["representations"][33].type(torch.float16).cpu()

def extract_embedding(sequences_token, model, idx):
    """ Function to extract the embedding from the sequences

    Args:
        sequences_token (list): list with all the sequences
        model (torch.nn.Module): model to use to extract the embedding
        alphabet (esm.Alphabet): alphabet of the model
        batch_converter (esm.pretrained.esm1_t6_43M_UR50S): converter of the model

    Returns:
        list: list with all the embedding
    """

    # list with all the extracted representations
    token_representations = []

    for batch_tokens in list(get_batch(sequences_token, idx))[0]:

        # extract the embedding
        token_representations.append(list(get_esm_encoding(batch_tokens, model, device))[0])

        # free the memory
        gc.collect()
        del batch_tokens

    return token_representations


from tqdm import tqdm

# load the sequences and cast them
files = glob.glob("./proteins/*/sequences/sequences_pck.pkl")

# list with all the sequences
sequences = []

for file in files:
    # load the sequence
    with open(file, 'rb') as f:
        seq = pickle.load(f)

    s_seq = dict()
    # cast all the sequences as a string 
    for key in seq.keys():
        # I have to check and filter out the 
        # sequences with J otherwise the model
        # get an error
        if 'J' in str(seq[key]):
            break
        else:
            # store the sequence
            s_seq[key] = str(seq[key])

    sequences.append(s_seq)

del seq

print('number of protein: ',len(sequences))
print()
[print(f'protein {idx+1}: ',len(s)) for idx,s in enumerate(sequences)]
print()

sequences = sequences[8:11]

# check is cuda available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# list with all the sequences tokenized
sequences_token = []

for idx in tqdm(range(len(sequences))):
    # extract data
    data = list(sequences[idx].items())

    # convert the data in a batch
    batch_labels, batch_strs, batch_tokens = batch_converter(data)

    # check the length of the batch
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    sequences_token.append(extract_embedding(batch_tokens, model,idx+8))



number of protein:  15

protein 1:  500
protein 2:  500
protein 3:  500
protein 4:  500
protein 5:  488
protein 6:  498
protein 7:  489
protein 8:  500
protein 9:  500
protein 10:  500
protein 11:  500
protein 12:  500
protein 13:  500
protein 14:  500
protein 15:  500



  0%|          | 0/3 [00:00<?, ?it/s]

In [58]:
# print('number of protein: ',len(sequences_token))
# print()
# open('esm_embedding_0_3.pkl', 'wb').write(pickle.dumps(sequences_token))


number of protein:  4



4000073055

In [81]:
# print('number of protein: ',len(sequences_token))
# print()
# open('esm_embedding_4_7.pkl', 'wb').write(pickle.dumps(sequences_token))

number of protein:  4



1413264558

In [None]:
print('number of protein: ',len(sequences_token))
print()
open('esm_embedding_8_10.pkl', 'wb').write(pickle.dumps(sequences_token))

number of protein:  7



: 

: 

In [None]:
import gc

def get_batch(sequences_token, idx):
    """ Function to get the batch of sequences

    Returns:
        list: list with all the sequences
    """
    if idx == 5:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 488, 60)[:59].astype(int),np.linspace(0, 488, 60)[1:].astype(int))]
        
    elif idx == 6:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 498, 60)[:59].astype(int),np.linspace(0, 498, 60)[1:].astype(int))]
    elif idx == 7:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 489, 60)[:59].astype(int),np.linspace(0, 489, 60)[1:].astype(int))]
        
    else:
        yield [sequences_token[m:M]
                    for m, M in zip(np.linspace(0, 500, 80)[:79].astype(int),np.linspace(0, 500, 80)[1:].astype(int))]


def get_esm_encoding(batch_tokens, model, device):
    # allocate the token on the GPU
    batch_tokens = batch_tokens.to(device)

    # Extract per-residue representations
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33], return_contacts=False)

    yield results["representations"][33].type(torch.float16).cpu()

def extract_embedding(sequences_token, model, idx):
    """ Function to extract the embedding from the sequences

    Args:
        sequences_token (list): list with all the sequences
        model (torch.nn.Module): model to use to extract the embedding
        alphabet (esm.Alphabet): alphabet of the model
        batch_converter (esm.pretrained.esm1_t6_43M_UR50S): converter of the model

    Returns:
        list: list with all the embedding
    """

    # list with all the extracted representations
    token_representations = []

    for batch_tokens in list(get_batch(sequences_token, idx))[0]:

        # extract the embedding
        token_representations.append(list(get_esm_encoding(batch_tokens, model, device))[0])

        # free the memory
        gc.collect()
        del batch_tokens

    return token_representations


from tqdm import tqdm

# load the sequences and cast them
files = glob.glob("./proteins/*/sequences/sequences_pck.pkl")

# list with all the sequences
sequences = []

for file in files:
    # load the sequence
    with open(file, 'rb') as f:
        seq = pickle.load(f)

    s_seq = dict()
    # cast all the sequences as a string 
    for key in seq.keys():
        # I have to check and filter out the 
        # sequences with J otherwise the model
        # get an error
        if 'J' in str(seq[key]):
            break
        else:
            # store the sequence
            s_seq[key] = str(seq[key])

    sequences.append(s_seq)

del seq

print('number of protein: ',len(sequences))
print()
[print(f'protein {idx+1}: ',len(s)) for idx,s in enumerate(sequences)]
print()

sequences = sequences[11:]

# check is cuda available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# list with all the sequences tokenized
sequences_token = []

for idx in tqdm(range(len(sequences))):
    # extract data
    data = list(sequences[idx].items())

    # convert the data in a batch
    batch_labels, batch_strs, batch_tokens = batch_converter(data)

    # check the length of the batch
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    sequences_token.append(extract_embedding(batch_tokens, model,idx+8))



In [None]:
print('number of protein: ',len(sequences_token))
print()
open('esm_embedding_11_14.pkl', 'wb').write(pickle.dumps(sequences_token))

In [49]:
# l,s = zip(*data)
[print(f'number of sequences encoded for protein protein {idx+1}: ',sum([len(ss[0])  for ss in s[0]])) for idx, s in enumerate(sequences_token)]


# for label, seq_str in zip(l,s):
#     print(label)
#     print(alphabet.encode(seq_str))


# alphabet.encode(seq['XP_025964472.1'])


# for l,i in sequences[3].items():
#     for j in i:
#         if j == 'J':
#             print(j)

# def get_batch(sequences_token):
#     """ Function to get the batch of sequences

#     Returns:
#         list: list with all the sequences
#     """
    
#     yield [sequences_token[m:M]
#                 for m, M in zip(np.linspace(0, 500, 20)[:19].astype(int),np.linspace(0, 500, 20)[1:].astype(int))]
    
# len(list(get_batch(batch_tokens)))


# for l,i in data:
#     for j in i:
#         if j == 'J':
#             print(j)

def get_batch(sequences_token):
    """ Function to get the batch of sequences

    Returns:
        list: list with all the sequences
    """
    
    yield [sequences_token[m:M]
                for m, M in zip(np.linspace(0, 500, 60)[:59].astype(int),np.linspace(0, 500, 60)[1:].astype(int))]
    


for i in list(get_batch(batch_tokens))[0]:
    print(list(get_esm_encoding(i, model, device))[0].shape)


torch.Size([8, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])
torch.Size([9, 381, 1280])
torch.Size([8, 381, 1280])


KeyboardInterrupt: 

195281636

In [9]:
# [len(s) for idx, s in enumerate(sequences_token)]

len(sequences_token[0][0])

1

number of sequences encoded for protein protein 1:  561
number of sequences encoded for protein protein 2:  537
number of sequences encoded for protein protein 3:  1557
number of sequences encoded for protein protein 4:  470
number of sequences encoded for protein protein 5:  621
number of sequences encoded for protein protein 6:  119
number of sequences encoded for protein protein 7:  162
number of sequences encoded for protein protein 8:  221
number of sequences encoded for protein protein 9:  1968
number of sequences encoded for protein protein 10:  144
number of sequences encoded for protein protein 11:  466
number of sequences encoded for protein protein 12:  536
number of sequences encoded for protein protein 13:  572
number of sequences encoded for protein protein 14:  1220
number of sequences encoded for protein protein 15:  381


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [5]:
# list with encoded sequences prot 1, 2
encoded_seq = []
encoded_seq.append(sequences_token)

open('encoded_seq_esm_3_4.pkl', 'wb').write(pickle.dumps(encoded_seq[0]))

2594584225

In [22]:
encoded_seq[0][0][0][0]

tensor([[ 0.0936, -0.0355,  0.0840,  ..., -0.2505,  0.1166, -0.0011],
        [ 0.0453, -0.0010, -0.0210,  ...,  0.1932, -0.1685,  0.1936],
        [-0.0881,  0.1898, -0.0927,  ...,  0.1093, -0.0750, -0.0036],
        ...,
        [-0.0122, -0.0419,  0.0245,  ..., -0.1731,  0.2047, -0.0862],
        [ 0.0010, -0.0205,  0.0424,  ..., -0.1696,  0.1996, -0.0846],
        [ 0.0150, -0.0280,  0.0341,  ..., -0.1652,  0.2080, -0.0820]],
       dtype=torch.float16)

In [27]:
len(encoded_seq[0])

2

In [8]:
print('len embedding: ',token_representations.shape ,'len seq: ', len(list(seq.items())[0][1]))

len embedding:  torch.Size([60, 559, 1280]) len seq:  517


In [None]:
from esm.data import ESMStructuralSplitDataset
from esm.pretrained import load_model_and_alphabet


# load the model
model, alphabet = load_model_and_alphabet('esm1_t34_670M_UR50S')

# load the dataset
data = ESMStructuralSplitDataset

The dataframe has 773,846,840 records, and the file size is around 16GB. This dataframe has 10 columns:

+ id is the MGnify ID

+ ptm is the predicted TM score

+ plddt is the predicted average lddt

+ num_conf is the number of residues with plddt > 0.7

+ len is the total residues in the protein

+ is_fragment indicates whether the protein sequence is identified as a fragment in the MGnify90 sequence database.

+ sequenceChecksum is the CRC64 hash of the sequence. Can be used for cheaper lookups.

+ esmfold_version is the version of ESMFold, matching the model accessible as esm.pretrained.esmfold_v{0,1}

+ atlas_version is the Atlas version where this structure first appeared. Note: some of the predictions appearing for the first time in v0 are also part of Atlas v2023_02.

+ sequence_dbs is the metagenomic source databases where this structure is part of, as MGnify90_2022_05, comma-separated if it exists in more than one release, as MGnify90_2022_05,MGnify90_2023_02

source:https://github.com/facebookresearch/esm/tree/main/scripts/atlas

In [2]:
import pandas as pd

# df = pd.read_parquet('/home/rickbook/document/applied-nlp/metadata-rc2.parquet')

In [2]:
import glob
import Bio
import pickle

files = glob.glob("./proteins/*/sequences/sequences_pck.pkl")

# load the sequence
with open(files[0], 'rb') as f:
    seq = pickle.load(f)

{'AAP36614.1': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'NP_000681.2': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'BAD97093.1': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'XP_054298134.1': Seq('MLRAAAHFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'AAA51693.1': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'AAT41621.1': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'NP_001124747.1': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'XP_030677272.1': Seq('MLRAAALFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'XP_003832508.1': Seq('MLRAAACFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'CAG33272.1': Seq('MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'XP_016779748.1': Seq('MFRAAACLGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTF...KNS'),
 'XP_010377208.1': Seq('MLRAAARFGPRLGLRLLSAAATQAVPAPNQQ