# Utilizing ProteinEmbedding and Dataset formation

The _ProteinEmbedding_ object can store protein sequences in combination with embedding and vector representations,
which can then be manipulated using standard operations from the __skbio__ library.

To demonstrate this process, we will read in a list of protein sequences, embed each one, and write the 
embeddings to a dataset file that can be streamed back in with the standard skbio.read.

In [1]:
# Necessary imports
from transformers import T5Tokenizer, T5EncoderModel
from skbio.embedding import ProteinEmbedding
from skbio.sequence import Protein
from tqdm import tqdm
import torch
import skbio
import re

  from .autonotebook import tqdm as notebook_tqdm


## Loading Embeddings

These helper functions take the input protein sequences and feed them through an embedding model (prot-t5), 
outputting the generated embeddings.

In [4]:
# Load Embeddings Functions
# Cache of (tokenizer, model) pairs keyed by (tokenizer_name, model_name), so the
# pretrained weights are loaded once instead of once per embedded sequence.
_T5_COMPONENTS = {}


def _get_t5_components(model_name, tokenizer_name):
    """Return the ``(tokenizer, model)`` pair for the given names, loading it once."""
    key = (tokenizer_name, model_name)
    if key not in _T5_COMPONENTS:
        _T5_COMPONENTS[key] = (
            T5Tokenizer.from_pretrained(tokenizer_name),
            T5EncoderModel.from_pretrained(model_name),
        )
    return _T5_COMPONENTS[key]


def load_protein_t5_embedding(sequence, model_name, tokenizer_name):
    """Embed a single protein sequence with a T5 encoder model.

    Parameters
    ----------
    sequence : str or object convertible with ``str()``
        The protein sequence to embed.
    model_name : str
        Name or path passed to ``T5EncoderModel.from_pretrained``.
    tokenizer_name : str
        Name or path passed to ``T5Tokenizer.from_pretrained``.

    Returns
    -------
    ProteinEmbedding
        The per-position embedding paired with the original ``sequence``.

    Notes
    -----
    The tokenizer and model are cached per ``(tokenizer_name, model_name)``
    pair, so repeated calls do not reload the pretrained weights.
    """
    tokenizer, model = _get_t5_components(model_name, tokenizer_name)

    # Format the sequence as space-separated residues, replacing the
    # characters U, Z, O and B with X.
    seqs = [" ".join(re.sub(r"[UZOB]", "X", str(sequence)))]

    # Tokenize and pad up to the longest sequence in the (single-item) batch.
    ids = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
    input_ids = torch.tensor(ids['input_ids'])
    attention_mask = torch.tensor(ids['attention_mask'])

    # Generate embeddings without tracking gradients.
    with torch.no_grad():
        embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

    # Take the only batch item and drop the final token position (presumably the
    # end-of-sequence token added by add_special_tokens -- TODO confirm).
    # No .squeeze() here: the integer index already removes the batch axis, and
    # squeezing would also collapse the length axis of a one-residue sequence.
    emb = embedding_repr.last_hidden_state[0, :-1, :]

    # Diagnostic output shown in the notebook.
    print(emb.shape)
    print(emb)

    return ProteinEmbedding(emb, sequence)


def to_embeddings(sequences: list, model_name, tokenizer_name):
    """Lazily embed each sequence, yielding one ProteinEmbedding at a time.

    Each item of ``sequences`` is converted with ``str()`` and passed to
    ``load_protein_t5_embedding`` together with ``model_name`` and
    ``tokenizer_name``. Iteration is wrapped in ``tqdm`` to show progress.
    """
    yield from (
        load_protein_t5_embedding(str(record), model_name, tokenizer_name)
        for record in tqdm(sequences)
    )

## Passing to file

Finally, we can output the embeddings into a file, which can be utilized further as will be demonstrated in other scikit-bio tutorials.


In [5]:
# Pretrained model and tokenizer names (passed through to `from_pretrained` by the helpers)
model_name = "Rostlab/prot_t5_xl_uniref50"
tokenizer_name = "Rostlab/prot_t5_xl_uniref50"

# Parse pdb_hits.fa
sequence_list = skbio.io.read("pdb_hits.fa", format='fasta')
# to_embeddings is a generator function, so nothing is embedded until skbio.write consumes it
embed_list = to_embeddings(sequence_list, model_name, tokenizer_name)
skbio.write(embed_list, format='embed', into="pdb_hits.h5")

# Test that the file was written correctly by reading back the first stored embedding
read_embed = iter(skbio.read("pdb_hits.h5", format='embed' ,constructor=ProteinEmbedding))
item = next(read_embed)
# Bare expression so the notebook displays the object as the cell output
item



torch.Size([64, 1024])
tensor([[ 0.3527, -0.2012,  0.4537,  ...,  0.2823,  0.0100, -0.0525],
        [ 0.1412, -0.0212, -0.0139,  ...,  0.2419, -0.0761, -0.2239],
        [ 0.0060, -0.0903, -0.0719,  ...,  0.0536, -0.2231,  0.0865],
        ...,
        [ 0.3742, -0.0142,  0.1068,  ..., -0.2525,  0.1522,  0.1697],
        [-0.0415, -0.0368,  0.2598,  ...,  0.3595,  0.0954, -0.3362],
        [-0.2692,  0.1179,  0.3329,  ...,  0.0708, -0.0560, -0.3600]])




torch.Size([85, 1024])
tensor([[ 0.0448, -0.3192,  0.0338,  ...,  0.3150,  0.1809,  0.1616],
        [-0.1406, -0.2170, -0.0971,  ...,  0.3162,  0.0901,  0.0595],
        [ 0.1272, -0.3383,  0.1514,  ...,  0.3010,  0.1081, -0.0971],
        ...,
        [ 0.0277,  0.0309,  0.3150,  ..., -0.1001,  0.0265, -0.2073],
        [-0.0046, -0.0616,  0.3652,  ...,  0.0057,  0.4840,  0.1657],
        [-0.0733,  0.0379,  0.5814,  ..., -0.0665,  0.1087, -0.2235]])




torch.Size([57, 1024])
tensor([[ 0.1845, -0.3951, -0.0854,  ...,  0.1404,  0.1813,  0.2917],
        [-0.0796, -0.2827, -0.0998,  ...,  0.2809,  0.4256,  0.1024],
        [ 0.1345, -0.2616, -0.1580,  ...,  0.0142,  0.0996, -0.1368],
        ...,
        [ 0.1981,  0.0453,  0.1394,  ..., -0.0543, -0.1885,  0.0168],
        [-0.0489, -0.0764,  0.2490,  ..., -0.1048, -0.1520, -0.2704],
        [ 0.0244, -0.1924,  0.3299,  ..., -0.0902, -0.1990,  0.0381]])
torch.Size([21, 1024])
tensor([[ 0.0725,  0.0187, -0.1541,  ...,  0.0581, -0.0055,  0.0638],
        [ 0.1707,  0.1087, -0.2175,  ...,  0.2334,  0.0045,  0.0639],
        [ 0.1556,  0.0777, -0.2653,  ...,  0.0485,  0.2728, -0.0513],
        ...,
        [ 0.0922,  0.1656,  0.1224,  ...,  0.2453, -0.1710, -0.0967],
        [ 0.0379,  0.0477, -0.1381,  ...,  0.0363, -0.1200, -0.1437],
        [ 0.0754,  0.2271,  0.1624,  ...,  0.1898, -0.0374, -0.0882]])


IndexError: Index (3) out of range for (0-2)

## Write to file

Here, we can write our loaded embeddings to a .h5 file using skbio.write. To verify that the embeddings were stored correctly, we then read the file back with skbio.read and print each stored embedding.

In [None]:
# `embed_list` from the earlier cell is a one-shot generator that skbio.write has
# already consumed, so writing it again would store nothing new. Rebuild the
# sequence reader and the embedding generator from the FASTA file first.
sequence_list = skbio.io.read("pdb_hits.fa", format='fasta')
embed_list = to_embeddings(sequence_list, model_name, tokenizer_name)
skbio.write(embed_list, format='embed', into="bagel.h5")

# Test that the file was written correctly: read it back and print every stored embedding
read_embed = iter(skbio.read("bagel.h5", format='embed', constructor=ProteinEmbedding))
for item in read_embed:
    print(item.embedding)