# Embedding of protein sequence with ESM

In [None]:
from Bio import SeqIO
import torch

from typing import Sequence
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:

records = SeqIO.parse("../../tmp/pep/287.5706.PATRIC.faa", "fasta")


In [6]:
first10_records = [next(records) for i in range(10)]
first10_sequence = [str(record.seq) for record in first10_records]

In [7]:
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig, LogitsOutput, ESMCInferenceClient, ESMProteinError, ProteinType

client = ESMC.from_pretrained("esmc_300m").to("cuda") # or "cpu"


  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 25420.02it/s]


In [None]:
EMBEDDING_CONFIG = LogitsConfig(
    sequence=True, return_embeddings=True, return_hidden_states=True
)

def embed_sequence(model: ESMCInferenceClient, sequence: str) -> LogitsOutput:
    protein = ESMProtein(sequence=sequence)
    protein_tensor = model.encode(protein)
    output = model.logits(protein_tensor, EMBEDDING_CONFIG)
    if isinstance(output, ESMProteinError):
        print(len(sequence))
        raise output
    return output


def batch_embed(
    model: ESMCInferenceClient, inputs: Sequence[ProteinType]
) -> Sequence[LogitsOutput]:
    """Forge supports auto-batching. So batch_embed() is as simple as running a collection
    of embed calls in parallel using asyncio.
    """
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(embed_sequence, model, protein) for protein in inputs
        ]
        results = []
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                print(e)
                # results.append(ESMProteinError(500, str(e)))
    return results

In [10]:
outputs = batch_embed(client, first10_sequence)

The size of tensor a (516) must match the size of tensor b (369) at non-singleton dimension 1
The size of tensor a (369) must match the size of tensor b (317) at non-singleton dimension 1
The size of tensor a (371) must match the size of tensor b (40) at non-singleton dimension 1
The size of tensor a (808) must match the size of tensor b (317) at non-singleton dimension 1
The size of tensor a (259) must match the size of tensor b (180) at non-singleton dimension 1
The size of tensor a (687) must match the size of tensor b (317) at non-singleton dimension 1
The size of tensor a (317) must match the size of tensor b (180) at non-singleton dimension 1
