In [16]:
!pip3 install torch transformers fair-esm biopython

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [17]:
import torch
import esm
from Bio import SeqIO
import numpy as np

In [18]:
# Load the pretrained ESM-2 model (esm2_t33_650M_UR50D) together with its alphabet via fair-esm.
# No separate tokenizer is used: the batch converter obtained from the alphabet handles tokenization.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  
batch_converter = alphabet.get_batch_converter()

# As the last expression of the cell, the value returned by eval() is also displayed as the cell output.
model.eval()  # Set to evaluation mode

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

In [19]:
# Choose the CUDA device when PyTorch reports one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device. Token batches are sent to this same device before each forward pass.
model.to(device)

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((1280,), eps=1

In [20]:
# Read every record of the FASTA file into a {record ID: sequence string} mapping
fasta_file = "data/uniprotkb_data.fasta"
sequences = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse(fasta_file, "fasta")
)
print(f"Loaded {len(sequences)} sequences")

# Container for the per-segment embeddings
# (key: sequence ID, value: list holding one embedding per segment)
segmented_embeddings = dict()

Loaded 16161 sequences


In [21]:
# splits a protein sequence into overlapping segments and returns the list
def segment_sequence(sequence, window_size=100, stride=10, cover_tail=False):
    """Split a sequence into overlapping, fixed-length windows.

    Args:
        sequence: The sequence to split; anything that supports ``len()`` and
            slicing (e.g. a ``str``).
        window_size: Length of each window. Must be a positive integer.
        stride: Offset between the starts of consecutive windows. Must be a
            positive integer.
        cover_tail: If False (default), only full windows starting at
            multiples of ``stride`` are returned. A sequence shorter than
            ``window_size`` then yields ``[]``, and residues after the last
            full window are not covered. If True, the end of the sequence is
            always covered: a final window anchored at
            ``len(sequence) - window_size`` is appended when the regular
            windows stop short of the end, and a non-empty sequence shorter
            than ``window_size`` is returned whole as a single (shorter)
            segment.

    Returns:
        A list of segments in order of their start position.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    # Start index of the last window that still fits entirely in the sequence
    # (negative when the sequence is shorter than one window).
    last_start = len(sequence) - window_size
    starts = list(range(0, last_start + 1, stride))

    if cover_tail and len(sequence) > 0:
        if last_start < 0:
            # Too short for a full window: keep the whole sequence as one segment.
            starts = [0]
        elif starts[-1] != last_start:
            # The regular windows stop before the end: add one end-anchored window.
            starts.append(last_start)

    return [sequence[start:start + window_size] for start in starts]

In [None]:
# process each sequence: segment and generate embeddings
WINDOW_SIZE = 100  # residues per segment
STRIDE = 10        # offset between consecutive segment starts
BATCH_SIZE = 32    # segments per forward pass; lower this if memory is tight

final_layer = model.num_layers  # representations are taken from the last layer

for seq_id, sequence in sequences.items():
    sequence = sequence.upper()  # convert to uppercase
    segments = segment_sequence(sequence, window_size=WINDOW_SIZE, stride=STRIDE)
    seg_embeds = []

    # Run the segments through the model in batches rather than one at a time.
    for batch_start in range(0, len(segments), BATCH_SIZE):
        batch_segments = segments[batch_start:batch_start + BATCH_SIZE]
        # assign a unique name per segment (same numbering as the segment index)
        data = [
            (f"{seq_id}_seg{batch_start + offset}", seg)
            for offset, seg in enumerate(batch_segments)
        ]
        batch_labels, batch_strs, batch_tokens = batch_converter(data)
        # Number of non-padding tokens in each row, so that pooling never
        # includes padding if the rows of a batch differ in length.
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1).tolist()
        batch_tokens = batch_tokens.to(device)  # move tokens to GPU if available

        with torch.no_grad():
            outputs = model(batch_tokens, repr_layers=[final_layer])

        # get the final layer representation and mean pool each segment,
        # skipping the first and last non-padding positions of every row
        embedding = outputs["representations"][final_layer]
        for row, tokens_len in enumerate(batch_lens):
            seg_embedding = embedding[row, 1:tokens_len - 1].mean(dim=0)
            seg_embeds.append(seg_embedding.cpu().numpy())

    segmented_embeddings[seq_id] = seg_embeds
    print(f"Processed {seq_id}: {len(segments)} segments")

Processed sp|A0A087X1C5|CP2D7_HUMAN: 42 segments
Processed sp|A0A0C5B5G6|MOTSC_HUMAN: 0 segments
Processed sp|A0A1B0GTW7|CIROP_HUMAN: 69 segments
Processed sp|A0AV02|S12A8_HUMAN: 62 segments
Processed sp|A0AV96|RBM47_HUMAN: 50 segments
Processed sp|A0AVF1|IFT56_HUMAN: 46 segments
Processed sp|A0AVI4|TM129_HUMAN: 27 segments
Processed sp|A0AVK6|E2F8_HUMAN: 77 segments
Processed sp|A0AVT1|UBA6_HUMAN: 96 segments
Processed sp|A0FGR8|ESYT2_HUMAN: 83 segments
Processed sp|A0FGR9|ESYT3_HUMAN: 79 segments
Processed sp|A0JLT2|MED19_HUMAN: 15 segments


In [None]:
# Persist the segmented embeddings dict to a .npy file for later use.
# The object saved is a Python dict rather than an array, so it is stored via pickling;
# read it back with: np.load("esm2_segmented_embeddings.npy", allow_pickle=True).item()
np.save(
    file="esm2_segmented_embeddings.npy",
    arr=segmented_embeddings,
    allow_pickle=True,
)
print("Segmented embeddings saved to esm2_segmented_embeddings.npy")