# Ume Multi-Modal Embeddings Tutorial

This notebook show how to use the Universal Molecular Encoder (Ume) to generate embeddings for different molecular modalities: amino acids, SMILES, and nucleotides. Stay tuned for 3D coordinate embeddings and more!

In [2]:
from lobster.model import Ume

ume = Ume()

print(f"Supported modalities: {ume.modalities}")
print(f"Vocab size: {len(ume.get_vocab())}")

  @custom_fwd
  @custom_bwd


Supported modalities: ['SMILES', 'amino_acid', 'nucleotide', '3d_coordinates']
Vocab size: 1536


### Load from checkpoint

In [3]:
checkpoint = "ume-checkpoints/best.ckpt" # Replace with the correct checkpoint path

ume = Ume.load_from_checkpoint(checkpoint)

### 1. Protein sequences

Embed sample protein sequence to get full sequence embedding or per-residue embeddings.

In [4]:
# Example protein sequences
protein_sequences = [
    "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",  # Sample protein fragment
    "MLSRAVCGTSRQLAPVLAYLGSRQKHSLPDLPYDYGALEPHINAQIMQLHHSKHHAAYVNNLNVTEEKYQEALAKGDVTAQIALQPALKFNGGGHINHSIFWTNLSPNGGGEPKGELLEAIKRDFGSFDKFKEKLTAASVGVQGSGWGWLGFNKERGHLQIAACPNQDPLQGTTGLIPLLGIDVWEHAYYLQYKNVRPDYLKAIWNVINWENVTERYMACKK",  # Human SOD1
    "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH"  # Hemoglobin beta chain
]

# Get embeddings for protein sequences
protein_embeddings = ume.get_embeddings(protein_sequences, modality="amino_acid")
print(f"Protein embeddings shape: {protein_embeddings.shape}")

# Get token-level embeddings (without aggregation)
protein_residue_embeddings = ume.get_embeddings(protein_sequences, modality="amino_acid", aggregate=False)
print(f"Protein token-level embeddings shape: {protein_residue_embeddings.shape}")

Protein embeddings shape: torch.Size([3, 768])
Protein token-level embeddings shape: torch.Size([3, 512, 768])


### 2. SMILES
SMILES strings are a text-based representation of molecular structures. Here we embed common drug molecules.


In [5]:
# Example SMILES strings for common molecules
smiles_examples = [
    "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
    "COC1=CC=C(CCN)C=C1",  # Dopamine
    "C1=CC=C(C(=C1)C(=O)O)O",  # Salicylic acid
    "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@@H]([C@H]([C@@H]1O)O)N1C=NC2=C1N=CN=C2N)OC1=CC=CC=C1"  # Remdesivir
]

# Get embeddings for SMILES
smiles_embeddings = ume.get_embeddings(smiles_examples, modality="SMILES")
print(f"SMILES embeddings shape: {smiles_embeddings.shape}")

SMILES embeddings shape: torch.Size([5, 768])


### 3. Nucleotides

Embed example DNA/RNA sequences.

In [6]:
# Example DNA/RNA sequences
nucleotide_sequences = [
    "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC",  
    "GATTACACAGTGCTTGACCCGATCGATCGATCGATCGATCGATCGATCGA",  
    "AUGCUAUGCUAGCUAGCUAGCUAGCUAUGCUAGCUAUGCUAGCUAUC"  # RNA sequence 
]

# Get embeddings for nucleotide sequences
nucleotide_embeddings = ume.get_embeddings(nucleotide_sequences, modality="nucleotide")
print(f"Nucleotide embeddings shape: {nucleotide_embeddings.shape}")

Nucleotide embeddings shape: torch.Size([3, 768])


## Using Embeddings for Downstream Tasks
Quick example of using molecular embeddings for a classification task.

In [7]:
# Dummy classification setup 
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# SMILES with some property labels
inputs = ["CC(=O)OC1=CC=CC=C1C(=O)O", "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", "COC1=CC=C(CCN)C=C1", 
              "C1=CC=C(C(=C1)C(=O)O)O", "CC12CCC(CC1)CC(C2)C(C)CN"]
labels = [0, 1, 0, 1, 0]  # Binary classification example

# Get embeddings
X = ume.get_embeddings(inputs, modality="SMILES").cpu().numpy()
y = np.array(labels)

# Train a simple classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Evaluate on Molecular Property Prediction Tasks

Here is how to evaluate Ume on tasks defined as callbacks. Note that training on and evaluating these tasks will take a few minutes.

In [12]:
import pandas as pd

from lobster.callbacks import CalmLinearProbeCallback, MoleculeACELinearProbeCallback

molecule_ace_probe = MoleculeACELinearProbeCallback(
    max_length=ume.embedding_dim
)
molecule_ace_scores = molecule_ace_probe.evaluate(ume)

MoleculeACELinearProbeCallback: 100%|██████████| 30/30 [07:58<00:00, 15.94s/it]


In [18]:
pd.DataFrame(molecule_ace_scores).head()

Unnamed: 0,CHEMBL238_Ki,CHEMBL1862_Ki,CHEMBL237_EC50,CHEMBL231_Ki,CHEMBL214_Ki,CHEMBL4616_EC50,CHEMBL234_Ki,CHEMBL228_Ki,CHEMBL2034_Ki,CHEMBL233_Ki,...,CHEMBL2147_Ki,CHEMBL218_EC50,CHEMBL236_Ki,CHEMBL244_Ki,CHEMBL2047_EC50,CHEMBL4203_Ki,CHEMBL4005_Ki,CHEMBL204_Ki,CHEMBL235_EC50,mean
mse,2.095025,5.312742,3.668455,2.874007,0.977828,1.559722,0.929982,1.373721,2.35863,1.170707,...,2.118004,3.143562,1.167833,1.422116,1.564671,3.172571,3.062797,1.224741,0.990469,2.021026
r2,-0.544668,-1.43017,-0.756787,-0.672591,0.243814,-0.861025,0.27865,0.044004,-1.337632,0.312313,...,0.460752,-1.877037,0.349264,0.454258,-0.626796,-1.758412,-1.626563,0.47672,0.145517,-0.457946
spearman,0.473802,0.539632,0.380362,0.486178,0.546139,0.273161,0.608297,0.501315,0.387129,0.602608,...,0.760325,0.273797,0.642653,0.727715,0.476775,0.262011,0.314889,0.709204,0.543911,0.489678


In [15]:
calm_probe = CalmLinearProbeCallback(
    max_length=ume.embedding_dim
)
calm_scores = calm_probe.evaluate(ume)

Generating train split: 5222 examples [00:00, 74980.33 examples/s]
Generating train split: 562 examples [00:00, 37857.53 examples/s]s/it]
Generating train split: 5696 examples [00:00, 151651.71 examples/s]it]
Generating train split: 452 examples [00:00, 21029.68 examples/s]s/it]
Generating train split: 14772 examples [00:00, 125365.49 examples/s]t]
Generating train split: 369 examples [00:00, 23669.81 examples/s]s/it]
CalmLinearProbeCallback: 100%|██████████| 8/8 [02:38<00:00, 19.83s/it]


In [17]:
pd.DataFrame(calm_scores).head()

Unnamed: 0,solubility,localization,meltome,mean
mse,3.649214,,56.7019,30.175557
r2,-0.273509,,0.464096,0.095294
spearman,0.038007,,0.463611,0.250809
accuracy,,0.890833,,0.890833
f1,,0.398531,,0.398531
