<a href="https://colab.research.google.com/github/raz0208/ModernBERT/blob/main/ModernBERT_TokenEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# import required libraries
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

In [19]:
# Load ModernBERT tokenizer and model from Hugging Face
MODEL_NAME = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [20]:
# Function to gest text and return the embedding
def get_text_embedding(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass to get hidden states
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings (use CLS token for sentence-level embedding)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: [batch_size, hidden_size]

    return cls_embedding.squeeze().numpy()

In [22]:
# Example usage (Sentence: This is an application about Breast Cancer.)
if __name__ == "__main__":
    user_text = input("Enter your text: ")
    embedding = get_text_embedding(user_text)
    print("Embedding vector shape:", embedding.shape, "\n")
    print("Embedding (first 10 values):","\n", embedding[:])

Enter your text: This is an application about Breast Cancer.
Embedding vector shape: (768,) 

Embedding (first 10 values): 
 [ 4.22363549e-01 -8.86207283e-01 -6.53669417e-01 -2.98141301e-01
 -5.87442219e-01 -7.20902979e-01 -8.58848393e-01 -8.96957040e-01
  5.85657120e-01 -9.21418071e-01 -6.09960854e-01  7.95451462e-01
 -1.29750752e+00 -3.49179089e-01  3.21411109e-03 -3.27389777e-01
 -3.02219123e-01  5.94979942e-01  7.03177571e-01  1.14349341e+00
 -4.53124821e-01  4.46286201e-02  2.23747998e-01  4.11409289e-02
 -4.51851189e-02 -1.95063338e-01 -9.12220180e-01  4.94864322e-02
 -1.72562644e-01  5.70872903e-01 -1.35543716e+00  2.98780948e-01
 -9.94743556e-02 -4.71591711e-01  4.16971356e-01  4.55408603e-01
 -4.25172061e-01 -7.34001338e-01 -2.71154284e-01 -8.13149810e-02
 -8.82083774e-02  1.62187979e-01  5.66225529e-01 -4.20166999e-01
  1.71248078e-01  6.72928870e-01  1.13312531e+00 -1.25215352e+00
 -4.34570521e-01  1.02366216e-01  1.06520522e+00 -3.46358269e-01
 -6.31577551e-01  7.10827559e-