<a href="https://colab.research.google.com/github/raz0208/ModernBERT/blob/main/ModernBERT_TokenEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# import required libraries
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

In [19]:
# Load the ModernBERT tokenizer and model from Hugging Face.
# MODEL_NAME is the single checkpoint identifier passed to both from_pretrained
# calls below, so the tokenizer and the model always refer to the same checkpoint.
# NOTE(review): presumably the first run fetches the files over the network — confirm.
MODEL_NAME = "answerdotai/ModernBERT-base"
# `tokenizer` and `model` are module-level names; get_text_embedding reads them
# as globals rather than receiving them as arguments.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [20]:
# Function that takes text and returns its embedding
def get_text_embedding(text):
    """Embed text using the module-level `tokenizer` and `model`.

    The embedding is the last hidden state at sequence position 0 (the CLS
    token slot), used here as a sentence-level representation.

    Args:
        text: A single string, or a list/tuple of strings to embed as a batch.

    Returns:
        numpy.ndarray: shape (hidden_size,) when `text` is a single string;
        shape (batch_size, hidden_size) when `text` is a sequence of strings,
        including a sequence that contains only one string.
    """
    # Remember whether the caller passed one string so that only the batch
    # axis is dropped for that case (see the squeeze below).
    is_single_text = isinstance(text, str)

    # Tokenize input text; padding aligns the sequences of a batch.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass without gradient tracking to get hidden states.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence -> [batch_size, hidden_size].
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # A bare .squeeze() would remove every size-1 axis, so a one-element list
    # would silently come back as a 1-D vector while longer lists stay 2-D.
    # Drop the batch axis only for a single-string input.
    if is_single_text:
        cls_embedding = cls_embedding.squeeze(0)

    return cls_embedding.numpy()

In [21]:
# Example usage (Sentence: This is an application about Breast Cancer.)
if __name__ == "__main__":
    # Interactive demo: prompt for a sentence and embed it.
    user_text = input("Enter your text: ")
    embedding = get_text_embedding(user_text)

    # Report the vector's shape, then a short preview rather than every value.
    embedding_shape = embedding.shape
    embedding_head = embedding[:10]
    print("Embedding vector shape:", embedding_shape, "\n")
    print("Embedding (first 10 values):", "\n", embedding_head)

Enter your text: This is an application about Breast Cancer.
Embedding vector shape: (768,) 

Embedding (first 10 values): 
 [ 0.42236355 -0.8862073  -0.6536694  -0.2981413  -0.5874422  -0.720903
 -0.8588484  -0.89695704  0.5856571  -0.9214181 ]
