# Embedding Models

This short introduction loads a sentence-transformer model, extracts token and sentence embeddings, and visualizes embedding similarities.
- Load a sentence-transformer model and compute sentence and token embeddings.
- Inspect tokenization, input/output embedding shapes, and cosine similarities.
- Visualize embeddings with t-SNE and plot token-similarity heatmaps.



In [11]:
# Suppress every Python warning for the rest of the session (presumably to
# keep the notebook output readable). Note that this also hides deprecation
# warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Environment check: report which interpreter the kernel is running and which
# versions of the notebook's key packages are installed. All dependencies can
# also be installed up front with `pip install -r requirements.txt`.
import sys
import subprocess

print("=" * 60)
print("🐍 PYTHON ORTAMI BİLGİLERİ")
print("=" * 60)
print(f"Python yolu: {sys.executable}")
print(f"Python sürümü: {sys.version.split()[0]}")
print(f"Platform: {sys.platform}")
print()

# Check the installed packages
print("📦 Kurulu PyTorch ve İlgili Paketler:")
print("-" * 60)

packages_to_check = ["torch", "torchvision", "sentence-transformers", "numpy", "scikit-learn"]
for pkg in packages_to_check:
    try:
        # Query pip through the running interpreter so the answer reflects
        # the kernel's own environment.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "show", pkg],
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # Only failures to launch/run pip are treated as "could not check";
        # KeyboardInterrupt and other unrelated errors propagate normally.
        print(f"❌ {pkg}: KONTROL EDİLEMEDİ")
        continue

    if result.returncode != 0:
        # Non-zero exit status is reported as "not installed".
        print(f"❌ {pkg}: KURULU DEĞİL")
        continue

    for line in result.stdout.splitlines():
        if line.startswith("Version:"):
            # partition() cannot raise, even if the value after the colon is empty
            print(f"✅ {pkg}: {line.partition(':')[2].strip()}")
            break

print()
print("💡 Eğer yukarıda '❌ KURULU DEĞİL' görüyorsanız,")
print("   bir sonraki hücreyi (Setup) çalıştırın.")

🐍 PYTHON ORTAMI BİLGİLERİ
Python yolu: /Users/selcuk/miniconda3/bin/python
Python sürümü: 3.10.13
Platform: darwin

📦 Kurulu PyTorch ve İlgili Paketler:
------------------------------------------------------------
✅ torch: 2.0.1
✅ torch: 2.0.1
✅ torchvision: 0.15.2
✅ torchvision: 0.15.2
✅ sentence-transformers: 2.7.0
✅ sentence-transformers: 2.7.0
✅ numpy: 1.24.4
✅ numpy: 1.24.4
✅ scikit-learn: 1.3.2

💡 Eğer yukarıda '❌ KURULU DEĞİL' görüyorsanız,
   bir sonraki hücreyi (Setup) çalıştırın.
✅ scikit-learn: 1.3.2

💡 Eğer yukarıda '❌ KURULU DEĞİL' görüyorsanız,
   bir sonraki hücreyi (Setup) çalıştırın.


In [None]:
# Setup: install pinned versions of PyTorch and the libraries this notebook
# needs, check that they import, then load the sentence-transformer model and
# expose it as the global `model` for the following cells.
import subprocess
import sys
import importlib

print(f"🐍 Python: {sys.executable}")
print(f"📍 Version: {sys.version.split()[0]}\n")

# Tell the user what is about to be installed
print("📦 Installing required packages...")
print("   - torch==2.0.1")
print("   - torchvision==0.15.2")
print("   - transformers==4.37.0 (downgraded for compatibility)")
print("   - sentence-transformers==2.7.0")
print("   - scikit-learn, plotly\n")

try:
    # pip is always run through the current interpreter so that packages are
    # installed into the environment the kernel is actually using.

    # Uninstall any existing transformers first
    subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "-q", "transformers"])

    # Install the pinned versions
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir",
                           "torch==2.0.1", "torchvision==0.15.2"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                           "transformers==4.37.0"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                           "sentence-transformers==2.7.0", "scikit-learn==1.3.2", "plotly==5.22.0"])
    print("✅ All packages installed successfully!\n")
except (subprocess.CalledProcessError, OSError) as e:
    # Best effort: a failed pip step is reported but does not stop the cell;
    # the import check below re-raises if a required package cannot be imported.
    print(f"⚠️  Installation warning: {e}\n")

# Drop cached top-level modules so the imports below are resolved again.
# NOTE(review): only the three top-level entries are removed; submodules that
# were already imported (e.g. `torch.nn`) stay in `sys.modules`. Restarting the
# kernel is the reliable way to pick up a changed install — confirm this purge
# is still wanted.
print("🔄 Clearing old module imports...")
modules_to_clear = ['transformers', 'torch', 'sentence_transformers']
for mod in modules_to_clear:
    sys.modules.pop(mod, None)
print("✅ Module cache cleared\n")

# Verify that the key libraries import
print("🔍 Verifying imports...")
try:
    import numpy as np
    import torch
    from sentence_transformers import SentenceTransformer

    print(f"✅ NumPy {np.__version__}")
    print(f"✅ PyTorch {torch.__version__}")
    print("✅ Sentence Transformers loaded\n")
except ImportError as e:
    # Nothing below can work without these imports, so report and re-raise.
    print(f"❌ Import error: {e}\n")
    raise

# Load the model directly in this cell
print("🤖 Loading sentence-transformer model...")
print("   Model: all-MiniLM-L6-v2 (~40MB, will download on first run)\n")

model = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ Model loaded successfully!")
print(f"   Embedding dimension: {model.get_sentence_embedding_dimension()}")
print(f"   Max sequence length: {model.max_seq_length}")
print("\n🎯 Ready to use! Model available as 'model' variable.")

In [None]:
# `model` (the SentenceTransformer instance) was already loaded by the
# previous cell. Evaluating it as the last expression of this cell simply
# displays its repr, i.e. the stack of modules it wraps.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# Tokenize a sample sentence with the model's own tokenizer.
# `model.tokenize` takes a list of sentences and returns a dict of tensors;
# for this model the keys are `input_ids`, `token_type_ids` and
# `attention_mask`, each with one row per input sentence.
# Change the sentence to see how other text is split.
tokenized_data = model.tokenize(["walker walked a long walk"])
tokenized_data

{'input_ids': tensor([[ 101, 5232, 2939, 1037, 2146, 3328,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Convert tokenized IDs back to readable tokens
#
# This cell maps the token IDs produced by the model tokenizer back into
# human-readable token strings. Use it to inspect how the input text is
# split into tokens, to identify special tokens (e.g. [CLS], [SEP]) and
# subword fragments (prefixed with '##', e.g. '##ing'). This helps debug
# tokenization boundary issues and align token-level embeddings with the
# original text.
#
# Output for the sample sentence:
#   ['[CLS]', 'walker', 'walked', 'a', 'long', 'walk', '[SEP]']
# (every word of this sentence maps to a single token, so no '##' pieces appear).
#
# Run this cell after the tokenization cell so that `tokenized_data` exists.

tokens = model.tokenizer.convert_ids_to_tokens(tokenized_data["input_ids"][0])
tokens

['[CLS]', 'walker', 'walked', 'a', 'long', 'walk', '[SEP]']

In [None]:
# Access the underlying transformer wrapped by SentenceTransformer.
# The model's repr shows a pipeline of Transformer -> Pooling -> Normalize;
# the first module holds the actual network (a BertModel for this checkpoint).
# We keep a handle on it to examine the raw embeddings before pooling.
# NOTE(review): `_first_module()` has a leading underscore, i.e. it is not a
# public API — confirm it still exists when upgrading sentence-transformers.
first_module = model._first_module()
first_module.auto_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

## Input token embeddings

In [22]:
# Retrieve the model's input embedding module. As its repr shows, it bundles
# the word, position and token-type embedding tables together with LayerNorm
# and dropout. Use it to inspect the embedding layers or to extract the raw
# embedding weights.
embeddings = first_module.auto_model.embeddings
embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 384, padding_idx=0)
  (position_embeddings): Embedding(512, 384)
  (token_type_embeddings): Embedding(2, 384)
  (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [24]:
import torch
import plotly.express as px

# Run the lookup on the device where the embedding weights already live.
# An embedding layer needs its indices on the same device as its weight, so
# deriving the device from the weight works unchanged on CPU, CUDA and Apple
# MPS. `torch.has_mps` is deliberately not used: it is deprecated and reflects
# build support rather than whether the device is actually available.
device = embeddings.word_embeddings.weight.device

# Example sentences to compare
first_sentence = "vector search optimization"
second_sentence = "we learn to apply vector search optimization"

with torch.no_grad():  # inference only, no autograd graph needed
    # Tokenize both texts: each result is a dict that includes the input IDs
    # and the attention mask
    first_tokens = model.tokenize([first_sentence])
    second_tokens = model.tokenize([second_sentence])

    # Look up the word embedding of every token ID. Only the word-embedding
    # table is applied here; position/token-type embeddings and the
    # transformer layers are not involved.
    first_embeddings = embeddings.word_embeddings(
        first_tokens["input_ids"].to(device)
    )
    second_embeddings = embeddings.word_embeddings(
        second_tokens["input_ids"].to(device)
    )

# The resulting shapes are (batch_size, seq_len, embedding_dim)
first_embeddings.shape, second_embeddings.shape

(torch.Size([1, 5, 384]), torch.Size([1, 9, 384]))

In [25]:
from sentence_transformers import util

# Pairwise cosine similarity between every token of the first sentence (rows)
# and every token of the second sentence (columns). squeeze() drops the
# size-1 batch dimension; the result is then moved to the CPU and converted
# to a NumPy array for plotting.
distances = (
    util.cos_sim(first_embeddings.squeeze(), second_embeddings.squeeze())
    .cpu()
    .numpy()
)

# Heatmap with the token strings as axis labels and the values printed in the cells
px.imshow(
    distances,
    y=model.tokenizer.convert_ids_to_tokens(first_tokens["input_ids"][0]),
    x=model.tokenizer.convert_ids_to_tokens(second_tokens["input_ids"][0]),
    text_auto=True,
)

### Visualizing the input embeddings

In [27]:
# Pull the full word-embedding table out of the model as a NumPy array:
# detach it from autograd, move it to the CPU, then convert.
token_embeddings = (
    first_module.auto_model.embeddings.word_embeddings.weight
    .detach()
    .cpu()
    .numpy()
)
token_embeddings.shape

(30522, 384)

In [None]:
import random

# Tokenizer vocabulary as a {token: id} mapping
vocabulary = first_module.tokenizer.get_vocab()

# (token, id) pairs ordered by id, so tokens end up in model index order
sorted_vocabulary = sorted(vocabulary.items(), key=lambda entry: entry[1])

# Keep only the token strings, still in index order
sorted_tokens = [entry[0] for entry in sorted_vocabulary]

# Peek at 100 randomly drawn tokens. `random.choices` samples with
# replacement, so repeats are possible; use `random.sample` for unique tokens.
random.choices(sorted_tokens, k=100)

['filly',
 'rt',
 'tiberius',
 'elects',
 'ล',
 'older',
 'southampton',
 '##ttal',
 'polar',
 'roberto',
 '2010s',
 'affiliated',
 'withdrew',
 '版',
 '$',
 'supervisor',
 'delivery',
 'nails',
 'elimination',
 '##key',
 'prevention',
 '##ues',
 '##的',
 'atomic',
 'drains',
 '##urn',
 'fundamental',
 '##ত',
 'castile',
 '##bard',
 'fact',
 'unavailable',
 'flute',
 'ing',
 'wiener',
 '##marks',
 'flashlight',
 '327',
 '1837',
 '##gration',
 'contract',
 '##aghan',
 'intersecting',
 '271',
 'verify',
 'wide',
 '256',
 '[unused377]',
 'apartheid',
 'chewing',
 'headline',
 '[unused239]',
 'ethnicity',
 '##rar',
 '##ingen',
 '##a1',
 'vault',
 'surgeons',
 'packers',
 'regional',
 'exceptional',
 'sample',
 'script',
 '##sons',
 'trier',
 'vegetation',
 'fraudulent',
 '[unused130]',
 '##ე',
 '82',
 'paces',
 'weakness',
 '1700',
 '和',
 '[unused119]',
 'degrees',
 '##bation',
 '##hila',
 '##ive',
 'reader',
 'promise',
 '##lase',
 '##iers',
 'mistress',
 'timmy',
 'lever',
 'bo

In [None]:
from sklearn.manifold import TSNE

# Project the token embeddings down to 2 dimensions with t-SNE so they can be
# drawn as a scatter plot. Cosine distance is used as the metric, and
# random_state is fixed so repeated runs use the same seed.
# NOTE(review): this is fitted on the whole embedding table (30522 rows per
# the shape printed earlier), so it may take a while — confirm runtime.
tsne = TSNE(n_components=2, metric="cosine", random_state=42)
tsne_embeddings_2d = tsne.fit_transform(token_embeddings)
tsne_embeddings_2d.shape

In [13]:
# One colour per vocabulary token, in the same order as `sorted_tokens`:
#   red   - bracketed tokens such as [CLS] or [unused130]
#   blue  - '##' continuation pieces
#   green - everything else
token_colors = [
    "red" if token[0] == "[" and token[-1] == "]"
    else "blue" if token.startswith("##")
    else "green"
    for token in sorted_tokens
]

In [14]:
import plotly.graph_objs as go

# WebGL scatter trace: one small marker per token, positioned by its 2-D t-SNE
# coordinates, coloured by token category, with the token string attached as text.
scatter = go.Scattergl(
    name="Token embeddings",
    mode="markers",
    x=tsne_embeddings_2d[:, 0],
    y=tsne_embeddings_2d[:, 1],
    text=sorted_tokens,
    marker={"color": token_colors, "size": 3},
)

# Fixed-size figure with no left/right margin
fig = go.FigureWidget(
    data=[scatter],
    layout={
        "width": 600,
        "height": 900,
        "margin": {"l": 0, "r": 0},
    },
)

fig.show()

## Output token embeddings

In [15]:
# Encode a sentence with the full model. With default arguments `encode`
# returns one vector per input sentence; here the result has shape (1, 384).
output_embedding = model.encode(["walker walked a long walk"])
output_embedding.shape

(1, 384)

In [16]:
# Ask `encode` for per-token outputs instead of a single sentence vector.
# The result holds one entry per input sentence; for this sentence it is a
# tensor of shape (7, 384): one row per token, including [CLS] and [SEP].
output_token_embeddings = model.encode(
    ["walker walked a long walk"], 
    output_value="token_embeddings"
)
output_token_embeddings[0].shape

torch.Size([7, 384])

In [17]:
# Sentence pair whose token-level output embeddings are compared
first_sentence = "vector search optimization"
second_sentence = "we learn to apply vector search optimization"

with torch.no_grad():
    # Tokenize each sentence on its own (a one-element batch per call)
    first_tokens, second_tokens = (
        model.tokenize([text]) for text in (first_sentence, second_sentence)
    )

    # Per-token output embeddings; each call returns one entry per input sentence
    first_embeddings, second_embeddings = (
        model.encode([text], output_value="token_embeddings")
        for text in (first_sentence, second_sentence)
    )

# Cosine similarity of every first-sentence token against every
# second-sentence token
distances = util.cos_sim(first_embeddings[0], second_embeddings[0])

In [18]:
# Heatmap of the similarity matrix: rows are labelled with the tokens of the
# first sentence, columns with the tokens of the second.
px.imshow(
    distances.cpu().numpy(),  # move the tensor to the CPU and convert it to a NumPy array
    y=model.tokenizer.convert_ids_to_tokens(first_tokens["input_ids"][0]),
    x=model.tokenizer.convert_ids_to_tokens(second_tokens["input_ids"][0]),
    text_auto=True,
)