In [None]:
!pip install transformers torch

# 1. Tokenization

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize input sentence
sentence = "Tokenization is essential for any AI organization."
tokens = tokenizer.tokenize(sentence)


In [4]:
print(tokens)

['token', '##ization', 'is', 'essential', 'for', 'any', 'ai', 'organization', '.']


# 2. Embeddings

Once tokenized, each word (or subword) is mapped to a high-dimensional vector using embeddings. This is where words stop being words and become numbers!

Why is this important? Because embeddings capture meaning and relationships between words. Unlike one-hot encoding (where every word is just a unique ID), embeddings represent words in a way that reflects their meaning.

For example:
📍 King - Man + Woman ≈ Queen
📍 Paris - France + Italy ≈ Rome

This happens because similar words have similar embeddings, and their relationships emerge naturally in the learned space.

In [5]:
input_ids = tokenizer.encode(sentence, return_tensors="pt")

# Get embeddings from BERT
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state  # Shape: (1, sequence_length, hidden_size)

# Print token-wise embeddings
for token, embedding in zip(tokens, embeddings[0]):
    print(f"Token: {token} -> Embedding: {embedding[:5]}...")  # Showing only first 5 values

Token: token -> Embedding: tensor([-0.4691, -0.1613, -0.4374,  0.0752, -0.5703])...
Token: ##ization -> Embedding: tensor([-0.5442, -0.0559, -0.8279, -0.3151, -0.1229])...
Token: is -> Embedding: tensor([-0.5498, -0.5570, -0.5254,  0.3701,  0.4370])...
Token: essential -> Embedding: tensor([-0.4953, -0.4014,  0.2123,  0.1217,  0.1697])...
Token: for -> Embedding: tensor([ 0.0313,  0.0174, -0.0993,  0.3831,  0.6161])...
Token: any -> Embedding: tensor([-0.1266,  0.1941,  0.5250,  0.3075, -0.3373])...
Token: ai -> Embedding: tensor([-0.5191,  0.2360,  0.1873, -0.0810, -0.4449])...
Token: organization -> Embedding: tensor([-0.3126,  0.3171,  0.2675, -0.0562,  1.0097])...
Token: . -> Embedding: tensor([-0.2083, -0.1946, -0.1072, -0.0192,  0.0847])...
