In [1]:
# Install required libraries (run once)
# !pip install sentencepiece transformers torch
import torch
import sentencepiece as spm
from transformers import BertTokenizer, BertModel
# Sample sentence that every tokenization demo below operates on.
text = "Tokenization is important"
print("Original Text:", text)
print(50 * "-")

# WORD TOKENIZATION
# Words are the whitespace-delimited pieces of the sentence. Each word gets a
# random 8-dimensional vector, purely to illustrate the resulting
# (num_tokens, embedding_dim) shape -- these are not trained embeddings.
word_tokens = str.split(text)
word_embeddings = torch.randn((len(word_tokens), 8))
print("WORD TOKENS:", word_tokens)
print("WORD EMBEDDING SHAPE:", word_embeddings.shape)
print(50 * "-")
# CHAR TOKENIZATION
# Remove whitespace with the same rule the word-level demo uses
# (str.split() with no argument splits on any whitespace run), so tabs or
# newlines in `text` never show up as character tokens. Then break what is
# left into single characters.
char_tokens = list("".join(text.split()))
# Random 8-dimensional vectors, only to illustrate the embedding shape.
char_embeddings = torch.randn(len(char_tokens), 8)
print("CHAR TOKENS:", char_tokens)
print("CHAR EMBEDDING SHAPE:", char_embeddings.shape)
print("-" * 50)
# SUBWORD TOKENIZATION
# SentencePiece trains from a file on disk, so the sample text is written out
# first. The encoding is pinned to UTF-8: SentencePiece reads its corpus as
# UTF-8, while open() would otherwise fall back to the platform's locale
# encoding and could corrupt non-ASCII text.
with open("text.txt", "w", encoding="utf-8") as f:
    f.write(text)

# Train a tiny BPE model on that one-sentence corpus. The trained model is
# saved under the "spm" prefix and loaded back from "spm.model" below.
spm.SentencePieceTrainer.train(
    input="text.txt",
    model_prefix="spm",
    vocab_size=30,
    model_type="bpe"
)

sp = spm.SentencePieceProcessor()
sp.load("spm.model")

# out_type=str returns the subword pieces themselves rather than integer ids.
subword_tokens = sp.encode(text, out_type=str)
# Random 8-dimensional vectors, only to illustrate the embedding shape.
subword_embeddings = torch.randn(len(subword_tokens), 8)

print("SUBWORD TOKENS:", subword_tokens)
print("SUBWORD EMBEDDING SHAPE:", subword_embeddings.shape)
print("-" * 50)
# WORDPIECE TOKENIZATION (BERT)
# Load the pretrained tokenizer and encoder for "bert-base-uncased".
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
# Make inference mode explicit so dropout is disabled for the forward pass.
model.eval()

# return_tensors="pt" yields PyTorch tensors ready to feed to the model.
inputs = tokenizer(text, return_tensors="pt")
# Only the output shape is inspected, so skip autograd bookkeeping: no graph
# is built and no activations are kept for a backward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Map the ids of the first (only) sequence back to their WordPiece strings.
wordpiece_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

print("WORDPIECE TOKENS:", wordpiece_tokens)
print("WORDPIECE EMBEDDING SHAPE:", outputs.last_hidden_state.shape)
print("-" * 50)


Original Text: Tokenization is important
--------------------------------------------------
WORD TOKENS: ['Tokenization', 'is', 'important']
WORD EMBEDDING SHAPE: torch.Size([3, 8])
--------------------------------------------------
CHAR TOKENS: ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', 'i', 's', 'i', 'm', 'p', 'o', 'r', 't', 'a', 'n', 't']
CHAR EMBEDDING SHAPE: torch.Size([23, 8])
--------------------------------------------------
SUBWORD TOKENS: ['▁', 'Tok', 'en', 'iz', 'at', 'ion', '▁i', 's', '▁i', 'mp', 'ort', 'ant']
SUBWORD EMBEDDING SHAPE: torch.Size([12, 8])
--------------------------------------------------


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


WORDPIECE TOKENS: ['[CLS]', 'token', '##ization', 'is', 'important', '[SEP]']
WORDPIECE EMBEDDING SHAPE: torch.Size([1, 6, 768])
--------------------------------------------------
