<a href="https://colab.research.google.com/github/priyal6/NLP-Prac/blob/main/Tokeinzation_types.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers sentencepiece




# Character, Word, and Sentence Tokenization

In [None]:
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Fetch the Punkt resources before calling NLTK's word and sentence tokenizers.
for punkt_resource in ('punkt_tab', "punkt"):
    nltk.download(punkt_resource)


text = "Mistral AI uses BPE tokenization. It's efficient for large-scale models!"

# Character level: every character, spaces and punctuation included, is one token.
char_tokens = [character for character in text]
print("Character Tokenization:", char_tokens)

# Word level: NLTK emits punctuation and the clitic "'s" as separate tokens.
word_tokens = word_tokenize(text)
print("Word Tokenization", word_tokens)

# Sentence level: the text is cut at sentence boundaries.
sent_tokens = sent_tokenize(text)
print("Sentence Tokenization:", sent_tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Character Tokenization: ['M', 'i', 's', 't', 'r', 'a', 'l', ' ', 'A', 'I', ' ', 'u', 's', 'e', 's', ' ', 'B', 'P', 'E', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', '.', ' ', 'I', 't', "'", 's', ' ', 'e', 'f', 'f', 'i', 'c', 'i', 'e', 'n', 't', ' ', 'f', 'o', 'r', ' ', 'l', 'a', 'r', 'g', 'e', '-', 's', 'c', 'a', 'l', 'e', ' ', 'm', 'o', 'd', 'e', 'l', 's', '!']
Word Tokenization ['Mistral', 'AI', 'uses', 'BPE', 'tokenization', '.', 'It', "'s", 'efficient', 'for', 'large-scale', 'models', '!']
Sentence Tokenization: ['Mistral AI uses BPE tokenization.', "It's efficient for large-scale models!"]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Subword Tokenization using Hugging Face (BPE, WordPiece, and Unigram)

In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login

# Load the tokenizer published with the Mistral checkpoint.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

# Sample sentence; the WordPiece and Unigram cells further down tokenize this same `text`.
text = "Tokenization helps Mistral AI process text efficiently."

# tokenize() gives the subword strings, encode() gives the vocabulary ids.
# In the recorded run encode() returns one more id than there are tokens (a leading 1).
tokens, token_ids = tokenizer.tokenize(text), tokenizer.encode(text)


print("BPE Tokens:", tokens)
print("Token IDs:", token_ids)

BPE Tokens: ['Token', 'ization', 'Ġhelps', 'ĠMist', 'ral', 'ĠAI', 'Ġprocess', 'Ġtext', 'Ġefficiently', '.']
Token IDs: [1, 8693, 3229, 16348, 42301, 2784, 26554, 2832, 3403, 34737, 1046]


# Using BERT's WordPiece Tokenizer

In [None]:
# Rebind `tokenizer` to BERT's uncased tokenizer; `text` is the sentence set in the Mistral cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Subword strings first, then their ids (the recorded ids start with 101 and end with 102).
tokens, token_ids = tokenizer.tokenize(text), tokenizer.encode(text)

print("WordPiece Tokens:", tokens)
print("Token IDs:", token_ids)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

WordPiece Tokens: ['token', '##ization', 'helps', 'mist', '##ral', 'ai', 'process', 'text', 'efficiently', '.']
Token IDs: [101, 19204, 3989, 7126, 11094, 7941, 9932, 2832, 3793, 18228, 1012, 102]


# Using SentencePiece (Unigram Tokenizer)

In [None]:
from transformers import AutoTokenizer

# Rebind `tokenizer` to the T5 tokenizer; `text` is still the sentence set in the Mistral cell.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Subword strings first, then their ids (the recorded ids end with a trailing 1).
tokens, token_ids = tokenizer.tokenize(text), tokenizer.encode(text)

print("Unigram Tokens:", tokens)
print("Token IDs:", token_ids)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Unigram Tokens: ['▁To', 'ken', 'ization', '▁helps', '▁Mis', 'tral', '▁AI', '▁process', '▁text', '▁efficiently', '.']
Token IDs: [304, 2217, 1707, 1691, 8306, 8792, 7833, 433, 1499, 8877, 5, 1]


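BPE (Mistral): Builds its vocabulary by repeatedly merging the most frequent adjacent symbol pairs, so common fragments become single tokens, e.g. "Tokenization" → ['Token', 'ization']. In the output above, the `Ġ` prefix marks a token that begins with a space.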

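WordPiece (BERT): Splits words that are not in its vocabulary into smaller pieces and marks each continuation piece with a ## prefix, e.g. "Tokenization" → ['token', '##ization'] (lowercased because the model is uncased).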

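Unigram (T5): Chooses the most probable segmentation under a unigram language model, sometimes keeping entire words and sometimes breaking them down, e.g. "Tokenization" → ['▁To', 'ken', 'ization'], where `▁` marks the start of a word.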

In [None]:
from transformers import AutoTokenizer

def load_tokenizer(model_name):
    """Fetch the Hugging Face tokenizer registered under ``model_name``.

    Any exception raised while loading is reported on stdout and swallowed,
    so the caller receives ``None`` instead of a traceback.

    Args:
        model_name: Identifier passed straight to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer, or ``None`` when loading failed.
    """
    try:
        return AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        # Best-effort by design: report the reason and fall through to None.
        print(f"Failed to load tokenizer: {e}")
    return None

def main():
    """Interactive demo: tokenize user-supplied text with a chosen tokenizer.

    Prompts for a text and a menu choice (1 = Mistral BPE, 2 = BERT WordPiece,
    3 = T5 Unigram), loads the matching tokenizer through ``load_tokenizer``
    and prints the resulting tokens and token ids.

    An unrecognized choice falls back to ``bert-base-uncased`` and says so.
    Empty text is reported and no tokenizer is loaded. If loading fails,
    ``load_tokenizer`` has already printed the reason and nothing more is shown.

    Returns:
        None. All results are written to stdout.
    """
    print("Tokenization Visualizer")
    print("Explore how different tokenizers break down your text!")

    # User Input
    text_input = input("Enter your text: ")

    # Tokenizer Selection
    print("Choose a Tokenizer:")
    print("1. Mistral AI (BPE)")
    print("2. BERT (WordPiece)")
    print("3. T5 (Unigram)")
    # Strip so stray whitespace around the digit still selects the intended entry.
    choice = input("Enter choice (1/2/3): ").strip()

    model_map = {
        "1": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "2": "bert-base-uncased",
        "3": "t5-small"
    }
    default_model = "bert-base-uncased"

    # Check the text before loading: fetching a tokenizer is the expensive step.
    if not text_input:
        print("No text entered; nothing to tokenize.")
        return

    if choice not in model_map:
        # Keep the original fallback, but make it visible to the user.
        print(f"Unrecognized choice {choice!r}; falling back to {default_model}.")

    tokenizer = load_tokenizer(model_map.get(choice, default_model))

    # None is the failure signal from load_tokenizer. Compare against None
    # explicitly: truthiness of a sized object depends on its length.
    if tokenizer is None:
        return

    # Perform tokenization
    tokens = tokenizer.tokenize(text_input)
    token_ids = tokenizer.encode(text_input)

    # Display Results
    print("\nTokens:")
    print(tokens)

    print("\nToken IDs:")
    print(token_ids)

# Launch the interactive demo. The output recorded below this cell shows the
# guard passing when the cell is executed in the notebook.
if __name__ == "__main__":
    main()


Tokenization Visualizer
Explore how different tokenizers break down your text!
Enter your text: priyal
Choose a Tokenizer:
1. Mistral AI (BPE)
2. BERT (WordPiece)
3. T5 (Unigram)
Enter choice (1/2/3): 1

Tokens:
['pri', 'yal']

Token IDs:
[1, 24588, 12495]


In [None]:
# Let's tokenize the word "banana" using BPE.
# Initial tokens: BPE starts from the individual characters of the word.
initial_tokens = list("banana")
initial_tokens


['b', 'a', 'n', 'a', 'n', 'a']

In [None]:
# Merge the most frequent pair.
# In ['b', 'a', 'n', 'a', 'n', 'a'] the pairs ('a', 'n') and ('n', 'a') both occur twice;
# picking ('a', 'n') and merging it into 'an' gives:
first_merge_tokens = ['b', 'an', 'an', 'a']
first_merge_tokens


['b', 'an', 'an', 'a']

In [None]:
# Merge the next pair.
# In ['b', 'an', 'an', 'a'] the pairs ('b', 'an'), ('an', 'an') and ('an', 'a') each occur once.
# Merging ('an', 'a') into 'ana' joins the last two tokens and leaves the first 'an' untouched:
second_merge_tokens = ['b', 'an', 'ana']
second_merge_tokens

['b', 'an', 'ana']