### Implementing Tokenization

In [None]:
!pip install nltk
!pip install transformers==4.42.1
!pip install sentencepiece
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install scikit-learn
!pip install torch==2.2.2
!pip install torchtext==0.17.2
!pip install numpy==1.26.0

In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
import spacy
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from transformers import XLNetTokenizer
from torchtext.vocab import build_vocab_from_iterator

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /home/prodesk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/prodesk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## What is a tokenizer and why do we use it?

Tokenizers play a pivotal role in natural language processing, segmenting text into smaller units known as tokens. These tokens are subsequently transformed into numerical representations called token indices, which are directly employed by deep learning algorithms.
<center>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0201EN-Coursera/images/Tokenization%20lab%20Diagram%201.png" width="50%" alt="Image Description">
</center>

## Types of tokenizer

The meaningful representation can vary depending on the model in use. Various models employ distinct tokenization algorithms, and you will broadly cover the following approaches. Transforming text into numerical values might appear straightforward initially, but it encompasses several considerations that must be kept in mind.
<center>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0201EN-Coursera/images/Tokenization%20lab%20Diagram%202.png" width="50%" alt="Image Description">
</center>


In [6]:
text = "This is a sample sentence for tokenization."
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['This', 'is', 'a', 'sample', 'sentence', 'for', 'tokenization', '.']


In [7]:
# This showcases word_tokenize from NLTK library
text = "I couldn't help the dog. Can't you do it? Don't be afraid if you are."
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['I', 'could', "n't", 'help', 'the', 'dog', '.', 'Ca', "n't", 'you', 'do', 'it', '?', 'Do', "n't", 'be', 'afraid', 'if', 'you', 'are', '.']


In [8]:
# This showcases the use of the 'spaCy' tokenizer with torchtext's get_tokenizer function

text = "I couldn't help the dog. Can't you do it? Don't be afraid if you are."
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Making a list of the tokens and printing the list
token_list = [token.text for token in doc]
print("Tokens:", token_list)

# Showing token details
for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}, Lemma: {token.lemma_}, Dependency: {token.dep_}")

Tokens: ['I', 'could', "n't", 'help', 'the', 'dog', '.', 'Ca', "n't", 'you', 'do', 'it', '?', 'Do', "n't", 'be', 'afraid', 'if', 'you', 'are', '.']
Token: I, POS: PRON, Lemma: I, Dependency: nsubj
Token: could, POS: AUX, Lemma: could, Dependency: aux
Token: n't, POS: PART, Lemma: not, Dependency: neg
Token: help, POS: VERB, Lemma: help, Dependency: ROOT
Token: the, POS: DET, Lemma: the, Dependency: det
Token: dog, POS: NOUN, Lemma: dog, Dependency: dobj
Token: ., POS: PUNCT, Lemma: ., Dependency: punct
Token: Ca, POS: AUX, Lemma: can, Dependency: aux
Token: n't, POS: PART, Lemma: not, Dependency: neg
Token: you, POS: PRON, Lemma: you, Dependency: nsubj
Token: do, POS: VERB, Lemma: do, Dependency: ROOT
Token: it, POS: PRON, Lemma: it, Dependency: dobj
Token: ?, POS: PUNCT, Lemma: ?, Dependency: punct
Token: Do, POS: AUX, Lemma: do, Dependency: aux
Token: n't, POS: PART, Lemma: not, Dependency: neg
Token: be, POS: AUX, Lemma: be, Dependency: ROOT
Token: afraid, POS: ADJ, Lemma: afraid, D

In [9]:
text = "Unicorns are real. I sae a unicorn yesterday."
token = word_tokenize(text)
print("Tokens:", token)

Tokens: ['Unicorns', 'are', 'real', '.', 'I', 'sae', 'a', 'unicorn', 'yesterday', '.']


### Subword-based tokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.tokenize("IBM taught me tokenization.")

['ibm', 'taught', 'me', 'token', '##ization', '.']

### Unigram and SentencePiece Tokenizer

In [11]:
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
tokenizer.tokenize("IBM taught me tokenization.")

['▁IBM', '▁taught', '▁me', '▁token', 'ization', '.']

### Tokenization with PyTorch

In [12]:
dataset = [
    (1,"Introduction to NLP"),
    (2,"Basics of PyTorch"),
    (1,"NLP Techniques for Text Classification"),
    (3,"Named Entity Recognition with PyTorch"),
    (3,"Sentiment Analysis using PyTorch"),
    (3,"Machine Translation with PyTorch"),
    (1," NLP Named Entity,Sentiment Analysis,Machine Translation "),
    (1," Machine Translation with NLP "),
    (1," Named Entity vs Sentiment Analysis  NLP ")]

In [13]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

In [14]:
tokenizer(dataset[0][1])

['introduction', 'to', 'nlp']

### Token indices

In [33]:
def yield_tokens(data_iter):
    for _, t in data_iter:
        yield tokenizer(t)

In [42]:
my_iter = yield_tokens(dataset)

In [35]:
results = [next(my_iter), next(my_iter)]
print("First two tokenized sentences:", results)

First two tokenized sentences: [['introduction', 'to', 'nlp'], ['basics', 'of', 'pytorch']]


### Out-of-vocabulary (OOV) tokens

In [36]:
vocab = build_vocab_from_iterator(my_iter, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [39]:
def get_tokenized_sentence_and_indices(iterator):
    tokenized_sentence_ = next(iterator) # Get the first tokenized sentence
    token_indices_ = [vocab[token] for token in tokenized_sentence_] # Get token indices
    return tokenized_sentence_, token_indices_

In [43]:
tokenized_sentence, token_indices = get_tokenized_sentence_and_indices(my_iter)

In [44]:
print("Tokenized Sentence:", tokenized_sentence)
print("Token Indices:", token_indices)

Tokenized Sentence: ['introduction', 'to', 'nlp']
Token Indices: [0, 0, 1]


In [45]:
lines = ["IBM taught me tokenization",
         "Special tokenizers are ready and they will blow your mind",
         "just saying hi!"]
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

tokenizer_en = get_tokenizer("spacy", language="en_core_web_sm")

tokens = []
max_length = 0

for line in lines:
    tokenized_line = tokenizer_en(line)
    tokenized_line = ['<bos>'] + tokenized_line + ['<eos>']
    tokens.append(tokenized_line)
    max_length = max(max_length, len(tokenized_line))

for i in range(len(tokens)):
    tokens[i] = tokens[i] + ['<pad>'] * (max_length - len(tokens[i]))

print("Lines after adding special tokens:\n", tokens)

# Build vocabulary without unk_init
vocab = build_vocab_from_iterator(tokens, specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

# Convert tokens to indices
print("Vocabulary:", vocab.get_itos())
print("Token IDs for 'tokenization':", vocab.get_itos())

Lines after adding special tokens:
 [['<bos>', 'IBM', 'taught', 'me', 'tokenization', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<bos>', 'Special', 'tokenizers', 'are', 'ready', 'and', 'they', 'will', 'blow', 'your', 'mind', '<eos>'], ['<bos>', 'just', 'saying', 'hi', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']]
Vocabulary: ['<unk>', '<pad>', '<bos>', '<eos>', '!', 'IBM', 'Special', 'and', 'are', 'blow', 'hi', 'just', 'me', 'mind', 'ready', 'saying', 'taught', 'they', 'tokenization', 'tokenizers', 'will', 'your']
Token IDs for 'tokenization': ['<unk>', '<pad>', '<bos>', '<eos>', '!', 'IBM', 'Special', 'and', 'are', 'blow', 'hi', 'just', 'me', 'mind', 'ready', 'saying', 'taught', 'they', 'tokenization', 'tokenizers', 'will', 'your']


In [46]:
nex_line = "I learned about embeddings and attention mechanisms."

# Tokenize the new line
tokenize_new_line = tokenizer_en(nex_line)
tokenize_new_line = ['<bos>'] + tokenize_new_line + ['<eos>']

# Pad the new line to match the max length
new_line_padded = tokenize_new_line + ['<pad>'] * (max_length - len(tokenize_new_line))

# Convert the new line to indices
new_line_indices = [vocab[token] if token in vocab else vocab['<unk>'] for token in new_line_padded]

print("New line after adding special tokens and padding:", new_line_indices)

New line after adding special tokens and padding: [2, 0, 0, 0, 0, 7, 0, 0, 0, 3, 1, 1]
