In [None]:
!pip install numpy scikit-learn
!pip install torch==2.0.1
!pip install torchtext==0.15.2

In [None]:
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')
import spacy
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams
from transformers import BertTokenizer
from transformers import XLNetTokenizer

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

There are three main types of tokenizers:
- word-based,
- character-based,
- subword-based.

Word-based tokenizers can follow different rules, such as splitting on spaces or splitting on punctuation. Whichever rule is used, each resulting word is assigned a specific ID. Here we will use NLTK's `word_tokenize`.


### Word-based tokenization using NLTK and spaCy

In [None]:
# Word-level tokenization of a plain sentence with NLTK's word_tokenize;
# the resulting token list is printed.
text = "This is a sample sentence for word tokenization."
tokens = word_tokenize(text)
print(tokens)

In [None]:
# Same NLTK tokenizer, this time on several sentences containing contractions
# ("couldn't", "Can't", "Don't"), to see how they are split.
text = "I couldn't help the dog. Can't you do it? Don't be afraid if you are."
tokens = word_tokenize(text)
print(tokens)

In [None]:
# Tokenize with spaCy's small English pipeline (en_core_web_sm) instead of NLTK.

text = "I couldn't help the dog. Can't you do it? Don't be afraid if you are. dogs are loyal"
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Make a list of the token strings and print it
token_list = [token.text for token in doc]
print("Tokens:", token_list)

# Show each token with its part-of-speech tag (pos_) and dependency label (dep_)
for token in doc:
    print(token.text, token.pos_, token.dep_)

- I PRON nsubj: "I" is a pronoun (PRON) and is the nominal subject (nsubj) of the sentence.
- help VERB ROOT: "help" is a verb (VERB) and is the root action (ROOT) of the sentence.
- afraid ADJ acomp: "afraid" is an adjective (ADJ) and is an adjectival complement (acomp) which gives more information about a state or quality related to the verb.


The problem with word-based tokenization is that words with similar meanings are assigned different IDs, so they are treated as entirely separate words with distinct meanings. For example, *Unicorns* is the plural form of *Unicorn*, but a word-based tokenizer would tokenize them as two separate words, potentially causing the model to miss their semantic relationship.



In [None]:
# "Unicorns" and "unicorn" are returned as two distinct word-level tokens.
text = "Unicorns are real. I saw a unicorn yesterday."
tokens = word_tokenize(text)  # plural name, consistent with the other word_tokenize cells
print(tokens)

Because languages generally have a very large number of words, vocabularies built from whole words are always extensive. The number of characters in a language, however, is far smaller than the number of words. Next, we will explore character-based tokenizers.


### Character-based tokenization

Character-based tokenization splits text into individual characters, which keeps the vocabulary small. However, it is important to note that it has its limitations: single characters may not convey the same information as entire words.



## Subword-based tokenizer
Techniques such as SentencePiece or WordPiece are commonly used for subword tokenization. These methods learn subword units from a given text corpus, identifying common prefixes, suffixes, and root words as subword tokens based on their frequency of occurrence. For instance, 'unhappiness' can be split into 'un' and 'happiness', both of which can appear as stand-alone subwords. When we combine these individual subwords, they form 'unhappiness' again, which retains its meaningful context.

### WordPiece

In [None]:
# Use a dedicated name so this cell does not rebind the shared `tokenizer`
# that yield_tokens reads later in the notebook.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_tokenizer.tokenize("Unhappiness is not good for health and let us see how tokenization works in this case")

`'token', '##ization'`: "tokenization" is broken into two tokens. "token" is a whole word, and "##ization" is a part of the original word.

The "##" prefix indicates that "ization" should be attached back to the preceding token ("token") when detokenizing (transforming tokens back into words).

### SentencePiece


SentencePiece is a tool that takes text, divides it into smaller, more manageable parts, assigns IDs to these segments, and does so consistently.

In [None]:
# Use a dedicated name so this cell does not rebind the shared `tokenizer`
# that yield_tokens reads later in the notebook.
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
xlnet_tokenizer.tokenize("IBM taught me tokenization. I am practising IBM labs")

The `▁` character (U+2581, which looks like an underscore) marks a space, i.e. the start of a new word.

### Tokenization with PyTorch

In [None]:
# Toy corpus of (integer, text) records. The integer looks like a class label
# (TODO confirm); yield_tokens ignores it and only tokenizes the text.
# Leading/trailing/double spaces inside some texts are kept on purpose.
dataset = [
    (1, "Introduction to NLP"),
    (2, "Basics of PyTorch"),
    (1, "NLP Techniques for Text Classification"),
    (3, "Named Entity Recognition with PyTorch"),
    (3, "Sentiment Analysis using PyTorch"),
    (3, "Machine Translation with PyTorch"),
    (1, " NLP Named Entity,Sentiment Analysis,Machine Translation "),
    (1, " Machine Translation with NLP "),
    (1, " Named Entity vs Sentiment Analysis  NLP "),
]

In [None]:
from torchtext.data.utils import get_tokenizer
# Bind torchtext's "basic_english" tokenizer to the notebook-level name `tokenizer`
# (yield_tokens relies on this name), then try it on the text of the first record.
tokenizer = get_tokenizer("basic_english")
tokenizer(dataset[0][1])

The `build_vocab_from_iterator` function, when applied to an iterator that yields lists of tokens, assigns a unique index to each token based on its position in the vocabulary. These indices serve as a way to represent the tokens in a numerical format that can be easily processed by machine learning models.


`dataset` is an iterable, so you use the generator function `yield_tokens` to apply the tokenizer to it. The purpose of `yield_tokens` is to yield the tokenized texts one at a time.


In [None]:
def yield_tokens(data_iter, tokenize=None):
    """Lazily tokenize the text of every two-item record in ``data_iter``.

    Args:
        data_iter: Iterable of two-item records; the first item is ignored and
            the second item is the text to tokenize.
        tokenize: Optional callable mapping a string to its tokens. When omitted,
            the notebook-level ``tokenizer`` is used, exactly as before.

    Yields:
        The tokenizer's output for each text, one record at a time.
    """
    for _, text in data_iter:
        # Resolve the fallback per item, so the default path keeps the original
        # behaviour of reading the global `tokenizer` at iteration time.
        split = tokenizer if tokenize is None else tokenize
        yield split(text)

In [None]:
# Generator over the tokenized texts of `dataset`; each next() call consumes one text.
my_iterator = yield_tokens(dataset)  # used for iterating over the dataset

In [None]:
# Pull (and consume) the next tokenized text from the generator and display it.
next(my_iterator)

In [None]:
# Build the vocabulary from a fresh generator over the dataset (separate from
# my_iterator), registering "<unk>" as a special token.
vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
# Make the "<unk>" index the default, i.e. the index returned for tokens that
# are not in the vocabulary.
vocab.set_default_index(vocab["<unk>"])

In [None]:
def get_tokenized_sentence_and_indices(iterator):
    """Pull the next tokenized sentence from ``iterator`` and map it to vocabulary indices.

    Args:
        iterator: Iterator that yields lists of tokens (for example the generator
            returned by ``yield_tokens``). One item is consumed per call.

    Returns:
        A ``(tokenized_sentence, token_indices)`` tuple in which
        ``token_indices[i]`` is ``vocab[tokenized_sentence[i]]``, looked up in the
        notebook-level ``vocab``.

    Raises:
        StopIteration: If ``iterator`` has no items left.
    """
    tokenized_sentence = next(iterator)  # Get the next tokenized sentence
    token_indices = [vocab[token] for token in tokenized_sentence]  # Get token indices
    return tokenized_sentence, token_indices


# One call consumes exactly one sentence from the shared generator. The former bare
# `next(my_iterator)` after this call was removed: it silently discarded a sentence.
tokenized_sentence, token_indices = get_tokenized_sentence_and_indices(my_iterator)

print("Tokenized Sentence:", tokenized_sentence)
print("Token Indices:", token_indices)

In [None]:
lines = ["IBM taught me tokenization",
         "Special tokenizers are ready and they will blow your mind",
         "just saying hi!"]

# Special tokens, listed in the order in which their indices are assigned (0, 1, 2, 3).
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')

# Tokenize each line and wrap it in begin/end-of-sequence markers.
tokens = [['<bos>'] + tokenizer_en(line) + ['<eos>'] for line in lines]

# Right-pad every sequence with <pad> up to the length of the longest one.
max_length = max(map(len, tokens), default=0)
tokens = [sequence + ['<pad>'] * (max_length - len(sequence)) for sequence in tokens]

print("Lines after adding special tokens:\n", tokens)

# Register all four special tokens explicitly, so that their indices are fixed by
# `special_symbols` rather than by how often each marker happens to occur in the data.
vocab = build_vocab_from_iterator(tokens, specials=special_symbols)
# Tokens that are not in the vocabulary map to the <unk> index.
vocab.set_default_index(vocab["<unk>"])

# Vocabulary (index -> token list) and token IDs (token -> index mapping)
print("Vocabulary:", vocab.get_itos())
print("Token IDs for 'tokenization':", vocab.get_stoi())

Let's break down the output:
1. **Special Tokens**:
- Token: "`<unk>`", Index: 0: `<unk>` stands for "unknown" and represents words that were not seen during vocabulary building, usually during inference on new text.
- Token: "`<pad>`", Index: 1: `<pad>` is a "padding" token used to make sequences of words the same length when batching them together.
- Token: "`<bos>`", Index: 2: `<bos>` is an acronym for "beginning of sequence" and is used to denote the start of a text sequence.
- Token: "`<eos>`", Index: 3: `<eos>` is an acronym for "end of sequence" and is used to denote the end of a text sequence.

2. **Word Tokens**:
The rest of the tokens are words or punctuation extracted from the provided sentences, each assigned a unique index:
- Token: "IBM", Index: 5
- Token: "taught", Index: 16
- Token: "me", Index: 12
    ... and so on.
    
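3. **Vocabulary**:
`vocab.get_itos()` ("index to string") lists every token in the vocabulary ordered by index: the token at position *i* of the list has index *i*.
    
4. **Token IDs for 'tokenization'**:
Despite the label, `vocab.get_stoi()` ("string to index") prints the complete token-to-ID mapping of the vocabulary, not just the entry for "tokenization"; each number is the ID assigned to that token.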


In [None]:
# Sample paragraph used to compare the tokenizers. The stray double quote that
# used to follow "next!" was removed so it is no longer part of the text.
text = """
Going through the world of tokenization has been like walking through a huge maze made of words, symbols, and meanings. Each turn shows a bit more about the cool ways computers learn to understand our language. And while I'm still finding my way through it, the journey’s been enlightening and, honestly, a bunch of fun.
Eager to see where this learning path takes me next!
"""

# Counting and displaying tokens and their frequency
from collections import Counter
def show_frequencies(tokens, method_name):
    """Print how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens to count.
        method_name: Label printed in front of the frequency table.

    The counts are printed as a plain dict, followed by a blank line.
    """
    report = f"{method_name} Token Frequencies: {dict(Counter(tokens))}\n"
    print(report)

In [None]:
import nltk
import spacy
from transformers import BertTokenizer, XLNetTokenizer
from datetime import datetime

from time import perf_counter  # monotonic, high-resolution clock intended for measuring durations


def _timed_tokenize(tokenize, sample):
    """Run ``tokenize(sample)`` once and return ``(tokens, elapsed_seconds)``.

    Args:
        tokenize: Callable that maps a string to its tokens.
        sample: The text to tokenize.

    Returns:
        A tuple of the tokenizer's output and the elapsed time as a float number
        of seconds.
    """
    started = perf_counter()
    produced = tokenize(sample)
    return produced, perf_counter() - started


# Load every model/tokenizer before timing, so only the tokenization call is measured.
nlp = spacy.load("en_core_web_sm")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# Each *_time below is a float number of seconds, so the "seconds" unit in the
# report is accurate.
nltk_tokens, nltk_time = _timed_tokenize(nltk.word_tokenize, text)
spacy_tokens, spacy_time = _timed_tokenize(lambda sample: [token.text for token in nlp(sample)], text)
bert_tokens, bert_time = _timed_tokenize(bert_tokenizer.tokenize, text)
xlnet_tokens, xlnet_time = _timed_tokenize(xlnet_tokenizer.tokenize, text)

# Display tokens, time taken for each tokenizer, and token frequencies
for method_name, method_tokens, seconds in (
    ("NLTK", nltk_tokens, nltk_time),
    ("SpaCy", spacy_tokens, spacy_time),
    ("Bert", bert_tokens, bert_time),
    ("XLNet", xlnet_tokens, xlnet_time),
):
    print(f"{method_name} Tokens: {method_tokens}\nTime Taken: {seconds:.6f} seconds\n")
    show_frequencies(method_tokens, method_name)