<a href="https://colab.research.google.com/github/siddhi-svg/NLP/blob/main/tokenization_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
text = """One day, a wealthy man came to Akbar's court in hope to get help
from Birbal. The man suspected that one of his servants had stolen
from him. The clever Birbal thought of a plan and gave all the merchant’s
servants sticks of the same length."""

# Word tokenization using split()
word_tokens = text.split()
print(word_tokens)


['One', 'day,', 'a', 'wealthy', 'man', 'came', 'to', "Akbar's", 'court', 'in', 'hope', 'to', 'get', 'help', 'from', 'Birbal.', 'The', 'man', 'suspected', 'that', 'one', 'of', 'his', 'servants', 'had', 'stolen', 'from', 'him.', 'The', 'clever', 'Birbal', 'thought', 'of', 'a', 'plan', 'and', 'gave', 'all', 'the', 'merchant’s', 'servants', 'sticks', 'of', 'the', 'same', 'length.']


In [2]:
sent_tokens = text.split('. ')
print(sent_tokens)


["One day, a wealthy man came to Akbar's court in hope to get help \nfrom Birbal", 'The man suspected that one of his servants had stolen \nfrom him', 'The clever Birbal thought of a plan and gave all the merchant’s \nservants sticks of the same length.']


In [3]:
import re

tokens = re.findall(r"[\w']+", text)
print(tokens)


['One', 'day', 'a', 'wealthy', 'man', 'came', 'to', "Akbar's", 'court', 'in', 'hope', 'to', 'get', 'help', 'from', 'Birbal', 'The', 'man', 'suspected', 'that', 'one', 'of', 'his', 'servants', 'had', 'stolen', 'from', 'him', 'The', 'clever', 'Birbal', 'thought', 'of', 'a', 'plan', 'and', 'gave', 'all', 'the', 'merchant', 's', 'servants', 'sticks', 'of', 'the', 'same', 'length']


In [4]:
sentences = re.split(r'[.!?] ', text)
print(sentences)


["One day, a wealthy man came to Akbar's court in hope to get help \nfrom Birbal", 'The man suspected that one of his servants had stolen \nfrom him', 'The clever Birbal thought of a plan and gave all the merchant’s \nservants sticks of the same length.']


In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Added to download the missing resource

from nltk.tokenize import word_tokenize

print(word_tokenize(text))

['One', 'day', ',', 'a', 'wealthy', 'man', 'came', 'to', 'Akbar', "'s", 'court', 'in', 'hope', 'to', 'get', 'help', 'from', 'Birbal', '.', 'The', 'man', 'suspected', 'that', 'one', 'of', 'his', 'servants', 'had', 'stolen', 'from', 'him', '.', 'The', 'clever', 'Birbal', 'thought', 'of', 'a', 'plan', 'and', 'gave', 'all', 'the', 'merchant', '’', 's', 'servants', 'sticks', 'of', 'the', 'same', 'length', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
from nltk.tokenize import sent_tokenize

print(sent_tokenize(text))


["One day, a wealthy man came to Akbar's court in hope to get help \nfrom Birbal.", 'The man suspected that one of his servants had stolen \nfrom him.', 'The clever Birbal thought of a plan and gave all the merchant’s \nservants sticks of the same length.']


In [7]:
from nltk.tokenize import wordpunct_tokenize

sample = "Hi, I am visiting Florida this weekend. #Florida #USA.Go"

print("word_tokenize:", word_tokenize(sample))
print("wordpunct_tokenize:", wordpunct_tokenize(sample))


word_tokenize: ['Hi', ',', 'I', 'am', 'visiting', 'Florida', 'this', 'weekend', '.', '#', 'Florida', '#', 'USA.Go']
wordpunct_tokenize: ['Hi', ',', 'I', 'am', 'visiting', 'Florida', 'this', 'weekend', '.', '#', 'Florida', '#', 'USA', '.', 'Go']


In [8]:
from nltk.tokenize import TreebankWordTokenizer

tree = TreebankWordTokenizer()
sample2 = "They'll not be happy. Don't go."

print(tree.tokenize(sample2))


['They', "'ll", 'not', 'be', 'happy.', 'Do', "n't", 'go', '.']


In [9]:
from nltk.tokenize import MWETokenizer

text_mwe = "Hi, I can't go there, Go Delhi #Delhi"
tokenizer = MWETokenizer()

# Before adding MWE
print(tokenizer.tokenize(word_tokenize(text_mwe)))

# Adding MWE: "Go Delhi"
tokenizer.add_mwe(("Go", "Delhi"))
print(tokenizer.tokenize(word_tokenize(text_mwe)))


['Hi', ',', 'I', 'ca', "n't", 'go', 'there', ',', 'Go', 'Delhi', '#', 'Delhi']
['Hi', ',', 'I', 'ca', "n't", 'go', 'there', ',', 'Go_Delhi', '#', 'Delhi']


In [10]:
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
word_tokens_spacy = [token.text for token in doc]
print(word_tokens_spacy)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
['One', 'day', ',', 'a', 'wealthy', 'man', 'came', 'to', 'Akbar', "'s", 'court', 'in', 'hope', 'to', 'get', 'help', '\n', 'from', 'Birbal', '.', 'The', 'man', 'suspected', 'that', 'one', 'of', 'his', 'servants', 'had', 'stolen', '\n', 'from', 'him', '.', 'The', 'clever', 'Birbal', 'thought', 'of', 'a', 'plan', 'and', 'gave', 'all', '

In [11]:
for sent in doc.sents:
    print(sent.text)


One day, a wealthy man came to Akbar's court in hope to get help 
from Birbal.
The man suspected that one of his servants had stolen 
from him.
The clever Birbal thought of a plan and gave all the merchant’s 
servants sticks of the same length.


In [12]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

result = text_to_word_sequence(text)
print(result)

['one', 'day', 'a', 'wealthy', 'man', 'came', 'to', "akbar's", 'court', 'in', 'hope', 'to', 'get', 'help', 'from', 'birbal', 'the', 'man', 'suspected', 'that', 'one', 'of', 'his', 'servants', 'had', 'stolen', 'from', 'him', 'the', 'clever', 'birbal', 'thought', 'of', 'a', 'plan', 'and', 'gave', 'all', 'the', 'merchant’s', 'servants', 'sticks', 'of', 'the', 'same', 'length']


In [13]:
!pip install gensim
from gensim.utils import tokenize

print(list(tokenize(text)))


['One', 'day', 'a', 'wealthy', 'man', 'came', 'to', 'Akbar', 's', 'court', 'in', 'hope', 'to', 'get', 'help', 'from', 'Birbal', 'The', 'man', 'suspected', 'that', 'one', 'of', 'his', 'servants', 'had', 'stolen', 'from', 'him', 'The', 'clever', 'Birbal', 'thought', 'of', 'a', 'plan', 'and', 'gave', 'all', 'the', 'merchant', 's', 'servants', 'sticks', 'of', 'the', 'same', 'length']
