In [2]:
import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import spacy
from gensim.utils import tokenize
from keras.preprocessing.text import text_to_word_sequence

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#Word Tokenization
Word tokenization breaks text into individual words or tokens, disregarding punctuation and special characters.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import spacy
from gensim.utils import tokenize
from keras.preprocessing.text import text_to_word_sequence

In [23]:
paragraph = "Hey there! I'm Simran Basu, a passionate student currently pursuing my Master's in Artificial Intelligence and Machine Learning at Christ University in Bangalore 🎓. This semester, I'm diving deep into fascinating subjects like Natural Language Processing (NLP), Computer Vision (CV), Deep Learning, Augmented Reality/Virtual Reality (AR/VR), and the application of AI in Cognitive Sciences. I don't think it's just about studying; it's about understanding the intricate dynamics of these cutting-edge technologies 🧠. Living in Bangalore, known as the Silicon Valley of India, offers an exciting environment for exploring the latest advancements in AI and ML. From innovative startups to leading tech companies, Bangalore provides ample opportunities to apply theoretical knowledge gained in the classroom to real-world projects. As I navigate through this enriching academic journey, I'm eager to delve into the intricacies of AI and ML, sharpen my skills, and contribute meaningfully to the field. Let's embark on this exciting adventure together! 🚀✨"

# NLTK Word Tokenization
nltk_tokens = word_tokenize(paragraph)

# TextBlob Word Tokenization
blob = TextBlob(paragraph)
textblob_tokens = blob.words

# spaCy Word Tokenization
nlp = spacy.load("en_core_web_sm")
doc = nlp(paragraph)
spacy_tokens = [token.text for token in doc]

# Gensim Word Tokenization
gensim_tokens = list(tokenize(paragraph, lowercase=True))

# Keras Word Tokenization
keras_tokens = text_to_word_sequence(paragraph)

print("NLTK Word Tokenization:", nltk_tokens)
print("TextBlob Word Tokenization:", textblob_tokens) # TextBlob's tokenization provides a simple and effective way to break down text into its constituent words and punctuation marks.
print("spaCy Word Tokenization:", spacy_tokens) #spaCy tokenizer provides tokenization along with part-of-speech tagging, dependency parsing, and named entity recognition.
print("Gensim Word Tokenization:", gensim_tokens) #Gensim tokenizer focuses on creating tokens suitable for topic modeling and document similarity tasks.
print("Keras Word Tokenization:", keras_tokens) #Keras provides a simple tokenization method suitable for neural network-based text processing tasks.

NLTK Word Tokenization: ['Hey', 'there', '!', 'I', "'m", 'Simran', 'Basu', ',', 'a', 'passionate', 'student', 'currently', 'pursuing', 'my', 'Master', "'s", 'in', 'Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'at', 'Christ', 'University', 'in', 'Bangalore', '🎓', '.', 'This', 'semester', ',', 'I', "'m", 'diving', 'deep', 'into', 'fascinating', 'subjects', 'like', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', ',', 'Computer', 'Vision', '(', 'CV', ')', ',', 'Deep', 'Learning', ',', 'Augmented', 'Reality/Virtual', 'Reality', '(', 'AR/VR', ')', ',', 'and', 'the', 'application', 'of', 'AI', 'in', 'Cognitive', 'Sciences', '.', 'I', 'do', "n't", 'think', 'it', "'s", 'just', 'about', 'studying', ';', 'it', "'s", 'about', 'understanding', 'the', 'intricate', 'dynamics', 'of', 'these', 'cutting-edge', 'technologies', '🧠', '.', 'Living', 'in', 'Bangalore', ',', 'known', 'as', 'the', 'Silicon', 'Valley', 'of', 'India', ',', 'offers', 'an', 'exciting', 'environment', 'for', 'ex

#Sentence Tokenization
Sentence tokenization divides text into individual sentences based on punctuation and language-specific rules.

In [17]:
from nltk.tokenize import sent_tokenize

# NLTK Sentence Tokenization
nltk_sentences = sent_tokenize(paragraph)

# spaCy Sentence Tokenization
spacy_sentences = [sent.text for sent in doc.sents]

print("NLTK Sentence Tokenization:", nltk_sentences)
print("spaCy Sentence Tokenization:", spacy_sentences)


NLTK Sentence Tokenization: ['Hey there!', "I'm Simran Basu, a passionate student currently pursuing my Master's in Artificial Intelligence and Machine Learning at Christ University in Bangalore 🎓.", "This semester, I'm diving deep into fascinating subjects like Natural Language Processing (NLP), Computer Vision (CV), Deep Learning, Augmented Reality/Virtual Reality (AR/VR), and the application of AI in Cognitive Sciences.", "It's not just about studying; it's about understanding the intricate dynamics of these cutting-edge technologies 🧠.", 'Living in Bangalore, known as the Silicon Valley of India, offers an exciting environment for exploring the latest advancements in AI and ML.', 'From innovative startups to leading tech companies, Bangalore provides ample opportunities to apply theoretical knowledge gained in the classroom to real-world projects.', "As I navigate through this enriching academic journey, I'm eager to delve into the intricacies of AI and ML, sharpen my skills, and c

#Punctuation tokenizer
Punctuation-based tokenization splits text based on punctuation marks, treating them as separate tokens.



In [25]:
import nltk
from nltk.tokenize import WordPunctTokenizer
import spacy


text = "Hey there! I'm Simran Basu, a passionate student currently pursuing my Master's in Artificial Intelligence and Machine Learning at Christ University in Bangalore 🎓. This semester, I'm diving deep into fascinating subjects like Natural Language Processing (NLP), Computer Vision (CV), Deep Learning, Augmented Reality/Virtual Reality (AR/VR), and the application of AI in Cognitive Sciences. I don't think it's just about studying; it's about understanding the intricate dynamics of these cutting-edge technologies 🧠. Living in Bangalore, known as the Silicon Valley of India, offers an exciting environment for exploring the latest advancements in AI and ML. From innovative startups to leading tech companies, Bangalore provides ample opportunities to apply theoretical knowledge gained in the classroom to real-world projects. As I navigate through this enriching academic journey, I'm eager to delve into the intricacies of AI and ML, sharpen my skills, and contribute meaningfully to the field. Let's embark on this exciting adventure together! 🚀✨"

# NLTK Punctuation-based Tokenizer
punkt_tokenizer = WordPunctTokenizer()
nltk_punkt_tokens = punkt_tokenizer.tokenize(text)
print("NLTK Punctuation-based Tokenization:", nltk_punkt_tokens)

# spaCy Punctuation-based Tokenizer
nlp = spacy.load("en_core_web_sm")
custom_tokenizer = nlp.tokenizer
doc = custom_tokenizer(text)
spacy_punct_tokens = [token.text for token in doc if token.is_punct or token.text in ['🎓', '🧠', '🚀', '✨']]
print("spaCy Punctuation-based Tokenization:", spacy_punct_tokens)



NLTK Punctuation-based Tokenization: ['Hey', 'there', '!', 'I', "'", 'm', 'Simran', 'Basu', ',', 'a', 'passionate', 'student', 'currently', 'pursuing', 'my', 'Master', "'", 's', 'in', 'Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'at', 'Christ', 'University', 'in', 'Bangalore', '🎓.', 'This', 'semester', ',', 'I', "'", 'm', 'diving', 'deep', 'into', 'fascinating', 'subjects', 'like', 'Natural', 'Language', 'Processing', '(', 'NLP', '),', 'Computer', 'Vision', '(', 'CV', '),', 'Deep', 'Learning', ',', 'Augmented', 'Reality', '/', 'Virtual', 'Reality', '(', 'AR', '/', 'VR', '),', 'and', 'the', 'application', 'of', 'AI', 'in', 'Cognitive', 'Sciences', '.', 'I', 'don', "'", 't', 'think', 'it', "'", 's', 'just', 'about', 'studying', ';', 'it', "'", 's', 'about', 'understanding', 'the', 'intricate', 'dynamics', 'of', 'these', 'cutting', '-', 'edge', 'technologies', '🧠.', 'Living', 'in', 'Bangalore', ',', 'known', 'as', 'the', 'Silicon', 'Valley', 'of', 'India', ',', 'offers', 'a

#Treebank Word Tokenizer

The Treebank tokenizer follows the conventions of the Penn Treebank corpus for tokenization, which includes handling of contractions and punctuation.

In [19]:
from nltk.tokenize import TreebankWordTokenizer

# NLTK Treebank Word tokenizer
treebank_tokenizer = TreebankWordTokenizer()
nltk_treebank_tokens = treebank_tokenizer.tokenize(paragraph)

print("NLTK Treebank Word tokenizer:", nltk_treebank_tokens)


NLTK Treebank Word tokenizer: ['Hey', 'there', '!', 'I', "'m", 'Simran', 'Basu', ',', 'a', 'passionate', 'student', 'currently', 'pursuing', 'my', 'Master', "'s", 'in', 'Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'at', 'Christ', 'University', 'in', 'Bangalore', '🎓.', 'This', 'semester', ',', 'I', "'m", 'diving', 'deep', 'into', 'fascinating', 'subjects', 'like', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', ',', 'Computer', 'Vision', '(', 'CV', ')', ',', 'Deep', 'Learning', ',', 'Augmented', 'Reality/Virtual', 'Reality', '(', 'AR/VR', ')', ',', 'and', 'the', 'application', 'of', 'AI', 'in', 'Cognitive', 'Sciences.', 'It', "'s", 'not', 'just', 'about', 'studying', ';', 'it', "'s", 'about', 'understanding', 'the', 'intricate', 'dynamics', 'of', 'these', 'cutting-edge', 'technologies', '🧠.', 'Living', 'in', 'Bangalore', ',', 'known', 'as', 'the', 'Silicon', 'Valley', 'of', 'India', ',', 'offers', 'an', 'exciting', 'environment', 'for', 'exploring', 'the', 'latest',

#Tweet Tokenizer

Tweet tokenizer is specifically designed to handle the unique characteristics of tweets, including hashtags, mentions, and emoji.

In [20]:
from nltk.tokenize import TweetTokenizer

# NLTK Tweet Tokenizer
tweet_tokenizer = TweetTokenizer()
nltk_tweet_tokens = tweet_tokenizer.tokenize(paragraph)

print("NLTK Tweet Tokenizer:", nltk_tweet_tokens)


NLTK Tweet Tokenizer: ['Hey', 'there', '!', "I'm", 'Simran', 'Basu', ',', 'a', 'passionate', 'student', 'currently', 'pursuing', 'my', "Master's", 'in', 'Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'at', 'Christ', 'University', 'in', 'Bangalore', '🎓', '.', 'This', 'semester', ',', "I'm", 'diving', 'deep', 'into', 'fascinating', 'subjects', 'like', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', ',', 'Computer', 'Vision', '(', 'CV', ')', ',', 'Deep', 'Learning', ',', 'Augmented', 'Reality', '/', 'Virtual', 'Reality', '(', 'AR', '/', 'VR', ')', ',', 'and', 'the', 'application', 'of', 'AI', 'in', 'Cognitive', 'Sciences', '.', "It's", 'not', 'just', 'about', 'studying', ';', "it's", 'about', 'understanding', 'the', 'intricate', 'dynamics', 'of', 'these', 'cutting-edge', 'technologies', '🧠', '.', 'Living', 'in', 'Bangalore', ',', 'known', 'as', 'the', 'Silicon', 'Valley', 'of', 'India', ',', 'offers', 'an', 'exciting', 'environment', 'for', 'exploring', 'the', 'latest',

#Multi-Word Expression Tokenizer

Identifies and tokenizes multi-word expressions or phrases that are treated as single tokens.

In [22]:
from nltk.tokenize import MWETokenizer

# NLTK Multi-Word Expression Tokenizer
mwe_tokenizer = MWETokenizer([('cosmic', 'forces'), ('celestial', 'language'), ('cosmic', 'dance')])
mwe_tokens = mwe_tokenizer.tokenize(paragraph.split())

print("NLTK Multi-Word Expression Tokenizer:", mwe_tokens)


NLTK Multi-Word Expression Tokenizer: ['Hey', 'there!', "I'm", 'Simran', 'Basu,', 'a', 'passionate', 'student', 'currently', 'pursuing', 'my', "Master's", 'in', 'Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'at', 'Christ', 'University', 'in', 'Bangalore', '🎓.', 'This', 'semester,', "I'm", 'diving', 'deep', 'into', 'fascinating', 'subjects', 'like', 'Natural', 'Language', 'Processing', '(NLP),', 'Computer', 'Vision', '(CV),', 'Deep', 'Learning,', 'Augmented', 'Reality/Virtual', 'Reality', '(AR/VR),', 'and', 'the', 'application', 'of', 'AI', 'in', 'Cognitive', 'Sciences.', "It's", 'not', 'just', 'about', 'studying;', "it's", 'about', 'understanding', 'the', 'intricate', 'dynamics', 'of', 'these', 'cutting-edge', 'technologies', '🧠.', 'Living', 'in', 'Bangalore,', 'known', 'as', 'the', 'Silicon', 'Valley', 'of', 'India,', 'offers', 'an', 'exciting', 'environment', 'for', 'exploring', 'the', 'latest', 'advancements', 'in', 'AI', 'and', 'ML.', 'From', 'innovative', 'startups',