In [2]:
import nltk

# Fetch the individually named resources first.
for package_id in ("stopwords", "punkt_tab"):
    nltk.download(package_id)

# NOTE(review): "all" pulls the entire NLTK data collection (see the long log
# below); consider listing only the packages this notebook actually needs.
# Kept as the cell's final expression so its return value is still displayed.
nltk.download("all")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root

True

In [7]:
from nltk import word_tokenize, sent_tokenize

# Two short sentences containing contractions; the adjacent literals are
# joined into a single string.
sent = (
    "I'd like some tea, if you have any."
    " I'm ready to help in any way I can"
)

# First line of output: word-level tokens; second line: the sentences.
print(word_tokenize(sent), sent_tokenize(sent), sep="\n")

['I', "'d", 'like', 'some', 'tea', ',', 'if', 'you', 'have', 'any', '.', 'I', "'m", 'ready', 'to', 'help', 'in', 'any', 'way', 'I', 'can']
["I'd like some tea, if you have any.", "I'm ready to help in any way I can"]


In [8]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# Sample text reused by the following tokenizer cells; it mixes periods,
# curly quotes, a colon and a comma-separated list.
sentences = (
    "an ambitious and aspiring young person. "
    "“a lofty aspirant” synonyms: aspirer, hopeful, wannabe, wannabee, applicant, applier."
    " a person who requests or seeks something such as assistance or employment or admission."
)

# In the printed result each punctuation mark shows up as a separate token.
tokens = tokenizer.tokenize(sentences)
print(tokens)

['an', 'ambitious', 'and', 'aspiring', 'young', 'person', '.', '“', 'a', 'lofty', 'aspirant', '”', 'synonyms', ':', 'aspirer', ',', 'hopeful', ',', 'wannabe', ',', 'wannabee', ',', 'applicant', ',', 'applier', '.', 'a', 'person', 'who', 'requests', 'or', 'seeks', 'something', 'such', 'as', 'assistance', 'or', 'employment', 'or', 'admission', '.']


In [9]:
# Tokenize the same sample text with word_tokenize for comparison.
# For this text the printed tokens are identical to the WordPunctTokenizer
# output shown in the previous cell.
with_word_tokenizer = word_tokenize(sentences)
print(with_word_tokenizer)

['an', 'ambitious', 'and', 'aspiring', 'young', 'person', '.', '“', 'a', 'lofty', 'aspirant', '”', 'synonyms', ':', 'aspirer', ',', 'hopeful', ',', 'wannabe', ',', 'wannabee', ',', 'applicant', ',', 'applier', '.', 'a', 'person', 'who', 'requests', 'or', 'seeks', 'something', 'such', 'as', 'assistance', 'or', 'employment', 'or', 'admission', '.']


In [10]:
# Treebank word tokenizer: tokenize the same sample text with
# TreebankWordTokenizer. In the output below, the periods after "person" and
# "applier" and the curly quotes stay attached to the neighbouring word
# ('person.', '“a', 'aspirant”', 'applier.'); only the final period is split off.
# NOTE: this rebinds `tokenizer` and `tokens` from the WordPunctTokenizer cell.
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sentences)
print(tokens)

['an', 'ambitious', 'and', 'aspiring', 'young', 'person.', '“a', 'lofty', 'aspirant”', 'synonyms', ':', 'aspirer', ',', 'hopeful', ',', 'wannabe', ',', 'wannabee', ',', 'applicant', ',', 'applier.', 'a', 'person', 'who', 'requests', 'or', 'seeks', 'something', 'such', 'as', 'assistance', 'or', 'employment', 'or', 'admission', '.']


In [None]:
"""
 stemming means to reduce the words to their root form or base form.
 This is achived by using Stemming and lemmatization, so that we can understand
 meaning behind it.

 for example: words like "play", "plays", "Played", "playing" all refer to the same action and can therefore be mapped to the common base from play
 """

In [13]:
from nltk.stem import PorterStemmer

# Porter stemmer instance; kept at module level because a later cell reuses it.
porter = PorterStemmer()

# Stem several inflected forms of "play"; each stem is printed on its own line.
print(
    *(porter.stem(form) for form in ("play", "playing", "plays", "played")),
    sep="\n",
)

play
play
play
play


In [14]:
"""
like the play and all verb are converted into base form
but that will not be always the case
"""
print(porter.stem("communication"))

commun


In [17]:
"""
lemmatization means grouping together the same attribute word or grammatical word.
So, we can reach out the base form of any word which will be meaningful in nature.
This is also called lemma.
"""
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("plays", "v"))
print(lemmatizer.lemmatize("played", "v"))
print(lemmatizer.lemmatize("playing", "v"))

play
play
play


In [18]:
# A lemmatizer needs the word's part of speech passed as the second argument.
# Compared with the stemmer (which produced "commun"), the lemmatizer returns
# a meaningful word, provided the part of speech is supplied.
# "communication" is a noun, so pass pos="n" rather than the verb tag "v".
print(lemmatizer.lemmatize("communication", "n"))

communication


In [22]:
# Part of Speech Tagging
"""it means that assigning each word of sentence to their respective part of speech.
It helps to give a better syntactic meaning of the sentence. syntactic means how ordering of the words in a sentence
matters as well, like subject + verb + object. who is saying to whom,
"""

'it means that assigning each word of sentence to their respective part of speech. \nIt helps to give a better syntactic meaning of the sentence. syntactic means how ordering of the words in a sentence\nmatters as well, like subject + verb + object. who is saying to whom,\n'

In [23]:
from nltk import pos_tag

# Sample paragraph to tag; the adjacent literals are joined into one string.
text = (
    "it means that assigning each word of sentence to their respective part of speech. "
    "It helps to give a better syntactic meaning of the sentence. "
    "syntactic means how ordering of the words in a sentence matters as well, like subject + verb + object. who is saying to whom."
)

# Tokenize first, then tag: pos_tag receives the token list, and the printed
# result is a list of (token, tag) pairs.
tokenized_text = word_tokenize(text)
tokens_tag = pos_tag(tokenized_text)
tags = tokens_tag  # both names refer to the same list
print(tags)

[('it', 'PRP'), ('means', 'VBZ'), ('that', 'IN'), ('assigning', 'VBG'), ('each', 'DT'), ('word', 'NN'), ('of', 'IN'), ('sentence', 'NN'), ('to', 'TO'), ('their', 'PRP$'), ('respective', 'JJ'), ('part', 'NN'), ('of', 'IN'), ('speech', 'NN'), ('.', '.'), ('It', 'PRP'), ('helps', 'VBZ'), ('to', 'TO'), ('give', 'VB'), ('a', 'DT'), ('better', 'JJR'), ('syntactic', 'JJ'), ('meaning', 'NN'), ('of', 'IN'), ('the', 'DT'), ('sentence', 'NN'), ('.', '.'), ('syntactic', 'JJ'), ('means', 'VBZ'), ('how', 'WRB'), ('ordering', 'NN'), ('of', 'IN'), ('the', 'DT'), ('words', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('sentence', 'NN'), ('matters', 'NNS'), ('as', 'RB'), ('well', 'RB'), (',', ','), ('like', 'IN'), ('subject', 'JJ'), ('+', 'NNP'), ('verb', 'NN'), ('+', 'NN'), ('object', 'NN'), ('.', '.'), ('who', 'WP'), ('is', 'VBZ'), ('saying', 'VBG'), ('to', 'TO'), ('whom', 'WP'), ('.', '.')]
