In [None]:
# Installation (one-time setup; left commented out so re-running the notebook does not reinstall)
# !pip install nltk
import nltk
# One-time download of the NLTK data used by the cells below.
# NOTE(review): a bare nltk.download() presumably opens NLTK's interactive downloader; consider
# passing the specific resource names instead — confirm the exact ids for the installed NLTK version.
# nltk.download()

# Tokenization

In [1]:
from  nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
# Three short sentences: one with the abbreviation "Mr." and one with a hyphenated compound.
sample_text = (
    "Hello Mr. Brown, how are you? "
    "The weather is nice today. "
    "How is your mother-in-law?"
)

In [6]:
# Split the sample into sentences. In the recorded output the text yields three sentences and the
# period in "Mr." is not treated as a sentence boundary.
sent_tokenize(sample_text)

['Hello Mr. Brown, how are you?',
 'The weather is nice today.',
 'How is your mother-in-law?']

In [7]:
# Split the sample into word tokens. In the recorded output punctuation marks become their own
# tokens, while "Mr." and "mother-in-law" each stay a single token.
word_tokenize(sample_text)

['Hello',
 'Mr.',
 'Brown',
 ',',
 'how',
 'are',
 'you',
 '?',
 'The',
 'weather',
 'is',
 'nice',
 'today',
 '.',
 'How',
 'is',
 'your',
 'mother-in-law',
 '?']

# Normalization

### Stemming

In [8]:
# word_tokenize is also imported in the Tokenization section; it is not used by the stemming cell.
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Single Porter stemmer instance, reused by the stemming cell below.
ps = PorterStemmer()

In [12]:
# Stem several words derived from "legal" to compare the stems the Porter stemmer produces.
[ps.stem(word) for word in ["legal", "illegal", "legalize", "legally"]]

['legal', 'illeg', 'legal', 'legal']

### Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance shared by the lemmatization cells that follow.
lemmatizer = WordNetLemmatizer()

In [18]:
# Lemmatize a handful of regular and irregular plurals using the lemmatizer's default part of speech.
[lemmatizer.lemmatize(word) for word in ["puppies", "wolves", "abaci", "churches", "peoples"]]

['puppy', 'wolf', 'abacus', 'church', 'people']

In [20]:
# pos is the part of speech to lemmatize the word as; "a" selects adjective.
# In the recorded output this maps "better" to "good".
lemmatizer.lemmatize("better", pos="a")

'good'

### Part of Speech Tagging

In [21]:
from nltk import pos_tag
from nltk import word_tokenize, sent_tokenize
# Same sentence as in the Tokenization section, redefined here — presumably so this section can be
# run without the earlier cells.
sample_text = "Hello Mr. Brown, how are you? The weather is nice today. How is your mother-in-law?"

In [22]:
# Tokenize the sample, then tag each token; the recorded output is a list of (token, tag) pairs.
# The tag abbreviations are explained in the POS tag list that follows the output.
pos_tag(word_tokenize(sample_text))

[('Hello', 'NNP'),
 ('Mr.', 'NNP'),
 ('Brown', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('?', '.'),
 ('The', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('nice', 'JJ'),
 ('today', 'NN'),
 ('.', '.'),
 ('How', 'WRB'),
 ('is', 'VBZ'),
 ('your', 'PRP$'),
 ('mother-in-law', 'NN'),
 ('?', '.')]

''' POS Tag list:

    CC coordinating conjunction
    CD cardinal number
    DT determiner
    EX existential there (like: "there is" ... think of it like "there exists")
    FW foreign word
    IN preposition/subordinating conjunction
    JJ adjective 'big'
    JJR adjective, comparative 'bigger'
    JJS adjective, superlative 'biggest'
    LS list marker 1)
    MD modal could, will
    NN noun, singular 'desk'
    NNS noun plural 'desks'
    NNP proper noun, singular 'Harrison'
    NNPS proper noun, plural 'Americans'
    PDT predeterminer 'all the kids'
    POS possessive ending parent's
    PRP personal pronoun I, he, she
    PRP$ possessive pronoun my, his, hers
    RB adverb very, silently,
    RBR adverb, comparative better
    RBS adverb, superlative best
    RP particle give up
    TO to go 'to' the store.
    UH interjection errrrrrrrm
    VB verb, base form take
    VBD verb, past tense took
    VBG verb, gerund/present participle taking
    VBN verb, past participle taken
    VBP verb, sing. present, non-3rd person take
    VBZ verb, 3rd person sing. present takes
    WDT wh-determiner which
    WP wh-pronoun who, what
    WP$ possessive wh-pronoun whose
    WRB wh-adverb where, when
'''

### Stopwords

In [9]:
from nltk.corpus import stopwords
# Rebinds sample_text to a sentence that mixes content words with the function words "at", "as" and "a".
sample_text = "Rohit Nandi works at google AI/ML Headquarters as a Director"

In [10]:
# English stopword list from the NLTK corpus; kept as a list under its original name.
stop_words = stopwords.words("english")
# The NLTK English stopwords are lowercase, so tokens are lowercased before the comparison;
# otherwise a capitalised stopword (e.g. a sentence-initial "The") would not be removed.
# A set makes each membership check constant-time instead of scanning the list per token.
stop_word_set = set(stop_words)
# Keep the original token text; only the comparison is case-insensitive.
[word for word in word_tokenize(sample_text) if word.lower() not in stop_word_set]

['Rohit', 'Nandi', 'works', 'google', 'AI/ML', 'Headquarters', 'Director']

### Named Entity Recognition

In [11]:
# !pip install numpy
import nltk
import numpy as np
from nltk import pos_tag
from nltk import word_tokenize

In [12]:
# News-style sentence that mentions several people, organisations and places.
sample_text = (
    "Trump economic advisor Larry Kudlow told Fox Business Wednesday that the administration supports "
    "measures to increase oversight of Chinese firms, though the White House hasn't publicly stated an "
    "opinion on this particular legislation."
)

# Tokenize first, then POS-tag the tokens; `tag` is consumed by the NER and chunking cells.
tokenized_words = word_tokenize(sample_text)
tag = pos_tag(tokenized_words)

In [13]:
# Run NLTK's named-entity chunker over the POS-tagged tokens built in the previous cell.
# binary=True is passed to the chunker — presumably entities get one generic label rather than
# per-type labels; confirm against the NLTK documentation.
namedEnt = nltk.ne_chunk(tag, binary=True)
# NOTE(review): draw() appears to open a separate GUI window instead of rendering inline, which
# would likely stall "Run All" or fail on a headless machine — confirm for the target environment.
namedEnt.draw()

### Chunking

In [14]:
# Grammar with one rule, labelled "Chunk": one or more tags beginning with "NN" (the optional
# [SP]* and .? parts allow suffixes such as S or P), followed by one more tag matched by <.>.
# NOTE(review): <.> presumably matches only single-character tags such as "." or ",", so every
# chunk would have to end in punctuation — confirm this is intended.
rule = r'Chunk: {<NN[SP]*.?>+<.>}'
parser = nltk.RegexpParser(rule)
# `tag` is the POS-tagged token list created in the Named Entity Recognition section.
chunk = parser.parse(tag)

# NOTE(review): as with the NER tree, draw() appears to open a separate GUI window — confirm it
# works where this notebook is run.
chunk.draw()