In [42]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


# Tasks of NLP

1) **Tokenization** : Breaking down text into words/sentances

2) **Stopword Removal** : Filtering common words that may not add much meaning to the sentence

3) **Identifying N Grams** : Commonly occuring group of words("New York" which generally occurs together is Bigram)

4) **Word Sense Disambiguation** : Identifying the context in which the word occurs                                            
                                    -> The movie had really **cool** efects   
                                    -> I'd like a tall glass of **cool** water
                                    
5) **Identifying Parts-of-Speech** : Identifying which word is Noun, Adverb, Verb, etc.

6) **Stemming** : Removing ends of words(Close, Closed, Closely, Closer)

In [14]:
text = "Mary had a little lamb. Her fleece was white as snow"

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

### Tokenization

In [15]:
sents = sent_tokenize(text)

In [16]:
sents

['Mary had a little lamb.', 'Her fleece was white as snow']

In [17]:
words = [word_tokenize(sent) for sent in sents]

In [18]:
words

[['Mary', 'had', 'a', 'little', 'lamb', '.'],
 ['Her', 'fleece', 'was', 'white', 'as', 'snow']]

### Stopword Removal

In [19]:
from nltk.corpus import stopwords
from string import punctuation

In [20]:
customStopwords = set(stopwords.words('english') + list(punctuation))

In [21]:
customStopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [22]:
wordsWOStopwords = [word for word in word_tokenize(text) if word not in customStopwords]

In [23]:
wordsWOStopwords

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']

### Identifying N Grams

In this case, Bigrams

In [24]:
from nltk.collocations import *

In [25]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [26]:
bigram_measures

<nltk.metrics.association.BigramAssocMeasures at 0x1dab66606a0>

In [27]:
finder = BigramCollocationFinder.from_words(wordsWOStopwords)

In [28]:
finder

<nltk.collocations.BigramCollocationFinder at 0x1dab6756dd8>

In [30]:
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

### Stemming and Parts-of-Speech Tagging

In [31]:
text2 = "Mary closed on closing night when she was in the mood to close."

In [32]:
from nltk.stem.lancaster import LancasterStemmer

In [33]:
st = LancasterStemmer()

In [34]:
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]

In [35]:
stemmedWords

['mary',
 'clos',
 'on',
 'clos',
 'night',
 'when',
 'she',
 'was',
 'in',
 'the',
 'mood',
 'to',
 'clos',
 '.']

In [38]:
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

### Word Sense Disambiguation

In [40]:
from nltk.corpus import wordnet as wn                                #Wordnet is a lexicon(a little thesaurus)

In [44]:
for ss in wn.synsets('bass'):
    print(ss, ss.definition())      #Synset instances are the groupings of synonymous words that express the same concept

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [46]:
from nltk.wsd import lesk                #An algorithm for Word Sense Disambiguation

In [52]:
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), 'bass')

In [53]:
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [56]:
sense2 = lesk(word_tokenize("This sea bass was really hard to catch"), 'bass')

In [57]:
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
