In [57]:
import nltk

In [58]:
# Punkt sentence-boundary models; required by sent_tokenize / word_tokenize below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sbommireddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [59]:
# Stop-word lists; read later through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sbommireddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
# Part-of-speech tagger model; required by nltk.pos_tag below.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sbommireddy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [84]:
# WordNet corpus; required by wn.synsets and lesk in the word-sense section.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sbommireddy/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [61]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [62]:
# Sample passage reused by the tokenization, stop-word, stemming and POS-tagging cells.
# It deliberately contains "closed", "closing" and "close" so stemming has something to collapse.
text = "Mary had a little lamb. Her fleece were white as snow. Mary closed on closing the night when she was in a mood to close."

- Stemming
- Lemmatization
- Tokenization
- Part-of-speech tagging
- Word sense disambiguation
- Stop-word removal
- Bigrams
- Frequency of occurrence and weights

In [63]:
# Split the passage into a list of sentence strings.
sents = sent_tokenize(text)

In [64]:
sents

['Mary had a little lamb.',
 'Her fleece were white as snow.',
 'Mary closed on closing the night when she was in a mood to close.']

In [65]:
# Tokenize every sentence separately, giving one list of tokens per sentence.
words = list(map(word_tokenize, sents))

In [66]:
words

[['Mary', 'had', 'a', 'little', 'lamb', '.'],
 ['Her', 'fleece', 'were', 'white', 'as', 'snow', '.'],
 ['Mary',
  'closed',
  'on',
  'closing',
  'the',
  'night',
  'when',
  'she',
  'was',
  'in',
  'a',
  'mood',
  'to',
  'close',
  '.']]

## Stop Word Removal

In [67]:
from nltk.corpus import stopwords

In [68]:
from string import punctuation

In [69]:
# Stop set = NLTK's English stop words plus every ASCII punctuation character.
customstopwords = set(stopwords.words('english')) | set(punctuation)

In [70]:
customstopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [71]:
# Keep only the tokens that are not in the stop-word/punctuation set.
# NOTE(review): the membership test is case-sensitive and the stop list is
# lower-case, so capitalised stop words such as 'Her' are kept (see output).
# Comparing word.lower() would remove them, but would also change the
# recorded outputs of this cell and of the bigram cells that follow.
stopwordsremoved =  [ word for word in word_tokenize(text) if word not in customstopwords ]

In [72]:
stopwordsremoved

['Mary',
 'little',
 'lamb',
 'Her',
 'fleece',
 'white',
 'snow',
 'Mary',
 'closed',
 'closing',
 'night',
 'mood',
 'close']

## Construct Bigrams

In [73]:
from nltk.collocations import *

In [74]:
# Association-measure helper for scoring bigrams.
# NOTE(review): not used by any cell visible in this notebook section - confirm it is needed.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [75]:
# Build the bigram finder from the stop-word-filtered tokens. The class is
# referenced through its module so the origin of the name is explicit.
finder = nltk.collocations.BigramCollocationFinder.from_words(stopwordsremoved)

In [76]:
finder

<nltk.collocations.BigramCollocationFinder at 0x13fbabc50>

In [77]:
# ngram_fd maps each adjacent token pair to the number of times it occurs.
finder.ngram_fd.items()

dict_items([(('Mary', 'little'), 1), (('little', 'lamb'), 1), (('lamb', 'Her'), 1), (('Her', 'fleece'), 1), (('fleece', 'white'), 1), (('white', 'snow'), 1), (('snow', 'Mary'), 1), (('Mary', 'closed'), 1), (('closed', 'closing'), 1), (('closing', 'night'), 1), (('night', 'mood'), 1), (('mood', 'close'), 1)])

In [78]:
# Same (bigram, count) pairs, ordered by the bigram tuple for easier reading.
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'closed'), 1),
 (('Mary', 'little'), 1),
 (('closed', 'closing'), 1),
 (('closing', 'night'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('mood', 'close'), 1),
 (('night', 'mood'), 1),
 (('snow', 'Mary'), 1),
 (('white', 'snow'), 1)]

## Stemming

In [79]:
# NLTK provides several stemmers; here we use the Lancaster stemmer.
from nltk.stem.lancaster import LancasterStemmer

In [80]:
# Stem every token of the passage with the Lancaster stemmer and display the result.
st = LancasterStemmer()
stemmedwords = list(map(st.stem, word_tokenize(text)))
stemmedwords

['mary',
 'had',
 'a',
 'littl',
 'lamb',
 '.',
 'her',
 'fleec',
 'wer',
 'whit',
 'as',
 'snow',
 '.',
 'mary',
 'clos',
 'on',
 'clos',
 'the',
 'night',
 'when',
 'she',
 'was',
 'in',
 'a',
 'mood',
 'to',
 'clos',
 '.']

- `closed`, `closing` and `close` are all reduced to the same stem, `clos`

In [81]:
# Tag each token of the passage with its part of speech; yields (token, tag) pairs.
nltk.pos_tag(word_tokenize(text))

[('Mary', 'NNP'),
 ('had', 'VBD'),
 ('a', 'DT'),
 ('little', 'JJ'),
 ('lamb', 'NN'),
 ('.', '.'),
 ('Her', 'PRP$'),
 ('fleece', 'NN'),
 ('were', 'VBD'),
 ('white', 'JJ'),
 ('as', 'IN'),
 ('snow', 'NN'),
 ('.', '.'),
 ('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'VBG'),
 ('the', 'DT'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

- NNP : Proper noun, singular
- PRP : Personal pronoun (PRP$ : possessive pronoun)
- VBD : Verb, past tense

## Word Sense Disambiguation

In [82]:
from nltk.corpus import wordnet as wn

In [86]:
# List every WordNet synset (sense) of "bass" together with its definition.
for ss in wn.synsets('bass'):
    print(f"{ss} {ss.definition()}")

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [96]:
from nltk.wsd import lesk

In [97]:
# Lesk selects the sense of "bass" whose definition overlaps most with the context tokens.
wsense = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(f"{wsense} {wsense.definition()}")

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [104]:
# Same ambiguous word in a "singer" context.
wsense1 = lesk(word_tokenize("Kevin is the bass singer in the group"), "bass")
print(f"{wsense1} {wsense1.definition()}")

Synset('bass.n.03') an adult male singer with the lowest voice


In [99]:
# Same ambiguous word in a "food" context.
wsense2 = lesk(word_tokenize("I love eating bass"), "bass")
print(f"{wsense2} {wsense2.definition()}")

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


In [101]:
# Same ambiguous word in a "fishing" context.
wsense3 = lesk(word_tokenize("This sea bass is hard to catch"), "bass")
print(f"{wsense3} {wsense3.definition()}")

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


- Rule-based vs machine-learning approaches in NLP
- Classification vs clustering
- Naive Bayes classification
- SVM (Support Vector Machine)
- K-Means clustering
- Hierarchical clustering

## Feature extraction for NLP
- Term Frequency (TF) representation
- TF-IDF (Term Frequency–Inverse Document Frequency)

# Project 1 
- Auto-summarizing text using a rule-based approach
- Abstract Extraction
- Score each word by its importance, sum the word scores in each sentence, and pick the top-scoring sentences

### Retrieve text

In [105]:
from bs4 import BeautifulSoup
import requests

### Preprocess text

### Extract sentences

# Project 2
- Classifying text using an ML approach