# Python Natural Language Processing

In [1]:
import nltk
# nltk.download()  # required if a package is missing

In [2]:
# Sample two-sentence input shared by the tokenization and stop-word cells below.
text = "Mary had a little lamb. Her fleece was white as snow"

from nltk.tokenize import word_tokenize, sent_tokenize



## Getting started

### Tokenize into sentences

In [3]:
# Split the sample text into a list of sentence strings; `sents` is reused
# by the word-tokenization cell.
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


### Tokenize into words

In [4]:
# Tokenize every sentence separately, giving one list of word tokens per
# sentence (punctuation marks come back as their own tokens).
words = list(map(word_tokenize, sents))
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


### Remove stop words

In [5]:
from nltk.corpus import stopwords
from string import punctuation

# Stop set = NLTK's English stop words plus every single punctuation character,
# so both kinds of token can be dropped with one membership test.
customStopWords = set(stopwords.words("english")).union(punctuation)

In [6]:
# Keep only the tokens of `text` that are not in the custom stop set.
# NOTE: the membership test is case-sensitive, so a capitalized token such as
# 'Her' is kept (see the printed result); compare `token.lower()` instead if
# capitalized stop words should be removed as well.
wordWOStopwords = [
    token
    for token in word_tokenize(text)
    if token not in customStopWords
]
print(wordWOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


### Identifying bigrams

In [8]:
# Import the two names this cell needs explicitly instead of using
# `from nltk.collocations import *`, so it is clear where each name comes from
# and the notebook namespace is not filled with unrelated names.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Association measures for scoring bigrams. Not used in this cell; kept so any
# later cell that refers to `bigram_measures` still works.
bigram_measures = BigramAssocMeasures()

# Build a bigram finder over the stop-word-filtered tokens. Because stop words
# and punctuation were already removed, words that were not adjacent in the
# original text (e.g. 'Mary', 'little') are counted as bigrams.
finder = BigramCollocationFinder.from_words(wordWOStopwords)

# Show every (bigram, count) pair, ordered by the bigram tuple.
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

### Stemming and POS tagging

Example: "closed", "closing" and "close" are different morphological forms of the same word; stemming reduces them to a common stem.

In [12]:
from nltk.stem.lancaster import LancasterStemmer

# Sentence containing three morphological forms of "close".
text2 = "Mary closed on closing night when she was in the mood to close."

# Stem each token with the Lancaster stemmer; as the printed result shows,
# "closed", "closing" and "close" all collapse to the same stem.
st = LancasterStemmer()
stemmedWords = list(map(st.stem, word_tokenize(text2)))
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


### Part-of-speech tags for words