[View in Colaboratory](https://colab.research.google.com/github/prikulkarni/nlp/blob/master/nltk_colab.ipynb)

### NLTK import and Download

In [0]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

### Tokenization

Normal word tokenize

In [3]:
from nltk.tokenize import word_tokenize

example_text = "Hello, how are you?"
for i in word_tokenize(example_text):
    print(i)

Hello
,
how
are
you
?


Regexp tokenizer - remove punctuation

In [0]:
from nltk.tokenize import RegexpTokenizer

example_text = "I'm interested, thanks!"
tokenizer = RegexpTokenizer(r'\w+')
for i in tokenizer.tokenize(example_text):
    print(i)

I
m
interested


tokenizer - remove punctuation but keep contractions

In [0]:
import string
import re
from nltk.tokenize import word_tokenize

example_text = "I'm interested, thanks! But I can't."
tokenized = word_tokenize(example_text)
tokenized_contractions = [w for w in tokenized if not re.fullmatch('[' + string.punctuation + ']+', w)]
print(tokenized_contractions)


['I', "'m", 'interested', 'thanks', 'But', 'I', 'ca', "n't"]


### Stopword Removal

In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_text = "Hello, how are you? This is an example."
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_text)

# filtered_sentence = []
# for w in words:
#     if w not in stop_words:
#         filtered_sentence.append(w)

filtered_sentence = [w for w in words if not w in stop_words]
print(filtered_sentence)


### Stemming

In [0]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
example_text = "Hiding in the important queen's gardens"
words = word_tokenize(example_text)
for w in words:
    print(ps.stem(w))

hide
in
the
import
queen
's
garden


### Parts of Speech Tagging

In [1]:
from nltk.corpus import gutenberg
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
import nltk

sample_text = gutenberg.raw("austen-emma.txt")
pst = PunktSentenceTokenizer()
tokenized = pst.tokenize(sample_text)

def pos_tagger():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
pos_tagger()

[('[', 'NNS'), ('Emma', 'NNP'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), (']', 'NNP'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), (',', ','), ('handsome', 'NN'), (',', ','), ('clever', 'NN'), (',', ','), ('and', 'CC'), ('rich', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN'), ('and', 'CC'), ('happy', 'JJ'), ('disposition', 'NN'), (',', ','), ('seemed', 'VBD'), ('to', 'TO'), ('unite', 'VB'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('blessings', 'NNS'), ('of', 'IN'), ('existence', 'NN'), (';', ':'), ('and', 'CC'), ('had', 'VBD'), ('lived', 'VBN'), ('nearly', 'RB'), ('twenty-one', 'CD'), ('years', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('with', 'IN'), ('very', 'RB'), ('little', 'JJ'), ('to', 'TO'), ('distress', 'VB'), ('or', 'CC'), ('vex', 'VB'), ('her', 'PRP'), ('.', '.')]
[('She', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('young

### Summary

tokenize, remove stop words and punctuation, stem and pos tagging

In [2]:
from nltk.corpus import gutenberg
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import string

sample_text = gutenberg.raw("austen-emma.txt")
pst = PunktSentenceTokenizer()
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))
tokenized = pst.tokenize(sample_text)

def tagger():
    try:
        for i in tokenized:
            words_tokenized = word_tokenize(i)
            #print(words_tokenized)
            words = [w for w in words_tokenized if not re.fullmatch('[' + string.punctuation + ']+', w)]
            #print(words)
            stopped_words = [w for w in words if not w in stop_words]
            #print(stopped_words)
            stemmed_words = [ps.stem(w)for w in stopped_words]
            #print(stemmed_words)
            tagged = nltk.pos_tag(stemmed_words)
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
tagger()

[('emma', 'NN'), ('jane', 'NN'), ('austen', 'IN'), ('1816', 'CD'), ('volum', 'NN'), ('I', 'PRP'), ('chapter', 'VBP'), ('I', 'PRP'), ('emma', 'VBP'), ('woodhous', 'JJ'), ('handsom', 'NN'), ('clever', 'NN'), ('rich', 'JJ'), ('comfort', 'NN'), ('home', 'NN'), ('happi', 'NN'), ('disposit', 'NN'), ('seem', 'VBP'), ('unit', 'NN'), ('best', 'JJS'), ('bless', 'JJ'), ('exist', 'VBP'), ('live', 'JJ'), ('nearli', 'NN'), ('twenty-on', 'JJ'), ('year', 'NN'), ('world', 'NN'), ('littl', 'NN'), ('distress', 'NN'), ('vex', 'NN')]
[('she', 'PRP'), ('youngest', 'JJS'), ('two', 'CD'), ('daughter', 'NN'), ('affection', 'NN'), ('indulg', 'NN'), ('father', 'NN'), ('consequ', 'NN'), ('sister', 'NN'), ("'s", 'POS'), ('marriag', 'NN'), ('mistress', 'NN'), ('hous', 'JJ'), ('earli', 'JJ'), ('period', 'NN')]
[('her', 'PRP$'), ('mother', 'NN'), ('die', 'NN'), ('long', 'RB'), ('ago', 'RB'), ('indistinct', 'JJ'), ('remembr', 'NN'), ('caress', 'NN'), ('place', 'NN'), ('suppli', 'JJ'), ('excel', 'NN'), ('woman', 'NN'),