In [1]:
text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

### Sentence Tokenizer

In [2]:
import nltk
from nltk.tokenize import sent_tokenize
tokenized_text=sent_tokenize(text)
print(tokenized_text)
print(len(tokenized_text))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]
4


### Word Tokenizer

In [3]:
from nltk.tokenize import word_tokenize
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


### Frequency

In [4]:
from nltk.probability import FreqDist
fdist = FreqDist(tokenized_word)
print(fdist)
fdist.most_common(5)

<FreqDist with 25 samples and 30 outcomes>


[('is', 3), ('.', 2), ('The', 2), (',', 2), ('Smith', 1)]

In [5]:
# Frequency Distribution Plot
import matplotlib.pyplot as plt
fdist.plot(30,cumulative=False)
plt.show()

<Figure size 640x480 with 1 Axes>

### Stop Words

In [6]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)
print('--------------------------------------------------------------')
print("Total stop words : {}".format(len(stop_words)))

{'other', 'more', 'into', 'whom', 'i', 'aren', 'any', 'by', "couldn't", 'was', 'itself', 'over', 'further', 'didn', "won't", 'ma', 'with', "aren't", 'our', 'out', 'the', "isn't", 'as', 'shan', 'did', 'just', "shan't", 't', 'under', "didn't", 'having', 'no', 'does', 'in', 'those', 're', 'again', 'such', 'ourselves', 'he', 'myself', 'd', 'nor', 'their', 'same', 'are', 'were', 'will', 'm', 'them', 'ain', 'before', 'wasn', "you've", 'been', 'me', 'most', 'because', 'be', 'had', 's', "wouldn't", 'her', 'above', 'himself', 'wouldn', 'hadn', 'against', 'o', 'how', "haven't", 'him', 'and', 'too', 'his', "doesn't", "it's", 'below', 'to', 'we', 'through', 'an', "don't", "mustn't", "wasn't", 'then', 'when', 'here', 'being', "you're", "she's", 'than', "hadn't", 'hers', 'off', 'mustn', 'during', 'between', 'it', 'couldn', 'for', 'which', 'where', "you'll", 'a', 'am', 'at', 'all', 'y', 'she', 'on', 'why', 'mightn', 'each', 've', 'ours', 'of', 'do', 'if', "that'll", 'very', 'now', 'is', 'that', 'your

### Removing Stopwords

In [7]:
filtered_sent=[]
for w in tokenized_word:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized Sentence:",tokenized_word)
print("Filterd Sentence:",filtered_sent)

Tokenized Sentence: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']
Filterd Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


### Stemming

In [8]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

stemmed_words=[]
for w in filtered_sent:
    stemmed_words.append(ps.stem(w))

print("Filtered Sentence:",filtered_sent)
print("Stemmed Sentence:",stemmed_words)

Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']
Stemmed Sentence: ['hello', 'mr.', 'smith', ',', 'today', '?', 'the', 'weather', 'great', ',', 'citi', 'awesom', '.', 'the', 'sky', 'pinkish-blu', '.', 'you', "n't", 'eat', 'cardboard']


### Lemmatization

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()

word = "flying"
print("Lemmatized Word:",lem.lemmatize(word,"v"))
print("Stemmed Word:",stem.stem(word))

Lemmatized Word: fly
Stemmed Word: fli


### POS Tagging

In [10]:
pos = nltk.pos_tag(tokenized_word)
print(pos)

[('Hello', 'NNP'), ('Mr.', 'NNP'), ('Smith', 'NNP'), (',', ','), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('doing', 'VBG'), ('today', 'NN'), ('?', '.'), ('The', 'DT'), ('weather', 'NN'), ('is', 'VBZ'), ('great', 'JJ'), (',', ','), ('and', 'CC'), ('city', 'NN'), ('is', 'VBZ'), ('awesome', 'JJ'), ('.', '.'), ('The', 'DT'), ('sky', 'NN'), ('is', 'VBZ'), ('pinkish-blue', 'JJ'), ('.', '.'), ('You', 'PRP'), ('should', 'MD'), ("n't", 'RB'), ('eat', 'VB'), ('cardboard', 'NN')]


In [11]:
for chunk in nltk.ne_chunk(pos):
    if hasattr(chunk, 'label'):
        label = chunk.label()
        name = ' '.join(c[0] for c in chunk.leaves())
        print(name, label)

Hello PERSON
Mr. Smith PERSON


In [12]:
from nltk.chunk import conlltags2tree, tree2conlltags
ne_tree = nltk.ne_chunk(pos)
iob_tagged = tree2conlltags(ne_tree)
print(iob_tagged)

[('Hello', 'NNP', 'B-PERSON'), ('Mr.', 'NNP', 'B-PERSON'), ('Smith', 'NNP', 'I-PERSON'), (',', ',', 'O'), ('how', 'WRB', 'O'), ('are', 'VBP', 'O'), ('you', 'PRP', 'O'), ('doing', 'VBG', 'O'), ('today', 'NN', 'O'), ('?', '.', 'O'), ('The', 'DT', 'O'), ('weather', 'NN', 'O'), ('is', 'VBZ', 'O'), ('great', 'JJ', 'O'), (',', ',', 'O'), ('and', 'CC', 'O'), ('city', 'NN', 'O'), ('is', 'VBZ', 'O'), ('awesome', 'JJ', 'O'), ('.', '.', 'O'), ('The', 'DT', 'O'), ('sky', 'NN', 'O'), ('is', 'VBZ', 'O'), ('pinkish-blue', 'JJ', 'O'), ('.', '.', 'O'), ('You', 'PRP', 'O'), ('should', 'MD', 'O'), ("n't", 'RB', 'O'), ('eat', 'VB', 'O'), ('cardboard', 'NN', 'O')]


In [13]:
ne_tree = conlltags2tree(iob_tagged)
print(ne_tree)

(S
  (PERSON Hello/NNP)
  (PERSON Mr./NNP Smith/NNP)
  ,/,
  how/WRB
  are/VBP
  you/PRP
  doing/VBG
  today/NN
  ?/.
  The/DT
  weather/NN
  is/VBZ
  great/JJ
  ,/,
  and/CC
  city/NN
  is/VBZ
  awesome/JJ
  ./.
  The/DT
  sky/NN
  is/VBZ
  pinkish-blue/JJ
  ./.
  You/PRP
  should/MD
  n't/RB
  eat/VB
  cardboard/NN)
