In [1]:
from nltk import word_tokenize, sent_tokenize
sent = "Not all that Mrs. Bennet, however, with the assistance of her"
print(word_tokenize(sent))
print(sent_tokenize(sent))

['Not', 'all', 'that', 'Mrs.', 'Bennet', ',', 'however', ',', 'with', 'the', 'assistance', 'of', 'her']
['Not all that Mrs. Bennet, however, with the assistance of her']


In [2]:
from nltk.corpus import stopwords        # the corpus module is an 
                                         # extremely useful one. 
                                         # More on that later.
stop_words = stopwords.words('english')  # this is the full list of
                                         # all stop-words stored in
                                         # nltk
token = word_tokenize(sent)
cleaned_token = []
for word in token:
    if word not in stop_words:
        cleaned_token.append(word)
print("This is the unclean version:", token)
print("This is the cleaned version:", cleaned_token)


This is the unclean version: ['Not', 'all', 'that', 'Mrs.', 'Bennet', ',', 'however', ',', 'with', 'the', 'assistance', 'of', 'her']
This is the cleaned version: ['Not', 'Mrs.', 'Bennet', ',', 'however', ',', 'assistance']


In [3]:
#Stemming
'''
This is when ‘fluff’ letters (not words) are removed from a word and grouped together with its “stem form”. 
For instance, the words ‘play’, ‘playing’, or ‘plays’ convey the same meaning (although, again, not exactly, 
but for analysis with a computer, that sort of detail is still not a viable option). 
So instead of having them as different words, we can put them together under the same umbrella term ‘play’.
'''
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ['play', 'playing', 'plays', 'played',
         'playfullness', 'playful']
stemmed = [stemmer.stem(word) for word in words]
print(stemmed)

['play', 'play', 'play', 'play', 'playful', 'play']


In [4]:
sent2 = "five daughters, could ask on the subject, was sufficient to draw"
token = word_tokenize(sent2)
stemmed = ""
for word in token:
    stemmed += stemmer.stem(word) + " "
print(stemmed)

five daughter , could ask on the subject , wa suffici to draw 


In [5]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SONAL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [6]:
#Tagging Parts of Speech (pos)The next essential thing we want to do is tagging each word in the corpus 
#(a corpus is just a ‘bag’ of words) we created after converting sentences by tokenizing.

from nltk import pos_tag 
token = word_tokenize(sent) + word_tokenize(sent2)
tagged = pos_tag(cleaned_token)                 
print(tagged)

[('Not', 'RB'), ('Mrs.', 'NNP'), ('Bennet', 'NNP'), (',', ','), ('however', 'RB'), (',', ','), ('assistance', 'NN')]
