In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [2]:
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


In [3]:
phrases = sent_tokenize(data)
words = word_tokenize(data)

print(phrases)
print(words)

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


In [4]:
# remove stop words

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))   #  English stop words 
words = word_tokenize(data)
wordsFiltered = []

for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)

print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


In [5]:
print(len(stopWords))
print(stopWords)

179
{'at', 'couldn', 'is', 'a', 'no', 'very', 'his', 'some', 'here', 'them', 'during', 'should', 'shouldn', 'did', 'mustn', "you're", 'how', 'not', 'have', 'will', 'above', 'was', "wasn't", 'ma', 'those', 'under', 'because', "don't", 'it', "should've", 'then', 'i', 'few', 'aren', 'theirs', 'doing', 'y', "shouldn't", 'has', 'again', 'nor', 'needn', 'herself', 'now', 'which', "doesn't", "she's", 'won', 'themselves', 'ourselves', "haven't", 'more', "isn't", 'were', 've', 'before', 'you', 'own', 'further', 'hasn', 'he', 'out', "aren't", 'as', 'there', 'that', "needn't", 'and', 'haven', 'but', "it's", 'had', 'do', 'she', 'being', 'ain', 'of', 'don', 'mightn', 'an', 't', 'its', 'wasn', 'this', 'yourselves', 'after', "wouldn't", 'by', "you've", 'doesn', 'or', 'wouldn', "weren't", "that'll", 'while', 'where', 'below', 'her', "didn't", 're', 'both', 'their', "mustn't", 'we', 'hers', 'didn', 'who', 'on', 'shan', 'can', 'your', 'what', 'him', 'whom', 'why', 'o', 'these', 'up', 'down', 'over', 'th

In [6]:
print(len(wordsFiltered))
print(wordsFiltered)

16
['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


In [7]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

words = ["game","gaming","gamed","games"]
ps = PorterStemmer()

for word in words:
    print(word + ":" + ps.stem(word))

game:game
gaming:game
gamed:game
games:game


In [8]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

sentence = "gaming, the gamers play games"
words = word_tokenize(sentence)

for word in words:
    print(word + ":" + ps.stem(word))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game


In [9]:
#  Split all punctuation into separate tokens

from nltk.tokenize import WordPunctTokenizer
text = "Reset your password if you just can't remember your old one."
print("\nOriginal string:")
print(text)
result = WordPunctTokenizer().tokenize(text)
print("\nSplit all punctuation into separate tokens:")
print(result)


Original string:
Reset your password if you just can't remember your old one.

Split all punctuation into separate tokens:
['Reset', 'your', 'password', 'if', 'you', 'just', 'can', "'", 't', 'remember', 'your', 'old', 'one', '.']


In [10]:
#  Tokenize a twitter text

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
tweet_text = "NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev"
print("\nOriginal Tweet:")
print(tweet_text)
result = tknzr.tokenize(tweet_text)
print("\nTokenize a twitter text:")
print(result) 


Original Tweet:
NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev

Tokenize a twitter text:
['NoSQL', 'introduction', '-', 'w3resource', 'http://bit.ly/1ngHC5F', '#nosql', '#database', '#webdev']


In [11]:
#  Remove username handles from a twitter text

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
tweet_text = "@abcd @pqrs NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev"
print("\nOriginal Tweet:")
print(tweet_text)
result = tknzr.tokenize(tweet_text)
print("\nTokenize a twitter text:")
print(result)


Original Tweet:
@abcd @pqrs NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev

Tokenize a twitter text:
['NoSQL', 'introduction', '-', 'w3resource', 'http://bit.ly/1ngHC5F', '#nosql', '#database', '#webdev']


In [12]:
#  Read a given text through each line and look for sentences

import nltk.data
text = '''
Mr. Smith waited for the train. The train was late.
Mary and Samantha took the bus. I looked for Mary and
Samantha at the bus station.
'''
print("\nOriginal Tweet:")
print(text)
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n==============\n'.join(sent_detector.tokenize(text.strip())))



Original Tweet:

Mr. Smith waited for the train. The train was late.
Mary and Samantha took the bus. I looked for Mary and
Samantha at the bus station.

Mr. Smith waited for the train.
The train was late.
Mary and Samantha took the bus.
I looked for Mary and
Samantha at the bus station.
