## Tokenize text into words and sentences

In [None]:
!python pip intstall -U nltk

In [2]:
import numpy

# Show which NumPy release this notebook is running against.
print(numpy.__version__)

1.25.0


In [3]:
import nltk

# Show which NLTK release this notebook is running against.
print(nltk.__version__)

3.8.1


#### `sent_tokenize` uses NLTK's recommended sentence tokenizer, the PunktSentenceTokenizer
#### `word_tokenize` uses NLTK's recommended word tokenizer, an improved TreebankWordTokenizer, together with the PunktSentenceTokenizer

In [4]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [6]:
# Fetch the 'punkt' tokenizer data into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/shoney/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Split a two-sentence string into a list of sentence strings.
sent_tokens = sent_tokenize('Does this tokenizer work? These are two different sentences')

print(sent_tokens)

['Does this tokenizer work?', 'These are two different sentences']


In [8]:
# Split a sentence into word tokens; the trailing '?' comes back as its own token.
word_tokens = word_tokenize('Does this tokenizer work?')

print(word_tokens)

['Does', 'this', 'tokenizer', 'work', '?']


#### Punkt tokenizer

https://www.nltk.org/_modules/nltk/tokenize/punkt.html
http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt

A sentence tokenizer which uses an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences; and then uses that model to find sentence boundaries. This approach has been shown to work well for many European languages.

It must be trained on a large collection of plaintext in the target language before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for English.

In [9]:
# Same download as earlier in the notebook; when the package is already present NLTK reports it as up-to-date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/shoney/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Sentence-tokenize the same two-sentence example again, now under the Punkt heading.
sent_tokens = sent_tokenize('Does this tokenizer work? These are two different sentences')

print(sent_tokens)

['Does this tokenizer work?', 'These are two different sentences']


In [11]:
# Word-tokenize the same example sentence again; punctuation is returned as a separate token.
word_tokens = word_tokenize('Does this tokenizer work?')

print(word_tokens)

['Does', 'this', 'tokenizer', 'work', '?']


In [12]:
# Sample passage used by the tokenizer demos: it mixes sentence-ending
# punctuation, abbreviations ("Mr.", "M.D.") and a currency amount.
# Adjacent string literals inside the parentheses are joined into one string.
text = (
    "A bird in hand is worth two in the bush. "
    "Good things come to those who wait. "
    "These watches cost $1500! "
    "The ball is in your court. "
    "Mr. Smith Goes to Washington "
    "Doogie Howser M.D."
)

# Word-tokenize the whole passage, explicitly requesting English.
word_tokens = word_tokenize(text, language='english')
print(word_tokens)

['A', 'bird', 'in', 'hand', 'is', 'worth', 'two', 'in', 'the', 'bush', '.', 'Good', 'things', 'come', 'to', 'those', 'who', 'wait', '.', 'These', 'watches', 'cost', '$', '1500', '!', 'The', 'ball', 'is', 'in', 'your', 'court', '.', 'Mr.', 'Smith', 'Goes', 'to', 'Washington', 'Doogie', 'Howser', 'M.D', '.']


In [13]:
# Number of tokens in the list; punctuation tokens are counted too.
len(word_tokens)

41

In [14]:
# Tokens at indices 3 through 7 (the end index of a slice is exclusive).
word_tokens[3:8]

['hand', 'is', 'worth', 'two', 'in']

In [15]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [16]:
# Build a Punkt sentence tokenizer directly, without passing any training text.
# NOTE(review): this instance is presumably untrained (unlike the pre-trained English
# model shipped in the NLTK data package), which would explain 'Mr.' being split off
# as its own sentence in the recorded output further down — confirm.
pst = PunktSentenceTokenizer()

In [17]:
# Split the raw passage into a list of sentence strings.
sent_tokens = pst.tokenize(text)

print(sent_tokens)

['A bird in hand is worth two in the bush.', 'Good things come to those who wait.', 'These watches cost $1500!', 'The ball is in your court.', 'Mr.', 'Smith Goes to Washington Doogie Howser M.D.']


In [18]:
# Get each sentence as a (start, end) pair rather than as text —
# presumably character offsets into `text`.
span_tokens = pst.span_tokenize(text)

# Materialise the result as a list so the pairs are printed.
print(list(span_tokens))

[(0, 40), (41, 76), (77, 102), (103, 129), (130, 133), (134, 177)]


In [19]:
# Group an already word-tokenized list into sentences (one list of tokens per sentence).
sentences = pst.sentences_from_tokens(word_tokens)

list(sentences)

[['A', 'bird', 'in', 'hand', 'is', 'worth', 'two', 'in', 'the', 'bush', '.'],
 ['Good', 'things', 'come', 'to', 'those', 'who', 'wait', '.'],
 ['These', 'watches', 'cost', '$', '1500', '!'],
 ['The', 'ball', 'is', 'in', 'your', 'court', '.'],
 ['Mr.'],
 ['Smith', 'Goes', 'to', 'Washington', 'Doogie', 'Howser', 'M.D', '.']]

In [20]:
from nltk.tokenize import WhitespaceTokenizer

In [21]:
# Tokenizer instance reused by later cells as `wt`.
wt = WhitespaceTokenizer()

In [22]:
# Tokenize the passage with the whitespace tokenizer; punctuation stays attached
# to the neighbouring word (e.g. 'bush.', '$1500!').
word_tokens = wt.tokenize(text)

print(word_tokens)

['A', 'bird', 'in', 'hand', 'is', 'worth', 'two', 'in', 'the', 'bush.', 'Good', 'things', 'come', 'to', 'those', 'who', 'wait.', 'These', 'watches', 'cost', '$1500!', 'The', 'ball', 'is', 'in', 'your', 'court.', 'Mr.', 'Smith', 'Goes', 'to', 'Washington', 'Doogie', 'Howser', 'M.D.']


### Reading Local Files

In [24]:
# Read the whole biography text file into one string.
# NOTE(review): the recorded run failed with FileNotFoundError for this absolute
# path — confirm where the dataset actually lives.
# An explicit encoding keeps the read independent of the platform's default locale.
with open('/datasets/biography.txt', 'r', encoding='utf-8') as f:
    file_contents = f.read()

print(file_contents)

FileNotFoundError: [Errno 2] No such file or directory: '/datasets/biography.txt'

In [None]:
# Word-tokenize the full file contents.
word_tokens = word_tokenize(file_contents)

print(word_tokens)

### Frequency distributions

In [None]:
from nltk.probability import FreqDist

In [None]:
# Record how many times each token occurs in the token list.
freq_dist = FreqDist(word_tokens)

print(freq_dist)

In [None]:
# The 20 most frequent tokens.
freq_dist.most_common(20)

`FreqDist.freq(sample)` returns the frequency of a given sample. The frequency of a sample is defined as the count of that sample divided by the total number of sample outcomes that have been recorded by this FreqDist. The count of a sample is defined as the number of times that sample outcome was recorded by this FreqDist. Frequencies are always real numbers in the range [0, 1].

In [None]:
# Relative frequency of the token 'the': its count divided by the total number of recorded tokens.
freq_dist.freq('the')

In [None]:
# Relative frequency of the token 'exposure'.
freq_dist.freq('exposure')

In [None]:
import matplotlib.pyplot as plt

# Create a 12x8 inch figure; `fig` and `ax` are not referenced again in this cell.
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the counts of the 20 most frequent tokens (non-cumulative).
# NOTE(review): presumably draws onto the current matplotlib axes created above — confirm.
freq_dist.plot(20, cumulative=False)

plt.show()

In [None]:
# Lowercase the whole text so that tokens differing only in case become identical.
file_contents = file_contents.lower()

# Re-tokenize on whitespace only; punctuation is not separated from the words.
word_tokens = wt.tokenize(file_contents)

print(word_tokens)

In [None]:
# Fetch the 'stopwords' corpus needed by nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# Use a set so membership tests against the stop-word list are fast.
stop_words = set(stopwords.words('english'))
# Also treat standalone '.' and ',' tokens, and the capitalised 'The', as stop words.
# NOTE(review): the text is lowercased before tokenizing in an earlier cell, so 'The'
# cannot match any of those tokens — confirm whether it is still needed.
stop_words.update(['.', ',', 'The'])

print(stop_words)

In [None]:
import string

# Keep only the tokens that are not stop words.
# Whitespace tokenization leaves punctuation attached to words ("the,", "is."),
# so besides the raw token, its form with edge punctuation stripped and
# lowercased is also checked against the stop-word set. Tokens that survive
# are kept exactly as they appeared in `word_tokens`.
filtered_words = [
    w for w in word_tokens
    if w not in stop_words
    and w.strip(string.punctuation).lower() not in stop_words
]

print(filtered_words)

In [None]:
# Rebuild the frequency distribution from the stop-word-filtered tokens (rebinds `freq_dist`).
freq_dist = FreqDist(filtered_words)

In [None]:
# The 20 most frequent tokens after stop-word filtering.
freq_dist.most_common(20)

In [None]:
# Create a fresh 12x8 inch figure for the filtered distribution; `fig` and `ax`
# are not referenced again in this cell.
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the counts of the 20 most frequent remaining tokens (non-cumulative).
freq_dist.plot(20, cumulative=False)

plt.show()