# Tokenization

In [27]:
import os
import nltk

In [79]:
# Get a sample test dataset
from urllib import request
from bs4 import BeautifulSoup

# Retrieve some current news.
# The response is used as a context manager so the connection is closed even
# if reading or parsing fails, and the timeout (in seconds) keeps the cell
# from hanging indefinitely on a stalled connection.
url = "http://nytimes.com"
with request.urlopen(url, timeout=30) as response:
    soup = BeautifulSoup(response.read(), 'lxml')

# Extract the text of every <p class="summary"> element
articles = [article.text.strip() for article in soup.select('p.summary')]
# Drop paragraphs that are empty after stripping whitespace
articles = [a for a in articles if a != ""]

In [83]:
articles[0]

'Irma continued its march of devastation on Monday morning, dumping rain across the width of Florida.'

##  Tokenize string

In [87]:
# Tokenize into sentences
nltk.sent_tokenize(articles[2])

['Photographs from the storm and its aftermath.']

In [88]:
# Tokenize into words
nltk.word_tokenize(articles[2])

['Photographs', 'from', 'the', 'storm', 'and', 'its', 'aftermath', '.']

### When tokenizing large data, it's better to directly load the pre-trained tokenizer from pickle

The convenience functions `nltk.sent_tokenize()` and `nltk.word_tokenize()` internally load the tokenizer on each call, which is quite inefficient when processing large amounts of data.

In [98]:
import nltk.data

# Load the pre-trained English Punkt tokenizer model once so the same instance
# can be reused for every text below.
# NOTE(review): this resource path assumes the pickled Punkt model is installed;
# confirm it matches the NLTK data layout of the target environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [112]:
# Sentence-tokenize every article with the preloaded tokenizer and show the first five results
[tokenizer.tokenize(article) for article in articles][:5]

[['Irma continued its march of devastation on Monday morning, dumping rain across the width of Florida.'],
 ['Floridians are used to bracing for hurricanes, but Irma was not a run-of-the-mill event.'],
 ['Photographs from the storm and its aftermath.'],
 ['Food and water was in short supply after Hurricane Irma, and witnesses spoke of a disintegration of law and order.'],
 ['Times journalists and others in the storm’s path described what they were hearing and seeing on Sunday.']]

#### Comparing the time savings

In [102]:
%%timeit
# Time tokenizing all articles with the single tokenizer instance loaded above
list(map(tokenizer.tokenize, articles))

2.04 ms ± 28.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [105]:
%%timeit
list(map(nltk.word_tokenize, articles))

14.9 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Different Ways to handle tokenization

In [118]:
# The standard tokenizer splits the contraction "Can't" into 'Ca' and "n't"
print(nltk.word_tokenize("Can't"))

['Ca', "n't"]


In [117]:
# Using the "WordPunctTokenizer", which puts the apostrophe into a token of its own
from nltk.tokenize import WordPunctTokenizer
WordPunctTokenizer().tokenize("Can't")

['Can', "'", 't']

####  Regex based Tokenization

The RegexpTokenizer class works by compiling your pattern, then calling re.findall() on your text. You could do all this yourself using the re module, but RegexpTokenizer implements the TokenizerI interface, just like all the word tokenizers from the previous recipe. This means it can be used by other parts of the NLTK package, such as corpus readers, which we'll cover in detail in Chapter 3, Creating Custom Corpora. Many corpus readers need a way to tokenize the text they're reading, and can take optional keyword arguments specifying an instance of a TokenizerI subclass. This way, you have the ability to provide your own tokenizer instance if the default tokenizer is unsuitable.

In [120]:
from nltk.tokenize import regexp_tokenize

# Match runs of word characters and apostrophes so contractions stay in one token.
# The pattern is a raw string: "\w" inside a normal string literal is an
# invalid escape sequence that Python warns about.
regexp_tokenize("Can't is a contraction", r"[\w']+")

["Can't", 'is', 'a', 'contraction']

In [123]:
# Matching on the gaps
# gaps=False returns what the pattern itself matches (here the whitespace);
# gaps=True treats the matches as separators and returns the text in between.
# Raw strings avoid the invalid "\s" escape sequence of a normal string literal.
print(regexp_tokenize("Can't is a contraction", r"\s+", gaps=False))
print(regexp_tokenize("Can't is a contraction", r"\s+", gaps=True))

[' ', ' ', ' ']
["Can't", 'is', 'a', 'contraction']


## Training a sentence tokenizer for custom text

The default sentence tokenizer might be too general-purpose for specific text sets, especially web content or chat. We will use the `overheard.txt` file from the webtext corpus as an example here.

In [128]:
from nltk.corpus import webtext
from nltk.tokenize import PunktSentenceTokenizer

In [129]:
# Train a Punkt sentence tokenizer on the raw text of 'overheard.txt' by
# passing that text to the constructor
text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)

#### Comparing to the baseline Tokenizer

In [141]:
# Print sentence number 678 from each tokenizer; at this index the default
# tokenizer and the custom-trained one segment the text differently
print('The baseline:: ', nltk.sent_tokenize(text)[678])
print('The new:: ', sent_tokenizer.tokenize(text)[678])

The baseline::  Girl: But you already have a Big Mac...
Hobo: Oh, this is all theatrical.
The new::  Girl: But you already have a Big Mac...


The newly trained tokenizer splits at the line break after "Big Mac...", so the Hobo's line becomes a separate sentence, whereas the baseline tokenizer keeps both lines together as one sentence.

## Filtering Stopwords


In [143]:
from nltk.corpus import stopwords

The stopwords corpus is an instance of nltk.corpus.reader.WordListCorpusReader. As such, it has a words() method that can take a single argument for the file ID, which in this case is 'english', referring to a file containing a list of English stopwords. You could also call stopwords.words() with no argument to get a list of all stopwords in every language available.

In [147]:
# First we import a list of identified Stopwords for the specific language
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 ...]

In [148]:
# They are available for a list of languages
stopwords.fileids()

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'kazakh',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

In [154]:
# Removing the stopwords from the text
def remove_stopwords(sentence, language='english'):
    """Tokenize ``sentence`` into words and drop the stopwords.

    Args:
        sentence: The text to tokenize and filter.
        language: File ID of the NLTK stopword list to filter against.
            Defaults to ``'english'``.

    Returns:
        A list of the tokens produced by ``nltk.word_tokenize(sentence)`` that
        are not in the stopword list. Tokens are compared exactly as
        tokenized (no case folding).
    """
    # Build the lookup set once per call: evaluating stopwords.words() inside
    # the comprehension repeated the call, and a linear list scan, per token.
    stopset = set(stopwords.words(language))
    return [word for word in nltk.word_tokenize(sentence) if word not in stopset]

In [156]:
remove_stopwords(sent_tokenizer.tokenize(text)[1])

['Asian', 'girl', ':', 'Yeah', ',', 'angry', '!']

## Looking up Synsets for a word in WordNet

In [210]:
from nltk.corpus import wordnet

# Keep the first synset that WordNet returns for the word
syn = wordnet.synsets('CookBook')[0]

In [211]:
# Querying the Synset name
syn.name()

'cookbook.n.01'

In [212]:
# Querying the Synset definition
syn.definition()

'a book of recipes and cooking directions'

Synsets are organized in a structure similar to that of an inheritance tree. More abstract terms are known as hypernyms and more specific terms are hyponyms. This tree can be traced all the way up to a root hypernym. Hypernyms provide a way to categorize and group words based on their similarity to each other. The Calculating WordNet Synset similarity recipe details the functions used to calculate the similarity based on the distance between two words in the hypernym tree:

In [213]:
# Getting a list of Hypernyms
syn.hypernyms()

[Synset('reference_book.n.01')]

In [214]:
syn.hypernyms()[0].hyponyms()

[Synset('annual.n.02'),
 Synset('atlas.n.02'),
 Synset('cookbook.n.01'),
 Synset('directory.n.01'),
 Synset('encyclopedia.n.01'),
 Synset('handbook.n.01'),
 Synset('instruction_book.n.01'),
 Synset('source_book.n.01'),
 Synset('wordbook.n.01')]

In [215]:
syn.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('creation.n.02'),
  Synset('product.n.02'),
  Synset('work.n.02'),
  Synset('publication.n.01'),
  Synset('book.n.01'),
  Synset('reference_book.n.01'),
  Synset('cookbook.n.01')]]

In [216]:
syn.pos()

'n'

#### Extracting a list of synonyms

As all lemmas in a Synset have the same meaning, they can be treated as synonyms.

In [217]:
[lemma.name() for lemma in syn.lemmas()]

['cookbook', 'cookery_book']

In [220]:
# Collect the lemma names of every synset of 'book' into one set of synonyms
{lemma.name() for sense in wordnet.synsets('book') for lemma in sense.lemmas()}

{'Bible',
 'Book',
 'Christian_Bible',
 'Good_Book',
 'Holy_Scripture',
 'Holy_Writ',
 'Koran',
 'Quran',
 'Scripture',
 'Word',
 'Word_of_God',
 'account_book',
 "al-Qur'an",
 'book',
 'book_of_account',
 'hold',
 'ledger',
 'leger',
 'playscript',
 'record',
 'record_book',
 'reserve',
 'rule_book',
 'script',
 'volume'}

## Calculating Synset Similarities using WordNet

In [223]:
from nltk.corpus import wordnet

# Look up the two synsets to compare directly by their synset names
cb = wordnet.synset('cookbook.n.01')
ib = wordnet.synset('instruction_book.n.01')

In [224]:
# Calculating the Wu-Palmer similarity of the two synsets
cb.wup_similarity(ib)

0.9166666666666666

The wup_similarity method is short for Wu-Palmer Similarity, which is a scoring method based on how similar the word senses are and where the Synsets occur relative to each other in the hypernym tree. One of the core metrics used to calculate similarity is the shortest path distance between the two Synsets and their common hypernym:

In [225]:
# Calculating the distance to the common hypernym:
# ref is the first hypernym of the cookbook synset
ref = cb.hypernyms()[0]
cb.shortest_path_distance(ref)

1

In [226]:
ib.shortest_path_distance(ref)

1

In [227]:
cb.shortest_path_distance(ib)

2

#### Measuring the cause of common similarity

In [229]:
# Measuring the similarity between dog (first synset returned for 'dog') and cookbook
dog = wordnet.synsets('dog')[0]
dog.wup_similarity(cb)

0.38095238095238093

To see where this similarity is coming from, we can investigate the common hypernyms.

In [230]:
sorted(dog.common_hypernyms(cb))

[Synset('entity.n.01'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('whole.n.02')]

## Discovering Word Collocations
Collocations are two or more words that tend to appear frequently together, such as United States. Of course, there are many other words that can come after United, such as United Kingdom and United Airlines. As with many aspects of natural language processing, context is very important. And for collocations, context is everything!

In [232]:
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Lower-case every token of 'grail.txt' before looking for collocations
words = [token.lower() for token in webtext.words('grail.txt')]

### Using a Bi-gram Filter

In [236]:
# Build a bigram finder from the word sequence and show the 5 best bigrams
# as scored by the likelihood-ratio measure
bcf = BigramCollocationFinder.from_words(words)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 5)

[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't'), ('villager', '#')]

This picks up mostly on the structure of the text, and not on important content bigrams. Let's filter out stopwords and tokens shorter than three characters, which also removes the punctuation.

In [240]:
from nltk.corpus import stopwords
# Create a stopword filter
stopset = set(stopwords.words('english'))


def filter_stops(w):
    """Return True if ``w`` is shorter than 3 characters or is in ``stopset``."""
    return len(w) < 3 or w in stopset


# Apply the filter to the existing bigram finder, then rank the bigrams again
bcf.apply_word_filter(filter_stops)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)


[('black', 'knight'),
 ('clop', 'clop'),
 ('head', 'knight'),
 ('mumble', 'mumble'),
 ('squeak', 'squeak'),
 ('saw', 'saw'),
 ('holy', 'grail'),
 ('run', 'away'),
 ('french', 'guard'),
 ('cartoon', 'character')]

### Using a Tri-gram Filter

In [247]:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

# Create wordset
# NOTE(review): this rebinds `words`, which previously held the 'grail.txt'
# tokens; re-running the bigram cell afterwards would use 'singles.txt' instead.
words = [w.lower() for w in webtext.words('singles.txt')]

# Create the trigrams, then apply the stopword/short-word filter (filter_stops
# comes from the bigram section) and a frequency filter with threshold 3
tcf = TrigramCollocationFinder.from_words(words)
tcf.apply_word_filter(filter_stops)
tcf.apply_freq_filter(3)

# Return results: the 4 best trigrams by likelihood ratio
tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

[('long', 'term', 'relationship')]