In [1]:
import nltk

In [2]:
s = 'Python is an awesome language.'

In [3]:
tokens = nltk.word_tokenize(s)
tokens

['Python', 'is', 'an', 'awesome', 'language', '.']

In [6]:
list(nltk.bigrams(tokens))

[('Python', 'is'),
 ('is', 'an'),
 ('an', 'awesome'),
 ('awesome', 'language'),
 ('language', '.')]

# Computing Frequent Bigrams

- Now let's find out three frequently occurring bigrams, present in english-kjv collection of genesis corpus.

- Let's consider only those bigrams, whose words are having a length greater than 5.

In [7]:
from nltk.corpus import genesis

In [8]:
eng_tokens = genesis.words('english-kjv.txt')

In [9]:
eng_bigrams = nltk.bigrams(eng_tokens)

In [10]:
filtered_bigrams = [ (w1, w2) for w1, w2 in eng_bigrams if len(w1) >=5 and len(w2) >= 5 ]

In [11]:
filtered_bigrams

[('called', 'Night'),
 ('waters', 'which'),
 ('waters', 'which'),
 ('firmament', 'Heaven'),
 ('waters', 'under'),
 ('gathered', 'together'),
 ('gathering', 'together'),
 ('waters', 'called'),
 ('earth', 'bring'),
 ('bring', 'forth'),
 ('forth', 'grass'),
 ('yielding', 'fruit'),
 ('fruit', 'after'),
 ('earth', 'brought'),
 ('brought', 'forth'),
 ('forth', 'grass'),
 ('yielding', 'fruit'),
 ('great', 'lights'),
 ('greater', 'light'),
 ('lesser', 'light'),
 ('waters', 'bring'),
 ('bring', 'forth'),
 ('forth', 'abundantly'),
 ('moving', 'creature'),
 ('created', 'great'),
 ('great', 'whales'),
 ('every', 'living'),
 ('living', 'creature'),
 ('waters', 'brought'),
 ('brought', 'forth'),
 ('forth', 'abundantly'),
 ('after', 'their'),
 ('every', 'winged'),
 ('earth', 'bring'),
 ('bring', 'forth'),
 ('living', 'creature'),
 ('creature', 'after'),
 ('creeping', 'thing'),
 ('earth', 'after'),
 ('earth', 'after'),
 ('cattle', 'after'),
 ('after', 'their'),
 ('every', 'thing'),
 ('earth', 'after')

In [12]:
eng_bifreq = nltk.FreqDist(filtered_bigrams)

In [13]:
eng_bifreq.most_common(3)

[(('their', 'father'), 19), (('lived', 'after'), 16), (('seven', 'years'), 15)]

# Determining Frequent After Words

Now let's see an example which determines the two most frequent words occurring after "living" are determined.

In [62]:
eng_bigrams = nltk.bigrams(eng_tokens)
eng_cfd = nltk.ConditionalFreqDist(eng_bigrams)
eng_cfd['living'].most_common(2)

[('creature', 7), ('thing', 4)]

# Generating Frequent Next Word

Now let's define a function named generate, which returns words occurring frequently after a given word.

In [31]:
def generate(cfd, word, n=5):
    n_words = []
    for i in range(n):
        n_words.append(word)
        word = cfd[word].max()
        print(word)
    return n_words

In [32]:
generate(eng_cfd, 'living')

creature
that
he
said
,


['living', 'creature', 'that', 'he', 'said']

# Trigrams

Similar to Bigrams, Trigrams refers to set of all three consecutive words appearing in text.

In [33]:
s = 'Python is an awesome language.'
tokens = nltk.word_tokenize(s)
list(nltk.trigrams(tokens))

[('Python', 'is', 'an'),
 ('is', 'an', 'awesome'),
 ('an', 'awesome', 'language'),
 ('awesome', 'language', '.')]

In [37]:
eng_trigrams = nltk.trigrams(eng_tokens)

In [41]:
eng_trigrams_cdf = nltk.ConditionalFreqDist(eng_trigrams)

In [47]:
eng_trigrams_cdf['living']

FreqDist({})

- ngrams
- nltk also provides the function ngrams. It can be used to determine a set of all possible n consecutive words appearing in a text.

- The following example displays a list of four consecutive words appearing in the text s.

In [48]:
list(nltk.ngrams(tokens, 4))

[('Python', 'is', 'an', 'awesome'),
 ('is', 'an', 'awesome', 'language'),
 ('an', 'awesome', 'language', '.')]

# Collocations 

- A collocation is a pair of words that occur together, very often.

- For example, red wine is a collocation.

- One characteristic of a collocation is that the words in it cannot be substituted with words having similar senses.

- For example, the combination maroon wine sounds odd.

In [49]:
from nltk.corpus import genesis

In [50]:
tokens = genesis.words('english-kjv.txt')
tokens

In [51]:
gen_text = nltk.Text(tokens)

In [52]:
gen_text.collocations()

said unto; pray thee; thou shalt; thou hast; thy seed; years old;
spake unto; thou art; LORD God; every living; God hath; begat sons;
seven years; shalt thou; little ones; living creature; creeping thing;
savoury meat; thirty years; every beast


In [54]:
gen_text.generate()

TypeError: generate() missing 1 required positional argument: 'words'

In [55]:
from nltk.corpus import brown

In [58]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [59]:
brown.words(categories='news')

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [64]:
from nltk.book import text6

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [101]:

word_token = nltk.bigrams(text6)
word_token = list(word_token)

In [102]:
word_token = [(word1.upper(),word2.upper()) for word1,word2 in word_token]

In [103]:
cfd_knight_head = nltk.ConditionalFreqDist(word_token)

In [111]:
cfd_knight_head['KING']['ARTHUR']

17

In [109]:
text6.collocations()

BLACK KNIGHT; clop clop; HEAD KNIGHT; mumble mumble; Holy Grail;
squeak squeak; FRENCH GUARD; saw saw; Sir Robin; Run away; CARTOON
CHARACTER; King Arthur; Iesu domine; Pie Iesu; DEAD PERSON; Round
Table; clap clap; OLD MAN; dramatic chord; dona eis


In [106]:
gen_text.collocations()


