In [1]:
import nltk

In [2]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
from nltk.corpus import reuters

In [4]:
# Total number of tokens in text6.
len(text6)

16967

In [5]:
# Number of distinct tokens in text6.
len(set(text6))

2166

In [6]:
# Average number of occurrences per distinct token (total / distinct).
len(text6) / len(set(text6))

7.833333333333333

In [7]:
# Count how often each token occurs in text6.
text6_freq = nltk.FreqDist(text6)

In [8]:
# Number of occurrences of the exact token 'ARTHUR' in text6.
text6_freq['ARTHUR']

225

In [9]:
# Display the distribution; the repr shown below is abbreviated with '...'.
text6_freq

FreqDist({':': 1197, '.': 816, '!': 801, ',': 731, "'": 421, '[': 319, ']': 312, 'the': 299, 'I': 255, 'ARTHUR': 225, ...})

In [10]:
# Length of every token in text6, followed by the shortest and longest length.
text6_word_len = list(map(len, text6))
shortest_len = min(text6_word_len)
longest_len = max(text6_word_len)
print(shortest_len, longest_len)

1 13


### How many times do the words `lead` and `smelter` occur in the Reuters corpus texts of the category `zinc`?


In [11]:
# Per-category word counts for the Reuters corpus: the condition is the
# category name and the sample is a word from that category's texts.
# A generator expression feeds the (category, word) pairs one at a time,
# so the complete list of pairs is never built before counting starts.
cfd_reuters = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in reuters.categories()
    for word in reuters.words(categories=genre)
)

In [12]:
# Counts of 'lead' and 'smelter' within the 'zinc' category.
cfd_reuters.tabulate(conditions=['zinc'], samples=['lead', 'smelter'])

        lead smelter 
zinc      40      33 


### How many times do the words `tonnes` and `year` occur in the Reuters corpus texts of the category `sugar`?



In [13]:
# Counts of 'tonnes' and 'year' within the 'sugar' category.
cfd_reuters.tabulate(conditions=['sugar'], samples=['tonnes', 'year'])

      tonnes   year 
sugar    355    196 


### How many times do the words `gasoline` and `barrels` occur in the Reuters corpus texts of the category `gas`?

In [14]:
# Counts of 'gasoline' and 'barrels' within the 'gas' category.
cfd_reuters.tabulate(conditions=['gas'], samples=['gasoline', 'barrels'])

    gasoline  barrels 
gas       77       64 


In [15]:
# Count of the word 'sugar' within the 'sugar' category.
cfd_reuters.tabulate(conditions=['sugar'], samples=['sugar'])

      sugar 
sugar   521 


### How many times does the word `gas` occur in the Reuters corpus texts of the category `gas`?

In [16]:
# Count of the word 'gas' within the 'gas' category.
cfd_reuters.tabulate(conditions=['gas'], samples=['gas'])

    gas 
gas  10 


In [17]:
# Count of the word 'zinc' within the 'zinc' category.
cfd_reuters.tabulate(conditions=['zinc'], samples=['zinc'])

     zinc 
zinc   70 


In [18]:
# Tokenize a sample sentence; in the output below each '!' is its own token.
nltk.word_tokenize('Python is cool!!!')

['Python', 'is', 'cool', '!', '!', '!']

### What is the frequency of the bigram ('HEAD', 'KNIGHT') in text6?

In [19]:
# Store the bigrams of text6 as a list so later cells can reuse them.
text6_bigrams = list(nltk.bigrams(text6))

In [20]:
# Number of times the bigram ('HEAD', 'KNIGHT') occurs in text6.
# text6_bigrams is a list, so list.count gives the same integer as a
# FreqDist lookup without building a full distribution for one query.
text6_bigrams.count(('HEAD', 'KNIGHT'))

29

### What is the frequency of the bigram ('BLACK', 'KNIGHT') in text6?

In [21]:
# Number of times the bigram ('BLACK', 'KNIGHT') occurs in text6.
# text6_bigrams is a list, so list.count gives the same integer as a
# FreqDist lookup without building a full distribution for one query.
text6_bigrams.count(('BLACK', 'KNIGHT'))

32

### Which collocations occur in text6?

In [22]:
# Wrap the tokens of text6 in an nltk.Text and print its collocations.
# NOTE(review): text6 from nltk.book is presumably already an nltk.Text;
# confirm whether this re-wrapping is needed.
gen_text6 = nltk.Text(text6)
gen_text6.collocations()

BLACK KNIGHT; clop clop; HEAD KNIGHT; mumble mumble; Holy Grail;
squeak squeak; FRENCH GUARD; saw saw; Sir Robin; Run away; CARTOON
CHARACTER; King Arthur; Iesu domine; Pie Iesu; DEAD PERSON; Round
Table; clap clap; OLD MAN; dramatic chord; dona eis


In [23]:
# Stem a sample word with the Porter stemmer.
porter = nltk.PorterStemmer()
print(porter.stem('lying'))

lie


In [24]:
# Stem a sample word with the Lancaster stemmer.
lancaster = nltk.LancasterStemmer()
print(lancaster.stem('basics'))

bas


In [25]:
# Lancaster stem of 'power'.
print(lancaster.stem('power'))

pow


In [26]:
# Lancaster stem of 'women'.
print(lancaster.stem('women'))

wom


In [27]:
from nltk.corpus import brown

In [28]:
# Tagged words of the Brown corpus (no category filter applied).
brown_tagged = brown.tagged_words()

In [29]:
# Inspect the type of the object returned by brown.tagged_words().
type(brown_tagged)

nltk.corpus.reader.util.ConcatenatedCorpusView

In [30]:
from collections import defaultdict, Counter

# Map each word of the Brown corpus to a Counter of its POS tags:
# the key is a word and the value counts how often each tag was seen.
# Iterates the brown_tagged view created earlier instead of calling
# brown.tagged_words() a second time, in the same way the news-only
# tally iterates brown_tagged_news.
word_tags = defaultdict(Counter)
for word, pos in brown_tagged:
    word_tags[word][pos] += 1

In [31]:
# POS-tag counts recorded for the exact token 'The'.
word_tags['The']

Counter({'AT': 6725, 'AT-TL': 452, 'AT-HL': 81})

In [32]:
# Tagged words restricted to the 'news' category of the Brown corpus.
brown_tagged_news = brown.tagged_words(categories='news')

In [33]:
# For the news category only: word -> Counter of the POS tags seen with it.
word_tags_news = defaultdict(Counter)
for token, tag in brown_tagged_news:
    tag_counts = word_tags_news[token]
    tag_counts[tag] += 1

In [34]:
# Display the word -> tag-count mapping for the news category.
# NOTE(review): this shows the entire mapping; consider displaying only a
# small slice to keep the notebook output short.
word_tags_news

defaultdict(collections.Counter,
            {'The': Counter({'AT': 775, 'AT-TL': 28, 'AT-HL': 3}),
             'Fulton': Counter({'NP-TL': 10, 'NP': 4}),
             'County': Counter({'NN-TL': 35}),
             'Grand': Counter({'JJ-TL': 5, 'FW-JJ-TL': 1}),
             'Jury': Counter({'NN-TL': 2}),
             'said': Counter({'VBD': 382, 'VBN': 20}),
             'Friday': Counter({'NR': 41}),
             'an': Counter({'AT': 300}),
             'investigation': Counter({'NN': 9}),
             'of': Counter({'IN': 2716, 'IN-TL': 128, 'IN-HL': 5}),
             "Atlanta's": Counter({'NP$': 4}),
             'recent': Counter({'JJ': 20}),
             'primary': Counter({'NN': 13, 'JJ': 4}),
             'election': Counter({'NN': 38}),
             'produced': Counter({'VBD': 5, 'VBN': 1}),
             '``': Counter({'``': 732}),
             'no': Counter({'AT': 104, 'RB': 5}),
             'evidence': Counter({'NN': 17}),
             "''": Counter({"''": 702}),
          

In [35]:
# Frequency of each (word, tag) pair in the news category.
nltk.FreqDist(brown_tagged_news)

FreqDist({('the', 'AT'): 5558, (',', ','): 5133, ('.', '.'): 4012, ('of', 'IN'): 2716, ('and', 'CC'): 2115, ('a', 'AT'): 1988, ('in', 'IN'): 1828, ('to', 'TO'): 1222, ('for', 'IN'): 905, ('to', 'IN'): 880, ...})

In [36]:
import re

# Find every run of word characters that directly follows a single
# whitespace character; the whitespace itself is part of each match.
s = 'Python is cool!!!'
space_then_word = re.compile(r'\s\w+\b')
print(space_then_word.findall(s))

[' is', ' cool']


### END