# Importing our wordlists

Here we import all of our wordlists and add them to an array which me can merge at the end. 

This wordlists should not be filtered at this point. However they should all contain the same columns to make merging easier for later.

In [103]:
wordlists = []

## Dictcc

#### Download the dictionary from http://www.dict.cc/?s=about%3Awordlist

#### Print out the first 20 lines of the dictionary

In [99]:
!head -n 20 de-en.txt

# DE-EN vocabulary database	compiled by dict.cc
# Date and time	2016-08-29 23:46
# License	THIS WORK IS PROTECTED BY INTERNATIONAL COPYRIGHT LAWS!
# License	Private use is allowed as long as the data, or parts of it, are not published or given away.
# License	By using this file, you agree to be bound to the Terms of Use published at the following URL:  
# License	http://www.dict.cc/translation_file_request.php
# Contains data from	http://dict.tu-chemnitz.de/ with friendly permission by Frank Richter, TU Chemnitz 
# Brought to you by	Paul Hemetsberger and the users of http://www.dict.cc/, 2002 - 2016

&#945;-Keratin {n}	&#945;-keratin	noun
&#945;-Lactalbumin {n} <&#945;-La>	&#945;-lactalbumin <&#945;-La>	noun
&#946;-Mercaptoethanol {n}	&#946;-mercaptoethanol	noun
&#963;-Algebra {f}	&#963;-field	noun
&#963;-Algebra {f}	sigma algebra	noun
& Co.	and company <& Co.>	
'Die' heißt mein Unterrock, und 'der' hängt im Schrank. [regional] [Satz, mit dem Kinder gerügt werden, die vo

#### Use pandas library to import csv file

In [104]:
import pandas as pd


dictcc_df = pd.read_csv("de-en.txt", 
                        sep='\t',
                        skiprows=8,
                        header=None, 
                        names=["GermanWord","Word","WordType"])

#### Preview a few entries of the wordlist

In [101]:
dictcc_df[90:100]

Unnamed: 0,GermanWord,Word,WordType
90,(aktiv) Werbung machen für,to tout,verb
91,(aktive) Langzeitverbindung {f} [Standverbindu...,nailed-up connection <NUC>,noun
92,(aktuelles) Zeitgeschehen {n},current events {pl},noun
93,(akustisch) verstehen,to hear,verb
94,(akustische) Haarzelle {f},auditory cell,noun
95,(akustischer) Dissipationsgrad {m},(acoustic) dissipation factor,noun
96,(akute) Rückenmuskelnekrose {f},(acute) back muscle necrosis,noun
97,(akuter) Hörsturz {m},acute hearing loss,noun
98,(akuter) Myokardinfarkt {m} <AMI / MI>,(acute) myocardial infarction <AMI / MI>,noun
99,(akutes) Lungenversagen {n},acute respiratory distress syndrome <ARDS>,noun


#### We only need "Word" and "WordType" column

In [105]:
dictcc_df = dictcc_df[["Word", "WordType"]][:].copy()

#### Convert WordType Column to a pandas.Categorical

In [106]:
word_types = dictcc_df["WordType"].astype('category')
dictcc_df["WordType"] = word_types
# show data types of each column in the dataframe
dictcc_df.dtypes

Word          object
WordType    category
dtype: object

#### List the current distribution of word types in dictcc dataframe

In [107]:
# nltk TaggedCorpusParses requires uppercase WordType
dictcc_df["WordType"] = dictcc_df["WordType"].str.upper()
dictcc_df["WordType"].value_counts().head()

NOUN          759619
VERB          126806
ADJ            94507
ADV            26277
ADJ PAST-P     12519
Name: WordType, dtype: int64

#### Add dictcc corpus to our wordlists array

In [108]:
wordlists.append(dictcc_df)

## Moby

#### Download the corpus from http://icon.shef.ac.uk/Moby/mpos.html

#### Perform some basic cleanup on the wordlist

In [109]:
# the readme file in `nltk/corpora/moby/mpos` gives some information on how to parse the file

result = []
# replace all DOS line endings '\r' with newlines then change encoding to UTF8
moby_words = !cat nltk/corpora/moby/mpos/mobyposi.i | iconv --from-code=ISO88591 --to-code=UTF8 | tr -s '\r' '\n' | tr -s '×' '/'
result.extend(moby_words)
moby_df = pd.DataFrame(data = result, columns = ['Word'])

In [9]:
moby_df

Unnamed: 0,Word
0,3-D/AN
1,4-F/N
2,4-H'er/N
3,4-H/A
4,A battery/h
5,a bon march/v
6,a cappella/Av
7,a capriccio/h
8,a datu/h
9,a fortiori/v


- sort out the nouns, verbs and adjectives

In [110]:
# Matches nouns
nouns = moby_df[moby_df["Word"].str.contains('/[Np]$')].copy()
nouns["WordType"] = "NOUN"
# Matches verbs
verbs = moby_df[moby_df["Word"].str.contains('/[Vti]$')].copy()
verbs["WordType"] = "VERB"
# Magtches adjectives
adjectives = moby_df[moby_df["Word"].str.contains('/A$')].copy()
adjectives["WordType"] = "ADJ"

- remove the trailing stuff and concatenate the nouns, verbs and adjectives

In [111]:
nouns["Word"] = nouns["Word"].str.replace(r'/N$','')
verbs["Word"] = verbs["Word"].str.replace(r'/[Vti]$','')
adjectives["Word"] = adjectives["Word"].str.replace(r'/A$','')
# Merge nouns, verbs and adjectives into one dataframe
moby_df = pd.concat([nouns,verbs,adjectives])

#### Add moby corpus to wordlists array

## Brown (from nltk)

TODO:

- We can probably work with `nltk.corpus.brown.tagged_words()` when creating our dataframe

## Use NLTK to combine wordlists

## Combine all wordlists

In [112]:
import numpy as np
wordlist = pd.concat(wordlists)

# Filter for results that we want

- We want to remove words that aren't associated with a type (null WordType)

In [113]:
wordlist_filtered = wordlist[wordlist["WordType"].notnull()]

- We want to remove words that contain non word characters (whitespace, hypens, etc.)

In [114]:
# we choose [a-z] here and not [A-Za-z] because we do _not_
# want to match words starting with uppercase characters.
# ^to matches verbs in the infinitive from `dictcc`
word_chars = r'^[a-z]+$|^to\s'
is_word_chars = wordlist_filtered["Word"].str.contains(word_chars, na=False)
wordlist_filtered = wordlist_filtered[is_word_chars]
wordlist_filtered.describe()
wordlist_filtered["WordType"].value_counts()

NOUN                  132318
VERB                  126665
ADJ                    50659
ADV                    12748
ADJ PAST-P              9327
ADJ PRES-P              4223
PAST-P                  1291
ADJ ADV                  620
PREP                     252
PRON                     222
PRES-P                   173
CONJ                     124
PAST-P ADJ                33
PRES-P ADJ                26
ADV PREP                  20
ADJ PRON                  16
ADJ ARCHAIC:ADV           10
PREFIX                    10
ADV CONJ                   9
PREP CONJ                  5
ADV ADJ                    4
ADJ.                       4
ADV PAST-P                 3
ADV PREP CONJ              2
ADJ ARCHAIC:PAST-P         2
ADV DATED:ADJ              2
[NONE]                     2
ADJ RARE:ADV               1
ADJ COLL:ADV               1
PREP ADV                   1
ADJ PREP                   1
ADV.                       1
ADJ OBS:PAST-P             1
ADV PRON                   1
ADV ARCHAIC:AD

-  We want results that are less than 'x' letters long (x+3 for verbs since they are in their infinitive form in the dictcc wordlist)

In [115]:
lt_x_letters = (wordlist_filtered["Word"].str.len() < 9) |\
               ((wordlist_filtered["Word"].str.contains('^to\s\w+\s')) &\
                (wordlist_filtered["Word"].str.len() < 11)\
               )
wordlist_filtered = wordlist_filtered[lt_x_letters]
wordlist_filtered.describe()

Unnamed: 0,Word,WordType
count,108112,108112
unique,39257,39
top,boom,NOUN
freq,35,64792


- We want to remove all duplicates

In [116]:
wordlist_filtered = wordlist_filtered.drop_duplicates("Word")
wordlist_filtered.describe()
wordlist_filtered["WordType"].value_counts()

NOUN                  24671
ADJ                    6901
VERB                   2663
ADJ PAST-P             2130
ADV                    1250
ADJ PRES-P              705
PAST-P                  622
ADJ ADV                 132
PRON                     45
PREP                     43
PRES-P                   34
CONJ                     23
PREFIX                    8
PAST-P ADJ                8
ADJ PRON                  5
PRES-P ADJ                4
ADJ ARCHAIC:ADV           2
ADV CONJ                  2
[NONE]                    1
ADJ ARCHAIC:PAST-P        1
ADV PRON                  1
ADJ OBS:PAST-P            1
ADV DATED:ADJ             1
PRES-P ARCHAIC:ADJ        1
ADV PREP                  1
ADV PREP CONJ             1
ADV ADJ                   1
Name: WordType, dtype: int64

- We want to remove words that are difficult to spell

TODO:

In [115]:
# Words with uncommon vowel duplicates (examples: ["piing", "reeject"])

- We want to remove all names and animals

TODO:


TODO:

- We want to remove stopwords from wordlist

```
from nltk.corpus import stopwords
dif = set(wordlist_filtered['Word']) - set(stopwords.words('english'))
names = nltk.corpus.names
names.fileids()
```

- We want to remove homonyms that are used in different parts of speech (example: saw (as verb) and saw (as noun))

- We want to remove arcane and unusual words

```
import nltk

def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)
```


### Load our wordlists into nltk

In [117]:
# The TaggedCorpusReader likes to use the forward slash character '/'
# as seperator between the word and part-of-speech tag (WordType).
wordlist_filtered.to_csv("dictcc_moby.csv",index=False,sep="/",header=None)

In [118]:
from nltk.corpus import TaggedCorpusReader
from nltk.tokenize import WhitespaceTokenizer
nltk_wordlist = TaggedCorpusReader("./", "dictcc_moby.csv")

In [119]:
nltk_wordlist.tagged_words()

[('up', 'ADV'), ('down', 'ADV'), ('cubicle', 'NOUN'), ...]

## Maximize distance between neighbouring words

- Nouns like "cobra" and "domra" should not be located at Geo-Coordinate "55°x11°" and "55°x12°"
- TODO: the spread_words() method doesn't actually solve this problem. We will need to update it by calculating the distance to **all 8** of its adjacent neighbours

In [116]:
# `pip install python-levenshtein`
# used to calculate the Levenshtein distance between words
import Levenshtein as lev

# Maximize the Levenshtein distance between neighbouring words
def spread_words(dataframe_values, min_distance = 25, min_lev = 5):
    words = []
    words.extend(dataframe_values)
    short_distances = 0
    for i in range(len(words)-1):
        next = i + 1
        if lev.distance(words[i],words[next]) < min_lev:
            short_distances = short_distances + 1
            words.append(words[next])
            words.remove(words[next])
    # The value for min_distance was derived
    # by simple trial and error
    if short_distances < min_distance:
        # The remaining words with short distance 
        # will have to be sorted out by hand.
        return words
    else:
        # Recurse until we minimize short distances
        # as much as possible.
        return spread_words(words)

# Insert distance of neighbour
def insert_neighbour_distance(words):
    result = []
    word_with_neighbour_distance = ()
    for i in range(len(words)-1):
        next = i + 1
        lev_distance = lev.distance(words[i],words[next])
        word_with_neighbour_distance = words[i], lev_distance
        result.append(word_with_neighbour_distance)
    return pd.DataFrame(data = result, columns=['Words', 'NeighbourDistance'])

### Spread nouns

In [117]:
nouns = wordlist_filtered[wordlist_filtered["WordType"] == "noun"]
# randomize for better performance
nouns = nouns.sample(len(nouns))
min_distance_nouns = spread_words(nouns["Word"].values,50,3)
nouns_ready_for_export = insert_neighbour_distance(min_distance_nouns)
nouns_ready_for_export[:10]

Unnamed: 0,Words,NeighbourDistance
0,dictions,8
1,trouper,7
2,sleigh,6
3,palpebra,8
4,missions,7
5,pageant,5
6,contact,7
7,reply,5
8,operas,6
9,slip,7


### Spread adjectives

In [118]:
adjectives = wordlist_filtered[wordlist_filtered["WordType"] == "adj"]
# randomize for better performance
adjectives = adjectives.sample(len(adjectives))
min_distance_adjectives = spread_words(nouns["Word"].values,50,3)
adjectives_ready_for_export = insert_neighbour_distance(min_distance_adjectives)
len(adjectives_ready_for_export.Words)

24589

### Spread verbs

In [119]:
# use "adj past-p" as verbs conjugated in the past tense until 
# we use nltk to properly conjugate all verbs in our wordlist
verbs_init = wordlist_filtered[((wordlist_filtered["WordType"] == "adj past-p") | (wordlist_filtered["WordType"] == "verb"))]
verbs = verbs_init.sample(n=len(verbs_init))
min_distance_verbs = spread_words(verbs["Word"].values,50,3)
verbs_ready_for_export = insert_neighbour_distance(min_distance_verbs)
sorted(verbs_ready_for_export.Words)

['abased',
 'abashed',
 'abducted',
 'abetted',
 'abhorred',
 'abjured',
 'ablated',
 'aborted',
 'abraded',
 'abridged',
 'absolved',
 'absorbed',
 'abused',
 'acceded',
 'accented',
 'accepted',
 'accessed',
 'accosted',
 'accreted',
 'accrued',
 'accursed',
 'accused',
 'achieved',
 'acquired',
 'actuated',
 'acylated',
 'adapted',
 'added',
 'adduced',
 'adhered',
 'adjoined',
 'adjudged',
 'adjusted',
 'admitted',
 'admixed',
 'adopted',
 'adored',
 'adorned',
 'adsorbed',
 'adverted',
 'advised',
 'aerated',
 'affected',
 'affirmed',
 'affixed',
 'aged',
 'agitated',
 'agonized',
 'agreed',
 'aimed',
 'aired',
 'alarmed',
 'aligned',
 'allayed',
 'allied',
 'allotted',
 'allowed',
 'alloyed',
 'alluded',
 'allured',
 'altered',
 'amazed',
 'amended',
 'amidated',
 'amused',
 'analyzed',
 'anchored',
 'angered',
 'animated',
 'annealed',
 'annexed',
 'annoyed',
 'annulled',
 'anodized',
 'anointed',
 'answered',
 'aped',
 'appeased',
 'appended',
 'applied',
 'apprised',
 'approve

#### Show the distribution of word types after filtering

In [120]:
wordlist_filtered["WordType"].value_counts().head()

noun          24590
adj            6885
adj past-p     2126
adv            1232
adj pres-p      694
Name: WordType, dtype: int64

### Export our filtered word lists to csv files

In [121]:
nouns_ready_for_export.to_csv("nouns.csv", index=False)
adjectives_ready_for_export.to_csv("adjectives.csv", index=False)
verbs_ready_for_export.to_csv("verbs.csv", index=False)

# Test pairings

In [122]:
print(nouns_ready_for_export.sample()['Words'].values[0] + '-' +\
      verbs_ready_for_export.sample()['Words'].values[0] + '-' +\
      adjectives_ready_for_export.sample()['Words'].values[0] + '-' +\
      nouns_ready_for_export.sample()['Words'].values[0])

birimbao-absolved-back-heats


## NLTK

- 

- 

In [163]:
# Our wordlist
import nltk
nltk_wordlist_fd = nltk.FreqDist(tag for (word, tag) in nltk_wordlist.tagged_words() if len(word) < 9 and word.isalpha())
cfd = nltk.ConditionalFreqDist(nltk_wordlist.tagged_words())
print(len(cfd.conditions()))
print(len([w for w in cfd.conditions() if len(cfd[w]) == 1]))
nltk_wordlist_fd.most_common()

37078
35419


[('NOUN', 24671),
 ('ADJ', 9877),
 (None, 3540),
 ('VERB', 2630),
 ('ADV', 1257),
 ('PAST-P', 630),
 ('PRON', 45),
 ('PREP', 43),
 ('PRES-P', 39),
 ('CONJ', 23),
 ('PREFIX', 8),
 ('[NONE]', 1)]

In [162]:
# Brown Corpus
import nltk
brown_fd = nltk.FreqDist(tag for (word, tag) in nltk.corpus.brown.tagged_words() if len(word) < 9 and word.isalpha())
brown_cfd = nltk.ConditionalFreqDist(nltk.corpus.brown.tagged_words())
print(len(brown_cfd.conditions()))
print(len([w for w in brown_cfd.conditions() if len(brown_cfd[w]) == 1]))
brown_fd.most_common(5)

56057
47328


[('NN', 120867), ('IN', 119772), ('AT', 97957), ('JJ', 44546), ('NNS', 40235)]

In [171]:
# TODO: brown still needs to filtered here
print(len(set(brown_cfd.conditions()) - set(cfd.conditions())))
set(brown_cfd.conditions()) - set(cfd.conditions())

41189


{'Philosophies',
 'microns',
 "Maude's",
 'Bishopsgate',
 'Karns',
 'demoralization',
 'Feeling',
 "'20's",
 'sanitation',
 'propagandist',
 'intolerable',
 'purtiest',
 '4-0',
 'tarantara',
 '$2,490',
 'Matilda',
 'prowled',
 'nineteenth-century',
 'Mitch',
 'Generale',
 'a-gracious',
 '64',
 'diethylaminoethyl',
 'jitterbug',
 "hero's",
 'clubrooms',
 'Bismarck',
 "chauffeur's",
 'cabdriver',
 'insomma',
 'prevented',
 'Birkhead',
 'Socola',
 'sonorities',
 'freebooters',
 'Kluckhohn',
 'McCrady',
 'hunches',
 'unnatural',
 'wellknown',
 'three-hundred-foot',
 'secularist',
 'Y-cells',
 'whimpering',
 'overexpose',
 'scientifically',
 'vocational-advancement',
 'big-business',
 'musicianship',
 'conjectures',
 'surface-declaring',
 '2.3',
 'Jemela',
 'Chabrier',
 'inaccuracies',
 'auto-loaders',
 'atavistic',
 '$1.5',
 'bested',
 'Loan',
 'stained-glass',
 "Euclid's",
 'Ameaux',
 'bugeyed',
 'Rockefeller',
 'ADC',
 'Eddies',
 'self-mastery',
 'Donnay',
 'Germany',
 'trans-Atlantic',


In [7]:
from nltk.corpus import brown
# NN = NOUNS
# VB = VERBS (base form)
# JJ = ADJECTIVES
brown_tagged = brown.tagged_words()

In [12]:
# Most common tags (Word types, POS) in brown corpus
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_tagged if len(word) < 9)
tag_fd.most_common()

[('NN', 121824),
 ('IN', 119920),
 ('AT', 97959),
 ('.', 60638),
 (',', 58156),
 ('JJ', 45025),
 ('NNS', 40947),
 ('CC', 37718),
 ('VB', 31914),
 ('NP', 30552),
 ('RB', 29906),
 ('VBD', 23508),
 ('VBN', 22135),
 ('CS', 22132),
 ('PPS', 18253),
 ('PP$', 16872),
 ('TO', 14918),
 ('PPSS', 13802),
 ('CD', 13174),
 ('MD', 12431),
 ('VBG', 12385),
 ('PPO', 11181),
 ('NN-TL', 10578),
 ('BEZ', 10066),
 ('BEDZ', 9806),
 ('AP', 9499),
 ('DT', 8957),
 ('``', 8837),
 ("''", 8789),
 ('QL', 8011),
 ('VBZ', 6562),
 ('BE', 6360),
 ('RP', 6009),
 ('WDT', 5532),
 ('HVD', 4895),
 ('*', 4603),
 ('WRB', 4505),
 ('BER', 4379),
 ('HV', 3928),
 ('WPS', 3918),
 ('JJ-TL', 3523),
 ('--', 3405),
 ('NP-TL', 3369),
 ('BED', 3282),
 ('ABN', 3010),
 ('DTI', 2921),
 ('BEN', 2470),
 ('DTS', 2435),
 ('HVZ', 2433),
 (')', 2273),
 ('(', 2264),
 ('EX', 2164),
 ('JJR', 1940),
 ('PN', 1865),
 ('OD', 1807),
 ('NNS-TL', 1751),
 (':', 1558),
 ('NP$', 1550),
 ('IN-TL', 1477),
 ('NR', 1423),
 ('DO', 1353),
 ('PPL', 1233),
 ('RBR'

In [40]:
import nltk
def most_common_word_types(corpus, word_type):
    word_tag_fd = nltk.FreqDist(corpus.tagged_words())
    return [(word, occurences) for (word, occurences) in word_tag_fd.most_common() if word[1] == word_type]

In [86]:
def words_matching_only_one_pos(corpus, pos, pos2, pos3):
    tagged = corpus.tagged_words()
    cfd = nltk.ConditionalFreqDist(tagged)
    original_count = [w for w in cfd.conditions() if pos in cfd[w]]
    print("Original count: ", len(original_count))
    print("Examples: " + str(original_count[:5]))
    without_pos2 = [w for w in cfd.conditions() if pos in cfd[w] and pos2 not in cfd[w]]
    print("Without " + pos2 + ": ", len(without_pos2))
    with_pos2 = [(w, cfd[w].most_common()) for w in cfd.conditions() if pos in cfd[w] and pos2 in cfd[w]]
    print("Examples with " + pos2 + ": " + str(with_pos2[:5]))
    without_pos3 = [w for w in cfd.conditions() if pos in cfd[w] and pos2 not in cfd[w] and pos3 not in cfd[w]]
    print("Without " + pos3 + ": ", len(without_pos3))
    with_pos3 = [w for w in cfd.conditions() if pos in cfd[w] and pos2 in cfd[w] and pos3 in cfd[w]]
    print("Examples with " + pos3 + ": " + str(with_pos3[:5]))
    return without_pos2

In [87]:
wi = words_matching_only_one_pos(nltk.corpus.brown, "NN", "VB", "JJ")

Original count:  14128
Examples: ['snoring', 'demoralization', 'sanitation', 'reversal', 'gasket']
Without VB:  12970
Examples with VB: [('purge', [('NN', 1), ('VB', 1)]), ('whip', [('NN', 14), ('VB', 5)]), ('pan', [('NN', 12), ('NN-HL', 1), ('VB', 1)]), ('counsel', [('NN', 13), ('VB', 1)]), ('compromise', [('NN', 16), ('VB', 3)])]
Without JJ:  12516
Examples with JJ: ['blind', 'chance', 'black', 'abstract', 'duplicate']


In [96]:
[w for w in wi if '-' not in w]

['snoring',
 'demoralization',
 'sanitation',
 'reversal',
 'gasket',
 'pothole',
 'diethylaminoethyl',
 'jitterbug',
 'p(Q)',
 'complacency',
 'deadness',
 'ugliness',
 'faithful',
 'outset',
 'musicianship',
 'horsehair',
 'reinvestigation',
 'beadle',
 'ADC',
 'echelon',
 'burl',
 'deadweight',
 'insurance',
 'propagandist',
 'jackass',
 'elegy',
 'schnapps',
 'chaise',
 'sponsorship',
 'humiliation',
 'characteristic',
 'grist',
 'personification',
 'fortress',
 'psyche',
 'trilogy',
 'development',
 'nostalgia',
 'benediction',
 'walnut',
 'clay',
 'cleavage',
 'veldt',
 'twilight',
 'airmail',
 'frosting',
 'salesman',
 'indoctrinating',
 'bovine',
 'ion',
 'murderer',
 'tripod',
 'minority',
 'anchoritism',
 'Mc/sec',
 'contraband',
 'disapprobation',
 'easel',
 'reasoning',
 'mine',
 'Professor',
 'second',
 'citizenry',
 'rein',
 'pickaxe',
 'p',
 'warden',
 'midway',
 'potency',
 'trickle',
 'congestion',
 'octopus',
 'random',
 'shrinkage',
 'examiantion',
 'crowing',
 'nora

In [60]:
cfd = nltk.ConditionalFreqDist(brown_tagged)
[w for w in cfd.conditions() if 'NN' in cfd[w] and 'VB' in cfd[w]]

['purge',
 'whip',
 'pan',
 'counsel',
 'compromise',
 'hand',
 'split',
 'convert',
 'trust',
 'time',
 'profit',
 'scrape',
 'shock',
 'watch',
 'shop',
 'patrol',
 'pick',
 'span',
 'grasp',
 'collapse',
 'dot',
 'shuffle',
 'step',
 'Show',
 'swarm',
 'blind',
 'stress',
 'exhaust',
 'sun',
 'chance',
 'kiss',
 'land',
 'shape',
 'evidence',
 'promise',
 'plunge',
 'seat',
 'overhaul',
 'barrel',
 'root',
 'fork',
 'answer',
 'wash',
 'saw',
 'feather',
 'outline',
 'cheer',
 'strain',
 'darn',
 'black',
 'fold',
 'thud',
 'shed',
 'decline',
 'abandon',
 'pinch',
 'wear',
 'shore',
 'bridge',
 'file',
 'blot',
 'twist',
 'raise',
 'pattern',
 'die',
 'abstract',
 'duplicate',
 'round',
 'alternate',
 'wall',
 'request',
 'game',
 'hug',
 'drive',
 'glance',
 'return',
 'bump',
 'conduct',
 'roll',
 'pillage',
 'page',
 'Hope',
 'gun',
 'spur',
 'glisten',
 'content',
 'shift',
 'cool',
 'open',
 'travel',
 'phrase',
 'rear',
 'bypass',
 'rebel',
 'case',
 'knock',
 'pay',
 'cautio

In [61]:
cfd['purge'].most_common()

[('NN', 1), ('VB', 1)]

In [27]:
# Most common Nouns (NN)?
len(most_common_word_types(nltk.corpus.brown, "NN"))

14128

In [28]:
# Most common Adjectives (JJ)?
len(most_common_word_types(nltk.corpus.brown, "JJ"))

8063

In [39]:
# Most common Verbs (VB + VBD)?
vb = len(most_common_word_types(nltk.corpus.brown, "VB"))
vbd = len(most_common_word_types(nltk.corpus.brown, "VBD"))
vb + vbd

5236

In [126]:
# given that we have word 'foobar' with which tag
# does it appear most often?

[word for word in cfd1.conditions() if 'VBD' in cfd1[word] and 'VBN' in cfd1[word]]
idx1 = wsj.index(('followed', 'VBN'))
wsj[idx1-4:idx1+4]

NameError: name 'cfd1' is not defined

In [98]:
past_participles = [w for w in cfd2 if 'VBN' in cfd2[w]]

#### Show common words for particular parts-of-speech tags

In [None]:
def find_common_words(tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text)
    return dict((tag, cfd[tag].most_common()) for tag in cfd.conditions())

In [None]:
import nltk
tagdict = find_common_words(nltk.corpus.brown.tagged_words())
# only nouns (that is, don't include proper nouns)
tagdict['NN']