# Playing around

In [1]:
import nltk


In [2]:
from nltk.corpus import udhr
# Languages to compare; each one is read from its "<name>-Latin1" file in the
# UDHR corpus.
languages = [
    'Chickasaw', 'English', 'German_Deutsch',
    'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik',
]

# Condition = language, sample = length of each word in that language's text.
lang_wordlen_pairs = (
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1')
)
cfd = nltk.ConditionalFreqDist(lang_wordlen_pairs)
cfd.plot(cumulative=True)

In [3]:
# The raw text is one long string, so iterating over it yields single
# characters: this frequency distribution counts characters, not words.
raw_text = udhr.raw('Hungarian_Magyar-Latin1')
raw_fd = nltk.FreqDist(char for char in raw_text)
raw_fd.plot()

In [14]:
from nltk.corpus import PlaintextCorpusReader

# Load every file below the local 'texts' directory as a plain-text corpus.
# The path is relative, so it is resolved against the current working directory.
corpus_root = 'texts'
wordlists = PlaintextCorpusReader(corpus_root, '.*')

# Only the last expression of a cell is displayed, so these two look-ups
# produce no visible output here.
wordlists.fileids()
wordlists.words('aranyszoru_barany.txt')

# Condition = file id, sample = word length.
file_wordlen_pairs = (
    (fileid, len(word))
    for fileid in wordlists.fileids()
    for word in wordlists.words(fileid)
)
cfd = nltk.ConditionalFreqDist(file_wordlen_pairs)
cfd.plot(cumulative=True)

Working with the news and romance genres from the Brown Corpus, find out which days of the week are most newsworthy, and which are most romantic. Define a variable called days containing a list of days of the week, i.e. ['Monday', ...]. Now tabulate the counts for these words using cfd.tabulate(samples=days). Now try the same thing using plot in place of tabulate. You may control the output order of days with the help of an extra parameter: samples=['Monday', ...].

In [25]:
from nltk.corpus import brown

# Days of the week in calendar order; reused as the `samples` argument when
# tabulating/plotting so the columns come out in this order.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday']

# (genre, word) pairs for the two Brown genres being compared.
genre_word = list(
    (genre, word)
    for genre in ['news', 'romance']
    for word in brown.words(categories=genre)
)

# Condition = genre, sample = word.
cfd = nltk.ConditionalFreqDist(genre_word)

In [22]:
# Show the weekday counts per genre, first as a table and then as a plot.
# Passing samples=days restricts the output to the weekday names and fixes
# their order. Relies on `cfd` and `days` defined in the previous cell.
cfd.tabulate(samples = days)
cfd.plot(samples = days)

           Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
   news        54        43        22        20        41        33        51 
romance         2         3         3         1         3         4         5 


## Generating random text with bigrams

In [35]:
def generate_model(cfdist, word, num = 15):
    """Print a chain of up to ``num`` words by greedily following ``cfdist``.

    Starting from ``word``, each step prints the current word followed by a
    space and then moves on to ``cfdist[word].max()``. Because the choice is
    always the same for a given word, the chain can settle into a short cycle.

    Args:
        cfdist: mapping from a word to a frequency distribution of the words
            that follow it (e.g. an ``nltk.ConditionalFreqDist`` of bigrams).
        word: the seed word; it is printed first.
        num: maximum number of words to print.

    Returns:
        None. The words are written to standard output.
    """
    for _ in range(num):
        print(word, end = ' ')
        successors = cfdist[word]
        # A word with no recorded successor (an unknown seed, or a word that
        # only occurs at the very end of the text) has an empty distribution,
        # and max() on it raises ValueError. End the chain there instead.
        if not successors:
            break
        word = successors.max()

# Word tokens of the German file in NLTK's Genesis corpus.
text = nltk.corpus.genesis.words('german.txt')
# Pairs of adjacent tokens (w1, w2).
bigrams = nltk.bigrams(text)
# Condition = first word of each bigram, sample = the word that follows it.
cfd = nltk.ConditionalFreqDist(bigrams)

# Starting from 'Gott', repeatedly print the current word and move on to its
# most frequent successor (see the repetition in the output below).
generate_model(cfd, 'Gott')

Gott der HERR , und sprach : Ich will ich will ich will ich will 

In [26]:
# List the files available in the Genesis corpus.
nltk.corpus.genesis.fileids()

['english-kjv.txt',
 'english-web.txt',
 'finnish.txt',
 'french.txt',
 'german.txt',
 'lolcat.txt',
 'portuguese.txt',
 'swedish.txt']

## WordNet

Write down all the senses of the word dish that you can think of. Now, explore this word with the help of WordNet, using the same operations we used above.

In [2]:
from nltk.corpus import wordnet as wn

In [9]:
# For every sense of "dish": the synset, its lemma names and its definition,
# one per line, followed by a blank line.
for synset in wn.synsets('dish'):
    print(synset, synset.lemma_names(), synset.definition(), '', sep='\n')

Synset('dish.n.01')
['dish']
a piece of dishware normally used as a container for holding or serving food

Synset('dish.n.02')
['dish']
a particular item of prepared food

Synset('dish.n.03')
['dish', 'dishful']
the quantity that a dish will hold

Synset('smasher.n.02')
['smasher', 'stunner', 'knockout', 'beauty', 'ravisher', 'sweetheart', 'peach', 'lulu', 'looker', 'mantrap', 'dish']
a very attractive or seductive looking woman

Synset('dish.n.05')
['dish', 'dish_aerial', 'dish_antenna', 'saucer']
directional antenna consisting of a parabolic reflector for microwave or radio frequency radiation

Synset('cup_of_tea.n.01')
['cup_of_tea', 'bag', 'dish']
an activity that you like or at which you are superior

Synset('serve.v.06')
['serve', 'serve_up', 'dish_out', 'dish_up', 'dish']
provide (usually but not necessarily food)

Synset('dish.v.02')
['dish']
make concave; shape like a dish



In [16]:
# Hyponyms (more specific kinds) of 'dish.n.02', the "item of prepared food" sense.
types_of_dishes = wn.synset('dish.n.02').hyponyms()
types_of_dishes[0]
# All lemma names across those hyponym synsets, in alphabetical order.
sorted(
    lemma_name
    for synset in types_of_dishes
    for lemma_name in synset.lemma_names()
)

Synset('adobo.n.01')

What is a boxer?

In [21]:
# For every sense of "boxer": definition, lemma names and synset name,
# one per line, followed by a blank line.
for synset in wn.synsets('boxer'):
    print(synset.definition(), synset.lemma_names(), synset.name(), '', sep='\n')

someone who fights with his fists for sport
['boxer', 'pugilist']
boxer.n.01

a workman employed to pack things into containers
['packer', 'bagger', 'boxer']
packer.n.01

a member of a nationalistic Chinese secret society that led an unsuccessful rebellion in 1900 against foreign interests in China
['Boxer']
boxer.n.03

a breed of stocky medium-sized short-haired dog with a brindled coat and square-jawed muzzle developed in Germany
['boxer']
boxer.n.04



In [24]:
# Two senses of "boxer" (see the definitions printed by the previous cell):
# boxer.n.04 is the dog breed, boxer.n.01 is the person who fights for sport.
boxer = wn.synset('boxer.n.04')
fighter = wn.synset('boxer.n.01')

# The most specific synset(s) both senses descend from, and their
# path-based similarity score.
print(boxer.lowest_common_hypernyms(fighter))
print(boxer.path_similarity(fighter))

[Synset('organism.n.01')]
0.1111111111111111


# Exercises

2\. Use the corpus module to explore austen-persuasion.txt. How many word tokens does this book have? How many word types?

In [5]:
# Word tokens of Austen's "Persuasion" from the Gutenberg corpus.
persuasion = nltk.corpus.gutenberg.words('austen-persuasion.txt')

# Tokens = every item in the word list; types = distinct items.
# The comparison is case-sensitive.
print("No. of words: %d" % len(persuasion))
print("No. of word types: %d" % len(set(persuasion)))

# concordance() prints its matches itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
nltk.Text(persuasion).concordance('surprise')

No. of words: 98171
No. of word types: 6132
Displaying 23 of 23 matches:
re upon , that it will not greatly surprise me if , with all our caution , som
was the reply , and with a look of surprise . " Yes ; it is in two points offe
t all hours , that it was rather a surprise to her to find Mary alone ; but be
 , food , hours , & c ., and their surprise at his accounts , at learning the 
 , who was lying on the sofa . The surprise of finding himself almost alone wi
rt of the street ; but his evident surprise and vexation at the substitution o
h amusement at his little start of surprise , that he had not been at all awar
think differently ; and it did not surprise her , therefore , that Lady Russel
 and , to quicken the pleasure and surprise , with Admiral and Mrs Croft ' s c
you shall hear something that will surprise you . But first of all , you must 
ildering , first effects of strong surprise were over with her . Still , howev
 to feign that he was . It did not surprise , but it griev

4\. Read in the texts of the State of the Union addresses, using the state_union corpus reader. Count occurrences of men, women, and people in each document. What has happened to the usage of these words over time?

In [14]:
from nltk.corpus import state_union

# Target words to count, compared case-insensitively.
words = ['men','women','people']

# Condition = target word (lower-cased), sample = first four characters of the
# file id (the year, judging by the tabulated output below).
cfd = nltk.ConditionalFreqDist(
    (token.lower(), fileid[:4])
    for fileid in state_union.fileids()
    for token in state_union.words(fileid)
    if token.lower() in words)

cfd.plot()

       1945 1946 1947 1948 1949 1950 1951 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 
   men    2   12    7    5    2    6    8    3    2    4    2    5    2    4    2    6    6    8    3   19   12   11    4    5    2    1    1    1    0    0    3    2    0    0    1    1    1    3    3    1    2    1    1    2    3    9    4    1    1    1    2    1    2    2    5    4    3    6    7    8    7 
people   10   49   12   22   15   15   10   17   15   26   30   11   19   11   10   10   10   15    3   30   35   25   17    6   23   32    7    9   20   14   18   19   26   15   12   11   17   19   27   12   14   24   17   13    9   27   27   45   66   73   43   31   22   22   41   27   14   33   21   18   22 
 women    2    7    2    1    1    2    2    0    0    0    2

5\. Investigate the holonym-meronym relations for some nouns. Remember that there are three kinds of holonym-meronym relation, so you need to use: member_meronyms(), part_meronyms(), substance_meronyms(), member_holonyms(),  part_holonyms(), and substance_holonyms().

In [56]:
# Despite the variable name, this is a synset identifier, not a lemma.
my_lemma = 'toe.n.01'
my_synset = wn.synset(my_lemma)
print(my_synset.definition())

# The three meronym relations first, then the three holonym relations,
# each in the order part / substance / member.
for relation in (my_synset.part_meronyms,
                 my_synset.substance_meronyms,
                 my_synset.member_meronyms,
                 my_synset.part_holonyms,
                 my_synset.substance_holonyms,
                 my_synset.member_holonyms):
    print(relation())


one of the digits of the foot
[Synset('tiptoe.n.01'), Synset('toenail.n.01')]
[]
[]
[Synset('foot.n.01')]
[]
[]


6\. In the discussion of comparative wordlists, we created an object called translate which you could look up using words in both German and Spanish in order to get corresponding words in English. What problem might arise with this approach? Can you suggest a way to avoid this problem?

In [64]:
from nltk.corpus import swadesh

# Spanish -> German and English -> German word pairs from the Swadesh lists.
es2de = swadesh.entries(['es','de'])
en2de = swadesh.entries(['en','de'])

# Merge both into a single look-up table. On a key that occurs in both lists
# the later (English) pair replaces the earlier (Spanish) one.
translate = dict(es2de)
translate.update(en2de)

# 'animal' is such a shared key: only one of its translations survives.
print("Original: %s" % translate['animal'])

Original: Tier


In [80]:
# Ways around the key collision: keep one dictionary per language, prefix or
# suffix the keys with a language code, or - as done here - map each key to a
# list of (source language, German word) pairs.

es2de = [(source, [('es', german)]) for source, german in swadesh.entries(['es','de'])]
en2de = [(source, [('en', german)]) for source, german in swadesh.entries(['en','de'])]

# Start from the Spanish entries, then append the English ones so that a
# shared key ends up with one tagged translation per language.
langspec_translate = dict(es2de)
for source, tagged in en2de:
    langspec_translate.setdefault(source, []).extend(tagged)

# Per the output below, 'yo' has a single translation, tagged 'es'.
print(langspec_translate['yo'])

[('es', 'ich')]


7\. According to Strunk and White's Elements of Style, the word however, used at the start of a sentence, means "in whatever way" or "to whatever extent", and not "nevertheless". They give this example of correct usage: However you advise him, he will probably do as he thinks best. (http://www.bartleby.com/141/strunk3.html) Use the concordance tool to study actual usage of this word in the various texts we have been considering. See also the LanguageLog posting "Fossilized prejudices about 'however'" at  http://itre.cis.upenn.edu/~myl/languagelog/archives/001913.html

In [85]:
from nltk.corpus import gutenberg

# Wrap each book's tokens in nltk.Text to get the concordance tool.
emma = nltk.Text(gutenberg.words('austen-emma.txt'))
persuasion = nltk.Text(gutenberg.words('austen-persuasion.txt'))
hamlet = nltk.Text(gutenberg.words('shakespeare-hamlet.txt'))

# Same report for every book: title, blank line, concordance, blank line.
for book_title, book_text in (('Emma', emma),
                              ('Persuasion', persuasion),
                              ('Hamlet', hamlet)):
    print(book_title)
    print()
    book_text.concordance("however")
    print()


Emma

Displaying 25 of 131 matches:
 her many enjoyments . The danger , however , was at present so unperceived , t
ion would offend . Miss Churchill , however , being of age , and with the full 
n . From the expense of the child , however , he was soon relieved . The boy ha
 -- and been very well brought up . However , I do not mean to set up my opinio
f and predict . It was not likely , however , that any body should have equalle
to be borne . We will not despair , however . Weston may grow cross from the wa
is so very handsome and agreeable . However , I do really think Mr . Martin a v
 accepted after all . This letter , however , was written , and sealed , and se
e him ." " And if I did , ( which , however , I am far from allowing ) I should
 slightingly . Waiving that point , however , and supposing her to be , as you 
e was not so materially cast down , however , but that a little time and the re
ld inspire him ." The very next day however produced some proof of inspiration 
and 

8\. Define a conditional frequency distribution over the Names corpus that allows you to see which initial letters are more frequent for males vs. females (cf. 4.4).

In [91]:
names = nltk.corpus.names

# Number of names in each file of the corpus.
names_per_file = [len(names.words(fileid)) for fileid in names.fileids()]
print(names_per_file)

# Condition = file id, sample = first letter of each name.
initial_pairs = (
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)

cfd.plot()
# Observation: the initial letters are far from uniformly distributed.


[5001, 2943]


9\. Pick a pair of texts and study the differences between them, in terms of vocabulary, vocabulary richness, genre, etc. Can you find pairs of words which have quite different meanings across the two texts, such as monstrous in Moby Dick and in Sense and Sensibility?

In [37]:
from nltk.corpus import gutenberg
import pandas as pd

def vocab_size(words):
    """Return the number of distinct items in ``words`` (case-sensitive)."""
    distinct = set(words)
    return len(distinct)

def vocab_richness(words):
    """Return the type/token ratio of ``words``.

    Args:
        words: a sized, iterable collection of hashable tokens.

    Returns:
        Number of distinct tokens divided by the total number of tokens, as a
        float. An empty collection yields 0.0 instead of raising
        ZeroDivisionError.
    """
    n_words = len(words)
    if n_words == 0:
        # No tokens at all: report zero richness rather than dividing by zero.
        return 0.0
    return len(set(words)) / n_words

def mean_sent_len(sentences):
    """Return the average number of tokens per sentence.

    Args:
        sentences: a sized, iterable collection of sentences, each of which
            supports ``len()`` (e.g. a list of token lists).

    Returns:
        Total token count divided by the number of sentences, as a float.
        An empty collection yields 0.0 instead of raising ZeroDivisionError.
    """
    n_sents = len(sentences)
    if n_sents == 0:
        # No sentences at all: report a mean of zero rather than dividing by zero.
        return 0.0
    total_n_words = sum(len(sent) for sent in sentences)
    return total_n_words / n_sents

def count_in_text(text, pattern):
    """Return how many items of ``text`` are equal to ``pattern``.

    The comparison is exact equality (case-sensitive for strings); ``pattern``
    is not treated as a regular expression.
    """
    return sum(1 for token in text if token == pattern)

In [22]:


# The two Gutenberg texts to compare, with short display names.
my_files = ['austen-sense.txt', 'melville-moby_dick.txt']
my_names = ['sense','moby_dick']

df = pd.DataFrame({'name': my_names, 'file': my_files})

# One list per statistic, filled in the same order as the rows of `df`.
v_vocab_size, v_vocab_richness, v_mean_sent_len = [], [], []

for fileid in df['file']:
    words = nltk.Text(gutenberg.words(fileid))
    sents = nltk.Text(gutenberg.sents(fileid))

    v_vocab_size += [vocab_size(words)]
    v_vocab_richness += [vocab_richness(words)]
    v_mean_sent_len += [mean_sent_len(sents)]

df['vocab_size'] = v_vocab_size
df['vocab_richness'] = v_vocab_richness
df['mean_sent_len'] = v_mean_sent_len

df

Unnamed: 0,file,name,vocab_size,vocab_richness,mean_sent_len
0,austen-sense.txt,sense,6833,0.048264,28.329066
1,melville-moby_dick.txt,moby_dick,19317,0.074063,25.932896


In [27]:
# Token sequences of the two novels.
sense = nltk.Text(gutenberg.words('austen-sense.txt'))
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))

# Word types that occur in both books (case-sensitive).
overlapping_words = set(sense) & set(moby)

In [42]:
# Take ten of the shared words. NOTE(review): a set has no defined order, so
# which ten words are picked can differ between kernel sessions.
word_table = pd.DataFrame({'word': list(overlapping_words)}).head(10)

# Occurrences of each selected word in either book.
n_sense = word_table['word'].map(lambda w: count_in_text(sense, w))
n_moby = word_table['word'].map(lambda w: count_in_text(moby, w))

word_table['sense'] = n_sense
word_table['moby'] = n_moby

word_table

Unnamed: 0,word,sense,moby
0,office,6,8
1,heath,1,1
2,connected,6,12
3,progress,3,4
4,temporary,3,11
5,wild,4,82
6,familiarly,2,5
7,shoulder,2,12
8,mourning,1,2
9,speculative,1,1


In [50]:
# Compare how the same word is used in the two novels.
# Relies on `sense` and `moby` (nltk.Text objects) from an earlier cell.
word = 'engaged'

# concordance() prints its matches itself and returns None, so wrapping each
# call in print() only added a stray "None" line after every listing.
sense.concordance(word)
moby.concordance(word)

Displaying 25 of 42 matches:
everal weeks in the house before he engaged much of Mrs . Dashwood ' s attentio
he truth . " And you really are not engaged to him !" said she . " Yet it certa
ven to Norland half its charms were engaged in again with far greater enjoyment
vided attention where his heart was engaged , and in slighting too easily the f
ment she doubted not of their being engaged to each other ; and the belief of i
ered her mind of their being really engaged , and this doubt was enough to prev
k , to call on Lady Middleton ." He engaged to be with them by four o ' clock .
ceal their engagement ( if they ARE engaged ) from Mrs . Smith -- and if that i
he , " whether she is or she is not engaged to Willoughby ? From you , her moth
osing it possible that they are not engaged , what distress would not such an e
silent , till a new object suddenly engaged her attention . She was sitting by 
Lady Middleton ' s good opinion was engaged in their favour before they had bee
 of long st

10\. Read the BBC News article: UK's Vicky Pollards 'left behind' http://news.bbc.co.uk/1/hi/education/6173441.stm. The article gives the following statistic about teen language: "the top 20 words used, including yeah, no, but and like, account for around a third of all words." How many word types account for a third of all word tokens, for a variety of text sources? What do you conclude about this statistic? Read more about this on LanguageLog, at  http://itre.cis.upenn.edu/~myl/languagelog/archives/003993.html.

In [82]:
from nltk.corpus import nps_chat

def count_words_in_top(words, percentage):
    """Count how many of the most frequent word types cover a share of all tokens.

    Tokens of length 1 are discarded first (this drops single-character
    punctuation, but also one-letter words); the remaining tokens are counted
    case-insensitively. Every selected word type is printed, most frequent
    first.

    Args:
        words: iterable of string tokens.
        percentage: share of the filtered tokens to cover, given as a fraction
            (0.3 means 30 %), despite the parameter name.

    Returns:
        The number of top word types whose combined count first reaches
        ``percentage`` of the filtered tokens. If that share cannot be reached
        (``percentage`` greater than 1), the total number of word types.
    """
    words = [w for w in words if len(w) > 1]
    fd = nltk.FreqDist(w.lower() for w in words)
    word_list = sorted(fd, key = fd.get, reverse = True)

    n_target = percentage * len(words)
    n_sofar = 0
    i = 0
    # Stop at the end of the vocabulary as well: without this bound a target
    # above 100 % indexed past the end of word_list and raised IndexError.
    while n_sofar < n_target and i < len(word_list):
        print(word_list[i])
        n_sofar = n_sofar + fd[word_list[i]]
        i = i + 1

    return i
    
def count_perc(words, percentage):
    """Return ``(n_top, n_top / n_types)`` for the given tokens.

    ``n_top`` is the result of ``count_words_in_top(words, percentage)``;
    ``n_types`` is the number of distinct items in ``words``.

    NOTE(review): ``n_types`` is case-sensitive and includes one-character
    tokens, whereas ``count_words_in_top`` lower-cases and drops them, so the
    ratio mixes two definitions of "word type" - confirm this is intended.
    """
    n_top = count_words_in_top(words, percentage)
    n_types = len(set(words))
    return (n_top, n_top / n_types)

# Chat-room tokens from the NPS Chat corpus.
chat = nps_chat.words()

# How many top word types cover 30 % of the tokens in each source?
# `sense` and `moby` come from an earlier cell.
for corpus_tokens in (chat, sense, moby):
    print(count_perc(corpus_tokens, 0.3))

part
join
lol
you
to
the
hi
me
...
is
in
..
and
it
action
hey
that
my
of
what
's
for
on
here
no
do
are
not
have
....
all
(31, 0.00511045169798879)
to
the
of
and
her
in
was
it
she
that
be
for
not
as
you
he
his
had
(18, 0.0026342748426752523)
the
of
and
to
in
that
his
it
he
but
as
is
with
was
for
all
this
at
whale
by
not
(21, 0.0010871253300201895)


11\. Investigate the table of modal distributions and look for other patterns. Try to explain them in terms of your own impressionistic understanding of the different genres. Can you find other closed classes of words that exhibit significant differences across different genres?

In [83]:
from nltk.corpus import brown

# Condition = Brown genre, sample = word. All categories are counted, although
# only a subset is tabulated below.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

# Rows and columns of the table, in display order.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 
