#### Copyright 2018 Google LLC.

In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Natural Language Understanding: WordNet

Please **make a copy** of this Colab notebook before starting this lab. To do so, choose **File**->**Save a copy in Drive**.

## Topics covered
  1. Synsets
  1. Lemmas and synonyms
  1. Word hierarchies
  1. Measuring similarities

One of the earliest attempts to create useful representations of meaning for language is [WordNet](https://en.wikipedia.org/wiki/WordNet) -- a lexical database of words and their relationships.

NLTK provides a [WordNet wrapper](http://www.nltk.org/howto/wordnet.html) that we'll use here.

In [10]:
import nltk
assert(nltk.download('wordnet'))  # Make sure we have the wordnet data.
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Synsets
The fundamental WordNet unit is a **synset**, specified by a word form, a part of speech, and an index. The synsets() function retrieves the synsets that match the given word. For example, there are 4 synsets for the word "surf", one of which is a noun (n) and three of which are verbs (v). WordNet provides a definition and sometimes glosses (examples) for each synset. **Polysemy**, by the way, means having multiple senses.

In [11]:
for s in wn.synsets('surf'):
    print (s);
    print ('\t', s.definition())
    print ('\t', s.examples())

Synset('surf.n.01')
	 waves breaking on the shore
	 []
Synset('surfboard.v.01')
	 ride the waves of the sea with a surfboard
	 ['Californians love to surf']
Synset('browse.v.03')
	 look around casually and randomly, without seeking anything in particular
	 ['browse a computer directory', 'surf the internet or the world wide web']
Synset('surf.v.03')
	 switch channels, on television
	 []


## Lemmas and synonyms
Each synset includes its corresponding **lemmas** (word forms).

We can construct a set of synonyms by looking up all the lemmas for all the synsets for a word.

In [12]:
synonyms = set()
 
for s in wn.synsets('triumphant'):
    for l in s.lemmas():
        synonyms.add(l.name())

print ('synonyms:', ', '.join(synonyms))

synonyms: victorious, jubilant, prideful, triumphal, rejoicing, triumphant, exulting, exultant


## Word hierarchies
WordNet organizes nouns and verbs into hierarchies according to **hypernym** or is-a relationships.

Let's examine the path from "rutabaga" to its root in the tree, "entity".

In [13]:
s = wn.synsets('rutabaga')

while s:
    print (s[0].hypernyms())
    s = s[0].hypernyms()

[Synset('turnip.n.02')]
[Synset('cruciferous_vegetable.n.01'), Synset('root_vegetable.n.01')]
[Synset('vegetable.n.01')]
[Synset('produce.n.01')]
[Synset('food.n.02')]
[Synset('solid.n.01')]
[Synset('matter.n.03')]
[Synset('physical_entity.n.01')]
[Synset('entity.n.01')]
[]


Actually, the proper way to do this is with a transitive closure, which repeatedly applies the specified function (in this case, hypernyms()).

In [14]:
hyper = lambda x: x.hypernyms()
s = wn.synset('rutabaga.n.01')
for i in list(s.closure(hyper)):
    print (i)
print    
ss = wn.synset('root_vegetable.n.01')
for i in list(ss.closure(hyper)):
    print (i)

Synset('turnip.n.02')
Synset('cruciferous_vegetable.n.01')
Synset('root_vegetable.n.01')
Synset('vegetable.n.01')
Synset('produce.n.01')
Synset('food.n.02')
Synset('solid.n.01')
Synset('matter.n.03')
Synset('physical_entity.n.01')
Synset('entity.n.01')
Synset('vegetable.n.01')
Synset('produce.n.01')
Synset('food.n.02')
Synset('solid.n.01')
Synset('matter.n.03')
Synset('physical_entity.n.01')
Synset('entity.n.01')


## Measuring similarity

WordNet's word hierarchies (for nouns and verbs) allow us to measure similarity in various ways.

Path similarity is defined as:

> $1 / (ShortestPathDistance(s_1, s_2) + 1)$

where $ShortestPathDistance(s_1, s_2)$ is computed from the hypernym/hyponym graph.



In [15]:
s1 = wn.synset('dog.n.01')
s2 = wn.synset('cat.n.01')
s3 = wn.synset('potato.n.01')

print (s1, '::', s1, s1.path_similarity(s1))
print (s1, '::', s2, s1.path_similarity(s2))
print (s1, '::', s3, s1.path_similarity(s3))
print (s2, '::', s3, s2.path_similarity(s3))

print ()

hyper = lambda x: x.hypernyms()

print(s1.hypernyms())

for i in list(s1.closure(hyper)):
    print (i)

Synset('dog.n.01') :: Synset('dog.n.01') 1.0
Synset('dog.n.01') :: Synset('cat.n.01') 0.2
Synset('dog.n.01') :: Synset('potato.n.01') 0.07142857142857142
Synset('cat.n.01') :: Synset('potato.n.01') 0.05263157894736842

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]
Synset('canine.n.02')
Synset('domestic_animal.n.01')
Synset('carnivore.n.01')
Synset('animal.n.01')
Synset('placental.n.01')
Synset('organism.n.01')
Synset('mammal.n.01')
Synset('living_thing.n.01')
Synset('vertebrate.n.01')
Synset('whole.n.02')
Synset('chordate.n.01')
Synset('object.n.01')
Synset('physical_entity.n.01')
Synset('entity.n.01')


## Takeaways

WordNet gives us ways to compare words and understand their relationships in a much more meaningful way than relying on the raw strings (sequences of characters). We know that 'cat' and 'dog', for example, are somewhat similar even though they have no string similarity. As a result, WordNet has been used in lots of practical applications over the years. However, WordNet has a few important shortcomings:

1. WordNet was built by people. This makes it hard to maintain as new words are added (e.g. 'iphone' isn't in WordNet) and definitions evolve. It also has limited language coverage. NLTK wraps Open Multilingual WordNet which includes 22 additional languages, but these are less extensive than the English WordNet. A fundamental question addressed by subsequent sections is: can we build WordNet-like resources automatically from text, of which there is an abundance?

1. WordNet, like any dictionary or thesaurus, represents the meaning of a word with its relationships to other words. That is, it lacks *grounding* in the real world. This is fine for people who have plenty of working knowledge of the world, who have seen and interacted with dogs and cats and potatoes, but would be much less helpful for aliens arriving on Earth for the first time. This deficiency, where language is only defined with respect to itself, and not with respect to images for example, is at the frontier of research in Natural Language Understanding.

## Quiz Questions

(1) Use the closure function to enumerate the **hyponyms** (the inverse of a hypernym) of 'root_vegetable.n.01'.

(2) We used the path_similarity function to compute the similarity between 'dog' and 'cat'. Use the hypernyms() function (see above) to find the path between these two words. Does the path similarity 0.2 make sense?

In [16]:
import nltk

nltk(nltk.download('all'))

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

TypeError: ignored

In [18]:
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
import nltk
import gensim
from nltk.corpus import gutenberg

from nltk.corpus import wordnet

model= gensim.models.Word2Vec(gutenberg.sents())
X= list(model.wv.vocab)
data=model.most_similar('science')
print(data)

[('disinterested', 0.8455296158790588), ('powerful', 0.839820146560669), ('contemptible', 0.8355181813240051), ('ambition', 0.8223700523376465), ('rare', 0.8219707608222961), ('scientific', 0.8184798359870911), ('romantic', 0.816957950592041), ('species', 0.8166199326515198), ('abominable', 0.8121931552886963), ('honourable', 0.8116869926452637)]


  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


# load pre-trained word-vectors from gensim-data
https://radimrehurek.com/gensim/models/keyedvectors.html

In [22]:
import gensim.downloader as api

import json
info = api.info()
print(json.dumps(info, indent=4))

{
    "corpora": {
        "semeval-2016-2017-task3-subtaskBC": {
            "num_records": -1,
            "record_format": "dict",
            "file_size": 6344358,
            "reader_code": "https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py",
            "license": "All files released for the task are free for general research use",
            "fields": {
                "2016-train": [
                    "..."
                ],
                "2016-dev": [
                    "..."
                ],
                "2017-test": [
                    "..."
                ],
                "2016-test": [
                    "..."
                ]
            },
            "description": "SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collect

In [23]:
word_vectors = api.load("glove-wiki-gigaword-300") 



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))
print(result)

queen: 0.6713
[('queen', 0.6713277101516724), ('princess', 0.5432624220848083), ('throne', 0.5386104583740234), ('monarch', 0.5347574949264526), ('daughter', 0.498025119304657), ('mother', 0.4956442713737488), ('elizabeth', 0.4832652509212494), ('kingdom', 0.47747087478637695), ('prince', 0.4668239951133728), ('wife', 0.4647327661514282)]


  if np.issubdtype(vec.dtype, np.int):


In [25]:
result = word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

print(result)

queen: 0.9199
[('queen', 0.9199351072311401), ('princess', 0.8403170108795166), ('throne', 0.8287888765335083), ('monarch', 0.8201609253883362), ('elizabeth', 0.8025429248809814), ('daughter', 0.7933654189109802), ('mother', 0.7825508117675781), ('kalākaua', 0.7787636518478394), ('kingdom', 0.777129590511322), ('wife', 0.7694059610366821)]


In [41]:
result = word_vectors.most_similar_cosmul(positive=[ 'paris', 'germany'], negative=['france'])
print("{}: {:.4f}".format(*result[0]))

print(result)

#France is to Paris as Germany is to:
sim = word_vectors.similar_by_vector(word_vectors['paris']-word_vectors['france']+word_vectors['germany'])
print("{}: {:.4f}".format(*sim[0]))

print(sim)

print(len(word_vectors['hof']))

berlin: 0.9907
[('berlin', 0.9906642436981201), ('frankfurt', 0.9340233206748962), ('munich', 0.8888449668884277), ('cologne', 0.8813262581825256), ('bonn', 0.8785525560379028), ('vienna', 0.8671213388442993), ('leipzig', 0.8619391322135925), ('hamburg', 0.8586714267730713), ('dresden', 0.8390956521034241), ('stuttgart', 0.8372994065284729)]
berlin: 0.8082
[('berlin', 0.8082348108291626), ('frankfurt', 0.718215823173523), ('germany', 0.6976348161697388), ('munich', 0.6616809368133545), ('cologne', 0.638824462890625), ('bonn', 0.6297187209129333), ('vienna', 0.6096600294113159), ('hamburg', 0.6015803813934326), ('leipzig', 0.5951980352401733), ('german', 0.5929442644119263)]
300


  if np.issubdtype(vec.dtype, np.int):


In [27]:
result = word_vectors.most_similar_cosmul(positive=[ 'human', 'dinosaur'], negative=['monkey'])
print("{}: {:.4f}".format(*result[0]))

print(result)

#monkey:human::dinosaur:[fossil, fossilized, Ice_Age_mammals, fossilization]
#Humans are fossilized monkeys? Humans are what's left
#over from mon![alt text](https://)keys? Humans are the species that beat monkeys
#just as Ice Age mammals beat dinosaurs? Plausible.

#building:architect::software:[programmer, SecurityCenter, WinPcap]

result = word_vectors.most_similar_cosmul(positive=[ 'architect', 'software'], negative=['building'])
print("{}: {:.4f}".format(*result[0]))

print(result)

fossils: 0.9267
[('fossils', 0.9267341494560242), ('commission', 0.902690052986145), ('fossil', 0.8995264768600464), ('fossilized', 0.8591017723083496), ('graves', 0.8443430066108704), ('rights', 0.8341718316078186), ('documented', 0.8333224058151245), ('grave', 0.8316300511360168), ('experts', 0.8258289694786072), ('skeletons', 0.8244320154190063)]
programmer: 1.0158
[('programmer', 1.0157649517059326), ('content-control', 0.8603453040122986), ('designer', 0.8441733717918396), ('animator', 0.8329057097434998), ('browser', 0.8276143670082092), ('netscape', 0.8257592916488647), ('illustrator', 0.8253690004348755), ('autodesk', 0.8252367377281189), ('inventor', 0.8230858445167542), ('novell', 0.8200629353523254)]


In [43]:
print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))
print(word_vectors.doesnt_match("audi bmw vw".split()))
print(word_vectors.doesnt_match("fiat skoda vw audi".split()))
print(word_vectors.doesnt_match("schalke meisterschaft bayern".split()))

cereal
vw
fiat
meisterschaft


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


In [29]:
result = word_vectors.similar_by_word("cat")
print("{}: {:.4f}".format(*result[0]))
print(result)

dog: 0.6817
[('dog', 0.6816747188568115), ('cats', 0.6815836429595947), ('pet', 0.5870364904403687), ('dogs', 0.540766716003418), ('feline', 0.48979705572128296), ('monkey', 0.48794347047805786), ('horse', 0.4732130467891693), ('pets', 0.4634858965873718), ('rabbit', 0.4608757495880127), ('leopard', 0.4585462808609009)]


  if np.issubdtype(vec.dtype, np.int):


In [0]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
#path = get_tmpfile("word2vec.model")

model2 = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
#model.save("word2vec.model")

In [42]:
print(list(model.wv.vocab))
print(list(model2.wv.vocab))


['human', 'interface', 'computer', 'survey', 'user', 'system', 'response', 'time', 'eps', 'trees', 'graph', 'minors']


In [32]:
vector_c = model2.wv['computer']



#print(vector_c)

print(model) 




vector_t = model2.wv['time']
#print(vector_t)

print(model2.similar_by_vector(vector_c))
print(model2.similar_by_vector(vector_t))



Word2Vec(vocab=17011, size=100, alpha=0.025)
[('computer', 1.0), ('trees', 0.18067461252212524), ('graph', 0.09885556995868683), ('user', 0.05702870339155197), ('human', 0.015458092093467712), ('minors', -0.006871828809380531), ('response', -0.08514246344566345), ('time', -0.09487903118133545), ('interface', -0.09714832156896591), ('system', -0.11603781580924988)]
[('time', 0.9999999403953552), ('response', 0.1709960550069809), ('human', 0.11712338030338287), ('eps', 0.0952422246336937), ('trees', 0.0767492949962616), ('interface', -0.019803009927272797), ('graph', -0.03488349914550781), ('system', -0.0816764086484909), ('computer', -0.09487904608249664), ('minors', -0.13383223116397858)]


  from ipykernel import kernelapp as app
  if np.issubdtype(vec.dtype, np.int):
  app.launch_new_instance()


In [33]:
vector_c = model.wv['mother']
vector_t = model.wv['woman']

#print(model.similar_by_vector(vector_t-vector_c))


#print(model.wv.most_similar('woman'))
print(model.wv.most_similar_cosmul(['woman', 'king'],['man']))



[('daughter', 0.912455141544342), ('son', 0.9064741134643555), ('chamberlains', 0.8922019600868225), ('captain', 0.8856467008590698), ('queen', 0.8789352178573608), ('Solomon', 0.8704864978790283), ('Joseph', 0.8667926788330078), ('Rachel', 0.8635896444320679), ('Amoz', 0.8603639602661133), ('scribe', 0.8556996583938599)]
