In [1]:
from pprint import pprint, PrettyPrinter
pp = PrettyPrinter(indent=4, width=100).pprint # pretty-print shortcut; on Python 3.8+ sort_dicts=False can also be passed to PrettyPrinter

# WordNet

### Полезные ссылки:
Главный сайт проекта: https://wordnet.princeton.edu/

**ДОКУМЕНТАЦИЯ**: WordNet через nltk: http://www.nltk.org/howto/wordnet.html

In [2]:
import nltk

In [3]:
%%time
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Wall time: 317 ms


True

In [142]:
from nltk.corpus import wordnet as wn

Ищем все синсеты, в которые входит слово (лемма) "dog":

In [5]:
%%time
dog_synsets = wn.synsets('dog')
print (dog_synsets)

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
Wall time: 2.97 s


In [6]:
pp(dog_synsets)

[   Synset('dog.n.01'),
    Synset('frump.n.01'),
    Synset('dog.n.03'),
    Synset('cad.n.01'),
    Synset('frank.n.02'),
    Synset('pawl.n.01'),
    Synset('andiron.n.01'),
    Synset('chase.v.01')]


Можно уточнить, какие именно части речи нас интересуют. Возможные варианты: NOUN, ADJ, ADV, VERB

In [8]:
dog_noun_synsets = wn.synsets('dog', pos=wn.NOUN)
pp (dog_noun_synsets)

[   Synset('dog.n.01'),
    Synset('frump.n.01'),
    Synset('dog.n.03'),
    Synset('cad.n.01'),
    Synset('frank.n.02'),
    Synset('pawl.n.01'),
    Synset('andiron.n.01')]


Доступ ко всем синсетам и ко всем словам:

In [9]:
%%time
# Count the entries by exhausting each iterator, without building a list first.
print(sum(1 for _ in wn.all_synsets()))         # all synsets
print(sum(1 for _ in wn.all_synsets('v')))      # synsets selected with 'v'
print(sum(1 for _ in wn.all_lemma_names('a')))  # lemma names selected with 'a'

117659
13767
21479
Wall time: 13.7 s


Про синсет мы можем узнать: его имя (ID синсета), определение, ID относящихся к нему лемм, сами леммы; посмотреть примеры (если они есть) 

In [11]:
dog_exemplar = wn.synset('dog.n.01')
pp ([dog_exemplar.name(), dog_exemplar.definition(), dog_exemplar.lemmas(), dog_exemplar.lemma_names(),
       dog_exemplar.examples()])

[   'dog.n.01',
    'a member of the genus Canis (probably descended from the common wolf) that has been '
    'domesticated by man since prehistoric times; occurs in many breeds',
    [Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')],
    ['dog', 'domestic_dog', 'Canis_familiaris'],
    ['the dog barked all night']]


Отношения между синсетами

In [12]:
pprint (dog_exemplar.hyponyms(), compact=True, width=100)
print (dog_exemplar.hypernyms())
print (dog_exemplar.root_hypernyms())
print (dog_exemplar.member_holonyms())
print (dog_exemplar.member_meronyms())
print (dog_exemplar.similar_tos())

[Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), Synset('pug.n.01'),
 Synset('puppy.n.01'), Synset('spitz.n.01'), Synset('toy_dog.n.01'), Synset('working_dog.n.01')]
[Synset('canine.n.02'), Synset('domestic_animal.n.01')]
[Synset('entity.n.01')]
[Synset('canis.n.01'), Synset('pack.n.06')]
[]
[]


Ближайший общий гипероним

In [14]:
print(wn.synset('person.n.01').lowest_common_hypernyms(wn.synset('dog.n.01')))

[Synset('organism.n.01')]


Близость синсетов: <br>
path_similarity - оценивает близость синсетов по длине кратчайшего пути между ними в иерархии гиперонимов. <br>
Значение - от 0 до 1, где 1 - максимальная степень близости (синсет сравнивается сам с собой).

In [15]:
%%time
print(wn.synset('dog.n.01').path_similarity(wn.synset('cat.n.01')))
print(wn.synset('person.n.01').path_similarity(wn.synset('cat.n.01')))
print(wn.synset('dog.n.01').path_similarity(wn.synset('dog.n.01')))

0.2
0.1
1.0
Wall time: 4.54 ms


Деривационные отношения и отношение антонимии определены только для лемм:

In [16]:
%%time
# For every lemma returned for 'personal', print its name followed by
# three lemma-level relations, then a blank separator line.
for lemma in wn.lemmas('personal'):
    print(lemma.name())
    for label, related in (
        ('Pertainyms:', lemma.pertainyms()),
        ('Antonyms:', lemma.antonyms()),
        ('Derivationally related forms:', lemma.derivationally_related_forms()),
    ):
        print(label, related)
    print()

personal
Pertainyms: []
Antonyms: []
Derivationally related forms: []

personal
Pertainyms: []
Antonyms: [Lemma('impersonal.a.01.impersonal')]
Derivationally related forms: []

personal
Pertainyms: []
Antonyms: []
Derivationally related forms: []

personal
Pertainyms: [Lemma('personality.n.01.personality')]
Antonyms: []
Derivationally related forms: [Lemma('personality.n.01.personality')]

personal
Pertainyms: []
Antonyms: []
Derivationally related forms: []

personal
Pertainyms: [Lemma('person.n.03.person')]
Antonyms: []
Derivationally related forms: []

Wall time: 4.68 ms


## Open Multilingual Wordnet (многоязычный WordNet)

http://compling.hss.ntu.edu.sg/omw/ <br>
Условные обозначения языков: трёхбуквенные коды ISO 639-3

In [17]:
sorted(wn.langs()), len(wn.langs())

(['als',
  'arb',
  'bul',
  'cat',
  'cmn',
  'dan',
  'ell',
  'eng',
  'eus',
  'fas',
  'fin',
  'fra',
  'glg',
  'heb',
  'hrv',
  'ind',
  'ita',
  'jpn',
  'nld',
  'nno',
  'nob',
  'pol',
  'por',
  'qcn',
  'slv',
  'spa',
  'swe',
  'tha',
  'zsm'],
 29)

In [18]:
%%time
print (dog_exemplar.lemma_names('fra'))
print (dog_exemplar.lemma_names('hrv'))
print (dog_exemplar.lemma_names('jpn'))

['canis_familiaris', 'chien']
['Canis_lupus_familiaris', 'domaći_pas', 'pas']
['イヌ', 'ドッグ', '洋犬', '犬', '飼犬', '飼い犬']
Wall time: 1.83 s


# FrameNet

Главный сайт проекта: https://framenet2.icsi.berkeley.edu

**ДОКУМЕНТАЦИЯ**: FrameNet через NLTK: http://www.nltk.org/howto/framenet.html

In [19]:
%%time
nltk.download('framenet_v17')

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!
Wall time: 3.94 s


True

In [20]:
from nltk.corpus import framenet as fn

Все фреймы:

In [21]:
%%time
print (fn.frames(), len(fn.frames()))

[<frame ID=2031 name=Abandonment>, <frame ID=262 name=Abounding_with>, ...] 1221
Wall time: 2.23 s


Все фреймы, в названии которых есть подстрока 'event' (аргумент - регулярное выражение; регистр учитывается):

In [23]:
for frame in fn.frames('event'):
    print (frame.name)

Change_event_duration
Change_event_time
Desirable_event
Historic_event
Locale_by_event
Prevent_or_allow_possession
Preventing_or_letting
Required_event
Social_event
Social_event_collective
Social_event_individuals


Все лексические единицы (LU):

In [24]:
print (fn.lus(), len(fn.lus()))

[<lu ID=16601 name=(can't) help.v>, <lu ID=14632 name=(in/out of) line.n>, ...] 13572


Каждый фрейм - это словарь. Заглянем внутрь фрейма Historic_event:

In [28]:
frame_HistEvent = fn.frame('Historic_event')
print (frame_HistEvent)

frame (1908): Historic_event

[URL] https://framenet2.icsi.berkeley.edu/fnReports/data/frame/Historic_event.xml

[definition]
  In the course of history, an Event or Entity is taken to have
  importance or significance.  'Throughout the campaign activists
  have made financial history as one by one major corporations have
  yielded to protester power'  'The conference was historic for
  Atlanta's growth as a city.'  'Many of the historic sites offer
  additional outdoor recreation activities.'  'The James River is
  arguably the most historic river in the country and one of the
  most important rivers in the Southeast.'  'Take in the history,
  the sawdust-covered floors, and the legendary backroom where the
  ale flowed during Prohibition.'

[semTypes] 0 semantic types

[frameRelations] 3 frame relations
  <Parent=Eventive_affecting -- Inheritance -> Child=Historic_event>
  <Complex=Individual_history -- Subframe -> Component=Historic_event>
  <Parent=Importance -- Using -> Child=Hist

FE и lexUnit - тоже словари:

In [29]:
print (frame_HistEvent.FE)

[Event] frame element (11417): Event
    of Historic_event(1908)
[definition]
  This FE identifies the event which occurs to create history.
[abbrev] Evnt
[coreType] Core
[requiresFE] <None>
[excludesFE] <None>
[semType] 
  State_of_affairs(177)

[Place] frame element (11418): Place
    of Historic_event(1908)
[definition]
  This FE identifies where the event takes place.
[abbrev] Place
[coreType] Peripheral
[requiresFE] <None>
[excludesFE] <None>
[semType] 
  Locative_relation(182)

[Time] frame element (11419): Time
    of Historic_event(1908)
[definition]
  This FE identifies the time when the event occurs.
[abbrev] Time
[coreType] Peripheral
[requiresFE] <None>
[excludesFE] <None>
[semType] 
  Time(141)

[Explanation] frame element (11420): Explanation
    of Historic_event(1908)
[definition]
  This FE identifies the Explanation for which an event occurs.
[abbrev] Exp
[coreType] Extra-Thematic
[requiresFE] <None>
[excludesFE] <None>
[semType] 
  State_of_affairs(177)

[Entity] fram

Способы обратиться к элементам фрейма (FE) \[потому что это словарь\]

In [32]:
print (frame_HistEvent.FE.Event.definition)
print (frame_HistEvent['FE']['Event']['definition'])

This FE identifies the event which occurs to create history.
This FE identifies the event which occurs to create history.


В словарях лексических юнитов скрываются размеченные примеры:

In [33]:
historic = frame_HistEvent.lexUnit['historic.a']
# the same lexical unit can be fetched by its ID:
# historic = fn.lu(14182)
print (historic)

lexical unit (14182): historic.a

[definition]
  COD: famous or important in history, or potentially so.

[frame] Historic_event(1908)

[POS] A

[status] Finished_Initial

[lexemes] historic/A

[semTypes] 0 semantic types

[URL] https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu14182.xml

[subCorpus] 8 subcorpora
  01-T-Wmoment,victory,opportunity-(1), 03-NP-VP-T-(1),
  04-T-NP-(1), 05-AVP-T-(1), 06-T-AVP-(1), manually-added,
  other-matched-(1), other-unmatched-(1)

[exemplars] 17 sentences across all subcorpora



In [34]:
print (historic.exemplars[0])

exemplar sentence (1454496):
[corpID] 111
[docID] 421
[paragNo] 7518
[sentNo] 1
[aPos] 28944963

[LU] (14182) historic.a in Historic_event

[frame] (1908) Historic_event

[annotationSet] 2 annotation sets

[POS] 27 tags

[POS_tagset] PENN

[GF] 2 relations

[PT] 2 phrases

[text] + [Target] + [FE]

Researchers expected to find six out of ten people could recall 
                                                                
                                                                
 
Lady Thatcher 's historic resignation moment in vivid detail by 
---------------- ******** ------------------
Entity                    Event             
 
retaining a long-lasting ` flashbulb " memory .
 
 
 




## Практические задания

Найдите все фреймы, в число ядерных (Core) элементов которых входит участник с ролью начальной точки перемещения (Source).

Найдите все ядерные элементы фрейма Removing

Найдите 5 примеров употребления лексемы take.v из фрейма Removing (с разметкой)

# Что ещё можно делать с семантическими сетями?

### считать схожесть слов разными способами

подробнее: https://www.nltk.org/howto/wordnet.html#similarity

In [51]:
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

collar = wn.synset('collar.n.01')

In [59]:
print(dog.path_similarity(cat)) # value in [0, 1]
print(dog.lch_similarity(cat)) # not confined to [0, 1]: the value printed for this call exceeds 1
print(dog.wup_similarity(cat)) # value in [0, 1]

0.2
2.0281482472922856
0.8571428571428571


In [60]:
print(dog.path_similarity(collar))
print(dog.lch_similarity(collar)) 
print(dog.wup_similarity(collar))

0.1
1.3350010667323402
0.47058823529411764


### лемматизировать

[оригинальный текст](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)

В NLTK уже есть готовый лемматизатор на основе WordNet'a:

In [37]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [38]:
print(lemmatizer.lemmatize("bats"))
print(lemmatizer.lemmatize("are"))
print(lemmatizer.lemmatize("feet"))

bat
are
foot


In [143]:
sentence = "The striped bats are hanging on their feet for best"
word_list = nltk.word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']


In [144]:
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
print(lemmatized_output)

The striped bat are hanging on their foot for best


Обратите внимание: не все слова лемматизировались так, как можно было ожидать. Почему? Потому что, как мы уже обсуждали, иногда одно и то же слово может иметь несколько лемм в зависимости от значения / контекста, а лемматизатор по умолчанию считает каждое слово существительным. Эту проблему можно решить, передав лемматизатору POS-тег:

In [42]:
print(lemmatizer.lemmatize("stripes", 'v'))  
print(lemmatizer.lemmatize("stripes", 'n'))  

strip
stripe


Для получения частей речи можно использовать NLTK POS-tagger.

**NB**: набор тегов WordNet'a отличается от стандартных POS-тегов nltk. Для преобразования одних в другие можно сделать так:

In [48]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for one word.

    The word is tagged on its own with nltk.pos_tag, and the upper-cased
    first letter of the resulting tag selects the constant:
    "J" -> wn.ADJ, "N" -> wn.NOUN, "V" -> wn.VERB, "R" -> wn.ADV.
    Any other first letter falls back to wn.NOUN.
    """
    # pos_tag returns (token, tag) pairs; only the tag of the single token is needed.
    _, tagger_tag = nltk.pos_tag([word])[0]
    initial = tagger_tag[0].upper()
    if initial == "J":
        return wn.ADJ
    if initial == "V":
        return wn.VERB
    if initial == "R":
        return wn.ADV
    # "N" and every unrecognised initial both resolve to the noun constant.
    return wn.NOUN

In [49]:
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

foot


In [50]:
sentence = "The striped bats are hanging on their feet for best"
# get_wordnet_pos() tags every token on its own (one-word input to the tagger),
# so the POS handed to the lemmatizer does not take sentence context into account.
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']


### Word-sense disambiguation (WSD)

подробнее [WordNet: Word Relations,Senses, and Disambiguation (pdf)](https://web.stanford.edu/~jurafsky/slp3/C.pdf)

* The Lesk algorithm chooses the sense whose dictionary definition shares the most words with the target word’s neighborhood.
* Graph-based algorithms view the thesaurus as a graph and choose the sense that is most central in some way.


Lesk через NLTK: http://www.nltk.org/howto/wsd.html

In [61]:
from nltk.wsd import lesk

In [84]:
sent = ['I', 'keep', 'my', 'coin', 'in', 'a', 'piggy', 'bank', '.']
bank_meaning = lesk(sent, 'bank', 'n')
pp ([bank_meaning.name(), bank_meaning.definition(), bank_meaning.lemma_names(),
       bank_meaning.examples()])

[   'savings_bank.n.02',
    'a container (usually with a slot in the top) for keeping money at home',
    ['savings_bank', 'coin_bank', 'money_box', 'bank'],
    ['the coin bank was empty']]


In [85]:
sent = 'We went to the river and I jumped in the water from the bank .'.split()
bank_meaning = lesk(sent, 'bank')
pp ([bank_meaning.name(), bank_meaning.definition(), bank_meaning.lemma_names(),
       bank_meaning.examples()])

[   'bank.n.07',
    'a slope in the turn of a road or track; the outside is higher than the inside in order to '
    'reduce the effects of centrifugal force',
    ['bank', 'cant', 'camber'],
    []]


In [87]:
sent = 'Did you make a deposit to our bank account ?'.split()
# No pos argument is passed to lesk() here (the first example passed 'n');
# the synset printed for this cell is a verb sense (deposit.v.02).
bank_meaning = lesk(sent, 'bank')
pp ([bank_meaning.name(), bank_meaning.definition(), bank_meaning.lemma_names(),
       bank_meaning.examples()])

[   'deposit.v.02',
    'put into a bank account',
    ['deposit', 'bank'],
    ['She deposits her paycheck every month']]


In [146]:
sent = "How much is the dealer 's gambling bank right now ?".split()
bank_meaning = lesk(sent, 'bank')
pp ([bank_meaning.name(), bank_meaning.definition(), bank_meaning.lemma_names(),
       bank_meaning.examples()])

[   'bank.n.06',
    'the funds held by a gambling house or the dealer in some gambling games',
    ['bank'],
    ['he tried to break the bank at Monte Carlo']]


*Дарья Попова, Дарья Рыжова, ред. Анна Полянская, 2020, НИУ ВШЭ*