In [1]:
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
import os
from collections import Counter
from nltk.corpus import verbnet
from tqdm import tqdm
tqdm.pandas()

# WordNet demo

Choose some words:

In [2]:
# The three example verbs used throughout the WordNet / VerbNet demo cells.
word1, word2, word3 = "turn", "twist", "jump"

Synsets:

In [3]:
# Look up every synset for word1, then only the verb synsets, and show both.
all_pos_synsets = wn.synsets(word1)
verb_only_synsets = wn.synsets(word1, pos=wn.VERB)

print("Synsets for", word1)
print(all_pos_synsets)
print()
print("Synsets for", word1, "filtered just to verb POS")
print(verb_only_synsets)

Synsets for turn
[Synset('bend.n.01'), Synset('turn.n.02'), Synset('turn.n.03'), Synset('turn.n.04'), Synset('turning.n.04'), Synset('turn.n.06'), Synset('twist.n.13'), Synset('go.n.01'), Synset('turn.n.09'), Synset('act.n.04'), Synset('turn.n.11'), Synset('turn.n.12'), Synset('turn.v.01'), Synset('change_state.v.01'), Synset('become.v.02'), Synset('turn.v.04'), Synset('change_by_reversal.v.01'), Synset('turn.v.06'), Synset('turn.v.07'), Synset('turn.v.08'), Synset('turn.v.09'), Synset('turn.v.10'), Synset('turn.v.11'), Synset('plow.v.01'), Synset('turn.v.13'), Synset('turn.v.14'), Synset('twist.v.10'), Synset('turn.v.16'), Synset('turn.v.17'), Synset('turn.v.18'), Synset('turn.v.19'), Synset('turn.v.20'), Synset('flex.v.05'), Synset('turn.v.22'), Synset('turn.v.23'), Synset('call_on.v.01'), Synset('sour.v.01'), Synset('turn.v.26')]

Synsets for turn filtered just to verb POS
[Synset('turn.v.01'), Synset('change_state.v.01'), Synset('become.v.02'), Synset('turn.v.04'), Synset('change_b

Just for now, selecting the first synset

In [4]:
# Take the first verb synset WordNet lists for each of the three example words.
w1_syn, w2_syn, w3_syn = (
    wn.synsets(chosen_word, pos=wn.VERB)[0]
    for chosen_word in (word1, word2, word3)
)

# Verb frames:

Each lemma of a synset has a specific set of verb frames. The number of frames generally seems to be the same for every lemma within a given synset.

In [5]:
# For each chosen synset, list every lemma with its frame ids, the number of
# frames, and the frame strings; a dashed rule closes each synset's listing.
for chosen_synset in (w1_syn, w2_syn, w3_syn):
    for lemma in chosen_synset.lemmas():
        frame_ids = lemma.frame_ids()
        print(lemma, frame_ids, len(frame_ids))
        print(" | ".join(lemma.frame_strings()))
    print("------------------------------")

Lemma('turn.v.01.turn') [1, 2, 4] 3
Something turn | Somebody turn | Something is turning PP
------------------------------
Lemma('writhe.v.01.writhe') [1, 2] 2
Something writhe | Somebody writhe
Lemma('writhe.v.01.wrestle') [1, 2] 2
Something wrestle | Somebody wrestle
Lemma('writhe.v.01.wriggle') [1, 2] 2
Something wriggle | Somebody wriggle
Lemma('writhe.v.01.worm') [1, 2] 2
Something worm | Somebody worm
Lemma('writhe.v.01.squirm') [1, 2] 2
Something squirm | Somebody squirm
Lemma('writhe.v.01.twist') [1, 2] 2
Something twist | Somebody twist
------------------------------
Lemma('jump.v.01.jump') [1, 2, 22] 3
Something jump | Somebody jump | Somebody jump PP
Lemma('jump.v.01.leap') [1, 2, 22] 3
Something leap | Somebody leap | Somebody leap PP
Lemma('jump.v.01.bound') [1, 2, 22] 3
Something bound | Somebody bound | Somebody bound PP
Lemma('jump.v.01.spring') [1, 2, 22] 3
Something spring | Somebody spring | Somebody spring PP
------------------------------


# Multiple similarity metrics: 

Path similarity

Leacock-Chodorow Similarity

Wu-Palmer similarity

Resnik similarity

Jiang-Conrath similarity

Lin Similarity

In [6]:
# Score the word1 / word2 synset pair with each WordNet similarity metric.
# The last three metrics additionally take the Brown information-content table.
similarity_scores = {
    "Path similarity:": w1_syn.path_similarity(w2_syn),
    "Lch similarity:": w1_syn.lch_similarity(w2_syn),
    "Wup similarity:": w1_syn.wup_similarity(w2_syn),
    "Res similarity:": w1_syn.res_similarity(w2_syn, brown_ic),
    "JCN similarity:": w1_syn.jcn_similarity(w2_syn, brown_ic),
    "Lin similarity:": w1_syn.lin_similarity(w2_syn, brown_ic),
}

for metric_label, score in similarity_scores.items():
    print(metric_label, score)

Path similarity: 0.3333333333333333
Lch similarity: 2.159484249353372
Wup similarity: 0.3333333333333333
Res similarity: 4.692755582239643
JCN similarity: 0.12831564565996478
Lin similarity: 0.546342873109817


# VerbNet:

http://verbs.colorado.edu/~kipper/Papers/dissertation.pdf

http://verbs.colorado.edu/verb-index/VerbNet_Guidelines.pdf

In [37]:
# Look up the VerbNet class ids for "give", then inspect the first one.
# `a` keeps its name because a later cell reads `a[0]`.
a = verbnet.classids(lemma="give")
print(a)

first_classid = a[0]
print("lemmas", verbnet.lemmas(first_classid))
print("vnclass", verbnet.vnclass(first_classid))
print("Sub-classes", verbnet.subclasses(verbnet.vnclass(first_classid)))

['give-13.1-1']
lemmas ['give', 'hock', 'rent', 'sell', 'lease', 'pawn']
vnclass <Element 'VNSUBCLASS' at 0x0000019979B2C4F0>
Sub-classes []


In [70]:
def get_classid(verb):
    """Return the VerbNet class ids found for ``verb``.

    Thin wrapper around ``verbnet.classids(lemma=verb)``; the result is
    returned unchanged.
    """
    return verbnet.classids(lemma=verb)
    
def get_lemmas(verb):
    """Return the lemmas of the first VerbNet class found for ``verb``.

    Only the first class id returned by ``get_classid`` is consulted. When no
    class id is found, the sentinel string ``"-1"`` is returned (a string, not
    a list).
    """
    class_ids = get_classid(verb)
    if len(class_ids) == 0:
        return "-1"
    return verbnet.lemmas(class_ids[0])

def get_frames(verb):
    """Return the primary frame descriptions of the first VerbNet class for ``verb``.

    Only the first class id returned by ``get_classid`` is consulted, and
    duplicates are kept in the order ``verbnet.frames`` yields them. When no
    class id is found, the sentinel list ``["-1"]`` is returned.
    """
    class_ids = get_classid(verb)
    if len(class_ids) == 0:
        return ["-1"]
    return [
        frame["description"]["primary"]
        for frame in verbnet.frames(class_ids[0])
    ]

def get_unique_frames(classid):
    """Return the distinct primary frame descriptions of a VerbNet class.

    Parameters
    ----------
    classid : sequence of str
        Class ids as returned by ``get_classid``; only the first entry is used.

    Returns
    -------
    numpy.ndarray
        Distinct ``item["description"]["primary"]`` values in order of first
        appearance. An empty array is returned when ``classid`` is empty
        (previously this raised ``IndexError`` on ``classid[0]``).
    """
    if len(classid) == 0:
        return pd.unique(pd.Series([], dtype=object))

    primary_descriptions = [
        frame["description"]["primary"] for frame in verbnet.frames(classid[0])
    ]
    # Wrap the list in a Series: calling pd.unique on a plain Python list is
    # deprecated in recent pandas releases.
    return pd.unique(pd.Series(primary_descriptions, dtype=object))

In [59]:
# Walk through the helpers for a single verb.
# get_lemmas / get_frames take the verb itself (they look up the class id
# internally), so pass the verb rather than the class-id list from step1.
example_verb = "catch"

step1 = get_classid(example_verb)
print(step1)

step2a = get_lemmas(example_verb)
print(step2a)

step2b = get_frames(example_verb)
print(step2b)


['get-13.5.1']
['attain', 'book', 'buy', 'call', 'catch', 'charter', 'choose', 'conserve', 'find', 'gather', 'hire', 'lease', 'order', 'phone', 'pick', 'pluck', 'procure', 'pull', 'reach', 'rent', 'reserve', 'secure', 'shoot', 'slaughter', 'vote', 'win']
['Basic Transitive', 'NP-PP', 'NP-PP', 'Benefactive Alternation', 'NP-PP', 'NP', 'NP-PP-PP']


In [None]:
# word: "give" -- print every field of every frame of its first VerbNet class.
# `a` is the class-id list for "give" computed in an earlier cell.
frames = verbnet.frames(verbnet.vnclass(a[0]))
for frame in frames:
    for field_name, field_value in frame.items():
        print(field_name, ":", field_value)
        print()
    print()
    print("--------------------------")

In [9]:
# Show which VerbNet classes word1 belongs to (rendered as the cell's output).
word1_classids = verbnet.classids(lemma=word1)
print(word1, ":")
word1_classids

turn :


['convert-26.6.2',
 'crane-40.3.2',
 'hurt-40.8.3-1-1',
 'meander-47.7',
 'roll-51.3.1',
 'turn-26.6.1-1']

*Verbs that participate in this alternation include scatter, pump, hang, drizzle, and cram, all of which are verbs that semantically involve a type of placement or covering. Because of their shared syntactic behaviors, these verbs are grouped together in the Spray-9.7 class.*

Share syntactic behaviors -> grouped together in classes -> check shared classes?

In [10]:
# Print the VerbNet class ids of each example verb so shared classes can be compared.
for example_word in (word1, word2, word3):
    print(example_word, ":", verbnet.classids(lemma=example_word))

turn : ['convert-26.6.2', 'crane-40.3.2', 'hurt-40.8.3-1-1', 'meander-47.7', 'roll-51.3.1', 'turn-26.6.1-1']
twist : ['coil-9.6-1', 'hurt-40.8.3-1-1', 'knead-26.5', 'meander-47.7', 'roll-51.3.1']
jump : ['calibratable_cos-45.6-1', 'run-51.3.2']


# SimVerb:

In [16]:
# SimVerb-3500 is a headerless, tab-separated file; name its five columns here.
simverb_column_names = ["word1", "word2", "pos", "sv_score", "relation"]

simverb = pd.read_csv("../data/SimVerb-3500.txt", sep="\t", header=None)
simverb.columns = simverb_column_names

print(simverb)

            word1     word2 pos  sv_score        relation
0            take    remove   V      6.81        SYNONYMS
1            walk     trail   V      4.81      COHYPONYMS
2            feed    starve   V      1.49        ANTONYMS
3           shine    polish   V      7.80        SYNONYMS
4       calculate       add   V      5.98  HYPER/HYPONYMS
...           ...       ...  ..       ...             ...
3495       impose     cheat   V      1.16            NONE
3496        rebel   protest   V      7.64  HYPER/HYPONYMS
3497  collaborate  conspire   V      4.23            NONE
3498     conspire   protest   V      1.83            NONE
3499      protest   release   V      1.16            NONE

[3500 rows x 5 columns]


## Background: 
- Meaning as inherent in word relations, meaning as derived from statistical regularities -> distributional semantics theory of word meaning


- Verbs and nouns are conceptually different, which may be reflected in age of acquisition for English speaking babies 
- Noun bias (see Ch11 from https://langcog.github.io/wordbank-book/categories-syntactic.html)
- Nouns seem to be more "indexical" in nature, with a given label (usually) mapping directly to an object or a more concrete aspect, while verbs are more relational (Gentner, 1982)
- Other considerations: syntactic position, morphology


- This difference is also reflected in distributional semantics models
- Distributional semantics models as harnessing the co-occurrence statistics to capture word meaning
- Variety of models perform well on different tasks, however, recently developed gold standards (simlex, simverb) have shown that their performance greatly differs based on part of speech


- What differentiates these two POS in their representation? 

## Research question:
- What is the relationship between syntax and semantics for verb understanding? 
- How do syntax and semantics interact regarding verb representation?
- How do these models reflect our own linguistic processing of verbs?

General question: See above ^

Research question: Does syntactic and semantic information impact people's performance on human judgements of similarity?

-> show specifically these different types of information --> control condition (has syntax but not informative about their meaning representation)

-> inter-annotator agreement

Corpus data -> frequencies of ()

Google ngrams -> syntactic version (syntgram), checking syntactic frame (counts)

## Implementation / approach:
- Use WordNet and VerbNet as *what* -> syntactic reference point?
- POS disambiguation: number of different potential parts of speech for a given word -> entropy of potential POS?
- Sense disambiguation: 1) number of different senses, 2) average similarity to other senses
- *VerbNet* something with subcat bias?

### Other thoughts:
- Lemmas and amount of verb frames?
- Entailment environment

#### Out of the scope of this study:
- Comparison between languages with and without noun bias

In [17]:
# Cases when word not in WN?

def potential_pos(word):
    """Count how many WordNet synsets ``word`` has under each part of speech.

    Returns a ``Counter`` keyed by the value of ``sense.pos()`` for every
    synset that ``wn.synsets(word)`` yields.
    """
    return Counter(sense.pos() for sense in wn.synsets(word))
        
    
def num_v_senses(word):
    """Return the number of verb synsets WordNet lists for ``word``."""
    verb_synsets = wn.synsets(word, pos=wn.VERB)
    return len(verb_synsets)

In [18]:
# Add the WordNet-derived columns (POS counts and number of verb senses) for
# both words of every SimVerb pair: w1_* from "word1", then w2_* from "word2".
for column_prefix, word_column in (("w1", "word1"), ("w2", "word2")):
    simverb[f"{column_prefix}_pos"] = simverb[word_column].progress_apply(potential_pos)
    simverb[f"{column_prefix}_num_v_senses"] = simverb[word_column].progress_apply(num_v_senses)

100%|██████████| 3500/3500 [00:01<00:00, 1848.36it/s]
100%|██████████| 3500/3500 [00:00<00:00, 16381.31it/s]
100%|██████████| 3500/3500 [00:00<00:00, 11480.94it/s]
100%|██████████| 3500/3500 [00:00<00:00, 16045.28it/s]


In [71]:
# Add the VerbNet-derived columns: class lemmas for both words first, then
# frame descriptions for both words (same column order as before).
for column_suffix, verbnet_lookup in (("lemmas", get_lemmas), ("frames", get_frames)):
    for column_prefix, word_column in (("w1", "word1"), ("w2", "word2")):
        simverb[f"{column_prefix}_{column_suffix}"] = (
            simverb[word_column].progress_apply(verbnet_lookup)
        )

100%|██████████| 3500/3500 [00:04<00:00, 711.88it/s]
100%|██████████| 3500/3500 [00:05<00:00, 635.96it/s]
100%|██████████| 3500/3500 [00:07<00:00, 489.81it/s]
100%|██████████| 3500/3500 [00:07<00:00, 487.52it/s]


In [73]:
# Persist the enriched SimVerb table.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (no-op when it is already there).
output_csv_path = "../data_output/simverb_processed.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
simverb.to_csv(output_csv_path, index=False)

In [30]:
# Stack both word columns, then keep each verb once (order of first appearance).
all_pair_words = pd.concat([simverb["word1"], simverb["word2"]])
unique_verbs = pd.unique(all_pair_words)

unique_verbs


array(['take', 'walk', 'feed', 'shine', 'calculate', 'cheat', 'pardon',
       'smell', 'plow', 'believe', 'tap', 'condemn', 'crunch', 'erode',
       'try', 'tear', 'replace', 'respond', 'rationalize', 'laugh',
       'attack', 'descend', 'decompose', 'miss', 'yell', 'feel',
       'connect', 'decay', 'protect', 'bake', 'remove', 'describe',
       'drive', 'build', 'cover', 'dislike', 'forget', 'comprehend',
       'reflect', 'defeat', 'know', 'remind', 'go', 'produce',
       'recommend', 'shake', 'accomplish', 'ask', 'gather', 'bend',
       'jump', 'agree', 'give', 'kill', 'carry', 'sing', 'dismay', 'shun',
       'hurt', 'prefer', 'cook', 'reprimand', 'break', 'hit', 'wrap',
       'upset', 'show', 'accuse', 'hate', 'pounce', 'respect', 'sell',
       'put', 'charge', 'melt', 'flap', 'die', 'pull', 'hitch', 'live',
       'sow', 'destroy', 'obtain', 'buy', 'deposit', 'rise', 'pray',
       'rescue', 'succeed', 'look', 'delay', 'color', 'get', 'drink',
       'treat', 'dump', 'lea

# Todo:
Checking inter-annotator agreement, and seeing how added context impacts it.

Is there a relationship between that and the WordNet and VerbNet measures?

Syntgram / triarcs -> check Rachel's grant (3.1.1)
- Syntactic frames of the different words
- Overlap measure -> shared frames, bias as majority frame -> more semantically related to other verbs of this type with same bias
- Goldberg and Orwant 2013 (done)
- Lenci DSM paper (done)