# Retrofitting to the Structure of WordNet

In [18]:
__author__ = 'Ben Lengerich'

In [19]:
import numpy as np
import scipy
from sklearn.decomposition import IncrementalPCA
from nltk.corpus import wordnet as wn
import time

import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '../../'))
from retrofit_identity import retrofit_identity
from retrofit_linear   import retrofit_linear
from retrofit_neural   import retrofit_neural
from utils import *

## Load Word2Vec

In [20]:
from gensim.models import KeyedVectors
# Relative path to the pre-trained word2vec vector file that is loaded below.
word2vec_filename='../../GoogleNews-vectors-negative300.bin'
# binary=True asks gensim to read the file as a binary word2vec dump.
# `model` is later indexed by lemma name to fetch that word's vector.
model = KeyedVectors.load_word2vec_format(word2vec_filename, binary=True)

## Initialize WordNet Representations as Pre-Trained Word2Vec

In [21]:
# Build one embedding row for every WordNet lemma that Word2Vec has a vector for.
#   lemmas          -- lemma objects, in row order
#   row_ids         -- str(lemma)    -> row index
#   row_ids_by_name -- lemma.name()  -> row index (a later lemma with the same
#                      name overwrites an earlier one)
t = time.time()
lemmas, embeddings, row_names = [], [], []
row_ids, row_ids_by_name = {}, {}
n_ignored = 0
for synset in wn.all_synsets():
    for lemma in synset.lemmas():
        lemma_name = lemma.name()
        # NOTE(review): row_ids is keyed by str(lemma), not by the bare lemma
        # name, so this membership test presumably never fires. Confirm
        # whether de-duplication by name (row_ids_by_name) was intended.
        if lemma_name in row_ids:
            continue
        try:
            vector = model[lemma_name]
        except KeyError:  # word present in WordNet, but not Word2Vec
            n_ignored += 1
            continue
        lemma_key = str(lemma)
        row_index = len(embeddings)  # index the new row is about to occupy
        embeddings.append(vector)
        lemmas.append(lemma)
        row_ids[lemma_key] = row_index
        row_ids_by_name[lemma_name] = row_index
        row_names.append(lemma_key)

X = np.array(embeddings)
row_names = np.array(row_names)

# Persist the row bookkeeping; row_ids_by_name is reloaded in the Evaluation section.
save_obj(row_names, 'row_names')
save_obj(row_ids, 'row_ids')
save_obj(row_ids_by_name, 'row_ids_by_name')

assert len(lemmas) == len(X)
found_message = "Found {} WordNet lemmas in Word2Vec, ignored {} WordNet lemmas."
print(found_message.format(len(X), n_ignored))
print("Took {:.2f} seconds.".format(time.time() - t))

Found 115635 WordNet lemmas in Word2Vec, ignored 91343 WordNet lemmas.
Took 2.27 seconds.


In [22]:
# Reduce Embedding Dimensionality
# NOTE: X is overwritten with its reduced form, so this cell is not idempotent:
# re-running it without rebuilding X first applies PCA to the already-reduced matrix.
t = time.time()
n_components=25
pca = IncrementalPCA(n_components=n_components)
X = pca.fit_transform(X)
# Cache the reduced matrix under a name that encodes the component count;
# the Retrofit and Evaluation sections reload it with the same name pattern.
save_obj(X, "X_reduced_{:d}".format(n_components))
print("Reduced to {:d} components.\nTook {:.2f} seconds.".format(n_components, time.time() - t))

Reduced to 25 components.
Took 8.34 seconds.


## Extract Edges from WordNet

In [23]:
from utils import *

t = time.time()  # start of the wall-clock interval printed at the end of this cell
def make_neighbors(get_neighbors, lemmas, row_ids):
    """Build an adjacency mapping over row indices for one relation.

    Args:
        get_neighbors: callable taking a lemma and returning an iterable of
            related lemmas.
        lemmas: iterable of lemmas; each must have ``str(lemma)`` as a key of
            ``row_ids``.
        row_ids: dict mapping ``str(lemma)`` to a row index. Must have exactly
            as many entries as there are lemmas.

    Returns:
        dict mapping each lemma's row index to a list of the distinct row
        indices of its neighbours. Neighbours whose string form is not a key
        of ``row_ids`` are skipped.
    """
    assert len(row_ids) == len(lemmas)
    adjacency = {}
    for lemma in lemmas:
        source_row = row_ids[str(lemma)]
        target_rows = set()  # a set, so repeated neighbours collapse to one edge
        for candidate in get_neighbors(lemma):
            candidate_key = str(candidate)
            if candidate_key in row_ids:
                target_rows.add(row_ids[candidate_key])
        adjacency[source_row] = list(target_rows)
    return adjacency

# Neighbour accessors, one per WordNet relation. Each takes a lemma and returns
# the related lemmas; they are passed to make_neighbors as `get_neighbors`.

def hypernyms(x):
    """Lemmas of every hypernym synset of x's synset."""
    related = []
    for parent_synset in x.synset().hypernyms():
        related.extend(parent_synset.lemmas())
    return related


def hyponyms(x):
    """Lemmas of every hyponym synset of x's synset."""
    related = []
    for child_synset in x.synset().hyponyms():
        related.extend(child_synset.lemmas())
    return related


def antonyms(x):
    """Result of x.antonyms()."""
    return x.antonyms()


def derivation(x):
    """Result of x.derivationally_related_forms()."""
    return x.derivationally_related_forms()


def also_sees(x):
    """Result of x.also_sees()."""
    return x.also_sees()


def verb_groups(x):
    """Result of x.verb_groups()."""
    return x.verb_groups()


def pertainyms(x):
    """Result of x.pertainyms()."""
    return x.pertainyms()


def topic_domains(x):
    """Result of x.topic_domains()."""
    return x.topic_domains()


def usage_domains(x):
    """Result of x.usage_domains()."""
    return x.usage_domains()


def region_domains(x):
    """Result of x.region_domains()."""
    return x.region_domains()

# Adjacency (row index -> neighbour row indices) for each relation, keyed by a
# display name for that relation.
edges = {
    relation: make_neighbors(neighbor_fn, lemmas, row_ids)
    for relation, neighbor_fn in (
        ('Hypernyms', hypernyms),
        ('Hyponyms', hyponyms),
        ('Antonyms', antonyms),
        ('Derivationally Related Forms', derivation),
        ('Usage Domains', usage_domains),
        ('Also Sees', also_sees),
        ('Verb Groups', verb_groups),
        ('Pertainyms', pertainyms),
        ('Topic Domains', topic_domains),
        ('Region Domains', region_domains),
    )
}

# Keep only the relations whose total edge count is strictly above the threshold.
min_threshold = 15
edges_parsed = {
    relation: adjacency
    for relation, adjacency in edges.items()
    if sum(len(targets) for targets in adjacency.values()) > min_threshold
}

bad = 0  # NOTE(review): not referenced anywhere else in the visible notebook
def make_out_r(in_edges):
    """Reverse an adjacency mapping.

    Args:
        in_edges: dict mapping a node ``i`` to an iterable of nodes ``j``.

    Returns:
        dict in which each ``j`` maps to the list of every ``i`` that listed it,
        in iteration order. Every key of ``in_edges`` is also present in the
        result, with an empty list if no node listed it.
    """
    reversed_edges = {}
    for source, targets in in_edges.items():
        for target in targets:
            reversed_edges.setdefault(target, []).append(source)
        # Make sure the source itself has an entry even when nothing points at it.
        reversed_edges.setdefault(source, [])
    return reversed_edges
# Reverse the adjacency of every relation that passed the edge-count threshold.
out_edges = {r: make_out_r(e) for r, e in edges_parsed.items()}

# Persist both directions; the Retrofit section reloads them by these names.
save_obj(edges_parsed, "in_edges")
save_obj(out_edges, "out_edges")
print_edge_counts(edges_parsed)
# `t` was set at the top of this cell, so this covers the whole edge extraction.
print("Took {:.2f} seconds.".format(time.time() - t))

5573 Edges of Type: Pertainyms
136235 Edges of Type: Hyponyms
69 Edges of Type: Usage Domains
60250 Edges of Type: Derivationally Related Forms
136235 Edges of Type: Hypernyms
5922 Edges of Type: Antonyms
Took 6.28 seconds.


## Retrofit

In [24]:
# Reload the retrofitting inputs from the objects saved by the earlier cells.
# n_components must match the value used when the reduced matrix was saved,
# because it is part of the saved object's name.
n_components=25
X = load_obj("X_reduced_{:d}".format(n_components))
in_edges = load_obj("in_edges")
out_edges = load_obj("out_edges")

In [25]:
# Retrofit with Identity Relation (Faruqui et al)
# Only in_edges is passed here (out_edges is not used by this call).
# alpha is given as a function that returns 1 for every argument i; how
# retrofit_identity uses it is defined in retrofit_identity.py, not shown here.
t = time.time()
retrofitted_identity = retrofit_identity(
    X, in_edges, n_iter=20, alpha=lambda i: 1, verbose=True)
print("Baseline retrofitting took {:.2f} seconds.".format(time.time() - t))
# Saved so that the Evaluation section can reload it by name.
save_obj(retrofitted_identity, 'retrofitted_identity')

Converged at iteration 6
Baseline retrofitting took 8.21 seconds.


In [26]:
# Retrofit with Linear Relation
# A and B are additional return values of retrofit_linear; presumably the
# learned relation parameters -- confirm against retrofit_linear.py.
t = time.time()
retrofitted_linear, A, B = retrofit_linear(X, in_edges, out_edges, n_iter=50,
                                           alpha=lambda i: 1, orthogonal=True,
                                           lam=1e-3, lr=0.1, lr_decay=0.99, verbose=True)
print("Linear retrofitting took {:.2f} seconds.".format(time.time() - t))
# Drop singleton dimensions so the result can be shape-compared with the
# identity-retrofitted matrix below.
retrofitted_linear = np.squeeze(retrofitted_linear)
# Requires retrofitted_identity from the identity-retrofitting cell to be in memory.
assert(retrofitted_identity.shape == retrofitted_linear.shape)
save_obj(retrofitted_linear, 'retrofitted_linear')

Iteration 1 of 50	Changes: 0.554	Loss: 134913.381
Iteration 2 of 50	Changes: 0.517	Loss: 132132.389
Iteration 3 of 50	Changes: 0.482	Loss: 130346.684
Iteration 4 of 50	Changes: 0.448	Loss: 129204.179
Iteration 5 of 50	Changes: 0.414	Loss: 128474.393
Iteration 6 of 50	Changes: 0.380	Loss: 128007.934
Iteration 7 of 50	Changes: 0.346	Loss: 127708.801
Iteration 8 of 50	Changes: 0.314	Loss: 127515.815
Iteration 9 of 50	Changes: 0.284	Loss: 127390.308
Iteration 10 of 50	Changes: 0.256	Loss: 127308.038
Iteration 11 of 50	Changes: 0.230	Loss: 127253.922
Iteration 12 of 50	Changes: 0.205	Loss: 127218.629
Iteration 13 of 50	Changes: 0.183	Loss: 127196.397
Iteration 14 of 50	Changes: 0.163	Loss: 127183.643
Iteration 15 of 50	Changes: 0.145	Loss: 127178.095
Loss reached local minimum at iteration 15
Linear retrofitting took 1706.63 seconds.


In [37]:
# Neural Retrofitting
# NOTE: `A` is rebound here, replacing the `A` returned by retrofit_linear in
# the previous cell. The meaning of A, u and b is defined in retrofit_neural.py
# and is not visible in this notebook.
t = time.time()
retrofitted_neural, A, u, b = retrofit_neural(
    X, in_edges, out_edges, k=10, alpha=lambda i: 1,
    n_iter=2000, lam=1e-5, lr=0.001, tol=1e-5, lr_decay=0.99, batch_size=256, patience=100, verbose=5)
print("Neural retrofitting took {:.2f} seconds.".format(time.time() - t))
# Saved so that the Evaluation section can reload it by name.
save_obj(retrofitted_neural, 'retrofitted_neural')

Iteration 5 of 2000	Changes: 1.26373	Loss: 1581.857	Patience: 99
Iteration 10 of 2000	Changes: 0.23948	Loss: 1174.324	Patience: 98
Iteration 15 of 2000	Changes: 0.24093	Loss: 999.598	Patience: 96
Iteration 20 of 2000	Changes: 0.11356	Loss: 724.107	Patience: 93
Iteration 25 of 2000	Changes: 0.13395	Loss: 712.741	Patience: 91
Iteration 30 of 2000	Changes: 0.06782	Loss: 513.152	Patience: 89
Iteration 35 of 2000	Changes: 0.08978	Loss: 494.278	Patience: 88
Iteration 40 of 2000	Changes: 0.09615	Loss: 439.267	Patience: 85
Iteration 45 of 2000	Changes: 0.07010	Loss: 445.482	Patience: 82
Iteration 50 of 2000	Changes: 0.05834	Loss: 427.946	Patience: 79
Iteration 55 of 2000	Changes: 0.05734	Loss: 339.042	Patience: 78
Iteration 60 of 2000	Changes: 0.10240	Loss: 267.613	Patience: 75
Iteration 65 of 2000	Changes: 0.04350	Loss: 253.538	Patience: 73
Iteration 70 of 2000	Changes: 0.08645	Loss: 364.526	Patience: 70
Iteration 75 of 2000	Changes: 0.05637	Loss: 348.364	Patience: 67
Iteration 80 of 2000	Cha

## Evaluation
The code for evaluating word similarity and syntactic relations is adapted from Chris Potts's [CS224u course notes](http://nbviewer.jupyter.org/github/cgpotts/cs224u/blob/master/vsm.ipynb#In-class-bake-off:-Word-similarity).

In [38]:
from evaluate import *
# Reload every embedding matrix from its saved object so the evaluation works
# from what was persisted by the earlier cells.
n_components = 25
X = load_obj("X_reduced_{:d}".format(n_components))
# Maps lemma name -> row index; shared by all four embedding matrices below.
row_ids_by_name = load_obj('row_ids_by_name')
retrofitted_identity = load_obj('retrofitted_identity')
retrofitted_linear = load_obj('retrofitted_linear')
retrofitted_neural = load_obj('retrofitted_neural')

# Run the same word-similarity evaluation on each matrix, separated by a rule.
# Baseline: the PCA-reduced embeddings without retrofitting.
print("Distributional Embeddings")
full_word_similarity_evaluation(X, row_ids_by_name)
print("="*80)
print("Distributional Embeddings, Identity Retrofitting")
full_word_similarity_evaluation(retrofitted_identity, row_ids_by_name)
print("="*80)
print("Distributional Embeddings, Linear Retrofitting")
full_word_similarity_evaluation(retrofitted_linear, row_ids_by_name)
print("="*80)
print("Distributional Embeddings, Neural Retrofitting")
full_word_similarity_evaluation(retrofitted_neural, row_ids_by_name)

Distributional Embeddings
----------------------------------------
wordsim353_reader
Evaluation vocabulary size: 429
Spearman r: 0.512
----------------------------------------
mturk771_reader
Evaluation vocabulary size: 1109
Spearman r: 0.538
----------------------------------------
mturk287_reader
Evaluation vocabulary size: 301
Spearman r: 0.671
Distributional Embeddings, Identity Retrofitting
----------------------------------------
wordsim353_reader
Evaluation vocabulary size: 429
Spearman r: 0.512
----------------------------------------
mturk771_reader
Evaluation vocabulary size: 1109
Spearman r: 0.532
----------------------------------------
mturk287_reader
Evaluation vocabulary size: 301
Spearman r: 0.664
Distributional Embeddings, Linear Retrofitting
----------------------------------------
wordsim353_reader
Evaluation vocabulary size: 429
Spearman r: 0.542
----------------------------------------
mturk771_reader
Evaluation vocabulary size: 1109
Spearman r: 0.562
-------------

In [39]:
def load_synrel(filename):
    """Read analogy rows from a text file.

    Each line is split on single space characters and every field is stripped
    of surrounding whitespace. Only lines that yield exactly four fields are
    kept; all other lines are dropped.

    Args:
        filename: path of the text file to read.

    Returns:
        list of 4-element lists of strings, in file order.
    """
    with open(filename, 'r') as handle:
        rows = ([field.strip() for field in raw_line.split(' ')]
                for raw_line in handle)
        # Materialise inside the `with` block, while the file is still open.
        return [row for row in rows if len(row) == 4]

def cos(a, b):
    """Cosine similarity: dot product of a and b divided by the product of their 2-norms."""
    numerator = a.T.dot(b)
    denominator = np.linalg.norm(a, ord=2) * np.linalg.norm(b, ord=2)
    return numerator / denominator

def syn_rel_evaluation(X, row_ids, filename='Evaluation/synreldata/analogies.txt'):
    """Print the mean cosine similarity over a file of four-word analogies.

    For each analogy row ``(a, b, c, d)`` the vector ``X[b] - X[a] + X[c]`` is
    compared with ``X[d]`` using ``cos``. Rows containing a word that is not a
    key of ``row_ids`` are skipped.

    Args:
        X: embedding matrix indexed by row id.
        row_ids: dict mapping a word to its row index in ``X``.
        filename: path of the analogy file read by ``load_synrel``. Defaults to
            the path that was previously hard-coded, so existing calls behave
            as before.

    Returns:
        None. Prints the number of analogies tested and their mean cosine
        similarity; the mean is reported as nan when no analogy could be tested.
    """
    data = load_synrel(filename)
    sims = []
    for analogy in data:
        try:
            predicted = (X[row_ids[analogy[1]]] - X[row_ids[analogy[0]]]
                         + X[row_ids[analogy[2]]])
            target = X[row_ids[analogy[3]]]
        except KeyError:
            # At least one of the four words has no embedding row.
            continue
        sims.append(cos(predicted, target))
    # np.mean of an empty sequence emits a RuntimeWarning before returning nan;
    # produce the same nan directly for that case.
    mean_similarity = np.mean(np.array(sims)) if sims else float('nan')
    print("Tested {:d} Analogies, Mean Cosine Similarity: {:.5f}".format(len(sims), mean_similarity))

# Run the analogy evaluation on each embedding matrix, with a rule between runs.
syn_rel_runs = (
    ("Testing Syntatic Relations for Word2Vec Distributional Embeddings", X),
    ("Testing Syntatic Relations for Word2Vec, Identity Retrofitting", retrofitted_identity),
    ("Testing Syntatic Relations for Word2Vec, Linear Retrofitting", retrofitted_linear),
    ("Testing Syntatic Relations for Word2Vec, Neural Retrofitting", retrofitted_neural),
)
for run_index, (banner, embedding_matrix) in enumerate(syn_rel_runs):
    if run_index > 0:
        print("="*60)
    print(banner)
    syn_rel_evaluation(embedding_matrix, row_ids_by_name)

Testing Syntatic Relations for Word2Vec Distributional Embeddings
Tested 12148 Analogies, Mean Cosine Similarity: 0.77165
Testing Syntatic Relations for Word2Vec, Identity Retrofitting
Tested 12148 Analogies, Mean Cosine Similarity: 0.77419
Testing Syntatic Relations for Word2Vec, Linear Retrofitting
Tested 12148 Analogies, Mean Cosine Similarity: 0.79329
Testing Syntatic Relations for Word2Vec, Neural Retrofitting
Tested 12148 Analogies, Mean Cosine Similarity: 0.77162
