# Retrofitting to the Structure of WordNet

In [1]:
__author__ = 'Ben Lengerich'

In [4]:
import numpy as np
import scipy
from sklearn.decomposition import IncrementalPCA
from nltk.corpus import wordnet as wn
import time

import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '../../'))
from retrofit_identity import retrofit_identity
from retrofit_linear   import retrofit_linear
from utils import *

## Load Word2Vec

In [5]:
from gensim.models import KeyedVectors
# Path to the GoogleNews word2vec binary, relative to this notebook.
word2vec_filename='../../GoogleNews-vectors-negative300.bin'
# binary=True: the file is parsed in word2vec's binary format, not the text format.
model = KeyedVectors.load_word2vec_format(word2vec_filename, binary=True)

Using Theano backend.


## Initialize WordNet Representations as Pre-Trained Word2Vec

In [22]:
# Build one embedding row for every WordNet lemma whose name has a Word2Vec
# vector, together with the lookup tables that later cells use.
t = time.time()
# Lemma objects, in row order.
lemmas = []
# Word2Vec vectors, in row order (parallel to `lemmas`).
embeddings = []
# str(lemma) for each row, in row order.
row_names = []
# str(lemma) -> row index.
row_ids = {}
# lemma.name() -> row index; a later lemma with the same name overwrites
# the earlier entry, so each name ends up pointing at its last row.
row_ids_by_name = {}
# Number of lemmas whose name was missing from the Word2Vec model.
n_ignored = 0
for synset in wn.all_synsets():
    for lemma in synset.lemmas():
        # NOTE(review): row_ids is keyed by str(lemma) (see the assignment
        # below), not by lemma.name(), so this membership test looks like it
        # can only match if a lemma name happens to equal some str(lemma).
        # Confirm whether de-duplication by name (row_ids_by_name) was intended.
        if lemma.name() in row_ids:
            continue
        try:
            # The model lookup is the first statement in the try body, so a
            # missing word raises KeyError before any of the lists or
            # dictionaries are touched.
            embeddings.append(model[lemma.name()])
            lemmas.append(lemma)
            row_ids[str(lemma)] = len(embeddings) - 1
            row_ids_by_name[lemma.name()] = len(embeddings) - 1
            row_names.append(str(lemma))
        except KeyError: # word present in WordNet, but not Word2Vec
            n_ignored += 1
            pass

# Stack the collected vectors into a 2-d array with one row per kept lemma.
X = np.array(embeddings)
row_names = np.array(row_names)
save_obj(row_names, 'row_names')
save_obj(row_ids,   'row_ids')
# Rows and lemma objects must stay aligned one-to-one.
assert(len(lemmas) == len(X))
print("Found {} WordNet lemmas in Word2Vec, ignored {} WordNet lemmas.".format(
    len(X), n_ignored))
print("Took {:.2f} seconds.".format(time.time() - t))

Found 115635 WordNet lemmas in Word2Vec, ignored 91343 WordNet lemmas.
Took 2.40 seconds.


In [23]:
# Reduce Embedding Dimensionality
# Fits an IncrementalPCA with n_components components on the embedding matrix
# and replaces X with the transformed matrix.
t = time.time()
n_components=25
pca = IncrementalPCA(n_components=n_components)
# NOTE: X is overwritten here, so re-running only this cell feeds the
# already-reduced matrix back into PCA instead of the original embeddings.
X = pca.fit_transform(X)
# The saved name embeds the component count; later cells pass the same
# formatted name to load_obj.
save_obj(X, "X_reduced_{:d}".format(n_components))
print("Reduced to {:d} components.\nTook {:.2f} seconds.".format(n_components, time.time() - t))

Reduced to 25 components.
Took 8.46 seconds.


## Extract Edges from WordNet

In [24]:
from utils import *

t = time.time()
def make_neighbors(get_neighbors, lemmas, row_ids):
    """Build the adjacency map of one relation, expressed in row indices.

    Parameters
    ----------
    get_neighbors : callable
        Maps a lemma to an iterable of related lemmas.

    lemmas : sequence
        The lemmas that own rows; `str(lemma)` must be a key of `row_ids`
        for every one of them.

    row_ids : dict
        Maps `str(lemma)` to a row index. Must have exactly as many
        entries as `lemmas`.

    Returns
    -------
    dict
        Row index -> list of distinct neighbor row indices. Related
        lemmas that have no entry in `row_ids` are left out.

    """
    assert len(row_ids) == len(lemmas)
    adjacency = {}
    for lemma in lemmas:
        # Gather targets in a set so a neighbor reached twice is kept once.
        targets = set()
        for related in get_neighbors(lemma):
            key = str(related)
            if key in row_ids:
                targets.add(row_ids[key])
        adjacency[row_ids[str(lemma)]] = list(targets)
    return adjacency

# Relation accessors: each takes a WordNet lemma and returns the lemmas it is
# related to under one relation. They are plain functions (rather than lambdas
# bound to names) so they carry a real __name__ and a docstring.

def hypernyms(x):
    """Lemmas of every hypernym synset of the synset that `x` belongs to."""
    return [l for s in x.synset().hypernyms() for l in s.lemmas()]

def hyponyms(x):
    """Lemmas of every hyponym synset of the synset that `x` belongs to."""
    return [l for s in x.synset().hyponyms() for l in s.lemmas()]

def antonyms(x):
    """Whatever `x.antonyms()` returns."""
    return x.antonyms()

def derivation(x):
    """Whatever `x.derivationally_related_forms()` returns."""
    return x.derivationally_related_forms()

def also_sees(x):
    """Whatever `x.also_sees()` returns."""
    return x.also_sees()

def verb_groups(x):
    """Whatever `x.verb_groups()` returns."""
    return x.verb_groups()

def pertainyms(x):
    """Whatever `x.pertainyms()` returns."""
    return x.pertainyms()

def topic_domains(x):
    """Whatever `x.topic_domains()` returns."""
    return x.topic_domains()

def usage_domains(x):
    """Whatever `x.usage_domains()` returns."""
    return x.usage_domains()

def region_domains(x):
    """Whatever `x.region_domains()` returns."""
    return x.region_domains()

# One adjacency map per WordNet relation, keyed by a display label. Each value
# is whatever make_neighbors returns for that relation's accessor.
edges = {
    'Hypernyms': make_neighbors(hypernyms, lemmas, row_ids),
    'Hyponyms': make_neighbors(hyponyms, lemmas, row_ids),
    'Antonyms': make_neighbors(antonyms, lemmas, row_ids),
    'Derivationally Related Forms': make_neighbors(derivation, lemmas, row_ids),
    'Usage Domains': make_neighbors(usage_domains, lemmas, row_ids),
    'Also Sees': make_neighbors(also_sees, lemmas, row_ids),
    'Verb Groups': make_neighbors(verb_groups, lemmas, row_ids),
    'Pertainyms': make_neighbors(pertainyms, lemmas, row_ids),
    'Topic Domains': make_neighbors(topic_domains, lemmas, row_ids),
    'Region Domains': make_neighbors(region_domains, lemmas, row_ids),
}

# Keep only the relations whose total number of edges is strictly greater
# than min_threshold.
min_threshold = 15
edges_parsed = {}
for r, adjacency in edges.items():
    n_edges = sum(len(targets) for targets in adjacency.values())
    if n_edges > min_threshold:
        edges_parsed[r] = adjacency

# NOTE(review): `bad` is not read anywhere in the visible part of this notebook.
bad = 0
def make_out_r(in_edges):
    """Reverse an adjacency map.

    Parameters
    ----------
    in_edges : dict
        Maps an index `i` to an iterable of indices `j`.

    Returns
    -------
    dict
        Maps every `j` to the list of `i` values that listed it, in the
        order they were encountered. Every `i` of `in_edges` is also a key
        of the result, with an empty list if no entry listed it.

    """
    reversed_edges = {}
    for source, targets in in_edges.items():
        for target in targets:
            reversed_edges.setdefault(target, []).append(source)
        # Guarantee the source itself has an entry, even when nothing lists it.
        reversed_edges.setdefault(source, [])
    return reversed_edges
# Apply make_out_r to the adjacency map of every retained relation.
out_edges = {}
for relation, incoming in edges_parsed.items():
    out_edges[relation] = make_out_r(incoming)

# Hand both edge maps to save_obj under the names the retrofitting cell loads.
save_obj(edges_parsed, "in_edges")
save_obj(out_edges, "out_edges")
print_edge_counts(edges_parsed)
elapsed = time.time() - t
print("Took {:.2f} seconds.".format(elapsed))

69 Edges of Type: Usage Domains
60250 Edges of Type: Derivationally Related Forms
136235 Edges of Type: Hypernyms
5573 Edges of Type: Pertainyms
136235 Edges of Type: Hyponyms
5922 Edges of Type: Antonyms
Took 5.90 seconds.


## Retrofit

In [25]:
# Reload the reduced embeddings and both edge maps through load_obj, using the
# same names that the earlier cells passed to save_obj.
X = load_obj("X_reduced_{:d}".format(n_components))
in_edges = load_obj("in_edges")
out_edges = load_obj("out_edges")

In [26]:
# Retrofit Identity (Faruqui et al)
# Baseline run: retrofit_identity receives the reduced embeddings and only the
# in-edge maps.
t = time.time()
# alpha is passed as a callable that returns 1 for any argument.
# n_iter=20 presumably caps the number of iterations -- confirm in retrofit_identity.
retrofitted_baseline = retrofit_identity(
    X, in_edges, n_iter=20, alpha=lambda i: 1, verbose=True)
print("Baseline retrofitting took {:.2f} seconds.".format(time.time() - t))
# Saved under the name the evaluation cell later passes to load_obj.
save_obj(retrofitted_baseline, 'retrofitted_baseline')

Converged at iteration 6
Baseline retrofitting took 9.42 seconds.


In [32]:
# Linear
# Linear retrofitting run: retrofit_linear receives both the in-edge and the
# out-edge maps and returns three values, bound here to retrofitted_linear, A, B.
t = time.time()
retrofitted_linear, A, B = retrofit_linear(X, in_edges, out_edges, n_iter=20,
                                           alpha=lambda i: 1, orthogonal=True,
                                           lam=1e-3, lr=0.1, lr_decay=0.95, verbose=True)
print("Linear retrofitting took {:.2f} seconds.".format(time.time() - t))
# Drop singleton dimensions before comparing shapes with the baseline result.
retrofitted_linear = np.squeeze(retrofitted_linear)
# Both retrofitting variants must yield matrices of identical shape.
assert(retrofitted_baseline.shape == retrofitted_linear.shape)
save_obj(retrofitted_linear, 'retrofitted_linear')

Iteration 1 of 20	Changes: 0.632	Loss: 133914.809
Iteration 2 of 20	Changes: 0.579	Loss: 131307.219
Iteration 3 of 20	Changes: 0.528	Loss: 129638.506
Iteration 4 of 20	Changes: 0.476	Loss: 128538.368
Iteration 5 of 20	Changes: 0.427	Loss: 127789.000
Iteration 6 of 20	Changes: 0.381	Loss: 127260.309
Iteration 7 of 20	Changes: 0.339	Loss: 126873.428
Iteration 8 of 20	Changes: 0.301	Loss: 126579.837
Iteration 9 of 20	Changes: 0.267	Loss: 126349.244
Iteration 10 of 20	Changes: 0.238	Loss: 126162.433
Iteration 11 of 20	Changes: 0.211	Loss: 126007.008
Iteration 12 of 20	Changes: 0.188	Loss: 125874.821
Iteration 13 of 20	Changes: 0.168	Loss: 125760.400
Iteration 14 of 20	Changes: 0.150	Loss: 125659.985
Iteration 15 of 20	Changes: 0.134	Loss: 125570.919
Iteration 16 of 20	Changes: 0.120	Loss: 125491.275
Iteration 17 of 20	Changes: 0.108	Loss: 125419.611
Iteration 18 of 20	Changes: 0.097	Loss: 125354.818
Iteration 19 of 20	Changes: 0.088	Loss: 125296.017
Iteration 20 of 20	Changes: 0.080	Loss: 

## Evaluation
The evaluation code below is adapted from Chris Potts's [CS224u course notes](http://nbviewer.jupyter.org/github/cgpotts/cs224u/blob/master/vsm.ipynb#In-class-bake-off:-Word-similarity).

In [33]:
import csv
# Root directory of the evaluation datasets, relative to this notebook.
vsmdata_home = "Evaluation/vsmdata"

def wordsim_dataset_reader(src_filename, header=False, delimiter=','):
    """Basic reader for the word-similarity files, which all have the
    format word1,word2,score and differ only in whether or not they include
    a header line and what delimiter they use.

    Parameters
    ----------
    src_filename : str
        Full path to the source file.

    header : bool (default: False)
        Whether `src_filename` has a header line to skip.

    delimiter : str (default: ',')
        Field delimiter in `src_filename`.

    Yields
    ------
    (str, str, float)
       (w1, w2, score) where `score` is the negative of the similarity
       score in the file so that we are intuitively aligned with our
       distance-based code.

    Raises
    ------
    ValueError
        If a non-empty row does not have exactly three fields, or its
        third field cannot be parsed as a float.

    """
    # The `with` block closes the file once the generator is exhausted or
    # closed; previously the handle was never closed explicitly.
    with open(src_filename) as src_file:
        reader = csv.reader(src_file, delimiter=delimiter)
        if header:
            # The default keeps an empty file from raising StopIteration
            # inside the generator body.
            next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline
            # pair); skip them instead of failing on the unpack below.
            if not row:
                continue
            w1, w2, score = row
            # Negative of scores to align intuitively with distance functions:
            yield (w1, w2, -float(score))

def wordsim353_reader():
    """WordSim-353: http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/

    Returns what `wordsim_dataset_reader` produces for `wordsim353.csv` in
    the `wordsim` folder under `vsmdata_home`, with `header=True`.
    """
    path_parts = (vsmdata_home, 'wordsim', 'wordsim353.csv')
    return wordsim_dataset_reader(os.path.join(*path_parts), header=True)
 
def mturk287_reader():
    """MTurk-287: http://tx.technion.ac.il/~kirar/Datasets.html

    Returns what `wordsim_dataset_reader` produces for `MTurk-287.csv` in
    the `wordsim` folder under `vsmdata_home`, with `header=False`.
    """
    path_parts = (vsmdata_home, 'wordsim', 'MTurk-287.csv')
    return wordsim_dataset_reader(os.path.join(*path_parts), header=False)
    
def mturk771_reader():
    """MTURK-771: http://www2.mta.ac.il/~gideon/mturk771.html

    Returns what `wordsim_dataset_reader` produces for `MTURK-771.csv` in
    the `wordsim` folder under `vsmdata_home`, with `header=False`.
    """
    path_parts = (vsmdata_home, 'wordsim', 'MTURK-771.csv')
    return wordsim_dataset_reader(os.path.join(*path_parts), header=False)

In [34]:
def cosine(u, v):
    """Cosine distance between 1d np.arrays `u` and `v`, which must have
    the same dimensionality. Returns a float.

    This is SciPy's definition: 1 - (u . v) / (|u| * |v|), so identical
    directions give 0 and opposite directions give 2.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.spatial` has been loaded, so the attribute
    # lookup could otherwise depend on what other packages imported first.
    from scipy.spatial.distance import cosine as scipy_cosine
    return scipy_cosine(u, v)
    
def word_similarity_evaluation(reader, mat, row_ids, distfunc=cosine):
    """Word-similarity evaluation framework.

    Parameters
    ----------
    reader : callable
        Zero-argument function returning an iterable over a
        word-similarity dataset. The iterable just has to yield
        tuples (word1, word2, score).

    mat : 2d np.array
        The VSM being evaluated.

    row_ids : dict
        Maps each word to the index of its row in `mat`.

    distfunc : function mapping vector pairs to floats (default: `cosine`)
        The measure of distance between vectors. Any distance measure
        between 1d vectors can be used.

    Prints
    ------
    To standard output
        Size of the vocabulary overlap between the evaluation set and
        `row_ids`. We limit the evaluation to the overlap, paying no price
        for missing words (which is not fair, but it's reasonable given
        that we're working with very small VSMs in this notebook).

    Returns
    -------
    float
        The Spearman rank correlation coefficient between the dataset
        scores and the similarity values obtained from `mat` using
        `distfunc`. This evaluation is sensitive only to rankings, not
        to absolute values. NaN when no dataset pair has both words in
        `row_ids`.

    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported `defaultdict`, nor on `scipy.stats` having been loaded
    # as a side effect of some other import.
    from collections import defaultdict
    from scipy.stats import spearmanr

    sims = defaultdict(list)
    vocab = set()
    for w1, w2, score in reader():
        if w1 in row_ids and w2 in row_ids:
            # Each pair is recorded from both sides, so it contributes two
            # (score, distance) observations below.
            sims[w1].append((w2, score))
            sims[w2].append((w1, score))
            vocab.add(w1)
            vocab.add(w2)
    print("Evaluation vocabulary size: %s" % len(vocab))
    if not vocab:
        # Nothing to correlate; report "undefined" rather than handing
        # empty sequences to spearmanr.
        return float('nan')
    # Evaluate the matrix by creating a vector of all_scores for data
    # and all_dists for mat's distances.
    all_scores = []
    all_dists = []
    for word in vocab:
        vec = mat[row_ids[word]]
        cmps, scores = zip(*sims[word])
        all_scores += scores
        all_dists += [distfunc(vec, mat[row_ids[w]]) for w in cmps]
    # Return just the rank correlation coefficient (index [1] would be the p-value):
    return spearmanr(all_scores, all_dists)[0]

def full_word_similarity_evaluation(mat, row_ids):
    """Evaluate the (mat, row_ids) VSM against the three word-similarity datasets.

    For the WordSim-353, MTURK-771 and MTurk-287 readers, in that order,
    prints a rule of 40 dashes, the reader function's name, and the
    Spearman coefficient returned by `word_similarity_evaluation`.
    """
    dataset_readers = (wordsim353_reader, mturk771_reader, mturk287_reader)
    rule = "-"*40
    for dataset_reader in dataset_readers:
        print(rule)
        print(dataset_reader.__name__)
        rho = word_similarity_evaluation(dataset_reader, mat, row_ids)
        print('Spearman r: %0.03f' % rho)

In [35]:
from collections import defaultdict, Counter
X = load_obj("X_reduced_{:d}".format(n_components))
retrofitted_baseline = load_obj('retrofitted_baseline')
retrofitted_linear = load_obj('retrofitted_linear')

# (heading, embedding matrix) pairs, evaluated in this order; a rule of 80
# "=" characters is printed between consecutive reports.
embedding_variants = [
    ("Distributional Embeddings", X),
    ("Distributional Embeddings, Identity Retrofitting", retrofitted_baseline),
    ("Distributional Embeddings, Linear Retrofitting", retrofitted_linear),
]
for position, (heading, matrix) in enumerate(embedding_variants):
    if position > 0:
        print("="*80)
    print(heading)
    full_word_similarity_evaluation(matrix, row_ids_by_name)

Distributional Embeddings
----------------------------------------
wordsim353_reader
Evaluation vocabulary size: 429
Spearman r: 0.505
----------------------------------------
mturk771_reader
Evaluation vocabulary size: 1109
Spearman r: 0.535
----------------------------------------
mturk287_reader
Evaluation vocabulary size: 301
Spearman r: 0.652
Distributional Embeddings, Identity Retrofitting
----------------------------------------
wordsim353_reader
Evaluation vocabulary size: 429
Spearman r: 0.524
----------------------------------------
mturk771_reader
Evaluation vocabulary size: 1109
Spearman r: 0.511
----------------------------------------
mturk287_reader
Evaluation vocabulary size: 301
Spearman r: 0.627
Distributional Embeddings, Linear Retrofitting
----------------------------------------
wordsim353_reader
Evaluation vocabulary size: 429
Spearman r: 0.531
----------------------------------------
mturk771_reader
Evaluation vocabulary size: 1109
Spearman r: 0.539
-------------

In [36]:
def load_synrel(filename):
    """Read the four-field rows of a space-separated file.

    Each line is split on single spaces and every field is stripped of
    surrounding whitespace. Lines that do not split into exactly four
    fields are dropped.

    Returns a list of 4-element lists of strings, in file order.
    """
    with open(filename, 'r') as handle:
        rows = [[field.strip() for field in raw.split(' ')] for raw in handle]
    return [row for row in rows if len(row) == 4]

def cos(a, b):
    """Cosine similarity: the dot product of `a` and `b` divided by the
    product of their 2-norms."""
    norm_product = np.linalg.norm(a, ord=2) * np.linalg.norm(b, ord=2)
    return a.T.dot(b) / norm_product

def syn_rel_evaluation(X, row_ids, filename='Evaluation/synreldata/analogies.txt'):
    """Score analogies "w0 : w1 :: w2 : w3" by vector arithmetic.

    For every row returned by `load_synrel(filename)`, computes the cosine
    similarity (via `cos`) between X[w1] - X[w0] + X[w2] and X[w3].
    Rows containing a word that is not a key of `row_ids` are skipped.

    Parameters
    ----------
    X : 2d np.array
        Embedding matrix, indexed by the values of `row_ids`.

    row_ids : dict
        Maps each word to the index of its row in `X`.

    filename : str (default: 'Evaluation/synreldata/analogies.txt')
        Analogy file handed to `load_synrel`.

    Prints
    ------
    To standard output
        The number of analogies scored and their mean cosine similarity
        (nan when none could be scored).

    Returns
    -------
    None
        The result is only printed, so a notebook cell ending in a call
        to this function shows no extra output value.

    """
    sims = []
    for analogy in load_synrel(filename):
        try:
            w0, w1, w2, w3 = [X[row_ids[word]] for word in analogy]
        except KeyError:
            # At least one of the four words has no row; skip the analogy.
            continue
        sims.append(cos(w1 - w0 + w2, w3))
    # np.mean of an empty array warns and yields nan; produce the nan
    # directly so an empty overlap prints cleanly.
    mean_sim = np.mean(np.array(sims)) if sims else float('nan')
    print("Tested {:d} Analogies, Mean Cosine Similarity: {:.5f}".format(len(sims), mean_sim))

# (heading, embedding matrix) pairs; each report is preceded by a rule of
# 60 "=" characters.
analogy_runs = [
    ("Testing Syntatic Relations for Word2Vec Distributional Embeddings", X),
    ("Testing Syntatic Relations for Word2Vec, Identity Retrofitting", retrofitted_baseline),
    ("Testing Syntatic Relations for Word2Vec, Linear Retrofitting", retrofitted_linear),
]
for heading, matrix in analogy_runs:
    print("="*60)
    print(heading)
    syn_rel_evaluation(matrix, row_ids_by_name)

Testing Syntatic Relations for Word2Vec Distributional Embeddings
Tested 12148 Analogies, Mean Cosine Similarity: 0.76070
Testing Syntatic Relations for Word2Vec, Identity Retrofitting
Tested 12148 Analogies, Mean Cosine Similarity: 0.75057
Testing Syntatic Relations for Word2Vec, Linear Retrofitting
Tested 12148 Analogies, Mean Cosine Similarity: 0.77100
