In [1]:
from data_loading import load_embeddings, load_sim_dataset
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Evaluating Word Representations

## Word embeddings

We will examine how well word embeddings represent the meaning of words. In particular, we use the pretrained word embeddings from [1]: dependency-based embeddings and bag-of-words embeddings with context window sizes k = 2 and k = 5.

In [2]:
# Load word embeddings
# NOTE(review): judging by the file name this is presumably the bag-of-words
# (k = 2) model mentioned in the introduction — confirm against the data directory.
bow2embeddings = load_embeddings("data/bow2.words")

# Load similarity datasets; each call is unpacked into the word pairs and their
# gold scores, which are later passed to score_pairs.
# NOTE(review): score_col / skip presumably select the score column and the
# number of leading lines to drop — confirm against data_loading.load_sim_dataset.
simlex_pairs, simlex_scores = load_sim_dataset("data/SimLex-999.txt", score_col=3, skip=1)
men_pairs, men_scores = load_sim_dataset("data/MEN_dataset_natural_form_full", score_col=2)

In [3]:
def score_pairs(pairs, scores, embeddings):
    """Print how well embedding cosine similarities track gold similarity scores.

    For every word pair whose two words are both present in ``embeddings``,
    the cosine similarity of their vectors is computed and compared with the
    corresponding gold score using Pearson and Spearman correlation.

    Args:
        pairs: Sequence of word pairs; ``pair[0]`` and ``pair[1]`` are the words.
        scores: Gold similarity scores, aligned with ``pairs`` by index.
        embeddings: Mapping from word to vector (must support ``in`` and ``[]``).

    Returns:
        None. The two correlations are printed, formatted to four decimals.

    Raises:
        ValueError: If fewer than two pairs can be scored, because a
            correlation is undefined in that case.
    """
    gold_scores = []
    sim_scores = []
    for i, pair in enumerate(pairs):
        if pair[0] not in embeddings or pair[1] not in embeddings:
            # Out-of-vocabulary pairs cannot be scored; leave them out.
            continue
        a = embeddings[pair[0]]
        b = embeddings[pair[1]]
        norm_product = np.linalg.norm(a) * np.linalg.norm(b)
        if norm_product == 0:
            # Cosine similarity is undefined for a zero vector; dividing would
            # yield NaN and turn both correlations into NaN.
            continue
        gold_scores.append(scores[i])
        sim_scores.append(np.dot(a, b) / norm_product)

    if len(gold_scores) < 2:
        # Fail with a message that points at vocabulary coverage instead of
        # an error raised from inside the correlation routines.
        raise ValueError(
            "Need at least 2 scorable pairs to compute a correlation, got {}".format(
                len(gold_scores)
            )
        )

    print("Pearson correlation: {:.4f}".format(pearsonr(gold_scores, sim_scores)[0]))
    # Index [0] is the correlation coefficient, mirroring the Pearson line above.
    print("Spearman correlation: {:.4f}".format(spearmanr(gold_scores, sim_scores)[0]))
    
# Evaluate the bow2 embeddings on each benchmark: the first two output lines
# belong to SimLex-999, the last two to MEN.
score_pairs(pairs=simlex_pairs, scores=simlex_scores, embeddings=bow2embeddings)
score_pairs(pairs=men_pairs, scores=men_scores, embeddings=bow2embeddings)

Pearson correlation: 0.4285
Spearman correlation: 0.4141
Pearson correlation: 0.6777
Spearman correlation: 0.6999


### References
[1] Levy, O., & Goldberg, Y. (2014). Dependency-based word embeddings. In Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers) (Vol. 2, pp. 302-308).