In [1]:
import os
import bz2
import codecs
import pickle as pkl

import numpy as np

In [2]:
def read_embedding_file(file_name, selected_words = None):
    """Read word embeddings from a bz2-compressed text file.

    Every non-blank line is split on whitespace: the first field is the
    word (decoded as UTF-8) and the remaining fields are parsed as floats
    to form its vector. Blank lines are skipped.

    Args:
        file_name: Path to the ``.bz2`` file.
        selected_words: Optional collection of words to keep. When ``None``
            every word found in the file is returned.

    Returns:
        A ``(word_index, embeddings)`` tuple of two parallel lists: the
        words in file order and one ``numpy`` float array per word.
    """
    # `in` on a list/tuple is a linear scan repeated for every line of the
    # file; a set makes the filter O(1) per line.
    if isinstance(selected_words, (list, tuple)):
        wanted = set(selected_words)
    else:
        wanted = selected_words

    word_index = []
    embeddings = []

    with bz2.BZ2File(file_name, "r") as reader:
        for idx, line in enumerate(reader):
            if idx % 10000 == 0:
                print("Read {} lines".format(idx))
            fields = line.strip().split()
            if not fields:
                # A blank line has no word field; indexing it would raise.
                continue
            word = fields[0].decode('utf-8')
            if wanted is not None and word not in wanted:
                continue
            word_index.append(word)
            embeddings.append(np.array(list(map(float, fields[1:]))))

    return word_index, embeddings

In [3]:
def cosine_similarity(u, v):
    """Return the cosine similarity of ``u`` and ``v``, rounded to 3 decimals.

    Computed as the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    cosine = dot_product / norm_product
    return round(cosine, 3)

In [4]:
class WordSimilarityModel:
    """Word-embedding lookup table with nearest-neighbour queries."""

    def __init__(self, word_index, embeddings, similarity="cosine"):
        """Build the model from parallel sequences of words and vectors.

        Args:
            word_index: Sequence of words; position ``i`` names ``embeddings[i]``.
            embeddings: Sequence of vectors, indexable by integer position.
            similarity: Either the string ``"cosine"`` (uses
                ``cosine_similarity``) or a callable ``f(u, v)`` returning a
                score where larger means more similar.

        Raises:
            ValueError: If ``similarity`` is neither ``"cosine"`` nor callable.
        """
        # word -> row position in `embeddings`
        self.word_index = {word: position for position, word in enumerate(word_index)}
        self.embeddings = embeddings

        if similarity == "cosine":
            self.similarity = cosine_similarity
        elif callable(similarity):
            self.similarity = similarity
        else:
            # Fail here rather than with an opaque "'str' object is not
            # callable" on the first most_similar() call.
            raise ValueError(
                "similarity must be 'cosine' or a callable, got {!r}".format(similarity)
            )

    def __getitem__(self, word):
        """Return the embedding stored for ``word``.

        Raises:
            ValueError: If ``word`` is not in the model.
        """
        if word not in self.word_index:
            raise ValueError("Word '{}' doesn't exist in the model".format(word))
        return self.embeddings[self.word_index[word]]

    def most_similar(self, word, n=10, score=False):
        """Return the ``n`` words most similar to ``word``.

        Args:
            word: Query word; must exist in the model.
            n: Maximum number of neighbours to return.
            score: When true, return ``(word, similarity)`` tuples instead of
                bare words.

        Returns:
            A list ordered by descending similarity. The query word itself is
            excluded; ties keep the model's insertion order.

        Raises:
            ValueError: If ``word`` is not in the model.
        """
        u = self[word]
        # Use the stored position directly instead of a second dict lookup
        # through __getitem__ for every vocabulary entry.
        scored = [
            (other_word, self.similarity(u, self.embeddings[idx]))
            for other_word, idx in self.word_index.items()
            if other_word != word
        ]
        # Stable sort: equal scores stay in vocabulary order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        top = scored[:n]
        if score:
            return top
        return [neighbour for neighbour, _ in top]

In [5]:
def load_model(model_name):
    """Load a cached similarity model, building and caching it if absent.

    Looks for ``./models/<model_name>.pkl``. If it is missing, the
    embeddings are read from ``./data/<model_name>.bz2``, wrapped in a
    ``WordSimilarityModel`` and pickled to the cache path.

    Args:
        model_name: Base name shared by the data file and the cache file.

    Returns:
        The object unpickled from the cache, or the freshly built
        ``WordSimilarityModel``.
    """
    pkl_file = "./models/{}.pkl".format(model_name)
    if os.path.exists(pkl_file):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook produced itself.
        with open(pkl_file, "rb") as cache:
            return pkl.load(cache)

    print("Model doesn't exist, creating it")
    word_index, embeddings = read_embedding_file("./data/{}.bz2".format(model_name))
    sim = WordSimilarityModel(word_index, embeddings)
    # Make sure the cache directory exists so the expensive build above is
    # not thrown away by a failing open().
    os.makedirs(os.path.dirname(pkl_file), exist_ok=True)
    with open(pkl_file, "wb") as cache:
        pkl.dump(sim, cache)
    return sim

In [6]:
# Load one similarity model per embedding file. load_model() reads the pickle
# cache ./models/<name>.pkl when it exists and otherwise builds the model from
# ./data/<name>.bz2.
dep_based = load_model("deps.words")
bow2_based = load_model("bow2.words")
bow5_based = load_model("bow5.words")

In [7]:
# [label, model] pairs for the three loaded models. Each label string ends
# with a trailing space.
all_models = [['Dependency ', dep_based], 
              ['BoW with k = 2 ', bow2_based], 
              ['BoW with k = 5 ', bow5_based]]

In [18]:
# Read the SimLex-999 word-pair file (SimLex-999.txt)

def read_sim_file(filename):
    """Parse a SimLex-999-style file into per-word similarity lists.

    The first line is treated as a header and skipped. Every other
    non-blank line is split on whitespace; field 0 and field 1 are the two
    words and field 3 is the similarity score.

    Args:
        filename: Path to the text file.

    Returns:
        A dict mapping each first word to a list of ``(second_word, score)``
        tuples sorted by descending score. An empty file yields ``{}``.
    """
    pairs_dict = {}
    with open(filename, "r") as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines carry no pair.
                continue
            word1, word2, score = fields[0], fields[1], fields[3]
            pairs_dict.setdefault(word1, []).append((word2, float(score)))
    # Sorting by scores, highest first
    for list_value in pairs_dict.values():
        list_value.sort(key=lambda pair: -pair[1])
    return pairs_dict


In [24]:
# Read the MEN dataset (natural form, full)

def read_men_file(filename):
    """Parse a MEN-style file into per-word similarity lists.

    Every non-blank line is split on whitespace; field 0 and field 1 are
    the two words and field 2 is the similarity score. No header line is
    skipped.

    Args:
        filename: Path to the text file.

    Returns:
        A dict mapping each first word to a list of ``(second_word, score)``
        tuples sorted by descending score.
    """
    pairs_dict = {}
    with open(filename, "r") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines carry no pair; indexing them would raise.
                continue
            word1, word2, score = fields[0], fields[1], fields[2]
            pairs_dict.setdefault(word1, []).append((word2, float(score)))

    # Highest score first within each word's list.
    for value in pairs_dict.values():
        value.sort(key=lambda pair: -pair[1])

    return pairs_dict

In [28]:
# Parse SimLex-999 into {word1: [(word2, score), ...]}, each list sorted by
# descending score.
simlex_999 = read_sim_file("data/SimLex-999/SimLex-999.txt")

In [29]:
# Parse the MEN file into {word1: [(word2, score), ...]}, each list sorted by
# descending score.
men_natural = read_men_file("data/MEN/MEN_dataset_natural_form_full")

In [31]:
# [label, parsed pairs dict] for each gold-standard similarity dataset.
eval_models = [["SimLex-999", simlex_999], 
               ["MEN (Natural) Full Form", men_natural]]

In [58]:
from scipy import stats

In [170]:
# Evaluation
# For each head word of the gold dataset that the model also knows:
#     take the number of gold pairs recorded for that word,
#     normalise the gold scores by their sum,
#     fetch the same number of nearest neighbours from the model,
#     store one gold score and one model score per head word in two 1-D
#     arrays for Spearman correlation.
#
# NOTE(review): the original cell iterated over an undefined name `p`, which
# fails on a fresh kernel. The surrounding comments and the `simlex_length`
# variable suggest it was the SimLex-999 dict -- confirm before relying on
# the numbers.
gold_pairs = simlex_999
spearman_a = np.zeros(len(gold_pairs))
spearman_b = np.zeros(len(gold_pairs))
for i, (key, value) in enumerate(gold_pairs.items()):
    # Direct dict membership replaces a scan over the whole vocabulary, and
    # the message is printed once per missing word rather than once per
    # non-matching vocabulary entry.
    if key not in bow5_based.word_index:
        print(key, " doesn't exist in the model.\n")
        continue
    if i % 50 == 0:
        print(i)
    simlex_length = len(value)
    model_all_values = bow5_based.most_similar(key, score = True, n = simlex_length)

    normalized_sum = 0.0
    for _, score in value:
        normalized_sum += score

    # NOTE(review): as in the original loops, only the LAST gold score and the
    # LAST neighbour score survive for each head word. The neighbours come
    # from the model's own ranking, not from the gold pair's second word, so
    # this is probably not the intended pairwise evaluation -- verify.
    spearman_a[i] = round(value[-1][1] / normalized_sum, 3)
    if model_all_values:
        spearman_b[i] = model_all_values[-1][1]

0
50
100
150
200
250
300
350
400
450
500
550
600


In [171]:
# Spearman rank correlation between the gold-side and model-side score arrays.
print(stats.spearmanr(spearman_a, spearman_b, axis = None))   

SpearmanrResult(correlation=0.3767945615424258, pvalue=3.2510867767480334e-22)


In [173]:
# Display the model-side scores filled in by the evaluation cell.
spearman_b

array([0.497, 0.512, 0.469, 0.579, 0.691, 0.605, 0.812, 0.645, 0.665,
       0.522, 0.589, 0.576, 0.736, 0.526, 0.559, 0.542, 0.588, 0.67 ,
       0.696, 0.702, 0.748, 0.64 , 0.722, 0.582, 0.584, 0.688, 0.769,
       0.642, 0.735, 0.492, 0.672, 0.828, 0.485, 0.729, 0.785, 0.664,
       0.633, 0.68 , 0.762, 0.727, 0.519, 0.796, 0.703, 0.765, 0.732,
       0.559, 0.607, 0.605, 0.542, 0.739, 0.715, 0.595, 0.702, 0.594,
       0.703, 0.801, 0.541, 0.561, 0.698, 0.673, 0.711, 0.584, 0.683,
       0.74 , 0.659, 0.733, 0.772, 0.549, 0.666, 0.55 , 0.576, 0.618,
       0.622, 0.758, 0.79 , 0.548, 0.7  , 0.661, 0.848, 0.599, 0.845,
       0.614, 0.568, 0.714, 0.551, 0.58 , 0.688, 0.593, 0.853, 0.668,
       0.825, 0.6  , 0.624, 0.604, 0.735, 0.552, 0.725, 0.707, 0.546,
       0.609, 0.657, 0.638, 0.75 , 0.772, 0.542, 0.726, 0.819, 0.576,
       0.805, 0.635, 0.675, 0.721, 0.649, 0.721, 0.752, 0.592, 0.573,
       0.868, 0.58 , 0.624, 0.713, 0.758, 0.502, 0.586, 0.612, 0.583,
       0.66 , 0.717,