In [24]:
import os
import random
import pickle
from collections import defaultdict

import numpy as np

from sklearn.manifold import TSNE, Isomap
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

from models import *

In [2]:
# Default canvas size (width, height) applied to every figure drawn in this notebook.
plt.rcParams.update({"figure.figsize": (15, 20)})

In [3]:
def read_analogy_data(path):
    """Parse a word-analogy question file into per-task lists of questions.

    The file is a sequence of sections. A line starting with ":" opens a new
    section and names its task; every following non-blank line is one
    question, stored as a list of lower-cased, whitespace-separated tokens.

    Args:
        path: Location of the questions file.

    Returns:
        A ``defaultdict(list)`` mapping task name -> list of token lists.

    Raises:
        ValueError: If a question line appears before any ":" section header,
            so it cannot be attributed to a task.
    """
    analogy_data = defaultdict(list)
    task = None  # name of the section currently being read
    with open(path, encoding="utf-8") as reader:
        for line_number, raw_line in enumerate(reader, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines carry no question; storing them would add empty rows.
                continue
            if line.startswith(":"):
                task = line.strip(":").strip()
                continue
            if task is None:
                raise ValueError(
                    "{}:{}: question found before any ':' section header".format(
                        path, line_number
                    )
                )
            # convert to lower-case
            analogy_data[task].append(line.lower().split())
    return analogy_data
# Parse the question file once; later cells read this module-level mapping.
analogy_data = read_analogy_data(path="./data/questions-words.txt")

In [4]:
# List the analogy sub-categories, i.e. the ":"-prefixed section names
# collected by read_analogy_data.
analogy_data.keys()

dict_keys(['gram7-past-tense', 'gram8-plural', 'city-in-state', 'currency', 'gram2-opposite', 'gram6-nationality-adjective', 'gram9-plural-verbs', 'gram1-adjective-to-adverb', 'family', 'capital-world', 'capital-common-countries', 'gram4-superlative', 'gram5-present-participle', 'gram3-comparative'])

In [8]:
# Show how many analogy questions each sub-category contains.
for task_name, questions in analogy_data.items():
    print(task_name, len(questions))

gram7-past-tense 1560
gram8-plural 1332
city-in-state 2467
currency 866
gram2-opposite 812
gram6-nationality-adjective 1599
gram9-plural-verbs 870
gram1-adjective-to-adverb 992
family 506
capital-world 4524
capital-common-countries 506
gram4-superlative 1122
gram5-present-participle 1056
gram3-comparative 1332


In [5]:
# Load the "bow2.words" embeddings; `load_model` is not defined in this
# notebook (presumably it comes from `from models import *` — TODO confirm).
# NOTE(review): the recorded output suggests the first call builds the model
# from the raw file, which takes a while — verify against models.py.
bow2_sim = load_model("bow2.words")
# The two other embedding sets and the combined `models` mapping below are
# currently disabled; only bow2 is evaluated in this notebook run.
#bow5_sim = load_model("bow5.words")
#deps_sim = load_model("deps.words")

# models = {
#     "bow2": bow2_sim,
#     "bow5": bow5_sim,
#     "deps": deps_sim
# }

Model doesn't exist, creating it
Read 0 lines
Read 10000 lines
Read 20000 lines
Read 30000 lines
Read 40000 lines
Read 50000 lines
Read 60000 lines
Read 70000 lines
Read 80000 lines
Read 90000 lines
Read 100000 lines
Read 110000 lines
Read 120000 lines
Read 130000 lines
Read 140000 lines
Read 150000 lines
Read 160000 lines
Read 170000 lines
Read 180000 lines


In [11]:
def reciprocal_rank(correct_value, results):
    """Return the reciprocal rank of `correct_value` within `results`.

    Args:
        correct_value: The expected answer.
        results: Ranked sequence of candidates, best first. Must provide an
            ``index`` method that raises ``ValueError`` for a missing value
            (as ``list`` and ``tuple`` do).

    Returns:
        ``1 / rank`` where rank is the 1-based position of the first
        occurrence of `correct_value`, or ``0`` when it is not in `results`.
    """
    try:
        position = results.index(correct_value)
    except ValueError:
        # .index() signals "not found" by raising, never by returning -1.
        return 0
    return 1 / (position + 1)

# tests
# print(reciprocal_rank("cats", ["catten", "cati", "cats"]))
# print(reciprocal_rank("tori", ["catten", "tori", "cats"]))
# print(reciprocal_rank("virus", ["virus", "cati", "cats"]))

0.3333333333333333
0.5
1.0


In [None]:
def compute_analogy(model, a, a_star, b):
    """Rank the vocabulary for the analogy "a is to a_star as b is to ?".

    The query vector is ``model[b] + (model[a_star] - model[a])``; it is
    passed to ``model.most_similar_to_vector`` with ``n`` set to
    ``len(model.embeddings)``.

    Returns:
        Whatever ``model.most_similar_to_vector`` returns, or ``None`` when
        any of the three words is missing from ``model.word_index``.
    """
    vocabulary = model.word_index
    if not all(word in vocabulary for word in (a, a_star, b)):
        return None

    vec_a, vec_a_star, vec_b = model[a], model[a_star], model[b]
    # Apply the a -> a_star offset to b.
    target = vec_b + (vec_a_star - vec_a)
    return model.most_similar_to_vector(target, n=len(model.embeddings))

def compute_scores(task, model):
    """Evaluate `model` on every analogy question of one task.

    Questions are rows ``(a, a_star, b, b_star_actual)`` taken from the
    module-level ``analogy_data``. A question is skipped when the expected
    answer is not in ``model.word_index`` or when ``compute_analogy`` returns
    ``None`` (one of the three query words is out of vocabulary).

    Accuracy and MRR are both averaged over the questions that were actually
    evaluated, matching how ``run_evaluation`` aggregates the overall scores.
    The number of skipped questions is printed alongside them.

    Args:
        task: Name of the analogy sub-category to score.
        model: Embedding model; must expose ``word_index`` and whatever
            ``compute_analogy`` needs.

    Returns:
        Tuple ``(accuracy, mrr, correct, reciprocal_ranks)`` where `correct`
        is the number of top-1 hits and `reciprocal_ranks` holds one value per
        evaluated question. `accuracy` and `mrr` are NaN when no question
        could be evaluated.
    """
    print("Computing scores for task: ", task)
    # .get() avoids inserting an empty entry into the defaultdict for an unknown task.
    data = analogy_data.get(task, [])

    correct = 0
    reciprocal_ranks = []
    skipped = 0
    for index, (a, a_star, b, b_star_actual) in enumerate(data):
        # Report progress before any skip check so the log advances regularly.
        if index % 100 == 0:
            print("\t{} of {}".format(index, len(data)))

        # if the final word doesn't exist, then there's no point
        if b_star_actual not in model.word_index:
            skipped += 1
            continue

        b_star_results = compute_analogy(model, a, a_star, b)

        # ignore if some of the words are not in the vocabulary
        if b_star_results is None:
            skipped += 1
            continue

        if b_star_results[0] == b_star_actual:
            correct += 1

        reciprocal_ranks.append(reciprocal_rank(b_star_actual, b_star_results))

    print("\t ... done")
    evaluated = len(reciprocal_ranks)
    if evaluated:
        accuracy, mrr = correct / evaluated, np.mean(reciprocal_ranks)
    else:
        # Nothing to average: report "undefined" instead of dividing by zero.
        accuracy, mrr = float("nan"), float("nan")
    print("\t Skipped: {}, Accuracy: {}, MRR: {}".format(skipped, accuracy, mrr))
    return accuracy, mrr, correct, reciprocal_ranks


def run_evaluation(model):
    """Score `model` on every task in ``analogy_data`` and aggregate the results.

    Args:
        model: Embedding model passed through to ``compute_scores``.

    Returns:
        A ``defaultdict(dict)`` with one entry per task plus an ``"overall"``
        entry; each entry has the keys ``"accuracy"`` and ``"MRR"``. The
        overall values are computed over all evaluated questions pooled
        across tasks, and are NaN when no question could be evaluated.
    """
    scores = defaultdict(dict)
    overall_correct = 0
    overall_reciprocal_ranks = []

    # Only the task names are needed here; compute_scores fetches the rows itself.
    for task in list(analogy_data):
        accuracy, mrr, correct, reciprocal_ranks = compute_scores(task, model)
        scores[task]["accuracy"] = accuracy
        scores[task]["MRR"] = mrr

        overall_correct += correct
        overall_reciprocal_ranks.extend(reciprocal_ranks)

    # One reciprocal rank is recorded per evaluated question.
    overall_evaluated = len(overall_reciprocal_ranks)
    if overall_evaluated:
        scores["overall"]["accuracy"] = overall_correct / overall_evaluated
        scores["overall"]["MRR"] = np.mean(overall_reciprocal_ranks)
    else:
        # Every question was skipped (or there were none): avoid ZeroDivisionError.
        scores["overall"]["accuracy"] = float("nan")
        scores["overall"]["MRR"] = float("nan")

    return scores

# Evaluate the bow2 embeddings on every analogy task and persist the scores.
bow2_scores = run_evaluation(bow2_sim)
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("bow2_analogy_scores.pkl", "wb") as scores_file:
    pickle.dump(bow2_scores, scores_file)

Computing scores for task:  gram7-past-tense
	0 of 1560
	100 of 1560
	200 of 1560
	300 of 1560
	400 of 1560
	500 of 1560
	600 of 1560
	700 of 1560
	800 of 1560
	900 of 1560
	1000 of 1560
