In [1]:
import os
import random
import pickle
from collections import defaultdict, Counter

import numpy as np

from sklearn.manifold import TSNE, Isomap
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

from models import *

In [2]:
# Default size, as (width, height) in inches, for every matplotlib figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (15, 20)

In [3]:
def read_analogy_data(path):
    """Read an analogy question file into parallel lists of questions and labels.

    The file is expected to contain header lines starting with ``:`` that name
    a task, each followed by the questions of that task, one per line, as
    whitespace-separated words.

    Parameters
    ----------
    path : str
        Location of the analogy file.

    Returns
    -------
    (analogy_data, task_labels) : tuple of two lists of equal length
        ``analogy_data[i]`` is the list of lower-cased words of question ``i``
        and ``task_labels[i]`` the name of the task it belongs to.  Questions
        that appear before the first header are labelled ``None``.
    """
    analogy_data = []
    task_labels = []
    # Questions seen before any ":" header have no task; label them None
    # instead of failing on an unbound variable.
    task = None
    with open(path) as reader:
        for raw_line in reader:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no question.
                continue
            if line.startswith(":"):
                task = line.strip(":").strip()
                continue
            # convert to lower-case
            analogy_data.append(line.lower().split())
            task_labels.append(task)
    return analogy_data, task_labels
# Load the analogy questions and the task label of each question from the local data directory.
analogy_data, task_labels = read_analogy_data("./data/questions-words.txt")

In [4]:
# Preview the first ten analogy questions together with their task labels
analogy_data[:10], task_labels[:10]

([['athens', 'greece', 'baghdad', 'iraq'],
  ['athens', 'greece', 'bangkok', 'thailand'],
  ['athens', 'greece', 'beijing', 'china'],
  ['athens', 'greece', 'berlin', 'germany'],
  ['athens', 'greece', 'bern', 'switzerland'],
  ['athens', 'greece', 'cairo', 'egypt'],
  ['athens', 'greece', 'canberra', 'australia'],
  ['athens', 'greece', 'hanoi', 'vietnam'],
  ['athens', 'greece', 'havana', 'cuba'],
  ['athens', 'greece', 'helsinki', 'finland']],
 ['capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries'])

In [5]:
# Number of analogy questions per task
Counter(task_labels)

Counter({'capital-common-countries': 506,
         'capital-world': 4524,
         'city-in-state': 2467,
         'currency': 866,
         'family': 506,
         'gram1-adjective-to-adverb': 992,
         'gram2-opposite': 812,
         'gram3-comparative': 1332,
         'gram4-superlative': 1122,
         'gram5-present-participle': 1056,
         'gram6-nationality-adjective': 1599,
         'gram7-past-tense': 1560,
         'gram8-plural': 1332,
         'gram9-plural-verbs': 870})

In [6]:
# Load the "bow2.words" embeddings. load_model is not defined in this notebook;
# presumably it is provided by the `from models import *` star-import -- TODO confirm.
# The bow5 / deps loads and the `models` lookup dict below are currently disabled.
bow2_sim = load_model("bow2.words")
#bow5_sim = load_model("bow5.words")
#deps_sim = load_model("deps.words")

# models = {
#     "bow2": bow2_sim,
#     "bow5": bow5_sim,
#     "deps": deps_sim
# }

In [7]:
def reciprocal_rank(correct_value, results):
    """Return the reciprocal rank of ``correct_value`` within ``results``.

    Parameters
    ----------
    correct_value : object
        The item to look for.
    results : sequence
        Ranked candidates, best first; must provide an ``index`` method
        (e.g. a list or tuple).

    Returns
    -------
    float
        ``1 / (position + 1)`` for the first occurrence of ``correct_value``
        (1.0 for the top result, 0.5 for the second, ...), or ``0.0`` when it
        does not occur in ``results``.
    """
    try:
        position = results.index(correct_value)
    except ValueError:
        # Only "not found" maps to a score of zero; any other failure (wrong
        # argument type, interrupt, ...) is a real error and must surface.
        return 0.0
    return 1 / (position + 1)

# tests
# print(reciprocal_rank("cats", ["catten", "cati", "cats"]))
# print(reciprocal_rank("tori", ["catten", "tori", "cats"]))
# print(reciprocal_rank("virus", ["virus", "cati", "cats"]))

In [None]:
def compute_wv(model, a, a_star, b):
    """Predict the vector of the missing word in the analogy ``a : a_star :: b : ?``.

    The offset between ``a_star`` and ``a`` is added to ``b``, i.e. the result
    is ``model[b] + (model[a_star] - model[a])``.

    Returns ``None`` when any of the three words is absent from
    ``model.word_index``.
    """
    query_words = (a, a_star, b)
    if any(word not in model.word_index for word in query_words):
        return None
    vec_a, vec_a_star, vec_b = (model[word] for word in query_words)
    return vec_b + (vec_a_star - vec_a)

def compute_scores(data, model, task_labels):
    """Evaluate an embedding model on word-analogy questions and print the scores.

    For every question ``a : a_star :: b : ?`` the vector predicted by
    ``compute_wv`` is compared, by cosine similarity, with the embedding of
    every word in ``model.word_index``.  Two metrics are printed per task and
    over all evaluated questions:

    * accuracy -- share of questions whose most similar word is the expected one;
    * MRR -- mean reciprocal rank of the expected word in the similarity ranking.

    Questions that do not consist of exactly four words, or that contain a word
    missing from ``model.word_index``, are skipped; their number is printed.

    Parameters
    ----------
    data : sequence of sequences of str
        Analogy questions as ``(a, a_star, b, expected_word)``.
    model : object
        Must expose ``word_index`` (mapping word -> key into ``embeddings``),
        ``embeddings`` (indexable by those keys) and whatever ``compute_wv``
        needs (``model[word]`` lookups).
    task_labels : sequence
        Task name of each question, parallel to ``data``.  Only as many
        questions as there are labels are evaluated.

    Returns
    -------
    None
        All results are reported through ``print``.

    Notes
    -----
    * Similarities are computed with numpy on a length-normalised copy of the
      embeddings; rows of zero length get similarity 0 with everything.
      Ties are ranked in ``model.word_index`` iteration order.
    * NOTE(review): the question words ``a``, ``a_star`` and ``b`` stay in the
      candidate list, unlike the usual word2vec analogy protocol -- confirm
      this is intended.
    """
    skipped = 0
    question_tasks = []
    expected_words = []
    predicted_vectors = []
    total = len(data)
    for index, (question, task) in enumerate(zip(data, task_labels)):
        if index % 250 == 0:
            print("\tPreprocessing: {} of {}".format(index, total))

        # Malformed rows (e.g. from blank lines) cannot be unpacked into a question.
        if len(question) != 4:
            skipped += 1
            continue
        a, a_star, b, b_star_actual = question

        # if the final word doesn't exist, then there's no point
        if b_star_actual not in model.word_index:
            skipped += 1
            continue

        b_star = compute_wv(model, a, a_star, b)
        if b_star is None:
            skipped += 1
            continue

        # Keep task, expected word and prediction side by side so the scoring
        # loop below always ranks against the expected word of *this* question.
        question_tasks.append(task)
        expected_words.append(b_star_actual)
        predicted_vectors.append(b_star)

    word_vectors = np.array(predicted_vectors)
    print("\t ... done. Constructed word matrix of size: {}".format(word_vectors.shape))

    if not expected_words:
        # Nothing to rank; avoid dividing by zero in the summary below.
        print("Overall: no analogy question could be evaluated")
        print("\t ... done")
        print("\t Skipped: {}".format(skipped))
        return

    print("Now computing similarities!")

    # One row per vocabulary word, in word_index order, scaled to unit length.
    vocabulary = list(model.word_index.items())
    row_of_word = {word: row for row, (word, _) in enumerate(vocabulary)}
    vocab_matrix = np.array([model.embeddings[key] for _, key in vocabulary], dtype=float)
    norms = np.linalg.norm(vocab_matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave zero vectors as zeros instead of producing NaN
    vocab_matrix /= norms

    correct = defaultdict(list)
    reciprocal_ranks = defaultdict(list)
    n_questions = len(expected_words)
    for index, (task, expected, vector) in enumerate(
            zip(question_tasks, expected_words, word_vectors)):
        if index % 250 == 0:
            print("Similarity : {} of {}".format(index, n_questions))

        # Dot products with unit-length rows equal the cosine similarity times
        # the (constant, non-negative) length of `vector`, so the ranking is
        # the cosine ranking without storing anything per vocabulary word.
        similarities = vocab_matrix.dot(vector)

        # Zero-based rank of the expected word in a stable descending sort:
        # everything strictly more similar, plus earlier words that tie.
        target_row = row_of_word[expected]
        target_similarity = similarities[target_row]
        rank = int(
            np.count_nonzero(similarities > target_similarity)
            + np.count_nonzero(similarities[:target_row] == target_similarity)
        )

        correct[task].append(1 if rank == 0 else 0)
        reciprocal_ranks[task].append(1 / (rank + 1))

    overall_correct = []
    overall_reciprocal_ranks = []
    for task in correct.keys():
        overall_correct.extend(correct[task])
        overall_reciprocal_ranks.extend(reciprocal_ranks[task])

        accuracy = np.sum(correct[task]) / len(correct[task])
        mrr = np.mean(reciprocal_ranks[task])
        print("Task: {} :: Accuracy: {}, MRR: {}".format(task, accuracy, mrr))

    overall_acc = np.sum(overall_correct) / len(overall_correct)
    print("Overall: Accuracy: {}, MRR: {}".format(overall_acc, np.mean(overall_reciprocal_ranks)))

    print("\t ... done")
    print("\t Skipped: {}".format(skipped))

# Run the analogy evaluation for the bow2 embeddings over all loaded questions.
compute_scores(analogy_data, bow2_sim, task_labels)

	Preprocessing: 0 of 19544
	Preprocessing: 250 of 19544
	Preprocessing: 500 of 19544
	Preprocessing: 750 of 19544
	Preprocessing: 1000 of 19544
	Preprocessing: 1250 of 19544
	Preprocessing: 1500 of 19544
	Preprocessing: 1750 of 19544
	Preprocessing: 2000 of 19544
	Preprocessing: 2250 of 19544
	Preprocessing: 2500 of 19544
	Preprocessing: 2750 of 19544
	Preprocessing: 3000 of 19544
	Preprocessing: 3250 of 19544
	Preprocessing: 3500 of 19544
	Preprocessing: 3750 of 19544
	Preprocessing: 4000 of 19544
	Preprocessing: 4250 of 19544
	Preprocessing: 4500 of 19544
	Preprocessing: 4750 of 19544
	Preprocessing: 5000 of 19544
	Preprocessing: 5250 of 19544
	Preprocessing: 5500 of 19544
	Preprocessing: 6000 of 19544
	Preprocessing: 6250 of 19544
	Preprocessing: 6500 of 19544
	Preprocessing: 6750 of 19544
	Preprocessing: 7000 of 19544
	Preprocessing: 7250 of 19544
	Preprocessing: 7500 of 19544
	Preprocessing: 7750 of 19544
	Preprocessing: 8000 of 19544
	Preprocessing: 8250 of 19544
	Preprocessing: 