In [1]:
import os
import random
import pickle
from collections import defaultdict, Counter

import numpy as np

from sklearn.manifold import TSNE, Isomap
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import scipy
%matplotlib inline

from models import *

In [2]:
# Default size for every matplotlib figure in this notebook: (width, height) in inches.
plt.rcParams["figure.figsize"] = (15, 20)

In [3]:
def read_analogy_data(path):
    """
    Read a word-analogy question file.

    Lines that start with ":" open a new task section; the text after the colon
    becomes the label of every question that follows. Every other non-blank
    line is lower-cased and split on whitespace into one question.

    Parameters
    ----------
    path : str
        Location of the question file.

    Returns
    -------
    (analogy_data, task_labels) : tuple of two parallel lists
        ``analogy_data[i]`` is the token list of question ``i`` and
        ``task_labels[i]`` is the label of the section it appeared in.
        Questions found before the first ":" header get the label
        ``"unlabeled"``.
    """
    analogy_data = []
    task_labels = []
    # Defined up front so a question that precedes any ":" header does not
    # raise UnboundLocalError. A string (not None) keeps the labels sortable.
    task = "unlabeled"
    with open(path, encoding="utf-8") as reader:
        for line in reader:
            if line.startswith(":"):
                task = line.strip().strip(":").strip()
                continue
            # convert to lower-case
            tokens = line.strip().lower().split()
            if not tokens:
                # Blank line: storing [] would break callers that unpack
                # each question into four words.
                continue
            analogy_data.append(tokens)
            task_labels.append(task)
    return analogy_data, task_labels
# Load the analogy questions and their per-question task labels (parallel lists).
analogy_data, task_labels = read_analogy_data("./data/questions-words.txt")

In [4]:
# Preview the first ten questions together with their task (sub-category) labels.
analogy_data[:10], task_labels[:10]

([['athens', 'greece', 'baghdad', 'iraq'],
  ['athens', 'greece', 'bangkok', 'thailand'],
  ['athens', 'greece', 'beijing', 'china'],
  ['athens', 'greece', 'berlin', 'germany'],
  ['athens', 'greece', 'bern', 'switzerland'],
  ['athens', 'greece', 'cairo', 'egypt'],
  ['athens', 'greece', 'canberra', 'australia'],
  ['athens', 'greece', 'hanoi', 'vietnam'],
  ['athens', 'greece', 'havana', 'cuba'],
  ['athens', 'greece', 'helsinki', 'finland']],
 ['capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries'])

In [5]:
# Number of questions per task label.
Counter(task_labels)

Counter({'capital-common-countries': 506,
         'capital-world': 4524,
         'city-in-state': 2467,
         'currency': 866,
         'family': 506,
         'gram1-adjective-to-adverb': 992,
         'gram2-opposite': 812,
         'gram3-comparative': 1332,
         'gram4-superlative': 1122,
         'gram5-present-participle': 1056,
         'gram6-nationality-adjective': 1599,
         'gram7-past-tense': 1560,
         'gram8-plural': 1332,
         'gram9-plural-verbs': 870})

In [6]:
# Load the "bow2.words" model. `load_model` is not defined in this notebook, so it
# presumably comes from the `from models import *` import above -- TODO confirm.
bow2_sim = load_model("bow2.words")
# The two other models are left disabled, so only bow2 is available below.
#bow5_sim = load_model("bow5.words")
#deps_sim = load_model("deps.words")

# models = {
#     "bow2": bow2_sim,
#     "bow5": bow5_sim,
#     "deps": deps_sim
# }

In [7]:
def reciprocal_rank(correct_value, results):
    """
    Return the reciprocal rank of ``correct_value`` inside ``results``.

    Parameters
    ----------
    correct_value : object
        The expected answer.
    results : list
        Candidate answers ordered from best to worst.

    Returns
    -------
    ``1 / rank`` where ``rank`` is the 1-based position of the first occurrence
    of ``correct_value``, or ``0`` when it does not occur at all.
    """
    try:
        position = results.index(correct_value)
    except ValueError:
        # Only "not in the list" means a score of 0; any other error (for
        # example ``results`` not being a list) should surface, not be hidden.
        return 0
    return 1 / (position + 1)

# Sanity checks: the correct answer sits at position 3, 2 and 1 of the ranking,
# so the expected reciprocal ranks are 1/3, 1/2 and 1.
print(reciprocal_rank("cats", ["catten", "cati", "cats"]))
print(reciprocal_rank("tori", ["catten", "tori", "cats"]))
print(reciprocal_rank("virus", ["virus", "cati", "cats"]))

0.3333333333333333
0.5
1.0


In [8]:
def compute_wv(model, a, a_star, b):
    """
    Build the predicted vector for the analogy ``a : a_star :: b : ?``.

    The offset between ``a_star`` and ``a`` is added to the vector of ``b``.
    Returns ``None`` when any of the three words is absent from
    ``model.word_index``.
    """
    query_words = (a, a_star, b)
    if not all(word in model.word_index for word in query_words):
        return None
    vec_a, vec_a_star, vec_b = (model[word] for word in query_words)
    offset = vec_a_star - vec_a
    return vec_b + offset

In [None]:
def cos_csim(matrix, vector):
    """
    Compute the cosine similarity between each row of ``matrix`` and ``vector``.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, dim)
        One vector per row.
    vector : array-like, shape (dim,)
        The query vector; it is reshaped to a single row.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        ``1 - cosine distance`` for every row, i.e. higher means more similar.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not expose
    # ``scipy.spatial`` on older SciPy releases.
    from scipy.spatial.distance import cdist

    v = np.asarray(vector).reshape(1, -1)
    return 1 - cdist(matrix, v, 'cosine').reshape(-1)

In [43]:
import time

In [None]:
def evaluate_model(model, data, task=None):
    """
    Evaluate ``model`` on word-analogy questions of the form ``a : a_star :: b : ?``.

    For every question the predicted vector comes from ``compute_wv``. All rows
    of ``model.embeddings`` are ranked by cosine similarity to it (``cos_csim``)
    and the three query words are removed from the ranking. Prints the overall
    accuracy (expected word ranked first) and mean reciprocal rank, then the
    same two numbers for each task that had at least one evaluated question.

    Parameters
    ----------
    model : object
        Must expose ``word_index`` (mapping word -> index), ``embeddings`` and
        ``model[word]`` lookup. Assumes the ``word_index`` values are the row
        positions in ``embeddings`` -- TODO confirm against ``models``.
    data : sequence of 4-item sequences
        Questions ``(a, a_star, b, b_star)``, aligned element-wise with the
        notebook-level ``task_labels`` list.
    task : str, optional
        When given, only questions labelled ``task`` are evaluated. The default
        ``None`` evaluates every question.

    Returns
    -------
    None. All results are printed.
    """
    start_time = time.time()
    overall_correct = []
    overall_rr = []
    task_correct = defaultdict(list)
    task_rr = defaultdict(list)
    skipped = 0
    # Row position -> word, used to turn ranked row indices back into words.
    rev_index = {idx: word for word, idx in model.word_index.items()}
    embeddings = np.array(model.embeddings)

    for index, (tlab, (a, a_star, b, b_star_actual)) in enumerate(zip(task_labels, data)):
        if task is not None and tlab != task:
            continue
        if b_star_actual not in model.word_index:
            skipped += 1
            continue
        b_star = compute_wv(model, a, a_star, b)
        if b_star is None:
            skipped += 1
            continue

        scores = cos_csim(embeddings, b_star)
        # A stable argsort of the negated scores gives the same order as a
        # stable descending sort: best first, ties in row order.
        order = np.argsort(-scores, kind="stable").tolist()
        # exclude the query words themselves from the candidates
        excluded = {a, a_star, b}
        ranked_words = (rev_index[row] for row in order)
        results = [word for word in ranked_words if word not in excluded]

        hit = 1 if results and results[0] == b_star_actual else 0
        rr = reciprocal_rank(b_star_actual, results)
        overall_correct.append(hit)
        task_correct[tlab].append(hit)
        overall_rr.append(rr)
        task_rr[tlab].append(rr)

        if index % 100 == 0:
            print("{}: {} minutes".format(index, (time.time() - start_time)/60))

    if skipped:
        print("Skipped {} questions with out-of-vocabulary words".format(skipped))
    if not overall_correct:
        # Nothing evaluated (empty data, unknown task, or everything skipped):
        # report it instead of dividing by zero.
        print("No questions were evaluated")
        return

    accuracy = sum(overall_correct) / len(overall_correct)
    print("Accuracy: {}, MRR: {}".format(accuracy, np.mean(overall_rr)))

    # Only tasks with at least one evaluated question have an entry here.
    for task_label in sorted(task_correct):
        accuracy = sum(task_correct[task_label]) / len(task_correct[task_label])
        print("Task: {}:: Accuracy: {}, MRR: {}".format(task_label, accuracy, np.mean(task_rr[task_label])))
        
# Evaluate the bow2 model on every analogy question.
# NOTE: slow -- the progress log below shows roughly 0.66 minutes per 100 questions.
evaluate_model(bow2_sim, analogy_data)

0: 0.012606791655222575 minutes
100: 0.7654893676439921 minutes
200: 1.4475637594858806 minutes
300: 2.1128949761390685 minutes
400: 2.7743141969045 minutes
500: 3.4352715333302815 minutes
600: 4.0956223169962565 minutes
700: 4.756393575668335 minutes
800: 5.415835746129354 minutes
900: 6.081057441234589 minutes
1000: 6.745235359668731 minutes
1100: 7.406493008136749 minutes
1200: 8.068427471319835 minutes
1300: 8.728729943434397 minutes
1400: 9.38922268152237 minutes
1500: 10.048403346538544 minutes
1600: 10.710478190581004 minutes
1700: 11.370628583431245 minutes
1800: 12.031337201595306 minutes
1900: 12.691721157232921 minutes
2000: 13.352600522836049 minutes
2100: 14.0135138352712 minutes
2200: 14.673587187131245 minutes
2300: 15.333711850643159 minutes
2400: 15.992825456460317 minutes
2500: 16.651005025704702 minutes
2600: 17.309117535750072 minutes
2700: 17.97066947221756 minutes
2800: 18.630885406335196 minutes
2900: 19.29002690712611 minutes
3000: 19.950294280052184 minutes
310

In [34]:
# Restrict the evaluation to the "family" task.
evaluate_model(bow2_sim, analogy_data, "family")

8400
8500
8600
8700
8800
Accuracy: 0.7944664031620553, MRR: 0.8538134449938941


In [35]:
# Restrict the evaluation to the "currency" task.
evaluate_model(bow2_sim, analogy_data, "currency")

5100
5200
5300
5400
5600
5700
Accuracy: 0.1130030959752322, MRR: 0.14801165297319077


In [36]:
# Restrict the evaluation to the "gram9-plural-verbs" task.
evaluate_model(bow2_sim, analogy_data, "gram9-plural-verbs")

18700
18800
18900
19000
19100
19200
19300
19400
19500
Accuracy: 0.8068965517241379, MRR: 0.8647319328912093


In [37]:
# Restrict the evaluation to the "gram2-opposite" task.
evaluate_model(bow2_sim, analogy_data, "gram2-opposite")

9900
10000
10100
10200
10300
10400
10500
10600
Accuracy: 0.35591133004926107, MRR: 0.42342064938705753


In [38]:
# Restrict the evaluation to the "gram1-adjective-to-adverb" task.
evaluate_model(bow2_sim, analogy_data, "gram1-adjective-to-adverb")

8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
Accuracy: 0.1592741935483871, MRR: 0.2357158319077029
