In [1]:
import os
import time
import random
import pickle
from collections import defaultdict, Counter

import numpy as np

from sklearn.manifold import TSNE, Isomap
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import scipy
%matplotlib inline

from models import *

In [2]:
# Default (width, height) in inches for every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (15, 20)

In [3]:
def read_analogy_data(path):
    """Read an analogy question file laid out like `questions-words.txt`.

    A line starting with ":" opens a new section and names its task. Every
    other non-blank line is one question: it is lower-cased and split on
    whitespace into a list of words.

    Parameters
    ----------
    path : str
        Location of the question file.

    Returns
    -------
    (analogy_data, task_labels) : tuple of two parallel lists
        `analogy_data[i]` is the word list of question i and
        `task_labels[i]` is the name of the section it appeared in.

    Raises
    ------
    ValueError
        If a question appears before the first ":" section header, because
        it could not be assigned to any task.
    """
    analogy_data = []
    task_labels = []
    task = None
    with open(path) as reader:
        for line_number, line in enumerate(reader, start=1):
            if line.startswith(":"):
                task = line.strip().strip(":").strip()
                continue
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise become an empty question and
                # break the 4-way unpacking done by the evaluation code.
                continue
            if task is None:
                raise ValueError(
                    "{}:{}: question found before any ':' task header".format(path, line_number)
                )
            # convert to lower-case
            analogy_data.append(stripped.lower().split())
            task_labels.append(task)
    return analogy_data, task_labels
# Load every analogy question together with the task label of its section.
analogy_data, task_labels = read_analogy_data("./data/questions-words.txt")

In [4]:
# Peek at the first 10 questions and the task label attached to each of them.
analogy_data[:10], task_labels[:10]

([['athens', 'greece', 'baghdad', 'iraq'],
  ['athens', 'greece', 'bangkok', 'thailand'],
  ['athens', 'greece', 'beijing', 'china'],
  ['athens', 'greece', 'berlin', 'germany'],
  ['athens', 'greece', 'bern', 'switzerland'],
  ['athens', 'greece', 'cairo', 'egypt'],
  ['athens', 'greece', 'canberra', 'australia'],
  ['athens', 'greece', 'hanoi', 'vietnam'],
  ['athens', 'greece', 'havana', 'cuba'],
  ['athens', 'greece', 'helsinki', 'finland']],
 ['capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries',
  'capital-common-countries'])

In [5]:
# Number of questions in each task (sub-category).
Counter(task_labels)

Counter({'capital-common-countries': 506,
         'capital-world': 4524,
         'city-in-state': 2467,
         'currency': 866,
         'family': 506,
         'gram1-adjective-to-adverb': 992,
         'gram2-opposite': 812,
         'gram3-comparative': 1332,
         'gram4-superlative': 1122,
         'gram5-present-participle': 1056,
         'gram6-nationality-adjective': 1599,
         'gram7-past-tense': 1560,
         'gram8-plural': 1332,
         'gram9-plural-verbs': 870})

In [28]:
# Load the three embedding models (BOW2, BOW5, DEPS) compared in this notebook.
# NOTE(review): `load_model` is not defined in this notebook; presumably it is
# provided by `from models import *` — confirm.
bow2_sim = load_model("bow2.words")
bow5_sim = load_model("bow5.words")
deps_sim = load_model("deps.words")

In [7]:
def reciprocal_rank(correct_value, results):
    """Return the reciprocal rank of `correct_value` within `results`.

    Parameters
    ----------
    correct_value : object
        The expected answer.
    results : sequence with an ``index`` method (e.g. list)
        Candidate answers, best first.

    Returns
    -------
    ``1 / rank`` where rank is the 1-based position of the first occurrence
    of `correct_value`, or ``0`` when it does not occur in `results`.
    """
    try:
        position = results.index(correct_value)
    except ValueError:
        # Only "value not found" means a rank of zero; any other error
        # (e.g. a wrong argument type) should surface instead of being hidden.
        return 0
    return 1 / (position + 1)

# Sanity checks: the answer sits at rank 3, 2 and 1, so the expected
# reciprocal ranks are 1/3, 1/2 and 1.
print(reciprocal_rank("cats", ["catten", "cati", "cats"]))
print(reciprocal_rank("tori", ["catten", "tori", "cats"]))
print(reciprocal_rank("virus", ["virus", "cati", "cats"]))

0.3333333333333333
0.5
1.0


In [8]:
def compute_wv(model, a, a_star, b):
    """Build the target vector for the analogy ``a : a_star :: b : ?``.

    The offset ``a_star - a`` is added to the vector of ``b``. Returns None
    when any of the three words is absent from ``model.word_index``.
    """
    vocabulary = model.word_index
    if not all(word in vocabulary for word in (a, a_star, b)):
        return None

    # Look the vectors up under new names instead of rebinding the arguments.
    vec_a = model[a]
    vec_a_star = model[a_star]
    vec_b = model[b]
    offset = vec_a_star - vec_a
    return vec_b + offset

In [9]:
def cos_csim(matrix, vector):
    """
    Compute the cosine similarity between each row of matrix and vector.

    Parameters
    ----------
    matrix : array-like of shape (n_rows, dim)
        One candidate vector per row.
    vector : array-like with `dim` elements
        Query vector; it is reshaped to a single row before comparison.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        ``1 - cosine distance`` for every row, i.e. the cosine similarity
        (higher means more similar).
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.spatial` available on every SciPy version.
    from scipy.spatial.distance import cdist

    # np.asarray lets callers pass plain sequences as well as ndarrays.
    v = np.asarray(vector).reshape(1, -1)
    return 1 - cdist(matrix, v, 'cosine').reshape(-1)

In [13]:
def evaluate_model(model, data, labels=None):
    """Score `model` on analogy questions and print accuracy and MRR.

    For every question ``(a, a_star, b, b_star)`` the target vector built by
    `compute_wv` is compared with every row of ``model.embeddings`` using
    `cos_csim`. Vocabulary words are ranked by descending similarity, the three
    question words are removed, and the question counts as correct when the
    top-ranked word equals ``b_star``. Questions with a word missing from
    ``model.word_index`` are skipped.

    Parameters
    ----------
    model : object
        Must expose ``word_index`` (assumed to map each word to its row in
        ``embeddings``), ``embeddings`` and the word lookup used by
        `compute_wv`.
    data : sequence of 4-item sequences
        The analogy questions.
    labels : sequence of str, optional
        Task name of each question, parallel to `data`. Defaults to the
        notebook-level `task_labels`, which keeps existing calls working.

    Returns
    -------
    None. Prints a progress line for evaluated questions whose index is a
    multiple of 100, then the overall accuracy / MRR, the number of skipped
    questions, and one accuracy / MRR line per task.
    """
    if labels is None:
        # Backward compatible default: the labels loaded at notebook level.
        labels = task_labels

    start_time = time.time()
    overall_correct = []
    overall_rr = []
    task_correct = defaultdict(list)
    task_rr = defaultdict(list)
    skipped = 0
    rev_index = {row: word for word, row in model.word_index.items()}
    embeddings = np.array(model.embeddings)
    for index, (tlab, (a, a_star, b, b_star_actual)) in enumerate(zip(labels, data)):
        if b_star_actual not in model.word_index:
            skipped += 1
            continue
        b_star = compute_wv(model, a, a_star, b)

        if b_star is None:
            skipped += 1
            continue
        results_score = cos_csim(embeddings, b_star)
        # Stable descending argsort: same ordering as a stable sort on the
        # negated score (ties keep vocabulary order) without building and
        # sorting one Python tuple per vocabulary word. NaN scores rank last.
        order = np.argsort(-results_score, kind="stable").tolist()
        # exclude the question words themselves
        excluded = {a, a_star, b}
        results = [word for word in (rev_index[row] for row in order) if word not in excluded]

        hit = 1 if results and results[0] == b_star_actual else 0
        # Scan the ranked vocabulary once and reuse the value for both tallies.
        rr = reciprocal_rank(b_star_actual, results)

        overall_correct.append(hit)
        task_correct[tlab].append(hit)
        overall_rr.append(rr)
        task_rr[tlab].append(rr)

        if index % 100 == 0:
            print("{}: {} minutes".format(index, (time.time() - start_time)/60))

    if not overall_correct:
        # Nothing to average: avoid a ZeroDivisionError and say why.
        print("No question could be evaluated ({} skipped)".format(skipped))
        return

    accuracy = sum(overall_correct) / len(overall_correct)
    print("Accuracy: {}, MRR: {}".format(accuracy, np.mean(overall_rr)))
    print("Skipped questions: {}".format(skipped))

    for task_label in np.unique(labels):
        outcomes = task_correct.get(task_label)
        if not outcomes:
            # Every question of this task was skipped (or `data` was shorter
            # than `labels`); there is nothing to average.
            print("Task: {}:: no evaluated questions".format(task_label))
            continue
        accuracy = sum(outcomes) / len(outcomes)
        print("Task: {}:: Accuracy: {}, MRR: {}".format(task_label, accuracy, np.mean(task_rr[task_label])))
        

In [None]:
# Output not kept here: the recorded results are listed at the end of the
# notebook (the runs were executed on different laptops).
evaluate_model(deps_sim, analogy_data)

In [None]:
# Output not kept here: the recorded results are listed at the end of the
# notebook (the runs were executed on different laptops).
evaluate_model(bow5_sim, analogy_data)

In [14]:
# The final results are listed at the end of the notebook (the runs were
# executed on different laptops). The progress log kept below shows roughly
# one minute per 100 questions.
evaluate_model(bow2_sim, analogy_data)

0: 0.02026971975962321 minutes
100: 1.092841883500417 minutes
200: 2.168443016211192 minutes
300: 3.2496140877405804 minutes
400: 4.3660209973653155 minutes
500: 5.441014126936595 minutes
600: 6.513348948955536 minutes
700: 7.589607067902883 minutes
800: 8.66030620733897 minutes
900: 9.731489197413127 minutes
1000: 10.802808781464895 minutes
1100: 11.875128746032715 minutes
1200: 12.951686759789785 minutes
1300: 14.02580695549647 minutes
1400: 15.104666503270467 minutes
1500: 16.176827359199525 minutes
1600: 17.251885414123535 minutes
1700: 18.32343314488729 minutes
1800: 19.398509856065115 minutes
1900: 20.47016796271006 minutes
2000: 21.540792949994405 minutes
2100: 22.615817642211915 minutes
2200: 23.68942538102468 minutes
2300: 24.76369526386261 minutes
2400: 25.836255248387655 minutes
2500: 26.90896447499593 minutes
2600: 27.98992429971695 minutes
2700: 29.071549129486083 minutes
2800: 30.145808299382526 minutes
2900: 31.22102496623993 minutes
3000: 32.2964684844017 minutes
3100: 

#### Results of the analogy task

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{font-family:Arial, sans-serif;font-size:14px;padding:10px 5px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:black;}
.tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:10px 5px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:black;}
.tg .tg-baqh{text-align:center;vertical-align:top}
.tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-7btt{font-weight:bold;border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-amwm{font-weight:bold;text-align:center;vertical-align:top}
</style>
<table class="tg">
  <tr>
    <th class="tg-7btt" rowspan="2">Task</th>
    <th class="tg-7btt" colspan="2">BOW2</th>
    <th class="tg-7btt" colspan="2">BOW5</th>
    <th class="tg-amwm" colspan="2">DEPS</th>
  </tr>
  <tr>
    <td class="tg-7btt">Accuracy</td>
    <td class="tg-7btt">MRR</td>
    <td class="tg-7btt">Accuracy</td>
    <td class="tg-7btt">MRR</td>
    <td class="tg-amwm">Accuracy</td>
    <td class="tg-amwm">MRR</td>
  </tr>
  <tr>
    <td class="tg-c3ow">capital-common-countries</td>
    <td class="tg-c3ow">0.8359</td>
    <td class="tg-c3ow">0.8817</td>
    <td class="tg-7btt">0.9407</td>
    <td class="tg-7btt">0.9639</td>
    <td class="tg-baqh">0.3517</td>
    <td class="tg-baqh">0.4938</td>
  </tr>
  <tr>
    <td class="tg-c3ow">capital-world</td>
    <td class="tg-c3ow">0.63019</td>
    <td class="tg-c3ow">0.7194</td>
    <td class="tg-7btt">0.7029</td>
    <td class="tg-7btt">0.7988</td>
    <td class="tg-baqh">0.1120</td>
    <td class="tg-baqh">0.2034</td>
  </tr>
  <tr>
    <td class="tg-c3ow">city-in-state</td>
    <td class="tg-c3ow">0.39237</td>
    <td class="tg-c3ow">0.4976</td>
    <td class="tg-7btt">0.5127</td>
    <td class="tg-7btt">0.6213</td>
    <td class="tg-baqh">0.1228</td>
    <td class="tg-baqh">0.2208</td>
  </tr>
  <tr>
    <td class="tg-c3ow">currency</td>
    <td class="tg-c3ow">0.1130</td>
    <td class="tg-c3ow">0.1480</td>
    <td class="tg-7btt">0.1222</td>
    <td class="tg-7btt">0.1686</td>
    <td class="tg-baqh">0.0637</td>
    <td class="tg-baqh">0.0958</td>
  </tr>
  <tr>
    <td class="tg-c3ow">family</td>
    <td class="tg-c3ow">0.7944</td>
    <td class="tg-c3ow">0.8538</td>
    <td class="tg-7btt">0.8181</td>
    <td class="tg-7btt">0.8698</td>
    <td class="tg-baqh">0.8162</td>
    <td class="tg-baqh">0.8541</td>
  </tr>
  <tr>
    <td class="tg-c3ow">gram1-adjective-to-adverb</td>
    <td class="tg-c3ow">0.1592</td>
    <td class="tg-c3ow">0.2357</td>
    <td class="tg-7btt">0.1693</td>
    <td class="tg-7btt">0.2721</td>
    <td class="tg-baqh">0.0342</td>
    <td class="tg-baqh">0.0670</td>
  </tr>
  <tr>
    <td class="tg-c3ow">gram2-opposite</td>
    <td class="tg-c3ow">0.3559</td>
    <td class="tg-c3ow">0.4234</td>
    <td class="tg-c3ow">0.3633</td>
    <td class="tg-c3ow">0.4321</td>
    <td class="tg-amwm">0.4002</td>
    <td class="tg-amwm">0.4763</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram3-comparative</td>
    <td class="tg-amwm">0.8956</td>
    <td class="tg-amwm">0.9388</td>
    <td class="tg-baqh">0.8303</td>
    <td class="tg-baqh">0.8916</td>
    <td class="tg-baqh">0.8010</td>
    <td class="tg-baqh">0.8534</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram4-superlative</td>
    <td class="tg-amwm">0.6306</td>
    <td class="tg-amwm">0.7300</td>
    <td class="tg-baqh">0.5710</td>
    <td class="tg-baqh">0.6987</td>
    <td class="tg-baqh">0.5606</td>
    <td class="tg-baqh">0.6372</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram5-present-participle</td>
    <td class="tg-baqh">0.6268</td>
    <td class="tg-baqh">0.7470</td>
    <td class="tg-amwm">0.6704</td>
    <td class="tg-amwm">0.7818</td>
    <td class="tg-baqh">0.6467</td>
    <td class="tg-baqh">0.7402</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram6-nationality-adjective</td>
    <td class="tg-baqh">0.7417</td>
    <td class="tg-baqh">0.8073</td>
    <td class="tg-amwm">0.8236</td>
    <td class="tg-amwm">0.8648</td>
    <td class="tg-baqh">0.1213</td>
    <td class="tg-baqh">0.2198</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram7-past-tense</td>
    <td class="tg-baqh">0.5570</td>
    <td class="tg-amwm">0.6625</td>
    <td class="tg-baqh">0.5467</td>
    <td class="tg-baqh">0.6661</td>
    <td class="tg-baqh">0.6589</td>
    <td class="tg-amwm">0.7319</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram8-plural</td>
    <td class="tg-amwm">0.7327</td>
    <td class="tg-amwm">0.7926</td>
    <td class="tg-baqh">0.6681</td>
    <td class="tg-baqh">0.7522</td>
    <td class="tg-baqh">0.6756</td>
    <td class="tg-baqh">0.7478</td>
  </tr>
  <tr>
    <td class="tg-baqh">gram9-plural-verbs</td>
    <td class="tg-baqh">0.8068</td>
    <td class="tg-baqh">0.8647</td>
    <td class="tg-baqh">0.7356</td>
    <td class="tg-baqh">0.8217</td>
    <td class="tg-amwm">0.9091</td>
    <td class="tg-amwm">0.9447</td>
  </tr>
  <tr>
    <td class="tg-baqh">Overall</td>
    <td class="tg-baqh">0.5928</td>
    <td class="tg-baqh">0.6738</td>
    <td class="tg-baqh">0.6228</td>
    <td class="tg-amwm">0.7111</td>
    <td class="tg-baqh">0.3671</td>
    <td class="tg-baqh">0.4457</td>
  </tr>
</table>

### Discussion of Quantitative Results

In this section we'll analyze the results obtained in the previous section. We make the following observations:

- The best performing model is the BOW5 model, and the worst performing model overall is the DEPS model.
- The BOW5 model is best in 8/14 tasks, and the DEPS and BOW2 models each perform best in 3/14 tasks.
- All models perform very badly on the 'gram1-adjective-to-adverb' and 'currency' sub-tasks, suggesting that these models do not capture the information needed to succeed in these tasks. We hypothesize that the models perform badly on the 'currency' task because of data sparsity problems (we explore this more in the qualitative results section). The DEPS model performs especially badly in this task, suggesting that dependency-based embeddings aren't suited for this type of question.
- We also note that the BOW5 and BOW2 achieve comparable performance on most of the tasks. 
- The largest gap in performance between the BOW and DEPS models is in the 'gram6-nationality-adjective' task (we explore why in the next section).

### Qualitative Analysis

In this section, we'll perform qualitative analysis to support some of the conclusions made in the previous sections. We'll start by randomly sampling 5 questions from each task of the analogy data and then running the 3 models on them to compare their outputs. The analysis follows the sampled output.

In [30]:
# (display name, model) pairs; execute_samples iterates over them in this order.
models = [
    ("BOW2", bow2_sim),
    ("BOW5", bow5_sim),
    ("DEPS", deps_sim)
]

In [31]:
def execute_samples(task, data, task_labels, models, n=5):
    """Print each model's answers for `n` randomly drawn questions of `task`.

    The questions whose label equals `task` form the pool; `n` of them are
    drawn with replacement after seeding `random` with 42, so repeated calls
    print the same questions. For every drawn question and every
    ``(name, model)`` pair in `models`, the top 5 candidate words and the
    reciprocal rank of the expected answer are printed. A model is silently
    passed over when it lacks one of the question's words.
    """
    random.seed(42)

    pool = []
    for sample, label in zip(data, task_labels):
        if label == task:
            pool.append(sample)

    picked = []
    for _ in range(n):
        picked.append(random.choice(pool))

    for question in picked:
        a, a_star, b, b_star_actual = question
        print("\t{}:{}::{}:{}".format(a, a_star, b, b_star_actual))
        for model_name, model in models:
            if b_star_actual in model.word_index:
                b_star = compute_wv(model, a, a_star, b)
                if b_star is not None:
                    ranked = model.most_similar_to_vector(b_star, n = len(model.embeddings))
                    # drop the question words from the candidates
                    question_words = {a, a_star, b}
                    ranked = [word for word in ranked if word not in question_words]
                    print("\t\t{}:: Top 5 results: {}".format(model_name, ranked[:5]))
                    print("\t\t{}:: Reciprocal Rank: {}".format(model_name, reciprocal_rank(b_star_actual, ranked)))
                    print()
        print("\n")
    
# Print the sampled questions and every model's answers for each distinct task.
for task in np.unique(task_labels):
    print("Task: {}".format(task))
    execute_samples(task, analogy_data, task_labels, models)
    print("\n\n\n")

Task: capital-common-countries
	madrid:spain::islamabad:pakistan
		BOW2:: Top 5 results: ['pakistan', 'afghanistan', 'srilanka', 'arabia', 'tajikistan']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['pakistan', 'sindh', 'karachi', 'gilgit-baltistan', 'peshawar']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['bhutan', 'turkmenistan', 'arabia', 'tajikistan', 'fennoscandia']
		DEPS:: Reciprocal Rank: 0.02702702702702703



	bangkok:thailand::oslo:norway
		BOW2:: Top 5 results: ['norway', 'finland', 'sweden', 'trondheim', 'iceland']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['norway', 'sweden', 'stavanger', 'trondheim', 'denmark']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['slovenia', 'finland', 'denmark', 'sweden', 'slovakia']
		DEPS:: Reciprocal Rank: 0.1111111111111111



	athens:greece::london:england
		BOW2:: Top 5 results: ['scandinavia', 'britain', 'germany', 'belgium', 'italy']
		BOW2:: Reciprocal Rank: 0.07692307692307693

		BOW5:: Top

		BOW5:: Top 5 results: ['princess', 'hohenlohe-langenburg', 'duchess', 'bourbon-parma', 'vadhana']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['princess', 'duchess', 'margravine', 'landgravine', 'electress']
		DEPS:: Reciprocal Rank: 1.0



	boy:girl::man:woman
		BOW2:: Top 5 results: ['woman', 'person', 'divorcee', 'seductress', 'villager']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['woman', 'stranger', 'person', 'divorcee', 'villager']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['woman', 'seductress', 'loner', 'shepherdess', 'divorcee']
		DEPS:: Reciprocal Rank: 1.0



	son:daughter::boy:girl
		BOW2:: Top 5 results: ['girl', 'schoolgirl', 'tomboy', 'housemaid', 'waif']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['girl', 'schoolgirl', 'twelve-year-old', 'spunky', '13-year-old']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['girl', 'schoolgirl', 'woman', 'waitress', 'toddler']
		DEPS:: Reciprocal Rank: 1.0



	grandpa:grandma:

		BOW5:: Top 5 results: ['best-known', 'longest', 'greatest', 'shortest', 'finest']
		BOW5:: Reciprocal Rank: 0.25

		DEPS:: Top 5 results: ['shortest', 'longest', 'second-longest', 'busiest', 'slowest']
		DEPS:: Reciprocal Rank: 1.0



	quick:quickest::short:shortest
		BOW2:: Top 5 results: ['shortest', 'longest', 'second-fastest', 'second-longest', 'smoothest']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['shortest', 'longest', 'fastest', 'shorter', 'third-longest']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['shortest', 'slowest', 'second-longest', 'thinnest', 'quietest']
		DEPS:: Reciprocal Rank: 1.0



	lucky:luckiest::small:smallest
		BOW2:: Top 5 results: ['smallish', 'middle-sized', 'tiny', 'largish', 'moderate-sized']
		BOW2:: Reciprocal Rank: 0.005235602094240838

		BOW5:: Top 5 results: ['large', 'tiny', 'middle-sized', 'smallest', 'smaller']
		BOW5:: Reciprocal Rank: 0.25

		DEPS:: Top 5 results: ['smallish', 'moderate-sized', 'good-sized', 'middle-si

		BOW2:: Top 5 results: ['ear', 'eyes', 'warts', 'eyelid', 'non-humans']
		BOW2:: Reciprocal Rank: 0.5

		BOW5:: Top 5 results: ['eyelids', 'eyes', 'eyelid', 'lesions', 'ocular']
		BOW5:: Reciprocal Rank: 0.5

		DEPS:: Top 5 results: ['eyes', 'lungs', 'ear', 'testes', 'testicles']
		DEPS:: Reciprocal Rank: 1.0



	eagle:eagles::bottle:bottles
		BOW2:: Top 5 results: ['bottles', 'steelers', 'corks', 'panthers', 'redskins']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['bottles', 'corks', 'drink', 'fridge', 'cans']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['bottles', 'cans', 'jug', 'pot', 'buckets']
		DEPS:: Reciprocal Rank: 1.0



	donkey:donkeys::cow:cows
		BOW2:: Top 5 results: ['cows', 'chickens', 'mules', 'yaks', 'alpacas']
		BOW2:: Reciprocal Rank: 1.0

		BOW5:: Top 5 results: ['cows', 'sheep', 'goats', 'cattle', 'calves']
		BOW5:: Reciprocal Rank: 1.0

		DEPS:: Top 5 results: ['cows', 'yaks', 'silkworms', 'alpacas', 'ostriches']
		DEPS:: Reciprocal Rank: 1.

In this section, we go over each task and discuss the output of the previous experiment, and try to rationalize the quantitative results obtained in the previous section.
#### Task: capital-common-countries
- BOW2 and BOW5 perform very well in this task. While DEPS does well on the last two examples, its overall performance is lower than the BOW models. We can also argue that it performs well on the 'athens:greece::london:england' question, since the 3rd item is 'Britain'. 
- DEPS's lower performance is probably due to the fact that contexts built using dependencies do not necessarily capture the information required to capture country-capital relations. 

#### Task: capital-world, city-in-state
- Both of these tasks have similar relative performance w.r.t the models, like the previous task. In general, we can expect slightly worse performance, due to the relative sparsity of the data under consideration - the first task has only 'common' countries. 
- DEPS again performs badly for the same reasons described in the previous task.
- Interesting sample: 'miami:florida::chandler:arizona'. All models perform poorly on this example. This may be due to the fact that 'chandler' occurs in more contexts (e.g. as a name) than the previous samples. This is reflected in the top 5 results for all 3 models, since most of them are names of people.

#### Task: currency
- This task has the worst performance among all tasks. This may be due to the nature of the corpus itself. If it is trained on data that doesn't contain as many references to other currencies, e.g. if it is trained on data primarily from the Americas, it would have references to 'dollar' or 'peso', but not 'rupee' or 'yen'.
- Interesting sample: armenia:dram::argentina:peso : With this example all models return computer component names ('sdram', 'ddr2' etc)

#### Task: family
- For this task, all models perform about the same. We can see in the samples that all 3 models perfectly answer 4/5 questions
- The question 'nephew:niece::husband:wife' performs poorly for all 3 models

#### Task: gram1-adjective-to-adverb
- All models perform very poorly on this task, with DEPS performing the worst.
- We can expect the 3 models to perform poorly. This type of information requires lexical knowledge, which isn't available to the models directly. 
- We note that DEPS and BOW2 return words ending with 'ly' in almost all top results, but this isn't the case with the BOW5 model, which is sensible since BOW5 is more 'topical' than BOW2 / DEPS.

#### Task: gram2-opposite
- Interesting sample: efficient:inefficient::fortunate:unfortunate
   - All models return words somewhat related to 'unfortunate', but are unable to properly find the right word.  
- Interesting sample: acceptable:unacceptable::responsible:irresponsible
  - In this sample, we can see that most of the results are words typically used in a 'negative' setting, and somewhat semantically related to irresponsibility. 

#### Task: gram3-comparative
- Excellent results for all 3 models
- Interesting sample: 'high:higher::great:greater'
  - Even though the models have relatively lower RR for this example, all models seem to capture the general meaning. This even brings into question the evaluation metric used - MRR
  
#### Task: gram4-superlative
- All models perform the same, with all of them performing much better on the previous task. 
- Interesting sample: lucky:luckiest::small:smallest
   - DEPS gives a very interesting result. We notice that, as expected, all the words returned by the model are related in the sense that they refer to size. However, except the first result, none of them actually capture the proper meaning. This shows that this model excels at finding related words, but falls short when it comes to computing similar words. This property applies to all other words

#### Task: gram5-present-participle	
- Interesting sample: dance:dancing::say:saying
  - We note that the results of the DEPS model are, again, words that are related, but not exactly the answer. 
  
#### Task: gram6-nationality-adjective
- This task has the largest gap in performance between the DEPS and BOW models. 
- All results from DEPS are related words, maybe sharing the same dependency structures. 
- BOW models have no trouble predicting the correct answers for the frequent nationalities, while DEPS seems to struggle with it.

#### Task: gram7-past-tense
- DEPS outperforms BOW models in this task. This is probably due to past tense words sharing similar dependency structures
- Interesting sample: feeding:fed::writing:wrote
  - BOW models capture the sense, but fail to capture the tense. The DEPS model also performs relatively poorly here, although it returns a misspelling of the word 'written'. 
  
#### Task: gram8-plural, gram9-plural-verbs
- All 3 models perform about the same for the gram8-plural task, while DEPS performs the best for the gram9-plural-verbs task. 

### BOW2
Accuracy: 0.5928964586146017, MRR: 0.6738476055502349 

Task: capital-common-countries:: Accuracy: 0.8359683794466403, MRR: 0.8817258077741364

Task: capital-world:: Accuracy: 0.6301945181255526, MRR: 0.7194812770755393

Task: city-in-state:: Accuracy: 0.3923794081880827, MRR: 0.4976112176150989

Task: currency:: Accuracy: 0.1130030959752322, MRR: 0.14801165297319077

Task: family:: Accuracy: 0.7944664031620553, MRR: 0.8538134449938941

Task: gram1-adjective-to-adverb:: Accuracy: 0.1592741935483871, MRR: 0.2357158319077029

Task: gram2-opposite:: Accuracy: 0.35591133004926107, MRR: 0.42342064938705753

Task: gram3-comparative:: Accuracy: 0.8956456456456456, MRR: 0.9388533621508531

Task: gram4-superlative:: Accuracy: 0.6306818181818182, MRR: 0.7300379939681495

Task: gram5-present-participle:: Accuracy: 0.6268939393939394, MRR: 0.7470184960322382

Task: gram6-nationality-adjective:: Accuracy: 0.7417135709818636, MRR: 0.8073374035779143

Task: gram7-past-tense:: Accuracy: 0.5570512820512821, MRR: 0.6625581806410229

Task: gram8-plural:: Accuracy: 0.7327327327327328, MRR: 0.7926475754522675

Task: gram9-plural-verbs:: Accuracy: 0.8068965517241379, MRR: 0.8647319328912093

### BOW5 
Accuracy: 0.6228061065531207, MRR: 0.7111230374484191

Task: capital-common-countries:: Accuracy: 0.9407114624505929, MRR: 0.9639281008846227

Task: capital-world:: Accuracy: 0.7029177718832891, MRR: 0.7988549579564563

Task: city-in-state:: Accuracy: 0.5127685447912445, MRR: 0.6213078533111366

Task: currency:: Accuracy: 0.12229102167182662, MRR: 0.16867354008564173

Task: family:: Accuracy: 0.8181818181818182, MRR: 0.8698343729395179

Task: gram1-adjective-to-adverb:: Accuracy: 0.1693548387096774, MRR: 0.27214939923316167

Task: gram2-opposite:: Accuracy: 0.3633004926108374, MRR: 0.43216434825773664

Task: gram3-comparative:: Accuracy: 0.8303303303303303, MRR: 0.8916557740521935

Task: gram4-superlative:: Accuracy: 0.5710227272727273, MRR: 0.6987903418881215

Task: gram5-present-participle:: Accuracy: 0.6704545454545454, MRR: 0.7818590311557159

Task: gram6-nationality-adjective:: Accuracy: 0.8236397748592871, MRR: 0.8648149066187605

Task: gram7-past-tense:: Accuracy: 0.5467948717948717, MRR: 0.6661794735807756

Task: gram8-plural:: Accuracy: 0.6681681681681682, MRR: 0.7522690196093208

Task: gram9-plural-verbs:: Accuracy: 0.735632183908046, MRR: 0.821760919807262

### Deps

Accuracy: 0.36719075385256145, MRR: 0.4457027196317186

Task: capital-common-countries:: Accuracy: 0.35177865612648224, MRR: 0.49386809562252204

Task: capital-world:: Accuracy: 0.11206896551724138, MRR: 0.2034998414570093

Task: city-in-state:: Accuracy: 0.12282124037292258, MRR: 0.22087180510353727

Task: currency:: Accuracy: 0.06375838926174497, MRR: 0.09589387260282986

Task: family:: Accuracy: 0.8162055335968379, MRR: 0.8541289814671902

Task: gram1-adjective-to-adverb:: Accuracy: 0.034274193548387094, MRR: 0.06706805210731484

Task: gram2-opposite:: Accuracy: 0.4002463054187192, MRR: 0.47634700311122624

Task: gram3-comparative:: Accuracy: 0.801051051051051, MRR: 0.8534634128403255

Task: gram4-superlative:: Accuracy: 0.5606060606060606, MRR: 0.6372342387939611

Task: gram5-present-participle:: Accuracy: 0.646780303030303, MRR: 0.7402042631095018

Task: gram6-nationality-adjective:: Accuracy: 0.12132582864290181, MRR: 0.21989590316716345

Task: gram7-past-tense:: Accuracy: 0.658974358974359, MRR: 0.7319858707885283

Task: gram8-plural:: Accuracy: 0.6756756756756757, MRR: 0.7478410747404471

Task: gram9-plural-verbs:: Accuracy: 0.9091954022988505, MRR: 0.9447645130880642