In [None]:
# Install Annoy into the kernel's own environment (%pip, unlike !pip, targets
# the running kernel). Pinned to the version this notebook was recorded with.
%pip install annoy==1.16.3

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/00/15/5a9db225ebda93a235aebd5e42bbf83ab7035e7e4783c6cb528c635c9afb/annoy-1.16.3.tar.gz (644kB)
[K     |▌                               | 10kB 6.5MB/s eta 0:00:01[K     |█                               | 20kB 1.4MB/s eta 0:00:01[K     |█▌                              | 30kB 2.1MB/s eta 0:00:01[K     |██                              | 40kB 1.5MB/s eta 0:00:01[K     |██▌                             | 51kB 1.9MB/s eta 0:00:01[K     |███                             | 61kB 2.2MB/s eta 0:00:01[K     |███▋                            | 71kB 2.6MB/s eta 0:00:01[K     |████                            | 81kB 3.0MB/s eta 0:00:01[K     |████▋                           | 92kB 2.3MB/s eta 0:00:01[K     |█████                           | 102kB 2.6MB/s eta 0:00:01[K     |█████▋                          | 112kB 2.6MB/s eta 0:00:01[K     |██████                          | 122kB 2.6MB/s eta 0:00:01[K     

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm
from annoy import AnnoyIndex
import numpy as np

In [None]:
class PreTrainedEmbeddings(object):
    """A wrapper around pre-trained word vectors and a nearest-neighbour index.

    Instances expose:
        word_to_index (dict): word -> integer row id
        word_vectors (list of numpy arrays): row id -> vector
        index_to_word (dict): inverse of ``word_to_index``
        index (AnnoyIndex): Annoy index holding one item per word
    """
    def __init__(self, word_to_index, word_vectors):
        """Store the vocabulary and build the Annoy index (prints progress).

        Args:
            word_to_index (dict): mapping from word to integers
            word_vectors (list of numpy arrays): ``word_vectors[i]`` is the
                vector of the word mapped to ``i``; must be non-empty because
                the index dimensionality is taken from the first vector
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}

        # Annoy's documented metric names are 'angular', 'euclidean',
        # 'manhattan', 'hamming' and 'dot'; 'angular' is the cosine-based one.
        self.index = AnnoyIndex(len(word_vectors[0]), metric='angular')
        print("Building Index!")
        for i in self.word_to_index.values():
            self.index.add_item(i, self.word_vectors[i])
        # 50 is the number of trees passed to Annoy's build().
        self.index.build(50)
        print("Finished!")

    @classmethod
    def from_embeddings_file(cls, embedding_file, encoding='utf-8'):
        """Instantiate from pre-trained vector file.

        Vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N

        Blank lines are skipped. If a word appears more than once, only its
        first occurrence is kept so that ``word_to_index`` and the list of
        vectors stay aligned.

        Args:
            embedding_file (str): location of the file
            encoding (str): text encoding used to read the file
        Returns:
            instance of PreTrainedEmbeddings
        Raises:
            ValueError: if a vector component cannot be parsed as a float
        """
        word_to_index = {}
        word_vectors = []

        with open(embedding_file, encoding=encoding) as fp:
            # Stream the file line by line instead of loading it all at once.
            for line in fp:
                line = line.rstrip()
                if not line:
                    continue
                # Split on the ASCII space only, so tokens that contain other
                # whitespace characters are kept intact.
                word, *values = line.split(" ")
                if word in word_to_index:
                    continue

                # Row id is the position the vector is about to occupy.
                word_to_index[word] = len(word_vectors)
                word_vectors.append(np.array([float(x) for x in values]))

        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        """
        Args:
            word (str)
        Returns
            an embedding (numpy.ndarray)
        Raises:
            KeyError: if ``word`` is not in the vocabulary
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        """Given a vector, return its n nearest neighbors

        Args:
            vector (np.ndarray): should match the size of the vectors
                in the Annoy index
            n (int): the number of neighbors to return
        Returns:
            [str, str, ...]: words that are nearest to the given vector,
                in the order Annoy returns them.
                NOTE(review): Annoy appears to return neighbours
                nearest-first; confirm before relying on the order.
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        """Prints the solutions to analogies using word embeddings

        Analogies are word1 is to word2 as word3 is to __
        This method will print: word1 : word2 :: word3 : word4

        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        Raises:
            KeyError: if any of the three words is not in the vocabulary
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # now compute the fourth word's embedding!
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        # Ask for 4 neighbours, then drop any that are just the input words.
        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = {word1, word2, word3}
        closest_words = [word for word in closest_words
                         if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the computed vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))

In [None]:
# Download the GloVe 6B archive. -nc (no-clobber) skips the download when
# glove.6B.zip is already present, so re-running the notebook does not fetch
# a second copy.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2020-02-01 19:16:24--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-02-01 19:16:24--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-02-01 19:16:25--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [None]:
import zipfile

# Unpack the downloaded archive into data/. The context manager closes the
# archive even if extraction raises.
with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

In [None]:
# List the contents of data/, the directory the archive was extracted into.
!ls -l data

total 2197148
-rw-r--r-- 1 root root  347116733 Feb  1 19:25 glove.6B.100d.txt
-rw-r--r-- 1 root root  693432828 Feb  1 19:25 glove.6B.200d.txt
-rw-r--r-- 1 root root 1037962819 Feb  1 19:25 glove.6B.300d.txt
-rw-r--r-- 1 root root  171350079 Feb  1 19:25 glove.6B.50d.txt


In [None]:
# Parse data/glove.6B.100d.txt and construct the wrapper; construction also
# builds the Annoy index and prints "Building Index!" / "Finished!".
embeddings = PreTrainedEmbeddings.from_embeddings_file('data/glove.6B.100d.txt')

Building Index!
Finished!


In [None]:
# Show the stored vector for one word to check that the lookup works.
print(embeddings.get_embedding('chicken'))

[-0.31941    0.64352    0.061722  -0.23465   -0.46668    0.45944
  0.80966    0.26575    0.17443   -0.2897    -0.77198    0.29437
  1.1188     0.54886   -0.23227    0.62681   -0.1981    -0.39673
  0.075107   0.13992    0.30519    0.88384   -0.032432  -0.98251
  0.61574    1.6974     0.1439    -0.18219   -0.5754     0.51227
 -0.043756   0.90425    0.54989   -0.27782   -0.038291   0.86877
  0.027351  -0.062064  -0.11542   -1.1948     0.91219   -1.3764
 -0.60074   -1.239      0.71743    0.0060215 -1.2784    -0.60365
  0.087471  -0.93287   -0.38174    0.15325   -0.029534   0.5951
 -1.3351    -0.85247   -0.25387    0.15488    0.63603    0.46029
  0.11268    0.73124    0.79237    0.64031    0.87218   -0.14922
 -0.37287   -0.089947  -0.30832    0.14441   -0.21685    0.43613
  0.27235    1.1278     0.27427    0.55706   -0.9089     0.28802
  0.42004    0.9972     0.69903   -0.37304   -0.44695    0.70073
 -0.47789   -0.30684   -0.17773    0.70475    0.018582   0.20878
  0.16036    0.17889   -0.3

In [None]:
# man is to king as woman is to ...?
embeddings.compute_and_print_analogy('man', 'king', 'woman')

man : king :: woman : queen
man : king :: woman : monarch
man : king :: woman : throne


In [None]:
# fly is to plane as sail is to ...?
embeddings.compute_and_print_analogy('fly', 'plane', 'sail')

fly : plane :: sail : ship
fly : plane :: sail : vessel


In [None]:
# cat is to kitten as dog is to ...?
embeddings.compute_and_print_analogy('cat', 'kitten', 'dog')

cat : kitten :: dog : puppy
cat : kitten :: dog : toddler
cat : kitten :: dog : sleds


In [None]:
# sky is to blue as tree is to ...?
embeddings.compute_and_print_analogy('sky', 'blue', 'tree')

sky : blue :: tree : leaf
sky : blue :: tree : green
sky : blue :: tree : trees


In [None]:
# leg is to legs as hand is to ...?
embeddings.compute_and_print_analogy('leg', 'legs', 'hand')

leg : legs :: hand : fingers
leg : legs :: hand : ears
leg : legs :: hand : stick


In [None]:
# toe is to foot as finger is to ...?
embeddings.compute_and_print_analogy('toe', 'foot', 'finger')

toe : foot :: finger : hand
toe : foot :: finger : attached
toe : foot :: finger : apart


In [None]:
# talk is to communicate as read is to ...?
embeddings.compute_and_print_analogy('talk', 'communicate', 'read')

talk : communicate :: read : correctly
talk : communicate :: read : instructions


In [None]:
# blue is to democrat as red is to ...?
embeddings.compute_and_print_analogy('blue', 'democrat', 'red')

blue : democrat :: red : republican
blue : democrat :: red : congressman
blue : democrat :: red : senator


In [None]:
# Same man/king/woman query as an earlier cell. The recorded output here lists
# different runner-up words than that earlier run.
# NOTE(review): presumably the two outputs come from different index builds;
# confirm, or drop this duplicate cell.
embeddings.compute_and_print_analogy('man', 'king', 'woman')

man : king :: woman : queen
man : king :: woman : throne
man : king :: woman : elizabeth


In [None]:
# man is to doctor as woman is to ...?
embeddings.compute_and_print_analogy('man', 'doctor', 'woman')

man : doctor :: woman : nurse
man : doctor :: woman : physician
man : doctor :: woman : pregnant


In [None]:
# fast is to fastest as small is to ...?
embeddings.compute_and_print_analogy('fast', 'fastest', 'small')

fast : fastest :: small : ten
fast : fastest :: small : registered
fast : fastest :: small : eight
