In [1]:
import os

import numpy as np
from annoy import AnnoyIndex

In [2]:
class PreTrainedEmbeddings(object):
    """Pre-trained word vectors paired with an Annoy index for neighbor lookup."""

    def __init__(self, word_to_index, word_vectors):
        """
        Args:
            word_to_index (dict): mapping from word to integer index
            word_vectors (list of numpy arrays): word_vectors[i] is the
                vector of the word whose index is i
        Raises:
            ValueError: if word_vectors is empty (the vector size, and
                therefore the index, cannot be determined)
        """
        if len(word_vectors) == 0:
            raise ValueError("word_vectors must contain at least one vector")

        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        # The index dimensionality is taken from the first vector.
        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')

        for i in self.word_to_index.values():
            self.index.add_item(i, self.word_vectors[i])
        # 50 is the number of trees passed to Annoy's build().
        self.index.build(50)

    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Instantiate from a pretrained vector file

        The vector file should have one word per line, space separated:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N

        Blank lines are skipped. If a word appears more than once, only its
        first occurrence is kept, so that indices stay contiguous and
        aligned with the list of vectors.

        Args:
            embedding_file (str): location of the file
        Returns:
            instance of PreTrainedEmbeddings
        """
        word_to_index = {}
        word_vectors = []
        # Iterate over the file lazily instead of reading every line into
        # memory first; embedding files can be very large.
        with open(embedding_file, encoding="utf-8") as fp:
            for line in fp:
                # Drop the newline and trailing blanks so the last field
                # parses cleanly and empty lines can be detected.
                line = line.rstrip()
                if not line:
                    continue

                fields = line.split(" ")
                word = fields[0]
                if word in word_to_index:
                    # Keep the first occurrence; re-adding would break the
                    # one-to-one word <-> vector alignment.
                    continue

                word_to_index[word] = len(word_vectors)
                word_vectors.append(np.array([float(x) for x in fields[1:]]))

        # Build the instance after the file has been closed.
        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        '''
        Args:
            word (str)
        Returns:
            an embedding (numpy.ndarray)
        Raises:
            KeyError: if word is not in the vocabulary
        '''
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        '''Given a vector, return its n nearest neighbors
        Args:
            vector (np.ndarray): should match the size of the vectors
                in the annoy index.
            n (int): number of neighbors to request from the index
        Returns:
            [str, str, ...]: words nearest to the given vector, in the
                order the annoy index returns them
        '''
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        '''Prints the solutions to analogies using word embeddings

        Analogies are: word1 is to word2 as word3 is to ____
        This method will print one line per candidate:
            word1 : word2 :: word3 : word4

        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        '''
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # Simple hypothesis: Analogy is a spatial relationship
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        # Ask for 4 neighbors: up to 3 of them may be the input words,
        # which are removed below.
        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = {word1, word2, word3}
        closest_words = [word for word in closest_words
                         if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3,
                                              word4))

In [3]:
# Location of the GloVe vectors. Set the GLOVE_PATH environment variable to
# point elsewhere instead of editing this machine-specific default.
GLOVE_PATH = os.environ.get(
    'GLOVE_PATH',
    '/Users/thomassullivan/projects/GitHub/PyTorchNLPBook/chapters/chapter_5/data/glove/glove.6B.100d.txt',
)
embeddings = PreTrainedEmbeddings.from_embeddings_file(GLOVE_PATH)

In [4]:
# Relationship 1: gendered nouns and pronouns
embeddings.compute_and_print_analogy(word1='man', word2='he', word3='woman')

man : he :: woman : she
man : he :: woman : never
man : he :: woman : her


In [6]:
# Relationship 2: verb-noun relationships
embeddings.compute_and_print_analogy(word1='fly', word2='plane', word3='sail')

fly : plane :: sail : ship
fly : plane :: sail : vessel


In [7]:
# Relationship 3: noun-noun relationships
embeddings.compute_and_print_analogy(word1='cat', word2='kitten', word3='dog')

cat : kitten :: dog : puppy
cat : kitten :: dog : puppies
cat : kitten :: dog : toddler


In [8]:
# Relationship 4: hypernymy (broader category)
embeddings.compute_and_print_analogy(word1='blue', word2='color', word3='dog')

blue : color :: dog : pig
blue : color :: dog : typical
blue : color :: dog : adult
blue : color :: dog : and/or


In [9]:
# Relationship 5: meronymy (part to whole)
embeddings.compute_and_print_analogy(word1='toe', word2='foot', word3='finger')

toe : foot :: finger : hand
toe : foot :: finger : kept
toe : foot :: finger : ground


In [11]:
# Relationship 6: troponymy (difference in manner)
embeddings.compute_and_print_analogy(word1='talk', word2='communicate', word3='read')

talk : communicate :: read : correctly
talk : communicate :: read : instructions


In [12]:
# Relationship 7: metonymy (figures of speech)
embeddings.compute_and_print_analogy(word1='blue', word2='democrat', word3='red')

blue : democrat :: red : republican
blue : democrat :: red : congressman
blue : democrat :: red : senator


In [22]:
# Additional analogy: judge is to opinion as legislature is to ____
embeddings.compute_and_print_analogy('judge', 'opinion', 'legislature')

judge : opinion :: legislature : legislative
judge : opinion :: legislature : legislators
judge : opinion :: legislature : conservatives


In [23]:
# Relationship 8: adjectival scales
embeddings.compute_and_print_analogy(word1='fast', word2='fastest', word3='young')

fast : fastest :: young : youngest
fast : fastest :: young : eighth
fast : fastest :: young : qualified
fast : fastest :: young : ranked


In [24]:
# Same fast -> fastest pattern as the previous cell, applied to 'small'
embeddings.compute_and_print_analogy(word1='fast', word2='fastest', word3='small')

fast : fastest :: small : ten
fast : fastest :: small : registered
fast : fastest :: small : eight


In [25]:
# Protected characteristics: man is to doctor as woman is to ____
embeddings.compute_and_print_analogy(word1='man', word2='doctor', word3='woman')

man : doctor :: woman : nurse
man : doctor :: woman : physician
man : doctor :: woman : pregnant
