In [1]:
import csv
import math
import pickle
import random
import sys
from collections import Counter
import numpy as np

# Hhelpful for computing cosine similarity
from scipy.spatial.distance import cosine

# This will make things go fast when we finally use it
from numba import jit

# Handy command-line argument parsing
import argparse

# Progress bar tracker
from tqdm import tqdm

# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer

# We'll use this to save our models
from gensim.models import KeyedVectors

random.seed(1234)
np.random.seed(1234)

In [2]:
class word2vec:
    def __init__(self, hidden_layer_size=50):

        self.hidden_layer_size = hidden_layer_size
        self.tokenizer = RegexpTokenizer(r'\w+')
        
        # These state variables become populated as the main() function calls
        #
        # 1. load_data()
        # 2. generate_negative_sampling_table()
        # 3. init_weights()
        #
        # See those functions for how the various values get filled in

        self.word_to_index = {} # word to unique-id
        self.index_to_word = [] # unique-id to word

        # How many times each word occurs in our data after filtering
        self.word_counts = Counter()

        # A utility data structure that lets us quickly sample "negative"
        # instances in a context. This table contains unique-ids
        self.negative_sampling_table = []
        
        # The dataset we'll use for training, as a sequence of unqiue word
        # ids. This is the sequence across all documents after tokens have been
        # randomly subsampled by the word2vec preprocessing step
        self.full_token_sequence_as_ids = []

        # These will contain the two weight matrices. W is the embeddings for
        # the center/target word and C are the embeddings for the context
        # words. You might see these called (W, V) or (W1, W2) in various
        # documentation too. These get initalized later in init_weights() once
        # we know the vocabulary size
        self.W = None
        self.C = None
        
    def tokenize(self, text):
        '''
        Tokenize the document and returns a list of the tokens
        '''
        return self.tokenizer.tokenize(text)

        
    def load_data(self, file_name, min_token_freq):
        '''
        Reads the data from the specified file as long long sequence of text
        (ignoring line breaks) and populates the data structures of this
        word2vec object.
        '''

        # Step 1: Read in the file and create a long sequence of tokens
        with open(file_name,"r") as f:   
            original_sequence= f.read()  
        tokens=self.tokenize(original_sequence)
        # Step 2: Count how many tokens we have of each type
        print('Counting token frequencies')
        original_frequency=Counter(tokens)
        

        # Step 3: Replace all tokens below the specified frequency with an <UNK>
        # token
        print("Performing minimum thresholding")
        for i in range(len(tokens)):
            if original_frequency[tokens[i]]<min_token_freq:
                tokens[i]='<UNK>'

        # Step 4: update self.word_counts to be the number of times each word
        # occurs (including <UNK>)
        self.word_counts=Counter(tokens)

        # Step 5: Create the mappings from word to unique integer ID and the
        # reverse mapping.
        #
        # HINT: the id-to-word mapping is easily represented as a list data
        # structure
    
        all_words=list(self.word_counts.keys())
        for i in range(len(all_words)):
            self.word_to_index[all_words[i]]=i
        self.index_to_word=all_words
        
        # Step 6: Compute the probability of keeping any particular token of a
        # word in the training sequence, which we'll use to subsample. This
        # avoids having the training data be filled with many overly common words
        probability={}
        for i in all_words:
            temp_p=self.word_counts[i]/len(tokens)
            #print(temp_p)
            probability[i]=(np.sqrt(temp_p/0.001)+1)*0.001/temp_p
        #print(probability)
        # Step 7: process the list of tokens (after min-freq filtering) to fill
        # a new list self.full_token_sequence_as_ids where (1) we
        # probabilistically choose whether to keep each token based on the
        # subsampling probabilities and (2) all tokens are convered to their
        # unique ids for faster training.
        for i in range(len(tokens)):
            self.full_token_sequence_as_ids.append(self.word_to_index[tokens[i]])
        print(len(tokens))
        # Transform the original input into a sequence of IDs while also
        # performing token-based subsampling based on the probabilities in
        # word_to_sample_prob. This effectively makes the context window larger
        # for some words by removing words that are common from a particular
        # context before the training occurs.
        for i in all_words:
            if probability[i]<1:
                reserve=np.random.binomial(1,probability[i],1)[0]
                if reserve==0:
                    self.full_token_sequence_as_ids=list(filter(lambda x:x!=self.word_to_index[i],self.full_token_sequence_as_ids))

        # self.negative_sampling_table = generate_negative_sampling_table()
        print('Loaded all data from %s; saw %d tokens (%d unique)' \
              % (file_name, len(self.full_token_sequence_as_ids),
                 len(self.word_to_index)))
                
    
    def generate_negative_sampling_table(self, exp_power=0.75, table_size=1e6):
        '''
        Generates a big list data structure that we can quickly randomly index into
        in order to select a negative training example (i.e., a word that was
        *not* present in the context). 
        '''       
        
        # Step 1: Figure out how many instances of each word need to go into the
        # negative sampling table. 
        #
        # HINT: np.power and np.fill might be useful here        
        print("Generating sampling table")
        len_word=[]
        sum_len_word=sum(np.power(list(self.word_counts.values()),exp_power))
        for i in self.index_to_word:
            len_word.append(np.power(self.word_counts[i],exp_power)/sum_len_word)
        len_word=(np.asarray(len_word)*table_size).astype(int)




        print(len(len_word))
        # Step 2: Create the table to the correct size. You'll want this to be a
        # numpy array of type int
        self.negative_sampling_table=np.ones(int(table_size)).astype(int)


        # Step 3: Fill the table so that each word has a number of IDs
        # proportionate to its probability of being sampled.
        #
        # Example: if we have 3 words "a" "b" and "c" with probabilites 0.5,
        # 0.33, 0.16 and a table size of 6 then our table would look like this
        # (before converting the words to IDs):
        #
        # [ "a", "a", "a", "b", "b", "c" ]
        #
        index_temp=0
        sum_temp=0
        #print(len_word)
        for i in range(len(len_word)):
            np.ndarray.fill(self.negative_sampling_table[sum_temp:(sum_temp+len_word[i])],index_temp)
            sum_temp=sum_temp+len_word[i]
            index_temp=index_temp+1


    def generate_negative_samples(self, cur_context_word_id, num_samples):
        '''
        Randomly samples the specified number of negative samples from the lookup
        table and returns this list of IDs as a numpy array. As a performance
        improvement, avoid sampling a negative example that has the same ID as
        the current positive context word.
        '''

        # Step 1: Create a list and sample from the negative_sampling_table to
        # grow the list to num_samples, avoiding adding a negative example that
        # has the same ID as teh current context_word
        results = []
        temp_num_sample=0
        while temp_num_sample<num_samples:
            temp=np.random.choice(self.negative_sampling_table,1)[0]
            if temp!=cur_context_word_id:
                results.append(temp)
                temp_num_sample=temp_num_sample+1
            else:
                pass

        # Step 2: Convert the list of samples to numpy array and return it            
        return np.array(results)
    def save(self, filename):
        '''
        Saves the model to the specified filename as a gensim KeyedVectors in the
        text format so you can load it separately.
        '''

        # Creates an empty KeyedVectors with our embedding size
        kv = KeyedVectors(vector_size=self.hidden_layer_size)        
        vectors = []
        words = []
        # Get the list of words/vectors in a consistent order
        for index, word in enumerate(self.index_to_word): 
            vectors.append(self.W[index].copy())
            words.append(word)
            
        # Fills the KV object with our data in the right order
        kv.add(words, vectors) 
        kv.save_word2vec_format(filename, binary=False)
    def init_weights(self, init_range=0.1):
        '''
        Initializes the weight matrices W (input->hidden) and C (hidden->output)
        by sampling uniformly within a small range around zero.
        '''

        # Step 1: Initialize two numpy arrays (matrices) for W and C by filling
        # their values with a random sample within the speified range.
        #
        # Hint: numpy.random has lots of ways to create matrices for this task
        self.W=np.random.uniform(-init_range,init_range,size=(len(self.word_to_index),self.hidden_layer_size))
        self.C=np.random.uniform(-init_range,init_range,size=(len(self.word_to_index),self.hidden_layer_size))
        
    def train(self, num_epochs=2, window_size=2, num_negative_samples=2,
              learning_rate=0.05, nll_update_iter=10000, max_steps=-1):
        '''
        Trains the word2vec model on the data loaded from load_data for the
        specified number of epochs.
        '''

        # Rather than compute the full negative log-likelihood (NLL), we'll keep
        # a running tally of the nll values for each step and periodically report them
        nll_results = []
        
        # This value keeps track of which step we're on. Since we don't update
        # when the center token is "<UNK>" we may skip over some ids in the
        # inner loop, so we need a separate step count to keep track of how many
        # updates we've done.
        step = 0
        
        # Iterate for the specified number of epochs
        for epoch in range(1, num_epochs+1):
            print("Beginning epoch %d of %d" % (epoch, num_epochs))           
            
            # Step 1: Iterate over each ID in full_token_sequence_as_ids as a center
            # token (skipping those that are <UNK>) and predicting the context
            # word and negative samples
            #
            # Hint: this is a great loop to wrap with a tqdm() call so you can
            # see how long each epoch will take with a progress bar
            overall_length=len(self.full_token_sequence_as_ids)
            for i in range(overall_length):
                center_word=self.full_token_sequence_as_ids[i]
                if self.index_to_word[center_word]=='<UNK>':
                    continue
                window_begin=(i-window_size)
                window_end=(i+1+window_size)
                if window_begin>=0 and window_end<=overall_length:
                    context_word=self.full_token_sequence_as_ids[window_begin:window_begin+2]+self.full_token_sequence_as_ids[window_end-2:window_end]
                elif window_begin<0:
                    context_word=self.full_token_sequence_as_ids[0:window_begin+2]+self.full_token_sequence_as_ids[window_end-2:window_end]
                elif window_end>overall_length:
                    context_word=self.full_token_sequence_as_ids[window_begin:window_begin+2]+self.full_token_sequence_as_ids[window_end-2:overall_length]
                # Periodically print the NLL so we can see how the model is converging
                if nll_update_iter > 0 and step % nll_update_iter == 0 and step > 0 and len(nll_results) > 0:
                    print("Negative log-likelihood (step: %d): %f " % (step, sum(nll_results)))
                    nll_results = []
                                    
                # Step 2: For each word in the window range (before and after)
                # perform an update where we (1) use the current parameters of
                # the model to predict it using the skip-gram task and (2)
                # sample negative instances and predict those. We'll use the
                # values of those predictions (i.e., the output of the sigmoid)
                # to update the W and C matrices using backpropagation.
                #
                # NOTE: this inner loop should call predict_and_backprop() which is
                # defined outside of the class. See note there for why.
                for each_context in context_word:

                    # Step 3: Pick the context word ID
                    

                    # Step 4: Sample negative instances 
                    negative_sample_instances=self.generate_negative_samples(each_context,num_negative_samples)

                    # Step 5: call predict_and_backprop. Don't forget to add the
                    # nll return value to nll_results to keep track of how the
                    # model is learning
                    nll=predict_and_backprop(self.W,self.C,learning_rate,center_word,each_context,negative_sample_instances)
                    nll_results.append(nll)
                    #print(nll_results)

                    
                step += 1
                #print(step)
                if max_steps > 0 and step >= max_steps:
                    break

            if max_steps > 0 and step >= max_steps:
                print('Maximum number of steps reached: %d' % max_steps)
                break
    def get_neighbors(self, target_word):
        """ 
        Finds the top 10 most similar words to a target word
        """
        outputs = []
        for index, word in tqdm(enumerate(self.index_to_word), total=len(self.index_to_word)):
            similarity = self.compute_cosine_similarity(target_word, word)
            result = {"word": word, "score": similarity}
            outputs.append(result)
    
        # Sort by highest scores
        neighbors = sorted(outputs, key=lambda o: o['score'], reverse=True)
        return neighbors[1:11]

    def compute_cosine_similarity(self, word_one, word_two):
        '''
        Computes the cosine similarity between the two words
        '''
        try:
            word_one_index = self.word_to_index[word_one]
            word_two_index = self.word_to_index[word_two]
        except KeyError:
            return 0
    
        embedding_one = self.W[word_one_index]
        embedding_two = self.W[word_two_index]
        similarity = 1 - abs(float(cosine(embedding_one, embedding_two)))
        return similarity

In [3]:
#@jit(nopython=True)
def predict_and_backprop(W, C, learning_rate, center_id, context_id,
                         negative_sample_ids):
    '''
    Using the center token (specified by center_id), makes a forward pass through
    the network to predict the context token (context_id) and negative samples,
    then backprops the error of those predictions to update the network and
    returns the negative log likelihood (Equation 1 in your homework) from the
    current preditions. W and C are the weight matrices of the network and IDs
    refer to particular rows of the matrices (i.e., the word embeddings of the
    target word and the context words!)

    '''

    #
    # GENERAL NOTE: There are many ways to implement this function, depending on
    # how fancy you want to get with numpy. The instructions/steps here are
    # intended as guides for the main tasks on what you have to do and may be
    # implemented as one line or more lines, depending on which methods you use
    # and how you want to write it. The important thing is that it works, not
    # how fast it is, so feel free to write it in a way that is understandable
    # to you. Often when you get to that point, you'll see ways to improve (but
    # first save a copy of your working code!).
    #

    
    # Step 1: Look up the two vectors in W and C. Note that the row for the
    # center_id is effectively the hidden layer activation, h.
    center_vec=W[center_id]
    context_vec=C[context_id]
    # Step 2: Look up the vectors for the negative sample IDs.
    #
    
    
    negative_sample_vecs=C[negative_sample_ids]
    # NOTE: numpy supports multiple indexing (getting multiple rows at once) if
    # you want to use it

    # Step 3: Compute the predictions for the context word and the negative
    # examples. We want the predictions of the context word to be near 1 and
    # those for the negative examples to be near 0.
    context_word_prediction=np.dot(center_vec,context_vec)
    negative_word_prediction=np.matmul(negative_sample_vecs,np.transpose([center_vec]))
    # Step 4: Compute the negative log likelihood
    nll=-np.log(sigmoid(context_word_prediction))-np.sum(np.log(sigmoid(-negative_word_prediction)))
    # Step 5: Update the negative sample vectors to push their dot product with the
    # center word's vecter closer to zero.
    context_vec_new=context_vec-learning_rate*(sigmoid(context_word_prediction)-1)*center_vec
    negative_sample_vecs_new=negative_sample_vecs-np.matmul(learning_rate*(sigmoid(negative_word_prediction)),[center_vec])
    # Step 6: Now backprop all the way back to the center word's vector. Be sure to
    # update it based on the *old* values of the context vectors, not the
    # new values of the context vectors that you just updated!    
    center_vec_new=center_vec-learning_rate*(sigmoid(context_word_prediction)-1)*context_vec
    for i in range(len(negative_word_prediction)):
        center_vec_new=center_vec_new-learning_rate*(sigmoid(negative_word_prediction[i]))*negative_sample_vecs[i]
    #nll = 0
    W[center_id]=center_vec_new
    C[context_id]=context_vec_new
    C[negative_sample_ids]=negative_sample_vecs_new
    return nll
#@jit(nopython=True)
def sigmoid(x):
    '''
    Returns the sigmoid of the provided value
    '''
    return 1.0 / (1 + np.exp(-x))


In [4]:

parser = argparse.ArgumentParser()

parser.add_argument('--corpus_file',
                    type=str,
                    default='wiki-bios.med.txt',
                    help="The file name for the text file to use in training")

parser.add_argument(
    '--window_size',
    type=int,
    default=2,
    help=
    "The number of tokens before or after the center token to include as context when training"
)
parser.add_argument(
    '--learning_rate',
    type=float,
    default=0.05,
    help="The learning rate to use when updating embeddings during SGD")
parser.add_argument(
    '--embedding_size',
    type=int,
    default=50,
    help="The embedding dimension side (i.e., hidden layer size)")

parser.add_argument(
    '--min_token_frequency',
    type=int,
    default=5,
    help=
    "The minimum number of times a token must occur to be considered in the vocabulary"
)

parser.add_argument(
    '--num_epochs',
    type=int,
    default=2,
    help=
    "The number of epochs of training to complete, where an epoch is a full pass through the entire data."
)

parser.add_argument(
    '--max_steps',
    type=int,
    default=-1,
    help=
    "The maximum number of steps to take. Positive values will cause the training to end early if the maximum number is reached before all epochs have finished."
)

parser.add_argument(
    '--nll_update_iter',
    type=int,
    default=10000,
    help=
    "How many steps to take between printing the negative log-likelihood when testing for convergence. Negative values cause the NLL to not be printed"
)

parser.add_argument('--do_quick_nn_test',
                    type=bool,
                    default=True,
                    help="Run a quick nearest-neighbor test")

parser.add_argument(
    '--output_file',
    type=str,
    default='trained_vector.txt',
    help="Where to save the word vectors as a gensim.KeyedVectors")

args = parser.parse_args(args=[])
if not args.corpus_file:
        print('No file specified for training! See help message:\n')
        parser.print_help(sys.stderr)
        exit(1)

if not args.output_file:
        print('REMINDER: this run is not saving any output. If you meant to, restart and add --output_file')
        
corpus_file_name = args.corpus_file



In [5]:
instance=word2vec()

In [6]:
instance.load_data(corpus_file_name, args.min_token_frequency)

Counting token frequencies
Performing minimum thresholding
23015360
Loaded all data from wiki-bios.med.txt; saw 18594088 tokens (108206 unique)


In [7]:
instance.init_weights()

In [8]:
instance.generate_negative_sampling_table()

Generating sampling table
108206


In [9]:
instance.train(nll_update_iter=args.nll_update_iter, max_steps=args.max_steps,
                num_epochs=args.num_epochs)

Beginning epoch 1 of 2
Negative log-likelihood (step: 10000): 81766.082953 
Negative log-likelihood (step: 20000): 79209.344167 
Negative log-likelihood (step: 30000): 77756.214706 
Negative log-likelihood (step: 40000): 76804.980325 
Negative log-likelihood (step: 50000): 77036.815022 
Negative log-likelihood (step: 60000): 75697.123899 
Negative log-likelihood (step: 70000): 74783.416381 
Negative log-likelihood (step: 80000): 75765.061278 
Negative log-likelihood (step: 90000): 73871.414029 
Negative log-likelihood (step: 100000): 73411.041552 
Negative log-likelihood (step: 110000): 73583.060948 
Negative log-likelihood (step: 120000): 73540.303606 
Negative log-likelihood (step: 130000): 71515.316059 
Negative log-likelihood (step: 140000): 70443.127895 
Negative log-likelihood (step: 150000): 72309.516773 
Negative log-likelihood (step: 160000): 70471.715172 
Negative log-likelihood (step: 170000): 72026.598981 
Negative log-likelihood (step: 180000): 67902.238340 
Negative log-l

Negative log-likelihood (step: 1520000): 57439.875453 
Negative log-likelihood (step: 1530000): 60362.749707 
Negative log-likelihood (step: 1540000): 58795.298513 
Negative log-likelihood (step: 1550000): 59590.284886 
Negative log-likelihood (step: 1560000): 59407.357223 
Negative log-likelihood (step: 1570000): 58001.562875 
Negative log-likelihood (step: 1580000): 56909.057372 
Negative log-likelihood (step: 1590000): 55282.422407 
Negative log-likelihood (step: 1600000): 59049.059433 
Negative log-likelihood (step: 1610000): 59880.067877 
Negative log-likelihood (step: 1620000): 54830.994580 
Negative log-likelihood (step: 1630000): 60073.111359 
Negative log-likelihood (step: 1640000): 60429.442859 
Negative log-likelihood (step: 1650000): 59327.608385 
Negative log-likelihood (step: 1660000): 57198.519111 
Negative log-likelihood (step: 1670000): 57105.034344 
Negative log-likelihood (step: 1680000): 60059.963170 
Negative log-likelihood (step: 1690000): 60091.909314 
Negative l

Negative log-likelihood (step: 3010000): 56987.663524 
Negative log-likelihood (step: 3020000): 54076.088918 
Negative log-likelihood (step: 3030000): 56431.849677 
Negative log-likelihood (step: 3040000): 57447.751440 
Negative log-likelihood (step: 3050000): 57476.703438 
Negative log-likelihood (step: 3060000): 57111.456424 
Negative log-likelihood (step: 3070000): 59418.897950 
Negative log-likelihood (step: 3080000): 58822.851010 
Negative log-likelihood (step: 3090000): 49227.557877 
Negative log-likelihood (step: 3100000): 59024.429006 
Negative log-likelihood (step: 3110000): 57544.589403 
Negative log-likelihood (step: 3120000): 56152.639542 
Negative log-likelihood (step: 3130000): 58384.484878 
Negative log-likelihood (step: 3140000): 57958.910341 
Negative log-likelihood (step: 3150000): 54975.269841 
Negative log-likelihood (step: 3160000): 54427.945015 
Negative log-likelihood (step: 3170000): 56929.491626 
Negative log-likelihood (step: 3180000): 57510.903609 
Negative l

Negative log-likelihood (step: 4500000): 54568.984293 
Negative log-likelihood (step: 4510000): 57469.965729 
Negative log-likelihood (step: 4520000): 57494.276630 
Negative log-likelihood (step: 4530000): 56152.122743 
Negative log-likelihood (step: 4540000): 55151.516300 
Negative log-likelihood (step: 4550000): 56814.508054 
Negative log-likelihood (step: 4560000): 57944.809396 
Negative log-likelihood (step: 4570000): 55787.950778 
Negative log-likelihood (step: 4580000): 57146.902100 
Negative log-likelihood (step: 4590000): 55682.862956 
Negative log-likelihood (step: 4600000): 55579.078320 
Negative log-likelihood (step: 4610000): 56212.585240 
Negative log-likelihood (step: 4620000): 56742.764149 
Negative log-likelihood (step: 4630000): 57385.461576 
Negative log-likelihood (step: 4640000): 56082.127666 
Negative log-likelihood (step: 4650000): 54686.784068 
Negative log-likelihood (step: 4660000): 53819.719555 
Negative log-likelihood (step: 4670000): 56707.488312 
Negative l

Negative log-likelihood (step: 5990000): 54997.298473 
Negative log-likelihood (step: 6000000): 56281.237803 
Negative log-likelihood (step: 6010000): 54557.955417 
Negative log-likelihood (step: 6020000): 56940.547636 
Negative log-likelihood (step: 6030000): 53084.626961 
Negative log-likelihood (step: 6040000): 56610.468319 
Negative log-likelihood (step: 6050000): 56240.168642 
Negative log-likelihood (step: 6060000): 55175.933288 
Negative log-likelihood (step: 6070000): 55673.980294 
Negative log-likelihood (step: 6080000): 57310.941531 
Negative log-likelihood (step: 6090000): 55834.925485 
Negative log-likelihood (step: 6100000): 57425.073433 
Negative log-likelihood (step: 6110000): 56665.001137 
Negative log-likelihood (step: 6120000): 55974.530152 
Negative log-likelihood (step: 6130000): 57108.551622 
Negative log-likelihood (step: 6140000): 53898.786687 
Negative log-likelihood (step: 6150000): 53209.524634 
Negative log-likelihood (step: 6160000): 52714.285232 
Negative l

Negative log-likelihood (step: 7480000): 56647.976259 
Negative log-likelihood (step: 7490000): 56471.904907 
Negative log-likelihood (step: 7500000): 55442.947265 
Negative log-likelihood (step: 7510000): 53542.281840 
Negative log-likelihood (step: 7520000): 56076.647454 
Negative log-likelihood (step: 7530000): 56400.264744 
Negative log-likelihood (step: 7540000): 55374.011762 
Negative log-likelihood (step: 7550000): 54917.806097 
Negative log-likelihood (step: 7560000): 53441.447994 
Negative log-likelihood (step: 7570000): 51111.823632 
Negative log-likelihood (step: 7580000): 54330.697615 
Negative log-likelihood (step: 7590000): 57202.583083 
Negative log-likelihood (step: 7600000): 56693.557768 
Negative log-likelihood (step: 7610000): 55055.386662 
Negative log-likelihood (step: 7620000): 54336.809680 
Negative log-likelihood (step: 7630000): 52705.207934 
Negative log-likelihood (step: 7640000): 54049.576247 
Negative log-likelihood (step: 7650000): 53314.659240 
Negative l

Negative log-likelihood (step: 8970000): 52411.984203 
Negative log-likelihood (step: 8980000): 52282.708658 
Negative log-likelihood (step: 8990000): 52400.058691 
Negative log-likelihood (step: 9000000): 55440.743308 
Negative log-likelihood (step: 9010000): 53402.729149 
Negative log-likelihood (step: 9020000): 53907.839125 
Negative log-likelihood (step: 9030000): 55984.002115 
Negative log-likelihood (step: 9040000): 53989.648748 
Negative log-likelihood (step: 9050000): 55520.703380 
Negative log-likelihood (step: 9060000): 55490.624398 
Negative log-likelihood (step: 9070000): 55871.113958 
Negative log-likelihood (step: 9080000): 54415.298372 
Negative log-likelihood (step: 9090000): 56085.293370 
Negative log-likelihood (step: 9100000): 54411.779205 
Negative log-likelihood (step: 9110000): 54377.793473 
Negative log-likelihood (step: 9120000): 52977.057176 
Negative log-likelihood (step: 9130000): 56495.090660 
Negative log-likelihood (step: 9140000): 54342.330929 
Negative l

Negative log-likelihood (step: 10460000): 56752.045315 
Negative log-likelihood (step: 10470000): 55940.046387 
Negative log-likelihood (step: 10480000): 56584.022237 
Negative log-likelihood (step: 10490000): 55408.631975 
Negative log-likelihood (step: 10500000): 52823.025662 
Negative log-likelihood (step: 10510000): 55073.262559 
Negative log-likelihood (step: 10520000): 54584.183564 
Negative log-likelihood (step: 10530000): 55912.338528 
Negative log-likelihood (step: 10540000): 54323.063307 
Negative log-likelihood (step: 10550000): 54373.891452 
Negative log-likelihood (step: 10560000): 55809.085617 
Negative log-likelihood (step: 10570000): 55519.084099 
Negative log-likelihood (step: 10580000): 55758.760667 
Negative log-likelihood (step: 10590000): 55731.049771 
Negative log-likelihood (step: 10600000): 55684.336364 
Negative log-likelihood (step: 10610000): 55912.843601 
Negative log-likelihood (step: 10620000): 55432.663169 
Negative log-likelihood (step: 10630000): 54160.

Negative log-likelihood (step: 11930000): 54969.618947 
Negative log-likelihood (step: 11940000): 55866.669858 
Negative log-likelihood (step: 11950000): 50385.288024 
Negative log-likelihood (step: 11960000): 55841.631631 
Negative log-likelihood (step: 11970000): 51854.505666 
Negative log-likelihood (step: 11980000): 49015.823971 
Negative log-likelihood (step: 11990000): 54324.121233 
Negative log-likelihood (step: 12000000): 56453.392402 
Negative log-likelihood (step: 12010000): 54585.427733 
Negative log-likelihood (step: 12020000): 53168.899355 
Negative log-likelihood (step: 12030000): 52030.839719 
Negative log-likelihood (step: 12040000): 54698.580850 
Negative log-likelihood (step: 12050000): 53897.526380 
Negative log-likelihood (step: 12060000): 54332.491275 
Negative log-likelihood (step: 12070000): 54095.060239 
Negative log-likelihood (step: 12080000): 54369.915854 
Negative log-likelihood (step: 12090000): 54701.278907 
Negative log-likelihood (step: 12100000): 53127.

Negative log-likelihood (step: 13400000): 54408.094712 
Negative log-likelihood (step: 13410000): 53924.109671 
Negative log-likelihood (step: 13420000): 55041.282742 
Negative log-likelihood (step: 13430000): 53376.248829 
Negative log-likelihood (step: 13440000): 54909.437519 
Negative log-likelihood (step: 13450000): 56572.689461 
Negative log-likelihood (step: 13460000): 55447.857331 
Negative log-likelihood (step: 13470000): 55494.473857 
Negative log-likelihood (step: 13480000): 55171.312644 
Negative log-likelihood (step: 13490000): 56083.186091 
Negative log-likelihood (step: 13500000): 53116.931483 
Negative log-likelihood (step: 13510000): 55873.755544 
Negative log-likelihood (step: 13520000): 53899.871956 
Negative log-likelihood (step: 13530000): 53134.086990 
Negative log-likelihood (step: 13540000): 57460.082291 
Negative log-likelihood (step: 13550000): 53374.330875 
Negative log-likelihood (step: 13560000): 55794.974683 
Negative log-likelihood (step: 13570000): 54796.

Negative log-likelihood (step: 14870000): 54190.849458 
Negative log-likelihood (step: 14880000): 54110.366858 
Negative log-likelihood (step: 14890000): 54245.305513 
Negative log-likelihood (step: 14900000): 55172.385503 
Negative log-likelihood (step: 14910000): 55134.656752 
Negative log-likelihood (step: 14920000): 53749.681963 
Negative log-likelihood (step: 14930000): 54043.496326 
Negative log-likelihood (step: 14940000): 55281.449969 
Negative log-likelihood (step: 14950000): 52362.062255 
Negative log-likelihood (step: 14960000): 57267.331875 
Negative log-likelihood (step: 14970000): 55938.880276 
Negative log-likelihood (step: 14980000): 55994.838257 
Negative log-likelihood (step: 14990000): 55447.762612 
Negative log-likelihood (step: 15000000): 54616.042076 
Negative log-likelihood (step: 15010000): 55339.930323 
Negative log-likelihood (step: 15020000): 55549.440854 
Negative log-likelihood (step: 15030000): 56730.094503 
Negative log-likelihood (step: 15040000): 54383.

Negative log-likelihood (step: 16340000): 53898.920749 
Negative log-likelihood (step: 16350000): 52892.785527 
Negative log-likelihood (step: 16360000): 54114.705749 
Negative log-likelihood (step: 16370000): 54551.555094 
Negative log-likelihood (step: 16380000): 55584.817581 
Negative log-likelihood (step: 16390000): 52490.606387 
Negative log-likelihood (step: 16400000): 52757.637737 
Negative log-likelihood (step: 16410000): 55522.052814 
Negative log-likelihood (step: 16420000): 56727.559264 
Negative log-likelihood (step: 16430000): 56449.783420 
Negative log-likelihood (step: 16440000): 55687.871766 
Negative log-likelihood (step: 16450000): 54519.194140 
Negative log-likelihood (step: 16460000): 55255.634927 
Negative log-likelihood (step: 16470000): 55036.908592 
Negative log-likelihood (step: 16480000): 53225.351782 
Negative log-likelihood (step: 16490000): 54532.119449 
Negative log-likelihood (step: 16500000): 55241.362305 
Negative log-likelihood (step: 16510000): 55443.

Negative log-likelihood (step: 17810000): 52607.304781 
Negative log-likelihood (step: 17820000): 54199.635554 
Negative log-likelihood (step: 17830000): 54634.957423 
Negative log-likelihood (step: 17840000): 52719.671700 
Negative log-likelihood (step: 17850000): 56927.203281 
Negative log-likelihood (step: 17860000): 54591.235235 
Negative log-likelihood (step: 17870000): 54887.201966 
Negative log-likelihood (step: 17880000): 54945.644061 
Negative log-likelihood (step: 17890000): 54613.796763 
Negative log-likelihood (step: 17900000): 52241.592875 
Negative log-likelihood (step: 17910000): 53350.257190 
Negative log-likelihood (step: 17920000): 54689.984686 
Negative log-likelihood (step: 17930000): 55341.417571 
Negative log-likelihood (step: 17940000): 51377.440282 
Negative log-likelihood (step: 17950000): 51451.931310 
Negative log-likelihood (step: 17960000): 54036.463771 
Negative log-likelihood (step: 17970000): 53655.695594 
Negative log-likelihood (step: 17980000): 54346.

Negative log-likelihood (step: 19270000): 56139.694231 
Negative log-likelihood (step: 19280000): 55668.698122 
Negative log-likelihood (step: 19290000): 53565.629883 
Negative log-likelihood (step: 19300000): 54068.378966 
Negative log-likelihood (step: 19310000): 53843.168890 
Negative log-likelihood (step: 19320000): 54690.482581 
Negative log-likelihood (step: 19330000): 55472.674647 
Negative log-likelihood (step: 19340000): 54801.938891 
Negative log-likelihood (step: 19350000): 53607.567739 
Negative log-likelihood (step: 19360000): 53769.415461 
Negative log-likelihood (step: 19370000): 53326.448565 
Negative log-likelihood (step: 19380000): 56095.845715 
Negative log-likelihood (step: 19390000): 55737.737814 
Negative log-likelihood (step: 19400000): 54931.611961 
Negative log-likelihood (step: 19410000): 50401.650295 
Negative log-likelihood (step: 19420000): 56081.484792 
Negative log-likelihood (step: 19430000): 51453.829634 
Negative log-likelihood (step: 19440000): 55681.

Negative log-likelihood (step: 20740000): 54500.397288 
Negative log-likelihood (step: 20750000): 54206.140547 
Negative log-likelihood (step: 20760000): 53439.627074 
Negative log-likelihood (step: 20770000): 52780.162392 
Negative log-likelihood (step: 20780000): 54424.234428 
Negative log-likelihood (step: 20790000): 55314.551510 
Negative log-likelihood (step: 20800000): 53905.333854 
Negative log-likelihood (step: 20810000): 55150.888504 
Negative log-likelihood (step: 20820000): 53802.571889 
Negative log-likelihood (step: 20830000): 55298.244270 
Negative log-likelihood (step: 20840000): 54818.641517 
Negative log-likelihood (step: 20850000): 52626.296016 
Negative log-likelihood (step: 20860000): 53068.162488 
Negative log-likelihood (step: 20870000): 54650.801932 
Negative log-likelihood (step: 20880000): 54013.582382 
Negative log-likelihood (step: 20890000): 55908.797819 
Negative log-likelihood (step: 20900000): 55149.881192 
Negative log-likelihood (step: 20910000): 54536.

Negative log-likelihood (step: 22210000): 55174.245711 
Negative log-likelihood (step: 22220000): 54053.421043 
Negative log-likelihood (step: 22230000): 48084.025984 
Negative log-likelihood (step: 22240000): 53902.890367 
Negative log-likelihood (step: 22250000): 49528.710121 
Negative log-likelihood (step: 22260000): 51895.010042 
Negative log-likelihood (step: 22270000): 55948.588022 
Negative log-likelihood (step: 22280000): 54926.087606 
Negative log-likelihood (step: 22290000): 55264.664030 
Negative log-likelihood (step: 22300000): 52659.475244 
Negative log-likelihood (step: 22310000): 53647.050934 
Negative log-likelihood (step: 22320000): 54547.105509 
Negative log-likelihood (step: 22330000): 54984.739493 
Negative log-likelihood (step: 22340000): 53165.678218 
Negative log-likelihood (step: 22350000): 54506.845070 
Negative log-likelihood (step: 22360000): 53389.175278 
Negative log-likelihood (step: 22370000): 54399.970859 
Negative log-likelihood (step: 22380000): 55109.

Negative log-likelihood (step: 23680000): 56221.894091 
Negative log-likelihood (step: 23690000): 54053.380106 
Negative log-likelihood (step: 23700000): 53423.832918 
Negative log-likelihood (step: 23710000): 53517.851480 
Negative log-likelihood (step: 23720000): 56058.433730 
Negative log-likelihood (step: 23730000): 55779.088284 
Negative log-likelihood (step: 23740000): 56336.761215 
Negative log-likelihood (step: 23750000): 51558.950203 
Negative log-likelihood (step: 23760000): 53557.744628 
Negative log-likelihood (step: 23770000): 54851.927441 
Negative log-likelihood (step: 23780000): 55369.989628 
Negative log-likelihood (step: 23790000): 54469.164124 
Negative log-likelihood (step: 23800000): 52874.906439 
Negative log-likelihood (step: 23810000): 53399.287443 
Negative log-likelihood (step: 23820000): 47797.894926 
Negative log-likelihood (step: 23830000): 52128.104966 
Negative log-likelihood (step: 23840000): 54824.451495 
Negative log-likelihood (step: 23850000): 53370.

Negative log-likelihood (step: 25150000): 53710.977105 
Negative log-likelihood (step: 25160000): 55809.140511 
Negative log-likelihood (step: 25170000): 54609.074023 
Negative log-likelihood (step: 25180000): 54525.993413 
Negative log-likelihood (step: 25190000): 51888.810957 
Negative log-likelihood (step: 25200000): 51587.570705 
Negative log-likelihood (step: 25210000): 54093.863549 
Negative log-likelihood (step: 25220000): 53509.942077 
Negative log-likelihood (step: 25230000): 38894.953587 
Negative log-likelihood (step: 25240000): 54805.810905 
Negative log-likelihood (step: 25250000): 52729.011315 
Negative log-likelihood (step: 25260000): 51147.454383 
Negative log-likelihood (step: 25270000): 53011.885144 
Negative log-likelihood (step: 25280000): 49747.973074 
Negative log-likelihood (step: 25290000): 52489.656759 
Negative log-likelihood (step: 25300000): 55126.423467 
Negative log-likelihood (step: 25310000): 54933.427663 
Negative log-likelihood (step: 25320000): 52891.

Negative log-likelihood (step: 26620000): 55070.119410 
Negative log-likelihood (step: 26630000): 53682.774902 
Negative log-likelihood (step: 26640000): 54895.009541 
Negative log-likelihood (step: 26650000): 51371.564937 
Negative log-likelihood (step: 26660000): 55414.464430 
Negative log-likelihood (step: 26670000): 54231.893903 
Negative log-likelihood (step: 26680000): 51463.582309 
Negative log-likelihood (step: 26690000): 53641.377326 
Negative log-likelihood (step: 26700000): 55007.235255 
Negative log-likelihood (step: 26710000): 54445.928092 
Negative log-likelihood (step: 26720000): 54663.446936 
Negative log-likelihood (step: 26730000): 54681.429864 
Negative log-likelihood (step: 26740000): 55595.864137 
Negative log-likelihood (step: 26750000): 52632.188054 
Negative log-likelihood (step: 26760000): 54449.088304 
Negative log-likelihood (step: 26770000): 54328.218299 
Negative log-likelihood (step: 26780000): 55201.256189 
Negative log-likelihood (step: 26790000): 53622.

Negative log-likelihood (step: 28090000): 53620.048242 
Negative log-likelihood (step: 28100000): 51683.027128 
Negative log-likelihood (step: 28110000): 54299.160027 
Negative log-likelihood (step: 28120000): 54809.249632 
Negative log-likelihood (step: 28130000): 53893.931333 
Negative log-likelihood (step: 28140000): 54218.102808 
Negative log-likelihood (step: 28150000): 51630.846930 
Negative log-likelihood (step: 28160000): 52858.494991 
Negative log-likelihood (step: 28170000): 53743.359285 
Negative log-likelihood (step: 28180000): 54310.938441 
Negative log-likelihood (step: 28190000): 54344.002203 
Negative log-likelihood (step: 28200000): 53442.606226 
Negative log-likelihood (step: 28210000): 53679.161269 
Negative log-likelihood (step: 28220000): 52509.363856 
Negative log-likelihood (step: 28230000): 52448.273069 
Negative log-likelihood (step: 28240000): 56021.773732 
Negative log-likelihood (step: 28250000): 52502.584131 
Negative log-likelihood (step: 28260000): 49072.

Negative log-likelihood (step: 29560000): 53593.329599 
Negative log-likelihood (step: 29570000): 51814.083930 
Negative log-likelihood (step: 29580000): 53448.183937 
Negative log-likelihood (step: 29590000): 50478.318284 
Negative log-likelihood (step: 29600000): 53456.200897 
Negative log-likelihood (step: 29610000): 54662.803301 
Negative log-likelihood (step: 29620000): 51085.693134 
Negative log-likelihood (step: 29630000): 52542.077122 
Negative log-likelihood (step: 29640000): 51545.479653 
Negative log-likelihood (step: 29650000): 53498.825002 
Negative log-likelihood (step: 29660000): 54415.014206 
Negative log-likelihood (step: 29670000): 54274.558612 
Negative log-likelihood (step: 29680000): 54123.827366 
Negative log-likelihood (step: 29690000): 53943.352900 
Negative log-likelihood (step: 29700000): 51188.253490 
Negative log-likelihood (step: 29710000): 52344.996135 
Negative log-likelihood (step: 29720000): 53930.179923 
Negative log-likelihood (step: 29730000): 54287.

Negative log-likelihood (step: 31030000): 54534.620483 
Negative log-likelihood (step: 31040000): 54780.280902 
Negative log-likelihood (step: 31050000): 48462.752781 
Negative log-likelihood (step: 31060000): 52108.161468 
Negative log-likelihood (step: 31070000): 53713.778379 
Negative log-likelihood (step: 31080000): 53730.845886 
Negative log-likelihood (step: 31090000): 50353.222245 
Negative log-likelihood (step: 31100000): 54130.743492 
Negative log-likelihood (step: 31110000): 51375.870191 
Negative log-likelihood (step: 31120000): 50410.052061 
Negative log-likelihood (step: 31130000): 54142.785162 
Negative log-likelihood (step: 31140000): 52613.905470 
Negative log-likelihood (step: 31150000): 53265.778842 
Negative log-likelihood (step: 31160000): 53583.077162 
Negative log-likelihood (step: 31170000): 55951.008249 
Negative log-likelihood (step: 31180000): 44322.198750 
Negative log-likelihood (step: 31190000): 53484.187572 
Negative log-likelihood (step: 31200000): 54254.

Negative log-likelihood (step: 32500000): 53341.396864 
Negative log-likelihood (step: 32510000): 52640.648808 
Negative log-likelihood (step: 32520000): 52690.594702 
Negative log-likelihood (step: 32530000): 53852.704437 
Negative log-likelihood (step: 32540000): 54591.107976 
Negative log-likelihood (step: 32550000): 47826.088033 
Negative log-likelihood (step: 32560000): 54462.264847 
Negative log-likelihood (step: 32570000): 53155.700306 
Negative log-likelihood (step: 32580000): 53062.137574 
Negative log-likelihood (step: 32590000): 51348.751424 
Negative log-likelihood (step: 32600000): 54957.732298 
Negative log-likelihood (step: 32610000): 53981.567754 
Negative log-likelihood (step: 32620000): 53386.211558 
Negative log-likelihood (step: 32630000): 53045.440232 
Negative log-likelihood (step: 32640000): 54517.059376 
Negative log-likelihood (step: 32650000): 53382.536542 
Negative log-likelihood (step: 32660000): 49634.703455 
Negative log-likelihood (step: 32670000): 55698.

Negative log-likelihood (step: 33970000): 52801.925148 
Negative log-likelihood (step: 33980000): 53516.632623 
Negative log-likelihood (step: 33990000): 53588.410547 
Negative log-likelihood (step: 34000000): 55059.951366 
Negative log-likelihood (step: 34010000): 55369.515837 
Negative log-likelihood (step: 34020000): 53057.673110 
Negative log-likelihood (step: 34030000): 53103.905816 
Negative log-likelihood (step: 34040000): 54363.146396 
Negative log-likelihood (step: 34050000): 50698.274777 
Negative log-likelihood (step: 34060000): 53458.624167 
Negative log-likelihood (step: 34070000): 53665.049045 
Negative log-likelihood (step: 34080000): 53752.415970 
Negative log-likelihood (step: 34090000): 50696.137235 
Negative log-likelihood (step: 34100000): 55402.151345 
Negative log-likelihood (step: 34110000): 52562.819038 
Negative log-likelihood (step: 34120000): 53059.275114 
Negative log-likelihood (step: 34130000): 55512.009490 
Negative log-likelihood (step: 34140000): 54225.

Negative log-likelihood (step: 35440000): 53945.589452 
Negative log-likelihood (step: 35450000): 52319.380321 
Negative log-likelihood (step: 35460000): 53067.621325 
Negative log-likelihood (step: 35470000): 51926.095425 
Negative log-likelihood (step: 35480000): 50900.882816 
Negative log-likelihood (step: 35490000): 53010.995418 
Negative log-likelihood (step: 35500000): 55323.090714 
Negative log-likelihood (step: 35510000): 53177.862150 
Negative log-likelihood (step: 35520000): 55335.951747 
Negative log-likelihood (step: 35530000): 54710.803248 
Negative log-likelihood (step: 35540000): 52251.412663 
Negative log-likelihood (step: 35550000): 50350.844127 
Negative log-likelihood (step: 35560000): 55062.883555 
Negative log-likelihood (step: 35570000): 53899.074340 
Negative log-likelihood (step: 35580000): 53497.287450 
Negative log-likelihood (step: 35590000): 53974.762809 
Negative log-likelihood (step: 35600000): 53843.545646 
Negative log-likelihood (step: 35610000): 51832.

Negative log-likelihood (step: 36910000): 55215.207230 
Negative log-likelihood (step: 36920000): 51061.458949 
Negative log-likelihood (step: 36930000): 54560.599117 
Negative log-likelihood (step: 36940000): 51881.874986 
Negative log-likelihood (step: 36950000): 52716.272908 
Negative log-likelihood (step: 36960000): 55235.186362 
Negative log-likelihood (step: 36970000): 53122.728685 
Negative log-likelihood (step: 36980000): 54626.666692 
Negative log-likelihood (step: 36990000): 55993.481979 
Negative log-likelihood (step: 37000000): 55226.309068 
Negative log-likelihood (step: 37010000): 52881.882126 
Negative log-likelihood (step: 37020000): 53890.066041 
Negative log-likelihood (step: 37030000): 53850.295013 
Negative log-likelihood (step: 37040000): 51509.461651 
Negative log-likelihood (step: 37050000): 52451.532266 
Negative log-likelihood (step: 37060000): 53267.907899 
Negative log-likelihood (step: 37070000): 54691.331576 
Negative log-likelihood (step: 37080000): 53337.

In [11]:
instance.save('trained_vector.txt')

In [25]:
targets = ["January", "good", "the", "food", "engineering"]

for targ in targets:
    print("Target: ", targ)
    bestpreds = (instance.get_neighbors(targ))
    for pred in bestpreds:
        print(pred["word"], ":", pred["score"])
    print("\n")


  6%|▌         | 6516/108206 [00:00<00:03, 30407.83it/s]

Target:  January


100%|██████████| 108206/108206 [00:02<00:00, 37854.19it/s]
  6%|▌         | 6104/108206 [00:00<00:03, 28961.85it/s]

June : 0.9009177653959356
December : 0.9002343526673667
May : 0.8931960196336791
August : 0.888697408531813
November : 0.8843557200067995
February : 0.8797365250598626
October : 0.8795067674627157
March : 0.8700947144728567
September : 0.8645465671084334
July : 0.8446447546192521


Target:  good


100%|██████████| 108206/108206 [00:03<00:00, 35823.68it/s]
  6%|▌         | 6409/108206 [00:00<00:03, 30215.16it/s]

clever : 0.8322196426204818
superb : 0.8081249289973313
erudition : 0.7968424625718515
pleasant : 0.7958439238487498
fatherly : 0.7804747123760511
industrious : 0.771689874513083
attractive : 0.7700174007861669
unkempt : 0.7691515195600043
discreet : 0.7688850803047285
plenty : 0.7649913653393535


Target:  the


100%|██████████| 108206/108206 [00:02<00:00, 37750.98it/s]
  3%|▎         | 2793/108206 [00:00<00:03, 27929.09it/s]

Strait : 0.4129587394761658
Declaration : 0.40164266090071044
Disaster : 0.38769319346761333
reconstitution : 0.38490525664681385
Gains : 0.38410487081916667
curated : 0.3839401203143855
Cases : 0.3828749644135384
Hurston : 0.3791014402887716
Riverkeeper : 0.37056072609381685
collaborators : 0.36909551480443903


Target:  food


100%|██████████| 108206/108206 [00:02<00:00, 36203.49it/s]
  6%|▌         | 6473/108206 [00:00<00:03, 29813.45it/s]

wireless : 0.7996811010624246
suppliers : 0.7759580444969606
dough : 0.770453400633206
cosmetics : 0.7643467394853108
averse : 0.7636713706325741
vendors : 0.7632830359195228
junk : 0.7566509784028933
chemicals : 0.754046261282496
inexpensive : 0.7519277513292927
cooked : 0.748428276536752


Target:  engineering


100%|██████████| 108206/108206 [00:03<00:00, 35833.75it/s]

economics : 0.8955100424696705
biochemistry : 0.8884327852564067
biophysics : 0.8683646095010252
meteorology : 0.8547714419440686
microbiology : 0.8530374524659494
ophthalmology : 0.8473003277364385
bacteriology : 0.8463735137712631
aeronautical : 0.8462152350026217
biomedical : 0.8424867474694645
geophysics : 0.8414575750581956







In [72]:
#Problem 12
import pandas as pd
testfile=pd.read_csv('word_pairs_to_estimate_similarity.test.csv')

In [73]:
sim=[]
for i in range(len(testfile)):
    sim.append(instance.compute_cosine_similarity(testfile['word1'].loc[i],testfile['word2'].loc[i]))
sim_result=pd.DataFrame({'id':testfile['pair_id'], 'sim':sim})
sim_result.to_csv('result.csv',index=False)