# Evaluating sentence representations from Skip-gram with SentEval

* Dependencies:
    * Python 3.6 with NumPy/SciPy
    * PyTorch
    * SentEval
    * scikit-learn

In [1]:
from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
#import data 
# data.py is part of Senteval and it is used for loading word2vec style files
import senteval
import torch
import logging
from collections import defaultdict
from skipgram import Skipgram
import pickle

In [2]:
class dotdict(dict):
    """ dictionary whose entries can also be read, written and deleted as attributes """

    def __getattr__(self, name):
        # only reached when normal attribute lookup fails;
        # a key that is not in the dictionary reads as None instead of raising
        return self.get(name)

    def __setattr__(self, name, value):
        # attribute assignment stores the value under the key of the same name
        self[name] = value

    def __delattr__(self, name):
        # removing an absent key raises KeyError, exactly like ``del d[name]``
        del self[name]

class EmbeddingExtractor:
    """
    Wraps a trained skip-gram model and returns embeddings for word ids.
    """

    def __init__(self, model_path, vocab_size=71578, embedding_dim=100):
        """
        :param model_path: path to the saved state dict of the Skipgram model
        :param vocab_size: first argument passed to Skipgram (presumably the vocabulary size,
                           verify against skipgram.py); it has to match the checkpoint
        :param embedding_dim: second argument passed to Skipgram (presumably the embedding
                              size, verify against skipgram.py); it has to match the checkpoint
        """
        self.model = Skipgram(vocab_size, embedding_dim)
        # map_location='cpu' remaps the stored tensors to the CPU while loading
        self.model.load_state_dict(torch.load(model_path, map_location='cpu'))

    def get_embeddings(self, words):
        """
        :param words: torch tensor of dtype long containing the unique ids of words

        :returns: np array holding one embedding per id; assuming ``model.embeddings`` is a plain
                  embedding lookup, the shape is the shape of ``words`` plus a trailing z_dim axis
        """
        # detach() cuts the result from the autograd graph so it can be converted to numpy;
        # it replaces the legacy ``.data`` attribute
        return self.model.embeddings(words).detach().numpy()

This is how you interface with SentEval. The only thing you need to change is the paths to the trained models in the main block at the end.

In [3]:
# Parameters for SentEval
# classification uses logistic regression (usepytorch: False) with kfold 10
# Any extra information that your model needs for initialization can be stored here as well,
# for example the path to a dictionary of indices, or hyperparameters:
# this object is passed to both the batcher and the prepare functions.
# It is built as a dotdict so that the entries can be accessed as attributes.
params_senteval = dotdict(
    task_path='',
    usepytorch=False,
    kfold=10,
    ckpt_path='',
    tok_path='',
    extractor=None,
    tks1=None,
)
# Config for the NN classifier, left disabled: with usepytorch = False
# scikit-learn logistic regression with 10 kfold is used instead
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}



def prepare(params, samples):
    """
    Prepare hook handed to SentEval: loads the trained skip-gram model and the vocabulary.

    An EmbeddingExtractor built from ``params.ckpt_path`` is stored in ``params.extractor`` and
    the object unpickled from ``params.tok_path`` (used by the batcher as a word -> id mapping)
    is stored in ``params.tks1``.

    :param params: the SentEval parameters, ``ckpt_path`` and ``tok_path`` must be valid paths
    :param samples: part of the SentEval interface, not used here
    """
    params.extractor = EmbeddingExtractor(model_path=params.ckpt_path)
    # load tokenizer from training
    # NOTE: unpickling can run arbitrary code, only point tok_path at files you trust
    with open(params.tok_path, 'rb') as tok_file:
        # the with-block closes the file handle once the dictionary is loaded
        params.tks1 = pickle.load(tok_file)

def batcher(params, batch):
    """
    At this point batch is a python list containing sentences. Each sentence is a list of tokens (each token a string).
    The code below converts the tokens to the unique ids stored in ``params.tks1``;
    words that are not in that dictionary get the id of '<unk>'.

    This function returns a single vector representation per sentence in the batch:
    the average of the word embeddings returned by ``params.extractor``.

    Sentences are processed 1 at a time (batches of size 1) to avoid having to deal with padding and masking.

    :param params: the SentEval parameters, with ``extractor`` and ``tks1`` filled in by prepare
    :param batch: list of sentences, each sentence a list of token strings

    :returns: np array with one row per sentence, [len(batch), z_dim] when the extractor
              returns [sentence_length, z_dim] for a sentence
    """
    word2id = params.tks1
    # the fallback id is the same for every word, so it is looked up once
    unk_id = word2id['<unk>']
    embeddings = []
    for sent in batch:
        # if a sentence is empty dot is set to be the only token
        # you can change it into NULL depending on your model
        tokens = sent if sent else ['.']
        # from tokens to ids, respecting the vocabulary of the trained model
        x1 = torch.tensor([word2id.get(word, unk_id) for word in tokens], dtype=torch.long)

        # one embedding per word of the sentence
        z_batch1 = params.extractor.get_embeddings(x1)
        # sentence vector is the mean of the word embeddings (averaged over the first axis)
        sent_vec = np.mean(z_batch1, axis=0)
        # if the vector contains NaN, np.nan_to_num replaces it by a finite number
        if np.isnan(sent_vec.sum()):
            sent_vec = np.nan_to_num(sent_vec)
        embeddings.append(sent_vec)
    # stack the sentence vectors into one array, one row per sentence
    return np.vstack(embeddings)


# Logger: every message from DEBUG upwards is shown, prefixed with a timestamp
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(message)s')

if __name__ == "__main__":
    # Fill in the paths before the SentEval engine is built.
    # TODO: you have to point to valid paths! Use the pre-trained model linked from the top of this notebook.
    params_senteval.update(
        # path to senteval data (note senteval adds downstream into the path)
        task_path='/home/daniel/SentEval/data/',
        # path to the trained skip-gram weights, loaded in prepare
        ckpt_path='models/71576V_100d_5w_Skipgram.pt',
        # path to the pickled dictionary with the ids of the words, loaded in prepare
        tok_path='models/word2idx.p',
        # we use 10 fold cross validation
        kfold=10,
    )
    se = senteval.engine.SE(params_senteval, batcher, prepare)

    # Here you define the NLP tasks on which your embedding model is going to be evaluated.
    # The list used below holds probing tasks; the tasks used in (https://arxiv.org/abs/1802.05883)
    # are kept in the commented list for reference.
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using logistic regression),
    # but STS14 (semantic textual similarity) is a similar type of semantic task
    #transfer_tasks = ['MR', 'CR', 'SUBJ', 'MPQA', 'TREC', 'SST2', 'SST5', 'SICKEntailment',
    #                 'STS14', 'MRPC']
    transfer_tasks = [
        'Depth',
        'TopConstituents',
        'BigramShift',
        'Tense',
        'SubjNumber',
        'ObjNumber',
    ]
    # senteval prints the results and returns a dictionary with the scores
    results = se.eval(transfer_tasks)
    print(results)

2018-05-31 16:31:00,606 : ***** (Probing) Transfer task : DEPTH classification *****
2018-05-31 16:31:01,664 : Loaded 100000 train - 10000 dev - 10000 test for Depth
2018-05-31 16:31:03,445 : Computing embeddings for train/dev/test
2018-05-31 16:31:33,011 : Computed embeddings
2018-05-31 16:31:33,012 : Training sklearn-LogReg with standard validation..
2018-05-31 16:34:32,332 : [('reg:0.25', 24.04), ('reg:0.5', 23.93), ('reg:1', 23.89), ('reg:2', 23.86), ('reg:4', 23.85), ('reg:8', 23.85)]
2018-05-31 16:34:32,334 : Validation : best param found is reg = 0.25 with score             24.04
2018-05-31 16:34:32,354 : Evaluating...
2018-05-31 16:35:01,270 : 
Dev acc : 24.0 Test acc : 23.4 for DEPTH classification

2018-05-31 16:35:01,277 : ***** (Probing) Transfer task : TOPCONSTITUENTS classification *****
2018-05-31 16:35:02,910 : Loaded 100000 train - 10000 dev - 10000 test for TopConstituents
2018-05-31 16:35:05,073 : Computing embeddings for train/dev/test
2018-05-31 16:35:34,231 : Comp

{'Depth': {'ntest': 10000, 'acc': 23.44, 'devacc': 24.04, 'ndev': 10000}, 'ObjNumber': {'ntest': 10000, 'acc': 64.89, 'devacc': 63.71, 'ndev': 10000}, 'BigramShift': {'ntest': 10000, 'acc': 49.83, 'devacc': 49.57, 'ndev': 10000}, 'Tense': {'ntest': 10000, 'acc': 64.96, 'devacc': 64.97, 'ndev': 10000}, 'SubjNumber': {'ntest': 10000, 'acc': 64.0, 'devacc': 65.58, 'ndev': 10000}, 'TopConstituents': {'ntest': 10000, 'acc': 18.63, 'devacc': 18.0, 'ndev': 10000}}
