# SentEval usage example

* Clone repo from FAIR github
```
    git clone https://github.com/facebookresearch/SentEval.git
    cd SentEval/
```
* Dependencies:
    * Python 2/3 with NumPy/SciPy
    * PyTorch
    * scikit-learn>=0.18.0

* Install senteval
```
    python setup.py install
```
* Download datasets (it takes some time...)
    * these are downstream tasks
    * newer versions of SentEval also include probing tasks (https://github.com/facebookresearch/SentEval/tree/master/data/probing) for evaluating linguistic properties of your embeddings.
```
    cd data/downstream/
    ./get_transfer_data.bash
```
* Download pretrained GloVe embeddings:

```
    mkdir pretrained
    cd pretrained
    wget http://nlp.stanford.edu/data/glove.840B.300d.zip
    unzip glove.840B.300d.zip
```

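* The following code evaluates pretrained word embeddings on different NLP downstream tasks. The original SentEval example uses GloVe; the code below loads a skip-gram Word2Vec model (`skipgram-100d-50e-mincount0.bin`) instead.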

In [1]:
import numpy as np
import pickle
from collections import Counter
import gensim
from gensim.models import Word2Vec

#Source for SIF functions: https://github.com/PrincetonML/SIF

def load_pickled_data(filename):
    """
    Read a single pickled object from ``filename`` and return it.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def save_word_probs(corpus_path='hansards/training.en',
                    output_path='word_probabilities.pickle'):
    """
    Estimate unigram probabilities from a text corpus and pickle them.

    Every line of ``corpus_path`` is split on whitespace and each token is
    lowercased. A token's probability is its count divided by the total
    number of tokens in the corpus. The resulting ``{word: probability}``
    dict is written to ``output_path`` with the highest pickle protocol.

    Both arguments default to the paths that used to be hard-coded, so a
    bare ``save_word_probs()`` call reads and writes the same files as before.

    :param corpus_path: plain-text training corpus, one sentence per line
    :param output_path: destination of the pickled probability dict
    """
    counter = Counter()
    with open(corpus_path) as f:
        # Stream the corpus line by line instead of holding every line
        # (plus several derived copies) in memory at once.
        for line in f:
            counter.update(w.lower() for w in line.split())
    # float() keeps this a true division under Python 2 as well; integer
    # division would turn every probability into 0.
    total_count = float(sum(counter.values()))
    w_probs = {word: count / total_count for word, count in counter.items()}
    # Write probabilities to file
    with open(output_path, 'wb') as handle:
        pickle.dump(w_probs, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def get_word_probs(batch):
    """
    Return the relative frequency of every lowercased token in ``batch``.

    ``batch`` is an iterable of token lists. The result maps each lowercased
    token to its number of occurrences divided by the total number of tokens
    in the whole batch. An empty batch yields an empty dict.
    """
    frequencies = Counter(
        token.lower() for sentence in batch for token in sentence
    )
    n_tokens = sum(frequencies.values())
    return {token: freq / n_tokens for token, freq in frequencies.items()}



# Load the trained skip-gram model from disk.
model = Word2Vec.load('skipgram-100d-50e-mincount0.bin')
# Vocabulary as a list; a word's position in it is its index.
index2word = model.wv.index2word
# NOTE(review): syn1neg is presumably the negative-sampling output-layer
# weight matrix rather than the input word vectors held by model.wv --
# confirm this is the matrix that is meant to be evaluated.
embeddings = model.syn1neg
# Reverse lookup: word -> position in index2word.
word2index = {word: position for position, word in enumerate(index2word)}

In [35]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from __future__ import absolute_import, division, unicode_literals

import sys
import logging
import sklearn
import SentEval.examples.data as data
# Set PATHs
# path to the SentEval checkout; it is prepended to sys.path below so that
# `import senteval` can resolve
PATH_TO_SENTEVAL = 'SentEval/'
# path to the NLP datasets
PATH_TO_DATA = 'SentEval/data'
# path to glove embeddings
# NOTE(review): no executable code in the visible source reads this constant.
PATH_TO_VEC = 'SentEval/data/downstream/pretrained/glove.840B.300d.txt'


# import SentEval (must come after the sys.path tweak on the previous line)
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval


def prepare(params, samples):
    """
    SentEval ``prepare`` hook: load the skip-gram model and attach it to ``params``.

    The following attributes are set on ``params`` for the batcher functions:

    * ``word2id``    -- dict mapping each vocabulary word to its row index
    * ``word_vec``   -- the model's ``syn1neg`` weight matrix
    * ``wvec_dim``   -- number of columns of ``word_vec``
    * ``batch_size`` -- 128

    ``samples`` is not used: the vocabulary comes from the trained model, not
    from the task data. (The upstream SentEval example instead built the
    dictionary from ``samples`` with ``data.create_dictionary`` and loaded
    GloVe vectors with ``data.get_wordvec``.)

    NOTE(review): the same model file is also loaded at module level; if
    SentEval calls this hook once per task, reusing that object would avoid
    reloading the file each time -- confirm before changing.
    """
    model = Word2Vec.load('skipgram-100d-50e-mincount0.bin')
    index2word = model.wv.index2word
    word_vec = model.syn1neg

    params.word2id = {word: idx for idx, word in enumerate(index2word)}
    params.word_vec = word_vec
    # Take the dimensionality from the matrix itself so that swapping in a
    # model of another size cannot leave wvec_dim out of sync with word_vec.
    params.wvec_dim = word_vec.shape[1]
    params.batch_size = 128

def average_batch(params, batch):
    """
    Encode every sentence of ``batch`` as the plain mean of its word vectors.

    Words missing from ``params.word2id`` are skipped. A sentence with no
    known word at all (including an empty sentence, which is first replaced
    by the single token '.') is encoded as a vector of ones.

    Returns an array with one row per sentence and ``params.wvec_dim``
    columns.
    """
    # an empty sentence is replaced by a sentence holding only a dot
    batch = [['.'] if sent == [] else sent for sent in batch]
    vocab = params.word2id
    rows = []
    for tokens in batch:
        # one vector per in-vocabulary token: [number of words, dim]
        known = [params.word_vec[vocab[tok]] for tok in tokens if tok in vocab]
        if not known:
            known = [np.ones(params.wvec_dim)]
        # collapse the word axis: [dim]
        rows.append(np.mean(known, axis=0))
    # [batch size, dim]
    return np.vstack(rows)

def weighted_average_full_batch(params, batch, word_probs=None, a=0.01):
    """
    Encode each sentence as a frequency-weighted average of its word vectors.

    Every in-vocabulary word vector is scaled by ``a / (a + p(w))`` before
    being summed, and the sum is divided by the sentence length (all tokens,
    known or not). ``p(w)`` is looked up in ``word_probs``; vocabulary words
    missing from it fall back to ``1 / len(params.word2id)``.

    A sentence with no known word (including an empty sentence, which is
    first replaced by the single token '.') starts from a vector of ones,
    which is then divided by the sentence length like any other sentence.

    :param params: object carrying ``word2id``, ``word_vec`` and ``wvec_dim``
    :param batch: list of token lists
    :param word_probs: ``{word: probability}``; defaults to the module-level
        ``probs`` table loaded from 'word_probabilities.pickle'
    :param a: smoothing constant of the weighting scheme (default 0.01)
    :return: array with one row per sentence and ``params.wvec_dim`` columns
    """
    if word_probs is None:
        # fall back to the corpus-wide table loaded at module level
        word_probs = probs

    batch = [sent if sent != [] else ['.'] for sent in batch]
    vocab = params.word2id
    # Use the vocabulary attached to params rather than a notebook global, and
    # 1.0 so the division is a true division under Python 2 as well.
    average_prob = 1.0 / len(vocab) if vocab else 0.0

    embeddings = []
    for sent in batch:
        sentvec = np.zeros(params.wvec_dim)
        found = False
        for word in sent:
            if word in vocab:
                scale = a / (a + word_probs.get(word, average_prob))
                sentvec = sentvec + params.word_vec[vocab[word]] * scale
                found = True
        # Track hits explicitly: testing np.sum(sentvec) == 0 would also
        # discard genuine embeddings whose components happen to cancel out.
        if not found:
            sentvec = np.ones(params.wvec_dim)
        embeddings.append(sentvec / len(sent))
    return np.vstack(embeddings)

def weighted_average_batch2(params, batch, a=0.001):
    """
    Encode each sentence as a weighted average using in-batch word frequencies.

    Same scheme as the corpus-based variant, except that ``p(w)`` is the
    relative frequency of the (lowercased) word inside ``batch`` itself, as
    computed by ``get_word_probs``. Every in-vocabulary word vector is scaled
    by ``a / (a + p(w))``, the scaled vectors are summed, and the sum is
    divided by the sentence length (all tokens, known or not).

    A sentence with no known word (including an empty sentence, which is
    first replaced by the single token '.') starts from a vector of ones,
    which is then divided by the sentence length like any other sentence.

    :param params: object carrying ``word2id``, ``word_vec`` and ``wvec_dim``
    :param batch: list of token lists
    :param a: smoothing constant of the weighting scheme (default 0.001)
    :return: array with one row per sentence and ``params.wvec_dim`` columns
    """
    batch = [sent if sent != [] else ['.'] for sent in batch]
    word_prob_batch = get_word_probs(batch)
    vocab = params.word2id

    embeddings = []
    for sent in batch:
        sentvec = np.zeros(params.wvec_dim)
        found = False
        for word in sent:
            if word in vocab:
                # get_word_probs lowercases its keys, so look the word up in
                # lowercase; the raw token raised KeyError for cased words.
                p_w = word_prob_batch[word.lower()]
                scale = a / (a + p_w)
                sentvec = sentvec + params.word_vec[vocab[word]] * scale
                found = True
        # Track hits explicitly: testing np.sum(sentvec) == 0 would also
        # discard genuine embeddings whose components happen to cancel out.
        if not found:
            sentvec = np.ones(params.wvec_dim)
        embeddings.append(sentvec / len(sent))
    return np.vstack(embeddings)

# Word probabilities used by weighted_average_full_batch; the file name matches
# the one written by save_word_probs.
probs = load_pickled_data('word_probabilities.pickle')

# Set params for SentEval
# we use logistic regression (usepytorch: False) and kfold 10
# In this dictionary you can add extra information that your model needs for initialization,
# for example the path to a dictionary of indices, or hyperparameters.
# This dictionary is passed to the batcher and the prepare functions.
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
# this is the config for the NN classifier but we are going to use scikit-learn logistic regression with 10 kfold
#usepytorch = False 
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 256,
                                # 'tenacity': 3, 'epoch_size': 2}
# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # weighted_average_full_batch is the batcher, prepare the per-task setup hook
    se = senteval.engine.SE(params_senteval, weighted_average_full_batch, prepare)
    
    # here you define the NLP tasks that your embedding model is going to be evaluated on
    # in (https://arxiv.org/abs/1802.05883) we use the following :
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using logistic regression), 
    # but STS14 (semantic textual similarity) is a similar type of semantic task
    transfer_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC',
                      'MRPC', 'SICKEntailment', 'STS14']
    # senteval prints the results and returns a dictionary with the scores
    results = se.eval(transfer_tasks)
    print(results)

2018-05-30 14:47:48,296 : ***** Transfer task : MR *****


2018-05-30 14:47:48,301 : loading Word2Vec object from skipgram-100d-50e-mincount0.bin
2018-05-30 14:47:48,302 : {'kw': {}, 'mode': 'rb', 'uri': 'skipgram-100d-50e-mincount0.bin'}
2018-05-30 14:47:48,304 : encoding_wrapper: {'errors': 'strict', 'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='skipgram-100d-50e-mincount0.bin'>}
2018-05-30 14:47:48,483 : loading wv recursively from skipgram-100d-50e-mincount0.bin.wv.* with mmap=None
2018-05-30 14:47:48,484 : setting ignored attribute syn0norm to None
2018-05-30 14:47:48,485 : setting ignored attribute cum_table to None
2018-05-30 14:47:48,486 : loaded skipgram-100d-50e-mincount0.bin
2018-05-30 14:47:48,562 : Generating sentence embeddings
2018-05-30 14:47:48,596 : Generated sentence embeddings
2018-05-30 14:47:48,602 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-30 14:47:48,724 : Best param found at split 1: l2reg = 1                 

2018-05-30 14:51:08,690 : loading wv recursively from skipgram-100d-50e-mincount0.bin.wv.* with mmap=None
2018-05-30 14:51:08,691 : setting ignored attribute syn0norm to None
2018-05-30 14:51:08,691 : setting ignored attribute cum_table to None
2018-05-30 14:51:08,692 : loaded skipgram-100d-50e-mincount0.bin
2018-05-30 14:51:08,733 : Computing embedding for train
2018-05-30 14:51:12,936 : Computed train embeddings
2018-05-30 14:51:12,937 : Computing embedding for dev
2018-05-30 14:51:13,055 : Computed dev embeddings
2018-05-30 14:51:13,056 : Computing embedding for test
2018-05-30 14:51:13,484 : Computed test embeddings
2018-05-30 14:51:13,485 : Training sklearn-LogReg with standard validation..
2018-05-30 14:51:20,766 : [('reg:0.25', 68.23), ('reg:0.5', 68.23), ('reg:1', 68.35), ('reg:2', 68.35), ('reg:4', 68.35), ('reg:8', 68.35)]
2018-05-30 14:51:20,767 : Validation : best param found is reg = 1 with score             68.35
2018-05-30 14:51:20,768 : Evaluating...
2018-05-30 14:51:22

{'MR': {'devacc': 58.07, 'acc': 55.69, 'ndev': 74, 'ntest': 74}, 'CR': {'devacc': 71.57, 'acc': 72.03, 'ndev': 3775, 'ntest': 3775}, 'MPQA': {'devacc': 83.24, 'acc': 83.08, 'ndev': 10606, 'ntest': 10606}, 'SUBJ': {'devacc': 83.26, 'acc': 83.22, 'ndev': 10000, 'ntest': 10000}, 'SST2': {'devacc': 68.35, 'acc': 67.27, 'ndev': 872, 'ntest': 1821}, 'TREC': {'devacc': 58.49, 'acc': 61.6, 'ndev': 5452, 'ntest': 500}, 'MRPC': {'devacc': 70.39, 'acc': 70.67, 'f1': 80.46, 'ndev': 4076, 'ntest': 1725}, 'SICKEntailment': {'devacc': 72.2, 'acc': 71.38, 'ndev': 500, 'ntest': 4927}, 'STS14': {'deft-forum': {'pearson': (0.30277773334673425, 5.394608121020304e-11), 'spearman': SpearmanrResult(correlation=0.35728776598497364, pvalue=5.381483819848277e-15), 'nsamples': 450}, 'deft-news': {'pearson': (0.6080544324892407, 1.0175134163520799e-31), 'spearman': SpearmanrResult(correlation=0.5919665363954733, pvalue=9.361698052393679e-30), 'nsamples': 300}, 'headlines': {'pearson': (0.4517861088805761, 5.32821

In [36]:
import os
import pickle

# Persist the SentEval scores so they can be inspected later without
# re-running the evaluation.
results_path = 'results/skipgram_results_batcher_wa001.pickle'
results_dir = os.path.dirname(results_path)
# open() does not create missing directories; make sure the target exists so
# a finished evaluation is not lost to a failed write.
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)
with open(results_path, 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
# Print the test accuracy of every task; tasks whose result dict has no
# 'acc' entry are reported as "no acc".
for k, v in results.items():
    try:
        acc = v['acc']
    except KeyError:
        # Only a missing key is expected here; a bare except would also hide
        # real errors (and swallow KeyboardInterrupt).
        acc = "no acc"
    print(k, acc)

MR 55.69
CR 72.03
MPQA 83.08
SUBJ 83.22
SST2 67.27
TREC 61.6
MRPC 70.67
SICKEntailment 71.38
STS14 no acc


In [38]:
# STS14 has no 'acc' entry; in the run shown here its 'all' entry holds the
# aggregate Pearson / Spearman scores (mean and weighted mean).
print(results['STS14']['all'])

{'pearson': {'mean': 0.5414966993752444, 'wmean': 0.5526072886838456}, 'spearman': {'mean': 0.5633304761672284, 'wmean': 0.5763775657544196}}
