# SentEval usage example

* Clone repo from FAIR github
```
    git clone https://github.com/facebookresearch/SentEval.git
    cd SentEval/
```
* Dependencies:
    * Python 2/3 with NumPy/SciPy
    * Pytorch
    * scikit-learn>=0.18.0

* Install senteval
```
    python setup.py install
```
* Download datasets (it takes some time...)
    * these are downstream tasks
    * new Senteval also has probing tasks (https://github.com/facebookresearch/SentEval/tree/master/data/probing) for evaluating linguistic properties of your embeddings. 
```
    cd data/downstream/
    ./get_transfer_data.bash
```
* Download pretained Glove embeddings:

```
    mkdir pretrained
    cd pretrained
    wget http://nlp.stanford.edu/data/glove.840B.300d.zip
   
```

* The following code evaluates Glove pretrained embeddings on different NLP downstream tasks.

In [1]:
import io

# Create dictionary
def create_dictionary(sentences, threshold=0):
    words = {}
    for s in sentences:
        for word in s:
            words[word] = words.get(word, 0) + 1

    if threshold > 0:
        newwords = {}
        for word in words:
            if words[word] >= threshold:
                newwords[word] = words[word]
        words = newwords
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2

    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id


# Get word vectors from vocabulary (glove, word2vec, fasttext ..)
def get_wordvec(path_to_vec, word2id):
    word_vec = {}

    with io.open(path_to_vec, 'r', encoding='utf-8') as f:
        # if word2vec or fasttext file : skip first line "next(f)"
        for line in f:
            word, vec = line.split(' ', 1)
            if word in word2id:
                word_vec[word] = np.fromstring(vec, sep=' ')

    logging.info('Found {0} words with word vectors, out of \
        {1} words'.format(len(word_vec), len(word2id)))
    return word_vec

In [8]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn

# Set PATHs
# path to senteval
PATH_TO_SENTEVAL = 'SentEval'
# path to the NLP datasets 
PATH_TO_DATA = 'SentEval/data/'
# path to glove embeddings
PATH_TO_VEC = 'SentEval/data/downstream/pretrained/glove.840B.300d.txt'


# import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval


def prepare(params, samples):
    """
    In this example we are going to load Glove, 
    here you will initialize your model.
    remember to add what you model needs into the params dictionary
    """
    _, params.word2id = create_dictionary(samples)
    # load glove/word2vec format 
    params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
    # dimensionality of glove embeddings
    params.wvec_dim = 300
    return

def batcher(params, batch):
    """
    In this example we use the average of word embeddings as a sentence representation.
    Each batch consists of one vector for sentence.
    Here you can process each sentence of the batch, 
    or a complete batch (you may need masking for that).
    
    """
    # if a sentence is empty dot is set to be the only token
    # you can change it into NULL dependening in your model
    batch = [sent if sent != [] else ['.'] for sent in batch]
    embeddings = []

    for sent in batch:
        sentvec = []
        # the format of a sentence is a lists of words (tokenized and lowercased)
        for word in sent:
            if word in params.word_vec:
                # [number of words, embedding dimensionality]
                sentvec.append(params.word_vec[word])
        if not sentvec:
            vec = np.zeros(params.wvec_dim)
            # [number of words, embedding dimensionality]
            sentvec.append(vec)
        # average of word embeddings for sentence representation
        # [embedding dimansionality]
        sentvec = np.mean(sentvec, 0)
        embeddings.append(sentvec)
    # [batch size, embedding dimensionality]
    embeddings = np.vstack(embeddings)
    return embeddings


# Set params for SentEval
# we use logistic regression (usepytorch: Fasle) and kfold 10
# In this dictionary you can add extra information that you model needs for initialization
# for example the path to a dictionary of indices, of hyper parameters
# this dictionary is passed to the batched and the prepare fucntions
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
# this is the config for the NN classifier but we are going to use scikit-learn logistic regression with 10 kfold
# usepytorch = False 
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    
    # here you define the NLP taks that your embedding model is going to be evaluated
    # in (https://arxiv.org/abs/1802.05883) we use the following :
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using logistic regression), 
    # but STS14 (semantic textual similarity) is a similar type of semantic task
    transfer_tasks = ['MR', 'CR', 'MPQA', 'SUBJ']
    for task in transfer_tasks:
        # senteval prints the results and returns a dictionary with the scores
        try:
            results = se.eval([task])
        except:
            print("Failed: ", task)
            continue
    print(results)

2018-05-21 13:28:34,219 : ***** Transfer task : MR *****


2018-05-21 13:29:54,241 : Found 18490 words with word vectors, out of         20328 words
2018-05-21 13:29:54,278 : Generating sentence embeddings
2018-05-21 13:29:55,557 : Generated sentence embeddings
2018-05-21 13:29:55,560 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-21 13:31:01,937 : Best param found at split 1: l2reg = 1                 with score 78.15
2018-05-21 13:32:08,339 : Best param found at split 2: l2reg = 2                 with score 78.04
2018-05-21 13:33:15,008 : Best param found at split 3: l2reg = 1                 with score 78.01
2018-05-21 13:34:20,844 : Best param found at split 4: l2reg = 1                 with score 77.96
2018-05-21 13:35:26,332 : Best param found at split 5: l2reg = 2                 with score 78.18
2018-05-21 13:36:33,329 : Best param found at split 6: l2reg = 4                 with score 77.9
2018-05-21 13:37:40,443 : Best param found at split 7: l2reg = 

{'SUBJ': {'acc': 91.69, 'devacc': 91.77, 'ndev': 10000, 'ntest': 10000}}
