In [33]:
# %load eval.py
#!/usr/bin/env python
'''
| Filename    : eval.py
| Description :
| Author      : Pushpendre Rastogi
| Created     : Thu Dec  1 20:46:46 2016 (-0500)
| Last-Updated: Fri Dec  2 06:27:32 2016 (-0500)
|           By: Pushpendre Rastogi
|     Update #: 23
All learning to rank methods are quite simple, the goal is to learn a model
that can optimize a metric like MAP, or ROC given a few examples. In my case
I can start with a list of things that the true FSDM code returns, thanks to
chenyan's results files, and then rerank those things.
'''
import config
import numpy
from collections import defaultdict, OrderedDict
import cPickle as pkl
import os, itertools
DBPEDIA_PFXLEN = len('http://dbpedia.org/resource/')
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
qid_2_query = {}
with open(config.QUERY_FN) as file_handle:
    for (qid, query_string) in (row.strip().split('\t') for row in file_handle):
        qid_2_query[qid] = query_string.split()

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-##-#-#-#-#
qid_2_fsdm_top_100 = OrderedDict()
with open(config.RANK_SVM_INEX_LD) as file_handle:
    for (qid, _, answer, _serial_no, _score, __) in (
            row.strip().split() for row in file_handle):
        try:
            qid_2_fsdm_top_100[qid].append(answer)
        except KeyError:
            qid_2_fsdm_top_100[qid] = [answer]
            pass
        pass
    qid_2_fsdm_top_100.default_factory = None

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
qid_2_true_answer = defaultdict(list)
with open(config.QRELS_FN) as file_handle:
    for (qid, _, answer, __) in (row.strip().split() for row in file_handle):
        qid_2_true_answer[qid].append(answer)
qid_2_true_answer.default_factory = None
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
class FoldIterator(object):
    def __init__(self, n=5, list_to_fold=None, total=None, return_iterables=False):
        assert not (list_to_fold is None and total is None)
        if total is None:
            total=len(list_to_fold)
        assert total == len(list_to_fold)
        assert total%n == 0, "Can not split %d into %d parts"%(total, n)
        self.n = n
        self.i = 0
        self.d = total / n
        self.total = total
        self.return_iterables = return_iterables
        self.list_to_fold = list(list_to_fold)

    def __iter__(self):
        return self

    def next(self):
        if self.i == self.n:
            raise StopIteration
        else:
            test_start = self.i * self.d
            test_end = test_start + self.d
            if self.list_to_fold is not None:
                if self.return_iterables:
                    return ((self.list_to_fold[e] for e in itertools.chain(xrange(0, test_start), xrange(test_end, self.n))),# Train id
                            (self.list_to_fold[e] for e in xrange(test_start, test_end))) # Test id
                else:
                    return ([self.list_to_fold[e] for e in (range(0, test_start) + range(test_end, self.total))],
                            [self.list_to_fold[e] for e in range(test_start, test_end)])
            else:
                if self.return_iterables:
                    return (itertools.chain(xrange(0, test_start), xrange(test_end, self.n)),# Train id
                            xrange(test_start, test_end)) # Test id
                else:
                    return ((range(0, test_start) + range(test_end, self.total)),
                            range(test_start, test_end))
            pass
        pass


#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
def get_query_vec(query_token_list, mvlsa_word_emb_data):
    # TODO: The real program should do these things automatically. For now
    # I have manually made these changes to the query files.
    # 1. Taking care of apostrophes, remove traling aprostrophes
    # 2. Splitting hyphens
    # 3. Removing other punctuation
    def get_vec(token):
        if token in mvlsa_word_emb_data:
            return mvlsa_word_emb_data[token]
        else:
            return None
        pass
    query_vec = [get_vec(token) for token in query_token_list]
    filtered_vec = [e for e in query_vec if e is not None]
    if len(filtered_vec) > 0:
        return numpy.mean(filtered_vec, axis=0)
    else:
        return numpy.zeros((300,))


def feat_string(query_vec, answer_vec):
    # TODO: Add customization to vary the features used.
    vec = numpy.outer(query_vec[:25], answer_vec[:25]).ravel() * 1000
    assert not numpy.isnan(vec).any()
    return ''.join('%d:%.3e '%(i+1,e) for i,e in enumerate(vec))

def train_model(training_query_ids, true_answer_entities, entities_retrieved_by_fsdm, entity_vec_dict, mvlsa_data, query_id_to_question_tokens,
                train_data_fn = '/tmp/train_data_fn',
                model_fn = '/tmp/model_fn'):
    '''
    --- INPUT ---
    training_query_ids          : [INEX_LD-2009022', ...]
    true_answer_entities        : {'INEX_LD-2009022': ['http://dbpedia.org/resource/Indian_Chinese_cuisine', ...], ...}
    entities_retrieved_by_fsdm  : {'INEX_LD-2009022': ['http://dbpedia.org/resource/National_dish', ...], ...}
    entity_vec_dict             : {'http://dbpedia.org/resource/National_dish': [300d 'float64' array]], ...}
    mvlsa_data                  : {'star': [300d 'float64' array]}
    query_id_to_question_tokens : {'INEX_LD-2009022': ['Szechwan', 'dish', 'food', 'cuisine']}
    '''
    with open(train_data_fn, 'wb') as train_data_f:
        for numeric_qid, qid in enumerate(training_query_ids):
            numeric_qid += 1
            query_vec = get_query_vec(query_id_to_question_tokens[qid], mvlsa_data)
            true_answer_set = set(true_answer_entities[qid])
            for answer in true_answer_set:
                train_data_f.write('1 qid:%d %s\n'%(
                    numeric_qid,
                    feat_string(query_vec,
                                get_query_vec(answer[DBPEDIA_PFXLEN:].strip().split('_'),
                                              mvlsa_data))))

            for answer in entities_retrieved_by_fsdm[qid]:
                if answer not in true_answer_set:
                    train_data_f.write('0 qid:%d %s\n'%(
                        numeric_qid,
                        feat_string(query_vec,
                                    get_query_vec(answer[DBPEDIA_PFXLEN:].split('_'),
                                                  mvlsa_data))))

    # os.system('~/data/svm_rank/svm_rank_learn -c 10 %s %s'%(train_data_fn, model_fn))
    return model_fn

def test_model(testing_query_ids, entities_retrieved_by_fsdm, entity_vec_dict, mvlsa_data, query_id_to_question_tokens, true_answer_entities,
               model_fn = '/tmp/model_fn', test_data_fn='/tmp/test_data_fn', prediction_fn='/tmp/prediction_fn'):
    '''
    --- INPUT ---
    testing_query_ids           :
    entities_retrieved_by_fsdm  :
    entity_vec_dict             :
    mvlsa_data                  :
    query_id_to_question_tokens :
    --- OUTPUT ---
    '''
    with open(test_data_fn, 'wb') as test_data_f:
        for numeric_qid, qid in enumerate(testing_query_ids):
            numeric_qid += 1
            query_vec = get_query_vec(query_id_to_question_tokens[qid], mvlsa_data)
            true_answer_set = set(true_answer_entities[qid])
            for answer in entities_retrieved_by_fsdm[qid]:
                test_data_f.write('%d qid:%d %s\n'%(
                    int(answer in true_answer_set),
                    numeric_qid,
                    feat_string(query_vec,
                                get_query_vec(answer[DBPEDIA_PFXLEN:].split('_'),
                                              mvlsa_data))))
    # os.system('~/data/svm_rank/svm_rank_classify %s %s %s'%(test_data_fn, model_fn, prediction_fn))
    return predictions_fn

def evaluate(test_query_ids, true_answer_entities, predictions):

    print 'MAP', 'P@10', 'P@20'


mvlsa_data = pkl.load(open(config.MVLSA_EMB_PKL_FN, mode="rb"))
# entity_vec_dict = pkl.load(open("kbmvlsa_embedding.pkl", mode="rb"))
entity_vec_dict = defaultdict(float)
for (train_ent, test_ent) in FoldIterator(n=5, list_to_fold=qid_2_fsdm_top_100):
    print(len(train_ent))
    print(len(test_ent))
    model_fn = train_model(train_ent, qid_2_true_answer, qid_2_fsdm_top_100, entity_vec_dict, mvlsa_data, qid_2_query)
    predictions_fn = test_model(test_ent, qid_2_fsdm_top_100, entity_vec_dict, mvlsa_data, qid_2_query, qid_2_true_answer,
                             model_fn=model_fn)
    import pdb
    pdb.set_trace()
    print(evaluate(test_ent, qid_2_true_answer, predictions_fn))


80
20


NameError: global name 'predictions_fn' is not defined