In [36]:
from rte.riedel.fever_features import TermFrequencyFeatureFunction
from retrieval.fever_doc_db import FeverDocDB
from common.dataset.data_set import DataSet
from retrieval.sentence import FEVERSentenceFormatter
from common.dataset.reader import JSONLineReader
from rte.riedel.data import FEVERGoldFormatter, FEVERLabelSchema
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Handle to the FEVER document database at this path; it is later passed to
# the feature function's constructor.
db = FeverDocDB("data/fever/fever.db")
# Reader and formatter shared by the train and dev DataSet objects.
jlr = JSONLineReader()
# The formatter is built from an empty set and a fresh label schema.
# NOTE(review): the role of the empty-set argument is not visible in this
# notebook -- confirm against FEVERGoldFormatter.
formatter = FEVERGoldFormatter(set(),FEVERLabelSchema())

In [21]:
# Training split: same reader and formatter as the dev split below, only the
# input file differs.
train_ds = DataSet(
    file="data/fever/train.ns.pages.p{0}.jsonl".format(1),
    reader=jlr,
    formatter=formatter,
)

# Development split.
dev_ds = DataSet(
    file="data/fever/dev.ns.pages.p{0}.jsonl".format(1),
    reader=jlr,
    formatter=formatter,
)

In [22]:
# Load both splits; their `.data` attributes are consumed further down when
# the feature function is informed.
# NOTE(review): presumably read() parses the JSONL file and populates `.data`
# -- confirm in DataSet.
train_ds.read()
dev_ds.read()

In [39]:
class XTermFrequencyFeatureFunction(TermFrequencyFeatureFunction):
    """Feature function that scores each claim against its paired text.

    Unlike the parent class, ``process`` returns only the TF-IDF cosine
    similarity between every item's claim and its ``"text"`` field.

    Relies on ``self.claims`` and ``self.tfidf_vectorizer`` supplied by the
    parent class.
    NOTE(review): the parent is not visible here; presumably ``inform`` must
    be called first so that ``tfidf_vectorizer`` is fitted -- confirm.
    """

    def texts(self, data):
        """Return the ``"text"`` field of every item in ``data``, in order."""
        return [item["text"] for item in data]

    def process(self, data):
        """Compute the claim/text TF-IDF cosine similarity for each item.

        Args:
            data: sequence of dict-like items. Each item must provide a
                ``"text"`` key plus whatever ``self.claims`` reads from it.
                ``data`` is traversed twice, so it must be re-iterable.

        Returns:
            numpy.ndarray: one row per item, each row being
            ``cosine_similarity(claim_vector, text_vector)[0]``
            (presumably shape ``(len(data), 1)`` -- confirm).
        """
        # Only the TF-IDF representations feed the cosine, so no bag-of-words
        # or term-frequency matrices are built here.
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))
        body_tfidf = self.tfidf_vectorizer.transform(self.texts(data))

        # Pair the matrices row by row: claim i is compared only with text i.
        return np.array([
            cosine_similarity(claim_vec, body_vec)[0]
            for claim_vec, body_vec in zip(claim_tfidf, body_tfidf)
        ])

    
# Build the feature function on top of the document database, then show it
# the train and dev data.
# NOTE(review): inform() is inherited and not visible here; presumably it fits
# the vectorizers that process() relies on -- confirm.
tf = XTermFrequencyFeatureFunction(db)
tf.inform(train_ds.data, dev_ds.data)

In [41]:
def tf_idf_sim(claim, lines):
    """Score ``claim`` against each entry of ``lines``.

    Every line is paired with the same claim as a
    ``{"claim": ..., "text": ...}`` record, the batch is passed to the
    module-level ``tf.lookup``, and the result is flattened to a plain list.

    Args:
        claim: the claim text, repeated in every record.
        lines: iterable of candidate texts to compare against the claim.

    Returns:
        list: the flattened output of ``tf.lookup`` for the batch, in input
        order; ``[]`` when ``lines`` yields nothing.
    """
    pairs = [{"claim": claim, "text": line} for line in lines]

    # Nothing to score: return early rather than handing a zero-row batch to
    # the feature function.
    if not pairs:
        return []

    return tf.lookup(pairs).reshape(-1).tolist()

# Smoke test: score one claim against a near-duplicate sentence and against an
# unrelated one; the near-duplicate should get the higher score.
tf_idf_sim("This is at test",["This is a test","A completely different claim about something that test is random and unrelated"])

[1.0, 0.5254255747637875]