In [2]:
import re
import os
import math
import copy
import operator
from collections import defaultdict
from datetime import datetime
from elasticsearch import Elasticsearch, RequestsHttpConnection

In [10]:
# Elasticsearch connection settings. The credentials are read from the
# environment so real secrets never have to be saved with the notebook; the
# fallbacks are the values that used to be hard-coded here.
ELASTIC_USER = os.environ.get("ELASTIC_USER", "elastic")
ELASTIC_PWD = os.environ.get("ELASTIC_PWD", "changeme")
ELASTIC_INDEX = "ap_dataset"
ELASTIC_TYPE = "document"
# Query tokens whose document frequency is at or above this value are dropped.
DF_THRESHOLD = 7000
# Analyzed tokens that are always removed from a query.
# DELETE_LIST = ["document"]
DELETE_LIST = [
    "document", "discuss", "type", "identifi", "predict", "cite", "describ",
    "determin"
]

In [4]:
class Term:
    """Value holder for one term: its text plus the tf, df and ttf numbers."""

    def __init__(self, term, tf, df, ttf):
        # The four arguments are stored untouched under the same names.
        self.term, self.tf, self.df, self.ttf = term, tf, df, ttf
        
class Document:
    """A document (id and length) together with the term objects attached to it."""

    def __init__(self, id, dlen):
        self.id = id
        self.dlen = dlen
        self.terms = []

    def add_word(self, term_obj):
        """Append one term object to this document's term list."""
        self.terms.append(term_obj)

    def add_words(self, doc_words):
        """
        Append one Term per entry of doc_words.
        @param doc_words: An iterable of dicts with 'term', 'tf' and 'df' keys
                          and an optional 'ttf' key
        """
        for word in doc_words:
            # Term takes four arguments; entries without a 'ttf' key get None
            # so the missing statistic stays visible instead of raising.
            self.add_word(Term(word['term'], word['tf'], word['df'], word.get('ttf')))
                
                
                
class Query:
    """
    A tokenized query plus every document that contains at least one token.

    Building a Query immediately asks the ES helper for the matching documents
    (stored in self.docs, keyed by document id) and counts how often each token
    occurs in the query (stored in self.query_tf).
    """

    def __init__(self, id, es_helper, tokens=None):
        self.es_helper = es_helper
        self.id = id
        # None is treated as an empty query so the two steps below can iterate.
        self.tokens = tokens if tokens is not None else []
        self.update_docs()
        self.calc_query_tf()

    def calc_query_tf(self):
        """Count the occurrences of every token in self.tokens into self.query_tf."""
        query_tf = defaultdict(int)
        for word in self.tokens:
            query_tf[word] += 1
        self.query_tf = query_tf

    def update_docs(self):
        """
        Rebuild self.docs from self.tokens.

        For every token, each document returned by es_helper.search_term gets
        the token's Term (from es_helper.get_tf_df) added; a token that occurs
        several times in the query is added once per occurrence.
        """
        doc_list = {}
        for token in self.tokens:
            for doc in self.es_helper.search_term(token):
                doc_id = doc['id']
                # Direct dict lookup; no need to materialize the key list per hit.
                doc_obj = doc_list.get(doc_id)
                if doc_obj is None:
                    doc_obj = doc_list[doc_id] = Document(doc_id, doc['dlen'])
                doc_obj.add_word(self.es_helper.get_tf_df(doc_id, token))
        self.docs = doc_list

In [13]:
class ESHelpers:
    """ Utility Class with all ES helper functions """
    def __init__(self):
        """
        Connects to Elasticsearch and caches the collection statistics
        (avg_dlen, doc_count, sum_ttf, vocab_size) as attributes.
        """
        self.es = Elasticsearch(connection_class=RequestsHttpConnection, http_auth=(ELASTIC_USER, ELASTIC_PWD))
        self.get_avg_dlen()
        self.total_docs()
        self.get_sum_ttf()
        self.get_vocab_size()

    def get_tokens(self, query):
        """
        Creates tokens for a query.
        @param: A query that needs to be tokenized
        The query is analyzed with the "my_english" analyzer; tokens in the
        DELETE LIST and tokens whose Document Frequency is not below the
        threshold are removed. Order and duplicates of the rest are kept.
        """
        analyzed = self.es.indices.analyze(index=ELASTIC_INDEX, analyzer="my_english", body={"text": query})["tokens"]
        tokens = []
        for entry in analyzed:
            word = entry["token"]
            # Check the delete list first so no count request is spent on a
            # token that would be dropped anyway.
            if word in DELETE_LIST:
                continue
            if self.get_df(word) < DF_THRESHOLD:
                tokens.append(word)
        return tokens

    def get_avg_dlen(self):
        """ Gets the average document length from elasticsearch and stores it in self.avg_dlen """
        query_aggs = {
          "query": {
            "match_all": {}
          },
          "aggs": {
            "doc_lengths": {
              "stats": {
                "script": {
                  "lang": "groovy",
                  "inline": "_doc['doc_length']"
                }
              }
            }
          }
        }
        self.avg_dlen = self.es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body=query_aggs)["aggregations"]['doc_lengths']['avg']

    def get_vocab_size(self):
        """ Gets the vocabulary size (cardinality of the "text" field) and stores it in self.vocab_size """
        query_cardinality = {
          "size": 0,
          "aggs": {
            "vocabSize": {
              "cardinality": {
                "field": "text"
              }
            }
          }
        }
        self.vocab_size = self.es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body=query_cardinality)["aggregations"]['vocabSize']['value']

    def get_sum_ttf(self):
        """ Gets the sum of total tf from elasticsearch and stores it in self.sum_ttf """
        self.sum_ttf = self.es.field_stats(index=ELASTIC_INDEX, fields="text")['indices']['_all']['fields']['text']['sum_total_term_freq']

    def total_docs(self):
        """ Gets the total number of documents in elasticsearch and stores it in self.doc_count """
        self.doc_count = self.es.field_stats(index=ELASTIC_INDEX, fields=['doc_length'])['indices']['_all']['fields']['doc_length']['doc_count']

    def get_all_tv(self, doc, body=None):
        """ Gets all the TermVectors for a particular document.
            @param doc: A document Id
            @param body: Optional request body; it is only sent when truthy
            @return: The 'terms' mapping of the "text" field
        """
        request = dict(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, id=doc, positions=False, payloads=False, offsets=False, term_statistics=True)
        if body:
            request["body"] = body
        return self.es.termvectors(**request)['term_vectors']['text']['terms']

    def get_tf_df(self, doc, term):
        """
        Get TF and DF values for the given term in the given document
        @param term: A term
        @param doc: A document Id
        @return: A Term holding term_freq, doc_freq and ttf of the term.
        Raises KeyError when the term is not in the document's term vector.
        """
        tv_doc = self.get_all_tv(doc)[term]
        return Term(term, tv_doc['term_freq'], tv_doc['doc_freq'], tv_doc['ttf'])

    def _term_statistic(self, field_name, term, method):
        """
        Runs a script_fields search evaluating _index['text'][term].<method>()
        and returns the value reported for the first hit.
        The term travels as a script parameter, never as script source, so
        quotes in a term cannot break the script and the script text stays
        the same for every term.
        """
        qry = {
            "script_fields": {
                field_name: {
                    "script": {
                        "lang": "groovy",
                        "inline": "_index['text'][term].%s()" % method,
                        "params": {"term": term}
                    }
                }
            }
        }
        return self.es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body=qry)['hits']['hits'][0]['fields'][field_name][0]

    def get_df_2(self, term):
        """
        Get the Document Frequency for the given term
        @param term: term to get DF
        Raises IndexError when the search returns no hit.
        """
        return self._term_statistic("df", term, "df")

    def get_ttf(self, term):
        """
        Get the Total Term Frequency for the given term
        @param term: term to get TTF
        Raises IndexError when the search returns no hit.
        """
        return self._term_statistic("ttf", term, "ttf")

    def get_df(self, term):
        """
        Get the Document Frequency for the given term. Different implementation:
        counts the documents matching an exact term query on "text".
        @param term: term to get DF
        """
        return self.es.count(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body={"query": {"term": {"text": term}}})['count']

    def search_term(self, term, size=9000):
        """
        Find all documents containing the given term. Performs an exact match.
        @param term: A term to search in ES
        @param size: Maximum number of hits to request (default 9000)
        @return: A list of {"id": ..., "dlen": ...} dicts, one per hit
        """
        search_qry = {
                      "query": {
                        "term": {
                          "text": term
                          }
                      }
                    }
        res = self.es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, _source=["_id", "doc_length"], size=size, body=search_qry)
        return [{"id": hit['_id'], "dlen": hit['_source']['doc_length']} for hit in res['hits']['hits']]

    def search(self, query):
        """
        Performs a search on ES with the given query
        @param query: A query to perform search
        """
        return self.es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body=query)

    def get_all_tv_multi(self, doc_ids, body=None):
        """
        Gets all the TermVectors from multiple documents
        @param doc_ids: A list of document Ids to fetch the termvectors
        @param body: Optional request body; it is only sent when truthy
        @return: The 'docs' list of the response
        """
        request = dict(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, offsets="false", payloads="false", positions="false", term_statistics="true", field_statistics="true", ids=doc_ids)
        if body:
            request["body"] = body
        return self.es.mtermvectors(**request)['docs']

    def get_multi_tf(self, term, doc_ids):
        """
        Get the TF for given term from the given document IDs
        @param term: A term to get TF
        @param doc_ids: A list of document Ids to fetch TF
        @return: A list of {"id", "term", "tf", "df"} dicts, one per document.
        Raises KeyError when a returned document does not contain the term.
        """
        res = self.get_all_tv_multi(doc_ids)
        out = []
        for doc in res:
            stats = doc['term_vectors']['text']['terms'][term]
            out.append({"id": doc['_id'], "term": term, "tf": stats['term_freq'], "df": stats['doc_freq']})
        return out

    def get_significant_terms(self, term):
        """
        Runs a significant_terms aggregation on "text" over the documents
        containing the given term.
        @param term: A term to find the significant terms for
        @return: The aggregation's bucket list
        """
        sign_term_qry = {
                        "query" : {
                            "term" : {"text" : term}
                        },
                        "aggregations" : {
                            "sign_term" : {
                                "significant_terms" : {
                                  "field" : "text"
                                }
                            }
                        },
                        "size": 0
                    }
        return self.es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body=sign_term_qry)['aggregations']['sign_term']['buckets']

In [14]:
class Scoring:
    """
        A class to load the queries, and create the initial dataset from ES.
        then performs multiple scoring techniques with different models.

        Scores are kept in self.results as {model: {query id: {doc id: score}}}.
    """
    def __init__(self, queries=None):
        """
        @param queries: Optional list of Query objects to score
        """
        self.es_helper = ESHelpers()
        self.avgdlen = self.es_helper.avg_dlen
        self.num_docs = self.es_helper.doc_count
        self.vocab_size = self.es_helper.vocab_size
        self.sum_ttf = self.es_helper.sum_ttf
        # Always define the attribute so the process* methods can run even
        # when no queries were passed and load_queries() was not called yet.
        self.queries = queries if queries is not None else []
        self.results = {}

    def load_queries(self, file):
        """
        Load the queries from the given file and tokenize them
        @param file: A file name to fetch the queries from
        Every line is expected to look like "<number><separator><text>";
        lines that do not match (e.g. blank lines) are skipped.
        """
        queries = []
        with open(file, 'r') as f:
            for line in f:
                reg_match = re.match(r'^(\d+).(.*)', line)
                if reg_match is None:
                    continue
                tokens = self.es_helper.get_tokens(reg_match.group(2).strip())
                queries.append(Query(reg_match.group(1).strip(), self.es_helper, tokens))
        self.queries = queries

    def logTF(self, tf):
        """
        Calculates the log of the given TF value
        @param tf: A Term Frequency value
        """
        return math.log(tf)

    def okapiTF(self, tf, dlen, avgdlen):
        """
        Calculates the okapiTF value from the given parameters.
        @param tf: Term Frequency
        @param dlen: Document Length
        @param avgdlen: Average document length in ES
        """
        return tf / (tf + 0.5 + 1.5 * (dlen/avgdlen))

    def processVS(self, model="okapi", l = 0.5):
        """
        Scores every document of every query with a vector-space model and
        stores the scores in self.results[model].
        @param model: A retrieval model to use.
        Options: okapi, tfidf, bm25
        @param l: Unused; kept so processVS and processLM share a signature
        Raises ValueError for an unknown model name.
        """
        if model not in ("okapi", "tfidf", "bm25"):
            raise ValueError("Unknown vector-space model: %r" % (model,))
        self.results[model] = {}
        for query in self.queries:
            res = {}
            for doc in query.docs.values():
                sum_otf = 0.0
                for term in doc.terms:
                    if model == "okapi":
                        sum_otf += self.okapiTF(term.tf, doc.dlen, self.avgdlen)
                    elif model == "tfidf":
                        sum_otf += self.tfidf(term.tf, term.df, doc.dlen, self.avgdlen, self.num_docs)
                    else:
                        sum_otf += self.bm25(term.tf, term.df, query.query_tf[term.term], doc.dlen, self.avgdlen, self.num_docs)
                res[doc.id] = sum_otf
            self.results[model][query.id] = res

    def processLM(self, model='laplace', l=0.5):
        """
        Scores every document of every query with a unigram language model and
        stores the scores in self.results[model].
        @param model: A language model to use.
        Options: laplace, jmercer
        @param l: Weight of the document model, used by jmercer only
        Query tokens that a document does not contain are scored with tf = 0.
        Raises ValueError for an unknown model name.
        """
        if model not in ("laplace", "jmercer"):
            raise ValueError("Unknown language model: %r" % (model,))
        self.results[model] = {}
        for query in self.queries:
            res = {}
            df = {}
            ttf = {}
            # Collection statistics for the zero-tf placeholder terms; one
            # lookup per distinct token is enough.
            for token in query.tokens:
                if token not in df:
                    df[token] = self.es_helper.get_df(token)
                    ttf[token] = self.es_helper.get_ttf(token)

            for doc in query.docs.values():
                # Score against a local list so the Document objects shared
                # with the other models are never modified.
                terms = list(doc.terms)
                present = set(term.term for term in terms)
                for token in query.tokens:
                    if token not in present:
                        terms.append(Term(token, 0, df[token], ttf[token]))
                        present.add(token)
                sum_otf = 0.0
                for term in terms:
                    if model == "laplace":
                        sum_otf += self.ulm_laplace(term.tf, doc.dlen)
                    else:
                        sum_otf += self.ulm_mercer(term.tf, term.ttf, doc.dlen, l)
                res[doc.id] = sum_otf
            self.results[model][query.id] = res

    @staticmethod
    def _format_ranking(qId, scores, limit):
        """Returns the 'qId Q0 docId rank score Exp' lines of the top `limit` documents, best first."""
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:limit]
        return [
            str(qId) + " " + "Q0" + " " + str(dId) + " " + str(dRank) + " " + str(dScore) + " " + "Exp"
            for dRank, (dId, dScore) in enumerate(ranked, start=1)
        ]

    def output_results(self, model, limit):
        """
        Extract the final list with new values
        @param model: The model whose results should be listed
        @param limit: Maximum number of documents listed per query
        @return: A list of result lines, grouped by query
        """
        out = []
        for qId, scores in self.results[model].items():
            out.extend(self._format_ranking(qId, scores, limit))
        return out

    def write_file(self, model, fname, limit=1000):
        """
        Appends the result lines of the given model to a file.
        @param model: The model whose results should be written
        @param fname: The file to append to
        @param limit: Maximum number of documents written per query
        """
        with open(fname, 'a') as outFile:
            for data in self.output_results(model, limit):
                # Text mode already translates "\n" into the platform's line
                # ending; writing os.linesep here would double the "\r".
                outFile.write(data + "\n")

    def tf_doc_query(self, doc, query, tfidf=False):
        """
        Sums the okapi TF of every query word for one document.
        @param doc: A Document whose terms carry the tf/df of the words it contains
        @param query: An iterable of query words
        @param tfidf: When True the tfidf score is summed instead of the okapi TF
        Words that are not among doc.terms contribute nothing.
        """
        by_word = {}
        for term in doc.terms:
            by_word.setdefault(term.term, term)
        sumtf = 0.0
        for word in query:
            term = by_word.get(word)
            if term is None:
                continue
            if tfidf:
                sumtf += self.tfidf(term.tf, term.df, doc.dlen, self.avgdlen, self.num_docs)
            else:
                sumtf += self.okapiTF(term.tf, doc.dlen, self.avgdlen)
        return sumtf

    def tfidf(self, tf, df, dlen, avgdlen, num_docs):
        """
        Calculates okapiTF(tf, dlen, avgdlen) * log(num_docs / df).
        @param tf: Term Frequency
        @param df: Document Frequency
        @param dlen: Document Length
        @param avgdlen: Average document length in ES
        @param num_docs: Total number of documents in ES
        """
        temp = self.okapiTF(tf, dlen, avgdlen)
        return temp * math.log(num_docs/df)

    def bm25(self, tf, df, q_tf, dlen, avgdlen, num_docs, k1=1.2, k2=0.5, b=0.75):
        """
        Calculates the BM25 score of one term for one document.
        @param tf: Term Frequency in the document
        @param df: Document Frequency
        @param q_tf: Term Frequency in the query
        @param dlen: Document Length
        @param avgdlen: Average document length in ES
        @param num_docs: Total number of documents in ES
        @param k1, k2, b: BM25 constants
        """
        return (math.log((num_docs+0.5)/(df+0.5)) * ((tf + k1*tf)/(tf+k1 * ((1-b) + b * dlen/avgdlen))) * (q_tf + k2*q_tf)/(q_tf + k2))

    def ulm_laplace(self, tf, dlen):
        """
        Calculates log((tf + 1) / (dlen + vocabulary size)).
        @param tf: Term Frequency
        @param dlen: Document Length
        """
        return math.log((tf + 1) /(dlen + self.vocab_size))

    def ulm_mercer(self, tf, ttf, dlen, l=0.5):
        """
        Calculates log(l * tf/dlen + (1-l) * (ttf - tf)/(sum_ttf - dlen)).
        @param tf: Term Frequency
        @param ttf: Total Term Frequency
        @param dlen: Document Length
        @param l: Weight of the document part
        Raises ZeroDivisionError when dlen is 0.
        """
        return math.log(l * tf/dlen + (1-l) * (ttf - tf)/(self.sum_ttf - dlen))

    def normalize(self):
        """
        Rescales the scores of every model in place to (score - min) / (max - min),
        where min and max are taken over all queries of that model.
        Prints max - min for each model. A model without any score is skipped;
        when all scores of a model are equal they all become 0.0.
        """
        for model, per_query in self.results.items():
            all_scores = [v for docs in per_query.values() for v in docs.values()]
            if not all_scores:
                continue
            model_min = min(all_scores)
            model_diff = max(all_scores) - model_min
            print(model_diff)
            # Each query's own score dict is rescaled.
            for docs in per_query.values():
                for k, v in docs.items():
                    docs[k] = (v - model_min)/model_diff if model_diff else 0.0

    def metasearch(self, limit=1000):
        """
        Adds up the scores of all models per query and document.
        @param limit: Maximum number of documents listed per query
        @return: A list of result lines, grouped by query
        A document missing from a model's results adds nothing for that model.
        """
        docs = defaultdict(lambda: defaultdict(float))
        for per_query in self.results.values():
            for qId, d in per_query.items():
                for docId, val in d.items():
                    docs[qId][docId] += val
        out = []
        for qId, d in docs.items():
            out.extend(self._format_ranking(qId, d, limit))
        return out

    def write_meta_file(self, fname, limit=1000):
        """
        Appends the metasearch result lines to a file.
        @param fname: The file to append to
        @param limit: Maximum number of documents written per query
        """
        with open(fname, 'a') as outFile:
            for data in self.metasearch(limit):
                outFile.write(data + "\n")

In [13]:
es_helper = ESHelpers()
# Load all the queries: one "<number><separator><text>" query per line
queries = []
with open('./IR_DATA/AP_DATA/query_desc.51-100.short.txt', 'r') as f:
    for line in f:
        reg_match = re.match(r'^(\d+).(.*)', line)
        # Blank or malformed lines do not match; skip them instead of
        # failing on reg_match.group().
        if reg_match is None:
            continue
        tokens = es_helper.get_tokens(reg_match.group(2).strip())
        queries.append(Query(reg_match.group(1).strip(), es_helper, tokens))

In [54]:
# Build the Scoring object for the loaded queries; its constructor creates its
# own ESHelpers instance and copies the collection statistics from it.
scoring = Scoring(queries)

In [55]:
# Model names handled by Scoring.processVS (vs_ops) and Scoring.processLM (lm_ops)
vs_ops = ['okapi', 'tfidf', 'bm25']
lm_ops = ['laplace', 'jmercer']

In [56]:
# Score every query with each vector-space model; the scores are kept in
# scoring.results. The export to a file is currently disabled.
for op in vs_ops:
    scoring.processVS(op)
#     scoring.write_file( op, op + '.1000' )

In [None]:
# Score every query with each language model, passing l=0.95 (only the jmercer
# scorer uses it); the scores are kept in scoring.results. The export to a
# file is currently disabled.
for op in lm_ops:
    scoring.processLM(op, 0.95)
#     scoring.write_file( op, op + '.1000' )

In [29]:
# Normalize the scores of every model in scoring.results in place; one value
# (max - min of the model's scores) is printed per model.
scoring.normalize()

41.96705542298496
125.01114282815296
9.430759340738948


In [32]:
# Append the ranking built from the summed per-model scores to meta.1000
# (the file is opened in append mode, so re-running adds the lines again).
scoring.write_meta_file("meta.1000")

In [57]:
# Preview only the first lines of the tfidf ranking; the full list holds up to
# 1000 lines per query and would flood the cell output.
scoring.output_results('tfidf', 1000)[:20]

['60 Q0 AP890728-0243 1 21.228677424342735 Exp',
 '60 Q0 AP890217-0179 2 19.280966018275176 Exp',
 '60 Q0 AP890427-0283 3 19.110128659317255 Exp',
 '60 Q0 AP891026-0273 4 18.451182009758252 Exp',
 '60 Q0 AP890404-0214 5 18.261555255111475 Exp',
 '60 Q0 AP890217-0192 6 18.052259023777957 Exp',
 '60 Q0 AP890420-0248 7 17.33345291713348 Exp',
 '60 Q0 AP890720-0266 8 17.31928672176114 Exp',
 '60 Q0 AP890330-0253 9 17.294706598555766 Exp',
 '60 Q0 AP891122-0305 10 17.289775738403787 Exp',
 '60 Q0 AP890407-0313 11 17.285131550279477 Exp',
 '60 Q0 AP890627-0213 12 17.097882992657713 Exp',
 '60 Q0 AP890329-0228 13 17.03672053209403 Exp',
 '60 Q0 AP890727-0293 14 16.876336852427897 Exp',
 '60 Q0 AP891215-0101 15 16.6985679609707 Exp',
 '60 Q0 AP890421-0218 16 16.654194584491663 Exp',
 '60 Q0 AP891215-0276 17 16.47011961149506 Exp',
 '60 Q0 AP890413-0281 18 16.38411550913143 Exp',
 '60 Q0 AP890614-0272 19 16.273246203417504 Exp',
 '60 Q0 AP890113-0260 20 16.16265917875024 Exp',
 '60 Q0 AP890524-

In [122]:
# Getting distinct terms from each document and re-running the entire scoring model
# Only terms with a document frequency between 100 and 1000 are returned.
filter_qry = {
      "filter" : {
          "min_doc_freq" : 100,
          "max_doc_freq" : 1000
        }
    }

for query in queries:
    # Track what is already part of the query so each expansion term is added
    # once; repeats would inflate the query tf and trigger repeated searches.
    seen = set(query.tokens)
    new_terms = []
    for doc in query.docs.keys():
        for term in es_helper.get_all_tv(doc, filter_qry):
            if term not in seen:
                seen.add(term)
                new_terms.append(term)
    query.tokens = query.tokens + new_terms
    query.update_docs()
    query.calc_query_tf()

GET http://localhost:9200/ap_dataset/document/_search?_source=_id%2Cdoc_length&size=13230 [status:500 request:0.014s]


TransportError: TransportError(500, 'search_phase_execution_exception', 'Result window is too large, from + size must be less than or equal to: [10000] but was [13230]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.')

In [34]:
# Re-query the average document length; the value is stored on
# es_helper.avg_dlen (the call itself returns nothing).
es_helper.get_avg_dlen()

In [None]:
# Recreate the helper; its constructor connects to Elasticsearch and fetches
# the collection statistics again.
es_helper = ESHelpers()

In [14]:
es_helper_sign = ESHelpers()
#Significant Terms Query
# Build new Query objects for the expanded queries. A shallow copy of the list
# would still share the Query objects, so expanding them would also change
# `queries` and everything already scored from it.
sign_queries = []
for query in queries:
    new_terms = []
    for token in query.tokens:
        buckets = es_helper_sign.get_significant_terms(token)
        in_range = [bucket['key'] for bucket in buckets if 100 < bucket['doc_count'] < 500]
        # NOTE(review): the first in-range bucket is dropped, presumably to
        # skip the token itself -- confirm it is always the first one.
        new_terms += in_range[1:]
    # Query.__init__ fetches the documents and the query tf for the new tokens.
    sign_queries.append(Query(query.id, es_helper_sign, query.tokens + new_terms))
    print(new_terms)

['ziyang', 'racket', 'zhao', 'democraci', 'xiaop', 'affair', 'case', 'feder', 'sale']
['wound', 'hospit', 'stab', 'feet', 'search']
[]
[]
['murder', 'shot', 'investig', 'racial', 'kidnap', 'beirut', 'obeid', 'captor', 'captiv']
['noriega', 'kabul', 'insurg']
['semiautomat', 'stockton', 'pistol', 'automat', 'grenad']
['ortega', 'gesel']
['higher', 'billion', 'stock', 'cent', 'railwai', 'train', 'railwai', 'train']
['research', 'scientist', 'studi', 'laboratori', 'wai', 'test', 'develop', 'habitat', 'fish', 'bird', 'biologist', 'environmentalist', 'environment', 'endang', 'valdez']
['investig', 'camp', 'shelter', 'negoti', 'nasa', 'shuttl', 'liftoff', 'spacecraft', 'satellit', 'astronaut', 'liftoff', 'nasa', 'space', 'payload', 'astronaut', 'shuttl', 'launch']
[]
['ibm']
['ls', 'regulatori', 'insolv', 'deposit', 'technolog', 'panama', 'technolog', 'comput', 'electron']
['file', 'appeal', 'lawsuit', 'benchmark', 'petroleum', 'barrel', 'crude', 'intermedi', 'cartel', 'unlead']
['ortega', '

In [18]:
# Score the expanded queries with every model and export each ranking.
# write_file opens "<model>.1000" in append mode, so re-running this cell adds
# the lines to the existing files again.
scoring_sign = Scoring(sign_queries)
# Doing all the vs ops and writing to a file
for op in vs_ops:
    scoring_sign.processVS(op)
    scoring_sign.write_file( op, op + '.1000' )
    
# Doing all the lm ops and writing to a file (l=0.95 is used by jmercer only)
for op in lm_ops:
    scoring_sign.processLM(op, 0.95)
    scoring_sign.write_file( op, op + '.1000' )

In [19]:
# Normalize the scores of all five models in place (one max - min value is
# printed per model), then append the combined ranking to meta.1000.
scoring_sign.normalize()
scoring_sign.write_meta_file("meta.1000")

41.96705542298496
125.01114282815296
9.430759340738948
352.6979925378687
339.40148457844003


In [41]:
# class Query:
#     def __init__(self, id, es_helper, tokens=None):
#         self.es_helper = es_helper
#         self.id =id
#         self.tokens = tokens
#         self.update_docs()
#         self.calc_query_tf()

In [44]:
# NOTE(review): `q` is not defined in any cell shown in this notebook --
# presumably a leftover scratch Query object; confirm or remove this cell.
q.update_docs()

In [15]:
es_helper = ESHelpers()
# Load all the queries: one "<number><separator><text>" query per line
queries = []
with open('./IR_DATA/AP_DATA/query_test.txt', 'r') as f:
    for line in f:
        reg_match = re.match(r'^(\d+).(.*)', line)
        # Blank or malformed lines do not match; skip them instead of
        # failing on reg_match.group().
        if reg_match is None:
            continue
        tokens = es_helper.get_tokens(reg_match.group(2).strip())
        queries.append(Query(reg_match.group(1).strip(), es_helper, tokens))

In [12]:
# Inspect the tokens of the first loaded query. Raises IndexError when
# `queries` is empty, i.e. when no query was loaded.
queries[0].tokens

IndexError: list index out of range

In [50]:
# # View all final tokens after analyzing, delete list and threshold on DF    
# with open('./IR_DATA/AP_DATA/query_test.txt', 'r') as f:
#     for line in f:
#         reg_match = re.match(r'^(\d+).(.*)', line)
#         tokens = es_helper.get_tokens(reg_match.group(2).strip())
#         print(tokens)

[]
