In [None]:
from elasticsearch import Elasticsearch, helpers
import os
import re
import json
import logging
import operator
import time
import math

In [None]:
def connect_elasticsearch():
    """Create a client for the Elasticsearch node at localhost:9200.

    Pings the node once and prints whether the ping succeeded.

    Returns:
        The Elasticsearch client. It is returned even when the ping fails,
        so callers receive a client object in both cases.
    """
    client = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    status = 'Yay Connected' if client.ping() else 'Awww it could not connect!'
    print(status)
    return client
# When this code runs as the main module, configure logging so that only
# messages at ERROR level and above are emitted.
if __name__ == '__main__':
  logging.basicConfig(level=logging.ERROR)

In [None]:
# Module-level Elasticsearch client shared by the cells below.
es = connect_elasticsearch()

## Load Query File

In [None]:
# Build `queries`: query number (string of digits) -> query text.
# Blank lines are ignored. The query number is the first run of digits on the
# line; the text is what remains after stripping leading digits, dots, dashes
# and spaces, and trailing whitespace.
query_file_path = "D:\\CS 6200\\AP_DATA\\query_desc.51-100.short.txt"
queries = {}
with open(query_file_path) as query_file:
    for raw_line in query_file:
        if not raw_line.strip():
            continue
        query_no = re.search(r'\d+', raw_line).group()
        queries[query_no] = raw_line.lstrip('0123456789.- ').rstrip()

# Retrieval Models

## Average Document Length

In [None]:
# Fetch the term vector (with term statistics) of the "text" field for one
# specific document.
# NOTE(review): nothing in the visible cells reads this value before
# `statistics` is reassigned in the corpus-frequency cell; presumably it is
# kept for manual inspection. No average document length is computed here
# despite the section title - confirm whether that step is missing.
statistics = es.termvectors(index= "assignment1", id = "AP890110-0294",term_statistics = True, fields = "text")

In [None]:
# Denominator of the background (collection) probability in unigram_jm:
# cf(term) / vocabSize.
# NOTE(review): the name suggests the number of unique terms, whereas
# Jelinek-Mercer smoothing conventionally divides the corpus frequency by the
# total number of tokens in the collection - confirm which quantity 192963 is.
vocabSize = 192963

In [None]:
def write_to_file(file, query_id, doclist, dict):
    """Append one ranked result line per document to a results file.

    Each line has the form ``<query_id> Q0 <doc_id>  <rank>  <score> Exp``
    where ranks start at 1 and follow the order of ``doclist``.

    Args:
        file: Path of the results file. It is opened in append mode, so
            existing content is kept.
        query_id: Identifier written at the start of every line.
        doclist: Document ids (strings) in rank order.
        dict: Mapping from document id to its score. The name shadows the
            builtin but is kept so existing callers keep working.

    Raises:
        KeyError: If a document id in ``doclist`` has no entry in ``dict``.
    """
    scores = dict
    # The context manager guarantees the handle is closed (and buffered lines
    # flushed) even if a score lookup or a write fails part-way through.
    with open(file, "a") as out_file:
        for rank, doc_id in enumerate(doclist, start=1):
            out_file.write(
                str(query_id) + " Q0 " + doc_id + "  " + str(rank) + "  " + str(scores[doc_id]) + " Exp\n"
            )

In [None]:
def createDocList(query):
    """Return the ids of the documents whose "text" field matches ``query``.

    Runs a ``match`` query against the "assignment1" index through the
    module-level ``es`` client, requesting at most 10000 hits.

    Args:
        query: Query text to match.

    Returns:
        List of document ids in the order Elasticsearch returned them.
    """
    res = es.search(index="assignment1", body={"query": {"match": {"text": query}}, "size": 10000})
    # Iterate over the hits that were actually returned. The reported total
    # can be larger than the number of returned hits, so indexing the hit list
    # up to the total would run past its end.
    return [hit["_id"] for hit in res["hits"]["hits"]]

### Precompute Corpus Frequency

In [None]:
# Build `cf_dict`: analyzed query term -> total term frequency ("ttf") of that
# term in the "text" field, for every distinct term across all queries.
cf_dict = dict()
for query in queries.values():
    # Analyze the query with the analyzer of the "text" field so the terms
    # have the same form as the terms stored in the index.
    analyzerTokens = es.indices.analyze(index = 'assignment1', body = {"field": "text", "text" : query})
    query_words = [token["token"] for token in analyzerTokens["tokens"]]

    for word in query_words:
        if word in cf_dict:
            continue
        # Look the analyzed term up directly with a `term` query. Pairing the
        # i-th analyzed token with the i-th whitespace-split word is unsafe:
        # the analyzer may drop or split words, which shifts the positions and
        # makes the search return a document that lacks the term.
        res = es.search(index="assignment1", body={"query": {"term": {"text": word}}, "size": 1})
        hits = res["hits"]["hits"]
        if hits:
            document_id = hits[0]["_id"]
            print(document_id+" "+word+" \n")
            # Any document containing the term reports the term's statistics,
            # so one hit is enough to read the "ttf" value.
            statistics = es.termvectors(index = "assignment1", id = document_id, term_statistics = True, fields="text")
            cf_dict[word] = statistics['term_vectors']['text']['terms'][word]['ttf']

In [None]:
# Show the collected corpus frequencies (analyzed term -> ttf) for inspection.
print(cf_dict)

## Unigram LM with Jelinek-Mercer Smoothing

In [None]:
def unigram_jm(query_words, document_list, es_object, index, lam):
    """Score documents with a unigram language model and Jelinek-Mercer smoothing.

    For every document the score is the sum over query terms of
    ``log(lam * tf / doc_length + (1 - lam) * cf / vocabSize)`` where ``tf`` is
    the term frequency in the document, ``doc_length`` is the sum of all term
    frequencies of the document's "text" field, ``cf`` comes from the
    module-level ``cf_dict`` and ``vocabSize`` is the module-level constant.

    Args:
        query_words: Analyzed query terms.
        document_list: Ids of the documents to score.
        es_object: Elasticsearch client used for the request. When ``None``,
            the module-level ``es`` client is used instead.
        index: Name of the index that holds the documents.
        lam: Smoothing weight given to the document model (0..1).

    Returns:
        Dict mapping document id to its score. Documents without a term
        vector for "text" are left out.
    """
    if not document_list:
        # Nothing to score; avoid sending a request with an empty id list.
        return {}

    # Honour the client passed by the caller instead of silently using the global.
    client = es_object if es_object is not None else es
    statistics = client.mtermvectors(index=index, ids=document_list, term_statistics=False, fields="text")

    lmjm_scores = dict()
    for doc in statistics['docs']:
        text_vector = (doc.get("term_vectors") or {}).get("text")
        if not text_vector:
            continue
        words = text_vector["terms"]
        doc_length = sum(info["term_freq"] for info in words.values())
        if doc_length == 0:
            continue

        lmjm_score = 0
        for word in query_words:
            cf = cf_dict.get(word, 0)
            if cf <= 0:
                # No corpus statistics for this term: it would add the same
                # undefined log(0) to every document, so it is left out.
                continue
            tf = words[word]["term_freq"] if word in words else 0
            p_jm = (lam*(tf/doc_length)) + ((1-lam)*cf/vocabSize)
            # p_jm can only be 0 when lam == 1 and the term is absent from the
            # document; the log-likelihood is then minus infinity.
            lmjm_score += math.log(p_jm) if p_jm > 0 else float("-inf")

        lmjm_scores[doc["_id"]] = lmjm_score
    return lmjm_scores
    

In [None]:
def run_query_and_write_results(query, query_id, doclist, es_object, index, chunk_size, filename, lam=0.5):
    """Score the candidate documents of one query and append the top 1000 to a file.

    The query is analyzed with the analyzer of the index's "text" field, the
    candidates are scored in chunks with ``unigram_jm``, and the results are
    ranked by descending score and written with ``write_to_file``.

    Args:
        query: Raw query text.
        query_id: Identifier written with every result line.
        doclist: Ids of the candidate documents to score.
        es_object: Elasticsearch client. When ``None``, the module-level
            ``es`` client is used for the analysis request.
        index: Name of the index to analyze against and score in.
        chunk_size: Number of documents scored per ``unigram_jm`` call.
        filename: Path of the results file (appended to).
        lam: Jelinek-Mercer smoothing weight passed to ``unigram_jm``.
    """
    client = es_object if es_object is not None else es

    # Tokenize the query with the index's own analyzer so the terms match the
    # form of the indexed terms.
    analyzerTokens = client.indices.analyze(index=index, body={"field": "text", "text": query})
    query_words = [token["token"] for token in analyzerTokens["tokens"]]

    # Accumulate every chunk into one dict. Keeping the first chunk in a
    # separate dict loses all results when the candidates fit in one chunk.
    scores = {}
    for start in range(0, len(doclist), chunk_size):
        chunk = doclist[start:start + chunk_size]
        scores.update(unigram_jm(query_words, chunk, es_object, index, lam))

    # Rank by descending score; ties keep their insertion order.
    ranked_ids = sorted(scores, key=scores.get, reverse=True)

    write_to_file(filename, query_id, ranked_ids[:1000], scores)

In [None]:
# Query numbers in the order they were read from the query file.
query_ids = list(queries.keys())

In [None]:
# Run every query: fetch its candidate documents, score them in chunks of 250
# and append the ranked results to the results file.
results_path = "D:\\CS 6200\\AP_DATA\\results_jmlm.txt"
for query_id in query_ids:
    query_text = queries[query_id]
    candidates = createDocList(query_text)
    run_query_and_write_results(query_text, query_id, candidates, es, "assignment1", 250, results_path)