In [2]:
from elasticsearch import Elasticsearch, helpers
import os
import re
import json
import logging
import operator
import time
import math

In [3]:
def connect_elasticsearch(host='localhost', port=9200):
    """Create an Elasticsearch client and report whether the node answers a ping.

    Args:
        host: Hostname of the Elasticsearch node. Defaults to 'localhost'.
        port: Port of the Elasticsearch node. Defaults to 9200.

    Returns:
        The Elasticsearch client. It is returned even when the ping fails
        (only a message is printed), so callers that need a live connection
        must check ``ping()`` themselves.
    """
    _es = Elasticsearch([{'host': host, 'port': port}])
    if _es.ping():
        print('Yay Connected')
    else:
        print('Awww it could not connect!')
    return _es
# When this code runs as the main module, configure logging so that only
# records at ERROR level or above are emitted.
if __name__ == '__main__':
  logging.basicConfig(level=logging.ERROR)

In [4]:
# Module-level client used by the cells and functions below.
es = connect_elasticsearch()

Yay Connected


## Load Query File

In [5]:
query_file_path = "D:\\CS 6200\\AP_DATA\\query_desc.51-100.short.txt"
# Maps query number (kept as a string) -> query text without the leading number.
queries = dict()
# A query line is "<number><separators><text>". Anchoring on the leading number
# strips only that prefix; a character-set lstrip would also eat digits, dots or
# dashes that belong to the start of the query text itself.
query_line_pattern = re.compile(r'(\d+)[\s.\-]*(.*)')
with open(query_file_path) as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parsed = query_line_pattern.match(line)
        if parsed is None:
            # No leading query number: report and skip rather than failing with
            # an AttributeError or picking up a number from inside the text.
            logging.warning("Skipping malformed query line: %r", line)
            continue
        queries[parsed.group(1)] = parsed.group(2)

# Retrieval Models

## Average Document Length

In [6]:
# Request term vectors (with term statistics) for one document's "text" field.
# The "field_statistics" part of this response is read by the cells below to
# derive avg_doc_length and total_documents.
statistics = es.termvectors(index= "assignment1", id = "AP890110-0294",term_statistics = True, fields = "text")

In [7]:
# Average document length: sum_ttf divided by doc_count, both taken from the
# "text" field statistics of the term-vectors response.
text_field_stats = statistics["term_vectors"]["text"]["field_statistics"]
avg_doc_length = text_field_stats["sum_ttf"] / text_field_stats["doc_count"]

In [8]:
# Vocabulary size V used in the denominator of the Laplace-smoothed probability.
# NOTE(review): hard-coded; presumably the number of unique terms in the indexed
# "text" field — confirm, and re-derive it if the index changes.
vocabSize = 192963

In [9]:
# Document count taken from the "text" field statistics of the term-vectors response.
total_documents = statistics["term_vectors"]["text"]["field_statistics"]["doc_count"]

In [10]:
def write_to_file(file, query_id, doclist, dict):
    """Append one ranked result line per document to a results file.

    Each line has the form ``<query_id> Q0 <doc_id>  <rank>  <score> Exp``,
    where the rank starts at 1 and follows the order of ``doclist``.

    Args:
        file: Path of the results file; it is opened in append mode.
        query_id: Identifier written at the start of every line.
        doclist: Document ids (strings) in the order they should be ranked.
        dict: Mapping from document id to score. The name shadows the builtin
            inside this function; it is kept so existing callers still work.

    Raises:
        KeyError: If an id in ``doclist`` has no entry in ``dict``.
    """
    # The context manager closes the file even when a lookup or write fails.
    with open(file, "a") as outF:
        for rank, doc_id in enumerate(doclist, start=1):
            outF.write(str(query_id) + " Q0 " + doc_id + "  " + str(rank) + "  " + str(dict[doc_id]) + " Exp\n")

In [11]:
def createDocList(query):
    """Return the ids of the documents whose "text" field matches ``query``.

    Sends a ``match`` query to the "assignment1" index through the
    module-level ``es`` client, requesting up to 10000 hits.

    Args:
        query: Query text to match against the "text" field.

    Returns:
        List of ``_id`` values, in the order the hits were returned.
    """
    res = es.search(index="assignment1", body={"query": {"match": {"text": query}}, "size":10000})
    # Iterate over the hits actually returned. hits.total.value is a separate
    # count and is not guaranteed to equal the length of the returned page, so
    # indexing the page up to that count can raise IndexError.
    return [hit["_id"] for hit in res["hits"]["hits"]]

## Unigram LM with Laplace Smoothing

In [12]:
def unigram_laplace(query_words, document_list, es_object, index, vocab_size=None):
    """Score documents with a unigram language model using Laplace smoothing.

    For each document, every query word present in the document adds
    ``log((tf + 1) / (doc_length + V))`` to the score, where ``doc_length`` is
    the sum of the term frequencies of the document's "text" field. Every
    query word absent from the document adds a fixed penalty of -9999.

    NOTE(review): textbook Laplace smoothing would give an absent word
    ``log(1 / (doc_length + V))`` instead of a fixed penalty; the penalty is
    kept as-is because it may be intentional — confirm.

    Args:
        query_words: Query tokens, expected to be in the same analyzed form as
            the indexed terms.
        document_list: Document ids whose term vectors are fetched in a single
            ``mtermvectors`` request.
        es_object: Elasticsearch client used for the request.
        index: Name of the index to query.
        vocab_size: Vocabulary size V. When omitted, the module-level
            ``vocabSize`` is used.

    Returns:
        Dict mapping document id to score. Documents for which no term vectors
        are returned are left out.
    """
    if not document_list:
        # Nothing to score; skip the request entirely.
        return dict()
    if vocab_size is None:
        vocab_size = vocabSize
    missing_term_penalty = -9999

    response = es_object.mtermvectors(index=index, ids=document_list, term_statistics=False, fields="text")
    lmlaplace_scores = dict()
    for doc in response['docs']:
        # Entries may carry an empty "term_vectors" or none at all.
        term_vectors = doc.get("term_vectors")
        if not term_vectors:
            continue
        words = term_vectors["text"]["terms"]
        doc_length = sum(info["term_freq"] for info in words.values())

        lmlaplace_score = 0
        for word in query_words:
            # Dict membership is a hash lookup, unlike scanning a list of keys.
            if word in words:
                p_laplace = (words[word]["term_freq"] + 1) / (doc_length + vocab_size)
                lmlaplace_score += math.log(p_laplace)
            else:
                lmlaplace_score += missing_term_penalty

        lmlaplace_scores[doc["_id"]] = lmlaplace_score
    return lmlaplace_scores
    

In [13]:
def run_query_and_write_results(query, query_id, doclist, es_object, index, chunk_size, filename):
    """Score the candidate documents for one query and append the top 1000 to a file.

    The query is tokenized through the Elasticsearch analyze API (standard
    tokenizer, lowercase and stemmer filters), the documents are scored with
    ``unigram_laplace`` in chunks of ``chunk_size`` ids, and the 1000 best
    scores are handed to ``write_to_file`` in descending order.

    Args:
        query: Raw query text.
        query_id: Identifier written with every result line.
        doclist: Candidate document ids to score.
        es_object: Elasticsearch client used for analysis and scoring.
        index: Name of the index holding the documents.
        chunk_size: Number of document ids sent per scoring request.
        filename: Path of the results file (appended to).
    """
    # Tokenize query with built in stemmer
    # NOTE(review): assumes the index analyzes "text" the same way, so that
    # these tokens line up with the indexed terms — confirm.
    analyzerTokens = es_object.indices.analyze(body = {"tokenizer" : "standard", "filter" : ["lowercase", "stemmer"], "text" : query})
    query_words = [token["token"] for token in analyzerTokens["tokens"]]

    # Collect every chunk into one dict. Results must be kept even when the
    # whole doclist fits in a single chunk.
    scores = dict()
    for start in range(0, len(doclist), chunk_size):
        chunk = doclist[start:start + chunk_size]
        scores.update(unigram_laplace(query_words, chunk, es_object, index))

    # Sort scores in descending order (the sort is stable, so ties keep
    # their insertion order).
    ranked_ids = sorted(scores, key=scores.get, reverse=True)

    # Write scores to File
    write_to_file(filename, query_id, ranked_ids[:1000], scores)

In [14]:
# Query numbers, in the order they were inserted into `queries`.
query_ids = list(queries)

In [15]:
results_path = "D:\\CS 6200\\AP_DATA\\results_laplacelm.txt"
# write_to_file only appends, so start from an empty file; otherwise every
# re-run of this cell would add a duplicate ranking for each query.
with open(results_path, "w"):
    pass
for query_id in query_ids:
    query_text = queries[query_id]
    run_query_and_write_results(query_text, query_id, createDocList(query_text), es, "assignment1", 250, results_path)