In [11]:
# Search engine SE1(----) with ElasticSearch
# The file paths assume Ubuntu 20.04; the code has only been tested on that OS.

from __future__ import unicode_literals

import glob
import os

from elasticsearch import Elasticsearch, helpers


# Glob patterns and data-file locations, all relative to the working directory.
directory = 'Poems/*'                 # documents to index
query_directory = 'Queries/*'         # one query per file
stop_words_path = 'Stopwords/Stopwords.txt'
relevant_docs_path = 'RelevanceAssesment/RelevanceAssesment.txt'

# Load the stop-word list: every whitespace-separated token found in the file.
# Newlines are whitespace too, so splitting the whole text gives the same
# tokens, in the same order, as splitting line by line.
with open(stop_words_path, 'r', encoding="utf8") as s_file:
    stopWords = s_file.read().split()


def connect():
    """Return an Elasticsearch client for localhost:9200.

    The node is pinged first. If the ping fails, a message is printed and
    ``None`` is returned instead of a client, so callers that need a live
    connection must check the result.
    """
    client = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if not client.ping():
        print("Unable to connect to elasticsearch!")
        return None
    return client


def get_files_in_dir(directory):
    """Return the paths matching the glob pattern *directory* as a new list.

    The pattern is expanded with ``recursive=True``, so ``**`` segments (if
    present) match nested directories. No match yields an empty list.
    """
    return list(glob.iglob(directory, recursive=True))


def create_index(index_name):
    """Ask Elasticsearch to create the index *index_name*.

    A client with default connection settings is built for the call. HTTP
    400 and 404 responses are passed in ``ignore`` and therefore do not
    raise -- presumably so that creating an index that already exists is
    harmless (confirm against the client version in use).
    """
    client = Elasticsearch()
    client.indices.create(index=index_name, ignore=[400, 404])


def delete_index(index_names):
    """Delete each index named in *index_names* and print a line per index.

    HTTP 400 and 404 responses are ignored, and the confirmation line is
    printed unconditionally, i.e. also when the delete request was ignored.
    """
    client = connect()
    for index_name in index_names:
        client.indices.delete(index=index_name, ignore=[400, 404])
        message = "The index:'" + index_name + "' is deleted"
        print(message)


def get_text_file_data(file):
    """Read the UTF-8 text file at path *file* and return its lines.

    Each list element is one line with its line ending preserved; an empty
    file yields an empty list.
    """
    with open(file, encoding="utf8") as handle:
        return list(handle)


def text_files_parser(index_name="persian_poems_se1"):
    """Yield one bulk-index action per file matched by ``directory``.

    Each action is a dict with ``_index``, ``_id`` and ``_source`` keys, the
    shape consumed by ``helpers.bulk`` in ``index_files``. Ids are assigned
    1, 2, 3, ... in the order ``get_files_in_dir`` returns the paths.

    Args:
        index_name: Target index; defaults to the index used throughout
            this module.

    Yields:
        dict: ``{"_index": ..., "_id": ..., "_source": {"file_name": ...,
        "data": ...}}`` where ``data`` is the full text of the file.
    """
    file_names = get_files_in_dir(directory)
    if not file_names:
        # Nothing to index, so no index is created either.
        return

    # Create the index once up front rather than once per document; repeating
    # the request for every file only adds a round-trip per document.
    create_index(index_name)

    for doc_id, path in enumerate(file_names, start=1):
        yield {
            "_index": index_name,
            "_id": doc_id,
            "_source": {
                # basename() rather than a fixed-width slice, so the stored
                # name stays correct if the directory prefix ever changes.
                "file_name": os.path.basename(path),
                "data": "".join(get_text_file_data(path)),
            },
        }


def index_files():
    """Bulk-index every document produced by ``text_files_parser``.

    Prints the value returned by ``helpers.bulk`` and its type. Any
    exception raised while connecting, generating actions or indexing is
    caught and reported as an ``ERROR:`` line instead of propagating.
    """
    try:
        client = connect()
        actions = text_files_parser()
        resp = helpers.bulk(client, actions)
        print("Response:", resp)
        print("Response type:", type(resp))
    except Exception as err:
        print("ERROR:", err)


def _mean(values):
    """Return the arithmetic mean of *values*, or 0 when there are none."""
    values = list(values)  # accept any iterable, not only sequences
    if not values:
        return 0
    return sum(values) / len(values)


def total_results(ap, precisions, recalls, f_measures):
    """Print the mean precision, recall, F-measure and MAP over all queries.

    Args:
        ap: Per-query average-precision values; their mean is printed as MAP.
        precisions: Per-query precision values.
        recalls: Per-query recall values.
        f_measures: Per-query F-measure values.

    An empty input is reported as 0 for that metric.
    """
    # Compute everything before printing so a bad input prints nothing at all.
    total_map = _mean(ap)
    total_precision = _mean(precisions)
    total_recall = _mean(recalls)
    total_fmeasure = _mean(f_measures)

    print("Precision:", total_precision)
    print("Recall:", total_recall)
    print("F-measure:", total_fmeasure)
    print("MAP:", total_map)


def _load_relevance_judgments(path):
    """Map each line of the relevance file to the tokens of the line after it.

    Keys are lines with the trailing newline removed; values are the
    whitespace-split tokens of the following line. When a line occurs more
    than once, the first occurrence wins. The last line of the file has no
    following line and therefore never becomes a key.
    """
    with open(path, "r", encoding="utf8") as rel_file:
        lines = rel_file.readlines()

    judgments = {}
    for header, following in zip(lines, lines[1:]):
        # setdefault keeps the first occurrence, like a top-down scan would.
        judgments.setdefault(header.rstrip('\n'), following.split())
    return judgments


def _score_ranking(results, rel_docs):
    """Return (precision, recall, f_measure, average_precision) for a ranking.

    *results* is the ranked list of retrieved names, *rel_docs* the names
    judged relevant. Average precision is normalised by the number of
    relevant documents that were actually retrieved.
    """
    relevant = set(rel_docs)
    true_pos = len(set(results) & relevant)
    false_pos = len(results) - true_pos
    false_neg = len(rel_docs) - true_pos

    if true_pos == 0:
        # No overlap (including empty results or empty judgments): every
        # ratio is either 0 or undefined, so all three are reported as 0.
        precision = recall = f_measure = 0
    else:
        precision = true_pos / (true_pos + false_pos)
        recall = true_pos / (true_pos + false_neg)
        f_measure = 2 * precision * recall / (precision + recall)

    relevant_seen = 0
    precision_sum = 0
    for rank, doc in enumerate(results, start=1):
        if doc in relevant:
            relevant_seen += 1
            precision_sum += relevant_seen / rank
    average_precision = precision_sum / relevant_seen if relevant_seen else 0

    return precision, recall, f_measure, average_precision


def search_query():
    """Run every query file against the index and print evaluation metrics.

    For each file matched by ``query_directory`` the file's text is sent as
    a ``match`` query on the ``data`` field of ``persian_poems_se1``, and
    the hit count and the ``file_name`` of each returned hit are printed.
    The returned names are then scored against the relevance judgments in
    ``relevant_docs_path`` (a line equal to the query's file name, followed
    by a line of relevant document names). Queries without such an entry
    are left out of the averages. Finally the mean precision, recall,
    F-measure and MAP are printed via ``total_results``.
    """
    es = connect()
    query_files = get_files_in_dir(query_directory)

    # Parse the relevance file once instead of re-reading it for every query.
    judgments = _load_relevance_judgments(relevant_docs_path) if query_files else {}

    ap = []
    precisions = []
    recalls = []
    f_measures = []

    for path in query_files:
        # basename() rather than a fixed-width slice of the directory prefix.
        file_name = os.path.basename(path)
        query_text = "".join(get_text_file_data(path))
        search_param = {'query': {'match': {'data': query_text}}}

        print('Results for ' + file_name)
        res = es.search(index="persian_poems_se1", body=search_param)

        print("Got %d Hits:" % res['hits']['total']['value'])
        results = []
        for hit in res['hits']['hits']:
            hit_name = str(hit['_source']['file_name'])
            print(hit_name)
            results.append(hit_name)

        rel_docs = judgments.get(file_name)
        if rel_docs is None:
            # No judgment for this query: it does not contribute to the means.
            continue

        precision, recall, f_measure, average_precision = _score_ranking(results, rel_docs)
        ap.append(average_precision)
        precisions.append(precision)
        recalls.append(recall)
        f_measures.append(f_measure)

    total_results(ap, precisions, recalls, f_measures)


# All needed function calls are as follows:

# //index files
# index_files()

# //search poems and print the total Precision,Recall,F_Measure and MAP of the SE
# search_query()

# //delete index
# delete_index(['persian_poems_se1'])


The index:'persian_poems_se1' is deleted
