In [26]:
import re
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from tqdm import tqdm
import string
import math

In [2]:
def read_queries_from_file(file_path):
    """Parse a query file into a ``{number: text}`` dictionary.

    A line is accepted only when it begins with digits followed by a
    period and at least one whitespace character; everything after that
    whitespace (with surrounding whitespace trimmed) is the query text.
    Lines that do not fit this shape are ignored. If the same number
    occurs more than once, the last occurrence wins.

    Args:
        file_path: path of the text file to read.

    Returns:
        dict mapping the query number (kept as a string) to its text.
    """
    numbered_line = re.compile(r'(\d+)\.\s+(.*)')
    parsed = {}

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            found = numbered_line.match(raw_line)
            if found is None:
                # Not a "<number>. <text>" line; nothing to record
                continue
            query_id, text = found.groups()
            parsed[query_id] = text.strip()

    return parsed


In [3]:
# Load the query descriptions into a dict keyed by query number (as a string)
file_path = 'query_desc.51-100.short.txt'
queries = read_queries_from_file(file_path)
# Bare expression so the notebook displays the parsed dict
queries

{'85': 'Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide.',
 '59': 'Document will report a type of weather event which has directly caused at least one fatality in some location.',
 '56': 'Document will include a prediction about the prime lending rate, or will report an actual prime rate move.',
 '71': 'Document will report incursions by land, air, or water into the border area of one country by military forces of a second country or a guerrilla group based in a second country.',
 '64': 'Document will report an event or result of politically motivated hostage-taking.',
 '62': "Document will report a military coup d'etat, either attempted or successful, in any country.",
 '93': 'Document must describe or identify supporters of the National Rifle Association (NRA), or its assets.',
 '99': 'Document will identify a development in the Iran-Contra Affair.',
 '58': 'Document will predict or anticipate a r

In [4]:
from elasticsearch7 import Elasticsearch
from elasticsearch7.client import IndicesClient
import time

# Name of the Elasticsearch index that ES_search queries
index_name = "ap89_data5"
# Client for the node at localhost:9200
es = Elasticsearch("http://localhost:9200")
# Index-level client built on the same connection (not used in the cells visible here)
ic = IndicesClient(es)
# Connectivity check: prints the result of pinging the node
print(es.ping())


True


In [5]:
def ES_search(query):
    """Run a ``match`` query against the ``content`` field.

    Uses the module-level ``es`` client and ``index_name``, and asks for
    ``size=1000`` results.

    Args:
        query: query text placed in the ``match`` clause.

    Returns:
        Whatever ``es.search`` returns for the request.
    """
    match_clause = {'match': {'content': query}}
    return es.search(index=index_name, query=match_clause, size=1000)
    

In [12]:
# Directory prefix prepended to every exported result file name
output_path = './'

def output_txt(filename, string):
    """Append text to ``<output_path><filename>.txt``.

    The file is opened in append mode, so repeated calls accumulate
    content and earlier content is never truncated.

    Args:
        filename: base name of the target file, without the ``.txt`` suffix.
        string: text to append (inside this function the name shadows the
            ``string`` module).
    """
    target = output_path + filename + '.txt'
    with open(target, 'a') as out_file:
        out_file.write(string)

In [9]:
def preprocess_queries(query):
    """Reduce a query to a space-separated string of unique stemmed terms.

    Steps: flatten newlines and remove apostrophes, tokenise, split
    hyphenated tokens into their parts, drop stopwords and tokens made up
    only of punctuation, stem with the Porter stemmer, then de-duplicate.

    Args:
        query: raw query text.

    Returns:
        The unique stems joined by single spaces, in order of first
        appearance, so the same query always yields the same string.
    """
    stemmer = PorterStemmer()

    # The stoplist is read on every call; one stopword per line
    with open('stoplist.txt', 'r') as file:
        stopwords = set(file.read().strip().splitlines())

    # Newlines become spaces; apostrophes and the two-character sequence "" are removed
    query = query.replace('\n', ' ').replace("'", '').replace('""', '')

    candidates = []
    for token in word_tokenize(query):
        # A token without a hyphen splits into itself, so one path covers both cases
        candidates.extend(token.split('-'))

    # Keep tokens that are not stopwords and that still have characters left
    # once surrounding punctuation is stripped (this also drops empty strings)
    kept = [
        word for word in candidates
        if word.lower() not in stopwords and word.strip(string.punctuation)
    ]
    stems = [stemmer.stem(word) for word in kept]

    # dict.fromkeys de-duplicates while preserving first-seen order; a plain
    # set() would make the term order vary from run to run
    unique_stems = list(dict.fromkeys(stems))

    return ' '.join(unique_stems)

In [20]:
def process_res(queries, output_name='query_result_es_builtinfeedback'):
    """Search every query and append the ranked hits to a results file.

    One line is written per hit, in the form
    ``<query_num> Q0 <docno> <rank> <score> Exp``, where ``rank`` restarts
    at 1 for every query. The raw query text is passed to ``ES_search``
    unchanged; no client-side preprocessing is applied here.

    Args:
        queries: mapping of query number to query text.
        output_name: base name (without ``.txt``) handed to ``output_txt``.
            Pass a distinct name per run to keep runs in separate files,
            because ``output_txt`` appends rather than overwrites.
    """
    for num, query in queries.items():
        result = ES_search(query)
        lines = [
            f"{num} Q0 {hit['_id']} {rank} {hit['_score']} Exp\n"
            for rank, hit in enumerate(result['hits']['hits'], start=1)
        ]
        # One write per query instead of reopening the file for every hit;
        # nothing is written (and no file is created) for a query with no hits
        if lines:
            output_txt(output_name, ''.join(lines))


In [13]:
# Time the run over the original (unexpanded) queries
start = time.time()
process_res(queries)
end = time.time()
# Floor division reports whole minutes only, so a run shorter than a minute prints 0.0
print('Completed, total run time in min', (end-start)//60)

Completed, total run time in min 0.0


## Task 4

**Average Precision:** 0.1925

In [14]:
# NOTE(review): this evaluates query_result_es_builtin2.txt, whereas process_res writes to
# query_result_es_builtinfeedback.txt — confirm which run produced the file scored here
!perl trec_eval.pl qrels.adhoc.51-100.AP89.txt query_result_es_builtin2.txt 

Error due to 23

Queryid (Num):       23
Total number of documents over all queries
    Retrieved:    20079
    Relevant:      1826
    Rel_ret:       1060
Interpolated Recall - Precision Averages:
    at 0.00       0.4763
    at 0.10       0.3897
    at 0.20       0.3144
    at 0.30       0.2627
    at 0.40       0.2192
    at 0.50       0.1845
    at 0.60       0.1400
    at 0.70       0.1162
    at 0.80       0.0892
    at 0.90       0.0609
    at 1.00       0.0226
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.1925
Precision:
  At    5 docs:   0.3217
  At   10 docs:   0.3087
  At   15 docs:   0.2957
  At   20 docs:   0.2848
  At   30 docs:   0.2725
  At  100 docs:   0.1887
  At  200 docs:   0.1341
  At  500 docs:   0.0772
  At 1000 docs:   0.0461
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2303


## Task 5

### Pseudo-relevance Feedback

In [15]:
# Expansion terms to append to selected queries, keyed by query number (string).
# The terms appear to be in stemmed form already (e.g. 'polic', 'novemb');
# how they were chosen is not shown in this notebook.
additional_words = {
    '85': ['prison', 'polic', 'prosector', 'parti'],  # NOTE(review): 'prosector' may be a typo for 'prosecutor' — confirm
    '59': ['tornado', 'thunderstorm', 'offic'],
    '56': ['bank', 'lower'],
    '71': ['rubber', 'lebanon', 'govern'],
    '64': ['north', 'novemb', 'speaker'],
    '54': ['spacecraft', 'titan', 'backlog'],
    '95': ['student', 'bank', 'chip'],
    '60': ['risk', 'govern', 'tax'],
    '80': ['campaign', 'parti'],
    '91': ['pentagon', 'program', 'pakistan', 'spend']
}

In [16]:
# Work on a copy so the expanded queries do not replace the entries in `queries`
queries1 = queries.copy()

In [17]:
# Append each query's expansion terms to its original text.
# The base text is read from `queries` rather than `queries1`, so re-running
# this cell yields the same result instead of appending the terms again.
for querynum, words in additional_words.items():
    addW = " ".join(words)
    queries1[querynum] = queries[querynum] + " " + addW

In [18]:
# Bare expression so the notebook displays the expanded queries
queries1

{'85': 'Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide. prison polic prosector parti',
 '59': 'Document will report a type of weather event which has directly caused at least one fatality in some location. tornado thunderstorm offic',
 '56': 'Document will include a prediction about the prime lending rate, or will report an actual prime rate move. bank lower',
 '71': 'Document will report incursions by land, air, or water into the border area of one country by military forces of a second country or a guerrilla group based in a second country. rubber lebanon govern',
 '64': 'Document will report an event or result of politically motivated hostage-taking. north novemb speaker',
 '62': "Document will report a military coup d'etat, either attempted or successful, in any country.",
 '93': 'Document must describe or identify supporters of the National Rifle Association (NRA), or its assets.',
 '99': 'Doc

In [21]:
# Run the expanded queries.
# NOTE(review): results are appended to query_result_es_builtinfeedback.txt, the same file the
# earlier process_res(queries) run wrote to — confirm the file was cleared between runs
process_res(queries1)

In [22]:
# NOTE(review): the file name here ('builtinFeedback') differs in case from the name
# process_res writes ('builtinfeedback'); it resolves to the same file only on a
# case-insensitive filesystem — confirm the scored file is the intended run
!perl trec_eval.pl qrels.adhoc.51-100.AP89.txt query_result_es_builtinFeedback.txt 

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    22079
    Relevant:      1832
    Rel_ret:       1059
Interpolated Recall - Precision Averages:
    at 0.00       0.4843
    at 0.10       0.3790
    at 0.20       0.2949
    at 0.30       0.2536
    at 0.40       0.2229
    at 0.50       0.1826
    at 0.60       0.1388
    at 0.70       0.1159
    at 0.80       0.0898
    at 0.90       0.0641
    at 1.00       0.0211
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.1883
Precision:
  At    5 docs:   0.2880
  At   10 docs:   0.2840
  At   15 docs:   0.2667
  At   20 docs:   0.2680
  At   30 docs:   0.2693
  At  100 docs:   0.1908
  At  200 docs:   0.1302
  At  500 docs:   0.0709
  At 1000 docs:   0.0424
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2265


**New Average Precision:** 0.1883