# Task 3 : Query execution

In [1]:
from elasticsearch7 import Elasticsearch, helpers
from elasticsearch7.client import IndicesClient
import json
from nltk.stem import PorterStemmer
import string
import math
from collections import defaultdict

index_name = "ap89_data4"
es = Elasticsearch("http://localhost:9200")
ic = IndicesClient(es)
ps = PorterStemmer()

print(es.ping())

True


In [2]:
def stem_text(text, ps):
    stemmed = ' '.join([ps.stem(word) for word in text.split()])
    return stemmed

In [3]:
sw_path = "./IR_data/AP_DATA/stoplist.txt"

with open(sw_path) as file:
    stopwords = file.read().splitlines()

additional_stopwords = [
    "document", "discuss", "system", "identifi", "actual", "ongo", "ha", "ani", "describ",
    "motiv", "directli","successful", "detat", "area", "result", "type", "anticip","develop",
    "make", "tent", "financi", "specifi", "advanc", "someth", "standard", "fatal",
    "perpetr", "unsubstanti", "organ", "platform", "aid", "senior", "basi", "side", 
    "countri", "undesir", "good", "instanc", "method", "bell", "role", "exist",
    "effort", "support", "machin", "controversi", "forc", "applic",
    "determin", "second", "preliminari", "perform", "high", "tech", "predict", "insul", "instal", "regul", "level",
    "country", "dual-us", "militari", "alleg", "polit", "candid", "reserv", "contract", "fail", "state", 
    "concern", "manufactur", "current", "product", "equip", "non", "take", "fine"
]



In [4]:
def process_content(text):
    text = text.translate(str.maketrans("", "", ",.()'\""))
    text = text.replace("-", " ")
    words = text.split()
    processed_words = []
    for word in words:
        processed_words.append(word.lower() if word.lower() not in stopwords else '')
    processed_text = ' '.join(processed_words)
    return processed_text


In [5]:
def remove_additional_stopwords(text):
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in additional_stopwords])
    return text

In [6]:
def query_preprocessing(query):
    query = process_content(query)
    query = stem_text(query, ps)
    query = remove_additional_stopwords(query)
    return query

In [7]:
def ES_Search(query):
    res_es_search = es.search(index = index_name, query = {'match':{'content':query}}, size=1000)
    return res_es_search

In [8]:
def process_res(query_num, res_es_search, output):
    for rank, hit in enumerate(res_es_search['hits']['hits'], start=1):
        docno = hit['_id']
        score = hit['_score']
        output_one_line = str(query_num) + ' Q0 ' + str(docno) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n"
        output.write(output_one_line)

In [87]:
query_dict = {}
query_file_path = "./IR_data/AP_DATA/query_desc.51-100.short.txt"

with open(query_file_path, 'r') as query_desc_file:
    queries = query_desc_file.readlines()

for query_desc in queries:
    query_desc = query_desc.strip()

    if not query_desc:
        continue

    query_num, query_text = query_desc.split('.', 1)
    query_num = int(query_num.strip())
    query_text = query_preprocessing(query_text.strip())

    query_dict[query_num] = query_text

print(query_dict)

{85: 'corrupt public offici', 59: 'weather least locat', 56: 'prime lend rate prime rate', 71: 'incurs border guerrilla', 64: 'hostag', 62: 'coup attempt', 93: 'nation rifl associ nra', 99: 'iran contra affair', 58: 'rail strike rail strike', 77: 'poach wildlif', 54: 'agreement launch commerci satellit', 87: 'offic institut', 94: 'crime comput', 100: 'communist transfer technolog nation', 89: 'invest opec downstream', 61: 'israel iran contra affair', 95: 'comput crime', 68: 'safeti worker diamet fiber', 57: 'mci', 97: 'fiber optic technolog', 98: 'fiber optic', 60: 'salari incent pay pay', 80: '1988 presidenti', 63: 'translat', 91: 'weapon'}


# ES built-in

In [10]:
def run_queries_and_write_results_esbuiltin(output_file, query_dict):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(query_num)
            print(query_text)
            res_es_search = ES_Search(query_text)

            process_res(query_num, res_es_search, output)

In [11]:
run_queries_and_write_results_esbuiltin("./Deliverables/output_ES_builtin.txt", query_dict)

print("Queries executed successfully.")

85
corrupt public offici
59
weather least locat
56
prime lend rate prime rate
71
incurs border guerrilla
64
hostag
62
coup attempt
93
nation rifl associ nra
99
iran contra affair
58
rail strike rail strike
77
poach wildlif
54
agreement launch commerci satellit
87
offic institut
94
crime comput
100
communist transfer technolog nation
89
invest opec downstream
61
israel iran contra affair
95
comput crime
68
safeti worker diamet fiber
57
mci
97
fiber optic technolog
98
fiber optic
60
salari incent pay pay
80
1988 presidenti
63
translat
91
weapon
Queries executed successfully.


In [12]:
from elasticsearch7 import Elasticsearch
from elasticsearch7.helpers import scan

index_name = "ap89_data4"
es = Elasticsearch("http://localhost:9200")

def retrieve_all_document_ids():
    document_ids = []

    for hit in scan(
        es,
        query={"query": {"match_all": {}}},
        index=index_name,
        doc_type="_doc",
        _source=False, 
    ):
        doc_id = hit["_id"]
        document_ids.append(doc_id)
    print("doc_id done")
    return document_ids

def retrieve_document_term_vectors(document_ids):
    document_term_vectors = {}

    for doc_id in document_ids:
        res = es.termvectors(index=index_name, id=doc_id, fields=['content'], term_statistics=True)

        if "content" in res["term_vectors"]:
            document_term_vectors[doc_id] = res["term_vectors"]["content"]["terms"]
        else:
            document_term_vectors[doc_id] = {}
            
    return document_term_vectors

all_document_ids = retrieve_all_document_ids()
print(all_document_ids[0])
all_document_term_vectors = retrieve_document_term_vectors(all_document_ids)
print(all_document_term_vectors["AP890102-0056"])




doc_id done
AP890102-0055
{'360': {'doc_freq': 243, 'ttf': 251, 'term_freq': 1, 'tokens': [{'position': 45, 'start_offset': 262, 'end_offset': 265}]}, 'accept': {'doc_freq': 6679, 'ttf': 8467, 'term_freq': 1, 'tokens': [{'position': 90, 'start_offset': 510, 'end_offset': 516}]}, 'albert': {'doc_freq': 739, 'ttf': 831, 'term_freq': 1, 'tokens': [{'position': 1, 'start_offset': 5, 'end_offset': 11}]}, 'ani': {'doc_freq': 21331, 'ttf': 29945, 'term_freq': 1, 'tokens': [{'position': 91, 'start_offset': 517, 'end_offset': 520}]}, 'away': {'doc_freq': 8704, 'ttf': 10392, 'term_freq': 1, 'tokens': [{'position': 66, 'start_offset': 370, 'end_offset': 374}]}, 'bank': {'doc_freq': 8193, 'ttf': 21865, 'term_freq': 1, 'tokens': [{'position': 88, 'start_offset': 494, 'end_offset': 498}]}, 'befor': {'doc_freq': 25059, 'ttf': 34200, 'term_freq': 1, 'tokens': [{'position': 38, 'start_offset': 220, 'end_offset': 225}]}, 'box': {'doc_freq': 1585, 'ttf': 2241, 'term_freq': 1, 'tokens': [{'position': 52, 

In [13]:
document_lengths = {}
doc_length_sum = 0

for doc_id, document_term_vector in all_document_term_vectors.items():
    length_doc = 0
    
    for term, term_info in document_term_vector.items():
        length_doc += 1 * term_info.get('term_freq', 0)

    document_lengths[doc_id] = length_doc
    doc_length_sum += length_doc

In [14]:
print(document_lengths["AP890103-0053"])
print(len(all_document_term_vectors))
print(doc_length_sum)
avg_doc_length = doc_length_sum / len(all_document_term_vectors)
print(avg_doc_length)

90
84678
21577217
254.81491060251778


# Okapi TF

In [16]:
def calculate_okapi_tf(tf, length_doc, avg_doc_length):
    return tf / (tf + 0.5 + 1.5 * (length_doc / avg_doc_length))

def calculate_query_scores(query_terms, all_document_term_vectors):
    query_scores = {}

    for term in query_terms:
        term_scores = {}

        for doc_id, term_info in all_document_term_vectors.items():
            if term in term_info:
                tf = term_info[term]["term_freq"]
                length_doc = document_lengths[doc_id]

                okapi_tf_score = calculate_okapi_tf(tf, length_doc, avg_doc_length)

                term_scores[doc_id] = okapi_tf_score

        query_scores[term] = term_scores

    return query_scores

def calculate_matching_score(query_terms, document_id, document_term_vector, avg_doc_length):
    score = 0

    for term in query_terms:
        if term in document_term_vector:
            tf = document_term_vector[term]["term_freq"]
            length_doc = document_lengths[document_id]
            okapi_tf_score = calculate_okapi_tf(tf, length_doc, avg_doc_length)
            score += okapi_tf_score

    return score

def run_queries_and_write_results_okapitf(output_file, query_dict):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            document_scores = {}

            for doc_id, document_term_vector in all_document_term_vectors.items():
                score = calculate_matching_score(query_terms, doc_id, document_term_vector, avg_doc_length)
                document_scores[doc_id] = score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(doc_id) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")

output_file_path = "./Deliverables/output_okapiTF_Score.txt"
run_queries_and_write_results_okapitf(output_file_path, query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: translat
Processing Query 91: weapon
Q

# TF-IDF

In [18]:
def calculate_okapi_tf_tfidf(tf, length_doc, avg_doc_length):
    return tf / (tf + 0.5 + 1.5 * (length_doc / avg_doc_length))

def calculate_idf(dfw, total_documents):
    return math.log(total_documents / dfw)

def calculate_tfidf(tf, length_doc, avg_doc_length, dfw, total_documents):
    okapi_tf_score = calculate_okapi_tf_tfidf(tf, length_doc, avg_doc_length)
    idf_score = calculate_idf(dfw, total_documents)
    return okapi_tf_score * idf_score

def run_queries_and_write_results_tfidf(output_file, query_dict):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")
            query_terms = query_text.split()
            total_documents = len(all_document_ids)
            document_scores = {}

            for doc_id, document_term_vector in all_document_term_vectors.items():
                score = 0

                for term in query_terms:
                    if term in document_term_vector:
                        tf = document_term_vector[term]["term_freq"]
                        length_doc = document_lengths[doc_id]
                        dfw = document_term_vector[term]["doc_freq"]
                        
                        tfidf_score = calculate_tfidf(tf, length_doc, avg_doc_length, dfw, total_documents)
                        score += tfidf_score

                document_scores[doc_id] = score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(doc_id) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")

output_file_path = "./Deliverables/output_TFIDF_Score.txt"
run_queries_and_write_results_tfidf(output_file_path, query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: translat
Processing Query 91: weapon
Q

# Okapi BM25

In [19]:
def calculate_okapi_bm25(tfwd, dfw, D, length_doc, avg_doc_length, tfwq, k1=1.2, b=0.75, k2=100):
    term2_num = (tfwd * (k1 + 1))
    term2_den = (tfwd + k1 * ((1 - b) + (b * (length_doc / avg_doc_length)))) 
    term3 = (tfwq + (k2 * tfwq)) / (tfwq + k2)
    term1 = math.log((D + 0.5) / (dfw + 0.5))
    return term1 * (term2_num / term2_den) * term3

def run_queries_and_write_results_bm25(output_file, query_dict):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            D = len(all_document_ids)  
            document_scores = defaultdict(float)

            for doc_id, document_term_vector in all_document_term_vectors.items():
                score = 0

                for term in query_terms:
                    if term in document_term_vector:
                        tfwd = document_term_vector[term]["term_freq"]
                        length_doc = document_lengths[doc_id]
                        dfw = document_term_vector[term]["doc_freq"]
                        tfwq = query_terms.count(term)

                        bm25_score = calculate_okapi_bm25(tfwd, dfw, D, length_doc, avg_doc_length, tfwq)
                        score += bm25_score

                document_scores[doc_id] = score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(doc_id) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")
                
output_file_path = "./Deliverables/output_BM25_Score.txt"
run_queries_and_write_results_bm25(output_file_path, query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: translat
Processing Query 91: weapon
Q

# Unigram LM with Laplace smoothing

In [20]:
unique_terms_set = set()
count = 0

for document_id, term_info in all_document_term_vectors.items():
    terms = term_info.keys()
    unique_terms_set.update(terms)

unique_terms_list = list(unique_terms_set)

print(f"Number of unique terms across documents: {len(unique_terms_list)}")


Number of unique terms across documents: 262010


In [21]:
total_unique_terms = len(unique_terms_list)

In [22]:
def calculate_laplace_score(tfwd, length_doc, total_unique_terms):
    return math.log((tfwd + 1) / (length_doc + total_unique_terms))

def run_queries_and_write_results_laplace(output_file, query_dict, total_unique_terms):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            document_scores = {}

            for doc_id, document_term_vector in all_document_term_vectors.items():
                score = 1000

                for term in query_terms:
                    if term in document_term_vector:
                        tf = document_term_vector[term]["term_freq"]
                        length_doc = document_lengths[doc_id]
                        
                        laplace_score = calculate_laplace_score(tf, length_doc, total_unique_terms)
                        score += laplace_score
                    else:
                        score -= 1000

                document_scores[doc_id] = score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(doc_id) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")

output_file_path = "./Deliverables/output_UnigramLaplace_Score.txt"
run_queries_and_write_results_laplace(output_file_path, query_dict, total_unique_terms)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: translat
Processing Query 91: weapon
Q

# Unigram LM with Jelinek-Mercer smoothing

In [25]:
def calculate_term_freq_in_corpus(all_document_term_vectors, all_query_terms):
    term_freq_in_corpus = {}

    for document_term_vector in all_document_term_vectors.values():
        for target_term in all_query_terms:
            term_freq_in_doc = document_term_vector.get(target_term, {"term_freq": 0})["term_freq"]
            term_freq_in_corpus[target_term] = term_freq_in_corpus.get(target_term, 0) + term_freq_in_doc
    print(term_freq_in_corpus)
    return term_freq_in_corpus

def get_query_terms(query_dict):
    all_query_terms=[]
    for query_num, query_text in query_dict.items():
        print(f"Processing Query {query_num}: {query_text}")
        all_query_terms += query_text.split()
            
    term_freq_in_corpus = calculate_term_freq_in_corpus(all_document_term_vectors, all_query_terms)
    print(term_freq_in_corpus)
    return term_freq_in_corpus

In [26]:
term_freq_in_corpus = get_query_terms(query_dict)

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: translat
Processing Query 91: weapon
{

In [27]:
def calculate_jm_score(tfwd, length_doc, lambda_value, tf_corpus, doc_length_sum):
    jm_term = lambda_value * (tfwd / length_doc) + (1 - lambda_value) * (tf_corpus / doc_length_sum)
    return math.log(jm_term)

def run_queries_and_write_results_jm(output_file, query_dict, lambda_value):     
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            document_scores = {}

            for doc_id, document_term_vector in all_document_term_vectors.items():
                score = 1000

                for term in query_terms:
                    if term in document_term_vector:
                        tf = document_term_vector[term]["term_freq"]
                        length_doc = document_lengths[doc_id]
                        jm_score = calculate_jm_score(tf, length_doc, lambda_value, term_freq_in_corpus[term], doc_length_sum)
                        score += jm_score
                    else:
                        score -= 1000

                document_scores[doc_id] = score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(doc_id) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")
                
print(term_freq_in_corpus)
output_file_path_jm = "./Deliverables/output_UnigramJM_Score.txt"
lambda_value = 0.8  
run_queries_and_write_results_jm(output_file_path_jm, query_dict, lambda_value)
print("Queries executed successfully.")

{'corrupt': 1856, 'public': 23607, 'offici': 55742, 'weather': 5234, 'least': 15009, 'locat': 2813, 'prime': 17904, 'lend': 1236, 'rate': 41518, 'incurs': 96, 'border': 7365, 'guerrilla': 8117, 'hostag': 3163, 'coup': 2796, 'attempt': 9396, 'nation': 119498, 'rifl': 1444, 'associ': 13529, 'nra': 178, 'iran': 12368, 'contra': 6786, 'affair': 8720, 'rail': 2634, 'strike': 24404, 'poach': 95, 'wildlif': 1371, 'agreement': 13303, 'launch': 5849, 'commerci': 5105, 'satellit': 1922, 'offic': 27581, 'institut': 7384, 'crime': 10302, 'comput': 12376, 'communist': 14499, 'transfer': 2882, 'technolog': 6838, 'invest': 7630, 'opec': 990, 'downstream': 110, 'israel': 10380, 'safeti': 5622, 'worker': 13771, 'diamet': 162, 'fiber': 1173, 'mci': 277, 'optic': 304, 'salari': 2054, 'incent': 1431, 'pay': 31956, '1988': 16708, 'presidenti': 6111, 'translat': 925, 'weapon': 8321}
Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime

# Task 5: Pseudo-relevance Feedback

In [28]:
from collections import Counter

file_path = "./Deliverables/output_TFIDF_Score.txt"

k = 5

with open(file_path, 'r') as file:
    ids_to_calculate = [line.split()[2] for line in file if line.startswith("95")][:k]
    file.seek(0)
    ids_to_calculate += [line.split()[2] for line in file if line.startswith("87")][:k]
    file.seek(0)
    ids_to_calculate += [line.split()[2] for line in file if line.startswith("68")][:k]
    file.seek(0)
    ids_to_calculate += [line.split()[2] for line in file if line.startswith("97")][:k]
    file.seek(0)
    ids_to_calculate += [line.split()[2] for line in file if line.startswith("77")][:k]
    


def calculate_term_frequency(document_term_vector, top_n):
    if not document_term_vector:
        return []

    term_frequency = {term: info["term_freq"] for term, info in document_term_vector.items()}

    term_counter = Counter(term_frequency)

    top_terms = term_counter.most_common(top_n)

    return top_terms

for i, doc_id in enumerate(ids_to_calculate, start=1):
    if doc_id in all_document_term_vectors:
        term_vector = all_document_term_vectors[doc_id]
        top_terms = calculate_term_frequency(term_vector, top_n=k)
        print(f"Top 5 term frequencies for Document {doc_id}: {top_terms}")
    else:
        print(f"Document {doc_id} not found.")

    if i % k == 0 and i != len(ids_to_calculate):
        print('-' * 120)

Top 5 term frequencies for Document AP890123-0235: [('comput', 27), ('crime', 6), ('say', 6), ('system', 5), ('ani', 4)]
Top 5 term frequencies for Document AP891113-0008: [('comput', 26), ('feder', 9), ('intent', 6), ('viru', 6), ('virus', 6)]
Top 5 term frequencies for Document AP890123-0236: [('comput', 19), ('hacker', 10), ('say', 9), ('ha', 7), ('dont', 5)]
Top 5 term frequencies for Document AP890212-0033: [('fbi', 8), ('comput', 7), ('ncic', 6), ('report', 6), ('system', 6)]
Top 5 term frequencies for Document AP890520-0024: [('comput', 8), ('bill', 5), ('virus', 4), ('make', 3), ('becaus', 2)]
------------------------------------------------------------------------------------------------------------------------
Top 5 term frequencies for Document AP890414-0195: [('institut', 11), ('research', 9), ('associ', 4), ('govern', 4), ('panel', 4)]
Top 5 term frequencies for Document AP891208-0171: [('feder', 14), ('citi', 8), ('new', 8), ('thrift', 8), ('bank', 5)]
Top 5 term frequenc

In [29]:
pr_query_dict = {}
def pseudo_relevance_feedback_queries(query_dict):
    for query_num, query_text in query_dict.items():
        if query_num == 95:
            query_text += " feder fbi viru ncic hacker"
        elif query_num == 87:
            query_text += " feder billion"
        elif query_num == 97:
            query_text += " satellit"
        elif query_num == 68:
            query_text += " asbesto"
        elif query_num == 77:
            query_text += " wa"
            
        if not query_text:
            continue

        print(query_num)
        print(query_text)
        pr_query_dict[query_num] = query_text
    return pr_query_dict

In [30]:
pr_query_dict = pseudo_relevance_feedback_queries(query_dict)

85
corrupt public offici
59
weather least locat
56
prime lend rate prime rate
71
incurs border guerrilla
64
hostag
62
coup attempt
93
nation rifl associ nra
99
iran contra affair
58
rail strike rail strike
77
poach wildlif wa
54
agreement launch commerci satellit
87
offic institut feder billion
94
crime comput
100
communist transfer technolog nation
89
invest opec downstream
61
israel iran contra affair
95
comput crime feder fbi viru ncic hacker
68
safeti worker diamet fiber asbesto
57
mci
97
fiber optic technolog satellit
98
fiber optic
60
salari incent pay pay
80
1988 presidenti
63
translat
91
weapon


# Pseudo-relevance Feedback ES Builtin

In [31]:
run_queries_and_write_results_esbuiltin("./Deliverables/output_ES_builtin_PR.txt", pr_query_dict)

print("Queries executed successfully.")

85
corrupt public offici
59
weather least locat
56
prime lend rate prime rate
71
incurs border guerrilla
64
hostag
62
coup attempt
93
nation rifl associ nra
99
iran contra affair
58
rail strike rail strike
77
poach wildlif wa
54
agreement launch commerci satellit
87
offic institut feder billion
94
crime comput
100
communist transfer technolog nation
89
invest opec downstream
61
israel iran contra affair
95
comput crime feder fbi viru ncic hacker
68
safeti worker diamet fiber asbesto
57
mci
97
fiber optic technolog satellit
98
fiber optic
60
salari incent pay pay
80
1988 presidenti
63
translat
91
weapon
Queries executed successfully.


# Pseudo-relevance Feedback Okapi TF

In [32]:
output_file_path = "./Deliverables/output_okapiTF_Score_PR.txt"
run_queries_and_write_results_okapitf(output_file_path, pr_query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif wa
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut feder billion
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime feder fbi viru ncic hacker
Processing Query 68: safeti worker diamet fiber asbesto
Processing Query 57: mci
Processing Query 97: fiber optic technolog satellit
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 president

# Pseudo-relevance Feedback Okapi TF-IDF

In [33]:
output_file_path = "./Deliverables/output_TFIDF_Score_PR.txt"
run_queries_and_write_results_tfidf(output_file_path, pr_query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif wa
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut feder billion
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime feder fbi viru ncic hacker
Processing Query 68: safeti worker diamet fiber asbesto
Processing Query 57: mci
Processing Query 97: fiber optic technolog satellit
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 president

# Pseudo-relevance Feedback BM25

In [34]:
output_file_path = "./Deliverables/output_BM25_Score_PR.txt"
run_queries_and_write_results_bm25(output_file_path, pr_query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif wa
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut feder billion
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime feder fbi viru ncic hacker
Processing Query 68: safeti worker diamet fiber asbesto
Processing Query 57: mci
Processing Query 97: fiber optic technolog satellit
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 president

# Pseudo-relevance Feedback Unigram LM with Laplace smoothing

In [36]:
output_file_path = "./Deliverables/output_UnigramLaplace_Score_PR.txt"
run_queries_and_write_results_laplace(output_file_path, pr_query_dict, total_unique_terms)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif wa
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut feder billion
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime feder fbi viru ncic hacker
Processing Query 68: safeti worker diamet fiber asbesto
Processing Query 57: mci
Processing Query 97: fiber optic technolog satellit
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 president

# Pseudo-relevance Feedback Unigram LM with Jelinek-Mercer smoothing

In [37]:
def get_query_terms_pr(pr_query_dict):
    all_query_terms=[]
    for query_num, query_text in pr_query_dict.items():
        print(f"Processing Query {query_num}: {query_text}")
        all_query_terms += query_text.split()
            
    term_freq_in_corpus = calculate_term_freq_in_corpus(all_document_term_vectors, all_query_terms)
    print(term_freq_in_corpus)
    return term_freq_in_corpus

term_freq_in_corpus = get_query_terms_pr(pr_query_dict)

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif wa
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut feder billion
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime feder fbi viru ncic hacker
Processing Query 68: safeti worker diamet fiber asbesto
Processing Query 57: mci
Processing Query 97: fiber optic technolog satellit
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 president

In [38]:
output_file_path_jm = "./Deliverables/output_UnigramJM_Score_PR.txt"
lambda_value = 0.8  
run_queries_and_write_results_jm(output_file_path_jm, pr_query_dict, lambda_value)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif wa
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut feder billion
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime feder fbi viru ncic hacker
Processing Query 68: safeti worker diamet fiber asbesto
Processing Query 57: mci
Processing Query 97: fiber optic technolog satellit
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 president

# Pseudo-relevance Feedback using ElasticSearch aggs "significant terms"

In [171]:
from collections import defaultdict

query_significant_terms_score = defaultdict(list)

# modify the queries with following query number
modify_query = [59, 64, 77, 87, 97, 98, 63, 91]

for query_id, query_data in query_dict.items():
    query_string = query_data
    current_query_terms = query_string.split()
    result = []
    existing_terms = set()
    
    for term in current_query_terms:
        es_query = {
            "query": {
                "terms": {"content": [term]}
            },
            "aggregations": {
                "significantCrimeTypes": {
                    "significant_terms": {
                        "field": "content"
                    }
                }
            },
            "size": 0
        }

        response = es.search(index=index_name, body=es_query)

        result.extend(
            bucket
            for bucket in response["aggregations"]["significantCrimeTypes"]["buckets"]
            if bucket["key"] not in current_query_terms and bucket["key"] not in existing_terms
        )
        existing_terms.update(bucket["key"] for bucket in result)

    sorted_terms_by_score = sorted(result, key=lambda x: x["score"], reverse=True)

    query_significant_terms_score[query_id] = [(term_info["key"], term_info["score"]) for term_info in sorted_terms_by_score[:5]]

pr_es_query_terms_score = {}

for query_id, significant_terms_score in query_significant_terms_score.items():
    pr_es_query_terms_score[query_id] = [term[0] for term in significant_terms_score]

for query_id, significant_terms in pr_es_query_terms_score.items():
    print(f"Query {query_id}: {significant_terms}")

pr_es_query_dict = {}

for query_id, significant_terms_score in query_significant_terms_score.items():
    query_string = query_dict[query_id]
    if query_id not in modify_query:
        pr_es_query_dict[query_id] = query_string
    else:
        significant_terms_strings = [term[0] for term in significant_terms_score]
        merged_terms = query_string.split() + significant_terms_strings
        merged_terms_string = " ".join(merged_terms)
        pr_es_query_dict[query_id] = merged_terms_string

for query_id, query_string in pr_es_query_dict.items():
    print(f"Query {query_id}: {query_string}")


  response = es.search(index=index_name, body=es_query)


Query 85: ['corruption', 'racket', 'bribe', 'anticorrupt', 'fraud']
Query 59: ['forecast', 'temperatur', 'rain', 'wind', 'snow']
Query 56: ['minist', 'loan', 'bank', 'loans', '2122396200']
Query 71: ['swapo', 'incursion', 'thornberry', 'africanl', 'namibia']
Query 64: ['hostages', 'lebanon', 'proiranian', 'shiit', 'kidnap']
Query 62: ['gen', 'stroessner', 'militari', 'oust', 'pinochet']
Query 93: ['lapierre', 'ar15', 'assaultstyl', 'largecapac', 'guncontrol']
Query 99: ['nicaraguan', 'contras', 'nicaragua', 'sandinista', 'hondura']
Query 58: ['railroad', 'railway', 'amtrak', 'freight', 'locomot']
Query 77: ['poacher', 'tusk', 'poaching', 'antipoach', 'poachers']
Query 54: ['satellite', 'satellites', 'payload', 'orbit', 'nasa']
Query 87: ['research', 'institute', 'thrift', 'institutions', 'deposit']
Query 94: ['softwar', 'computers', 'computer', 'ibm', 'electron']
Query 100: ['parti', 'technology', 'reform', 'soviet', 'politburo']
Query 89: ['intermediate', '13nation', 'barrel', 'petrol

# Pseudo-relevance Feedback ES Builtin with ES significant terms

In [172]:
run_queries_and_write_results_esbuiltin("./Deliverables/output_ES_builtin_PR_ES.txt", pr_es_query_dict)

print("Queries executed successfully.")

85
corrupt public offici
59
weather least locat forecast temperatur rain wind snow
56
prime lend rate prime rate
71
incurs border guerrilla
64
hostag hostages lebanon proiranian shiit kidnap
62
coup attempt
93
nation rifl associ nra
99
iran contra affair
58
rail strike rail strike
77
poach wildlif poacher tusk poaching antipoach poachers
54
agreement launch commerci satellit
87
offic institut research institute thrift institutions deposit
94
crime comput
100
communist transfer technolog nation
89
invest opec downstream
61
israel iran contra affair
95
comput crime
68
safeti worker diamet fiber
57
mci
97
fiber optic technolog hairthin spacemad resolidifi fibers telescope
98
fiber optic hairthin spacemad resolidifi fibers telescope
60
salari incent pay pay
80
1988 presidenti
63
translat english languages translation text rozovskaya
91
weapon weapons nuclear arm missil missiles
Queries executed successfully.


# Pseudo-relevance Feedback Okapi TF with ES significant terms

In [173]:
output_file_path = "./Deliverables/output_okapiTF_Score_PR_ES.txt"
run_queries_and_write_results_okapitf(output_file_path, pr_es_query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat forecast temperatur rain wind snow
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag hostages lebanon proiranian shiit kidnap
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif poacher tusk poaching antipoach poachers
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut research institute thrift institutions deposit
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog hairthin

# Pseudo-relevance Feedback TF-IDF with ES significant terms

In [174]:
output_file_path = "./Deliverables/output_TFIDF_Score_PR_ES.txt"
run_queries_and_write_results_tfidf(output_file_path, pr_es_query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat forecast temperatur rain wind snow
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag hostages lebanon proiranian shiit kidnap
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif poacher tusk poaching antipoach poachers
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut research institute thrift institutions deposit
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog hairthin

# Pseudo-relevance Feedback BM25 with ES significant terms

In [175]:
output_file_path = "./Deliverables/output_BM25_Score_PR_ES.txt"
run_queries_and_write_results_bm25(output_file_path, pr_es_query_dict)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat forecast temperatur rain wind snow
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag hostages lebanon proiranian shiit kidnap
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif poacher tusk poaching antipoach poachers
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut research institute thrift institutions deposit
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog hairthin

# Pseudo-relevance Feedback Unigram Laplace with ES significant terms

In [176]:
output_file_path = "./Deliverables/output_UnigramLaplace_Score_PR_ES.txt"
run_queries_and_write_results_laplace(output_file_path, pr_es_query_dict, total_unique_terms)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat forecast temperatur rain wind snow
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag hostages lebanon proiranian shiit kidnap
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif poacher tusk poaching antipoach poachers
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut research institute thrift institutions deposit
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog hairthin

# Pseudo-relevance Feedback Unigram JM with ES significant terms

In [177]:
def get_query_terms_pr(pr_es_query_dict):
    all_query_terms=[]
    for query_num, query_text in pr_es_query_dict.items():
        print(f"Processing Query {query_num}: {query_text}")
        all_query_terms += query_text.split()
            
    term_freq_in_corpus = calculate_term_freq_in_corpus(all_document_term_vectors, all_query_terms)
    print(term_freq_in_corpus)
    return term_freq_in_corpus

term_freq_in_corpus = get_query_terms_pr(pr_es_query_dict)

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat forecast temperatur rain wind snow
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag hostages lebanon proiranian shiit kidnap
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif poacher tusk poaching antipoach poachers
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut research institute thrift institutions deposit
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog hairthin

In [178]:
output_file_path_jm = "./Deliverables/output_UnigramJM_Score_PR_ES.txt"
lambda_value = 0.8  
run_queries_and_write_results_jm(output_file_path_jm, pr_es_query_dict, lambda_value)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat forecast temperatur rain wind snow
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag hostages lebanon proiranian shiit kidnap
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif poacher tusk poaching antipoach poachers
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut research institute thrift institutions deposit
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci
Processing Query 97: fiber optic technolog hairthin