# Querying for decompressed stemmed

In [1]:
import json
from nltk.stem import PorterStemmer
import string
import math
from collections import defaultdict

ps = PorterStemmer()

In [2]:
def stem_text(text, ps):
    stemmed = ' '.join([ps.stem(word) for word in text.split()])
    return stemmed

In [3]:
sw_path = "../IR_data/AP_DATA/stoplist.txt"

with open(sw_path) as file:
    stopwords = file.read().splitlines()

additional_stopwords = [
    "document", "discuss", "system", "identifi", "actual", "ongo", "ha", "ani", "describ",
    "motiv", "directli","successful", "detat", "area", "result", "type", "anticip","develop",
    "make", "tent", "financi", "specifi", "advanc", "someth", "standard", "fatal",
    "perpetr", "unsubstanti", "organ", "platform", "aid", "senior", "basi", "side", 
    "countri", "undesir", "good", "instanc", "method", "role", "exist",
    "effort", "support", "controversi", "forc", "applic",
    "determin", "second", "preliminari", "perform", "high", "tech", "predict", "insul", "instal", "regul", "level",
    "country", "dual-us", "militari", "alleg", "polit", "candid", "reserv", "contract", "fail", "state", 
    "concern", "manufactur", "current", "product", "equip", "non", "take", "fine"
]


In [4]:
def process_content(text):
    text = text.translate(str.maketrans("", "", ",.()'\""))
    text = text.replace("-", " ")
    words = text.split()
    processed_words = []
    for word in words:
        processed_words.append(word.lower() if word.lower() not in stopwords else '')
    processed_text = ' '.join(processed_words)
    return processed_text


In [5]:
def remove_additional_stopwords(text):
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in additional_stopwords])
    return text

In [6]:
def query_preprocessing(query):
    query = process_content(query)
    query = stem_text(query, ps)
    query = remove_additional_stopwords(query)
    return query

In [7]:
query_dict = {}
query_file_path = "../IR_data/AP_DATA/query_desc.51-100.short.txt"

with open(query_file_path, 'r') as query_desc_file:
    queries = query_desc_file.readlines()

for query_desc in queries:
    query_desc = query_desc.strip()

    if not query_desc:
        continue

    query_num, query_text = query_desc.split('.', 1)
    query_num = int(query_num.strip())
    query_text = query_preprocessing(query_text.strip())

    query_dict[query_num] = query_text

print(query_dict)

{85: 'corrupt public offici', 59: 'weather least locat', 56: 'prime lend rate prime rate', 71: 'incurs border guerrilla', 64: 'hostag', 62: 'coup attempt', 93: 'nation rifl associ nra', 99: 'iran contra affair', 58: 'rail strike rail strike', 77: 'poach wildlif', 54: 'agreement launch commerci satellit', 87: 'offic institut', 94: 'crime comput', 100: 'communist transfer technolog nation', 89: 'invest opec downstream', 61: 'israel iran contra affair', 95: 'comput crime', 68: 'safeti worker diamet fiber', 57: 'mci bell', 97: 'fiber optic technolog', 98: 'fiber optic', 60: 'salari incent pay pay', 80: '1988 presidenti', 63: 'machin translat', 91: 'weapon'}


In [8]:
all_query_terms = []

for text in query_dict.values():
    all_query_terms.extend(text.split())

print(all_query_terms)

['corrupt', 'public', 'offici', 'weather', 'least', 'locat', 'prime', 'lend', 'rate', 'prime', 'rate', 'incurs', 'border', 'guerrilla', 'hostag', 'coup', 'attempt', 'nation', 'rifl', 'associ', 'nra', 'iran', 'contra', 'affair', 'rail', 'strike', 'rail', 'strike', 'poach', 'wildlif', 'agreement', 'launch', 'commerci', 'satellit', 'offic', 'institut', 'crime', 'comput', 'communist', 'transfer', 'technolog', 'nation', 'invest', 'opec', 'downstream', 'israel', 'iran', 'contra', 'affair', 'comput', 'crime', 'safeti', 'worker', 'diamet', 'fiber', 'mci', 'bell', 'fiber', 'optic', 'technolog', 'fiber', 'optic', 'salari', 'incent', 'pay', 'pay', '1988', 'presidenti', 'machin', 'translat', 'weapon']


# Read doc_map and store it in a dict

In [9]:
doc_map_file_path = "./stemmed_index_files/doc_map.txt"
all_document_ids = {}

with open(doc_map_file_path, 'r') as doc_map_file:
    for line in doc_map_file:
        parts = line.strip().split(': ')
        if len(parts) == 2:
            value, doc_id = parts
            all_document_ids[doc_id] = value

print("Doc_map read successfully from doc_map.txt file.")
print(len(all_document_ids))

Doc_map read successfully from doc_map.txt file.
84678


In [10]:
total_length = 0
total_docs = 0
document_lengths = {}

with open("./stemmed_index_files/tokenized_dict_stemmed_length_map.txt", "r") as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) == 2:
            doc_id, length = parts
            length = int(length)
            document_lengths[doc_id] = length
            total_length += length
            total_docs += 1

avg_doc_length = total_length / total_docs

print("Document lengths read successfully from tokenized_dict_stemmed_length_map.txt file.")
print("Total document count:", total_docs)
print("Average document length:", avg_doc_length)

Document lengths read successfully from tokenized_dict_stemmed_length_map.txt file.
Total document count: 84678
Average document length: 253.7629844823921


# Get the postings using binary search

In [11]:
def binary_search(query, catalog_file_path):
    with open(catalog_file_path, 'r') as catalog_file:
        lines = catalog_file.readlines()

        results = {}
        for term in query:
            low = 0
            high = len(lines) - 1

            while low <= high:
                mid = (low + high) // 2
                current_term, offset, size = lines[mid].strip().split()

                if current_term == term:
                    results[term] = (mid, lines[mid])
                    break
                elif current_term < term:
                    low = mid + 1
                else:
                    high = mid - 1

            if term not in results:
                results[term] = -1

    return results

def read_data_from_index_file(offset, size, index_file_path):
    data = ""
    with open(index_file_path, 'r') as index_file:
        index_file.seek(offset)
        data = index_file.read(size)
    return data

def process_query(query, catalog_file_path, index_file_path):
    term_data_dict = {}
    
    catalog_file_info_query_terms = binary_search(query, catalog_file_path)

    for term, (position, line) in catalog_file_info_query_terms.items():
        if position != -1:
            term, offset, size = line.strip().split()
            data = read_data_from_index_file(int(offset), int(size), index_file_path)
            term_data_dict[term] = data

    return term_data_dict

In [12]:
catalog_file_path = './stemmed_index_files/final_merged_catalog_file.txt'
index_file_path = './stemmed_index_files/final_merged_index_file.txt'

term_data_dict = process_query(all_query_terms, catalog_file_path, index_file_path)
print("Term data dictionary created.")

Term data dictionary created.


In [13]:
import re

def update_term_data_dict(term_data_dict):
    updated_dict = {}
    for key, value in term_data_dict.items():
        positions = {}
        matches = re.findall(r'(\d+):(\[.*?\])', value)
        for match in matches:
            doc_id = int(match[0])
            pos_str = match[1]
            positions[doc_id] = [int(pos) for pos in pos_str[1:-1].split(',')]

        df = len(positions)

        updated_dict[key] = {'positions': positions, 'doc_freq': df}
    return updated_dict

In [14]:
all_document_term_vectors = update_term_data_dict(term_data_dict)

# TF-IDF

In [15]:
def calculate_okapi_tf_tfidf(tf, length_doc, avg_doc_length):
    return tf / (tf + 0.5 + 1.5 * (length_doc / avg_doc_length))

def calculate_idf(dfw, total_documents):
    return math.log(total_documents / dfw)

def calculate_tfidf(tf, length_doc, avg_doc_length, dfw, total_documents):
    okapi_tf_score = calculate_okapi_tf_tfidf(tf, length_doc, avg_doc_length)
    idf_score = calculate_idf(dfw, total_documents)
    return okapi_tf_score * idf_score

def run_queries_and_write_results_tfidf(output_file, query_dict, all_document_term_vectors):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            document_scores = defaultdict(float)
            
            for term in query_terms:
                if term in all_document_term_vectors:
                    document_term_vector = all_document_term_vectors[term]
                    dfw = document_term_vector['doc_freq']
                    avg_doc_length = sum(document_lengths.values()) / len(document_lengths)
                    
                    for doc_id, positions in document_term_vector['positions'].items():
                        tf = len(positions)
                        length_doc = document_lengths[all_document_ids[str(doc_id)]]
                        
                        tfidf_score = calculate_tfidf(tf, length_doc, avg_doc_length, dfw, total_docs)
                        document_scores[doc_id] += tfidf_score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(all_document_ids[str(doc_id)]) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")

In [16]:
output_file_path = "./result_files/output_TFIDF_Score.txt"
run_queries_and_write_results_tfidf(output_file_path, query_dict, all_document_term_vectors)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci bell
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: machin translat
Processing Query 

# BM25

In [17]:
import math
from collections import defaultdict

def calculate_okapi_bm25(tfwd, dfw, D, length_doc, avg_doc_length, tfwq, k1=1.2, b=0.75, k2=100):
    term1 = math.log((D - dfw + 0.5) / (dfw + 0.5))
    term2_num = (tfwd * (k1 + 1))
    term2_den = (tfwd + k1 * ((1 - b) + (b * (length_doc / avg_doc_length))))
    term3 = (tfwq + (k2 * tfwq)) / (tfwq + k2)
    return term1 * (term2_num / term2_den) * term3

def run_queries_and_write_results_bm25(output_file, query_dict, all_document_term_vectors):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            D = len(all_document_ids)  
            document_scores = defaultdict(float)

            for term in query_terms:
                if term in all_document_term_vectors:
                    document_term_vector = all_document_term_vectors[term]
                    dfw = document_term_vector['doc_freq']
                    avg_doc_length = sum(document_lengths.values()) / len(document_lengths)
                    
                    for doc_id, positions in document_term_vector['positions'].items():
                        tfwd = len(positions)
                        length_doc = document_lengths[all_document_ids[str(doc_id)]]
                        tfwq = query_terms.count(term)
                        
                        bm25_score = calculate_okapi_bm25(tfwd, dfw, D, length_doc, avg_doc_length, tfwq)
                        document_scores[doc_id] += bm25_score

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(all_document_ids[str(doc_id)]) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")

In [18]:
output_file_path = "./result_files/output_BM25_Score.txt"
run_queries_and_write_results_bm25(output_file_path, query_dict, all_document_term_vectors)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci bell
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: machin translat
Processing Query 

In [19]:
catalog_file_path = './stemmed_index_files/final_merged_catalog_file.txt'

with open(catalog_file_path, 'r') as catalog_file:
        total_unique_terms = sum(1 for line in catalog_file)

print(total_unique_terms)

217712


# Unigram Laplace

In [20]:
def calculate_laplace_score(tfwd, length_doc, total_unique_terms):
    return math.log((tfwd + 1) / (length_doc + total_unique_terms))

def run_queries_and_write_results_laplace(output_file, query_dict, total_unique_terms, all_document_term_vectors):
    with open(output_file, 'w') as output:
        document_scores = defaultdict(float)

        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")
            query_terms = query_text.split()
            for doc_id in all_document_ids:
                document_scores[doc_id] = 1000
            for term in query_terms:
                for doc_id in all_document_ids:
                    if term in all_document_term_vectors and int(doc_id) in all_document_term_vectors[term]['positions']:
                        tf = len(all_document_term_vectors[term]['positions'][int(doc_id)])
                        length_doc = document_lengths[all_document_ids[str(doc_id)]]
                        laplace_score = calculate_laplace_score(tf, length_doc, total_unique_terms)
                        document_scores[doc_id] += laplace_score
                    else:
                        document_scores[doc_id] -= 1000
            
            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(all_document_ids[str(doc_id)]) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")


In [21]:
output_file_path = "./result_files/output_UnigramLaplace_Score.txt"
run_queries_and_write_results_laplace(output_file_path, query_dict, total_unique_terms, all_document_term_vectors)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci bell
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: machin translat
Processing Query 

# Querying for decompressed unstemmed

In [22]:
additional_stopwords_unstemmed = [
    "document", "discuss", "system", "identifi", "actual", "ongoing", "ha", "ani", "describ","states",
    "motiv", "directli","successful", "detat", "area", "result", "type", "anticip","development",
    "make", "tent", "financi", "specifi", "advanc", "someth", "standard", "fatality",
    "perpetr", "unsubstanti", "organ", "platform", "aid", "senior", "basi", "side", 
    "countri", "undesir", "goods", "instanc", "method", "role", "exist",
    "effort", "supporters", "controversi", "forces", "applic",
    "determin", "second", "preliminari", "perform", "high", "tech", "predict", "insul", "instal", "regul", "level",
    "country", "dual-us", "military", "allegations", "politically", "candid", "reservation", "contract", "failed", "state", 
    "concern", "manufactur", "current", "product", "equip", "non", "taking", "fine", "application", "concerns", "manufacturing",
    "products", "equipment", "performance", "candidate", "systems"
]


In [23]:
def remove_additional_stopwords_unstemmed(text):
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in additional_stopwords_unstemmed])
    return text

In [24]:
def query_preprocessing_unstemmed(query):
    query = process_content(query)
    query = remove_additional_stopwords_unstemmed(query)
    return query

In [25]:
query_dict_unstemmed = {}
query_file_path = "../IR_data/AP_DATA/query_desc.51-100.short.txt"

with open(query_file_path, 'r') as query_desc_file:
    queries = query_desc_file.readlines()

for query_desc in queries:
    query_desc = query_desc.strip()

    if not query_desc:
        continue

    query_num, query_text = query_desc.split('.', 1)
    query_num = int(query_num.strip())
    query_text = query_preprocessing_unstemmed(query_text.strip())

    query_dict_unstemmed[query_num] = query_text

print(query_dict_unstemmed)

{85: 'corrupt public officials', 59: 'weather least location', 56: 'prime lending rate prime rate', 71: 'incursions border guerrilla', 64: 'hostage', 62: 'coup attempted', 93: 'national rifle association nra', 99: 'iran contra affair', 58: 'rail strike rail strike', 77: 'poaching wildlife', 54: 'agreement launch commercial satellite', 87: 'officers institution', 94: 'crime computer', 100: 'communist transfer technologies nations', 89: 'investment opec downstream', 61: 'israel iran contra affair', 95: 'computer crime', 68: 'safety workers diameter fibers', 57: 'mci bell', 97: 'fiber optics technology', 98: 'fiber optics', 60: 'salary incentive pay pay', 80: '1988 presidential', 63: 'machine translation', 91: 'weapons'}


In [26]:
all_query_terms_unstemmed = []

for text in query_dict_unstemmed.values():
    all_query_terms_unstemmed.extend(text.split())

print(all_query_terms_unstemmed)

['corrupt', 'public', 'officials', 'weather', 'least', 'location', 'prime', 'lending', 'rate', 'prime', 'rate', 'incursions', 'border', 'guerrilla', 'hostage', 'coup', 'attempted', 'national', 'rifle', 'association', 'nra', 'iran', 'contra', 'affair', 'rail', 'strike', 'rail', 'strike', 'poaching', 'wildlife', 'agreement', 'launch', 'commercial', 'satellite', 'officers', 'institution', 'crime', 'computer', 'communist', 'transfer', 'technologies', 'nations', 'investment', 'opec', 'downstream', 'israel', 'iran', 'contra', 'affair', 'computer', 'crime', 'safety', 'workers', 'diameter', 'fibers', 'mci', 'bell', 'fiber', 'optics', 'technology', 'fiber', 'optics', 'salary', 'incentive', 'pay', 'pay', '1988', 'presidential', 'machine', 'translation', 'weapons']


In [27]:
total_length = 0
total_docs = 0
document_lengths = {}

with open("./non_stemmed_index_files/tokenized_dict_non_stemmed_length_map.txt", "r") as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) == 2:
            doc_id, length = parts
            length = int(length)
            document_lengths[doc_id] = length
            total_length += length
            total_docs += 1

avg_doc_length = total_length / total_docs

print("Document lengths read successfully from tokenized_dict_non_stemmed_length_map.txt file.")
print("Total document count:", total_docs)
print("Average document length:", avg_doc_length)

Document lengths read successfully from tokenized_dict_non_stemmed_length_map.txt file.
Total document count: 84678
Average document length: 253.7629844823921


In [28]:
catalog_file_path_unstemmed = './non_stemmed_index_files/final_merged_catalog_file.txt'
index_file_path_unstemmed = './non_stemmed_index_files/final_merged_index_file.txt'

term_data_dict_unstemmed = process_query(all_query_terms_unstemmed, catalog_file_path_unstemmed, index_file_path_unstemmed)
print("Term data dictionary created.")

Term data dictionary created.


In [29]:
all_document_term_vectors_unstemmed = update_term_data_dict(term_data_dict_unstemmed)

# Unstemmed TF-IDF

In [30]:
output_file_path = "./result_files/output_TFIDF_Unstemmed_Score.txt"
run_queries_and_write_results_tfidf(output_file_path, query_dict_unstemmed, all_document_term_vectors_unstemmed)
print("Queries executed successfully.")

Processing Query 85: corrupt public officials
Processing Query 59: weather least location
Processing Query 56: prime lending rate prime rate
Processing Query 71: incursions border guerrilla
Processing Query 64: hostage
Processing Query 62: coup attempted
Processing Query 93: national rifle association nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poaching wildlife
Processing Query 54: agreement launch commercial satellite
Processing Query 87: officers institution
Processing Query 94: crime computer
Processing Query 100: communist transfer technologies nations
Processing Query 89: investment opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: computer crime
Processing Query 68: safety workers diameter fibers
Processing Query 57: mci bell
Processing Query 97: fiber optics technology
Processing Query 98: fiber optics
Processing Query 60: salary incentive pay pay
Processing Query 80: 1988 presid

# Unstemmed BM-25

In [31]:
output_file_path = "./result_files/output_BM25_Unstemmed_Score.txt"
run_queries_and_write_results_bm25(output_file_path, query_dict_unstemmed, all_document_term_vectors_unstemmed)
print("Queries executed successfully.")

Processing Query 85: corrupt public officials
Processing Query 59: weather least location
Processing Query 56: prime lending rate prime rate
Processing Query 71: incursions border guerrilla
Processing Query 64: hostage
Processing Query 62: coup attempted
Processing Query 93: national rifle association nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poaching wildlife
Processing Query 54: agreement launch commercial satellite
Processing Query 87: officers institution
Processing Query 94: crime computer
Processing Query 100: communist transfer technologies nations
Processing Query 89: investment opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: computer crime
Processing Query 68: safety workers diameter fibers
Processing Query 57: mci bell
Processing Query 97: fiber optics technology
Processing Query 98: fiber optics
Processing Query 60: salary incentive pay pay
Processing Query 80: 1988 presid

# Unstemmed Unigram Laplace

In [32]:
catalog_file_path = './non_stemmed_index_files/final_merged_catalog_file.txt'

with open(catalog_file_path, 'r') as catalog_file:
        total_unique_terms_unstemmed = sum(1 for line in catalog_file)

print(total_unique_terms_unstemmed)

output_file_path = "./result_files/output_UnigramLaplace_Unstemmed_Score.txt"
run_queries_and_write_results_laplace(output_file_path, query_dict_unstemmed, total_unique_terms_unstemmed, all_document_term_vectors_unstemmed)
print("Queries executed successfully.")


271747
Processing Query 85: corrupt public officials
Processing Query 59: weather least location
Processing Query 56: prime lending rate prime rate
Processing Query 71: incursions border guerrilla
Processing Query 64: hostage
Processing Query 62: coup attempted
Processing Query 93: national rifle association nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poaching wildlife
Processing Query 54: agreement launch commercial satellite
Processing Query 87: officers institution
Processing Query 94: crime computer
Processing Query 100: communist transfer technologies nations
Processing Query 89: investment opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: computer crime
Processing Query 68: safety workers diameter fibers
Processing Query 57: mci bell
Processing Query 97: fiber optics technology
Processing Query 98: fiber optics
Processing Query 60: salary incentive pay pay
Processing Query 80: 1988

# Compressed index

In [33]:
import gzip

def read_data_from_index_file_compressed(offset, size, index_file_path):
    data = b""
    with gzip.open(index_file_path, 'rb') as index_file:
        index_file.seek(offset)
        data = index_file.read(size)
    return data.decode('utf-8')

def process_query_compressed(all_query_terms, catalog_file_path, index_file_path):
    term_data_dict = {}
    
    catalog_file_info_query_terms = binary_search(all_query_terms, catalog_file_path)

    for term, (position, line) in catalog_file_info_query_terms.items():
        if position != -1:
            _, offset, size = line.strip().split()
            data = read_data_from_index_file_compressed(int(offset), int(size), index_file_path)
            term_data_dict[term] = data

    return term_data_dict


catalog_file_path = './stemmed_index_files/final_merged_catalog_file.txt'
index_file_path_compressed = './stemmed_index_files/final_merged_index_file.gz'

term_data_dict_compressed = process_query_compressed(all_query_terms, catalog_file_path, index_file_path_compressed)
print("Term data dictionary created.")

Term data dictionary created.


In [34]:
all_document_term_vectors_compressed = update_term_data_dict(term_data_dict_compressed)

# BM-25 using one compressed index file

In [35]:
output_file_path = "./result_files/output_BM25_Compressed_Score.txt"
run_queries_and_write_results_bm25(output_file_path, query_dict, all_document_term_vectors_compressed)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci bell
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: machin translat
Processing Query 

# Decompression using of compressed chunks

In [36]:
import json

input_file_path = "./stemmed_index_files/compressed_chunks.json"
with open(input_file_path, 'r') as input_file:
    compressed_chunks = json.load(input_file)

print("Compressed chunks: ")
print(compressed_chunks)

Compressed chunks: 
{'chesebroughpond': {'file_path': './stemmed_index_files/compressed_index_files/chunk_1_compressed.gz', 'file_offset': 0}, 'motionless': {'file_path': './stemmed_index_files/compressed_index_files/chunk_2_compressed.gz', 'file_offset': 35891995}, 'zzzz': {'file_path': './stemmed_index_files/compressed_index_files/chunk_3_compressed.gz', 'file_offset': 111511981}}


In [37]:
def read_data_from_index_file_compressed(offset, size, index_file_path):
    data = b""
    with gzip.open(index_file_path, 'rb') as index_file:
        index_file.seek(offset)
        data = index_file.read(size)
    return data.decode('utf-8')

def process_query_compressed(all_query_terms, catalog_file_path, index_file_paths):
    term_data_dict = {}
    
    catalog_file_info_query_terms = binary_search(all_query_terms, catalog_file_path)

    for term, (position, line) in catalog_file_info_query_terms.items():
        if position != -1:
            term, offset, size = line.strip().split()
            offset = int(offset)
            size = int(size)
            
            compresssed_chunk_key = None
            for chunk_key, (file_path, file_offset) in compressed_chunks.items():
                if term <= chunk_key:
                    compresssed_chunk_key = chunk_key
                    break
            
            if compresssed_chunk_key is not None:
                index_file_path = compressed_chunks[compresssed_chunk_key]['file_path']
                data = read_data_from_index_file_compressed(offset - compressed_chunks[compresssed_chunk_key]['file_offset'], size, index_file_path)
                term_data_dict[term] = data

    return term_data_dict


catalog_file_path = './stemmed_index_files/final_merged_catalog_file.txt'
term_data_dict_compressed_chunks = process_query_compressed(all_query_terms, catalog_file_path, compressed_chunks)
print("term_data_dict_compressed created")

term_data_dict_compressed created


In [38]:
all_document_term_vectors_compressed_chunks = update_term_data_dict(term_data_dict_compressed_chunks)

# BM-25 using three compressed index chunk files

In [39]:
output_file_path = "./result_files/output_BM25_Compressed_Chunks_Score.txt"
run_queries_and_write_results_bm25(output_file_path, query_dict, all_document_term_vectors_compressed_chunks)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci bell
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: machin translat
Processing Query 

# Proximity search

In [40]:
def query_preprocessing_proximity(query):
    query = process_content(query)
    return query

In [41]:
query_dict_unmodified = {}
query_file_path = "../IR_data/AP_DATA/query_desc.51-100.short.txt"

with open(query_file_path, 'r') as query_desc_file:
    queries = query_desc_file.readlines()

for query_desc in queries:
    query_desc = query_desc.strip()

    if not query_desc:
        continue

    query_num, query_text = query_desc.split('.', 1)
    query_num = int(query_num.strip())
    query_text = query_preprocessing_proximity(' '.join(query_text.strip().split()))
    query_dict_unmodified[query_num] = query_text.strip()
print(query_dict_unmodified)

{85: 'allegations      corrupt public officials', 59: 'weather       least  fatality   location', 56: 'prime lending rate      prime rate', 71: 'incursions        border area     military forces   second    guerrilla     second', 64: 'result  politically  hostage taking', 62: 'military coup detat  attempted', 93: 'supporters   national rifle association nra', 99: 'development   iran contra affair', 58: 'rail strike    ongoing rail strike', 77: 'poaching method       wildlife', 54: 'contract   agreement       reservation  launch  commercial satellite', 87: 'current    officers   failed   institution', 94: 'crime    aid   computer', 100: 'non communist  states    transfer  high tech goods    technologies   nations', 89: 'investment   opec     downstream', 61: 'israel   iran contra affair', 95: 'computer application  crime', 68: 'concerns   safety  manufacturing    workers  fine diameter fibers      products', 57: 'mci      bell system', 97: 'fiber optics technology', 98: 'fiber optics eq

In [42]:
all_query_terms_unmodified = []

for text in query_dict_unmodified.values():
    all_query_terms_unmodified.extend(text.split())

print(all_query_terms_unmodified)

['allegations', 'corrupt', 'public', 'officials', 'weather', 'least', 'fatality', 'location', 'prime', 'lending', 'rate', 'prime', 'rate', 'incursions', 'border', 'area', 'military', 'forces', 'second', 'guerrilla', 'second', 'result', 'politically', 'hostage', 'taking', 'military', 'coup', 'detat', 'attempted', 'supporters', 'national', 'rifle', 'association', 'nra', 'development', 'iran', 'contra', 'affair', 'rail', 'strike', 'ongoing', 'rail', 'strike', 'poaching', 'method', 'wildlife', 'contract', 'agreement', 'reservation', 'launch', 'commercial', 'satellite', 'current', 'officers', 'failed', 'institution', 'crime', 'aid', 'computer', 'non', 'communist', 'states', 'transfer', 'high', 'tech', 'goods', 'technologies', 'nations', 'investment', 'opec', 'downstream', 'israel', 'iran', 'contra', 'affair', 'computer', 'application', 'crime', 'concerns', 'safety', 'manufacturing', 'workers', 'fine', 'diameter', 'fibers', 'products', 'mci', 'bell', 'system', 'fiber', 'optics', 'technology'

In [43]:
catalog_file_path_nonstemmed = './non_stemmed_index_files/final_merged_catalog_file.txt'
index_file_path_nonstemmed = './non_stemmed_index_files/final_merged_index_file.txt'

term_data_dict_unmodified = process_query(all_query_terms_unmodified, catalog_file_path_nonstemmed, index_file_path_nonstemmed)
print("Term data dictionary created.")

Term data dictionary created.


In [44]:
all_document_term_vectors_unmodified = update_term_data_dict(term_data_dict_unmodified)

In [45]:
def query_preprocessing_unmodified_stemmed(query):
    query = process_content(query)
    query = stem_text(query, ps)
    return query

In [46]:
query_dict_unmodified_stemmed = {}
query_file_path = "../IR_data/AP_DATA/query_desc.51-100.short.txt"

with open(query_file_path, 'r') as query_desc_file:
    queries = query_desc_file.readlines()

for query_desc in queries:
    query_desc = query_desc.strip()

    if not query_desc:
        continue

    query_num, query_text = query_desc.split('.', 1)
    query_num = int(query_num.strip())
    query_text = query_preprocessing_unmodified_stemmed(' '.join(query_text.strip().split()))
    query_dict_unmodified_stemmed[query_num] = query_text.strip()
print(query_dict_unmodified_stemmed)

{85: 'alleg corrupt public offici', 59: 'weather least fatal locat', 56: 'prime lend rate prime rate', 71: 'incurs border area militari forc second guerrilla second', 64: 'result polit hostag take', 62: 'militari coup detat attempt', 93: 'support nation rifl associ nra', 99: 'develop iran contra affair', 58: 'rail strike ongo rail strike', 77: 'poach method wildlif', 54: 'contract agreement reserv launch commerci satellit', 87: 'current offic fail institut', 94: 'crime aid comput', 100: 'non communist state transfer high tech good technolog nation', 89: 'invest opec downstream', 61: 'israel iran contra affair', 95: 'comput applic crime', 68: 'concern safeti manufactur worker fine diamet fiber product', 57: 'mci bell system', 97: 'fiber optic technolog', 98: 'fiber optic equip', 60: 'perform salari incent pay pay', 80: '1988 presidenti candid', 63: 'machin translat system', 91: 'weapon system'}


In [47]:
all_query_terms_unmodified_stemmed = []

for text in query_dict_unmodified_stemmed.values():
    all_query_terms_unmodified_stemmed.extend(text.split())

print(all_query_terms_unmodified_stemmed)

['alleg', 'corrupt', 'public', 'offici', 'weather', 'least', 'fatal', 'locat', 'prime', 'lend', 'rate', 'prime', 'rate', 'incurs', 'border', 'area', 'militari', 'forc', 'second', 'guerrilla', 'second', 'result', 'polit', 'hostag', 'take', 'militari', 'coup', 'detat', 'attempt', 'support', 'nation', 'rifl', 'associ', 'nra', 'develop', 'iran', 'contra', 'affair', 'rail', 'strike', 'ongo', 'rail', 'strike', 'poach', 'method', 'wildlif', 'contract', 'agreement', 'reserv', 'launch', 'commerci', 'satellit', 'current', 'offic', 'fail', 'institut', 'crime', 'aid', 'comput', 'non', 'communist', 'state', 'transfer', 'high', 'tech', 'good', 'technolog', 'nation', 'invest', 'opec', 'downstream', 'israel', 'iran', 'contra', 'affair', 'comput', 'applic', 'crime', 'concern', 'safeti', 'manufactur', 'worker', 'fine', 'diamet', 'fiber', 'product', 'mci', 'bell', 'system', 'fiber', 'optic', 'technolog', 'fiber', 'optic', 'equip', 'perform', 'salari', 'incent', 'pay', 'pay', '1988', 'presidenti', 'candid

In [48]:
catalog_file_path = './stemmed_index_files/final_merged_catalog_file.txt'
index_file_path = './stemmed_index_files/final_merged_index_file.txt'

term_data_dict_unmodified_stemmed = process_query(all_query_terms_unmodified_stemmed, catalog_file_path, index_file_path)
print("Term data dictionary created.")

Term data dictionary created.


In [49]:
all_document_term_vectors_unmodified_unstemmed = update_term_data_dict(term_data_dict_unmodified_stemmed)

# Get min-span using sliding window algorithm

In [50]:
def calculate_proximity_score(min_span, ngram_length):
    if ngram_length == 0:
        return 0
    else:
        return math.pow(0.8, (min_span - ngram_length) / ngram_length)

def get_min_span(query_proximity_dict):
    if len(query_proximity_dict) == 0:
        return -1
    else:
        tuples = []
        for index, (key, postings) in enumerate(query_proximity_dict.items()):
            for val in postings:
                tuples.append((val, index, postings.index(val)))

        tuples.sort()

        window = [postings[0] for postings in query_proximity_dict.values()]
        left = 0
        min_span = float('inf')

        for right in range(len(tuples)):
            val, list_idx, _ = tuples[right]
            window[list_idx] = val
            
            while left < len(window) and window[left] != min(window):
                left += 1

            if max(window) - min(window) < min_span:
                min_span = max(window) - min(window)
        return min_span

# Proximity search on top of BM-25

In [51]:
import math
from collections import defaultdict

def calculate_okapi_bm25(tfwd, dfw, D, length_doc, avg_doc_length, tfwq, k1=1.2, b=0.75, k2=100):
    term1 = math.log((D - dfw + 0.5) / (dfw + 0.5))
    term2_num = (tfwd * (k1 + 1))
    term2_den = (tfwd + k1 * ((1 - b) + (b * (length_doc / avg_doc_length))))
    term3 = (tfwq + (k2 * tfwq)) / (tfwq + k2)
    return term1 * (term2_num / term2_den) * term3

def run_queries_and_write_results_bm25_PS(output_file, query_dict, total_unique_terms, all_document_term_vectors):
    with open(output_file, 'w') as output:
        for query_num, query_text in query_dict.items():
            print(f"Processing Query {query_num}: {query_text}")

            query_terms = query_text.split()
            D = len(all_document_ids)  
            document_scores = defaultdict(float)

            for term in query_terms:
                if term in all_document_term_vectors:
                    document_term_vector = all_document_term_vectors[term]
                    dfw = document_term_vector['doc_freq']
                    avg_doc_length = sum(document_lengths.values()) / len(document_lengths)
                    
                    for doc_id, positions in document_term_vector['positions'].items():
                        tfwd = len(positions)
                        length_doc = document_lengths[all_document_ids[str(doc_id)]]
                        tfwq = query_terms.count(term)
                        
                        bm25_score = calculate_okapi_bm25(tfwd, dfw, D, length_doc, avg_doc_length, tfwq)
                        document_scores[doc_id] += bm25_score
            
            for doc_id in all_document_ids:
                query_proximity_dict = {}
                for term in query_terms:
                    if term in all_document_term_vectors and int(doc_id) in all_document_term_vectors[term]['positions']:
                        query_proximity_dict[term] = all_document_term_vectors[term]['positions'][int(doc_id)]
                min_span = get_min_span(query_proximity_dict)
                document_scores[int(doc_id)] += calculate_proximity_score(min_span, len(query_proximity_dict))

            sorted_docs = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)[:1000]

            for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
                output.write(str(query_num) + ' Q0 ' + str(all_document_ids[str(doc_id)]) + ' ' + str(rank) + ' ' + str(score) + ' Exp'+"\n")

# Running proximity search on top of BM-25 for unmodified unstemmed queries 

In [52]:
output_file_path = "./result_files/output_ProximitySearch_Unstemmed_Unmodified_Score.txt"
run_queries_and_write_results_bm25_PS(output_file_path, query_dict_unmodified, total_unique_terms, all_document_term_vectors_unmodified)
print("Queries executed successfully.")

Processing Query 85: allegations      corrupt public officials
Processing Query 59: weather       least  fatality   location
Processing Query 56: prime lending rate      prime rate
Processing Query 71: incursions        border area     military forces   second    guerrilla     second
Processing Query 64: result  politically  hostage taking
Processing Query 62: military coup detat  attempted
Processing Query 93: supporters   national rifle association nra
Processing Query 99: development   iran contra affair
Processing Query 58: rail strike    ongoing rail strike
Processing Query 77: poaching method       wildlife
Processing Query 54: contract   agreement       reservation  launch  commercial satellite
Processing Query 87: current    officers   failed   institution
Processing Query 94: crime    aid   computer
Processing Query 100: non communist  states    transfer  high tech goods    technologies   nations
Processing Query 89: investment   opec     downstream
Processing Query 61: israel

# Running proximity search on top of BM-25 for modified unstemmed queries

In [53]:
output_file_path = "./result_files/output_ProximitySearch_Unstemmed_Modified_Score.txt"
run_queries_and_write_results_bm25_PS(output_file_path, query_dict_unstemmed, total_unique_terms, all_document_term_vectors_unstemmed)
print("Queries executed successfully.")

Processing Query 85: corrupt public officials
Processing Query 59: weather least location
Processing Query 56: prime lending rate prime rate
Processing Query 71: incursions border guerrilla
Processing Query 64: hostage
Processing Query 62: coup attempted
Processing Query 93: national rifle association nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poaching wildlife
Processing Query 54: agreement launch commercial satellite
Processing Query 87: officers institution
Processing Query 94: crime computer
Processing Query 100: communist transfer technologies nations
Processing Query 89: investment opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: computer crime
Processing Query 68: safety workers diameter fibers
Processing Query 57: mci bell
Processing Query 97: fiber optics technology
Processing Query 98: fiber optics
Processing Query 60: salary incentive pay pay
Processing Query 80: 1988 presid

# Running proximity search on top of BM-25 for unmodified stemmed queries

In [54]:
output_file_path = "./result_files/output_ProximitySearch_Stemmed_Unmodified_Score.txt"
run_queries_and_write_results_bm25_PS(output_file_path, query_dict_unmodified_stemmed, total_unique_terms, all_document_term_vectors_unmodified_unstemmed)
print("Queries executed successfully.")

Processing Query 85: alleg corrupt public offici
Processing Query 59: weather least fatal locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border area militari forc second guerrilla second
Processing Query 64: result polit hostag take
Processing Query 62: militari coup detat attempt
Processing Query 93: support nation rifl associ nra
Processing Query 99: develop iran contra affair
Processing Query 58: rail strike ongo rail strike
Processing Query 77: poach method wildlif
Processing Query 54: contract agreement reserv launch commerci satellit
Processing Query 87: current offic fail institut
Processing Query 94: crime aid comput
Processing Query 100: non communist state transfer high tech good technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput applic crime
Processing Query 68: concern safeti manufactur worker fine diamet fiber product
Processing Query 57: mci bell system


# Running proximity search on top of BM-25 for modified stemmed queries

In [55]:
output_file_path = "./result_files/output_ProximitySearch_Stemmed_Modified_Score.txt"
run_queries_and_write_results_bm25_PS(output_file_path, query_dict, total_unique_terms, all_document_term_vectors)
print("Queries executed successfully.")

Processing Query 85: corrupt public offici
Processing Query 59: weather least locat
Processing Query 56: prime lend rate prime rate
Processing Query 71: incurs border guerrilla
Processing Query 64: hostag
Processing Query 62: coup attempt
Processing Query 93: nation rifl associ nra
Processing Query 99: iran contra affair
Processing Query 58: rail strike rail strike
Processing Query 77: poach wildlif
Processing Query 54: agreement launch commerci satellit
Processing Query 87: offic institut
Processing Query 94: crime comput
Processing Query 100: communist transfer technolog nation
Processing Query 89: invest opec downstream
Processing Query 61: israel iran contra affair
Processing Query 95: comput crime
Processing Query 68: safeti worker diamet fiber
Processing Query 57: mci bell
Processing Query 97: fiber optic technolog
Processing Query 98: fiber optic
Processing Query 60: salari incent pay pay
Processing Query 80: 1988 presidenti
Processing Query 63: machin translat
Processing Query 