In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import numpy as np
import random
import re
import os
from collections import defaultdict
from math import log

In [13]:
def preprocess_content(text_content):
    return re.findall(r'\b\w+\b', text_content.lower())

In [23]:
def load_documents(directory_path):
    document_store = {}
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.txt'):
            with open(os.path.join(directory_path, file_name), 'r', encoding='utf-8') as doc_file:
                document_store[file_name] = preprocess_content(doc_file.read())

    return document_store


In [24]:
def load_queries(query_file_path):
    with open(query_file_path, 'r') as query_file:
        return [line.strip() for line in query_file.readlines()]

In [25]:
def calculate_statistics(document_store):
    # Initialize statistics tracking
    total_docs = len(document_store)
    word_doc_frequency = defaultdict(int)
    word_frequency = defaultdict(lambda: defaultdict(int))

    for doc_name, terms in document_store.items():
        unique_terms = set(terms)
        for term in terms:
            word_frequency[doc_name][term] += 1
        for term in unique_terms:
            word_doc_frequency[term] += 1

    return word_frequency, word_doc_frequency, total_docs

In [26]:
def compute_relevance_scores(query_terms, word_frequency, word_doc_frequency, total_docs):
    relevance_scores = {}
    for doc_name in word_frequency:
        doc_score = 1.0
        for term in query_terms:
            term_freq = word_frequency[doc_name].get(term, 0)
            doc_freq = word_doc_frequency.get(term, 0)
            p_relevant = (term_freq + 1) / (sum(word_frequency[doc_name].values()) + len(word_doc_frequency))
            p_not_relevant = (doc_freq + 1) / (total_docs - doc_freq + len(word_doc_frequency))
            doc_score *= (p_relevant / p_not_relevant)
        relevance_scores[doc_name] = doc_score
    return relevance_scores

In [27]:
def assign_random_relevance(query_list, doc_list, relevance_range=(0, 1)):
    random_relevance = {}

    for query in query_list:
        random_relevance[query] = {}
        for document in doc_list:
            random_relevance[query][document] = random.randint(relevance_range[0], relevance_range[1])

    return random_relevance

In [28]:
def save_relevance_scores(output_scores, output_file_path):

    with open(output_file_path, 'w') as output_file:
        for query, doc_scores in output_scores.items():
            for doc, score in doc_scores.items():
                output_file.write(f"{query},{doc},{score}\n")

In [33]:
def process_documents_and_queries(data_directory, query_file):
    document_store = load_documents(data_directory)
    query_list = load_queries(query_file)


    word_frequency, word_doc_frequency, total_docs = calculate_statistics(document_store)


    for query in query_list:
        query_terms = preprocess_content(query)
        relevance_scores = compute_relevance_scores(query_terms, word_frequency, word_doc_frequency, total_docs)
        sorted_docs = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)
        print(f"Query: {query}")
        for doc_name, score in sorted_docs:
            print(f"Document: {doc_name}, Score: {score:.4f}")
        print()


    random_relevance_scores = assign_random_relevance(query_list, document_store.keys())
    output_file_name = 'sahas_scores_output.txt'
    save_relevance_scores(random_relevance_scores, output_file_name)

    print(f"Relevance scores saved to {output_file_name}")


In [34]:
print(os.listdir('/content/drive/MyDrive/Westcliff/IR'))

['Binary_Independence_Model.ipynb', 'queries.txt', 'week_2__sahas.ipynb', 'WaronRussiaandUkrain.txt', 'WarRUs.txt', 'Ukrain.txt', 'GlobalView.txt', 'Putin.txt', 'HelloRussia.txt', 'week3.ipynb']


In [36]:
folder_location = '/content/drive/MyDrive/Westcliff/IR'
query_file_location = '/content/drive/MyDrive/Westcliff/IR/queries.txt'
process_documents_and_queries(folder_location, query_file_location)

Query: Energy Crisis
Document: queries.txt, Score: 0.3310
Document: WarRUs.txt, Score: 0.1432
Document: HelloRussia.txt, Score: 0.0756
Document: WaronRussiaandUkrain.txt, Score: 0.0610
Document: Ukrain.txt, Score: 0.0597
Document: Putin.txt, Score: 0.0372
Document: GlobalView.txt, Score: 0.0327

Query: Refugee Displacement
Document: queries.txt, Score: 0.9948
Document: WaronRussiaandUkrain.txt, Score: 0.1832
Document: Ukrain.txt, Score: 0.1794
Document: WarRUs.txt, Score: 0.1435
Document: Putin.txt, Score: 0.1119
Document: GlobalView.txt, Score: 0.0984
Document: HelloRussia.txt, Score: 0.0379

Query: NATO Involvement
Document: queries.txt, Score: 0.4416
Document: HelloRussia.txt, Score: 0.3027
Document: WaronRussiaandUkrain.txt, Score: 0.0813
Document: Ukrain.txt, Score: 0.0796
Document: WarRUs.txt, Score: 0.0637
Document: Putin.txt, Score: 0.0497
Document: GlobalView.txt, Score: 0.0437

Query: Economic Sanctions
Document: GlobalView.txt, Score: 0.4701
Document: queries.txt, Score: 0.1