In [1]:
import random
import re
import os
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Delta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Delta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Delta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
STOP_WORDS = stopwords.words('english')
LEMMA = WordNetLemmatizer()

In [4]:
def preprocess_text(input_text):
    input_text = input_text.lower()
    input_text = re.sub(r"[^a-zA-Z0-9\s]", "", input_text)
    input_text = re.sub(r"\d+", "", input_text)
    token_list = word_tokenize(input_text)
    processed_tokens = [LEMMA.lemmatize(word) for word in token_list if word not in STOP_WORDS]
    return processed_tokens


In [5]:
def load_documents_from_path(path_to_docs):
    doc_data = {}
    for file in os.listdir(path_to_docs):
        if file.endswith(".txt"):
            with open(os.path.join(path_to_docs, file), 'r', encoding='utf-8') as f:
                doc_data[file] = preprocess_text(f.read())
    return doc_data


In [6]:
def fetch_query(query_file):
    with open(query_file, 'r') as f:
        return [line.strip() for line in f.readlines()]

In [7]:
def calculate_stats(docs):
    total_docs = len(docs)
    term_doc_frequency = defaultdict(int)
    term_frequencies = defaultdict(lambda: defaultdict(int))

    for doc_id, terms in docs.items():
        print(doc_id)
        unique_terms = set(terms)
        for term in terms:
            term_frequencies[doc_id][term] += 1
        for term in unique_terms:
            term_doc_frequency[term] += 1

    return term_frequencies, term_doc_frequency, total_docs

In [8]:
def calculate_relevance(query, term_frequencies, term_doc_frequency, total_docs):
    doc_scores = {}
    for doc_id in term_frequencies:
        doc_score = 1.0
        for term in query:
            term_frequency = term_frequencies[doc_id].get(term, 0)
            document_frequency = term_doc_frequency.get(term, 0)
            p_term_in_relevant = (term_frequency + 1) / (sum(term_frequencies[doc_id].values()) + len(term_doc_frequency))
            p_term_in_non_relevant = (document_frequency + 1) / (total_docs - document_frequency + len(term_doc_frequency))
            doc_score *= (p_term_in_relevant / p_term_in_non_relevant)
        doc_scores[doc_id] = doc_score
    return doc_scores

In [9]:
def perform_document_retrieval(doc_path, query_file):
    docs = load_documents_from_path(doc_path)
    queries = fetch_query(query_file)

    term_frequencies, term_doc_frequency, total_docs = calculate_stats(docs)

    for query in queries:
        query_terms = preprocess_text(query)
        doc_scores = calculate_relevance(query_terms, term_frequencies, term_doc_frequency, total_docs)
        ranked_documents = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
        print(f"Query: {query}")
        for filename, score in ranked_documents:
            print(f"Document: {filename}, Score: {score:.4f}")
        print()

In [17]:
def main():
    documents_dir = 'C:/Users/Delta/Documents/BIM/BIM/documents/' 
    query_file_path = 'C:/Users/Delta/Documents/BIM/BIM/queries.txt'
    perform_document_retrieval(documents_dir, query_file_path)

    def generate_random_relevance_scores(queries, docs, score_range=(0, 1)):
        random_scores = {}

        for query in queries:
            random_scores[query] = {}
            for doc in docs:
                random_scores[query][doc] = random.randint(score_range[0], score_range[1])

        return random_scores

    def write_relevance_scores_to_file(relevance_scores, output_file_name):
        with open(output_file_name, 'w') as file:
            for query, doc_scores in relevance_scores.items():
                file.write(f"Query: {query}\n")
                for doc, score in doc_scores.items():
                    file.write(f"{doc}: {score}\n")
                file.write("\n")

    docs = load_documents_from_path(documents_dir)
    queries = fetch_query(query_file_path)

    random_scores = generate_random_relevance_scores(queries, docs.keys())

    output_file_name = 'sahas_scores.txt'
    write_relevance_scores_to_file(random_scores, output_file_name)

    print(f"Relevance scores saved to {output_file_name}")

if __name__ == "__main__":
    main()


Cultural Heritage of Ukraine.txt
Economic Relations Between Russia and Ukraine.txt
GlobalView.txt
HelloRussia.txt
Historical Context of Russia-Ukraine Relations.txt
NATO's Influence on Ukraine's Defense Strategy.txt
Natural Resources in Russia's Economy.txt
Putin.txt
Recent Conflicts and Regional Stability.txt
Russia's History and Modern Geopolitics.txt
Russian Culture and National Identity.txt
The Cold War's Impact on Russia-Ukraine Relations.txt
The Cultural Dynamics of Ukraine.txt
The Impact of the Conflict on Ukrainian Society.txt
The Role of Natural Resources in Russia's Economy.txt
Ukrain.txt
Ukraine's Agricultural Sector.txt
Ukraine's EU Membership Aspirations.txt
Ukraine's Pursuit of European Integration.txt
WaronRussiaandUkrain.txt
Query: Ukraine conflict 2022
Document: HelloRussia.txt, Score: 0.5000
Document: The Impact of the Conflict on Ukrainian Society.txt, Score: 0.3070
Document: Historical Context of Russia-Ukraine Relations.txt, Score: 0.1638
Document: Ukraine's Pursui

 Volume in drive C is Windows
 Volume Serial Number is 1809-4465

 Directory of C:\Users\Delta\Documents\BIM

10/14/2024  03:12 PM    <DIR>          .
10/14/2024  03:12 PM    <DIR>          ..
10/14/2024  03:08 PM    <DIR>          .ipynb_checkpoints
10/14/2024  03:13 PM    <DIR>          BIM
10/14/2024  03:12 PM             7,769 Untitled.ipynb
               1 File(s)          7,769 bytes
               4 Dir(s)  397,209,866,240 bytes free


In [13]:
cd BIM


C:\Users\Delta\Documents\BIM\BIM


 Volume in drive C is Windows
 Volume Serial Number is 1809-4465

 Directory of C:\Users\Delta\Documents\BIM\BIM

10/14/2024  03:13 PM    <DIR>          .
10/14/2024  03:13 PM    <DIR>          ..
10/14/2024  03:10 PM    <DIR>          documents
10/14/2024  03:13 PM                35 queries.txt
10/14/2024  03:08 PM                 5 README.md
               2 File(s)             40 bytes
               3 Dir(s)  397,209,841,664 bytes free
