In [1]:
import os
import math
import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Delta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Delta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Delta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
ENGLISH_STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()


In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\d+", "", text)
    tokens = word_tokenize(text)
    return [LEMMATIZER.lemmatize(word) for word in tokens if word not in ENGLISH_STOPWORDS]


In [5]:
def calc_term_freq(term, document):
    return document.count(term) / len(document)

In [6]:
def calc_inverse_doc_freq(term, all_docs):
    doc_count_with_term = sum(1 for doc in all_docs if term in doc)
    return math.log(len(all_docs) / (doc_count_with_term)) if doc_count_with_term > 0 else 0


In [7]:
def calc_tfidf_vector(document, all_docs, vocab):
    tfidf_vec = []
    for term in vocab:
        tf = calc_term_freq(term, document)
        idf = calc_inverse_doc_freq(term, all_docs)
        tfidf_vec.append(tf * idf)
    return np.array(tfidf_vec)

In [8]:
def compute_cosine_similarity(vector1, vector2):
    dot_prod = np.dot(vector1, vector2)
    norm_vec1 = np.linalg.norm(vector1)
    norm_vec2 = np.linalg.norm(vector2)
    return dot_prod / (norm_vec1 * norm_vec2) if norm_vec1 > 0 and norm_vec2 > 0 else 0


In [9]:
def get_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    title = soup.title.string if soup.title else ""
    body = soup.body.get_text(separator=" ") if soup.body else ""
    return title, body

In [10]:
def load_docs_from_html(folder_path):
    docs = []
    filenames = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".html"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                title, body = get_text_from_html(content)
                docs.append(preprocess_text(title + " " + body))
                filenames.append(filename)
    
    return docs, filenames


In [12]:
# Precision K calculation
def calc_precision_at_k(relevant_docs, ranked_docs, k):
    if not ranked_docs:
        return 0.0
    top_k = ranked_docs[:k]
    relevant_in_top_k = sum(1 for doc in top_k if doc[0] in relevant_docs)
    return relevant_in_top_k / k


In [13]:
# Main function
def main():
    folder_path = './documents/'
    documents, filenames = load_docs_from_html(folder_path)

    # Iterate over each query
    for query in queries:
        cleaned_query = preprocess_text(query)
        vocab = sorted(set(word for doc in documents + [cleaned_query] for word in doc))

        # Compute TF-IDF vectors
        query_vector = calc_tfidf_vector(cleaned_query, documents, vocab)
        doc_vectors = [calc_tfidf_vector(doc, documents, vocab) for doc in documents]

        # Calculate cosine similarity
        similarities = [(filenames[i], compute_cosine_similarity(query_vector, doc_vector)) for i, doc_vector in enumerate(doc_vectors)]

        # Sort similarities
        similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

        # Get relevant documents for this query
        relevant_docs = relevant_docs_dict.get(query, set())

        # Calculate Precision@K
        k = 5
        precision_at_k = calc_precision_at_k(relevant_docs, similarities, k)

        with open("evaluation_results.txt", "a") as f:
            f.write(f"Results for query: '{query}'\n")
            f.write(f"Precision@{k}: {precision_at_k:.4f}\n")
            for title, similarity in similarities[:k]:
                f.write(f"Document: {title}, Similarity: {similarity:.4f}\n")
            f.write("\n")

    print("Results written to 'evaluation_results.txt'.")
    with open('sahas_evaluation_results.txt', 'r') as f:
        content = f.readlines()
        for line in content:
            if line.startswith("Document:"):
                parts = line.split(", Similarity:")
                filename = parts[0][10:]
                similarity_score = parts[1].strip()
                if len(filename) > 48:
                    formatted_filename = f"{filename[:25]}...{filename[-25:]}, Similarity: {similarity_score}"
                else:
                    formatted_filename = line.strip()
                print(formatted_filename)
            else:
                print(line.strip())

if __name__ == "__main__":
    main()

NameError: name 'queries' is not defined