In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import string
import logging
import re  # Import regular expressions library
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer ()

In [4]:
def load_text_files(folder_path):
    """Reads all files in a folder and returns a dictionary
    with filenames as keys and content as values."""
    data = {}
    doc_id_to_filename = {}
    doc_id = 0

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename  # Map doc_id to filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
            doc_id += 1  # Increment document ID for the next file

    return data, doc_id_to_filename

In [5]:
def clean_text(text):
    """Performs text cleaning: removing special characters, tokenization, stopword removal, and lemmatization."""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and punctuation using regular expressions
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Keeps only alphanumeric characters and spaces

    # Tokenize the cleaned text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]

    return cleaned_tokens


In [6]:
def build_inverted_index(data):
    """Builds an inverted index from the cleaned text data and tracks term frequencies."""

    inverted_index = defaultdict(set)
    term_frequencies = Counter()  # Track the frequency of each term

    for doc_id, content in data.items():
        cleaned_tokens = clean_text(content)

        for token in cleaned_tokens:
            inverted_index[token].add(doc_id)
            term_frequencies[token] += 1  # Update term frequency

    return inverted_index, term_frequencies

In [7]:
def boolean_query(query, inverted_index, doc_id_to_filename):
    """Processes a Boolean query ('AND', 'OR', 'NOT') on the inverted index and returns filenames."""

    query = query.lower()
    tokens = query.split()

    result_set = set()

    if 'and' in tokens:
        terms = [term for term in tokens if term not in ['and', 'or', 'not']]

        # Check if all terms exist in the inverted index
        if all(term in inverted_index for term in terms):
            result_set = inverted_index[terms[0]]
            for term in terms[1:]:
                result_set = result_set.intersection(inverted_index[term])
        else:
            result_set = set()  # Return empty result if any term is missing

    elif 'or' in tokens:
        terms = [term for term in tokens if term not in ['and', 'or', 'not']]
        for term in terms:
            if term in inverted_index:
                if not result_set:
                    result_set = inverted_index[term]
                else:
                    result_set = result_set.union(inverted_index[term])

    elif 'not' in tokens:
        term = tokens[1]
        if term in inverted_index:
            result_set = set(inverted_index.keys()) - inverted_index[term]
        else:
            result_set = set(inverted_index.keys())  # If term doesn't exist, return all docs

    else:
        # If no 'AND', 'OR', 'NOT' operators, check if single query term exists
        if query in inverted_index:
            result_set = inverted_index[query]
        else:
            result_set = set()  # If query term doesn't exist, return empty set

    # Convert doc_ids to filenames
    result_filenames = [doc_id_to_filename[doc_id] for doc_id in result_set if doc_id in doc_id_to_filename]

    logging.info(f"Query '{query}' resulted in: {result_filenames}")

    return result_filenames

In [8]:
def generate_queries_file(term_frequencies):
    """
    Generates a queries.txt file with a variety of example queries based on the most frequent terms.
    """

    # Open the file for writing
    with open("queries.txt", "w") as file:
        # Get top 10 most common terms
        most_common_terms = [term for term, freq in term_frequencies.most_common(10)]

        # Write multiple 'AND' queries
        if len(most_common_terms) >= 4:
            and_query1 = f"{most_common_terms[0]} AND {most_common_terms[1]}"
            and_query2 = f"{most_common_terms[2]} AND {most_common_terms[3]}"
            file.write(f"{and_query1}\n")
            file.write(f"{and_query2}\n")

        # Write multiple 'OR' queries
        if len(most_common_terms) >= 6:
            or_query1 = f"{most_common_terms[1]} OR {most_common_terms[2]}"
            or_query2 = f"{most_common_terms[4]} OR {most_common_terms[5]}"
            file.write(f"{or_query1}\n")
            file.write(f"{or_query2}\n")

        # Write multiple 'NOT' queries
        if len(most_common_terms) >= 6:
            not_query1 = f"NOT {most_common_terms[3]}"
            not_query2 = f"NOT {most_common_terms[5]}"
            file.write(f"{not_query1}\n")
            file.write(f"{not_query2}\n")

        # Add some complex 'AND OR NOT' queries
        if len(most_common_terms) >= 6:
            complex_query1 = f"{most_common_terms[0]} AND {most_common_terms[1]} OR NOT {most_common_terms[2]}"
            complex_query2 = f"{most_common_terms[3]} OR {most_common_terms[4]} AND NOT {most_common_terms[5]}"
            file.write(f"{complex_query1}\n")
            file.write(f"{complex_query2}\n")


In [9]:
def main():
    # Load dataset
    folder_path = '/content/drive/MyDrive/dataset/docs'
    data, doc_id_to_filename = load_text_files(folder_path)

    # Build the inverted index and term frequencies
    inverted_index, term_frequencies = build_inverted_index(data)

    # Generate the queries.txt file
    generate_queries_file(term_frequencies)

    # Read the queries from the generated queries.txt file
    with open("queries.txt", "r") as query_file:
        queries = query_file.readlines()

    # Open a file to write the results
    with open("query_results.txt", "w") as result_file:
        for query in queries:
            query = query.strip()  # Remove any leading/trailing whitespace
            if query:  # Skip empty lines
                result = boolean_query(query, inverted_index, doc_id_to_filename)
                result_str = f"Results for '{query}': {result}\n"
                print(result_str)  # Print to console
                result_file.write(result_str)  # Write to file

if __name__ == "__main__":
    main()


Results for 'dod AND bike': ['doc-6.txt', 'doc-4.txt', 'doc-10.txt', 'doc-8.txt', 'doc-1.txt', 'doc-5.txt']

Results for 'ride AND one': ['doc-6.txt', 'doc-4.txt', 'doc-7.txt', 'doc-1.txt', 'doc-5.txt', 'doc-3.txt']

Results for 'bike OR ride': ['doc-6.txt', 'doc-4.txt', 'doc-7.txt', 'doc-10.txt', 'doc-8.txt', 'doc-1.txt', 'doc-5.txt', 'doc-3.txt', 'doc-2.txt']

Results for 'motorcycle OR like': ['doc-6.txt', 'doc-4.txt', 'doc-9.txt', 'doc-10.txt', 'doc-8.txt', 'doc-1.txt', 'doc-5.txt', 'doc-3.txt', 'doc-2.txt']

Results for 'NOT one': []

Results for 'NOT like': []

Results for 'dod AND bike OR NOT ride': ['doc-6.txt', 'doc-4.txt', 'doc-10.txt', 'doc-1.txt', 'doc-5.txt']

Results for 'one OR motorcycle AND NOT like': ['doc-6.txt', 'doc-4.txt', 'doc-1.txt']

