In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import string
import logging
import re # Import regular expressions library
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer
# Initialize the stop words and lemmatizer
STOPWORDS = set( stopwords . words ('english') )
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Function to load documents from a specified directory
def load_documents(directory):
    documents = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as file:
                documents[filename] = file.read()
    return documents

documents = load_documents('directory')

In [None]:
# Function to clean and preprocess text (lowercase, tokenization, stopwords removal, and lemmatization)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]
    return tokens

cleaned_documents = {filename: clean_text(content) for filename, content in documents.items()}

In [None]:
# Function to create an inverted index
def create_inverted_index(documents):
    inverted_index = defaultdict(set)
    for filename, tokens in documents.items():
        for word in tokens:
            inverted_index[word].add(filename)
    return inverted_index

inverted_index = create_inverted_index(cleaned_documents)

In [None]:
# Initialize all_documents with the set of all document filenames
all_documents = set(documents.keys())

# Function for 'AND' query (finds common documents for all terms)
def and_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result &= inverted_index.get(term, set())
    return result

In [None]:
# Function for 'OR' query (finds documents that contain any of the terms)
def or_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result |= inverted_index.get(term, set())
    return result

In [None]:
# Function for 'NOT' query (finds documents that do not contain the specified term)
def not_query(term, inverted_index, all_documents):
    return all_documents - inverted_index.get(term, set())

In [None]:
# Function to convert document IDs (filenames) to a list
def convert_doc_ids_to_filenames(doc_ids):
    return list(doc_ids)

In [None]:
# Function to process the query and execute the appropriate Boolean operation
def process_query(query, inverted_index, all_documents):
    # Tokenize and preprocess the query
    terms = [LEMMATIZER.lemmatize(term) for term in word_tokenize(query.lower()) if term not in STOPWORDS]
    if 'and' in terms:
        terms.remove('and')
        result = and_query(terms, inverted_index)
    elif 'or' in terms:
        terms.remove('or')
        result = or_query(terms, inverted_index)
    elif 'not' in terms:
        terms.remove('not')
        result = not_query(terms[0], inverted_index, all_documents)
    else:
        result = inverted_index.get(terms[0], set())
    return convert_doc_ids_to_filenames(result)

In [None]:
# Example usage
query = "not see"
result = process_query(query, inverted_index, all_documents)
print(result)

['Song 7.txt', 'Song 6.txt', 'Song 3.txt', 'Song 10.txt']


In [None]:
# Example usage
query = "play and cool"
result = process_query(query, inverted_index, all_documents)
print(result)

['Song 7.txt']


In [None]:
# Example usage
query = "baby or know"
result = process_query(query, inverted_index, all_documents)
print(result)

['Song 7.txt', 'Song 6.txt', 'Song 10.txt', 'Song 9.txt']
