First, I load the corpus.

In [127]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import numpy as np
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

class AlgebraicSearchEngine:
    """Score documents against prefix-notation boolean queries.

    Every document is normalised (punctuation stripped, lower-cased,
    stopwords removed, stemmed) and vectorised once at construction time.
    A query such as ``"(AND cat (NOT dog))"`` is then evaluated into one
    score per document:

    * a bare term scores each document by the dot product of the document
      vector with the vectorised term;
    * ``AND`` takes the element-wise minimum of its operands;
    * ``OR`` takes the element-wise maximum of its operands;
    * ``NOT`` takes ``1 - score`` of its single operand.

    Attributes:
        docs: The documents exactly as passed in.
        proc_docs: The normalised documents, one space-joined string each.
        proc_doc_vectors: Dense array with one row per document, produced
            by ``vectorizer.fit_transform(proc_docs).toarray()``.
    """

    # Operators accepted as the first token of every parenthesised group.
    _OPERATORS = ('AND', 'OR', 'NOT')

    def __init__(self,
                 docs,
                 stemmer=None,
                 vectorizer=None,
                 stopwords=None):
        """Normalise and vectorise ``docs``.

        Args:
            docs: Iterable of document strings.
            stemmer: Object exposing ``stem(word)``. Defaults to a new
                ``PorterStemmer``.
            vectorizer: Object exposing ``fit_transform`` and ``transform``
                whose results provide ``toarray()``. Defaults to a new
                ``TfidfVectorizer``.
            stopwords: Container of words to drop from documents. Defaults
                to NLTK's English stopword list.
        """
        # Defaults are created per instance. Instances used as default
        # argument values are built once and shared, so fitting one engine
        # would silently refit the vectorizer of every other default engine.
        if stemmer is None:
            stemmer = PorterStemmer()
        if vectorizer is None:
            vectorizer = TfidfVectorizer()
        if stopwords is None:
            # The parameter name shadows the module-level NLTK import, so
            # the corpus is re-imported here under a different name.
            from nltk.corpus import stopwords as nltk_stopwords
            stopwords = nltk_stopwords.words('english')

        self.stemmer = stemmer
        self.vectorizer = vectorizer
        self.stopwords = stopwords

        # One normalised, space-joined string per input document.
        proc_docs = [' '.join(self.process_doc(doc)) for doc in docs]

        self.docs = docs
        self.proc_docs = proc_docs
        self.proc_doc_vectors = self.vectorizer.fit_transform(proc_docs).toarray()

    def process_doc(self, doc):
        """Return the list of stemmed, non-stopword tokens of ``doc``."""
        # remove punctuation
        doc = re.sub(r'[^\w\s]', '', doc)

        # lower-case, then split on whitespace
        words = doc.lower().split()

        # drop stopwords and stem what is left
        return [self.stemmer.stem(word)
                for word in words
                if word not in self.stopwords]

    def recursive_search(self, tokens):
        """Evaluate one operator group and return its per-document scores.

        ``tokens`` must start at the operator, i.e. just after the opening
        ``'('``. The evaluated group, including its closing ``')'``, is
        removed from the front of ``tokens`` in place; later tokens are
        left untouched.

        Args:
            tokens: List of query tokens as produced by ``process_query``.

        Returns:
            Array with one score per document.

        Raises:
            ValueError: If the operator is unknown, the closing ``')'`` is
                missing, ``NOT`` does not have exactly one operand, or
                ``AND``/``OR`` has no operands.
        """
        scores, consumed = self._evaluate_group(tokens, 0)
        del tokens[:consumed]
        return scores

    def _evaluate_group(self, tokens, pos):
        """Evaluate the group whose operator is at ``tokens[pos]``.

        Returns ``(scores, next_pos)`` where ``next_pos`` is the index just
        past the group's closing ``')'``.
        """
        if pos >= len(tokens):
            raise ValueError("Invalid query: expected an operator")

        operator = tokens[pos].upper()
        pos += 1
        if operator not in self._OPERATORS:
            raise ValueError(f"Invalid operator {operator}")

        operands = []
        while True:
            if pos >= len(tokens):
                raise ValueError("Invalid query: missing ')'")
            token = tokens[pos]
            pos += 1
            if token == ')':
                break
            if token == '(':
                scores, pos = self._evaluate_group(tokens, pos)
            else:
                # Score every document against the single query term.
                term_vec = self.vectorizer.transform([token]).toarray()[0]
                scores = self.proc_doc_vectors.dot(term_vec)
            operands.append(scores)

        if operator == 'NOT':
            if len(operands) != 1:
                raise ValueError("NOT operator can only have one operand")
            # NOTE(review): the complement is only meaningful when scores
            # lie in [0, 1] - confirm for the vectorizer in use.
            return 1 - operands[0], pos

        if not operands:
            raise ValueError(f"{operator} operator requires at least one operand")

        stacked = np.array(operands)
        if operator == 'AND':
            return np.min(stacked, axis=0), pos
        return np.max(stacked, axis=0), pos

    def process_query(self, query):
        """Split ``query`` into parentheses, operators and stemmed terms.

        Parentheses are kept verbatim. The word directly after a ``'('`` is
        treated as an operator and only lower-cased, so a misspelling such
        as ``ANDS`` is not stemmed into a valid operator. Every other word
        is lower-cased and stemmed, mirroring ``process_doc``.
        """
        tokens = []
        previous = None
        for token in re.findall(r'\b\w+\b|\(|\)', query):
            if token in ('(', ')'):
                tokens.append(token)
            elif previous == '(':
                tokens.append(token.lower())
            else:
                tokens.append(self.stemmer.stem(token.lower()))
            previous = token
        return tokens

    def search(self, query):
        """Return one score per document for ``query``.

        The query must start with ``'('``. Tokens after the first complete
        group are ignored.

        Raises:
            ValueError: If the query is empty, does not start with ``'('``,
                or is otherwise malformed.
        """
        tokens = self.process_query(query)
        if not tokens or tokens[0] != '(':
            raise ValueError("Invalid query")

        return self.recursive_search(tokens[1:])


In [None]:
# Small demo corpus; most entries mention cats and/or dogs so the example
# query has a mix of matching and non-matching documents.
docs = [
    "The cat in the hat",
    "This is just a document with no other purpose than to show how the search engine works.",
    "A dog and his boy.",
    "A boy jumps up and down.",
    "The cats are out of the bag.",
    "Dogs and cats, living together.",
    "The quick brown fox jumps over the lazy dog.",
    "Cats, cats, cats, cats, cats, and maybe a dog!",
    "The dog did not bite the cat.",
    "quick dog cat",
    "a quick dog bite a cat",
    "dog cat",
    "quick dog",
    "Dog, dogs, dogs, dogs, dogs! And maybe a cat.",
    "Dog, dogs, dogs! And maybe a cat.",
    "Okay, now is the time, for all the good men, to come to the aid of their country.",
    "Cat cat cat cat cat cat cats cats cats!",
    "test",
]

# Two engines over the same corpus: one backed by binary term counts, the
# other by TF-IDF weights.
boolean_search_engine = AlgebraicSearchEngine(
    docs=docs,
    vectorizer=CountVectorizer(binary=True),
)
fuzzy_search_engine = AlgebraicSearchEngine(
    docs=docs,
    vectorizer=TfidfVectorizer(),
)

# A more deeply nested query to try instead:
#   "(AND (AND cat (NOT bite) dog) (AND dog quick) (NOT (AND boy jump)))"
query = "(AND cat dog)"
print(f"Query: {query}")

# Print every document with its score, boolean engine first, with a
# separator line between the two listings.
for engine in (boolean_search_engine, fuzzy_search_engine):
    if engine is fuzzy_search_engine:
        print("---")
    results = engine.search(query)
    for i, doc in enumerate(engine.docs):
        print(f"Document {i+1}: {doc} => {results[i]}")
