In [1]:
import re
from typing import List, Union

class Query:
    """
    A Boolean algebra for constructing and evaluating queries.

    Theory:
        Q = (P(T*), and, or, not, {}, T*)

        where:
            - T is the set of all ASCII characters,
            - T* is the set of all strings of ASCII characters,
            - {} is the empty set,
            - P(T*) is the power set of T*.

        This framework allows constructing queries such as:
            "(or (and cat dog) (not (or fish bird)))"
        which is internally represented as:
            ['or', ['and', 'cat', 'dog'], ['not', ['or', 'fish', 'bird']]]

        Queries can also be combined using Python operators:
            Q1 & Q2  # Represents logical AND
            Q1 | Q2  # Represents logical OR
            ~Q1     # Represents logical NOT

    Attributes:
        tokens (list): Nested-list form of the query. Every list is
            ``[operator, operand, ...]`` where each operand is either a term
            (``str``) or another such list. An empty list is an empty query.
    """

    # Keywords recognised (case-insensitively) as the operator of a group.
    _OPERATORS = ('and', 'or', 'not')

    # A token is a run of word characters or a single parenthesis; every
    # other character in the query string is ignored.
    _TOKEN_RE = re.compile(r'\b\w+\b|\(|\)')

    def __init__(self, query: Union[str, List, None] = None):
        """
        Initialize a Query instance.

        Args:
            query (str or list, optional): A query string or a list of tokens.
                If a string is provided, it is parsed into tokens.
                If a list is provided, it is used directly (not copied) as
                the token representation.
                Defaults to None, which initializes an empty query.

        Raises:
            TypeError: If ``query`` is not a string, a list or None.
            ValueError: If ``query`` is a string that cannot be parsed.
        """
        if isinstance(query, str):
            self.tokens = self.tokenize(query)
        elif isinstance(query, list):
            self.tokens = query
        elif query is None:
            self.tokens = []
        else:
            raise TypeError("Query must be initialized with a string or a list of tokens.")

    def tokenize(self, query: str) -> List:
        """
        Tokenize the input query string into a nested list structure.

        Grammar (the outermost pair of parentheses is optional)::

            query := '(' group ')' | group
            group := [operator] item*
            item  := term | '(' group ')'

        A group that does not start with ``and``/``or``/``not`` is treated as
        an implicit ``and``. Operators are lower-cased; terms keep their case.

        Args:
            query (str): The query string.

        Returns:
            list: A nested list representing the parsed query.

        Raises:
            ValueError: If the query contains no tokens, has mismatched
                parentheses, or has tokens after the closing parenthesis of
                a parenthesized top-level group.
        """
        tokens = self._TOKEN_RE.findall(query)
        if not tokens:
            raise ValueError("Unexpected end of query.")

        # Cursor into `tokens`; advancing an index avoids the O(n) cost of
        # repeatedly popping from the front of the list.
        pos = 0

        def _group(parenthesized: bool) -> List:
            """Parse one group starting at `pos`; its '(' is already consumed."""
            nonlocal pos
            if pos < len(tokens) and tokens[pos].lower() in self._OPERATORS:
                op = tokens[pos].lower()
                pos += 1
            else:
                op = 'and'

            result = [op]
            while pos < len(tokens):
                token = tokens[pos]
                pos += 1
                if token == ')':
                    if not parenthesized:
                        raise ValueError("Mismatched parentheses: unexpected ')'.")
                    return result
                if token == '(':
                    # Exactly one '(' is consumed per nesting level, so
                    # "((a b) c)" keeps both levels.
                    result.append(_group(True))
                else:
                    result.append(token)

            if parenthesized:
                raise ValueError("Mismatched parentheses: missing ')'.")
            return result

        if tokens[0] == '(':
            pos = 1
            parsed = _group(True)
            if pos != len(tokens):
                raise ValueError(
                    f"Unexpected token after end of query: {tokens[pos]!r}"
                )
            return parsed
        return _group(False)

    def eval(self, docs: List) -> List[bool]:
        """
        Evaluate the query against a list of documents.

        Args:
            docs: A list where each document has method for determining if it
                contains a term, `__contains__`. Any value that is not a
                ``list`` is treated as a single document.

        Returns:
            List[bool]: A list indicating whether each document matches the query.

        Raises:
            ValueError: If the query (or a nested group) is empty, uses an
                unknown operator, or applies `not` to anything but exactly
                one operand.
            TypeError: If a query part is neither a string nor a list.
        """
        def _eval(query_part: Union[str, list], doc) -> bool:
            if isinstance(query_part, str):
                return query_part in doc
            if not isinstance(query_part, list):
                raise TypeError("Query parts must be strings or lists.")
            if not query_part:
                # Without this guard an empty query fails with a bare IndexError.
                raise ValueError("Cannot evaluate an empty query.")

            op, operands = query_part[0], query_part[1:]
            if op == 'and':
                return all(_eval(part, doc) for part in operands)
            if op == 'or':
                return any(_eval(part, doc) for part in operands)
            if op == 'not':
                if len(operands) != 1:
                    raise ValueError("`not` operation must have exactly one operand.")
                return not _eval(operands[0], doc)
            raise ValueError(f"Unknown operator: {op}")

        if not isinstance(docs, list):
            docs = [docs]

        return [_eval(self.tokens, doc) for doc in docs]

    def __and__(self, other: 'Query') -> 'Query':
        """
        Combine two queries with a logical AND.

        Args:
            other (Query): Another Query instance.

        Returns:
            Query: A new Query representing the logical AND of both queries.
            ``NotImplemented`` is returned for non-Query operands so Python
            can try the reflected operation or raise ``TypeError``.
        """
        if not isinstance(other, Query):
            return NotImplemented
        return Query(['and', self.tokens, other.tokens])

    def __or__(self, other: 'Query') -> 'Query':
        """
        Combine two queries with a logical OR.

        Args:
            other (Query): Another Query instance.

        Returns:
            Query: A new Query representing the logical OR of both queries.
            ``NotImplemented`` is returned for non-Query operands so Python
            can try the reflected operation or raise ``TypeError``.
        """
        if not isinstance(other, Query):
            return NotImplemented
        return Query(['or', self.tokens, other.tokens])

    def __invert__(self) -> 'Query':
        """
        Negate the query with a logical NOT.

        Returns:
            Query: A new Query representing the logical NOT of the current query.
        """
        return Query(['not', self.tokens])

    def __repr__(self) -> str:
        return f"Query({self.tokens})"

    def __str__(self) -> str:
        """
        Convert the internal token representation back to a query string.

        Returns:
            str: The string representation of the query; the empty string
            for an empty query.
        """
        def _build(tokens: Union[str, list]) -> str:
            if isinstance(tokens, str):
                return tokens
            # The operator is itself a string, so it is rendered by the same
            # recursive call as the operands.
            return "(" + " ".join(_build(t) for t in tokens) + ")"

        if not self.tokens:
            return ""
        return _build(self.tokens)


In [2]:
# Build a few queries: two parsed from strings, two derived with operators.
q1 = Query("cat dog")
q2 = Query("(or fish bird)")
q3 = ~q2
combined_query = q1 & q3
# The operator-built query must render as the nested s-expression below.
assert str(combined_query) == "(and (and cat dog) (not (or fish bird)))"

# The same condition written as one flat group; no operator is given, so the
# group is parsed as an implicit "and".
q4 = Query("( cat dog (not (or fish bird)))")

# Each document is a plain list of the terms it contains.
documents = [
    ["cat", "dog"],
    ["fish"],
    ["bird"],
    ["cat", "dog", "fish"],
    ["cat", "dog", "bird"],
    ["cat"],
    ["dog"],
    ["fish", "bird"],
    ["cat", "dog", "fish", "bird"],
]

# Evaluate every query against the whole corpus first ...
results1 = q1.eval(documents)
results2 = q2.eval(documents)
results3 = q3.eval(documents)
results4 = q4.eval(documents)
results_combined = combined_query.eval(documents)

# ... then show each query followed by its per-document match flags.
for query, matches in (
    (q1, results1),
    (q2, results2),
    (q3, results3),
    (q4, results4),
    (combined_query, results_combined),
):
    print(query)
    print(matches)



(and cat dog)
[True, False, False, True, True, False, False, False, True]
(or fish bird)
[False, True, True, True, True, False, False, True, True]
(not (or fish bird))
[True, False, False, False, False, True, True, False, False]
(and cat dog (not (or fish bird)))
[True, False, False, False, False, False, False, False, False]
(and (and cat dog) (not (or fish bird)))
[True, False, False, False, False, False, False, False, False]
