In [None]:
import re
# from nltk.stem import PorterStemmer
# from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')


class Query:
    """
    A Boolean algebra for queries

        Q = (P(T*), and, or, not, {}, T*)

    where T is the set of all ASCII characters, T* is the set of all strings of
    ASCII characters, {} is the empty set, and P(T*) is the power set of T*.

    This allows us to construct queries such as:

        - "(or (and cat dog) (not (or fish bird)))"

    which internally is represented as:

        - ['or', ['and', 'cat', 'dog'], ['not', ['or', 'fish', 'bird']]]

    The parsed tree is stored in ``self.tokens``: a nested list whose first
    element is the operator ('and', 'or' or 'not') and whose remaining
    elements are terms (strings) or sub-trees (lists).  Queries can also be
    combined with the ``&``, ``|`` and ``~`` operators.
    """

    # Operator keywords recognised (case-insensitively) at the head of a group.
    _OPERATORS = frozenset({'and', 'or', 'not'})

    def __init__(self, tokens=None):
        """Create a query from a token tree; defaults to the empty conjunction."""
        # Always define ``tokens`` so repr()/str() work on a bare Query().
        self.tokens = ['and'] if tokens is None else tokens

    @classmethod
    def from_query_string(cls, query: str):
        """Build a query by parsing a prefix-notation query string."""
        query_obj = cls()
        query_obj.tokens = query_obj.tokenize(query)
        return query_obj

    @classmethod
    def from_tokens(cls, tokens):
        """Build a query from an already-parsed token tree (stored as-is)."""
        query = cls()
        query.tokens = tokens
        return query

    def tokenize(self, query: str) -> list:
        """
        Parse a prefix-notation query string into a nested token list.

        Words and parentheses are the only tokens; everything else is
        ignored.  A group that does not start with 'and', 'or' or 'not'
        (matched case-insensitively) is given an implicit 'and'.  Terms are
        kept exactly as written; no stemming or stop-word removal is done.

        Parsing is lenient and never raises on malformed input:

        - an empty query yields the empty conjunction ``['and']``;
        - groups left open at the end of the input are closed implicitly;
        - a ')' with no matching '(' is ignored;
        - anything following the first top-level group is kept and combined
          with it under an implicit 'and'.
        """
        # Reversed so the next token sits at the end of the list: pop() there
        # is O(1), whereas list.pop(0) shifts every remaining element.
        pending = re.findall(r'\b\w+\b|\(|\)', query)[::-1]

        def _starts_with_operator():
            return bool(pending) and pending[-1].lower() in Query._OPERATORS

        def _parse(nested):
            """Parse one group; when nested, stop at (and consume) its ')'."""
            if _starts_with_operator():
                node = [pending.pop().lower()]
            else:
                node = ['and']
            while pending:
                token = pending.pop()
                if token == ')':
                    if nested:
                        break
                    continue  # stray ')' at top level: nothing to close
                # The '(' is consumed here, so the nested parse must not
                # strip another one (doing so would unbalance '((a b))').
                node.append(_parse(True) if token == '(' else token)
            return node

        explicit_operator = _starts_with_operator()
        tree = _parse(False)
        # A query that is exactly one parenthesised group, e.g. "(or a b)",
        # is that group itself rather than an 'and' wrapped around it.
        if not explicit_operator and len(tree) == 2 and isinstance(tree[1], list):
            return tree[1]
        return tree

    def __and__(self, other: 'Query') -> 'Query':
        """Return the conjunction ``self & other`` as a new query."""
        return Query.from_tokens(['and', self.tokens, other.tokens])

    def __or__(self, other: 'Query'):
        """Return the disjunction ``self | other`` as a new query."""
        return Query.from_tokens(['or', self.tokens, other.tokens])

    def __invert__(self):
        """Return the negation ``~self`` as a new query."""
        return Query.from_tokens(['not', self.tokens])

    def __repr__(self):
        return f"Query.from_tokens({self.tokens})"

    def __str__(self):
        """Render the token tree back into prefix-notation query syntax."""
        def _render(node):
            if isinstance(node, list):
                # Joining operator and operands together avoids the trailing
                # space an operand-less group such as ['and'] would produce.
                return "(" + " ".join(_render(child) for child in node) + ")"
            return node
        return _render(self.tokens)

In [11]:
# Bare terms with no leading operator are grouped under an implicit "and".
q1 = Query.from_query_string("cat dog")
# Show the query-syntax form first, then the constructor-style repr.
print(str(q1), repr(q1), sep="\n")


(and cat dog)
Query.from_tokens(['and', 'cat', 'dog'])


In [16]:

# A leading operator keyword sets the operator for the whole group.
q2 = Query.from_query_string("or fish bird")
print(q2.tokens)


# Compose step by step with the overloaded operators: ~(~(q1 & ~q2) | q2).
q3 = q1 & ~q2
q3 = ~q3 | q2
q3 = ~q3
print(q3)


['or', 'fish', 'bird']
(not (or (not (and (and cat dog) (not (or fish bird)))) (or fish bird)))
