In [6]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

class AlgebraicSearchEngine:
    def __init__(self, docs, stemmer=None, vectorizer=None, stopwords_list=None):
        # Initialize components with defaults if not provided
        self.stemmer = stemmer or PorterStemmer()
        self.vectorizer = vectorizer or TfidfVectorizer()
        self.stopwords = set(stopwords_list or stopwords.words('english'))
        
        # Preprocess documents
        processed_docs = [' '.join(self._preprocess(doc)) for doc in docs]
        self.original_docs = docs
        self.doc_vectors = self.vectorizer.fit_transform(processed_docs).toarray().tolist()
    
    def _preprocess(self, text):
        """Preprocess the text by removing punctuation, lowercasing, removing stopwords, and stemming."""
        text = re.sub(r'[^\w\s]', '', text).lower()
        words = text.split()
        filtered_words = [self.stemmer.stem(word) for word in words if word not in self.stopwords]
        return filtered_words
    
    def _tokenize_query(self, query):
        """Tokenize and stem the query."""
        tokens = re.findall(r'\b\w+\b|\(|\)', query)
        return [self.stemmer.stem(token) for token in tokens]
    
    def _compute_term_scores(self, term):
        """Compute the score for each document based on the term's vector."""
        term_vector = self.vectorizer.transform([term]).toarray()[0].tolist()
        return [
            sum(doc_val * term_val for doc_val, term_val in zip(doc_vec, term_vector))
            for doc_vec in self.doc_vectors
        ]
    
    def _recursive_search(self, tokens):
        """Recursively process the tokens to compute the final scores."""
        if not tokens:
            raise ValueError("Unexpected end of query.")
        
        operator = tokens.pop(0).upper()
        if operator not in {'AND', 'OR', 'NOT'}:
            raise ValueError(f"Invalid operator: {operator}")
        
        operands = []
        while tokens and tokens[0] != ')':
            if tokens[0] == '(':
                tokens.pop(0)  # Remove '('
                operands.append(self._recursive_search(tokens))
            else:
                term = tokens.pop(0)
                operands.append(self._compute_term_scores(term))
        
        if not tokens:
            raise ValueError("Mismatched parentheses in query.")
        tokens.pop(0)  # Remove ')'
        
        if operator == 'AND':
            # Element-wise minimum for AND operation
            return [min(scores) for scores in zip(*operands)]
        elif operator == 'OR':
            # Element-wise maximum for OR operation
            return [max(scores) for scores in zip(*operands)]
        elif operator == 'NOT':
            if len(operands) != 1:
                raise ValueError("NOT operator requires exactly one operand.")
            # Simple NOT implementation: invert scores (assuming scores are between 0 and 1)
            return [1 - score for score in operands[0]]
    
    def search(self, query):
        """Search the documents based on the algebraic query."""
        tokens = self._tokenize_query(query)
        if not tokens or tokens.pop(0) != '(':
            raise ValueError("Query must start with '('.")
        
        scores = self._recursive_search(tokens)
        return scores

nltk.download('stopwords')
documents = [
    "Algebra is a branch of mathematics.",
    "Search engines use algebraic methods.",
    "Mathematics includes algebra, geometry, and calculus."
]

engine = AlgebraicSearchEngine(documents)
query = "( AND algebra mathematics )"
results = engine.search(query)
for doc, score in zip(engine.original_docs, results):
    print(f"Score: {score:.4f} | Document: {doc}")


Score: 0.4254 | Document: Algebra is a branch of mathematics.
Score: 0.0000 | Document: Search engines use algebraic methods.
Score: 0.2980 | Document: Mathematics includes algebra, geometry, and calculus.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/spinoza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

class AlgebraicBooleanSearchEngine:
    def __init__(self, docs, stemmer=None, vectorizer=None, stopwords_list=None):
        # Initialize components with defaults if not provided
        self.stemmer = stemmer or PorterStemmer()
        self.vectorizer = vectorizer or TfidfVectorizer()
        self.stopwords = set(stopwords_list or stopwords.words('english'))
        
        # Preprocess documents
        self.processed_docs = [' '.join(self._preprocess(doc)) for doc in docs]
        self.original_docs = docs
        self.doc_vectors = self.vectorizer.fit_transform(self.processed_docs).toarray().tolist()
        
        # Build Inverted Index for Boolean Search
        self.inverted_index = self._build_inverted_index(self.processed_docs)
        self.num_docs = len(docs)
        self.all_docs_set = set(range(self.num_docs))
    
    def _preprocess(self, text):
        """Preprocess the text by removing punctuation, lowercasing, removing stopwords, and stemming."""
        text = re.sub(r'[^\w\s]', '', text).lower()
        words = text.split()
        filtered_words = [self.stemmer.stem(word) for word in words if word not in self.stopwords]
        return filtered_words
    
    def _tokenize_query(self, query):
        """Tokenize and stem the query."""
        tokens = re.findall(r'\b\w+\b|\(|\)', query)
        return [self.stemmer.stem(token) for token in tokens]
    
    def _compute_term_scores(self, term):
        """Compute the score for each document based on the term's vector."""
        term_vector = self.vectorizer.transform([term]).toarray()[0].tolist()
        return [
            sum(doc_val * term_val for doc_val, term_val in zip(doc_vec, term_vector))
            for doc_vec in self.doc_vectors
        ]
    
    def _recursive_algebraic_search(self, tokens):
        """Recursively process the tokens to compute the final algebraic scores."""
        if not tokens:
            raise ValueError("Unexpected end of query.")
        
        operator = tokens.pop(0).upper()
        if operator not in {'AND', 'OR', 'NOT'}:
            raise ValueError(f"Invalid operator: {operator}")
        
        operands = []
        while tokens and tokens[0] != ')':
            if tokens[0] == '(':
                tokens.pop(0)  # Remove '('
                operands.append(self._recursive_algebraic_search(tokens))
            else:
                term = tokens.pop(0)
                operands.append(self._compute_term_scores(term))
        
        if not tokens:
            raise ValueError("Mismatched parentheses in query.")
        tokens.pop(0)  # Remove ')'
        
        if operator == 'AND':
            # Element-wise minimum for AND operation
            return [min(scores) for scores in zip(*operands)]
        elif operator == 'OR':
            # Element-wise maximum for OR operation
            return [max(scores) for scores in zip(*operands)]
        elif operator == 'NOT':
            if len(operands) != 1:
                raise ValueError("NOT operator requires exactly one operand.")
            # Invert scores (assuming scores are between 0 and 1)
            return [1 - score for score in operands[0]]
    
    def search_algebraic(self, query):
        """Perform an algebraic search based on TF-IDF vectors."""
        tokens = self._tokenize_query(query)
        if not tokens or tokens.pop(0) != '(':
            raise ValueError("Query must start with '('.")
        
        scores = self._recursive_algebraic_search(tokens)
        return scores
    
    def _build_inverted_index(self, processed_docs):
        """Build an inverted index mapping terms to the set of document indices containing them."""
        inverted = {}
        for idx, doc in enumerate(processed_docs):
            terms = set(doc.split())  # Split the doc into words
            for term in terms:
                if term in inverted:
                    inverted[term].add(idx)
                else:
                    inverted[term] = {idx}
        return inverted
    
    def _recursive_boolean_search(self, tokens):
        """Recursively process the tokens to compute the final set of matching documents."""
        if not tokens:
            raise ValueError("Unexpected end of query.")
        
        operator = tokens.pop(0).upper()
        if operator not in {'AND', 'OR', 'NOT'}:
            raise ValueError(f"Invalid operator: {operator}")
        
        operands = []
        while tokens and tokens[0] != ')':
            if tokens[0] == '(':
                tokens.pop(0)  # Remove '('
                operands.append(self._recursive_boolean_search(tokens))
            else:
                term = tokens.pop(0)
                operands.append(self.inverted_index.get(term, set()))
        
        if not tokens:
            raise ValueError("Mismatched parentheses in query.")
        tokens.pop(0)  # Remove ')'
        
        if operator == 'AND':
            return set.intersection(*operands) if operands else set()
        elif operator == 'OR':
            return set.union(*operands) if operands else set()
        elif operator == 'NOT':
            if len(operands) != 1:
                raise ValueError("NOT operator requires exactly one operand.")
            return self.all_docs_set - operands[0]
    
    def search_boolean(self, query):
        """Perform a boolean search based on exact term matching."""
        tokens = self._tokenize_query(query)
        if not tokens or tokens.pop(0) != '(':
            raise ValueError("Query must start with '('.")
        
        matching_docs = self._recursive_boolean_search(tokens)
        # Generate scores: 1 for matching documents, 0 for others
        scores = [1.0 if idx in matching_docs else 0.0 for idx in range(self.num_docs)]
        return scores

import nltk
nltk.download('stopwords')

documents = [
    "Algebra is a branch of mathematics.",
    "Search engines use algebraic methods.",
    "Mathematics includes algebra, geometry, and calculus.",
    "Geometry is another branch of mathematics.",
    "Calculus and algebra are fundamental to mathematics."
]

engine = AlgebraicBooleanSearchEngine(documents)

# Algebraic Search Example
algebraic_query = "( AND algebra mathematics )"
algebraic_results = engine.search_algebraic(algebraic_query)
print("Algebraic Search Results:")
for doc, score in zip(engine.original_docs, algebraic_results):
    print(f"Score: {score:.4f} | Document: {doc}")

print("\n" + "="*50 + "\n")

# Boolean Search Example
boolean_query = "( AND algebra ( OR geometry ( NOT calculus )) )"
boolean_results = engine.search_boolean(boolean_query)
print("Boolean Search Results:")
for doc, score in zip(engine.original_docs, boolean_results):
    match_status = "MATCH" if score == 1.0 else "NO MATCH"
    print(f"Score: {score:.1f} | Document: {doc} | {match_status}")


In [None]:
import unittest
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')

# Assuming the AlgebraicBooleanSearchEngine class is defined in a module named search_engine.py
# from search_engine import AlgebraicBooleanSearchEngine

# For demonstration, I'll include the class definition here.
class AlgebraicBooleanSearchEngine:
    def __init__(self, docs, stemmer=None, vectorizer=None, stopwords_list=None):
        # Initialize components with defaults if not provided
        self.stemmer = stemmer or PorterStemmer()
        self.vectorizer = vectorizer or TfidfVectorizer()
        self.stopwords = set(stopwords_list or stopwords.words('english'))
        
        # Preprocess documents
        self.processed_docs = [' '.join(self._preprocess(doc)) for doc in docs]
        self.original_docs = docs
        self.doc_vectors = self.vectorizer.fit_transform(self.processed_docs).toarray().tolist()
        
        # Build Inverted Index for Boolean Search
        self.inverted_index = self._build_inverted_index(self.processed_docs)
        self.num_docs = len(docs)
        self.all_docs_set = set(range(self.num_docs))
    
    def _preprocess(self, text):
        """Preprocess the text by removing punctuation, lowercasing, removing stopwords, and stemming."""
        text = re.sub(r'[^\w\s]', '', text).lower()
        words = text.split()
        filtered_words = [self.stemmer.stem(word) for word in words if word not in self.stopwords]
        return filtered_words
    
    def _tokenize_query(self, query):
        """Tokenize and stem the query."""
        tokens = re.findall(r'\b\w+\b|\(|\)', query)
        return [self.stemmer.stem(token) for token in tokens]
    
    def _compute_term_scores(self, term):
        """Compute the score for each document based on the term's vector."""
        term_vector = self.vectorizer.transform([term]).toarray()[0].tolist()
        return [
            sum(doc_val * term_val for doc_val, term_val in zip(doc_vec, term_vector))
            for doc_vec in self.doc_vectors
        ]
    
    def _recursive_algebraic_search(self, tokens):
        """Recursively process the tokens to compute the final algebraic scores."""
        if not tokens:
            raise ValueError("Unexpected end of query.")
        
        operator = tokens.pop(0).upper()
        if operator not in {'AND', 'OR', 'NOT'}:
            raise ValueError(f"Invalid operator: {operator}")
        
        operands = []
        while tokens and tokens[0] != ')':
            if tokens[0] == '(':
                tokens.pop(0)  # Remove '('
                operands.append(self._recursive_algebraic_search(tokens))
            else:
                term = tokens.pop(0)
                operands.append(self._compute_term_scores(term))
        
        if not tokens:
            raise ValueError("Mismatched parentheses in query.")
        tokens.pop(0)  # Remove ')'
        
        if operator == 'AND':
            # Element-wise minimum for AND operation
            return [min(scores) for scores in zip(*operands)]
        elif operator == 'OR':
            # Element-wise maximum for OR operation
            return [max(scores) for scores in zip(*operands)]
        elif operator == 'NOT':
            if len(operands) != 1:
                raise ValueError("NOT operator requires exactly one operand.")
            # Invert scores (assuming scores are between 0 and 1)
            return [1 - score for score in operands[0]]
    
    def search_algebraic(self, query):
        """Perform an algebraic search based on TF-IDF vectors."""
        tokens = self._tokenize_query(query)
        if not tokens or tokens.pop(0) != '(':
            raise ValueError("Query must start with '('.")
        
        scores = self._recursive_algebraic_search(tokens)
        return scores
    
    def _build_inverted_index(self, processed_docs):
        """Build an inverted index mapping terms to the set of document indices containing them."""
        inverted = {}
        for idx, doc in enumerate(processed_docs):
            terms = set(doc.split())  # Split the doc into words
            for term in terms:
                if term in inverted:
                    inverted[term].add(idx)
                else:
                    inverted[term] = {idx}
        return inverted
    
    def _recursive_boolean_search(self, tokens):
        """Recursively process the tokens to compute the final set of matching documents."""
        if not tokens:
            raise ValueError("Unexpected end of query.")
        
        operator = tokens.pop(0).upper()
        if operator not in {'AND', 'OR', 'NOT'}:
            raise ValueError(f"Invalid operator: {operator}")
        
        operands = []
        while tokens and tokens[0] != ')':
            if tokens[0] == '(':
                tokens.pop(0)  # Remove '('
                operands.append(self._recursive_boolean_search(tokens))
            else:
                term = tokens.pop(0)
                operands.append(self.inverted_index.get(term, set()))
        
        if not tokens:
            raise ValueError("Mismatched parentheses in query.")
        tokens.pop(0)  # Remove ')'
        
        if operator == 'AND':
            return set.intersection(*operands) if operands else set()
        elif operator == 'OR':
            return set.union(*operands) if operands else set()
        elif operator == 'NOT':
            if len(operands) != 1:
                raise ValueError("NOT operator requires exactly one operand.")
            return self.all_docs_set - operands[0]
    
    def search_boolean(self, query):
        """Perform a boolean search based on exact term matching."""
        tokens = self._tokenize_query(query)
        if not tokens or tokens.pop(0) != '(':
            raise ValueError("Query must start with '('.")
        
        matching_docs = self._recursive_boolean_search(tokens)
        # Generate scores: 1 for matching documents, 0 for others
        scores = [1.0 if idx in matching_docs else 0.0 for idx in range(self.num_docs)]
        return scores



In [37]:
# Unit Test Class
class TestAlgebraicBooleanSearchEngine(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Set up a common search engine instance for all tests."""
        cls.documents = [# => 3
            "Algebra is a branch of mathematics.",      
            "Search engines use algebraic methods.",
            "Mathematics includes algebra, geometry, and calculus.",
            "Geometry is another branch of mathematics.",
            "Calculus and algebra are fundamental to mathematics."
        ]
        cls.engine = AlgebraicBooleanSearchEngine(cls.documents)
    
    def test_preprocessing(self):
        """Test if preprocessing correctly processes documents."""
        expected_processed_docs = [
            "algebra branch mathematic",#
            "search engine use algebra method",
            "mathem includ algebra geometri calculu",
            "geometri anoth branch mathematic",
            "calculu algebra fundament mathematic"#
        ]
        self.assertEqual(self.engine.processed_docs, expected_processed_docs)
    
    def test_algebraic_search_simple_and(self):
        """Test algebraic search with a simple AND query."""
        query = "( AND algebra mathematics )"
        scores = self.engine.search_algebraic(query)
        # Expected: Documents 0,2,4 have both 'algebra' and 'mathematics'
        # Since algebra and mathematics are present in all except document 3
        # The actual scores depend on TF-IDF; here, we'll check that document 3 has score 0
        self.assertEqual(scores[1], 0.0)
        self.assertEqual(scores[3], 0.0)
        # Check that other documents have non-zero scores
        for idx in [0,2,4]:
            self.assertGreater(scores[idx], 0.0)
    
    def test_algebraic_search_or(self):
        """Test algebraic search with an OR query."""
        query = "( OR algebra method )"
        scores = self.engine.search_algebraic(query)
        # All documents except document 1 have either 'algebra' or 'geometry'
        expected_non_matching = [3]
        for idx in expected_non_matching:
            self.assertEqual(scores[idx], 0.0)
        # Check that other documents have non-zero scores
        for idx in range(len(self.documents)):
            if idx not in expected_non_matching:
                self.assertGreater(scores[idx], 0.0)
    
    def test_algebraic_search_not(self):
        """Test algebraic search with a NOT query."""
        query = "(AND geometry ( NOT calculus ))"
        scores = self.engine.search_algebraic(query)
        # Documents not containing 'calculus' are 0,1,3
        expected_matching = [3]
        expected_non_matching = [0,1,2,4]
        for idx in [0,1,2,3,4]:
            print(idx, self.documents[idx], scores[idx])
        for idx in expected_matching:
            self.assertGreater(scores[idx], 0.0)
        for idx in expected_non_matching:
            self.assertEqual(scores[idx], 0.0)
    
    def test_boolean_search_and_or(self):
        """Test boolean search with AND and OR operators."""
        query = "( AND algebra ( OR geometry calculus ) )"
        scores = self.engine.search_boolean(query)
        # Documents that contain 'algebra' AND ('geometry' OR 'calculus')
        # Expected matching documents: 0 (algebra), 2 (algebra & geometry & calculus), 4 (algebra & calculus)
        expected_matching = [2, 4]
        expected_non_matching = [0, 1, 3]
        for idx in expected_matching:
            self.assertEqual(scores[idx], 1.0)
        for idx in expected_non_matching:
            self.assertEqual(scores[idx], 0.0)
    
    def test_boolean_search_not(self):
        """Test boolean search with NOT operator."""
        query = "( AND algebra ( NOT calculus ) )"
        scores = self.engine.search_boolean(query)
        # Documents that contain 'algebra' AND NOT 'calculus'
        # Expected matching documents: 0,1
        expected_matching = [0, 1]
        expected_non_matching = [2,3,4]
        for idx in expected_matching:
            self.assertEqual(scores[idx], 1.0)
        for idx in expected_non_matching:
            self.assertEqual(scores[idx], 0.0)
    
    def test_boolean_search_or_not(self):
        """Test boolean search with OR and NOT operators."""
        query = "( OR ( NOT algebra ) calculus )"
        scores = self.engine.search_boolean(query)
        # Documents that contain NOT 'algebra' OR 'calculus'
        # 'calculus' is in documents 2,4
        # NOT 'algebra' is documents that do not have 'algebra': possibly none in this dataset
        # However, in our inverted index, 'algebra' is in docs 0,1,2,4, so NOT 'algebra' is doc 3
        # So matching documents: doc3, doc2, doc4
        expected_matching = [2,3,4]
        expected_non_matching = [0,1]
        for idx in expected_matching:
            self.assertEqual(scores[idx], 1.0)
        for idx in expected_non_matching:
            self.assertEqual(scores[idx], 0.0)
    
    def test_boolean_search_invalid_query(self):
        """Test boolean search with an invalid query."""
        invalid_queries = [
            "AND algebra mathematics )",         # Missing opening parenthesis
            "( AND algebra mathematics",         # Missing closing parenthesis
            "( XOR algebra mathematics )",       # Invalid operator
            "( AND algebra ( OR geometry )",     # Missing closing parenthesis
            "( NOT algebra geometry )"            # NOT with two operands
        ]
        for query in invalid_queries:
            with self.assertRaises(ValueError):
                self.engine.search_boolean(query)
    
    def test_algebraic_search_invalid_query(self):
        """Test algebraic search with an invalid query."""
        invalid_queries = [
            "AND algebra mathematics )",         # Missing opening parenthesis
            "( AND algebra mathematics",         # Missing closing parenthesis
            "( XOR algebra mathematics )",       # Invalid operator
            "( AND algebra ( OR geometry )",     # Missing closing parenthesis
            "( NOT algebra mathematics )"         # NOT with two operands
        ]
        for query in invalid_queries:
            with self.assertRaises(ValueError):
                self.engine.search_algebraic(query)
    
    def test_boolean_search_case_insensitivity(self):
        """Test boolean search with mixed case terms."""
        query = "( AND Algebra ( OR Geometry Calculus ) )"
        scores = self.engine.search_boolean(query)
        # Should behave the same as lowercase query
        expected_matching = [2, 4]
        expected_non_matching = [0, 1, 3]
        for idx in expected_matching:
            self.assertEqual(scores[idx], 1.0)
        for idx in expected_non_matching:
            self.assertEqual(scores[idx], 0.0)
    
    def test_algebraic_search_case_insensitivity2(self):
        """Test algebraic search with mixed case terms."""
        query = "( AND Algebra Mathematics )"
        scores = self.engine.search_algebraic(query)
        # Should behave the same as lowercase query
        self.assertEqual(scores[3], 0.0)
        for idx in [0,2,4]:
            self.assertGreater(scores[idx], 0.0)

unittest.main(argv=[''], exit=False)


..F

.......F
FAIL: test_algebraic_search_not (__main__.TestAlgebraicBooleanSearchEngine.test_algebraic_search_not)
Test algebraic search with a NOT query.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_281050/424487080.py", line 64, in test_algebraic_search_not
    self.assertEqual(scores[idx], 0.0)
AssertionError: 0.47080139270829086 != 0.0

FAIL: test_preprocessing (__main__.TestAlgebraicBooleanSearchEngine.test_preprocessing)
Test if preprocessing correctly processes documents.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_281050/424487080.py", line 24, in test_preprocessing
    self.assertEqual(self.engine.processed_docs, expected_processed_docs)
AssertionError: Lists differ: ['alg[15 chars]hemat', 'search engin use algebra method', 'ma[107 chars]mat'] != ['alg[15 chars]hematic', 'search engine use algebra method', [112 cha

0 Algebra is a branch of mathematics. 0.0
1 Search engines use algebraic methods. 0.0
2 Mathematics includes algebra, geometry, and calculus. 0.47080139270829086
3 Geometry is another branch of mathematics. 0.498511878090309
4 Calculus and algebra are fundamental to mathematics. 0.0


<unittest.main.TestProgram at 0x7aeb073e7790>

In [None]:
print(PorterStemmer().stem('algebraic'))
print(PorterStemmer().stem('algebra'))
print(PorterStemmer().stem('geometry'))

'geometri'

In [None]:
query = "(AND cat (NOT bite) dog)"# (AND dog quick) (NOT (AND boy jump)))"
#query = "(AND cat (NOT dog))"
print(f"Query: {query}")
# Query: (AND cat (NOT (OR dog men))
print(f'Processed Query: {boolean_search_engine.process_query(query)}')
# Processed Query: ['(', 'and', 'cat', '(', 'not', '(', 'or', 'dog', 'men', ')', ')']
results = boolean_search_engine.search(query)
print(f"{results=} {results.shape=}")
# sort results by score
results = sorted(enumerate(results), key=lambda x: x[1], reverse=True)
# pretty print the results
for i, doc in enumerate(boolean_search_engine.docs):
    print(f"Document {i}: {doc} => {results[i]}")

