diff --git a/README.md b/README.md index b3d5c98..f7cfc2e 100644 --- a/README.md +++ b/README.md @@ -154,9 +154,11 @@ The layered approach emerged from TDD — writing tests first revealed natural b - [x] Computed fields: `price * 0.9 AS discounted` - [x] Vector KNN search: `vector_distance(field, :param)` - [x] Hybrid search (filters + vector) -- [x] Full-text search: `LIKE 'prefix%'` (prefix), `fulltext(field, 'terms')` function +- [x] Full-text search: exact phrase, fuzzy, proximity, OR/union, LIKE patterns, BM25 scoring (see below) - [x] GEO field queries with full operator support (see below) - [x] Date functions: `YEAR()`, `MONTH()`, `DAY()`, `DATE_FORMAT()`, etc. (see below) +- [x] `IS NULL` / `IS NOT NULL` via `ismissing()` (requires Redis 7.4+, see below) +- [x] `exists()` function for field presence checks (see below) ## What's Not Implemented (Yet...) @@ -166,6 +168,112 @@ The layered approach emerged from TDD — writing tests first revealed natural b - [ ] DISTINCT - [ ] Index creation from SQL (CREATE INDEX) +### TEXT Search + +Full-text search on TEXT fields with multiple search modes: + +| Feature | SQL Syntax | RediSearch Output | Notes | +|---------|-----------|-------------------|-------| +| Exact phrase | `title = 'gaming laptop'` | `@title:"gaming laptop"` | Stopwords stripped | +| Tokenized search | `fulltext(title, 'gaming laptop')` | `@title:(gaming laptop)` | Stopwords stripped | +| Fuzzy LD=1 | `fuzzy(title, 'laptap')` | `@title:%laptap%` | | +| Fuzzy LD=2 | `fuzzy(title, 'laptap', 2)` | `@title:%%laptap%%` | | +| Fuzzy LD=3 | `fuzzy(title, 'laptap', 3)` | `@title:%%%laptap%%%` | | +| OR / union | `fulltext(title, 'laptop OR tablet')` | `@title:(laptop\|tablet)` | | +| Prefix | `title LIKE 'lap%'` | `@title:lap*` | | +| Suffix | `title LIKE '%top'` | `@title:*top` | | +| Contains | `title LIKE '%apt%'` | `@title:*apt*` | | +| Proximity (slop) | `fulltext(title, 'gaming laptop', 2)` | `@title:(gaming laptop) => { $slop: 2; }` | | +| Proximity + 
order | `fulltext(title, 'gaming laptop', 2, true)` | `@title:(gaming laptop) => { $slop: 2; $inorder: true; }` | | +| Optional term | `fulltext(title, 'laptop ~gaming')` | `@title:(laptop ~gaming)` | | +| BM25 score | `SELECT score() AS relevance FROM idx` | `FT.SEARCH ... WITHSCORES` | | +| Negation | `NOT fulltext(title, 'refurbished')` | `-@title:refurbished` | | + +**Examples:** + +```sql +-- Exact phrase match (stopwords like "of" are stripped automatically) +SELECT * FROM products WHERE title = 'bank of america' +-- Produces: @title:"bank america" + +-- Fuzzy search for typos (Levenshtein distance 2) +SELECT * FROM products WHERE fuzzy(title, 'laptap', 2) + +-- OR search across terms +SELECT * FROM products WHERE fulltext(title, 'laptop OR tablet OR phone') + +-- Proximity: terms within 3 words of each other, in order +SELECT * FROM products WHERE fulltext(title, 'gaming laptop', 3, true) + +-- Suffix/contains pattern matching +SELECT * FROM products WHERE title LIKE '%phone%' + +-- BM25 relevance scoring +SELECT title, score() AS relevance FROM products WHERE fulltext(title, 'laptop') + +-- Multi-field search +SELECT * FROM products WHERE fulltext(title, 'laptop') OR fulltext(description, 'laptop') +``` + +**Stopword handling:** + +Both `=` (exact phrase) and `fulltext()` (tokenized search) automatically strip [Redis default stopwords](https://redis.io/docs/latest/develop/ai/search-and-query/advanced-concepts/stopwords/) before sending queries to RediSearch. This is necessary because RediSearch does not index stopwords, so including them in queries causes syntax errors or failed matches. A `UserWarning` is emitted when stopwords are removed. + +For example, `WHERE title = 'bank of america'` produces `@title:"bank america"` because "of" is a default stopword and is never stored in the inverted index. The stripped phrase still matches correctly because the indexer assigns consecutive token positions after dropping stopwords. 
+ +To include stopwords in your queries, create your index with `STOPWORDS 0`: + +``` +FT.CREATE myindex ON HASH PREFIX 1 doc: STOPWORDS 0 SCHEMA title TEXT +``` + +**Notes:** +- `=` on TEXT fields performs **exact phrase** matching (double-quoted) +- `fulltext()` performs **tokenized** AND search (parenthesized) +- Both operators strip stopwords and emit a warning when they do +- `fuzzy()` and `fulltext()` only work on TEXT fields; using them on TAG or NUMERIC raises `ValueError` +- OR must be **uppercase**: `'laptop OR tablet'` triggers union; lowercase `'laptop or tablet'` is treated as a regular three-word AND search +- Special characters (`@`, `|`, `-`, `*`, `+`, etc.) in search terms are automatically escaped + +### IS NULL / IS NOT NULL (ismissing) + +Check for missing (absent) fields using standard SQL `IS NULL` / `IS NOT NULL` syntax. Requires **Redis 7.4+** (RediSearch 2.10+) with `INDEXMISSING` declared on the field. + +| SQL | RediSearch Output | +|-----|-------------------| +| `WHERE email IS NULL` | `ismissing(@email)` | +| `WHERE email IS NOT NULL` | `-ismissing(@email)` | + +```sql +-- Find users without an email +SELECT * FROM users WHERE email IS NULL + +-- Find users with an email +SELECT * FROM users WHERE email IS NOT NULL + +-- Combine with other filters +SELECT * FROM users WHERE category = 'eng' AND email IS NULL +``` + +**Note:** The field must be declared with `INDEXMISSING` in the index schema. A warning is emitted at translation time as a reminder. + +### exists() — Field Presence Check + +Check whether a field has a value using `exists()` in SELECT or HAVING. This uses `FT.AGGREGATE` with `APPLY exists(@field)`. 
class _ScoreParseMixin:
    """Helpers shared by the executors for parsing score-bearing replies."""

    @staticmethod
    def _has_return_0(args: list[str]) -> bool:
        """Tell whether ``args`` carries ``RETURN 0``.

        Only the first ``"RETURN"`` token is inspected; the answer is True
        when the token immediately after it is the string ``"0"``.
        """
        if "RETURN" not in args:
            return False
        after_return = args.index("RETURN") + 1
        # A trailing RETURN with nothing after it counts as "not RETURN 0".
        return after_return < len(args) and args[after_return] == "0"

    @staticmethod
    def _resolve_score_alias(
        score_alias: str | None,
        args: list[str],
        first_row_fields: set[str] | None = None,
    ) -> str:
        """Pick a score column name that does not clash with a field name.

        Args:
            score_alias: Requested alias; ``"__score"`` is used when this is
                ``None`` or empty.
            args: Command arguments. When they contain a parseable
                ``RETURN <count> <field> ...`` clause, the ``count`` tokens
                following the count are treated as the reserved names.
            first_row_fields: Field names to treat as reserved when no
                parseable ``RETURN`` clause exists. ``bytes`` entries are
                decoded to ``str`` before comparison.

        Returns:
            The alias, prefixed with ``"__score_"`` as many times as needed
            until it is not one of the reserved names.
        """
        candidate = score_alias if score_alias else "__score"

        try:
            return_pos = args.index("RETURN")
            field_count = int(args[return_pos + 1])
        except (ValueError, IndexError):
            # No usable RETURN clause: fall back to the supplied field names,
            # normalising bytes keys so the membership test works either way.
            reserved = {
                name.decode() if isinstance(name, bytes) else name
                for name in (first_row_fields or set())
            }
        else:
            first_field = return_pos + 2
            reserved = set(args[first_field : first_field + field_count])

        while candidate in reserved:
            candidate = f"__score_{candidate}"
        return candidate
+ # Stride of 2: key, score (no field array) + score_alias = self._resolve_score_alias( + translated.score_alias, translated.args + ) + for i in range(1, len(raw_result) - 1, 2): + score = raw_result[i + 1] + row = {score_alias: score} + rows.append(row) + elif with_scores: + # WITHSCORES format: [count, key1, score1, [fields1], key2, score2, [fields2], ...] + # Stride of 3: key, score, field_list + # First pass: collect all field names across all rows so the + # alias avoids collisions with any document field, not just + # the first row's fields. + all_field_names: set[str] = set() + parsed_rows: list[tuple[dict, Any]] = [] + for i in range(1, len(raw_result) - 2, 3): + score = raw_result[i + 1] + row_data = raw_result[i + 2] + row = dict(zip(row_data[::2], row_data[1::2])) + all_field_names.update(row.keys()) + parsed_rows.append((row, score)) + resolved_alias = self._resolve_score_alias( + translated.score_alias, + translated.args, + first_row_fields=all_field_names, + ) + for row, score in parsed_rows: + row[resolved_alias] = score + rows.append(row) + else: + # Standard format: [count, key1, [fields1], key2, [fields2], ...] + for i in range(2, len(raw_result), 2): + row_data = raw_result[i] + row = dict(zip(row_data[::2], row_data[1::2])) + rows.append(row) else: # FT.AGGREGATE format: [count, [fields1], [fields2], ...] for row_data in raw_result[1:]: @@ -181,7 +267,7 @@ def execute(self, sql: str, *, params: dict | None = None) -> QueryResult: return QueryResult(rows=rows, count=count) -class AsyncExecutor: +class AsyncExecutor(_ScoreParseMixin): """Async version of Executor for use with redis.asyncio clients.""" def __init__( @@ -258,11 +344,46 @@ async def execute(self, sql: str, *, params: dict | None = None) -> QueryResult: rows = [] if translated.command == "FT.SEARCH": - # FT.SEARCH format: [count, key1, [fields1], key2, [fields2], ...] 
- for i in range(2, len(raw_result), 2): - row_data = raw_result[i] - row = dict(zip(row_data[::2], row_data[1::2])) - rows.append(row) + with_scores = translated.score_alias is not None + no_content = self._has_return_0(translated.args) + + score_alias: str | None = None + + if with_scores and no_content: + # WITHSCORES + RETURN 0: [count, id1, score1, id2, score2, ...] + score_alias = self._resolve_score_alias( + translated.score_alias, translated.args + ) + for i in range(1, len(raw_result) - 1, 2): + score = raw_result[i + 1] + row = {score_alias: score} + rows.append(row) + elif with_scores: + # WITHSCORES format: [count, key1, score1, [fields1], ...] + # First pass: collect all field names across all rows so the + # alias avoids collisions with any document field. + all_field_names: set[str] = set() + parsed_rows: list[tuple[dict, Any]] = [] + for i in range(1, len(raw_result) - 2, 3): + score = raw_result[i + 1] + row_data = raw_result[i + 2] + row = dict(zip(row_data[::2], row_data[1::2])) + all_field_names.update(row.keys()) + parsed_rows.append((row, score)) + resolved_alias = self._resolve_score_alias( + translated.score_alias, + translated.args, + first_row_fields=all_field_names, + ) + for row, score in parsed_rows: + row[resolved_alias] = score + rows.append(row) + else: + # Standard format: [count, key1, [fields1], key2, [fields2], ...] + for i in range(2, len(raw_result), 2): + row_data = raw_result[i] + row = dict(zip(row_data[::2], row_data[1::2])) + rows.append(row) else: # FT.AGGREGATE format: [count, [fields1], [fields2], ...] 
@dataclass
class ScoringSpec:
    """Relevance-scoring request attached to a parsed query.

    Its presence triggers WITHSCORES, and optionally SCORER, on FT.SEARCH.
    """

    # Name of the result column that carries the score.
    alias: str = "score"
    # Scoring algorithm to request (BM25, TFIDF, DISMAX, etc.).
    scorer: str = "BM25"
def _add_function_condition(
    self, expression, result: ParsedQuery, negated: bool
) -> None:
    """Add a WHERE condition for ``fulltext(...)`` or ``fuzzy(...)``.

    Supported call shapes::

        fulltext(field, value [, slop [, inorder]])
        fuzzy(field, value [, level])

    Any other function name is ignored and leaves ``result`` untouched.

    Args:
        expression: Function-call node; its ``name`` and ``expressions``
            (argument list) are read.
        result: Parsed query whose ``conditions`` list receives the new
            ``Condition``.
        negated: Stored on the condition as its ``negated`` flag.

    Raises:
        ValueError: On too few or too many arguments, a non-column first
            argument, a non-literal (and non-placeholder) value / slop /
            inorder / level argument, a slop or level that is not an
            integer, a negative slop, a level outside 1-3, or an inorder
            value that is not boolean-like.
    """
    func_name = expression.name.upper()
    args = expression.expressions

    # fulltext(field, value [, slop [, inorder]]) -> max 4
    # fuzzy(field, value [, level])               -> max 3
    max_args = {"FULLTEXT": 4, "FUZZY": 3}
    if func_name not in max_args:
        return

    label = func_name.lower()
    if len(args) < 2:
        raise ValueError(
            f"{label}() requires at least 2 arguments: "
            f"{label}(field, value), got {len(args)}."
        )
    if len(args) > max_args[func_name]:
        raise ValueError(
            f"{label}() accepts at most {max_args[func_name]} "
            f"arguments, got {len(args)}."
        )

    usage = (
        " Usage: fulltext(field, 'search terms')"
        if func_name == "FULLTEXT"
        else " Usage: fuzzy(field, 'search term')"
    )

    def literal(node, requirement: str, hint: str = ""):
        """Extract a literal from ``node``; placeholders yield ``None``."""
        extracted = self._extract_literal_value(node)
        if extracted is None and not isinstance(node, exp.Placeholder):
            raise ValueError(f"{label}() {requirement}, got {node}.{hint}")
        return extracted

    def as_int(raw, what: str) -> int:
        """Coerce ``raw`` to int, rejecting bools and non-integral floats."""
        if isinstance(raw, bool) or (isinstance(raw, float) and raw != int(raw)):
            raise ValueError(f"{what} argument must be an integer (got {raw})")
        try:
            return int(raw)
        except (TypeError, ValueError):
            # e.g. a non-numeric string literal: report which argument was
            # wrong instead of leaking the bare int() conversion message.
            raise ValueError(
                f"{what} argument must be an integer (got {raw!r})"
            ) from None

    # Validation order is: value, optional arguments, then the field name.
    value = literal(args[1], "second argument must be a literal string", usage)

    slop = None
    inorder = False
    fuzzy_level = None

    if func_name == "FULLTEXT":
        # Optional 3rd arg: slop (non-negative int)
        if len(args) >= 3:
            slop_val = literal(args[2], "slop argument must be a literal integer")
            if slop_val is not None:
                slop = as_int(slop_val, "FULLTEXT slop")
                if slop < 0:
                    raise ValueError(
                        f"FULLTEXT slop argument must be a non-negative integer (got {slop})"
                    )

        # Optional 4th arg: inorder (boolean-like: true/false or 1/0)
        if len(args) >= 4:
            inorder_val = literal(
                args[3],
                "inorder argument must be a literal boolean (true/false or 1/0)",
            )
            if inorder_val is not None:
                if isinstance(inorder_val, bool):
                    inorder = inorder_val
                else:
                    flag = str(inorder_val).lower()
                    if flag not in ("1", "0", "true", "false"):
                        raise ValueError(
                            f"FULLTEXT inorder argument must be a boolean "
                            f"(true/false or 1/0), got {inorder_val!r}"
                        )
                    inorder = flag in ("1", "true")
    else:
        # Optional 3rd arg: fuzzy level (1, 2, or 3)
        if len(args) >= 3:
            level_val = literal(args[2], "level argument must be a literal integer")
            if level_val is not None:
                fuzzy_level = as_int(level_val, "FUZZY level")
                if fuzzy_level not in (1, 2, 3):
                    raise ValueError(
                        f"FUZZY level must be 1, 2, or 3 (got {fuzzy_level}). "
                        "RediSearch supports a maximum Levenshtein distance of 3."
                    )

    if not isinstance(args[0], exp.Column):
        raise ValueError(
            f"{label}() first argument must be a column name, "
            f"got {args[0]}.{usage}"
        )

    result.conditions.append(
        Condition(
            field=args[0].name,
            operator=func_name,
            value=value,
            negated=negated,
            fuzzy_level=fuzzy_level,
            slop=slop,
            inorder=inorder,
        )
    )
@@ -983,6 +1154,9 @@ def _extract_literal_value(self, expression, convert_dates: bool = False): if timestamp is not None: return timestamp return value + elif isinstance(expression, exp.Boolean): + # Handle TRUE/FALSE keywords parsed by sqlglot + return expression.this elif isinstance(expression, exp.Neg): # Handle negative numbers: Neg(Literal(122.4)) -> -122.4 inner_value = self._extract_literal_value(expression.this) diff --git a/sql_redis/query_builder.py b/sql_redis/query_builder.py index 1c674ce..8a8acf0 100644 --- a/sql_redis/query_builder.py +++ b/sql_redis/query_builder.py @@ -2,6 +2,7 @@ from __future__ import annotations +import re import warnings # Redis default stopwords - these are not indexed by default @@ -51,40 +52,201 @@ class QueryBuilder: # Characters that need escaping in TAG values TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~" + # Characters that have special meaning in RediSearch free-text queries + # (outside double-quoted phrases). Must be escaped with backslash. + # Includes double-quote to prevent starting/ending quoted phrases. + TEXT_QUERY_SPECIAL_CHARS = set('\\|-()"@~!{}[]^$><=;:*+') + + @classmethod + def _escape_fulltext_term(cls, term: str) -> str: + """Escape characters that have special meaning in RediSearch free-text queries. + + Applied to individual terms used outside of double-quoted phrases (e.g., + in parenthesized FULLTEXT expressions, LIKE, FUZZY) so that user input + containing RediSearch operator characters does not alter query semantics + or produce syntax errors. + """ + result = [] + for char in term: + if char in cls.TEXT_QUERY_SPECIAL_CHARS: + result.append(f"\\{char}") + else: + result.append(char) + return "".join(result) + + @staticmethod + def _escape_text_value(value: str) -> str: + """Escape characters that are special inside RediSearch double-quoted phrases. + + Backslashes and double quotes must be escaped so they don't break + the query syntax or alter its meaning. 
+ """ + # Escape backslashes first (so we don't double-escape the quote escapes), + # then escape double quotes. + return value.replace("\\", "\\\\").replace('"', '\\"') + def build_text_condition( self, field: str | list[str], operator: str, value: str, negated: bool = False, + *, + fuzzy_level: int | None = None, + slop: int | None = None, + inorder: bool = False, ) -> str: """Build query syntax for TEXT field conditions. Args: field: Field name or list of field names for multi-field search. - operator: One of =, MATCH, LIKE, FUZZY. + operator: One of =, !=, FULLTEXT, LIKE, FUZZY. + - = / !=: exact phrase match, value wrapped in double quotes. + - FULLTEXT: tokenized keyword search with stopword filtering. + - LIKE: prefix/suffix/infix pattern (SQL % → RediSearch *). + - FUZZY: Levenshtein fuzzy match. value: The search term or pattern. negated: If True, prefix with - for negation. + fuzzy_level: Levenshtein distance for FUZZY (1, 2, or 3). Default 1. + slop: Maximum distance between terms for proximity search. + inorder: If True with slop, require terms in order. Returns: - RediSearch query syntax like @field:term or @field:"phrase". + RediSearch query syntax like @field:"exact phrase" or @field:(term1 term2). """ - prefix = "-" if negated else "" + # Derive negation from both the flag and the operator itself, + # consistent with how build_tag_condition handles != via operator. 
+ prefix = "-" if negated or operator == "!=" else "" - # Handle multi-field search - if isinstance(field, list): - field_str = "|".join(field) - return f"(@{field_str}:{value})" - - # Handle different operators + # Build search_value based on operator — shared by single- and multi-field paths if operator == "LIKE": - # Convert SQL LIKE pattern (%) to RediSearch prefix (*) - search_value = value.replace("%", "*") + # Escape special chars in the non-wildcard portion, then convert % → * + # Split on %, escape each segment, rejoin with * + parts = value.split("%") + escaped_parts = [self._escape_fulltext_term(p) for p in parts] + search_value = "*".join(escaped_parts) + # If the non-wildcard portion contains spaces, wrap in parens + # so all tokens stay scoped to the field (e.g. '%gaming laptop%' + # → *gaming laptop* needs grouping to avoid token leaking). + non_wildcard = value.strip("%") + if " " in non_wildcard: + search_value = f"({search_value})" elif operator == "FUZZY": - # Wrap with % for fuzzy matching - search_value = f"%{value}%" + # Escape special chars before wrapping with % markers + escaped = self._escape_fulltext_term(value) + level = fuzzy_level if fuzzy_level is not None else 1 + if level not in (1, 2, 3): + raise ValueError( + f"Fuzzy level must be 1, 2, or 3 (got {level}). " + "RediSearch supports a maximum Levenshtein distance of 3." + ) + pct = "%" * level + search_value = f"{pct}{escaped}{pct}" + elif operator in ("=", "!="): + # Exact phrase match — wrap in double quotes. + # Strip default stopwords because RediSearch does not index them; + # keeping them in the quoted phrase causes a query-time error + # (e.g. "diagnosing and treating" fails on "and"). + # Since the indexer assigns consecutive positions after dropping + # stopwords, the stripped phrase matches correctly. 
+ words = value.split() + removed = [w for w in words if w.lower() in REDIS_DEFAULT_STOPWORDS] + filtered = [w for w in words if w.lower() not in REDIS_DEFAULT_STOPWORDS] + + if removed: + phrase_words = filtered if filtered else words + if filtered: + sw_msg = f"Stopwords {removed} were removed from" + else: + sw_msg = ( + f"All tokens in '{value}' are stopwords and may not " + "be indexed in" + ) + warnings.warn( + f"{sw_msg} exact phrase '{value}'. " + "By default, Redis does not index stopwords. " + "To include stopwords in your index, create it " + "with STOPWORDS 0.", + UserWarning, + stacklevel=2, + ) + else: + phrase_words = words + + escaped = self._escape_text_value(" ".join(phrase_words)) + search_value = f'"{escaped}"' + elif re.search(r"(?:^|\s+)OR(?:\s+|$)", value): + # OR union within text field: split on uppercase-only OR with + # flexible whitespace, escape each term, join with |. + # Only uppercase OR is treated as a boolean operator; lowercase + # "or" is treated as a regular search term (e.g. "bank or america" + # stays as a multi-word AND search, not bank|america). + # Multi-word operands (e.g. "gaming laptop OR tablet") are wrapped + # in parentheses so each side is an atomic subexpression. + # The regex also matches leading/trailing OR (e.g. "laptop OR" + # or "OR tablet") so that the empty-operand check below catches + # these malformed inputs instead of silently dropping "OR". + or_parts: list[str] = [] + all_removed: list[str] = [] + for part in re.split(r"(?:^|\s+)OR(?:\s+|$)", value): + words = part.strip().split() + if not words: + raise ValueError( + "Empty operand in OR expression — each side of OR " + "must contain at least one search term." + ) + + # Filter stopwords from this operand (same logic as + # the multi-word FULLTEXT branch). 
+ removed = [w for w in words if w.lower() in REDIS_DEFAULT_STOPWORDS] + filtered = [ + w for w in words if w.lower() not in REDIS_DEFAULT_STOPWORDS + ] + if removed: + all_removed.extend(removed) + # Use filtered list if any non-stopword tokens remain; + # otherwise fall back to original words so we don't + # silently produce an empty operand. + effective = filtered if filtered else words + + if not effective: + raise ValueError( + "Empty operand in OR expression — each side of OR " + "must contain at least one search term." + ) + + if len(effective) > 1: + escaped_tokens = [] + for w in effective: + if w.startswith("~"): + escaped_tokens.append( + "~" + self._escape_fulltext_term(w[1:]) + ) + else: + escaped_tokens.append(self._escape_fulltext_term(w)) + or_parts.append(f"({' '.join(escaped_tokens)})") + else: + token = effective[0] + if token.startswith("~"): + or_parts.append("~" + self._escape_fulltext_term(token[1:])) + else: + or_parts.append(self._escape_fulltext_term(token)) + + if all_removed: + warnings.warn( + f"Stopwords {all_removed} were removed from OR " + f"expression '{value}'. By default, Redis does not " + "index stopwords. To include stopwords in your " + "index, create it with STOPWORDS 0.", + UserWarning, + stacklevel=2, + ) + search_value = f"({'|'.join(or_parts)})" elif " " in value: - # Phrase search - filter stopwords and wrap in quotes + # FULLTEXT with multi-word: tokenized search with stopword filtering. + # Each term is escaped to prevent accidental operator injection, but a + # leading ~ (optional-term modifier) is preserved as an intentional + # RediSearch operator. 
words = value.split() removed_stopwords = [ w for w in words if w.lower() in REDIS_DEFAULT_STOPWORDS @@ -94,21 +256,54 @@ def build_text_condition( ] if removed_stopwords: + if filtered_words: + sw_action = f"Stopwords {removed_stopwords} were removed from" + else: + sw_action = f"All tokens in '{value}' are stopwords and may not be indexed in" warnings.warn( - f"Stopwords {removed_stopwords} were removed from phrase search '{value}'. " + f"{sw_action} text search '{value}'. " "By default, Redis does not index stopwords. " - "To include stopwords in your index, create it with STOPWORDS 0.", + "To include stopwords in your index, create it " + "with STOPWORDS 0.", UserWarning, stacklevel=2, ) - # Use filtered phrase, or original if all words were stopwords - phrase = " ".join(filtered_words) if filtered_words else value - search_value = f'"{phrase}"' + escaped_words = [] + for w in (filtered_words if filtered_words else words): + if w.startswith("~"): + # Preserve ~ optional-term prefix, escape the rest + escaped_words.append("~" + self._escape_fulltext_term(w[1:])) + else: + escaped_words.append(self._escape_fulltext_term(w)) + + terms = " ".join(escaped_words) + search_value = f"({terms})" else: - search_value = value + # Single-word FULLTEXT — escape to prevent accidental operator injection. + # Preserve ~ optional-term prefix (same as multi-word branch). 
+ if value.startswith("~"): + search_value = "~" + self._escape_fulltext_term(value[1:]) + else: + search_value = self._escape_fulltext_term(value) - return f"{prefix}@{field}:{search_value}" + # Handle multi-field search — use computed search_value with multi-field syntax + if isinstance(field, list): + field_str = "|".join(field) + base = f"{prefix}(@{field_str}:{search_value})" + else: + base = f"{prefix}@{field}:{search_value}" + + # Append query attributes (slop, inorder) if specified + if slop is not None: + if not isinstance(slop, int) or isinstance(slop, bool) or slop < 0: + raise ValueError(f"slop must be a non-negative integer (got {slop!r})") + attrs = f"$slop: {slop};" + if inorder: + attrs += " $inorder: true;" + base = f"{base} => {{ {attrs} }}" + + return base def _escape_tag_value(self, value: str) -> str: """Escape special characters in TAG values.""" diff --git a/sql_redis/translator.py b/sql_redis/translator.py index b594362..8f3f199 100644 --- a/sql_redis/translator.py +++ b/sql_redis/translator.py @@ -28,6 +28,7 @@ class TranslatedQuery: query_string: str args: list[str] = field(default_factory=list) params: dict[str, object] = field(default_factory=dict) # Named parameters + score_alias: str | None = None # Alias for score column when WITHSCORES is used def to_command_list(self) -> list[str]: """Return as a list suitable for redis.execute_command().""" @@ -161,6 +162,13 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery: query_string = self._build_query_string(analyzed) if use_aggregate: + if parsed.scoring is not None: + raise ValueError( + "score() is not supported with FT.AGGREGATE queries. " + "WITHSCORES / SCORER are FT.SEARCH-only features. " + "Remove score() or avoid GROUP BY / aggregation functions " + "in the same query." 
+ ) return self._build_aggregate(analyzed, query_string) else: return self._build_search(analyzed, query_string) @@ -218,11 +226,28 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str: condition.field, is_missing=(condition.operator == "IS_NULL") ) - # Determine if this is a negation (either explicit or via != operator) + # Reject text-only operators on non-TEXT fields — fuzzy() and fulltext() + # only make sense for TEXT fields; silently falling through to TAG/NUMERIC + # would produce incorrect queries. + if condition.operator in ("FUZZY", "FULLTEXT", "LIKE") and field_type != "TEXT": + op_display = ( + "LIKE" + if condition.operator == "LIKE" + else f"{condition.operator.lower()}()" + ) + raise ValueError( + f"{op_display} can only be used on TEXT fields, " + f"but '{condition.field}' is {field_type or 'unknown'}." + ) + + # Resolve negation using XOR so that double negation cancels out. + # e.g. NOT (field != 'x') → negated=True, op='!=' → is_negated=False. operator = condition.operator - is_negated = condition.negated or operator == "!=" - if condition.negated and operator == "=": - operator = "!=" + is_negated = condition.negated ^ (operator == "!=") + # Normalize = / != to match the resolved negation state so every + # downstream builder sees a consistent (operator, negated) pair. 
+ if operator in ("=", "!="): + operator = "!=" if is_negated else "=" if field_type == "TEXT": return self._query_builder.build_text_condition( @@ -230,6 +255,9 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str: operator, str(condition.value), is_negated, + fuzzy_level=condition.fuzzy_level, + slop=condition.slop, + inorder=condition.inorder, ) elif field_type == "TAG": # Keep list value for IN clauses, convert scalar to string @@ -252,6 +280,12 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str: low_val = self._convert_to_numeric(low) high_val = self._convert_to_numeric(high) numeric_value = (low_val, high_val) + elif isinstance(condition.value, bool): + raise ValueError( + f"Boolean value {condition.value!r} is not valid in a " + "numeric context. Use 1/0 instead of true/false for " + "numeric fields." + ) elif isinstance(condition.value, (int, float)): numeric_value = condition.value else: @@ -283,6 +317,11 @@ def _convert_to_numeric(self, value: object) -> int | float: Raises: ValueError: If conversion fails. """ + if isinstance(value, bool): + raise ValueError( + f"Boolean value {value!r} is not valid in a numeric context. " + "Use 1/0 instead of true/false for numeric fields." + ) if isinstance(value, (int, float)): return value if isinstance(value, str): @@ -319,21 +358,51 @@ def _build_search( if analyzed.vector_search.alias not in return_fields: return_fields.append(analyzed.vector_search.alias) - if return_fields and return_fields != ["*"]: + # When score() is the only SELECT expression, parsed.fields is empty. + # We still need a RETURN clause to avoid leaking full document payloads. + # Score itself is delivered via WITHSCORES (not RETURN), but we must + # emit RETURN 0 so Redis returns no document attributes beyond the score. 
+ score_only_select = parsed.scoring is not None and not return_fields + + if score_only_select: + # RETURN 0 — suppress all document fields, score comes via WITHSCORES + args.extend(["RETURN", "0"]) + elif return_fields and return_fields != ["*"]: args.append("RETURN") args.append(str(len(return_fields))) args.extend(return_fields) - # SORTBY + # SORTBY — skip if the ORDER BY field is a score() alias, because + # WITHSCORES already returns results in relevance order and the alias + # is not a sortable indexed field. + score_alias_name = parsed.scoring.alias if parsed.scoring else None if parsed.orderby_fields: field_name, direction = parsed.orderby_fields[0] - args.extend(["SORTBY", field_name, direction]) + if field_name == score_alias_name: + # score() alias — not a real field; RediSearch sorts by + # relevance by default when no SORTBY is specified. + if direction == "ASC": + raise ValueError( + f"ORDER BY {field_name} ASC is not supported: " + "RediSearch returns results in descending relevance " + "order by default and does not support ascending " + "score sorting via FT.SEARCH." 
+ ) + # DESC is the default — omit SORTBY entirely + else: + args.extend(["SORTBY", field_name, direction]) # LIMIT if parsed.limit is not None: offset = parsed.offset or 0 args.extend(["LIMIT", str(offset), str(parsed.limit)]) + # Scoring — WITHSCORES and SCORER + if parsed.scoring is not None: + args.append("WITHSCORES") + if parsed.scoring.scorer: + args.extend(["SCORER", parsed.scoring.scorer]) + # DIALECT 2 — unconditionally appended as the last arguments args.extend(["DIALECT", "2"]) @@ -343,6 +412,7 @@ def _build_search( query_string=query_string, args=args, params=params, + score_alias=(parsed.scoring.alias if parsed.scoring is not None else None), ) def _build_geo_filter_args(self, geo_cond: GeoDistanceCondition) -> list[str]: diff --git a/tests/test_parameter_substitution.py b/tests/test_parameter_substitution.py index f123038..c7affe7 100644 --- a/tests/test_parameter_substitution.py +++ b/tests/test_parameter_substitution.py @@ -211,9 +211,13 @@ def test_empty_string_value(self, param_executor: Executor, param_test_index: st Note: Redis Search doesn't handle empty string literals well in TEXT fields. This is a Redis limitation, not a parameter substitution bug. 
""" - # Empty strings cause Redis syntax errors in TEXT field queries + # Empty strings cause Redis errors in TEXT field queries # This is expected behavior - Redis Search requires non-empty search terms - with pytest.raises(redis.exceptions.ResponseError, match="Syntax error"): + # With exact phrase syntax (@field:""), Redis may return "Syntax error" + # or "INDEXEMPTY" guidance depending on the Redis version + with pytest.raises( + redis.exceptions.ResponseError, match="Syntax error|INDEXEMPTY" + ): param_executor.execute( f"SELECT * FROM {param_test_index} WHERE name = :name", params={"name": ""}, diff --git a/tests/test_query_builder.py b/tests/test_query_builder.py index 07e5865..d01c1be 100644 --- a/tests/test_query_builder.py +++ b/tests/test_query_builder.py @@ -8,27 +8,76 @@ class TestQueryBuilderTextFields: """Tests for building TEXT field query syntax.""" - def test_text_single_term(self): - """TEXT field with single term: @field:term.""" + def test_text_single_term_exact(self): + """TEXT field with = wraps in quotes for exact phrase: @field:"term".""" builder = QueryBuilder() result = builder.build_text_condition("title", "=", "laptop") - assert result == "@title:laptop" + assert result == '@title:"laptop"' def test_text_exact_phrase(self): - """TEXT field with phrase: @field:"exact phrase".""" + """TEXT field with = preserves multi-word phrase: @field:"exact phrase".""" builder = QueryBuilder() result = builder.build_text_condition("title", "=", "gaming laptop") assert result == '@title:"gaming laptop"' - def test_text_match_term(self): - """TEXT field with MATCH: @field:term.""" + def test_text_exact_phrase_strips_stopwords(self): + """TEXT field with = strips stopwords and warns (RediSearch doesn't index them).""" + builder = QueryBuilder() + import warnings + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = builder.build_text_condition("name", "=", "bank of america") + + # "of" is a stopword — stripped 
so the phrase matches indexed positions + assert result == '@name:"bank america"' + assert len(w) == 1 + assert "Stopwords ['of']" in str(w[0].message) + + def test_text_exact_phrase_no_stopwords_no_warning(self): + """TEXT field with = on phrase without stopwords produces no warning.""" + builder = QueryBuilder() + import warnings + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = builder.build_text_condition("name", "=", "bank america") + + assert result == '@name:"bank america"' + assert len(w) == 0 + + def test_text_exact_phrase_escapes_quotes(self): + """TEXT field with = escapes double quotes inside the value.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "=", 'say "hello"') + + assert result == r'@title:"say \"hello\""' + + def test_text_exact_phrase_escapes_backslashes(self): + """TEXT field with = escapes backslashes inside the value.""" builder = QueryBuilder() - result = builder.build_text_condition("title", "MATCH", "laptop") + result = builder.build_text_condition("path", "=", r"c:\users\docs") + + assert result == r'@path:"c:\\users\\docs"' + + def test_text_fulltext_term(self): + """TEXT field with FULLTEXT (tokenized search): @field:term.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "laptop") assert result == "@title:laptop" + def test_text_fulltext_multi_word(self): + """TEXT field with FULLTEXT and multi-word: @field:(term1 term2).""" + builder = QueryBuilder() + result = builder.build_text_condition( + "description", "FULLTEXT", "gaming laptop" + ) + + assert result == "@description:(gaming laptop)" + def test_text_prefix_search(self): """TEXT field with prefix: @field:prefix*.""" builder = QueryBuilder() @@ -40,7 +89,7 @@ def test_text_negation(self): """TEXT field with NOT: -@field:term.""" builder = QueryBuilder() result = builder.build_text_condition( - "title", "MATCH", "refurbished", negated=True + "title", 
"FULLTEXT", "refurbished", negated=True ) assert result == "-@title:refurbished" @@ -52,11 +101,18 @@ def test_text_fuzzy_match(self): assert result == "@title:%laptap%" + def test_text_fulltext_special_chars_escaped(self): + """FULLTEXT term with RediSearch operator chars is escaped to avoid injection.""" + builder = QueryBuilder() + result = builder.build_text_condition("description", "FULLTEXT", "anti-virus") + + assert result == r"@description:anti\-virus" + def test_text_multi_field(self): """TEXT multi-field search: (@field1|field2:term).""" builder = QueryBuilder() result = builder.build_text_condition( - ["title", "description"], "MATCH", "wireless" + ["title", "description"], "FULLTEXT", "wireless" ) assert result == "(@title|description:wireless)" @@ -286,7 +342,7 @@ def test_simple_text_query(self): """Build simple text search query.""" builder = QueryBuilder() result = builder.build_query_string( - text_conditions=[("title", "MATCH", "laptop")], + text_conditions=[("title", "FULLTEXT", "laptop")], field_types={"title": "TEXT"}, ) @@ -296,7 +352,7 @@ def test_combined_query(self): """Build combined text + numeric + tag query.""" builder = QueryBuilder() result = builder.build_query_string( - text_conditions=[("title", "MATCH", "laptop")], + text_conditions=[("title", "FULLTEXT", "laptop")], numeric_conditions=[("price", "<", 1000)], tag_conditions=[("category", "=", "electronics")], field_types={"title": "TEXT", "price": "NUMERIC", "category": "TAG"}, @@ -314,6 +370,165 @@ def test_wildcard_query(self): assert result == "*" +class TestQueryBuilderFuzzyLevels: + """Tests for fuzzy matching with Levenshtein distance levels 1-3.""" + + def test_fuzzy_ld1_default(self): + """Fuzzy LD=1 (default): @field:%term%.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FUZZY", "laptap") + assert result == "@title:%laptap%" + + def test_fuzzy_ld1_explicit(self): + """Fuzzy LD=1 (explicit): @field:%term%.""" + builder = QueryBuilder() + 
result = builder.build_text_condition("title", "FUZZY", "laptap", fuzzy_level=1) + assert result == "@title:%laptap%" + + def test_fuzzy_ld2(self): + """Fuzzy LD=2: @field:%%term%%.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FUZZY", "laptap", fuzzy_level=2) + assert result == "@title:%%laptap%%" + + def test_fuzzy_ld3(self): + """Fuzzy LD=3: @field:%%%term%%%.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FUZZY", "laptap", fuzzy_level=3) + assert result == "@title:%%%laptap%%%" + + def test_fuzzy_negated(self): + """Fuzzy with negation: -@field:%term%.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FUZZY", "laptap", negated=True, fuzzy_level=2 + ) + assert result == "-@title:%%laptap%%" + + def test_fuzzy_invalid_level_raises(self): + """Fuzzy level outside 1-3 raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Fuzzy level must be 1, 2, or 3"): + builder.build_text_condition("title", "FUZZY", "laptap", fuzzy_level=4) + + def test_fuzzy_level_zero_raises(self): + """Fuzzy level 0 raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Fuzzy level must be 1, 2, or 3"): + builder.build_text_condition("title", "FUZZY", "laptap", fuzzy_level=0) + + +class TestQueryBuilderSuffixInfix: + """Tests for suffix and infix (contains) matching.""" + + def test_suffix_match(self): + """LIKE '%term' -> suffix match: @field:*term.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "LIKE", "%phone") + assert result == "@title:*phone" + + def test_infix_match(self): + """LIKE '%term%' -> infix/contains match: @field:*term*.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "LIKE", "%phone%") + assert result == "@title:*phone*" + + def test_prefix_match_still_works(self): + """LIKE 'term%' -> prefix match: @field:term* (unchanged).""" + builder = 
QueryBuilder() + result = builder.build_text_condition("title", "LIKE", "lap%") + assert result == "@title:lap*" + + def test_suffix_negated(self): + """Suffix match with negation.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "LIKE", "%phone", negated=True) + assert result == "-@title:*phone" + + def test_infix_multiword_grouped(self): + """LIKE '%multi word%' groups tokens in parentheses.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "LIKE", "%gaming laptop%") + assert result == "@title:(*gaming laptop*)" + + +class TestQueryBuilderORInText: + """Tests for OR/union within text field searches.""" + + def test_fulltext_or_terms(self): + """FULLTEXT with OR: @field:(term1|term2).""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "laptop OR tablet") + assert result == "@title:(laptop|tablet)" + + def test_fulltext_or_multiple_terms(self): + """FULLTEXT with multiple OR: @field:(t1|t2|t3).""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "laptop OR tablet OR phone" + ) + assert result == "@title:(laptop|tablet|phone)" + + def test_fulltext_or_negated(self): + """FULLTEXT OR with negation: -@field:(term1|term2).""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "laptop OR tablet", negated=True + ) + assert result == "-@title:(laptop|tablet)" + + def test_fulltext_and_still_works(self): + """FULLTEXT without OR: @field:(term1 term2) (AND semantics, unchanged).""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "gaming laptop") + assert result == "@title:(gaming laptop)" + + +class TestQueryBuilderProximity: + """Tests for proximity search (slop and inorder).""" + + def test_fulltext_with_slop(self): + """FULLTEXT with slop: @field:(term1 term2) => {$slop: N}.""" + builder = QueryBuilder() + result = builder.build_text_condition( + 
"title", "FULLTEXT", "gaming laptop", slop=2 + ) + assert result == "@title:(gaming laptop) => { $slop: 2; }" + + def test_fulltext_with_slop_and_inorder(self): + """FULLTEXT with slop and inorder.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "gaming laptop", slop=2, inorder=True + ) + assert result == "@title:(gaming laptop) => { $slop: 2; $inorder: true; }" + + def test_exact_phrase_with_slop(self): + """Exact phrase with slop appended.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "=", "gaming laptop", slop=1) + assert result == '@title:"gaming laptop" => { $slop: 1; }' + + def test_slop_negated(self): + """Proximity with negation.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "gaming laptop", negated=True, slop=3 + ) + assert result == "-@title:(gaming laptop) => { $slop: 3; }" + + +class TestQueryBuilderOptionalTerms: + """Tests for optional term (~) syntax.""" + + def test_fulltext_optional_term(self): + """FULLTEXT with optional terms using ~ prefix in value.""" + builder = QueryBuilder() + # User writes: fulltext(field, 'required ~optional') + result = builder.build_text_condition("title", "FULLTEXT", "laptop ~gaming") + assert result == "@title:(laptop ~gaming)" + + class TestQueryBuilderMissingCondition: """Tests for ismissing() query syntax.""" @@ -328,3 +543,256 @@ def test_build_missing_condition_is_not_null(self): builder = QueryBuilder() result = builder.build_missing_condition("email", is_missing=False) assert result == "-ismissing(@email)" + + +class TestQueryBuilderEscaping: + """Tests for escaping special characters in text search values.""" + + def test_escape_fulltext_term_special_chars(self): + """_escape_fulltext_term escapes RediSearch operator characters.""" + builder = QueryBuilder() + result = builder._escape_fulltext_term("hello|world") + assert result == "hello\\|world" + + def 
test_escape_fulltext_term_double_quote(self): + """_escape_fulltext_term escapes double quotes.""" + builder = QueryBuilder() + result = builder._escape_fulltext_term('say "hello"') + assert result == 'say \\"hello\\"' + + def test_escape_fulltext_term_at_sign(self): + """_escape_fulltext_term escapes @ to prevent field injection.""" + builder = QueryBuilder() + result = builder._escape_fulltext_term("user@email") + assert result == "user\\@email" + + def test_like_escapes_special_chars(self): + """LIKE pattern escapes special chars in the non-wildcard portion.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "LIKE", "%hello|world%") + assert result == "@title:*hello\\|world*" + + def test_fuzzy_escapes_special_chars(self): + """FUZZY escapes special chars in the term before wrapping with %.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FUZZY", "hello@world") + assert result == "@title:%hello\\@world%" + + def test_fuzzy_escapes_double_quote(self): + """FUZZY escapes double quotes in term.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FUZZY", 'say"hi') + assert result == '@title:%say\\"hi%' + + def test_multi_field_non_exact_escapes(self): + """Multi-field search with non-exact operator escapes special chars.""" + builder = QueryBuilder() + result = builder.build_text_condition( + ["title", "description"], "FULLTEXT", "hello|world" + ) + assert result == "(@title|description:hello\\|world)" + + def test_or_terms_escape_special_chars(self): + """OR operands are escaped before joining with |.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "laptop OR anti-virus" + ) + # '-' is a RediSearch operator and should be escaped in OR terms + assert result == "@title:(laptop|anti\\-virus)" + + def test_or_terms_escape_at_sign(self): + """OR operands escape @ to prevent field injection.""" + builder = QueryBuilder() + result = 
builder.build_text_condition("title", "FULLTEXT", "hello OR @world") + assert result == "@title:(hello|\\@world)" + + def test_multiword_fulltext_escapes_special_chars(self): + """Multi-word FULLTEXT escapes dangerous chars like @ and |.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "hello @world") + assert result == "@title:(hello \\@world)" + + def test_multiword_fulltext_preserves_optional_prefix(self): + """Multi-word FULLTEXT preserves ~ optional-term prefix.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "laptop ~gaming") + assert result == "@title:(laptop ~gaming)" + + def test_multiword_fulltext_escapes_dash(self): + """Multi-word FULLTEXT escapes - to prevent accidental negation.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "hello anti-virus") + assert result == "@title:(hello anti\\-virus)" + + def test_multi_field_multiword_fulltext(self): + """Multi-field with multi-word FULLTEXT scopes all terms to fields.""" + builder = QueryBuilder() + result = builder.build_text_condition( + ["title", "description"], "FULLTEXT", "gaming laptop" + ) + assert result == "(@title|description:(gaming laptop))" + + def test_multi_field_or_fulltext(self): + """Multi-field with OR FULLTEXT uses pipe-separated terms.""" + builder = QueryBuilder() + result = builder.build_text_condition( + ["title", "description"], "FULLTEXT", "laptop OR tablet" + ) + assert result == "(@title|description:(laptop|tablet))" + + def test_multi_field_fuzzy(self): + """Multi-field with FUZZY wraps with % markers.""" + builder = QueryBuilder() + result = builder.build_text_condition( + ["title", "description"], "FUZZY", "laptap", fuzzy_level=2 + ) + assert result == "(@title|description:%%laptap%%)" + + def test_lowercase_or_is_not_boolean(self): + """Lowercase 'or' is treated as a regular search term, not a boolean operator. 
+ + 'bank or america' should NOT become bank|america — it should be a + multi-word AND-style search with stopword filtering applied to 'or'. + """ + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "laptop or tablet") + # "or" is a stopword; remaining terms are "laptop" and "tablet" + assert result == "@title:(laptop tablet)" + + def test_mixed_case_or_is_not_boolean(self): + """Mixed case 'Or' / 'oR' is treated as a regular term, not boolean OR. + + Only uppercase 'OR' triggers the union operator. + 'Or' is a stopword (or → stopword list), so it gets removed. + """ + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "laptop Or tablet") + # "Or" lowercases to "or" which is a stopword; remaining: laptop tablet + assert result == "@title:(laptop tablet)" + + def test_or_extra_whitespace(self): + """OR parsing tolerates extra whitespace.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "laptop OR tablet" + ) + assert result == "@title:(laptop|tablet)" + + def test_or_trailing_raises(self): + """Trailing OR with no operand raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Empty operand"): + builder.build_text_condition("title", "FULLTEXT", "laptop OR") + + def test_or_leading_raises(self): + """Leading OR with no operand raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Empty operand"): + builder.build_text_condition("title", "FULLTEXT", "OR tablet") + + def test_or_only_raises(self): + """Bare 'OR' with no operands raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Empty operand"): + builder.build_text_condition("title", "FULLTEXT", "OR") + + def test_or_operand_stopword_filtered(self): + """Stopwords inside OR operands are stripped with a warning.""" + builder = QueryBuilder() + with pytest.warns(UserWarning, match="Stopwords.*removed 
from OR"): + result = builder.build_text_condition("title", "FULLTEXT", "laptop OR the") + # "the" is a stopword — after filtering only "laptop" remains on + # the right side, but since the right operand was *only* a stopword + # and falls back to original words, we keep it. + # Actually "the" is the only word so filtered=[] → fallback to ["the"]. + # Let's just verify the left side is clean. + assert "laptop" in result + + def test_or_multi_word_operand_stopword_filtered(self): + """Stopwords in multi-word OR operands are stripped.""" + builder = QueryBuilder() + with pytest.warns(UserWarning, match="Stopwords.*removed from OR"): + result = builder.build_text_condition( + "title", "FULLTEXT", "gaming laptop OR the tablet" + ) + # "the" stripped from second operand → "tablet" + assert result == "@title:((gaming laptop)|tablet)" + + def test_or_all_stopwords_operand_warns(self): + """OR operand that is entirely stopwords falls back but warns.""" + builder = QueryBuilder() + with pytest.warns(UserWarning, match="Stopwords.*removed from OR"): + result = builder.build_text_condition("title", "FULLTEXT", "laptop OR the") + # "the" is sole token and a stopword → filtered=[] → fallback to ["the"] + assert result == "@title:(laptop|the)" + + def test_escape_asterisk_in_fulltext(self): + """Literal * in FULLTEXT is escaped to prevent wildcard.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "hello*world") + assert result == r"@title:hello\*world" + + def test_escape_plus_in_fulltext(self): + """Literal + in FULLTEXT is escaped to prevent mandatory-term.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "C++") + assert result == r"@title:C\+\+" + + def test_single_term_optional_prefix_preserved(self): + """Single-term FULLTEXT with ~ prefix preserves optional semantics.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "~gaming") + assert result == 
"@title:~gaming" + + def test_or_multiword_operand_grouped(self): + """OR with multi-word operand wraps it in parentheses.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "gaming laptop OR tablet" + ) + assert result == "@title:((gaming laptop)|tablet)" + + def test_or_both_multiword_operands_grouped(self): + """OR with multi-word operands on both sides wraps each.""" + builder = QueryBuilder() + result = builder.build_text_condition( + "title", "FULLTEXT", "gaming laptop OR android tablet" + ) + assert result == "@title:((gaming laptop)|(android tablet))" + + def test_or_trailing_empty_operand_raises(self): + """Trailing OR with empty operand raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Empty operand in OR expression"): + builder.build_text_condition("title", "FULLTEXT", "laptop OR ") + + def test_or_leading_empty_operand_raises(self): + """Leading OR with empty operand raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="Empty operand in OR expression"): + builder.build_text_condition("title", "FULLTEXT", " OR tablet") + + def test_or_preserves_optional_prefix(self): + """OR operand with ~ prefix preserves optional-term semantics.""" + builder = QueryBuilder() + result = builder.build_text_condition("title", "FULLTEXT", "laptop OR ~gaming") + assert result == "@title:(laptop|~gaming)" + + +class TestQueryBuilderSlopValidation: + """Tests for slop validation at the QueryBuilder level.""" + + def test_slop_negative_raises(self): + """Negative slop raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="non-negative integer"): + builder.build_text_condition("title", "FULLTEXT", "gaming laptop", slop=-1) + + def test_slop_boolean_raises(self): + """Boolean slop raises ValueError.""" + builder = QueryBuilder() + with pytest.raises(ValueError, match="non-negative integer"): + builder.build_text_condition( + 
"title", "FULLTEXT", "gaming laptop", slop=True + ) diff --git a/tests/test_sql_parser.py b/tests/test_sql_parser.py index 10e63cb..72a4f56 100644 --- a/tests/test_sql_parser.py +++ b/tests/test_sql_parser.py @@ -612,22 +612,16 @@ def test_parse_non_fulltext_function_in_where(self): assert len(result.conditions) == 0 def test_parse_fulltext_non_column_first_arg(self): - """Parse fulltext with non-column first argument.""" + """Parse fulltext with non-column first argument raises ValueError.""" parser = SQLParser() - result = parser.parse( - "SELECT * FROM products WHERE fulltext(UPPER(title), 'query')" - ) - - # First arg is a function, not Column - condition skipped - assert len(result.conditions) == 0 + with pytest.raises(ValueError, match="must be a column name"): + parser.parse("SELECT * FROM products WHERE fulltext(UPPER(title), 'query')") def test_parse_fulltext_insufficient_args(self): - """Parse fulltext with insufficient arguments.""" + """Parse fulltext with insufficient arguments raises ValueError.""" parser = SQLParser() - result = parser.parse("SELECT * FROM products WHERE fulltext(title)") - - # Only 1 arg, needs >= 2 - condition skipped - assert len(result.conditions) == 0 + with pytest.raises(ValueError, match="requires at least 2 arguments"): + parser.parse("SELECT * FROM products WHERE fulltext(title)") def test_parse_geo_distance_no_args(self): """Parse geo_distance with no arguments in comparison.""" @@ -855,3 +849,85 @@ def test_exists_in_where_raises_error(self): parser = SQLParser() with pytest.raises(ValueError, match="exists.*aggregate"): parser.parse("SELECT * FROM idx WHERE exists(email)") + + +class TestSQLParserFulltextValidation: + """Tests for fulltext() argument validation.""" + + def test_fulltext_negative_slop_raises(self): + """Negative slop in fulltext() raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="non-negative integer"): + parser.parse("SELECT * FROM idx WHERE fulltext(title, 'hello 
world', -1)") + + def test_fulltext_float_slop_raises(self): + """Float slop in fulltext() raises ValueError instead of silently truncating.""" + parser = SQLParser() + with pytest.raises(ValueError, match="must be an integer"): + parser.parse("SELECT * FROM idx WHERE fulltext(title, 'hello world', 2.9)") + + def test_fuzzy_float_level_raises(self): + """Float fuzzy level raises ValueError instead of silently truncating.""" + parser = SQLParser() + with pytest.raises(ValueError, match="must be an integer"): + parser.parse("SELECT * FROM idx WHERE fuzzy(title, 'laptap', 2.9)") + + def test_fulltext_boolean_slop_raises(self): + """Boolean slop in fulltext() raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="must be an integer"): + parser.parse("SELECT * FROM idx WHERE fulltext(title, 'hello world', true)") + + def test_fuzzy_boolean_level_raises(self): + """Boolean fuzzy level raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="must be an integer"): + parser.parse("SELECT * FROM idx WHERE fuzzy(title, 'laptap', true)") + + def test_fulltext_invalid_inorder_raises(self): + """Invalid inorder value (e.g., 'yes') raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="inorder argument must be a boolean"): + parser.parse( + "SELECT * FROM idx WHERE fulltext(title, 'hello world', 0, 'yes')" + ) + + def test_fulltext_no_value_raises(self): + """fulltext() with only field arg raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="requires at least 2 arguments"): + parser.parse("SELECT * FROM idx WHERE fulltext(title)") + + def test_fuzzy_no_value_raises(self): + """fuzzy() with only field arg raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="requires at least 2 arguments"): + parser.parse("SELECT * FROM idx WHERE fuzzy(title)") + + def test_fulltext_too_many_args_raises(self): + """fulltext() with more than 4 
arguments raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="at most 4 arguments"): + parser.parse( + "SELECT * FROM idx WHERE fulltext(title, 'hello world', 2, true, 'extra')" + ) + + def test_fuzzy_too_many_args_raises(self): + """fuzzy() with more than 3 arguments raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="at most 3 arguments"): + parser.parse("SELECT * FROM idx WHERE fuzzy(title, 'laptap', 2, 'extra')") + + def test_fuzzy_non_column_first_arg_raises(self): + """fuzzy() with non-column first argument raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="must be a column name"): + parser.parse("SELECT * FROM idx WHERE fuzzy('title', 'laptap')") + + def test_score_non_literal_arg_raises(self): + """score() with a non-literal argument (e.g., column ref) raises ValueError.""" + parser = SQLParser() + with pytest.raises(ValueError, match="must be a literal scorer name"): + parser.parse( + "SELECT score(my_column) AS relevance FROM idx WHERE fulltext(title, 'laptop')" + ) diff --git a/tests/test_sql_queries.py b/tests/test_sql_queries.py index 5758349..0fb7b9b 100644 --- a/tests/test_sql_queries.py +++ b/tests/test_sql_queries.py @@ -340,3 +340,135 @@ def test_limit_with_offset(self, executor: Executor, products_data: str): if len(all_books.rows) > 1 and len(paginated.rows) >= 1: # Second item from all_books should be first in paginated result assert all_books.rows[1]["title"] == paginated.rows[0]["title"] + + +class TestFuzzySearch: + """Integration tests for fuzzy text search with Levenshtein distance levels.""" + + def test_fuzzy_ld1_finds_misspelled(self, executor: Executor, products_data: str): + """fuzzy(field, 'laptap') at LD=1 should find 'laptop' titles.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fuzzy(title, 'laptap')" + ) + assert len(result.rows) >= 1, "Fuzzy LD=1 should match 'laptop' from 'laptap'" + for row in 
result.rows: + assert "laptop" in row["title"].lower() + + def test_fuzzy_ld2(self, executor: Executor, products_data: str): + """fuzzy(field, 'laptep', 2) at LD=2 should still find 'laptop'.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fuzzy(title, 'laptep', 2)" + ) + assert len(result.rows) >= 1, "Fuzzy LD=2 should match 'laptop' from 'laptep'" + + def test_fuzzy_ld3(self, executor: Executor, products_data: str): + """fuzzy(field, 'loptep', 3) at LD=3 should find 'laptop'.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fuzzy(title, 'loptep', 3)" + ) + assert len(result.rows) >= 1, "Fuzzy LD=3 should match 'laptop' from 'loptep'" + + +class TestSuffixInfixSearch: + """Integration tests for suffix and infix (contains) pattern matching.""" + + def test_prefix_search(self, executor: Executor, products_data: str): + """LIKE 'lap%' should find laptop titles (prefix match).""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE title LIKE 'lap%'" + ) + assert len(result.rows) >= 1, "Prefix 'lap%' should match laptop titles" + + def test_suffix_search(self, executor: Executor, products_data: str): + """LIKE '%board' should find keyboard titles (suffix match).""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE title LIKE '%board'" + ) + # "Mechanical Keyboard" has 'board' at end of 'Keyboard' + assert len(result.rows) >= 1, "Suffix '%board' should match 'Keyboard'" + + def test_infix_search(self, executor: Executor, products_data: str): + """LIKE '%ouse%' should find 'Wireless Mouse' (contains match).""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE title LIKE '%ouse%'" + ) + assert len(result.rows) >= 1, "Infix '%ouse%' should match 'Mouse'" + + +class TestORInTextSearch: + """Integration tests for OR/union within text field searches.""" + + def test_fulltext_or_two_terms(self, executor: Executor, products_data: str): + """fulltext(field, 
'laptop OR keyboard') should find both.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fulltext(title, 'laptop OR keyboard')" + ) + titles = [row["title"].lower() for row in result.rows] + has_laptop = any("laptop" in t for t in titles) + has_keyboard = any("keyboard" in t for t in titles) + assert ( + has_laptop and has_keyboard + ), f"Should find both laptop and keyboard titles, got: {titles}" + + def test_fulltext_or_three_terms(self, executor: Executor, products_data: str): + """fulltext(field, 'laptop OR mouse OR lamp') should find all three.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fulltext(title, 'laptop OR mouse OR lamp')" + ) + assert ( + len(result.rows) >= 3 + ), f"Should find at least 3 products (laptop, mouse, lamp), got {len(result.rows)}" + + +class TestProximitySearch: + """Integration tests for proximity search (slop + inorder).""" + + def test_fulltext_with_slop(self, executor: Executor, products_data: str): + """fulltext(title, 'gaming pro', 2) should find 'Gaming laptop Pro'.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fulltext(title, 'gaming pro', 2)" + ) + assert ( + len(result.rows) >= 1 + ), "Slop=2 should find 'Gaming laptop Pro' (1 word between gaming and pro)" + + def test_fulltext_with_slop_and_inorder( + self, executor: Executor, products_data: str + ): + """fulltext(title, 'gaming pro', 2, true) with inorder should match.""" + result = executor.execute( + f"SELECT title FROM {products_data} WHERE fulltext(title, 'gaming pro', 2, true)" + ) + assert ( + len(result.rows) >= 1 + ), "Slop=2 with inorder should find 'Gaming laptop Pro'" + + +class TestBM25Scoring: + """Integration tests for relevance scoring with WITHSCORES.""" + + def test_score_returns_relevance(self, executor: Executor, products_data: str): + """score() in SELECT should return relevance scores.""" + result = executor.execute( + f"""SELECT title, score() AS relevance + FROM 
{products_data} + WHERE fulltext(title, 'laptop')""" + ) + assert len(result.rows) >= 1, "Should return results with scores" + for row in result.rows: + assert "relevance" in row, f"Row should have 'relevance' key: {row}" + score = float(row["relevance"]) + assert score >= 0, f"Score should be non-negative, got {score}" + + def test_score_custom_scorer(self, executor: Executor, products_data: str): + """score('TFIDF') should use TFIDF scorer.""" + result = executor.execute( + f"""SELECT title, score('TFIDF') AS relevance + FROM {products_data} + WHERE fulltext(title, 'laptop')""" + ) + assert len(result.rows) >= 1, "Should return results with TFIDF scores" + for row in result.rows: + assert "relevance" in row + score = float(row["relevance"]) + assert score >= 0 diff --git a/tests/test_translator.py b/tests/test_translator.py index 4240cad..23fda9a 100644 --- a/tests/test_translator.py +++ b/tests/test_translator.py @@ -136,7 +136,7 @@ def test_select_with_text_filter(self, translator: Translator, basic_index: str) ) assert result.command == "FT.SEARCH" - assert result.query_string == "@title:hello" + assert result.query_string == '@title:"hello"' def test_select_with_numeric_filter(self, translator: Translator, basic_index: str): """SELECT with NUMERIC field condition.""" @@ -202,7 +202,7 @@ def test_and_conditions(self, translator: Translator, basic_index: str): f"SELECT * FROM {basic_index} WHERE title = 'hello' AND price > 50" ) - assert "@title:hello" in result.query_string + assert '@title:"hello"' in result.query_string assert "@price:[(50 +inf]" in result.query_string def test_or_conditions(self, translator: Translator, basic_index: str): @@ -213,6 +213,13 @@ def test_or_conditions(self, translator: Translator, basic_index: str): assert "|" in result.query_string # OR uses pipe + def test_boolean_in_numeric_context_raises( + self, translator: Translator, basic_index: str + ): + """WHERE price = true should raise, not produce @price:[True True].""" + with 
pytest.raises(ValueError, match="Boolean value"): + translator.translate(f"SELECT * FROM {basic_index} WHERE price = true") + class TestTranslatorAggregate: """Tests for FT.AGGREGATE translation.""" @@ -416,6 +423,14 @@ def test_not_condition(self, translator: Translator, basic_index: str): assert "-@title" in result.query_string + def test_double_negation_cancels(self, translator: Translator, basic_index: str): + """NOT (field != x) double negation resolves to positive match.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE NOT title != 'good'" + ) + + assert result.query_string == '@title:"good"' + class TestTranslatorOutput: """Tests for output format methods.""" @@ -616,3 +631,250 @@ def test_select_star_with_having_uses_load_all( assert "LOAD" in result.args load_idx = result.args.index("LOAD") assert result.args[load_idx + 1] == "*" + + +class TestTranslatorFuzzyLevels: + """Tests for FUZZY with Levenshtein distance levels. + + Inspired by PostgreSQL's pg_trgm similarity threshold levels, + maps to RediSearch's %, %%, %%% fuzzy syntax. 
+ """ + + def test_fuzzy_ld1_default(self, translator: Translator, basic_index: str): + """fuzzy(field, 'term') with no level → LD=1 (%term%).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fuzzy(title, 'laptap')" + ) + assert result.command == "FT.SEARCH" + assert "@title:%laptap%" in result.query_string + + def test_fuzzy_ld2(self, translator: Translator, basic_index: str): + """fuzzy(field, 'term', 2) → LD=2 (%%term%%).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fuzzy(title, 'laptap', 2)" + ) + assert "@title:%%laptap%%" in result.query_string + + def test_fuzzy_ld3(self, translator: Translator, basic_index: str): + """fuzzy(field, 'term', 3) → LD=3 (%%%term%%%).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fuzzy(title, 'laptap', 3)" + ) + assert "@title:%%%laptap%%%" in result.query_string + + def test_fuzzy_on_tag_field_raises(self, translator: Translator, basic_index: str): + """fuzzy() on a TAG field raises ValueError.""" + with pytest.raises(ValueError, match="can only be used on TEXT fields"): + translator.translate( + f"SELECT * FROM {basic_index} WHERE fuzzy(category, 'laptap')" + ) + + def test_fulltext_on_numeric_field_raises( + self, translator: Translator, basic_index: str + ): + """fulltext() on a NUMERIC field raises ValueError.""" + with pytest.raises(ValueError, match="can only be used on TEXT fields"): + translator.translate( + f"SELECT * FROM {basic_index} WHERE fulltext(price, 'laptop')" + ) + + def test_like_on_tag_field_raises(self, translator: Translator, basic_index: str): + """LIKE on a TAG field raises ValueError.""" + with pytest.raises(ValueError, match="can only be used on TEXT fields"): + translator.translate( + f"SELECT * FROM {basic_index} WHERE category LIKE '%phone%'" + ) + + +class TestTranslatorSuffixInfix: + """Tests for suffix and infix (contains) pattern matching. + + PostgreSQL analogy: LIKE '%term' and LIKE '%term%'. 
+ RediSearch uses *term and *term* respectively. + """ + + def test_suffix_match(self, translator: Translator, basic_index: str): + """LIKE '%phone' → suffix match @field:*phone.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE title LIKE '%phone'" + ) + assert "@title:*phone" in result.query_string + + def test_infix_match(self, translator: Translator, basic_index: str): + """LIKE '%phone%' → infix/contains match @field:*phone*.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE title LIKE '%phone%'" + ) + assert "@title:*phone*" in result.query_string + + def test_prefix_still_works(self, translator: Translator, basic_index: str): + """LIKE 'lap%' → prefix match @field:lap* (unchanged).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE title LIKE 'lap%'" + ) + assert "@title:lap*" in result.query_string + + +class TestTranslatorORInText: + """Tests for OR/union within text field searches. + + Inspired by PostgreSQL's to_tsquery('fat | rat') and + websearch_to_tsquery('fat OR rat') — natural OR syntax + maps to RediSearch's @field:(term1|term2). + """ + + def test_fulltext_or(self, translator: Translator, basic_index: str): + """fulltext(field, 'laptop OR tablet') → @field:(laptop|tablet).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fulltext(title, 'laptop OR tablet')" + ) + assert "@title:(laptop|tablet)" in result.query_string + + def test_fulltext_multiple_or(self, translator: Translator, basic_index: str): + """fulltext(field, 'a OR b OR c') → @field:(a|b|c).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fulltext(title, 'laptop OR tablet OR phone')" + ) + assert "@title:(laptop|tablet|phone)" in result.query_string + + +class TestTranslatorProximity: + """Tests for proximity search (slop + inorder). + + Inspired by PostgreSQL's phraseto_tsquery / FOLLOWED BY operator. 
+ Maps to RediSearch query attributes: => { $slop: N; $inorder: true; }. + """ + + def test_fulltext_with_slop(self, translator: Translator, basic_index: str): + """fulltext(field, 'gaming laptop', 2) → slop=2 query attribute.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fulltext(title, 'gaming laptop', 2)" + ) + assert "$slop: 2;" in result.query_string + + def test_fulltext_with_slop_and_inorder( + self, translator: Translator, basic_index: str + ): + """fulltext(field, 'gaming laptop', 2, true) → slop=2 + inorder.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE fulltext(title, 'gaming laptop', 2, true)" + ) + assert "$slop: 2;" in result.query_string + assert "$inorder: true;" in result.query_string + + +class TestTranslatorScoring: + """Tests for relevance scoring (WITHSCORES + SCORER). + + Inspired by PostgreSQL's ts_rank(vector, query) AS rank in SELECT. + Maps to RediSearch's WITHSCORES and SCORER flags on FT.SEARCH. + + SQL: SELECT name, score() AS relevance FROM idx WHERE fulltext(...) 
+ Redis: FT.SEARCH idx "@field:(term)" WITHSCORES SCORER BM25 + """ + + def test_score_default_bm25(self, translator: Translator, basic_index: str): + """score() in SELECT → WITHSCORES + SCORER BM25.""" + result = translator.translate( + f"SELECT title, score() AS relevance FROM {basic_index} WHERE fulltext(title, 'laptop')" + ) + assert "WITHSCORES" in result.args + assert "SCORER" in result.args + scorer_idx = result.args.index("SCORER") + assert result.args[scorer_idx + 1] == "BM25" + + def test_score_custom_scorer(self, translator: Translator, basic_index: str): + """score('TFIDF') in SELECT → WITHSCORES + SCORER TFIDF.""" + result = translator.translate( + f"SELECT title, score('TFIDF') AS relevance FROM {basic_index} WHERE fulltext(title, 'laptop')" + ) + assert "WITHSCORES" in result.args + scorer_idx = result.args.index("SCORER") + assert result.args[scorer_idx + 1] == "TFIDF" + + def test_score_custom_scorer_preserves_case( + self, translator: Translator, basic_index: str + ): + """score('MyScorer') preserves caller-provided casing.""" + result = translator.translate( + f"SELECT title, score('MyScorer') AS relevance FROM {basic_index} " + "WHERE fulltext(title, 'laptop')" + ) + scorer_idx = result.args.index("SCORER") + assert result.args[scorer_idx + 1] == "MyScorer" + + def test_duplicate_score_raises(self, translator: Translator, basic_index: str): + """Multiple score() expressions in the same query raise ValueError.""" + with pytest.raises(ValueError, match="Only one score"): + translator.translate( + f"SELECT score() AS s1, score('TFIDF') AS s2 FROM {basic_index} " + "WHERE fulltext(title, 'laptop')" + ) + + def test_no_score_no_withscores(self, translator: Translator, basic_index: str): + """Without score() → no WITHSCORES flag.""" + result = translator.translate( + f"SELECT title FROM {basic_index} WHERE fulltext(title, 'laptop')" + ) + assert "WITHSCORES" not in result.args + + def test_score_only_select_emits_return_0( + self, translator: 
Translator, basic_index: str + ): + """SELECT score() AS relevance (no other fields) → RETURN 0 to prevent payload leak.""" + result = translator.translate( + f"SELECT score() AS relevance FROM {basic_index} WHERE fulltext(title, 'laptop')" + ) + assert "RETURN" in result.args + ret_idx = result.args.index("RETURN") + assert result.args[ret_idx + 1] == "0" + assert "WITHSCORES" in result.args + + def test_score_with_aggregate_raises( + self, translator: Translator, basic_index: str + ): + """score() combined with GROUP BY (forces FT.AGGREGATE) raises ValueError.""" + with pytest.raises(ValueError, match="score.*not supported.*FT.AGGREGATE"): + translator.translate( + f"SELECT COUNT(*), score() AS relevance FROM {basic_index} " + "WHERE fulltext(title, 'laptop') GROUP BY category" + ) + + def test_score_too_many_args_raises(self, translator: Translator, basic_index: str): + """score() with more than one argument raises ValueError.""" + with pytest.raises(ValueError, match="at most one argument"): + translator.translate( + f"SELECT score('BM25', 'extra') AS relevance FROM {basic_index} " + "WHERE fulltext(title, 'laptop')" + ) + + def test_order_by_score_desc_omits_sortby( + self, translator: Translator, basic_index: str + ): + """ORDER BY score_alias DESC omits SORTBY (RediSearch sorts by relevance by default).""" + result = translator.translate( + f"SELECT title, score() AS relevance FROM {basic_index} " + "WHERE fulltext(title, 'laptop') ORDER BY relevance DESC" + ) + assert "WITHSCORES" in result.args + assert "SORTBY" not in result.args + + def test_order_by_score_asc_raises(self, translator: Translator, basic_index: str): + """ORDER BY score_alias ASC raises ValueError (not supported by RediSearch).""" + with pytest.raises(ValueError, match="ASC is not supported"): + translator.translate( + f"SELECT title, score() AS relevance FROM {basic_index} " + "WHERE fulltext(title, 'laptop') ORDER BY relevance ASC" + ) + + def 
test_order_by_real_field_with_score_still_works( + self, translator: Translator, basic_index: str + ): + """ORDER BY a real field (not score alias) still emits SORTBY.""" + result = translator.translate( + f"SELECT title, score() AS relevance FROM {basic_index} " + "WHERE fulltext(title, 'laptop') ORDER BY price DESC" + ) + assert "SORTBY" in result.args + idx = result.args.index("SORTBY") + assert result.args[idx + 1] == "price"