diff --git a/docs/api/query.rst b/docs/api/query.rst index c2ba04f9..9d65dc9b 100644 --- a/docs/api/query.rst +++ b/docs/api/query.rst @@ -47,6 +47,11 @@ HybridQuery :show-inheritance: :exclude-members: add_filter,get_args,highlight,return_field,summarize +.. note:: + The ``stopwords`` parameter in :class:`HybridQuery` (and :class:`AggregateHybridQuery`) controls query-time stopword filtering (client-side). + For index-level stopwords configuration (server-side), see :class:`redisvl.schema.IndexInfo.stopwords`. + Using query-time stopwords with index-level ``STOPWORDS 0`` is counterproductive. + TextQuery ================ @@ -61,6 +66,11 @@ TextQuery :show-inheritance: :exclude-members: add_filter,get_args,highlight,return_field,summarize +.. note:: + The ``stopwords`` parameter in :class:`TextQuery` controls query-time stopword filtering (client-side). + For index-level stopwords configuration (server-side), see :class:`redisvl.schema.IndexInfo.stopwords`. + Using query-time stopwords with index-level ``STOPWORDS 0`` is counterproductive. + FilterQuery =========== diff --git a/docs/api/schema.rst b/docs/api/schema.rst index 7f38d63a..c5b8ab68 100644 --- a/docs/api/schema.rst +++ b/docs/api/schema.rst @@ -31,6 +31,47 @@ IndexSchema :exclude-members: generate_fields,validate_and_create_fields,redis_fields +Index-Level Stopwords Configuration +==================================== + +The :class:`IndexInfo` class supports index-level stopwords configuration through +the ``stopwords`` field. This controls which words are filtered during indexing +(server-side), as opposed to query-time filtering (client-side). + +**Configuration Options:** + +- ``None`` (default): Use Redis default stopwords (a short built-in list of common English words) +- ``[]`` (empty list): Disable stopwords completely (``STOPWORDS 0``) +- Custom list: Specify your own stopwords (e.g., ``["the", "a", "an"]``) + +**Example:** + +.. 
code-block:: python + + from redisvl.schema import IndexSchema + + # Disable stopwords to search for phrases like "Bank of Glasberliner" + schema = IndexSchema.from_dict({ + "index": { + "name": "company-idx", + "prefix": "company", + "stopwords": [] # STOPWORDS 0 + }, + "fields": [ + {"name": "name", "type": "text"} + ] + }) + +**Important Notes:** + +- Index-level stopwords affect what gets indexed (server-side) +- Query-time stopwords (in :class:`TextQuery` and :class:`AggregateHybridQuery`) affect what gets searched (client-side) +- Using query-time stopwords with index-level ``STOPWORDS 0`` is counterproductive + +For detailed information about stopwords configuration and best practices, see the +Advanced Queries user guide (``docs/user_guide/11_advanced_queries.ipynb``). + + Defining Fields =============== diff --git a/docs/user_guide/11_advanced_queries.ipynb b/docs/user_guide/11_advanced_queries.ipynb index a8d56fdb..831857d7 100644 --- a/docs/user_guide/11_advanced_queries.ipynb +++ b/docs/user_guide/11_advanced_queries.ipynb @@ -30,8 +30,15 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:12.222169Z", + "iopub.status.busy": "2025-11-21T00:42:12.222058Z", + "iopub.status.idle": "2025-11-21T00:42:12.301776Z", + "shell.execute_reply": "2025-11-21T00:42:12.301163Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -117,8 +124,15 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:12.303593Z", + "iopub.status.busy": "2025-11-21T00:42:12.303450Z", + "iopub.status.idle": "2025-11-21T00:42:12.305709Z", + "shell.execute_reply": "2025-11-21T00:42:12.305407Z" + } + }, "outputs": [], "source": [ "schema = {\n", @@ -167,17 +181,16 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded 6 products into the index\n" - ] + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:12.306952Z", + "iopub.status.busy": "2025-11-21T00:42:12.306869Z", + "iopub.status.idle": "2025-11-21T00:42:12.416481Z", + "shell.execute_reply": "2025-11-21T00:42:12.415926Z" } - ], + }, + "outputs": [], "source": [ "from redisvl.index import SearchIndex\n", "\n", @@ -206,22 +219,16 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_descriptioncategoryprice
5.953989333038773prod_1comfortable running shoes for athletesfootwear89.99
2.085315593627535prod_5basketball shoes with excellent ankle supportfootwear139.99
2.0410082774474088prod_2lightweight running jacket with water resistanceouterwear129.99
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:12.433591Z", + "iopub.status.busy": "2025-11-21T00:42:12.433464Z", + "iopub.status.idle": "2025-11-21T00:42:13.709475Z", + "shell.execute_reply": "2025-11-21T00:42:13.708647Z" } - ], + }, + "outputs": [], "source": [ "from redisvl.query import TextQuery\n", "\n", @@ -248,29 +255,16 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results with BM25 scoring:\n" - ] - }, - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_descriptionprice
6.031534703977659prod_1comfortable running shoes for athletes89.99
2.085315593627535prod_5basketball shoes with excellent ankle support139.99
1.5268074873573214prod_4yoga mat with extra cushioning for comfort39.99
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.711396Z", + "iopub.status.busy": "2025-11-21T00:42:13.711221Z", + "iopub.status.idle": "2025-11-21T00:42:13.749216Z", + "shell.execute_reply": "2025-11-21T00:42:13.748398Z" } - ], + }, + "outputs": [], "source": [ "# BM25 standard scoring (default)\n", "bm25_query = TextQuery(\n", @@ -288,29 +282,16 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results with TFIDF scoring:\n" - ] - }, - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_descriptionprice
2.3333333333333335prod_1comfortable running shoes for athletes89.99
2.0prod_5basketball shoes with excellent ankle support139.99
1.0prod_4yoga mat with extra cushioning for comfort39.99
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.750799Z", + "iopub.status.busy": "2025-11-21T00:42:13.750686Z", + "iopub.status.idle": "2025-11-21T00:42:13.754896Z", + "shell.execute_reply": "2025-11-21T00:42:13.754345Z" } - ], + }, + "outputs": [], "source": [ "# TFIDF scoring\n", "tfidf_query = TextQuery(\n", @@ -337,22 +318,16 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_descriptioncategoryprice
3.9314935770863046prod_1comfortable running shoes for athletesfootwear89.99
3.1279733904413027prod_5basketball shoes with excellent ankle supportfootwear139.99
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.756368Z", + "iopub.status.busy": "2025-11-21T00:42:13.756224Z", + "iopub.status.idle": "2025-11-21T00:42:13.760388Z", + "shell.execute_reply": "2025-11-21T00:42:13.759844Z" } - ], + }, + "outputs": [], "source": [ "from redisvl.query.filter import Tag, Num\n", "\n", @@ -371,22 +346,16 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_descriptionprice
3.1541404034996914prod_1comfortable running shoes for athletes89.99
1.5268074873573214prod_4yoga mat with extra cushioning for comfort39.99
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.761654Z", + "iopub.status.busy": "2025-11-21T00:42:13.761566Z", + "iopub.status.idle": "2025-11-21T00:42:13.765694Z", + "shell.execute_reply": "2025-11-21T00:42:13.765316Z" } - ], + }, + "outputs": [], "source": [ "# Search for products under $100\n", "price_filtered_query = TextQuery(\n", @@ -413,22 +382,16 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_description
5.035440025836444prod_1comfortable running shoes for athletes
2.085315593627535prod_5basketball shoes with excellent ankle support
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.767228Z", + "iopub.status.busy": "2025-11-21T00:42:13.767102Z", + "iopub.status.idle": "2025-11-21T00:42:13.771059Z", + "shell.execute_reply": "2025-11-21T00:42:13.770555Z" } - ], + }, + "outputs": [], "source": [ "weighted_query = TextQuery(\n", " text=\"shoes\",\n", @@ -452,22 +415,16 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_description
5.953989333038773prod_1comfortable running shoes for athletes
2.085315593627535prod_5basketball shoes with excellent ankle support
2.0410082774474088prod_2lightweight running jacket with water resistance
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.772513Z", + "iopub.status.busy": "2025-11-21T00:42:13.772419Z", + "iopub.status.idle": "2025-11-21T00:42:13.776286Z", + "shell.execute_reply": "2025-11-21T00:42:13.775861Z" } - ], + }, + "outputs": [], "source": [ "# Use English stopwords (default)\n", "query_with_stopwords = TextQuery(\n", @@ -484,22 +441,16 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_description
3.1541404034996914prod_1comfortable running shoes for athletes
3.0864038416103prod_3professional tennis racket for competitive players
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.777294Z", + "iopub.status.busy": "2025-11-21T00:42:13.777220Z", + "iopub.status.idle": "2025-11-21T00:42:13.781329Z", + "shell.execute_reply": "2025-11-21T00:42:13.780713Z" } - ], + }, + "outputs": [], "source": [ "# Use custom stopwords\n", "custom_stopwords_query = TextQuery(\n", @@ -516,22 +467,16 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_description
5.953989333038773prod_1comfortable running shoes for athletes
2.085315593627535prod_5basketball shoes with excellent ankle support
2.0410082774474088prod_2lightweight running jacket with water resistance
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.782401Z", + "iopub.status.busy": "2025-11-21T00:42:13.782323Z", + "iopub.status.idle": "2025-11-21T00:42:13.787197Z", + "shell.execute_reply": "2025-11-21T00:42:13.786617Z" } - ], + }, + "outputs": [], "source": [ "# No stopwords\n", "no_stopwords_query = TextQuery(\n", @@ -569,29 +514,222 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Basic Aggregate Hybrid Query\n", + "### Index-Level Stopwords Configuration\n", "\n", - "Let's search for \"running\" with both text and semantic search:" + "The previous example showed **query-time stopwords** using `TextQuery.stopwords`, which filters words from the query before searching. RedisVL also supports **index-level stopwords** configuration, which determines which words are indexed in the first place.\n", + "\n", + "**Key Difference:**\n", + "- **Query-time stopwords** (`TextQuery.stopwords`): Filters words from your search query (client-side)\n", + "- **Index-level stopwords** (`IndexInfo.stopwords`): Controls which words get indexed in Redis (server-side)\n", + "\n", + "**Three Configuration Modes:**\n", + "\n", + "1. **`None` (default)**: Use Redis's default stopwords list\n", + "2. **`[]` (empty list)**: Disable stopwords completely (`STOPWORDS 0` in FT.CREATE)\n", + "3. 
**`[\"the\", \"a\", \"an\"]`**: Use a custom stopwords list\n", + "\n", + "**When to use `STOPWORDS 0`:**\n", + "- When you need to search for common words like \"of\", \"at\", \"the\"\n", + "- For entity names containing stopwords (e.g., \"Bank of Glasberliner\", \"University of Glasberliner\")\n", + "- When working with structured data where every word matters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.788835Z", + "iopub.status.busy": "2025-11-21T00:42:13.788717Z", + "iopub.status.idle": "2025-11-21T00:42:13.795247Z", + "shell.execute_reply": "2025-11-21T00:42:13.794662Z" + } + }, + "outputs": [], + "source": [ + "# Create a schema with index-level stopwords disabled\n", + "from redisvl.index import SearchIndex\n", + "\n", + "stopwords_schema = {\n", + " \"index\": {\n", + " \"name\": \"company_index\",\n", + " \"prefix\": \"company:\",\n", + " \"storage_type\": \"hash\",\n", + " \"stopwords\": [] # STOPWORDS 0 - disable stopwords completely\n", + " },\n", + " \"fields\": [\n", + " {\"name\": \"company_name\", \"type\": \"text\"},\n", + " {\"name\": \"description\", \"type\": \"text\"}\n", + " ]\n", + "}\n", + "\n", + "# Create index using from_dict (handles schema creation internally)\n", + "company_index = SearchIndex.from_dict(stopwords_schema, redis_url=\"redis://localhost:6379\")\n", + "company_index.create(overwrite=True, drop=True)\n", + "\n", + "print(f\"Index created with STOPWORDS 0: {company_index}\")" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.796880Z", + "iopub.status.busy": "2025-11-21T00:42:13.796745Z", + "iopub.status.idle": "2025-11-21T00:42:13.802750Z", + "shell.execute_reply": "2025-11-21T00:42:13.802098Z" + } + }, + "outputs": [], + "source": [ + "# Load sample data with company names containing common stopwords\n", + "companies = 
[\n", + " {\"company_name\": \"Bank of Glasberliner\", \"description\": \"Major financial institution\"},\n", + " {\"company_name\": \"University of Glasberliner\", \"description\": \"Public university system\"},\n", + " {\"company_name\": \"Department of Glasberliner Affairs\", \"description\": \"A government agency\"},\n", + " {\"company_name\": \"Glasberliner FC\", \"description\": \"Football Club\"},\n", + " {\"company_name\": \"The Home Market\", \"description\": \"Home improvement retailer\"},\n", + "]\n", + "\n", + "for i, company in enumerate(companies):\n", + " company_index.load([company], keys=[f\"company:{i}\"])\n", + "\n", + "print(f\"✓ Loaded {len(companies)} companies\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.804059Z", + "iopub.status.busy": "2025-11-21T00:42:13.803942Z", + "iopub.status.idle": "2025-11-21T00:42:13.807026Z", + "shell.execute_reply": "2025-11-21T00:42:13.806491Z" + } + }, + "outputs": [], + "source": [ + "# Search for \"Bank of Glasberliner\" - with STOPWORDS 0, \"of\" is indexed and searchable\n", + "from redisvl.query import FilterQuery\n", + "\n", + "query = FilterQuery(\n", + " filter_expression='@company_name:(Bank of Glasberliner)',\n", + " return_fields=[\"company_name\", \"description\"],\n", + ")\n", + "\n", + "results = company_index.search(query.query, query_params=query.params)\n", + "\n", + "print(f\"Found {len(results.docs)} results for 'Bank of Glasberliner':\")\n", + "for doc in results.docs:\n", + " print(f\" - {doc.company_name}: {doc.description}\")" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
vector_distanceproduct_idbrief_descriptioncategorypricevector_similaritytext_scorehybrid_score
5.96046447754e-08prod_1comfortable running shoes for athletesfootwear89.990.9999999701985.953989333042.48619677905
0.00985252857208prod_5basketball shoes with excellent ankle supportfootwear139.990.9950737357142.085315593631.32214629309
0.00985252857208prod_2lightweight running jacket with water resistanceouterwear129.990.9950737357142.041008277451.30885409823
0.0038834810257prod_4yoga mat with extra cushioning for comfortaccessories39.990.99805825948700.698640781641
0.236237406731prod_6swimming goggles with anti-fog coatingaccessories24.990.88188129663500.617316907644
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "source": [ + "**Comparison: With vs Without Stopwords**\n", + "\n", + "If we had used the default stopwords (not specifying `stopwords` in the schema), the word \"of\" would be filtered out during indexing. This means:\n", + "\n", + "- ❌ Searching for `\"Bank of Glasberliner\"` might not find exact matches\n", + "- ❌ The phrase would be indexed as `\"Bank Berlin\"` (without \"of\")\n", + "- ✅ With `STOPWORDS 0`, all words including \"of\" are indexed\n", + "\n", + "**Custom Stopwords Example:**\n", + "\n", + "You can also provide a custom list of stopwords:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.808543Z", + "iopub.status.busy": "2025-11-21T00:42:13.808418Z", + "iopub.status.idle": "2025-11-21T00:42:13.810612Z", + "shell.execute_reply": "2025-11-21T00:42:13.810083Z" } + }, + "outputs": [], + "source": [ + "# Example: Create index with custom stopwords\n", + "custom_stopwords_schema = {\n", + " \"index\": {\n", + " \"name\": \"custom_stopwords_index\",\n", + " \"prefix\": \"custom:\",\n", + " \"stopwords\": [\"inc\", \"llc\", \"corp\"] # Filter out legal entity suffixes\n", + " },\n", + " \"fields\": [\n", + " {\"name\": \"name\", \"type\": \"text\"}\n", + " ]\n", + "}\n", + "\n", + "# This would create an index where \"inc\", \"llc\", \"corp\" are not indexed\n", + "print(\"Custom stopwords:\", custom_stopwords_schema[\"index\"][\"stopwords\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**YAML Format:**\n", + "\n", + "You can also define stopwords in YAML schema files:\n", + "\n", + "```yaml\n", + "version: '0.1.0'\n", + "\n", + "index:\n", + " name: company_index\n", + " prefix: company:\n", + " storage_type: hash\n", + " stopwords: [] # Disable stopwords (STOPWORDS 0)\n", + "\n", + "fields:\n", + " - name: company_name\n", + " type: 
text\n", + " - name: description\n", + " type: text\n", + "```\n", + "\n", + "Or with custom stopwords:\n", + "\n", + "```yaml\n", + "index:\n", + " stopwords:\n", + " - the\n", + " - a\n", + " - an\n", + "```" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Cleanup\n", + "company_index.delete(drop=True)\n", + "print(\"✓ Cleaned up company_index\")" ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic Aggregate Hybrid Query\n", + "\n", + "Let's search for \"running\" with both text and semantic search:" + ] + }, + { + "cell_type": "code", + "metadata": {}, "source": [ "from redisvl.query import AggregateHybridQuery\n", "\n", @@ -607,7 +745,9 @@ "\n", "results = index.query(hybrid_query)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -623,29 +763,7 @@ }, { "cell_type": "code", - "execution_count": 38, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results with alpha=0.9 (vector-heavy):\n" - ] - }, - { - "data": { - "text/html": [ - "
vector_distanceproduct_idbrief_descriptionvector_similaritytext_scorehybrid_score
-1.19209289551e-07prod_4yoga mat with extra cushioning for comfort1.00000005961.526807487361.05268080238
0.00136888027191prod_5basketball shoes with excellent ankle support0.99931555986400.899384003878
0.00136888027191prod_2lightweight running jacket with water resistance0.99931555986400.899384003878
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "# More emphasis on vector search (alpha=0.9)\n", "vector_heavy_query = AggregateHybridQuery(\n", @@ -661,7 +779,9 @@ "print(\"Results with alpha=0.9 (vector-heavy):\")\n", "results = index.query(vector_heavy_query)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -674,22 +794,7 @@ }, { "cell_type": "code", - "execution_count": 39, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
vector_distanceproduct_idbrief_descriptioncategorypricevector_similaritytext_scorehybrid_score
-1.19209289551e-07prod_3professional tennis racket for competitive playersequipment199.991.00000005963.086403841611.62592119421
0.411657452583prod_5basketball shoes with excellent ankle supportfootwear139.990.79417127370800.555919891596
0.411657452583prod_2lightweight running jacket with water resistanceouterwear129.990.79417127370800.555919891596
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "# Hybrid search with a price filter\n", "filtered_hybrid_query = AggregateHybridQuery(\n", @@ -704,7 +809,9 @@ "\n", "results = index.query(filtered_hybrid_query)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -717,22 +824,7 @@ }, { "cell_type": "code", - "execution_count": 40, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
vector_distanceproduct_idbrief_descriptionvector_similaritytext_scorehybrid_score
0prod_5basketball shoes with excellent ankle support152.2
0prod_2lightweight running jacket with water resistance100.7
0.00136888027191prod_4yoga mat with extra cushioning for comfort0.99931555986400.699520891905
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "# Aggregate Hybrid query with TFIDF scorer\n", "hybrid_tfidf = AggregateHybridQuery(\n", @@ -747,7 +839,9 @@ "\n", "results = index.query(hybrid_tfidf)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -775,22 +869,7 @@ }, { "cell_type": "code", - "execution_count": 41, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
distance_0distance_1product_idbrief_descriptioncategoryscore_0score_1combined_score
5.96046447754e-085.96046447754e-08prod_1comfortable running shoes for athletesfootwear0.9999999701980.9999999701980.999999970198
0.009852528572080.00266629457474prod_5basketball shoes with excellent ankle supportfootwear0.9950737357140.9986668527130.996151670814
0.009852528572080.0118260979652prod_2lightweight running jacket with water resistanceouterwear0.9950737357140.9940869510170.994777700305
0.00388348102570.210647821426prod_4yoga mat with extra cushioning for comfortaccessories0.9980582594870.8946760892870.967043608427
0.2362374067310.639005899429prod_6swimming goggles with anti-fog coatingaccessories0.8818812966350.6804970502850.82146602273
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "from redisvl.query import MultiVectorQuery, Vector\n", "\n", @@ -818,7 +897,9 @@ "\n", "results = index.query(multi_vector_query)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -831,29 +912,7 @@ }, { "cell_type": "code", - "execution_count": 42, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results with emphasis on image similarity:\n" - ] - }, - { - "data": { - "text/html": [ - "
distance_0distance_1product_idbrief_descriptioncategoryscore_0score_1combined_score
-1.19209289551e-070prod_3professional tennis racket for competitive playersequipment1.000000059611.00000001192
0.145393729210.00900757312775prod_6swimming goggles with anti-fog coatingaccessories0.9273031353950.9954962134360.981857597828
0.4366961717610.219131231308prod_4yoga mat with extra cushioning for comfortaccessories0.781651914120.8904343843460.868677890301
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "# More emphasis on image similarity\n", "text_vec = Vector(\n", @@ -879,7 +938,9 @@ "print(\"Results with emphasis on image similarity:\")\n", "results = index.query(image_heavy_query)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -892,22 +953,7 @@ }, { "cell_type": "code", - "execution_count": 43, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
distance_0distance_1product_idbrief_descriptioncategorypricescore_0score_1combined_score
5.96046447754e-085.96046447754e-08prod_1comfortable running shoes for athletesfootwear89.990.9999999701980.9999999701980.999999970198
0.009852528572080.00266629457474prod_5basketball shoes with excellent ankle supportfootwear139.990.9950737357140.9986668527130.996510982513
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "# Multi-vector search with category filter\n", "text_vec = Vector(\n", @@ -933,7 +979,9 @@ "\n", "results = index.query(filtered_multi_query)\n", "result_print(results)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -946,36 +994,7 @@ }, { "cell_type": "code", - "execution_count": 44, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TextQuery Results (keyword-based):\n" - ] - }, - { - "data": { - "text/html": [ - "
scoreproduct_idbrief_description
2.8773943004779676prod_1comfortable running shoes for athletes
2.085315593627535prod_5basketball shoes with excellent ankle support
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], "source": [ "# TextQuery - keyword-based search\n", "text_q = TextQuery(\n", @@ -988,40 +1007,22 @@ "print(\"TextQuery Results (keyword-based):\")\n", "result_print(index.query(text_q))\n", "print()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "AggregateHybridQuery Results (text + vector):\n" - ] - }, - { - "data": { - "text/html": [ - "
vector_distanceproduct_idbrief_descriptionvector_similaritytext_scorehybrid_score
5.96046447754e-08prod_1comfortable running shoes for athletes0.9999999701982.877394300481.56321826928
0.0038834810257prod_4yoga mat with extra cushioning for comfort0.99805825948700.698640781641
0.00985252857208prod_2lightweight running jacket with water resistance0.99507373571400.696551615
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.860414Z", + "iopub.status.busy": "2025-11-21T00:42:13.860347Z", + "iopub.status.idle": "2025-11-21T00:42:13.864887Z", + "shell.execute_reply": "2025-11-21T00:42:13.864461Z" } - ], + }, + "outputs": [], "source": [ "# AggregateHybridQuery - combines text and vector search\n", "hybrid_q = AggregateHybridQuery(\n", @@ -1040,29 +1041,16 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MultiVectorQuery Results (multiple vectors):\n" - ] - }, - { - "data": { - "text/html": [ - "
distance_0distance_1product_idbrief_descriptionscore_0score_1combined_score
5.96046447754e-085.96046447754e-08prod_1comfortable running shoes for athletes0.9999999701980.9999999701980.999999970198
0.009852528572080.00266629457474prod_5basketball shoes with excellent ankle support0.9950737357140.9986668527130.996870294213
0.009852528572080.0118260979652prod_2lightweight running jacket with water resistance0.9950737357140.9940869510170.994580343366
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-11-21T00:42:13.865922Z", + "iopub.status.busy": "2025-11-21T00:42:13.865857Z", + "iopub.status.idle": "2025-11-21T00:42:13.869441Z", + "shell.execute_reply": "2025-11-21T00:42:13.868990Z" } - ], + }, + "outputs": [], "source": [ "# MultiVectorQuery - searches multiple vector fields\n", "mv_text = Vector(\n", @@ -1118,13 +1106,13 @@ }, { "cell_type": "code", - "execution_count": 47, "metadata": {}, - "outputs": [], "source": [ "# Cleanup\n", "index.delete()" - ] + ], + "outputs": [], + "execution_count": null } ], "metadata": { @@ -1142,8 +1130,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.0" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/redisvl/index/index.py b/redisvl/index/index.py index 3865879d..dcfaaab6 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -79,7 +79,9 @@ BaseVectorQuery, CountQuery, FilterQuery, + TextQuery, ) +from redisvl.query.aggregate import AggregateHybridQuery from redisvl.query.filter import FilterExpression from redisvl.redis.connection import ( RedisConnectionFactory, @@ -248,6 +250,34 @@ def _validate_query(self, query: BaseQuery) -> None: "Vector field using 'flat' algorithm does not support EF_RUNTIME query parameter." ) + # Warn if using query-time stopwords with index-level STOPWORDS 0 + if isinstance(query, (TextQuery, AggregateHybridQuery)): + index_stopwords = self.schema.index.stopwords + query_stopwords = query.stopwords + + # Check if index has STOPWORDS 0 (empty list) and query has stopwords configured + # Note: query.stopwords is a set, and when any falsy value (None, False, '', 0, [], etc.) + # is passed to TextQuery/AggregateHybridQuery, it becomes an empty set. So we check if the set is non-empty. 
+ if ( + index_stopwords is not None + and len(index_stopwords) == 0 + and len(query_stopwords) > 0 + ): + query_type = ( + "TextQuery" + if isinstance(query, TextQuery) + else "AggregateHybridQuery" + ) + warnings.warn( + f"Query-time stopwords are configured but the index has STOPWORDS 0 (stopwords = []). " + "This is counterproductive: all words including common words like 'of', 'the', 'a' are indexed, " + "but your query-time stopwords will filter them from the search query. " + "This makes your search less precise than it could be. " + f"Consider setting stopwords=None (or any falsy value) in {query_type} to search for all indexed words.", + UserWarning, + stacklevel=3, + ) + @property def name(self) -> str: """The name of the Redis search index.""" @@ -601,17 +631,22 @@ def create(self, overwrite: bool = False, drop: bool = False) -> None: definition = IndexDefinition( prefix=[self.schema.index.prefix], index_type=self._storage.type ) + # Extract stopwords from schema + stopwords = self.schema.index.stopwords + if isinstance(self._redis_client, RedisCluster): cluster_create_index( index_name=self.name, client=self._redis_client, fields=redis_fields, definition=definition, + stopwords=stopwords, ) else: self._redis_client.ft(self.name).create_index( fields=redis_fields, definition=definition, + stopwords=stopwords, ) except redis.exceptions.RedisError as e: raise RedisSearchError( @@ -1384,17 +1419,22 @@ async def create(self, overwrite: bool = False, drop: bool = False) -> None: definition = IndexDefinition( prefix=[self.schema.index.prefix], index_type=self._storage.type ) + # Extract stopwords from schema + stopwords = self.schema.index.stopwords + if isinstance(client, AsyncRedisCluster): await async_cluster_create_index( index_name=self.schema.index.name, client=client, fields=redis_fields, definition=definition, + stopwords=stopwords, ) else: await client.ft(self.schema.index.name).create_index( fields=redis_fields, definition=definition, + 
stopwords=stopwords, ) except redis.exceptions.RedisError as e: raise RedisSearchError( diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index 89371849..299de0ce 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -128,6 +128,10 @@ def __init__( provided then a default set of stopwords for that language will be used. if a list, set, or tuple of strings is provided then those will be used as stopwords. Defaults to "english". if set to "None" then no stopwords will be removed. + + Note: This parameter controls query-time stopword filtering (client-side). + For index-level stopwords configuration (server-side), see IndexInfo.stopwords. + Using query-time stopwords with index-level STOPWORDS 0 is counterproductive. dialect (int, optional): The Redis dialect version. Defaults to 2. text_weights (Optional[Dict[str, float]]): The importance weighting of individual words within the query text. Defaults to None, as no modifications will be made to the diff --git a/redisvl/query/query.py b/redisvl/query/query.py index d7443584..1237c07f 100644 --- a/redisvl/query/query.py +++ b/redisvl/query/query.py @@ -1061,10 +1061,14 @@ def __init__( params (Optional[Dict[str, Any]], optional): The parameters for the query. Defaults to None. stopwords (Optional[Union[str, Set[str]]): The set of stop words to remove - from the query text. If a language like 'english' or 'spanish' is provided + from the query text (client-side filtering). If a language like 'english' or 'spanish' is provided a default set of stopwords for that language will be used. Users may specify their own stop words by providing a List or Set of words. if set to None, then no words will be removed. Defaults to 'english'. + + Note: This parameter controls query-time stopword filtering (client-side). + For index-level stopwords configuration (server-side), see IndexInfo.stopwords. + Using query-time stopwords with index-level STOPWORDS 0 is counterproductive. 
text_weights (Optional[Dict[str, float]]): The importance weighting of individual words within the query text. Defaults to None, as no modifications will be made to the text_scorer score. diff --git a/redisvl/redis/connection.py b/redisvl/redis/connection.py index 7b5951d5..6d8ff96f 100644 --- a/redisvl/redis/connection.py +++ b/redisvl/redis/connection.py @@ -204,6 +204,17 @@ def convert_index_info_to_schema(index_info: Dict[str, Any]) -> Dict[str, Any]: prefixes = prefixes[0] storage_type = index_info["index_definition"][1].lower() + # Parse stopwords if present in FT.INFO output + # stopwords_list is only present when explicitly set (STOPWORDS 0 or custom list) + # If not present, we use None to indicate default Redis behavior + stopwords = None + if "stopwords_list" in index_info: + # Convert bytes to strings if needed + stopwords_list = index_info["stopwords_list"] + stopwords = [ + sw.decode("utf-8") if isinstance(sw, bytes) else sw for sw in stopwords_list + ] + index_fields = index_info["attributes"] def parse_vector_attrs(attrs): @@ -411,8 +422,12 @@ def parse_attrs(attrs, field_type=None): # append field schema_fields.append(field) + index_dict = {"name": index_name, "prefix": prefixes, "storage_type": storage_type} + if stopwords is not None: + index_dict["stopwords"] = stopwords + return { - "index": {"name": index_name, "prefix": prefixes, "storage_type": storage_type}, + "index": index_dict, "fields": schema_fields, } diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index c97d9708..8a9ec974 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -1,8 +1,7 @@ -import re from collections.abc import Mapping, Sequence from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Literal, Union +from typing import Any, Dict, List, Literal, Optional, Union import yaml from pydantic import BaseModel, Field, model_validator @@ -31,7 +30,7 @@ class StorageType(Enum): class IndexInfo(BaseModel): """Index 
info includes the essential details regarding index settings, - such as its name, prefix, key separator, and storage type in Redis. + such as its name, prefix, key separator, storage type, and stopwords in Redis. In yaml format, the index info section looks like: @@ -42,6 +41,7 @@ class IndexInfo(BaseModel): prefix: user key_separtor: ':' storage_type: json + stopwords: [] # Disable stopwords (STOPWORDS 0) In dict format, the index info section looks like: @@ -51,7 +51,8 @@ class IndexInfo(BaseModel): "name": "user-index", "prefix": "user", "key_separator": ":", - "storage_type": "json" + "storage_type": "json", + "stopwords": ["the", "a", "an"] # Custom stopwords }} """ @@ -64,6 +65,9 @@ class IndexInfo(BaseModel): """The separator character used in designing Redis keys.""" storage_type: StorageType = StorageType.HASH """The storage type used in Redis (e.g., 'hash' or 'json').""" + stopwords: Optional[List[str]] = None + """Index-level stopwords configuration. None (default) uses Redis default stopwords, + empty list [] disables stopwords (STOPWORDS 0), or provide a custom list of stopwords.""" class IndexSchema(BaseModel): diff --git a/tests/integration/test_stopwords_integration.py b/tests/integration/test_stopwords_integration.py new file mode 100644 index 00000000..14ebc742 --- /dev/null +++ b/tests/integration/test_stopwords_integration.py @@ -0,0 +1,192 @@ +"""Integration tests for stopwords support.""" + +import pytest + +from redisvl.index import SearchIndex +from redisvl.query import FilterQuery +from redisvl.schema import IndexSchema + + +@pytest.fixture +def stopwords_disabled_schema(): + """Schema with stopwords disabled (STOPWORDS 0).""" + return { + "index": { + "name": "test_stopwords_disabled", + "prefix": "test_sw_disabled:", + "storage_type": "hash", + "stopwords": [], # STOPWORDS 0 + }, + "fields": [ + {"name": "title", "type": "text"}, + {"name": "description", "type": "text"}, + ], + } + + +@pytest.fixture +def custom_stopwords_schema(): + 
"""Schema with custom stopwords list.""" + return { + "index": { + "name": "test_custom_stopwords", + "prefix": "test_sw_custom:", + "storage_type": "hash", + "stopwords": ["the", "a", "an"], + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + +@pytest.fixture +def default_stopwords_schema(): + """Schema with default stopwords (no stopwords field).""" + return { + "index": { + "name": "test_default_stopwords", + "prefix": "test_sw_default:", + "storage_type": "hash", + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + +@pytest.fixture +def stopwords_disabled_index(client, stopwords_disabled_schema): + """Index fixture with stopwords disabled.""" + schema = IndexSchema.from_dict(stopwords_disabled_schema) + index = SearchIndex(schema, redis_client=client) + index.create(overwrite=True, drop=True) + + yield index + + index.delete(drop=True) + + +@pytest.fixture +def custom_stopwords_index(client, custom_stopwords_schema): + """Index fixture with custom stopwords.""" + schema = IndexSchema.from_dict(custom_stopwords_schema) + index = SearchIndex(schema, redis_client=client) + index.create(overwrite=True, drop=True) + + yield index + + index.delete(drop=True) + + +@pytest.fixture +def default_stopwords_index(client, default_stopwords_schema): + """Index fixture with default stopwords.""" + schema = IndexSchema.from_dict(default_stopwords_schema) + index = SearchIndex(schema, redis_client=client) + index.create(overwrite=True, drop=True) + + yield index + + index.delete(drop=True) + + +def test_create_index_with_stopwords_disabled(client, stopwords_disabled_index): + """Test creating an index with STOPWORDS 0.""" + # Verify index was created + assert stopwords_disabled_index.exists() + + # Get FT.INFO and verify stopwords_list is empty + info = client.ft(stopwords_disabled_index.name).info() + assert "stopwords_list" in info + assert info["stopwords_list"] == [] + + +def test_create_index_with_custom_stopwords(client, 
custom_stopwords_index): + """Test creating an index with custom stopwords list.""" + # Verify index was created + assert custom_stopwords_index.exists() + + # Get FT.INFO and verify stopwords_list matches + info = client.ft(custom_stopwords_index.name).info() + assert "stopwords_list" in info + + # Convert bytes to strings for comparison + stopwords_list = [ + sw.decode("utf-8") if isinstance(sw, bytes) else sw + for sw in info["stopwords_list"] + ] + assert set(stopwords_list) == {"the", "a", "an"} + + +def test_create_index_with_default_stopwords(default_stopwords_index): + """Test creating an index with default stopwords (no STOPWORDS clause).""" + # Verify index was created + assert default_stopwords_index.exists() + + # When no STOPWORDS clause is used, Redis doesn't include stopwords_list in FT.INFO + # (or it may include the default list depending on Redis version) + # We just verify the index was created successfully with default behavior + + +def test_from_existing_preserves_stopwords_disabled(client, stopwords_disabled_index): + """Test that from_existing() correctly reconstructs stopwords=[] configuration.""" + # Reconstruct from existing + reconstructed_index = SearchIndex.from_existing( + stopwords_disabled_index.name, redis_client=client + ) + + # Verify stopwords configuration was preserved + assert reconstructed_index.schema.index.stopwords == [] + + +def test_from_existing_preserves_custom_stopwords(client, custom_stopwords_index): + """Test that from_existing() correctly reconstructs custom stopwords configuration.""" + # Reconstruct from existing + reconstructed_index = SearchIndex.from_existing( + custom_stopwords_index.name, redis_client=client + ) + + # Verify stopwords configuration was preserved + assert set(reconstructed_index.schema.index.stopwords) == {"the", "a", "an"} + + +def test_from_existing_default_stopwords(client, default_stopwords_index): + """Test that from_existing() handles default stopwords (no stopwords_list in 
FT.INFO).""" + # Reconstruct from existing + reconstructed_index = SearchIndex.from_existing( + default_stopwords_index.name, redis_client=client + ) + + # Verify stopwords is None (default behavior) + assert reconstructed_index.schema.index.stopwords is None + + +def test_stopwords_disabled_allows_searching_common_words( + client, stopwords_disabled_index +): + """Test that STOPWORDS 0 allows searching for common stopwords like 'the', 'a', 'of'.""" + # Add test data with common stopwords + test_data = [ + {"title": "Bank of Glasberliner", "description": "A major bank"}, + {"title": "The Great Gatsby", "description": "A classic novel"}, + { + "title": "An Introduction to Python", + "description": "A programming guide", + }, + ] + + for i, data in enumerate(test_data): + key = f"test_sw_disabled:{i}" + client.hset(key, mapping=data) + + # Search for "of" - should find "Bank of Glasberliner" + query = FilterQuery( + filter_expression="@title:(of)", + return_fields=["title"], + ) + results = stopwords_disabled_index.search(query.query, query_params=query.params) + + # With STOPWORDS 0, "of" should be indexed and searchable + assert len(results.docs) > 0 + assert any("of" in doc.title.lower() for doc in results.docs) diff --git a/tests/unit/test_convert_index_info.py b/tests/unit/test_convert_index_info.py index c4cf0db1..2a4dc36d 100644 --- a/tests/unit/test_convert_index_info.py +++ b/tests/unit/test_convert_index_info.py @@ -1,7 +1,5 @@ """Unit tests for convert_index_info_to_schema function.""" -import pytest - from redisvl.redis.connection import convert_index_info_to_schema @@ -110,3 +108,67 @@ def test_convert_index_info_with_fields(): assert result["fields"][0]["type"] == "tag" assert result["fields"][1]["name"] == "text" assert result["fields"][1]["type"] == "text" + + +def test_convert_index_info_stopwords_disabled(): + """Test converting index info with STOPWORDS 0 (disabled stopwords).""" + index_info = { + "index_name": "test_stopwords_disabled", + 
"index_definition": [ + "key_type", + "HASH", + "prefixes", + ["test_sw:"], + ], + "attributes": [], + "stopwords_list": [], # STOPWORDS 0 + } + + result = convert_index_info_to_schema(index_info) + + assert result["index"]["name"] == "test_stopwords_disabled" + assert result["index"]["stopwords"] == [] + + +def test_convert_index_info_custom_stopwords(): + """Test converting index info with custom stopwords list.""" + index_info = { + "index_name": "test_custom_stopwords", + "index_definition": [ + "key_type", + "HASH", + "prefixes", + ["test_csw:"], + ], + "attributes": [], + "stopwords_list": [b"the", b"a", b"an"], # Custom stopwords (as bytes) + } + + result = convert_index_info_to_schema(index_info) + + assert result["index"]["name"] == "test_custom_stopwords" + assert result["index"]["stopwords"] == ["the", "a", "an"] + + +def test_convert_index_info_default_stopwords(): + """Test converting index info with default stopwords (no stopwords_list key). + + When no STOPWORDS clause is specified in FT.CREATE, Redis uses its default + stopwords list, and FT.INFO does not include a stopwords_list key. 
+ """ + index_info = { + "index_name": "test_default_stopwords", + "index_definition": [ + "key_type", + "HASH", + "prefixes", + ["test_dsw:"], + ], + "attributes": [], + # No stopwords_list key - indicates default behavior + } + + result = convert_index_info_to_schema(index_info) + + assert result["index"]["name"] == "test_default_stopwords" + assert "stopwords" not in result["index"] # Should not be present diff --git a/tests/unit/test_field_modifier_ordering.py b/tests/unit/test_field_modifier_ordering.py index fad097fc..8f77a610 100644 --- a/tests/unit/test_field_modifier_ordering.py +++ b/tests/unit/test_field_modifier_ordering.py @@ -309,11 +309,11 @@ def test_empty_suffix(self): assert field.args_suffix == [] -class TestMLPCommandsScenario: - """Test the exact scenario from mlp_commands.txt.""" +class TestFieldModifierScenario: + """Test field modifier ordering scenario.""" def test_work_experience_summary_field(self): - """Test TextField with INDEXMISSING SORTABLE UNF (mlp_commands.txt scenario).""" + """Test TextField with INDEXMISSING SORTABLE UNF (field modifier scenario).""" field = TextField( name="work_experience_summary", attrs={"index_missing": True, "sortable": True, "unf": True}, @@ -321,11 +321,11 @@ def test_work_experience_summary_field(self): redis_field = field.as_redis_field() suffix = redis_field.args_suffix - # Verify exact order from mlp_commands.txt + # Verify exact order from field modifier requirements assert suffix == ["INDEXMISSING", "SORTABLE", "UNF"] - def test_mlp_scenario_redis_args(self): - """Test that redis_args() produces correct command for mlp_commands.txt scenario.""" + def test_field_modifier_scenario_redis_args(self): + """Test that redis_args() produces correct command for field modifier scenario.""" field = TextField( name="work_experience_summary", attrs={"index_missing": True, "sortable": True, "unf": True}, diff --git a/tests/unit/test_stopwords_schema.py b/tests/unit/test_stopwords_schema.py new file mode 100644 
index 00000000..87c807a1 --- /dev/null +++ b/tests/unit/test_stopwords_schema.py @@ -0,0 +1,202 @@ +"""Unit tests for stopwords support in IndexSchema.""" + +import tempfile + +import yaml + +from redisvl.schema import IndexSchema + + +def test_index_schema_stopwords_none_default(): + """Test IndexSchema with no stopwords specified (default behavior).""" + schema_dict = { + "index": { + "name": "test_index", + "prefix": "test", + "storage_type": "hash", + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + + assert schema.index.name == "test_index" + assert schema.index.stopwords is None # Default + + +def test_index_schema_stopwords_disabled(): + """Test IndexSchema with stopwords disabled (STOPWORDS 0).""" + schema_dict = { + "index": { + "name": "test_index", + "prefix": "test", + "storage_type": "hash", + "stopwords": [], # Empty list = STOPWORDS 0 + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + + assert schema.index.name == "test_index" + assert schema.index.stopwords == [] + + +def test_index_schema_custom_stopwords(): + """Test IndexSchema with custom stopwords list.""" + schema_dict = { + "index": { + "name": "test_index", + "prefix": "test", + "storage_type": "hash", + "stopwords": ["the", "a", "an"], + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + + assert schema.index.name == "test_index" + assert schema.index.stopwords == ["the", "a", "an"] + + +def test_index_schema_stopwords_from_yaml_disabled(): + """Test IndexSchema from YAML with stopwords disabled.""" + yaml_content = """ +version: '0.1.0' + +index: + name: test_yaml_index + prefix: test_yaml + storage_type: hash + stopwords: [] + +fields: + - name: title + type: text +""" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(yaml_content) + yaml_path = f.name + + 
try: + schema = IndexSchema.from_yaml(yaml_path) + assert schema.index.name == "test_yaml_index" + assert schema.index.stopwords == [] + finally: + import os + + os.unlink(yaml_path) + + +def test_index_schema_stopwords_from_yaml_custom(): + """Test IndexSchema from YAML with custom stopwords.""" + yaml_content = """ +version: '0.1.0' + +index: + name: test_yaml_index + prefix: test_yaml + storage_type: hash + stopwords: + - the + - a + - an + +fields: + - name: title + type: text +""" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(yaml_content) + yaml_path = f.name + + try: + schema = IndexSchema.from_yaml(yaml_path) + assert schema.index.name == "test_yaml_index" + assert schema.index.stopwords == ["the", "a", "an"] + finally: + import os + + os.unlink(yaml_path) + + +def test_index_schema_to_dict_preserves_stopwords(): + """Test that to_dict() preserves stopwords configuration.""" + schema_dict = { + "index": { + "name": "test_index", + "prefix": "test", + "storage_type": "hash", + "stopwords": ["the", "a"], + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + result_dict = schema.to_dict() + + assert result_dict["index"]["stopwords"] == ["the", "a"] + + +def test_index_schema_to_dict_omits_none_stopwords(): + """Test that to_dict() omits stopwords when None (default).""" + schema_dict = { + "index": { + "name": "test_index", + "prefix": "test", + "storage_type": "hash", + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + result_dict = schema.to_dict() + + # stopwords should not be in the dict when None (default behavior) + assert "stopwords" not in result_dict["index"] + + +def test_index_schema_to_yaml_preserves_stopwords(): + """Test that to_yaml() preserves stopwords configuration.""" + schema_dict = { + "index": { + "name": "test_index", + "prefix": "test", + "storage_type": "hash", + 
"stopwords": [], # STOPWORDS 0 + }, + "fields": [ + {"name": "title", "type": "text"}, + ], + } + + schema = IndexSchema.from_dict(schema_dict) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml_path = f.name + + try: + schema.to_yaml(yaml_path) + + # Read back and verify + with open(yaml_path, "r") as f: + yaml_data = yaml.safe_load(f) + + assert yaml_data["index"]["stopwords"] == [] + finally: + import os + + os.unlink(yaml_path)