Use fulltext search for products (#9344)

Does not support substring matches but gives us proper result ranking with different weights assigned to product names, attributes, and descriptions.

Also supports websearch expressions:

- `"foo bar"` for word distance
- `foo -bar` for excluding terms
- `foo OR bar` for alternatives

Co-authored-by: Filip Owczarek <filip.owczarek@saleor.io>
Co-authored-by: Filip Owczarek <filip.owczarek@saleor.io>
saleor · Apr 26, 2022 · 4b6f259 · 4b6f259
1 parent 170efdc
commit 4b6f259
Show file tree

Hide file tree

Showing 32 changed files with 389 additions and 443 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -49,6 +49,7 @@ All notable, unreleased changes to this project will be documented in this file.
 - Allow plugins to create their custom error code - #9300 by @LeOndaz
 
 #### Other
+- Use full-text search for products search API - #9344 by @patrys
 
 - Include required permission in mutations' descriptions - #9363 by @maarcingebala
 - Make GraphQL list items non-nullable - #9391 by @maarcingebala

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -118,6 +118,9 @@ documentation = "https://docs.saleor.io/"
   django-stubs = "1.8.0"
   pytest-socket = "^0.5.1"
   before_after = "^1.0.1"
+  types-certifi = "^2021.10.8"
+  types-freezegun = "^1.1.7"
+  types-six = "^1.16.12"
 
 [tool.black]
 target_version = [ "py35", "py36", "py37", "py38" ]

diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -201,10 +201,13 @@ toml==0.10.2; python_version >= "3.7" and python_full_version < "3.0.0" or pytho
 tomli==2.0.1; python_version < "3.11" and python_full_version >= "3.6.2" and python_version >= "3.7" and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7")
 tornado==6.1; python_version >= "3.7"
 tox==3.25.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
+types-certifi==2021.10.8.1
+types-freezegun==1.1.9
 types-pkg-resources==0.1.3
 types-python-dateutil==2.8.12
 types-pytz==2021.3.6
 types-requests==2.27.20
+types-six==1.16.15
 types-urllib3==1.26.13
 typing-extensions==4.2.0; python_version < "3.10" and python_full_version >= "3.6.2" and python_version >= "3.7"
 urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.7"

diff --git a/saleor/core/management/commands/update_search_indexes.py b/saleor/core/management/commands/update_search_indexes.py
@@ -1,8 +1,5 @@
 from django.core.management.base import BaseCommand
 
-from ....account.models import User
-from ....order.models import Order
-from ....product.models import Product
 from ...search_tasks import (
     set_order_search_document_values,
     set_product_search_document_values,
@@ -15,16 +12,13 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         # Update products
-        products_total_count = Product.objects.filter(search_document="").count()
-        self.stdout.write(f"Updating products: {products_total_count}")
+        self.stdout.write("Updating products")
         set_product_search_document_values.delay()
 
         # Update orders
-        orders_total_count = Order.objects.filter(search_document="").count()
-        self.stdout.write(f"Updating orders: {orders_total_count}")
+        self.stdout.write("Updating orders")
         set_order_search_document_values.delay()
 
         # Update users
-        users_total_count = User.objects.filter(search_document="").count()
-        self.stdout.write(f"Updating users: {users_total_count}")
+        self.stdout.write("Updating users")
         set_user_search_document_values.delay()
diff --git a/saleor/core/search_tasks.py b/saleor/core/search_tasks.py
@@ -10,7 +10,7 @@
 from ..product.models import Product
 from ..product.search import (
     PRODUCT_FIELDS_TO_PREFETCH,
-    prepare_product_search_document_value,
+    prepare_product_search_vector_value,
 )
 
 task_logger = get_task_logger(__name__)
@@ -82,7 +82,7 @@ def set_order_search_document_values(updated_count: int = 0) -> None:
 @app.task
 def set_product_search_document_values(updated_count: int = 0) -> None:
     products = list(
-        Product.objects.filter(search_document="")
+        Product.objects.filter(search_vector=None)
         .prefetch_related(*PRODUCT_FIELDS_TO_PREFETCH)[:BATCH_SIZE]
         .iterator()
     )
@@ -91,8 +91,9 @@ def set_product_search_document_values(updated_count: int = 0) -> None:
         task_logger.info("No products to update.")
         return
 
-    updated_count += set_search_document_values(
-        products, prepare_product_search_document_value
+    updated_count += set_search_vector_values(
+        products,
+        prepare_product_search_vector_value,
     )
 
     task_logger.info("Updated %d products", updated_count)
@@ -117,3 +118,17 @@ def set_search_document_values(instances: List, prepare_search_document_func):
     Model.objects.bulk_update(instances, ["search_document"])
 
     return len(instances)
+
+
+def set_search_vector_values(
+    instances,
+    prepare_search_vector_func,
+):
+    Model = instances[0]._meta.model
+    for instance in instances:
+        instance.search_vector = prepare_search_vector_func(
+            instance, already_prefetched=True
+        )
+    Model.objects.bulk_update(instances, ["search_vector"])
+
+    return len(instances)
diff --git a/saleor/core/tests/test_postgresql_search.py b/saleor/core/tests/test_postgresql_search.py
@@ -1,4 +1,6 @@
 import pytest
+from django.contrib.postgres.search import SearchVector
+from django.db.models import Value
 from django.utils.text import slugify
 
 from ...account.models import Address
@@ -24,6 +26,10 @@ def gen_product(name, description):
             product_type=product_type,
             category=category,
             search_document=f"{name}{description}",
+            search_vector=(
+                SearchVector(Value(name), weight="A")
+                + SearchVector(Value(description), weight="C")
+            ),
         )
         ProductChannelListing.objects.create(
             product=product,

diff --git a/saleor/core/utils/random_data.py b/saleor/core/utils/random_data.py
@@ -80,7 +80,7 @@
     ProductVariantChannelListing,
     VariantMedia,
 )
-from ...product.search import update_products_search_document
+from ...product.search import update_products_search_vector
 from ...product.tasks import update_products_discounted_prices_of_discount_task
 from ...product.thumbnails import (
     create_category_background_image_thumbnails,
@@ -472,7 +472,7 @@ def create_products_by_schema(placeholder_dir, create_images):
     assign_products_to_collections(associations=types["product.collectionproduct"])
 
     all_products_qs = Product.objects.all()
-    update_products_search_document(all_products_qs)
+    update_products_search_vector(all_products_qs)
     update_products_discounted_prices(all_products_qs)
 
 

diff --git a/saleor/graphql/attribute/bulk_mutations.py b/saleor/graphql/attribute/bulk_mutations.py
@@ -4,7 +4,7 @@
 from ...attribute import models
 from ...core.permissions import PageTypePermissions
 from ...product import models as product_models
-from ...product.search import update_products_search_document
+from ...product.search import update_products_search_vector
 from ..core.mutations import ModelBulkDeleteMutation
 from ..core.types import AttributeError, NonNullList
 from ..utils import resolve_global_ids_to_primary_keys
@@ -32,7 +32,7 @@ def perform_mutation(cls, _root, info, ids, **data):
         _, attribute_pks = resolve_global_ids_to_primary_keys(ids, "Attribute")
         product_ids = cls.get_product_ids_to_update(attribute_pks)
         response = super().perform_mutation(_root, info, ids, **data)
-        update_products_search_document(
+        update_products_search_vector(
             product_models.Product.objects.filter(id__in=product_ids)
         )
         return response
@@ -86,7 +86,7 @@ def perform_mutation(cls, _root, info, ids, **data):
         _, attribute_pks = resolve_global_ids_to_primary_keys(ids, "AttributeValue")
         product_ids = cls.get_product_ids_to_update(attribute_pks)
         response = super().perform_mutation(_root, info, ids, **data)
-        update_products_search_document(
+        update_products_search_vector(
             product_models.Product.objects.filter(id__in=product_ids)
         )
         return response

diff --git a/saleor/graphql/attribute/mutations.py b/saleor/graphql/attribute/mutations.py
@@ -17,7 +17,7 @@
 from ...core.tracing import traced_atomic_transaction
 from ...core.utils import generate_unique_slug
 from ...product import models as product_models
-from ...product.search import update_products_search_document
+from ...product.search import update_products_search_vector
 from ..core.enums import MeasurementUnitsEnum
 from ..core.fields import JSONString
 from ..core.inputs import ReorderInput
@@ -734,7 +734,7 @@ def post_save_action(cls, info, instance, cleaned_input):
             Q(Exists(instance.productassignments.filter(product_id=OuterRef("id"))))
             | Q(Exists(variants.filter(product_id=OuterRef("id"))))
         )
-        update_products_search_document(products)
+        update_products_search_vector(products)
 
 
 class AttributeValueDelete(ModelDeleteMutation):
@@ -757,7 +757,7 @@ def perform_mutation(cls, _root, info, **data):
         instance = cls.get_node_or_error(info, node_id, only_type=AttributeValue)
         product_ids = cls.get_product_ids_to_update(instance)
         response = super().perform_mutation(_root, info, **data)
-        update_products_search_document(
+        update_products_search_vector(
             product_models.Product.objects.filter(id__in=product_ids)
         )
         return response

diff --git a/saleor/graphql/attribute/tests/mutations/test_attribute_value_delete.py b/saleor/graphql/attribute/tests/mutations/test_attribute_value_delete.py
@@ -87,10 +87,6 @@ def test_delete_attribute_value_product_search_document_updated(
     with pytest.raises(value._meta.model.DoesNotExist):
         value.refresh_from_db()
 
-    product.refresh_from_db()
-    assert product.search_document
-    assert name.lower() not in product.search_document
-
 
 def test_delete_attribute_value_product_search_document_updated_variant_attribute(
     staff_api_client,
@@ -121,7 +117,3 @@ def test_delete_attribute_value_product_search_document_updated_variant_attribut
     # then
     with pytest.raises(value._meta.model.DoesNotExist):
         value.refresh_from_db()
-
-    product.refresh_from_db()
-    assert product.search_document
-    assert name.lower() not in product.search_document
diff --git a/saleor/graphql/attribute/tests/mutations/test_attribute_value_update.py b/saleor/graphql/attribute/tests/mutations/test_attribute_value_update.py
@@ -127,9 +127,6 @@ def test_update_attribute_value_product_search_document_updated(
         value["node"]["name"] for value in data["attribute"]["choices"]["edges"]
     ]
 
-    product.refresh_from_db()
-    assert name.lower() in product.search_document
-
 
 def test_update_attribute_value_product_search_document_updated_variant_attribute(
     staff_api_client,
@@ -167,9 +164,6 @@ def test_update_attribute_value_product_search_document_updated_variant_attribut
         value["node"]["name"] for value in data["attribute"]["choices"]["edges"]
     ]
 
-    product.refresh_from_db()
-    assert name.lower() in product.search_document
-
 
 def test_update_swatch_attribute_value(
     staff_api_client,

diff --git a/saleor/graphql/attribute/tests/mutations/test_bulk_delete.py b/saleor/graphql/attribute/tests/mutations/test_bulk_delete.py
@@ -104,16 +104,6 @@ def test_delete_attributes_products_search_document_updated(
         id__in=[attr.id for attr in product_type_attribute_list]
     ).exists()
 
-    product_1.refresh_from_db()
-    product_2.refresh_from_db()
-    assert product_1.search_document
-    assert attr_1_name not in product_1.search_document
-    assert color_attribute_value.name.lower() in product_1.search_document
-    assert attr_3_name not in product_1.search_document
-
-    assert product_2.search_document
-    assert attr_2_name not in product_2.search_document
-
 
 ATTRIBUTE_VALUE_BULK_DELETE_MUTATION = """
     mutation attributeValueBulkDelete($ids: [ID!]!) {
@@ -160,11 +150,6 @@ def test_delete_attribute_values_search_document_updated(
         slug="orange", name="Orange", attribute=attribute, value="#ABCD"
     )
 
-    val_1_name = value_1.name
-    val_2_name = value_2.name
-    val_3_name = value_3.name
-    val_4_name = value_4.name
-
     product_1 = product_list[0]
     product_2 = product_list[1]
     variant_1 = product_1.variants.first()
@@ -195,10 +180,5 @@ def test_delete_attribute_values_search_document_updated(
 
     product_1.refresh_from_db()
     product_2.refresh_from_db()
-    assert product_1.search_document
-    assert val_1_name not in product_1.search_document
-    assert val_4_name.lower() in product_1.search_document
-    assert val_3_name not in product_1.search_document
-
-    assert product_2.search_document
-    assert val_2_name not in product_2.search_document
+    assert product_1.search_vector
+    assert product_2.search_vector
diff --git a/saleor/graphql/core/connection.py b/saleor/graphql/core/connection.py
@@ -1,5 +1,5 @@
 import json
-from decimal import Decimal
+from decimal import Decimal, InvalidOperation
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast
 
 import graphene
@@ -51,6 +51,28 @@ def get_field_value(instance: DjangoModel, field_name: str):
     return attr
 
 
+def _prepare_filter_by_rank_expression(
+    cursor: List[str],
+    sorting_direction: str,
+) -> Q:
+    try:
+        rank = Decimal(cursor[0])
+        int(cursor[1])
+    except (InvalidOperation, ValueError, TypeError, KeyError):
+        raise ValueError("Invalid cursor for sorting by rank.")
+
+    # Because rank is float number, it gets mangled by PostgreSQL's query parser
+    # making equal comparisons impossible. Instead we compare rank against small
+    # range of values, constructed using epsilon.
+    if sorting_direction == "gt":
+        return Q(
+            search_rank__range=(rank - EPSILON, rank + EPSILON), id__lt=cursor[1]
+        ) | Q(search_rank__gt=rank + EPSILON)
+    return Q(search_rank__range=(rank - EPSILON, rank + EPSILON), id__gt=cursor[1]) | Q(
+        search_rank__lt=rank - EPSILON
+    )
+
+
 def _prepare_filter_expression(
     field_name: str,
     index: int,
@@ -92,6 +114,9 @@ def _prepare_filter(
                 ('first_field', 'first_value_form_cursor'))
         )
     """
+    if sorting_fields == ["search_rank", "id"]:
+        # Fast path for filtering by rank
+        return _prepare_filter_by_rank_expression(cursor, sorting_direction)
     filter_kwargs = Q()
     for index, field_name in enumerate(sorting_fields):
         if cursor[index] is None and sorting_direction == "gt":