From db94bf53bdc59f8192aa437cc368743258f17e38 Mon Sep 17 00:00:00 2001 From: Sylvain Boissel Date: Wed, 19 Nov 2025 14:49:24 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=B8(backend)=20use=20unaccented=20full?= =?UTF-8?q?=20name=20for=20user=20search?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have the user full name through OIDC in the database, but the search only used the email field. This change allows searching for a user by their first and/or last name (fix #929). Given that user names are more likely than emails to include diacritics, it unaccents both the query and the database entry for search (fix #1091). It also unaccents for email so that internationalized domain names are managed whether or not the accent is included in the search. An unaccented gin index is added on the user full_name and email fields. Using a manual migration because a wrapper around unaccent is necessary to make it IMMUTABLE (cf. https://stackoverflow.com/questions/9063402/ ) --- CHANGELOG.md | 1 + src/backend/core/api/viewsets.py | 20 ++- .../migrations/0027_auto_20251120_0956.py | 37 ++++++ src/backend/core/tests/test_api_users.py | 125 ++++++++++++++++++ 4 files changed, 177 insertions(+), 6 deletions(-) create mode 100644 src/backend/core/migrations/0027_auto_20251120_0956.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f3b5e6f02..e8dd1932f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to - ♻️(frontend) preserve @ character when esc is pressed after typing it #1512 - ♻️(frontend) make summary button fixed to remain visible during scroll #1581 - ♻️(frontend) pdf embed use full width #1526 +- 🚸(backend) use unaccented full name for user search #1637 ### Fixed diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 1c1b9ef50a..7594770bdd 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -1,4 +1,5 @@ """API endpoints""" + # 
pylint: disable=too-many-lines import base64 @@ -18,7 +19,7 @@ from django.db import connection, transaction from django.db import models as db from django.db.models.expressions import RawSQL -from django.db.models.functions import Left, Length +from django.db.models.functions import Greatest, Left, Length from django.http import Http404, StreamingHttpResponse from django.urls import reverse from django.utils import timezone @@ -37,6 +38,7 @@ from rest_framework.permissions import AllowAny from core import authentication, choices, enums, models +from core.api.filters import remove_accents from core.services.ai_services import AIService from core.services.collaboration_services import CollaborationService from core.services.converter_services import ( @@ -188,13 +190,15 @@ def get_queryset(self): queryset = queryset.exclude(documentaccess__document_id=document_id) filter_data = filterset.form.cleaned_data - query = filter_data["q"] + query = remove_accents(filter_data["q"]) # For emails, match emails by Levenstein distance to prevent typing errors if "@" in query: return ( queryset.annotate( - distance=RawSQL("levenshtein(email::text, %s::text)", (query,)) + distance=RawSQL( + "levenshtein(unaccent(email::text), %s::text)", (query,) + ) ) .filter(distance__lte=3) .order_by("distance", "email")[: settings.API_USERS_LIST_LIMIT] @@ -203,11 +207,15 @@ def get_queryset(self): # Use trigram similarity for non-email-like queries # For performance reasons we filter first by similarity, which relies on an # index, then only calculate precise similarity scores for sorting purposes + return ( - queryset.filter(email__trigram_word_similar=query) - .annotate(similarity=TrigramSimilarity("email", query)) + queryset.annotate( + sim_email=TrigramSimilarity("email", query), + sim_name=TrigramSimilarity("full_name", query), + ) + .annotate(similarity=Greatest("sim_email", "sim_name")) .filter(similarity__gt=0.2) - .order_by("-similarity", "email")[: settings.API_USERS_LIST_LIMIT] + 
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT] ) @drf.decorators.action( diff --git a/src/backend/core/migrations/0027_auto_20251120_0956.py b/src/backend/core/migrations/0027_auto_20251120_0956.py new file mode 100644 index 0000000000..fe795ff5f2 --- /dev/null +++ b/src/backend/core/migrations/0027_auto_20251120_0956.py @@ -0,0 +1,37 @@ +# Generated by Django 5.2.8 on 2025-11-20 09:56 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0026_comments"), + ] + + operations = [ + migrations.RunSQL( + sql=""" + CREATE OR REPLACE FUNCTION public.immutable_unaccent(regdictionary, text) + RETURNS text + LANGUAGE c IMMUTABLE PARALLEL SAFE STRICT AS + '$libdir/unaccent', 'unaccent_dict'; + + CREATE OR REPLACE FUNCTION public.f_unaccent(text) + RETURNS text + LANGUAGE sql IMMUTABLE PARALLEL SAFE STRICT + RETURN public.immutable_unaccent(regdictionary 'public.unaccent', $1); + + CREATE INDEX IF NOT EXISTS user_email_unaccent_trgm_idx + ON impress_user + USING gin (f_unaccent(email) gin_trgm_ops); + + CREATE INDEX IF NOT EXISTS user_full_name_unaccent_trgm_idx + ON impress_user + USING gin (f_unaccent(full_name) gin_trgm_ops); + """, + reverse_sql=""" + DROP INDEX IF EXISTS user_email_unaccent_trgm_idx; + DROP INDEX IF EXISTS user_full_name_unaccent_trgm_idx; + """, + ), + ] diff --git a/src/backend/core/tests/test_api_users.py b/src/backend/core/tests/test_api_users.py index a0a4355280..926e731bd4 100644 --- a/src/backend/core/tests/test_api_users.py +++ b/src/backend/core/tests/test_api_users.py @@ -76,6 +76,131 @@ def test_api_users_list_query_email(): assert user_ids == [] +def test_api_users_list_query_email_with_internationalized_domain_names(): + """ + Authenticated users should be able to list users and filter by email. + It should work even if the email address contains an internationalized domain name. 
+ """ + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + jean = factories.UserFactory(email="jean.martin@éducation.fr") + marie = factories.UserFactory(email="marie.durand@education.fr") + kurokawa = factories.UserFactory(email="contact@黒川.日本") + + response = client.get("/api/v1.0/users/?q=jean.martin@education.fr") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(jean.id)] + + response = client.get("/api/v1.0/users/?q=jean.martin@éducation.fr") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(jean.id)] + + response = client.get("/api/v1.0/users/?q=marie.durand@education.fr") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(marie.id)] + + response = client.get("/api/v1.0/users/?q=marie.durand@éducation.fr") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(marie.id)] + + response = client.get("/api/v1.0/users/?q=contact@黒川.日本") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(kurokawa.id)] + + +def test_api_users_list_query_full_name(): + """ + Authenticated users should be able to list users and filter by full name. + Only results with a Trigram similarity greater than 0.2 with the query should be returned. 
+ """ + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + dave = factories.UserFactory(email="contact@work.com", full_name="David Bowman") + + response = client.get( + "/api/v1.0/users/?q=David", + ) + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(dave.id)] + + response = client.get("/api/v1.0/users/?q=Bowman") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(dave.id)] + + response = client.get("/api/v1.0/users/?q=bowman") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(dave.id)] + + response = client.get("/api/v1.0/users/?q=BOWMAN") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(dave.id)] + + response = client.get("/api/v1.0/users/?q=BoWmAn") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(dave.id)] + + response = client.get("/api/v1.0/users/?q=Bovin") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [] + + +def test_api_users_list_query_accented_full_name(): + """ + Authenticated users should be able to list users and filter by full name with accents. + Only results with a Trigram similarity greater than 0.2 with the query should be returned. 
+ """ + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + fred = factories.UserFactory( + email="contact@work.com", full_name="Frédérique Lefèvre" + ) + + response = client.get("/api/v1.0/users/?q=Frédérique") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(fred.id)] + + response = client.get("/api/v1.0/users/?q=Frederique") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(fred.id)] + + response = client.get("/api/v1.0/users/?q=Lefèvre") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(fred.id)] + + response = client.get("/api/v1.0/users/?q=Lefevre") + assert response.status_code == 200 + user_ids = [user["id"] for user in response.json()] + assert user_ids == [str(fred.id)] + + response = client.get("/api/v1.0/users/?q=François Lorfebvre") + assert response.status_code == 200 + users = [user["full_name"] for user in response.json()] + assert users == [] + + def test_api_users_list_limit(settings): """ Authenticated users should be able to list users and the number of results