From ed7fc6d1a0567f1137abcaa462037dbdc4f51082 Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Tue, 22 Apr 2025 14:36:47 -0400 Subject: [PATCH 1/6] chore: add index to Email.domain_last_checked Prevent full-table scans for any date-related lookups. Signed-off-by: Mike Fiedler --- warehouse/accounts/models.py | 1 + ..._add_index_to_email_domain_last_checked.py | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 warehouse/migrations/versions/c8384ca429fc_add_index_to_email_domain_last_checked.py diff --git a/warehouse/accounts/models.py b/warehouse/accounts/models.py index 55e93a8c65e2..d4f8d5ffbe39 100644 --- a/warehouse/accounts/models.py +++ b/warehouse/accounts/models.py @@ -427,6 +427,7 @@ class Email(db.ModelBase): # Domain validation information domain_last_checked: Mapped[datetime.datetime | None] = mapped_column( comment="Last time domain was checked with the domain validation service.", + index=True, ) domain_last_status: Mapped[list[str] | None] = mapped_column( ARRAY(String), diff --git a/warehouse/migrations/versions/c8384ca429fc_add_index_to_email_domain_last_checked.py b/warehouse/migrations/versions/c8384ca429fc_add_index_to_email_domain_last_checked.py new file mode 100644 index 000000000000..91ef779f03fd --- /dev/null +++ b/warehouse/migrations/versions/c8384ca429fc_add_index_to_email_domain_last_checked.py @@ -0,0 +1,36 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Add Index to Email.domain_last_checked + +Revision ID: c8384ca429fc +Revises: f609b35e981b +Create Date: 2025-04-22 18:36:03.844860 +""" + +from alembic import op + +revision = "c8384ca429fc" +down_revision = "f609b35e981b" + + +def upgrade(): + op.create_index( + op.f("ix_user_emails_domain_last_checked"), + "user_emails", + ["domain_last_checked"], + unique=False, + ) + + +def downgrade(): + op.drop_index(op.f("ix_user_emails_domain_last_checked"), table_name="user_emails") From 96780bce028917391174501e18610a9d116c21f2 Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Tue, 22 Apr 2025 15:00:07 -0400 Subject: [PATCH 2/6] refactor: extract email domain check to function In preparation for reuse in a background task, move the code to run the check to a utility function. Signed-off-by: Mike Fiedler --- warehouse/accounts/utils.py | 21 +++++++++++++++++++++ warehouse/admin/views/users.py | 10 ++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/warehouse/accounts/utils.py b/warehouse/accounts/utils.py index 3960ebd935b2..38be5c5df281 100644 --- a/warehouse/accounts/utils.py +++ b/warehouse/accounts/utils.py @@ -12,10 +12,17 @@ from __future__ import annotations +import datetime + from dataclasses import dataclass from typing import TYPE_CHECKING +from warehouse.accounts.models import Email +from warehouse.accounts.services import IDomainStatusService + if TYPE_CHECKING: + from pyramid.request import Request + from warehouse.accounts.models import User from warehouse.macaroons.models import Macaroon @@ -44,3 +51,17 @@ class UserContext: def __principals__(self) -> list[str]: return self.user.__principals__() + + +def update_email_domain_status(email: Email, request: Request) -> None: + """ + Update the domain status of the given email address. + """ + domain_status_service = request.find_service(IDomainStatusService) + domain_status = domain_status_service.get_domain_status(email.domain) + + email.domain_last_checked = datetime.datetime.now(datetime.UTC) + email.domain_last_status = domain_status + request.db.add(email) + + return None diff --git a/warehouse/admin/views/users.py b/warehouse/admin/views/users.py index 4524bcc5e074..cec31b26a325 100644 --- a/warehouse/admin/views/users.py +++ b/warehouse/admin/views/users.py @@ -28,7 +28,6 @@ from warehouse.accounts.interfaces import ( BurnedRecoveryCode, - IDomainStatusService, IEmailBreachedService, InvalidRecoveryCode, IUserService, @@ -40,6 +39,7 @@ ProhibitedUserName, User, ) +from warehouse.accounts.utils import update_email_domain_status from warehouse.authnz import Permissions from warehouse.email import ( send_account_recovery_initiated_email, @@ -683,13 +683,7 @@ def user_email_domain_check(user, request): email_address = request.params.get("email_address") email = request.db.scalar(select(Email).where(Email.email == email_address)) - domain_status_service = request.find_service(IDomainStatusService) - domain_status = domain_status_service.get_domain_status(email.domain) - - # set the domain status to the email address - email.domain_last_checked = datetime.datetime.now(datetime.UTC) - email.domain_last_status = domain_status - request.db.add(email) + update_email_domain_status(email, request) request.session.flash( f"Domain status check for {email.domain!r} completed", queue="success" From 9d1230824e702f45f8f2fabdca03d6e1c5bbf1f0 Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Tue, 22 Apr 2025 16:14:53 -0400 Subject: [PATCH 3/6] chore(deps): add pytest-mock See https://pytest-mock.readthedocs.io/ Signed-off-by: Mike Fiedler --- requirements/tests.in | 1 + requirements/tests.txt | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/requirements/tests.in b/requirements/tests.in index 0db91b605dc5..0c8c524f062c 100644 --- a/requirements/tests.in +++ b/requirements/tests.in @@ -4,6 +4,7 @@ freezegun pretend pytest>=3.0.0 pytest-icdiff +pytest-mock pytest-postgresql>=3.1.3,<8.0.0 pytest-randomly pytest-socket diff --git a/requirements/tests.txt b/requirements/tests.txt index 110aa87c7be1..4c6e6020e38c 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -252,6 +252,7 @@ pytest==8.3.5 \ # via # -r requirements/tests.in # pytest-icdiff + # pytest-mock # pytest-postgresql # pytest-randomly # pytest-socket @@ -261,6 +262,10 @@ pytest-icdiff==0.9 \ --hash=sha256:13aede616202e57fcc882568b64589002ef85438046f012ac30a8d959dac8b75 \ --hash=sha256:efee0da3bd1b24ef2d923751c5c547fbb8df0a46795553fba08ef57c3ca03d82 # via -r requirements/tests.in +pytest-mock==3.14.0 \ + --hash=sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f \ + --hash=sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0 + # via -r requirements/tests.in pytest-postgresql==7.0.1 \ --hash=sha256:7723dfbfc57ea6f6f9876c2828e7b36f8b0e60b6cb040b1ddd444a60eed06e0a \ --hash=sha256:cbc6a67bbad5128b1f00def8cca5cf597020acc79893723f7a9cb60981b6840f From e014c578528829cd66fa9c03db047389c6f2f249 Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Tue, 22 Apr 2025 16:15:24 -0400 Subject: [PATCH 4/6] test: use mocker to spy on service call Signed-off-by: Mike Fiedler --- tests/conftest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c16f1137339e..d8c9310704cf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -550,8 +550,10 @@ def search_service(): @pytest.fixture -def domain_status_service(): - return account_services.NullDomainStatusService() +def domain_status_service(mocker): + service = account_services.NullDomainStatusService() + mocker.spy(service, "get_domain_status") + return service class QueryRecorder: From fb9327bb38a819aa547e0b623301b5d3a556813d Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Wed, 23 Apr 2025 16:29:16 -0400 Subject: [PATCH 5/6] feat: add periodic task to update domain statues Performs batches of 10k every 5 minutes, estimated to complete in under 24 hours, to backfill the entire dataset. After which, we may move to daily. Signed-off-by: Mike Fiedler --- tests/unit/accounts/test_tasks.py | 43 ++++++++++++++++++++++++++++++- warehouse/accounts/__init__.py | 8 +++++- warehouse/accounts/tasks.py | 41 +++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/tests/unit/accounts/test_tasks.py b/tests/unit/accounts/test_tasks.py index 93585361537b..56df2571ffc5 100644 --- a/tests/unit/accounts/test_tasks.py +++ b/tests/unit/accounts/test_tasks.py @@ -17,7 +17,11 @@ from warehouse.accounts import tasks from warehouse.accounts.models import TermsOfServiceEngagement -from warehouse.accounts.tasks import compute_user_metrics, notify_users_of_tos_update +from warehouse.accounts.tasks import ( + batch_update_email_domain_status, + compute_user_metrics, + notify_users_of_tos_update, +) from ...common.db.accounts import EmailFactory, UserFactory from ...common.db.packaging import ProjectFactory, ReleaseFactory @@ -192,3 +196,40 @@ def test_compute_user_metrics(db_request, metrics): ], ), ] + + +def test_update_email_domain_status(db_request, domain_status_service, mocker): + """ + Test that the batch update performs the correct queries and updates + """ + never_checked = EmailFactory.create( + email="me@never-checked.com", domain_last_checked=None + ) + over_threshold = EmailFactory.create( + email="me@over-threshold.com", + domain_last_checked=datetime.now(tz=timezone.utc) - timedelta(days=90), + ) + on_threshold = EmailFactory.create( + email="me@on-threshold.com", + domain_last_checked=datetime.now(tz=timezone.utc) - timedelta(days=30), + ) + under_threshold = EmailFactory.create( + email="me@under-threshold.com", + domain_last_checked=datetime.now(tz=timezone.utc) - timedelta(days=1), + ) + + batch_update_email_domain_status(db_request) + + assert domain_status_service.get_domain_status.call_count == 3 + domain_status_service.get_domain_status.assert_has_calls( + [ + mocker.call(never_checked.domain), + mocker.call(over_threshold.domain), + mocker.call(on_threshold.domain), + ] + ) + + assert never_checked.domain_last_status == ["active"] + assert over_threshold.domain_last_status == ["active"] + assert on_threshold.domain_last_status == ["active"] + assert under_threshold.domain_last_status is None # no default, not updated diff --git a/warehouse/accounts/__init__.py b/warehouse/accounts/__init__.py index 11d64295fdf2..d778e0b9e6f0 100644 --- a/warehouse/accounts/__init__.py +++ b/warehouse/accounts/__init__.py @@ -32,7 +32,11 @@ TokenServiceFactory, database_login_factory, ) -from warehouse.accounts.tasks import compute_user_metrics, notify_users_of_tos_update +from warehouse.accounts.tasks import ( + batch_update_email_domain_status, + compute_user_metrics, + notify_users_of_tos_update, +) from warehouse.accounts.utils import UserContext from warehouse.admin.flags import AdminFlagValue from warehouse.macaroons.security_policy import MacaroonSecurityPolicy @@ -215,3 +219,5 @@ def includeme(config): # Add a periodic task to generate Account metrics config.add_periodic_task(crontab(minute="*/20"), compute_user_metrics) config.add_periodic_task(crontab(minute="*"), notify_users_of_tos_update) + # TODO: After initial backfill, this can be done less frequently + config.add_periodic_task(crontab(minute="*/5"), batch_update_email_domain_status) diff --git a/warehouse/accounts/tasks.py b/warehouse/accounts/tasks.py index a72aa9af726e..137190e71d18 100644 --- a/warehouse/accounts/tasks.py +++ b/warehouse/accounts/tasks.py @@ -10,9 +10,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from datetime import datetime, timedelta, timezone +from __future__ import annotations -from sqlalchemy import func +import typing + +from datetime import UTC, datetime, timedelta, timezone + +from sqlalchemy import func, nullsfirst, or_, select from warehouse import tasks from warehouse.accounts.models import ( @@ -22,10 +26,14 @@ UserTermsOfServiceEngagement, ) from warehouse.accounts.services import IUserService +from warehouse.accounts.utils import update_email_domain_status from warehouse.email import send_user_terms_of_service_updated from warehouse.metrics import IMetricsService from warehouse.packaging.models import Release +if typing.TYPE_CHECKING: + from pyramid.request import Request + @tasks.task(ignore_result=True, acks_late=True) def notify_users_of_tos_update(request): @@ -136,3 +144,32 @@ def compute_user_metrics(request): "primary:true", ], ) + + +@tasks.task(ignore_result=True, acks_late=True) +def batch_update_email_domain_status(request: Request) -> None: + """ + Update the email domain status for any checked over 30 days ago. + + 30 days is roughly the time between a domain's expiration + and when it enters a renewal grace period. + Each TLD may express their own grace period, 30 days is an estimate + of time before the registrar is likely to sell it. + """ + stmt = ( + select(Email) + .where( + # TODO: After completely backfilled, remove the `or_` for None + or_( + Email.domain_last_checked.is_(None), + Email.domain_last_checked < datetime.now(tz=UTC) - timedelta(days=30), + ) + ) + .order_by(nullsfirst(Email.domain_last_checked.asc())) + .limit(10_000) + ) + # Run in batches to avoid too much memory usage, API rate limits + stmt = stmt.execution_options(yield_per=1_000) + + for email in request.db.scalars(stmt): + update_email_domain_status(email, request) From 7b21c8b3f99edead47ea80b213d77b21afd3154e Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Thu, 24 Apr 2025 11:52:12 -0400 Subject: [PATCH 6/6] update with feedback Signed-off-by: Mike Fiedler --- tests/unit/accounts/test_services.py | 2 +- tests/unit/accounts/test_tasks.py | 15 +++++++++++++++ warehouse/accounts/interfaces.py | 2 +- warehouse/accounts/services.py | 4 ++-- warehouse/accounts/tasks.py | 2 +- warehouse/accounts/utils.py | 8 ++++---- 6 files changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/unit/accounts/test_services.py b/tests/unit/accounts/test_services.py index 4a99a8758608..6a60eb74e57a 100644 --- a/tests/unit/accounts/test_services.py +++ b/tests/unit/accounts/test_services.py @@ -1691,7 +1691,7 @@ def __init__(self): session=session, client_id="some_client_id" ) - assert svc.get_domain_status("example.com") == [] + assert svc.get_domain_status("example.com") is None assert session.get.calls == [ pretend.call( "https://api.domainr.com/v2/status", diff --git a/tests/unit/accounts/test_tasks.py b/tests/unit/accounts/test_tasks.py index 56df2571ffc5..c77fd55587e5 100644 --- a/tests/unit/accounts/test_tasks.py +++ b/tests/unit/accounts/test_tasks.py @@ -233,3 +233,18 @@ def test_update_email_domain_status(db_request, domain_status_service, mocker): assert over_threshold.domain_last_status == ["active"] assert on_threshold.domain_last_status == ["active"] assert under_threshold.domain_last_status is None # no default, not updated + + +def test_update_email_domain_status_does_not_update_if_not_needed( + db_request, domain_status_service, mocker +): + mocker.patch.object(domain_status_service, "get_domain_status", return_value=None) + + fail_check = EmailFactory.create() + + batch_update_email_domain_status(db_request) + + domain_status_service.get_domain_status.assert_called_once_with(fail_check.domain) + + assert fail_check.domain_last_checked is None + assert fail_check.domain_last_status is None diff --git a/warehouse/accounts/interfaces.py b/warehouse/accounts/interfaces.py index 15f7114955c0..0b4560f4653f 100644 --- a/warehouse/accounts/interfaces.py +++ b/warehouse/accounts/interfaces.py @@ -301,7 +301,7 @@ def get_email_breach_count(email: str) -> int | None: class IDomainStatusService(Interface): - def get_domain_status(domain: str) -> list[str]: + def get_domain_status(domain: str) -> list[str] | None: """ Returns a list of status strings for the given domain. """ diff --git a/warehouse/accounts/services.py b/warehouse/accounts/services.py index efb9904c4a7a..9477452c94ba 100644 --- a/warehouse/accounts/services.py +++ b/warehouse/accounts/services.py @@ -992,7 +992,7 @@ def create_service(cls, _context, request: Request) -> DomainrDomainStatusServic domainr_client_id = request.registry.settings.get("domain_status.client_id") return cls(session=request.http, client_id=domainr_client_id) - def get_domain_status(self, domain: str) -> list[str]: + def get_domain_status(self, domain: str) -> list[str] | None: """ Check if a domain is available or not. See https://domainr.com/docs/api/v2/status @@ -1006,6 +1006,6 @@ def get_domain_status(self, domain: str) -> list[str]: resp.raise_for_status() except requests.RequestException as exc: logger.warning("Error contacting Domainr: %r", exc) - return [] + return None return resp.json()["status"][0]["status"].split() diff --git a/warehouse/accounts/tasks.py b/warehouse/accounts/tasks.py index 137190e71d18..4402b948ad8a 100644 --- a/warehouse/accounts/tasks.py +++ b/warehouse/accounts/tasks.py @@ -149,7 +149,7 @@ def compute_user_metrics(request): @tasks.task(ignore_result=True, acks_late=True) def batch_update_email_domain_status(request: Request) -> None: """ - Update the email domain status for any checked over 30 days ago. + Update the email domain status for any domain last checked over 30 days ago. 30 days is roughly the time between a domain's expiration and when it enters a renewal grace period. diff --git a/warehouse/accounts/utils.py b/warehouse/accounts/utils.py index 38be5c5df281..c585c1417945 100644 --- a/warehouse/accounts/utils.py +++ b/warehouse/accounts/utils.py @@ -58,10 +58,10 @@ def update_email_domain_status(email: Email, request: Request) -> None: Update the domain status of the given email address. """ domain_status_service = request.find_service(IDomainStatusService) - domain_status = domain_status_service.get_domain_status(email.domain) - email.domain_last_checked = datetime.datetime.now(datetime.UTC) - email.domain_last_status = domain_status - request.db.add(email) + if domain_status := domain_status_service.get_domain_status(email.domain): + email.domain_last_checked = datetime.datetime.now(datetime.UTC) + email.domain_last_status = domain_status + request.db.add(email) return None