Skip to content

Commit

Permalink
Fix get_jobs method in Comparison Monitor (#444)
Browse files Browse the repository at this point in the history
* Fix get_jobs in jobs comparison monitor

* Avoid extra call api when no more pagination is required
  • Loading branch information
VMRuiz committed May 6, 2024
1 parent 4384fe5 commit f0d8ea5
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 27 deletions.
30 changes: 17 additions & 13 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,26 +557,30 @@ def run(self, result):
def _get_jobs(self, states, number_of_jobs):
tags = self._get_tags_to_filter()

jobs = []
total_jobs = []
start = 0
client = Client(self.crawler.settings)
MAX_API_COUNT = 1000

_jobs = client.spider.jobs.list(
start=start,
state=states,
count=number_of_jobs,
filters=dict(has_tag=tags) if tags else None,
)
while _jobs:
jobs.extend(_jobs)
start += 1000
_jobs = client.spider.jobs.list(
while True:
# API has a 1000 results limit
count = min(number_of_jobs - len(total_jobs), MAX_API_COUNT)
current_jobs = client.spider.jobs.list(
start=start,
state=states,
count=number_of_jobs,
count=count,
filters=dict(has_tag=tags) if tags else None,
)
return jobs
total_jobs.extend(current_jobs)

if len(current_jobs) < MAX_API_COUNT or len(total_jobs) == number_of_jobs:
# Stop paginating if results are less than 1000 (pagination not required)
# or target jobs was reached - no more pagination required
break

start += len(current_jobs)

return total_jobs

def _get_tags_to_filter(self):
"""
Expand Down
77 changes: 64 additions & 13 deletions tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from unittest.mock import Mock, call
import math
from unittest.mock import Mock, call, patch

import pytest
from spidermon import MonitorSuite
Expand All @@ -10,6 +11,7 @@
ZyteJobsComparisonMonitor,
monitors,
)
from spidermon.core.factories import MonitorFactory
from spidermon.exceptions import NotConfigured


Expand All @@ -24,21 +26,16 @@ def mock_suite(mock_jobs, monkeypatch):
return MonitorSuite(monitors=[ZyteJobsComparisonMonitor])


def get_paginated_jobs(**kwargs):
return [Mock() for _ in range(kwargs["count"])]


@pytest.fixture
def mock_suite_and_zyte_client(
monkeypatch,
number_of_jobs,
):
def get_paginated_jobs(**kwargs):
start = kwargs["start"]
if start < number_of_jobs:
return [
Mock() for _ in range(start, max(number_of_jobs - 1000, number_of_jobs))
]
return []

monkeypatch.setenv("SHUB_JOB_DATA", '{"tags":["tag1","tag2","tag3"]}')

mock_client = Mock()
mock_client.spider.jobs.list.side_effect = get_paginated_jobs

Expand Down Expand Up @@ -88,6 +85,58 @@ def test_jobs_comparison_monitor_is_enabled(
runner.run(mock_suite, **data)


class TestZyteJobsComparisonMonitor(ZyteJobsComparisonMonitor):
def runTest():
pass


def test_jobs_comparison_monitor_get_jobs():
mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Return exact number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 50
mock_client.spider.jobs.list.assert_called_once()

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
output = [Mock(), Mock()]
mock_client.spider.jobs.list = Mock(return_value=output)

# Return less jobs than expected
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert jobs == output
mock_client.spider.jobs.list.assert_called_once()

with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Jobs bigger than 1000
jobs = monitor._get_jobs(states=None, number_of_jobs=2500)
assert len(jobs) == 2500
assert mock_client.spider.jobs.list.call_count == 3


@pytest.mark.parametrize(
["item_count", "previous_counts", "threshold", "should_raise"],
[
Expand Down Expand Up @@ -144,12 +193,14 @@ def test_arguments_passed_to_zyte_client(

calls = [
call.mock_client.spider.jobs.list(
start=n,
start=n * 1000,
state=list(states),
count=number_of_jobs,
# Count goes from pending number of jobs up to 1000
count=min(number_of_jobs - n * 1000, 1000),
filters={"has_tag": list(tags)},
)
for n in range(0, number_of_jobs + 1000, 1000)
# One call to api every 1000 expected jobs
for n in range(0, math.ceil(number_of_jobs / 1000))
]

mock_client.spider.jobs.list.assert_has_calls(calls)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ skip_missing_interpreters = True

[testenv]
deps =
pytest
pytest==8.1.1
pytest-cov
pytest-mock
extras =
Expand Down

0 comments on commit f0d8ea5

Please sign in to comment.