From 5808a4fbec794ffa0c58609af72895ff0540bbcc Mon Sep 17 00:00:00 2001 From: Gabriel Blanchard Date: Mon, 17 Mar 2025 15:09:44 -0400 Subject: [PATCH 1/2] Add pagination option for items_and_annotation_generator. Default of 10,000 items per page remains, but includes the option to specify fewer items per page to reduce timeout errors. --- nucleus/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nucleus/dataset.py b/nucleus/dataset.py index 26fa57e7..ea95f840 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -1518,6 +1518,7 @@ def items_and_annotation_generator( query: Optional[str] = None, use_mirrored_images: bool = False, only_most_recent_tasks: bool = True, + page_size=10000 ) -> Iterable[Dict[str, Union[DatasetItem, Dict[str, List[Annotation]]]]]: """Provides a generator of all DatasetItems and Annotations in the dataset. @@ -1525,6 +1526,7 @@ def items_and_annotation_generator( query: Structured query compatible with the `Nucleus query language `_. use_mirrored_images: If True, returns the location of the mirrored image hosted in Scale S3. Useful when the original image is no longer available. only_most_recent_tasks: If True, only the annotations corresponding to the most recent task for each item is returned. + page_size: Number of items to fetch per page. Default is maximum ES page size of 10000. 
Returns: Generator where each element is a dict containing the DatasetItem @@ -1548,7 +1550,7 @@ def items_and_annotation_generator( client=self._client, endpoint=f"dataset/{self.id}/exportForTrainingPage", result_key=EXPORT_FOR_TRAINING_KEY, - page_size=10000, # max ES page size + page_size=page_size, # default is max ES page size of 10000 query=query, chip=use_mirrored_images, onlyMostRecentTask=only_most_recent_tasks, From 44a58bd36adda252a013c16c533465b32aa3b3a7 Mon Sep 17 00:00:00 2001 From: Gabriel Blanchard Date: Wed, 19 Mar 2025 10:58:08 -0400 Subject: [PATCH 2/2] Bumped package version Included changes to changelog and project toml --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index caf3b741..d8f6abd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.17.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.10) - 2025-03-19 + +### Added +- Adding page size variable to `items_and_annotation_generator()` to reduce timeout errors for customers with large datasets + ## [0.17.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.9) - 2025-03-11 ### Added diff --git a/pyproject.toml b/pyproject.toml index aa169d82..8fe8503e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running [tool.poetry] name = "scale-nucleus" -version = "0.17.9" +version = "0.17.10" description = "The official Python client library for Nucleus, the Data Platform for AI" license = "MIT" authors = ["Scale AI Nucleus Team "]