diff --git a/servicex_analysis_utils/__init__.py b/servicex_analysis_utils/__init__.py index 1fd1019..ff3f501 100644 --- a/servicex_analysis_utils/__init__.py +++ b/servicex_analysis_utils/__init__.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from .materialization import to_awk from .file_peeking import get_structure +from .dataset_resolver import ds_type_resolver __version__ = "1.1.1" -__all__ = ["to_awk", "get_structure"] +__all__ = ["to_awk", "get_structure", "ds_type_resolver"] diff --git a/servicex_analysis_utils/dataset_resolver.py b/servicex_analysis_utils/dataset_resolver.py new file mode 100644 index 0000000..a817d16 --- /dev/null +++ b/servicex_analysis_utils/dataset_resolver.py @@ -0,0 +1,80 @@ +# Copyright (c) 2025, IRIS-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import re
from typing import Union
from urllib.parse import urlparse

from servicex import dataset

# Host and path prefix identifying a CERNBox URL that gets rewritten to EOS.
_CERNBOX_HOST = "cernbox.cern.ch"
_CERNBOX_PATH_PREFIX = "/files/spaces"
_EOS_PUBLIC_ROOT = "root://eospublic.cern.ch"
_RUCIO_SCHEME = "rucio://"


def ds_type_resolver(
    ds_name: Union[str, list[str]],
) -> Union[dataset.FileList, dataset.Rucio, dataset.XRootD, dataset.CERNOpenData]:
    """Build the ServiceX dataset object that matches the form of ``ds_name``.

    The input is classified by the first rule that matches, in this order:

    1. a ``list`` -> ``dataset.FileList``, given the list unchanged;
    2. a string starting with ``http://`` or ``https://`` -> single-entry
       ``dataset.FileList``; when the host is ``cernbox.cern.ch`` (or a
       subdomain of it) and the path starts with ``/files/spaces``, the URL
       is first rewritten to ``root://eospublic.cern.ch<rest of the path>``;
    3. ``rucio://<did>`` -> ``dataset.Rucio(<did>)``;
    4. a string with exactly one ``:`` and no ``/`` -> ``dataset.Rucio``;
    5. a string made only of decimal digits -> ``dataset.CERNOpenData``
       built from the integer value;
    6. a ``root://`` string ending in ``*`` -> ``dataset.XRootD``;
    7. any other ``root://`` string -> single-entry ``dataset.FileList``.

    Args:
        ds_name: Dataset identifier string, or a list handed to
            ``dataset.FileList`` as is.

    Returns:
        The ServiceX dataset object selected by the rules above.

    Raises:
        TypeError: If ``ds_name`` is neither a ``str`` nor a ``list``.
        RuntimeError: If ``ds_name`` is a string matching none of the rules.
    """
    if isinstance(ds_name, list):
        return dataset.FileList(ds_name)

    if not isinstance(ds_name, str):
        # Fail with an explicit message rather than from inside a string
        # method further down.
        raise TypeError(
            f"ds_name must be a str or a list of str, got {type(ds_name).__name__}"
        )

    if ds_name.startswith(("http://", "https://")):
        url = ds_name
        parsed_url = urlparse(url)
        # Compare the parsed hostname instead of searching the raw netloc:
        # a substring test would also accept look-alike hosts such as
        # "cernbox.cern.ch.example.org", and would miss differences that
        # are only in letter case.
        host = parsed_url.hostname or ""
        is_cernbox = host == _CERNBOX_HOST or host.endswith("." + _CERNBOX_HOST)
        if is_cernbox and parsed_url.path.startswith(_CERNBOX_PATH_PREFIX):
            eos_path = parsed_url.path.removeprefix(_CERNBOX_PATH_PREFIX)
            url = f"{_EOS_PUBLIC_ROOT}{eos_path}"
        return dataset.FileList([url])

    if ds_name.startswith(_RUCIO_SCHEME):
        return dataset.Rucio(ds_name.removeprefix(_RUCIO_SCHEME))

    if ds_name.count(":") == 1 and "/" not in ds_name:
        return dataset.Rucio(ds_name)

    # isdecimal() rather than isdigit(): isdigit() is also true for
    # characters such as superscript digits, which int() rejects.
    if ds_name.isdecimal():
        return dataset.CERNOpenData(int(ds_name))

    if ds_name.startswith("root://"):
        if ds_name.endswith("*"):
            return dataset.XRootD(ds_name)
        # Wrapped in a list, the same way the HTTP branch calls FileList.
        return dataset.FileList([ds_name])

    raise RuntimeError(
        f"Unable to find the type of input provided for dataset: {ds_name}"
    )
/dev/null +++ b/tests/test_dataset_resolver.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, IRIS-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import pytest
from servicex_analysis_utils import ds_type_resolver
from servicex import dataset


# Each entry pairs an input accepted by ds_type_resolver with the ServiceX
# dataset class the resolver is expected to return for it.
_RESOLVER_CASES = [
    # HTTP(S) URL
    ("https://test.com", dataset.FileList),
    # Bare "scope:name" identifier
    ("test:data", dataset.Rucio),
    # Identifier with the rucio:// scheme
    ("rucio://test:test", dataset.Rucio),
    # Purely numeric identifier
    ("123", dataset.CERNOpenData),
    # root:// URL without a trailing wildcard
    ("root://eosatlas.cern.ch//eos/", dataset.FileList),
    # root:// URL ending in "*"
    ("root://eosatlas.cern.ch//eos/*", dataset.XRootD),
    # List input
    (["root://eosatlas.cern.ch//eos/", "https://test.com"], dataset.FileList),
]


@pytest.mark.parametrize(("input_ds", "expected_type"), _RESOLVER_CASES)
def test_find_dataset(input_ds, expected_type):
    """Check that the resolver returns an instance of the expected class."""
    resolved = ds_type_resolver(input_ds)
    assert isinstance(resolved, expected_type)