Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐍 🌟 Hyper-Relational Statement Factory #1117

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/pykeen/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
TarFileSingleDataset,
UnpackedRemoteDataset,
ZipSingleDataset,
HyperRelationalUnpackedRemoteDataset,
)
from .biokg import BioKG
from .ckg import CKG
Expand All @@ -49,7 +50,7 @@
from .primekg import PrimeKG
from .umls import UMLS
from .utils import get_dataset
from .wd50k import WD50KT
from .wd50k import WD50KT, WD50K, WD50K_33, WD50K_66, WD50K_100
from .wikidata5m import Wikidata5M
from .wordnet import WN18, WN18RR
from .yago import YAGO310
Expand Down Expand Up @@ -92,6 +93,10 @@
"OpenEA",
"Countries",
"WD50KT",
"WD50K",
"WD50K_33",
"WD50K_66",
"WD50K_100",
"Wikidata5M",
"PharmKG8k",
"PharmKG",
Expand Down Expand Up @@ -120,6 +125,7 @@
MTransEDataset,
OGBLoader,
EADataset,
HyperRelationalUnpackedRemoteDataset,
},
)
dataset_resolver.register_entrypoint("pykeen.datasets")
Expand Down
105 changes: 104 additions & 1 deletion src/pykeen/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from tabulate import tabulate

from ..constants import PYKEEN_DATASETS
from ..triples import CoreTriplesFactory, TriplesFactory
from ..triples import CoreTriplesFactory, TriplesFactory, StatementFactory
from ..triples.deteriorate import deteriorate
from ..triples.remix import remix
from ..triples.triples_factory import splits_similarity
Expand All @@ -44,6 +44,7 @@
"ZipSingleDataset",
"TabbedDataset",
"SingleTabbedDataset",
"HyperRelationalUnpackedRemoteDataset",
# Utilities
"dataset_similarity",
]
Expand Down Expand Up @@ -944,3 +945,105 @@ def _get_df(self) -> pd.DataFrame:
df = df[usecols]

return df


class HyperRelationalUnpackedRemoteDataset(PathDataset):
    """A dataset of hyper-relational statements loaded from three remote (unpacked) files.

    Works like :class:`UnpackedRemoteDataset`, but loads every split with
    :class:`pykeen.triples.StatementFactory` so that the qualifier pairs attached
    to each statement are preserved.
    """

    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        max_num_qualifier_pairs: int = -1,
        load_triples_kwargs: Optional[Mapping[str, Any]] = None,
        download_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Initialize dataset.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.data/pykeen``.
        :param force: If true, redownload any cached files
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param max_num_qualifier_pairs:
            The maximum number of qualifier pairs to keep per statement; ``-1`` keeps all of them.
        :param load_triples_kwargs: Arguments to pass through to :func:`StatementFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        self.max_num_qualifier_pairs = max_num_qualifier_pairs

        training_path = self.cache_root.joinpath(name_from_url(self.training_url))
        testing_path = self.cache_root.joinpath(name_from_url(self.testing_url))
        validation_path = self.cache_root.joinpath(name_from_url(self.validation_url))

        download_kwargs = {} if download_kwargs is None else dict(download_kwargs)
        download_kwargs.setdefault("backend", "urllib")

        # The only difference with the vanilla UnpackedRemoteDataset is here: the maximum
        # number of qualifier pairs to keep is forwarded to the statement factory.
        # Copy the mapping instead of mutating the caller's dict (which may also be an
        # immutable Mapping without an ``update`` method).
        load_triples_kwargs = {} if load_triples_kwargs is None else dict(load_triples_kwargs)
        load_triples_kwargs["max_num_qualifier_pairs"] = max_num_qualifier_pairs

        for url, path in [
            (self.training_url, training_path),
            (self.testing_url, testing_path),
            (self.validation_url, validation_path),
        ]:
            if force or not path.is_file():
                download(url, path, **download_kwargs)

        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )

    def _load(self) -> None:
        # Load training and testing with StatementFactory so qualifier pairs are kept.
        self._training = StatementFactory.from_path(
            path=self.training_path,
            create_inverse_triples=self._create_inverse_triples,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        self._testing = StatementFactory.from_path(
            path=self.testing_path,
            entity_to_id=self._training.entity_to_id,  # share entity index with training
            relation_to_id=self._training.relation_to_id,  # share relation index with training
            # do not explicitly create inverse triples for testing; this is handled by the evaluation code
            create_inverse_triples=False,
            load_triples_kwargs=self.load_triples_kwargs,
        )

    def _load_validation(self) -> None:
        # Don't call this function by itself. It assumes it is called through the
        # ``validation`` property and that the _training factory has already been loaded.
        assert self._training is not None
        if self.validation_path is None:
            self._validation = None
        else:
            self._validation = StatementFactory.from_path(
                path=self.validation_path,
                entity_to_id=self._training.entity_to_id,  # share entity index with training
                relation_to_id=self._training.relation_to_id,  # share relation index with training
                # do not explicitly create inverse triples for validation; this is handled by the evaluation code
                create_inverse_triples=False,
                load_triples_kwargs=self.load_triples_kwargs,
            )
163 changes: 161 additions & 2 deletions src/pykeen/datasets/wd50k.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,30 @@
from docdata import parse_docdata
from more_click import verbose_option

from .base import UnpackedRemoteDataset
from .base import UnpackedRemoteDataset, HyperRelationalUnpackedRemoteDataset

BASE_URL = "https://raw.githubusercontent.com/migalkin/StarE/master/data/clean/wd50k/"
TRIPLES_VALID_URL = f"{BASE_URL}/triples/valid.txt"
TRIPLES_TEST_URL = f"{BASE_URL}/triples/test.txt"
TRIPLES_TRAIN_URL = f"{BASE_URL}/triples/train.txt"

# Hyper-relational (statement) splits: the full WD50K plus subsets where 33%/66%/100%
# of statements have at least one qualifier pair.
# NOTE(review): the base URL already ends with a slash, so the f-strings below produce
# a double slash (".../clean//wd50k/..."); raw.githubusercontent.com appears to
# tolerate this — confirm before normalizing.
HYPER_RELATIONAL_BASE_URL = "https://raw.githubusercontent.com/migalkin/StarE/master/data/clean/"
HYPER_RELATIONAL_MAIN_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k/statements/train.txt"
HYPER_RELATIONAL_MAIN_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k/statements/valid.txt"
HYPER_RELATIONAL_MAIN_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k/statements/test.txt"

HYPER_RELATIONAL_33_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_33/statements/train.txt"
HYPER_RELATIONAL_33_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_33/statements/valid.txt"
HYPER_RELATIONAL_33_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_33/statements/test.txt"

HYPER_RELATIONAL_66_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_66/statements/train.txt"
HYPER_RELATIONAL_66_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_66/statements/valid.txt"
HYPER_RELATIONAL_66_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_66/statements/test.txt"

HYPER_RELATIONAL_100_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_100/statements/train.txt"
HYPER_RELATIONAL_100_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_100/statements/valid.txt"
HYPER_RELATIONAL_100_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_100/statements/test.txt"


@parse_docdata
class WD50KT(UnpackedRemoteDataset):
Expand Down Expand Up @@ -55,10 +72,152 @@ def __init__(self, **kwargs):
)



@parse_docdata
class WD50K(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 47,156 (5,460 qualifier-only)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cthoyt do we rely somewhere on parsing these statistics to integers? If yes, we should store the second number under a different key

relations: 532 (45 qualifier-only)
training: 166,435
testing: 46,159
validation: 23,913
statements: 236,507
statements w/ qualifiers: 32,167 (13.6%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_MAIN_TRAIN_URL,
testing_url=HYPER_RELATIONAL_MAIN_TEST_URL,
validation_url=HYPER_RELATIONAL_MAIN_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)

@parse_docdata
class WD50K_33(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K where 33% of statements have at lease one qualifier pair.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 38,124 (6,463 qualifier-only)
relations: 475 (47 qualifier-only)
training: 73,406
testing: 18,133
validation: 10,568
statements: 102,107
statements w/ qualifiers: 31,866 (31.2%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (33) (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_33_TRAIN_URL,
testing_url=HYPER_RELATIONAL_33_TEST_URL,
validation_url=HYPER_RELATIONAL_33_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)

@parse_docdata
class WD50K_66(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K where 66% of statements have at lease one qualifier pair.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 27,347 (7,167 qualifier-only)
relations: 494 (53 qualifier-only)
training: 35,968
testing: 8,045
validation: 5,154
statements: 49,167
statements w/ qualifiers: 31,696 (64.5%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (66) (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_66_TRAIN_URL,
testing_url=HYPER_RELATIONAL_66_TEST_URL,
validation_url=HYPER_RELATIONAL_66_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)

@parse_docdata
class WD50K_100(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K where 100% of statements have at lease one qualifier pair.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 18,792 (7,862 qualifier-only)
relations: 279 (75 qualifier-only)
training: 22,738
testing: 5,297
validation: 3,279
statements: 31,314
statements w/ qualifiers: 31,314 (100%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (100) (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_100_TRAIN_URL,
testing_url=HYPER_RELATIONAL_100_TEST_URL,
validation_url=HYPER_RELATIONAL_100_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)


@click.command()
@verbose_option
def _main():
    """Download, load, and summarize every WD50K variant."""
    for cls in [WD50KT, WD50K, WD50K_33, WD50K_66, WD50K_100]:
        click.secho(f"Loading {cls.__name__}", fg="green", bold=True)
        d = cls()
        d.summarize()
Expand Down
2 changes: 2 additions & 0 deletions src/pykeen/triples/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
get_mapped_triples,
relation_inverter,
)
from .statement_factory import StatementFactory
from .triples_numeric_literals_factory import TriplesNumericLiteralsFactory

__all__ = [
Expand All @@ -26,4 +27,5 @@
"TriplesNumericLiteralsFactory",
"get_mapped_triples",
"AnyTriples",
"StatementFactory"
]