Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐍 🌟 Hyper-Relational Statement Factory #1117

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/pykeen/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
TarFileSingleDataset,
UnpackedRemoteDataset,
ZipSingleDataset,
HyperRelationalUnpackedRemoteDataset,
)
from .biokg import BioKG
from .ckg import CKG
Expand All @@ -49,7 +50,7 @@
from .primekg import PrimeKG
from .umls import UMLS
from .utils import get_dataset
from .wd50k import WD50KT
from .wd50k import WD50KT, WD50K, WD50K_33, WD50K_66, WD50K_100
from .wikidata5m import Wikidata5M
from .wordnet import WN18, WN18RR
from .yago import YAGO310
Expand Down Expand Up @@ -92,6 +93,10 @@
"OpenEA",
"Countries",
"WD50KT",
"WD50K",
"WD50K_33",
"WD50K_66",
"WD50K_100",
"Wikidata5M",
"PharmKG8k",
"PharmKG",
Expand Down Expand Up @@ -120,6 +125,7 @@
MTransEDataset,
OGBLoader,
EADataset,
HyperRelationalUnpackedRemoteDataset,
},
)
dataset_resolver.register_entrypoint("pykeen.datasets")
Expand Down
105 changes: 104 additions & 1 deletion src/pykeen/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from tabulate import tabulate

from ..constants import PYKEEN_DATASETS
from ..triples import CoreTriplesFactory, TriplesFactory
from ..triples import CoreTriplesFactory, TriplesFactory, StatementFactory
from ..triples.deteriorate import deteriorate
from ..triples.remix import remix
from ..triples.triples_factory import splits_similarity
Expand All @@ -44,6 +44,7 @@
"ZipSingleDataset",
"TabbedDataset",
"SingleTabbedDataset",
"HyperRelationalUnpackedRemoteDataset",
# Utilities
"dataset_similarity",
]
Expand Down Expand Up @@ -944,3 +945,105 @@ def _get_df(self) -> pd.DataFrame:
df = df[usecols]

return df


class HyperRelationalUnpackedRemoteDataset(PathDataset):
    """A dataset of hyper-relational statements loaded from three remote (unpacked) files.

    Works like :class:`UnpackedRemoteDataset`, but loads every split with
    :class:`pykeen.triples.StatementFactory` so that the qualifier pairs attached
    to each statement are preserved.
    """

    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        max_num_qualifier_pairs: int = -1,
        load_triples_kwargs: Optional[Mapping[str, Any]] = None,
        download_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Initialize dataset.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.data/pykeen``.
        :param force: If true, redownload any cached files
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param max_num_qualifier_pairs:
            The maximum number of qualifier pairs to keep per statement; ``-1`` keeps all of them.
        :param load_triples_kwargs: Arguments to pass through to :func:`StatementFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        self.max_num_qualifier_pairs = max_num_qualifier_pairs

        training_path = self.cache_root.joinpath(name_from_url(self.training_url))
        testing_path = self.cache_root.joinpath(name_from_url(self.testing_url))
        validation_path = self.cache_root.joinpath(name_from_url(self.validation_url))

        download_kwargs = {} if download_kwargs is None else dict(download_kwargs)
        download_kwargs.setdefault("backend", "urllib")

        # The only difference with the vanilla UnpackedRemoteDataset is here: the maximum
        # number of qualifier pairs to keep is forwarded to the statement factory.
        # Copy the mapping instead of mutating the caller's dict (which may also be an
        # immutable Mapping without an ``update`` method).
        load_triples_kwargs = {} if load_triples_kwargs is None else dict(load_triples_kwargs)
        load_triples_kwargs["max_num_qualifier_pairs"] = max_num_qualifier_pairs

        for url, path in [
            (self.training_url, training_path),
            (self.testing_url, testing_path),
            (self.validation_url, validation_path),
        ]:
            if force or not path.is_file():
                download(url, path, **download_kwargs)

        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )

    def _load(self) -> None:
        # Load training and testing with StatementFactory so qualifier pairs are kept.
        self._training = StatementFactory.from_path(
            path=self.training_path,
            create_inverse_triples=self._create_inverse_triples,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        self._testing = StatementFactory.from_path(
            path=self.testing_path,
            entity_to_id=self._training.entity_to_id,  # share entity index with training
            relation_to_id=self._training.relation_to_id,  # share relation index with training
            # do not explicitly create inverse triples for testing; this is handled by the evaluation code
            create_inverse_triples=False,
            load_triples_kwargs=self.load_triples_kwargs,
        )

    def _load_validation(self) -> None:
        # Don't call this function by itself. It assumes it is called through the
        # ``validation`` property and that the _training factory has already been loaded.
        assert self._training is not None
        if self.validation_path is None:
            self._validation = None
        else:
            self._validation = StatementFactory.from_path(
                path=self.validation_path,
                entity_to_id=self._training.entity_to_id,  # share entity index with training
                relation_to_id=self._training.relation_to_id,  # share relation index with training
                # do not explicitly create inverse triples for validation; this is handled by the evaluation code
                create_inverse_triples=False,
                load_triples_kwargs=self.load_triples_kwargs,
            )
163 changes: 161 additions & 2 deletions src/pykeen/datasets/wd50k.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,30 @@
from docdata import parse_docdata
from more_click import verbose_option

from .base import UnpackedRemoteDataset
from .base import UnpackedRemoteDataset, HyperRelationalUnpackedRemoteDataset

BASE_URL = "https://raw.githubusercontent.com/migalkin/StarE/master/data/clean/wd50k/"
TRIPLES_VALID_URL = f"{BASE_URL}/triples/valid.txt"
TRIPLES_TEST_URL = f"{BASE_URL}/triples/test.txt"
TRIPLES_TRAIN_URL = f"{BASE_URL}/triples/train.txt"

# Hyper-relational (statement) splits: the full WD50K plus subsets where 33%/66%/100%
# of statements have at least one qualifier pair.
# NOTE(review): the base URL already ends with a slash, so the f-strings below produce
# a double slash (".../clean//wd50k/..."); raw.githubusercontent.com appears to
# tolerate this — confirm before normalizing.
HYPER_RELATIONAL_BASE_URL = "https://raw.githubusercontent.com/migalkin/StarE/master/data/clean/"
HYPER_RELATIONAL_MAIN_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k/statements/train.txt"
HYPER_RELATIONAL_MAIN_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k/statements/valid.txt"
HYPER_RELATIONAL_MAIN_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k/statements/test.txt"

HYPER_RELATIONAL_33_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_33/statements/train.txt"
HYPER_RELATIONAL_33_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_33/statements/valid.txt"
HYPER_RELATIONAL_33_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_33/statements/test.txt"

HYPER_RELATIONAL_66_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_66/statements/train.txt"
HYPER_RELATIONAL_66_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_66/statements/valid.txt"
HYPER_RELATIONAL_66_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_66/statements/test.txt"

HYPER_RELATIONAL_100_TRAIN_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_100/statements/train.txt"
HYPER_RELATIONAL_100_VALID_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_100/statements/valid.txt"
HYPER_RELATIONAL_100_TEST_URL = f"{HYPER_RELATIONAL_BASE_URL}/wd50k_100/statements/test.txt"


@parse_docdata
class WD50KT(UnpackedRemoteDataset):
Expand Down Expand Up @@ -55,10 +72,152 @@ def __init__(self, **kwargs):
)



@parse_docdata
class WD50K(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 47,156 (5,460 qualifier-only)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cthoyt do we rely somewhere on parsing these statistics to integers? If yes, we should store the second number under a different key

relations: 532 (45 qualifier-only)
training: 166,435
testing: 46,159
validation: 23,913
statements: 236,507
statements w/ qualifiers: 32,167 (13.6%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_MAIN_TRAIN_URL,
testing_url=HYPER_RELATIONAL_MAIN_TEST_URL,
validation_url=HYPER_RELATIONAL_MAIN_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)

@parse_docdata
class WD50K_33(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K where 33% of statements have at lease one qualifier pair.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 38,124 (6,463 qualifier-only)
relations: 475 (47 qualifier-only)
training: 73,406
testing: 18,133
validation: 10,568
statements: 102,107
statements w/ qualifiers: 31,866 (31.2%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (33) (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_33_TRAIN_URL,
testing_url=HYPER_RELATIONAL_33_TEST_URL,
validation_url=HYPER_RELATIONAL_33_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)

@parse_docdata
class WD50K_66(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K where 66% of statements have at lease one qualifier pair.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 27,347 (7,167 qualifier-only)
relations: 494 (53 qualifier-only)
training: 35,968
testing: 8,045
validation: 5,154
statements: 49,167
statements w/ qualifiers: 31,696 (64.5%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (66) (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_66_TRAIN_URL,
testing_url=HYPER_RELATIONAL_66_TEST_URL,
validation_url=HYPER_RELATIONAL_66_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)

@parse_docdata
class WD50K_100(HyperRelationalUnpackedRemoteDataset):
"""The hyper-relational version of WD50K where 100% of statements have at lease one qualifier pair.

---
name: WD50K (hyper-relational)
citation:
author: Galkin
year: 2020
link: https://www.aclweb.org/anthology/2020.emnlp-main.596/
arxiv: 2009.10847
github: migalkin/StarE
statistics:
entities: 18,792 (7,862 qualifier-only)
relations: 279 (75 qualifier-only)
training: 22,738
testing: 5,297
validation: 3,279
statements: 31,314
statements w/ qualifiers: 31,314 (100%)
"""

def __init__(self, **kwargs):
"""Initialize the WD50K (100) (hyper-relational) dataset from [galkin2020]_.

:param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
"""
super().__init__(
training_url=HYPER_RELATIONAL_100_TRAIN_URL,
testing_url=HYPER_RELATIONAL_100_TEST_URL,
validation_url=HYPER_RELATIONAL_100_VALID_URL,
load_triples_kwargs={"delimiter": ","},
**kwargs,
)


@click.command()
@verbose_option
def _main():
    """Download, load, and summarize every WD50K variant."""
    for cls in [WD50KT, WD50K, WD50K_33, WD50K_66, WD50K_100]:
        click.secho(f"Loading {cls.__name__}", fg="green", bold=True)
        d = cls()
        d.summarize()
Expand Down
2 changes: 2 additions & 0 deletions src/pykeen/triples/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
get_mapped_triples,
relation_inverter,
)
from .statement_factory import StatementFactory
from .triples_numeric_literals_factory import TriplesNumericLiteralsFactory

__all__ = [
Expand All @@ -26,4 +27,5 @@
"TriplesNumericLiteralsFactory",
"get_mapped_triples",
"AnyTriples",
"StatementFactory"
]