diff --git a/README.md b/README.md
index 8f2ba75e1c..4acc72a4a5 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@
Installation •
Quickstart •
- Datasets •
+ Datasets •
Models •
Support •
Citation
@@ -96,13 +96,14 @@ The full documentation can be found at https://pykeen.readthedocs.io.
Below are the models, datasets, training modes, evaluators, and metrics implemented
in ``pykeen``.
-### Datasets (19)
+### Datasets (20)
| Name | Reference | Description |
|---------------|-------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------|
| codexlarge | [`pykeen.datasets.CoDExLarge`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExLarge.html) | The CoDEx large dataset. |
| codexmedium | [`pykeen.datasets.CoDExMedium`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExMedium.html) | The CoDEx medium dataset. |
| codexsmall | [`pykeen.datasets.CoDExSmall`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExSmall.html) | The CoDEx small dataset. |
+| conceptnet | [`pykeen.datasets.ConceptNet`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.ConceptNet.html) | The ConceptNet dataset (Speer et al., 2017). |
| drkg | [`pykeen.datasets.DRKG`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.DRKG.html) | The DRKG dataset. |
| fb15k | [`pykeen.datasets.FB15k`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.FB15k.html) | The FB15k dataset. |
| fb15k237 | [`pykeen.datasets.FB15k237`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.FB15k237.html) | The FB15k-237 dataset. |
diff --git a/setup.cfg b/setup.cfg
index 59307cbd10..b979aff187 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -60,6 +60,7 @@ install_requires =
optuna>=2.0.0
pandas>=1.0.0
tabulate
+ more_click
pystow>=0.0.3
zip_safe = false
@@ -100,6 +101,7 @@ console_scripts =
pykeen.datasets =
hetionet = pykeen.datasets.hetionet:Hetionet
+ conceptnet = pykeen.datasets.conceptnet:ConceptNet
drkg = pykeen.datasets.drkg:DRKG
kinships = pykeen.datasets.kinships:Kinships
nations = pykeen.datasets.nations:Nations
diff --git a/src/pykeen/datasets/__init__.py b/src/pykeen/datasets/__init__.py
index fb6672e76f..e74ba1b523 100644
--- a/src/pykeen/datasets/__init__.py
+++ b/src/pykeen/datasets/__init__.py
@@ -18,6 +18,8 @@
TarFileRemoteDataset, UnpackedRemoteDataset, ZipFileRemoteDataset,
)
from .codex import CoDExLarge, CoDExMedium, CoDExSmall
+from .conceptnet import ConceptNet
+from .drkg import DRKG
from .freebase import FB15k, FB15k237
from .hetionet import Hetionet
from .kinships import Kinships
@@ -49,6 +51,8 @@
'WN18',
'WN18RR',
'YAGO310',
+ 'DRKG',
+ 'ConceptNet',
'get_dataset',
'has_dataset',
]
diff --git a/src/pykeen/datasets/base.py b/src/pykeen/datasets/base.py
index 70a97f588e..28b54975b1 100644
--- a/src/pykeen/datasets/base.py
+++ b/src/pykeen/datasets/base.py
@@ -10,7 +10,7 @@
import zipfile
from abc import abstractmethod
from io import BytesIO
-from typing import List, Optional, TextIO, Tuple, Union
+from typing import Any, List, Mapping, Optional, TextIO, Tuple, Union
from urllib.parse import urlparse
from urllib.request import urlretrieve
@@ -597,8 +597,8 @@ def __init__(
cache_root: Optional[str] = None,
eager: bool = False,
create_inverse_triples: bool = False,
- delimiter: Optional[str] = None,
random_state: RandomHint = None,
+ read_csv_kwargs: Optional[Mapping[str, Any]] = None,
):
"""Initialize dataset.
@@ -618,7 +618,8 @@ def __init__(
self._triples_factory = None
self.random_state = random_state
- self.delimiter = delimiter or '\t'
+ self.read_csv_kwargs = read_csv_kwargs or {}
+ self.read_csv_kwargs.setdefault('sep', '\t')
self.url = url
if not os.path.exists(self._get_path()) and not self.url:
@@ -639,7 +640,7 @@ def _load(self) -> None:
if not os.path.exists(self._get_path()):
logger.info('downloading data from %s to %s', self.url, self._get_path())
_urlretrieve(self.url, self._get_path()) # noqa:S310
- df = pd.read_csv(self._get_path(), sep=self.delimiter)
+ df = pd.read_csv(self._get_path(), **self.read_csv_kwargs)
tf = TriplesFactory.from_labeled_triples(triples=df.values, create_inverse_triples=self.create_inverse_triples)
tf.path = self._get_path()
self._training, self._testing, self._validation = tf.split(
diff --git a/src/pykeen/datasets/conceptnet.py b/src/pykeen/datasets/conceptnet.py
new file mode 100644
index 0000000000..bb397714fa
--- /dev/null
+++ b/src/pykeen/datasets/conceptnet.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+"""The `ConceptNet