From f25c01159fe5fcbd5c77d51d67946c5a4a1ae7c9 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 10 Dec 2020 18:15:41 +0100 Subject: [PATCH] Add ConceptNet (#160) Closes #15 --- README.md | 5 +-- setup.cfg | 2 ++ src/pykeen/datasets/__init__.py | 4 +++ src/pykeen/datasets/base.py | 9 +++--- src/pykeen/datasets/conceptnet.py | 54 +++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 src/pykeen/datasets/conceptnet.py diff --git a/README.md b/README.md index 8f2ba75e1c..4acc72a4a5 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@

InstallationQuickstart • - Datasets • + DatasetsModelsSupportCitation @@ -96,13 +96,14 @@ The full documentation can be found at https://pykeen.readthedocs.io. Below are the models, datasets, training modes, evaluators, and metrics implemented in ``pykeen``. -### Datasets (19) +### Datasets (20) | Name | Reference | Description | |---------------|-------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------| | codexlarge | [`pykeen.datasets.CoDExLarge`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExLarge.html) | The CoDEx large dataset. | | codexmedium | [`pykeen.datasets.CoDExMedium`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExMedium.html) | The CoDEx medium dataset. | | codexsmall | [`pykeen.datasets.CoDExSmall`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExSmall.html) | The CoDEx small dataset. | +| conceptnet | [`pykeen.datasets.ConceptNet`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.ConceptNet.html) | The ConceptNet dataset from [speer2017]_. | | drkg | [`pykeen.datasets.DRKG`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.DRKG.html) | The DRKG dataset. | | fb15k | [`pykeen.datasets.FB15k`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.FB15k.html) | The FB15k dataset. | | fb15k237 | [`pykeen.datasets.FB15k237`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.FB15k237.html) | The FB15k-237 dataset. 
| diff --git a/setup.cfg b/setup.cfg index 59307cbd10..b979aff187 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,6 +60,7 @@ install_requires = optuna>=2.0.0 pandas>=1.0.0 tabulate + more_click pystow>=0.0.3 zip_safe = false @@ -100,6 +101,7 @@ console_scripts = pykeen.datasets = hetionet = pykeen.datasets.hetionet:Hetionet + conceptnet = pykeen.datasets.conceptnet:ConceptNet drkg = pykeen.datasets.drkg:DRKG kinships = pykeen.datasets.kinships:Kinships nations = pykeen.datasets.nations:Nations diff --git a/src/pykeen/datasets/__init__.py b/src/pykeen/datasets/__init__.py index fb6672e76f..e74ba1b523 100644 --- a/src/pykeen/datasets/__init__.py +++ b/src/pykeen/datasets/__init__.py @@ -18,6 +18,8 @@ TarFileRemoteDataset, UnpackedRemoteDataset, ZipFileRemoteDataset, ) from .codex import CoDExLarge, CoDExMedium, CoDExSmall +from .conceptnet import ConceptNet +from .drkg import DRKG from .freebase import FB15k, FB15k237 from .hetionet import Hetionet from .kinships import Kinships @@ -49,6 +51,8 @@ 'WN18', 'WN18RR', 'YAGO310', + 'DRKG', + 'ConceptNet', 'get_dataset', 'has_dataset', ] diff --git a/src/pykeen/datasets/base.py b/src/pykeen/datasets/base.py index 70a97f588e..28b54975b1 100644 --- a/src/pykeen/datasets/base.py +++ b/src/pykeen/datasets/base.py @@ -10,7 +10,7 @@ import zipfile from abc import abstractmethod from io import BytesIO -from typing import List, Optional, TextIO, Tuple, Union +from typing import Any, List, Mapping, Optional, TextIO, Tuple, Union from urllib.parse import urlparse from urllib.request import urlretrieve @@ -597,8 +597,8 @@ def __init__( cache_root: Optional[str] = None, eager: bool = False, create_inverse_triples: bool = False, - delimiter: Optional[str] = None, random_state: RandomHint = None, + read_csv_kwargs: Optional[Mapping[str, Any]] = None, ): """Initialize dataset. 
@@ -618,7 +618,8 @@ def __init__( self._triples_factory = None self.random_state = random_state - self.delimiter = delimiter or '\t' + self.read_csv_kwargs = read_csv_kwargs or {} + self.read_csv_kwargs.setdefault('sep', '\t') self.url = url if not os.path.exists(self._get_path()) and not self.url: @@ -639,7 +640,7 @@ def _load(self) -> None: if not os.path.exists(self._get_path()): logger.info('downloading data from %s to %s', self.url, self._get_path()) _urlretrieve(self.url, self._get_path()) # noqa:S310 - df = pd.read_csv(self._get_path(), sep=self.delimiter) + df = pd.read_csv(self._get_path(), **self.read_csv_kwargs) tf = TriplesFactory.from_labeled_triples(triples=df.values, create_inverse_triples=self.create_inverse_triples) tf.path = self._get_path() self._training, self._testing, self._validation = tf.split( diff --git a/src/pykeen/datasets/conceptnet.py b/src/pykeen/datasets/conceptnet.py new file mode 100644 index 0000000000..bb397714fa --- /dev/null +++ b/src/pykeen/datasets/conceptnet.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +"""The `ConceptNet <https://conceptnet.io/>`_ dataset. + +Get a summary with ``python -m pykeen.datasets.conceptnet`` +""" + +import click +from more_click import verbose_option + +from .base import SingleTabbedDataset +from ..typing import RandomHint + +URL = 'https://s3.amazonaws.com/conceptnet/downloads/2019/edges/conceptnet-assertions-5.7.0.csv.gz' + + +class ConceptNet(SingleTabbedDataset): + """The ConceptNet dataset from [speer2017]_. + + The dataset is structured into 5 columns (see https://github.com/commonsense/conceptnet5/wiki/Downloads#assertions): + edge URL, relation, head, tail, metadata. + + .. [speer2017] Robyn Speer, Joshua Chin, and Catherine Havasi. (2017) + `ConceptNet 5.5: An Open Multilingual Graph of General Knowledge <https://arxiv.org/abs/1612.03975>`_. + *In proceedings of AAAI 31*. 
+ """ + + def __init__( + self, + create_inverse_triples: bool = False, + random_state: RandomHint = 0, + **kwargs, + ): + super().__init__( + url=URL, + create_inverse_triples=create_inverse_triples, + random_state=random_state, + read_csv_kwargs=dict( + usecols=[1, 2, 3], + header=None, + ), + **kwargs, + ) + + +@click.command() +@verbose_option +def _main(): + ds = ConceptNet() + ds.summarize() + + +if __name__ == '__main__': + _main()