Skip to content

Commit

Permalink
Add ConceptNet (#160)
Browse files Browse the repository at this point in the history
Closes #15
  • Loading branch information
cthoyt committed Dec 10, 2020
1 parent c8beeda commit f25c011
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 6 deletions.
5 changes: 3 additions & 2 deletions README.md
Expand Up @@ -33,7 +33,7 @@
<p align="center">
<a href="#installation">Installation</a> •
<a href="#quickstart">Quickstart</a> •
<a href="#datasets-19">Datasets</a> •
<a href="#datasets-20">Datasets</a> •
<a href="#models-23">Models</a> •
<a href="#supporters">Support</a> •
<a href="#citation">Citation</a>
Expand Down Expand Up @@ -96,13 +96,14 @@ The full documentation can be found at https://pykeen.readthedocs.io.
Below are the models, datasets, training modes, evaluators, and metrics implemented
in ``pykeen``.

### Datasets (19)
### Datasets (20)

| Name | Reference | Description |
|---------------|-------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------|
| codexlarge | [`pykeen.datasets.CoDExLarge`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExLarge.html) | The CoDEx large dataset. |
| codexmedium | [`pykeen.datasets.CoDExMedium`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExMedium.html) | The CoDEx medium dataset. |
| codexsmall | [`pykeen.datasets.CoDExSmall`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.CoDExSmall.html) | The CoDEx small dataset. |
| conceptnet | [`pykeen.datasets.ConceptNet`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.ConceptNet.html) | The ConceptNet dataset from [Speer *et al.* (2017)](https://arxiv.org/abs/1612.03975). |
| drkg | [`pykeen.datasets.DRKG`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.DRKG.html) | The DRKG dataset. |
| fb15k | [`pykeen.datasets.FB15k`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.FB15k.html) | The FB15k dataset. |
| fb15k237 | [`pykeen.datasets.FB15k237`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.FB15k237.html) | The FB15k-237 dataset. |
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Expand Up @@ -60,6 +60,7 @@ install_requires =
optuna>=2.0.0
pandas>=1.0.0
tabulate
more_click
pystow>=0.0.3

zip_safe = false
Expand Down Expand Up @@ -100,6 +101,7 @@ console_scripts =

pykeen.datasets =
hetionet = pykeen.datasets.hetionet:Hetionet
conceptnet = pykeen.datasets.conceptnet:ConceptNet
drkg = pykeen.datasets.drkg:DRKG
kinships = pykeen.datasets.kinships:Kinships
nations = pykeen.datasets.nations:Nations
Expand Down
4 changes: 4 additions & 0 deletions src/pykeen/datasets/__init__.py
Expand Up @@ -18,6 +18,8 @@
TarFileRemoteDataset, UnpackedRemoteDataset, ZipFileRemoteDataset,
)
from .codex import CoDExLarge, CoDExMedium, CoDExSmall
from .conceptnet import ConceptNet
from .drkg import DRKG
from .freebase import FB15k, FB15k237
from .hetionet import Hetionet
from .kinships import Kinships
Expand Down Expand Up @@ -49,6 +51,8 @@
'WN18',
'WN18RR',
'YAGO310',
'DRKG',
'ConceptNet',
'get_dataset',
'has_dataset',
]
Expand Down
9 changes: 5 additions & 4 deletions src/pykeen/datasets/base.py
Expand Up @@ -10,7 +10,7 @@
import zipfile
from abc import abstractmethod
from io import BytesIO
from typing import List, Optional, TextIO, Tuple, Union
from typing import Any, List, Mapping, Optional, TextIO, Tuple, Union
from urllib.parse import urlparse
from urllib.request import urlretrieve

Expand Down Expand Up @@ -597,8 +597,8 @@ def __init__(
cache_root: Optional[str] = None,
eager: bool = False,
create_inverse_triples: bool = False,
delimiter: Optional[str] = None,
random_state: RandomHint = None,
read_csv_kwargs: Optional[Mapping[str, Any]] = None,
):
"""Initialize dataset.
Expand All @@ -618,7 +618,8 @@ def __init__(

self._triples_factory = None
self.random_state = random_state
self.delimiter = delimiter or '\t'
self.read_csv_kwargs = read_csv_kwargs or {}
self.read_csv_kwargs.setdefault('sep', '\t')

self.url = url
if not os.path.exists(self._get_path()) and not self.url:
Expand All @@ -639,7 +640,7 @@ def _load(self) -> None:
if not os.path.exists(self._get_path()):
logger.info('downloading data from %s to %s', self.url, self._get_path())
_urlretrieve(self.url, self._get_path()) # noqa:S310
df = pd.read_csv(self._get_path(), sep=self.delimiter)
df = pd.read_csv(self._get_path(), **self.read_csv_kwargs)
tf = TriplesFactory.from_labeled_triples(triples=df.values, create_inverse_triples=self.create_inverse_triples)
tf.path = self._get_path()
self._training, self._testing, self._validation = tf.split(
Expand Down
54 changes: 54 additions & 0 deletions src/pykeen/datasets/conceptnet.py
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-

"""The `ConceptNet <https://conceptnet.io/>`_ dataset.
Get a summary with ``python -m pykeen.datasets.conceptnet``
"""

import click
from more_click import verbose_option

from .base import SingleTabbedDataset
from ..typing import RandomHint

# Gzipped CSV of the ConceptNet 5.7.0 assertions; handed to the base dataset
# class as the ``url`` of :class:`ConceptNet`.
URL = 'https://s3.amazonaws.com/conceptnet/downloads/2019/edges/conceptnet-assertions-5.7.0.csv.gz'


class ConceptNet(SingleTabbedDataset):
    """The ConceptNet dataset from [speer2017]_.

    Every row of the assertions file carries five columns (see
    https://github.com/commonsense/conceptnet5/wiki/Downloads#assertions):
    edge URL, relation, head, tail, and metadata. Only the relation, head,
    and tail columns are requested from the CSV reader.

    .. [speer2017] Robyn Speer, Joshua Chin, and Catherine Havasi. (2017)
       `ConceptNet 5.5: An Open Multilingual Graph of General Knowledge <https://arxiv.org/abs/1612.03975>`_.
       *In proceedings of AAAI 31*.
    """

    def __init__(
        self,
        create_inverse_triples: bool = False,
        random_state: RandomHint = 0,
        **kwargs,
    ):
        """Initialize the ConceptNet dataset.

        :param create_inverse_triples: Passed through unchanged to
            :class:`SingleTabbedDataset`.
        :param random_state: Passed through unchanged to
            :class:`SingleTabbedDataset`; defaults to ``0``.
        :param kwargs: Any remaining keyword arguments, forwarded to
            :class:`SingleTabbedDataset`.
        """
        # ``header=None`` makes the reader treat the first row as data.
        # Columns 1-3 are relation, head, and tail; the edge URL (0) and the
        # metadata (4) are dropped.
        # NOTE(review): pandas yields ``usecols`` columns in file order, so
        # rows reach the base class as (relation, head, tail) - confirm the
        # triples factory does not expect (head, relation, tail) instead.
        csv_options = {
            'usecols': [1, 2, 3],
            'header': None,
        }
        super().__init__(
            url=URL,
            create_inverse_triples=create_inverse_triples,
            random_state=random_state,
            read_csv_kwargs=csv_options,
            **kwargs,
        )


@click.command()
@verbose_option
def _main():
    # Command-line entry point: build ConceptNet with its default settings
    # and call ``summarize()`` on it. Documented with comments rather than a
    # docstring so the command's ``--help`` text stays as it was.
    ConceptNet().summarize()


# Allows ``python -m pykeen.datasets.conceptnet`` to run the command.
if __name__ == '__main__':
    _main()

0 comments on commit f25c011

Please sign in to comment.