Create a small demo dataset by parsing JSON files created from OSE wiki GVCS product ecology (#7811)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: rusty1s <matthias.fey@tu-dortmund.de>
pyg-team · Jul 31, 2023 · 1199597 · 1199597
1 parent e8f752f
commit 1199597
Show file tree

Hide file tree

Showing 3 changed files with 114 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added the `OSE_GVCS` dataset ([#7811](https://github.com/pyg-team/pytorch_geometric/pull/7811))
 - Added `output_initializer` argument to `DimeNet` models ([#7774](https://github.com/pyg-team/pytorch_geometric/pull/7774), [#7780](https://github.com/pyg-team/pytorch_geometric/pull/7780))
 - Added `lexsort` implementation ([#7775](https://github.com/pyg-team/pytorch_geometric/pull/7775))
 - Added possibility to run inference benchmarks on XPU device ([#7705](https://github.com/pyg-team/pytorch_geometric/pull/7705))

diff --git a/torch_geometric/datasets/__init__.py b/torch_geometric/datasets/__init__.py
@@ -86,6 +86,7 @@
 from .igmc_dataset import IGMCDataset
 from .amazon_book import AmazonBook
 from .hm import HM
+from .ose_gvcs import OSE_GVCS
 
 from .fake import FakeDataset, FakeHeteroDataset
 from .sbm_dataset import StochasticBlockModelDataset
@@ -190,6 +191,7 @@
     'IGMCDataset',
     'AmazonBook',
     'HM',
+    'OSE_GVCS',
 ]
 synthetic_datasets = [
     'FakeDataset',

diff --git a/torch_geometric/datasets/ose_gvcs.py b/torch_geometric/datasets/ose_gvcs.py
@@ -0,0 +1,111 @@
+import json
+import os
+from collections import defaultdict
+from typing import Callable, List, Optional
+
+import torch
+
+from torch_geometric.data import (
+    HeteroData,
+    InMemoryDataset,
+    download_url,
+    extract_tar,
+)
+
+
+class OSE_GVCS(InMemoryDataset):
+    r"""A dataset describing the `Product ecology
+    <https://wiki.opensourceecology.org/wiki/Product_Ecologies>`_ of the Open
+    Source Ecology's iconoclastic `Global Village Construction Set
+    <https://wiki.opensourceecology.org/wiki/
+    Global_Village_Construction_Set>`_.
+    GVCS is a modular, DIY, low-cost set of blueprints that enables the
+    fabrication of the 50 different industrial machines that it takes to
+    build a small, sustainable civilization with modern comforts.
+
+    The dataset contains a heterogeneous graph with 50 :obj:`machine` nodes,
+    composing the GVCS, and 290 directed edges, each representing one out of
+    three relationships of machines.
+
+    Args:
+        root (str): Root directory of the dataset. The downloaded archive is
+            placed here before being extracted into :obj:`raw_dir`.
+        transform (callable, optional): Forwarded unchanged to
+            :class:`InMemoryDataset`. (default: :obj:`None`)
+        pre_transform (callable, optional): A function that takes in the
+            assembled :class:`HeteroData` object and returns a transformed
+            version. It is applied once in :meth:`process`, before the
+            object is saved to disk. (default: :obj:`None`)
+    """
+    # Names of the GVCS machines. The position of a name in this list is the
+    # node index of that machine in the resulting graph.
+    machines = [
+        '3D Printer', '3D Scanner', 'Aluminum Extractor', 'Backhoe',
+        'Bakery Oven', 'Baler', 'Bioplastic Extruder', 'Bulldozer', 'Car',
+        'CEB Press', 'Cement Mixer', 'Chipper Hammermill', 'CNC Circuit Mill',
+        'CNC Torch Table', 'Dairy Milker', 'Drill Press',
+        'Electric Motor Generator', 'Gasifier Burner', 'Hay Cutter',
+        'Hay Rake', 'Hydraulic Motor', 'Induction Furnace', 'Industrial Robot',
+        'Ironworker', 'Laser Cutter', 'Metal Roller', 'Microcombine',
+        'Microtractor', 'Multimachine', 'Nickel-Iron Battery', 'Pelletizer',
+        'Plasma Cutter', 'Power Cube', 'Press Forge', 'Rod and Wire Mill',
+        'Rototiller', 'Sawmill', 'Seeder', 'Solar Concentrator', 'Spader',
+        'Steam Engine', 'Steam Generator', 'Tractor', 'Trencher', 'Truck',
+        'Universal Power Supply', 'Universal Rotor', 'Welder',
+        'Well-Drilling Rig', 'Wind Turbine'
+    ]
+    # Machine categories. The position of a name in this list is the integer
+    # label stored in `data['machine'].category`.
+    categories = [
+        'habitat', 'agriculture', 'industry', 'energy', 'materials',
+        'transportation'
+    ]
+    # Relationship types that are kept as edge types; interactions with any
+    # other relationship are dropped in `process`.
+    relationships = ['from', 'uses', 'enables']
+
+    url = 'https://github.com/Wesxdz/ose_gvcs/raw/master/ose_gvcs.tar.gz'
+
+    def __init__(
+        self,
+        root: str,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+    ) -> None:
+        super().__init__(root, transform, pre_transform)
+        self.load(self.processed_paths[0], data_cls=HeteroData)
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        """One JSON file name per machine, in the order of `machines`.
+
+        The name is the machine name lower-cased with spaces replaced by
+        underscores, e.g. `'CEB Press'` -> `'ceb_press.json'`.
+        """
+        return [
+            f"{machine.lower().replace(' ', '_')}.json"
+            for machine in self.machines
+        ]
+
+    @property
+    def processed_file_names(self) -> str:
+        """Name of the single file written by `process`."""
+        return 'data.pt'
+
+    def download(self) -> None:
+        """Fetch the archive from `url`, unpack it into `raw_dir`, and delete
+        the downloaded archive afterwards.
+        """
+        path = download_url(self.url, self.root)
+        extract_tar(path, self.raw_dir)
+        os.unlink(path)
+
+    def process(self) -> None:
+        """Build the `HeteroData` graph from the raw JSON files and save it.
+
+        Each raw file contributes one `machine` node (its `'category'`) and
+        one edge per entry of its `'ecology'` list whose `'relationship'` is
+        in `relationships` and whose `'tool'` is in `machines`.
+        """
+        data = HeteroData()
+
+        categories = []
+        edges = defaultdict(list)
+
+        for path in self.raw_paths:
+            # JSON text is UTF-8; do not depend on the platform's locale
+            # encoding when decoding it.
+            with open(path, encoding='utf-8') as f:
+                product = json.load(f)
+            # Categories are collected in `raw_paths` order, i.e. in the
+            # order of `machines` (see `raw_file_names`).
+            categories.append(self.categories.index(product['category']))
+            for interaction in product['ecology']:
+                # NOTE Some ecology items are not GVCS machines or have other
+                # relationship types we don't want included.
+                rt = interaction['relationship']
+                if rt not in self.relationships:
+                    continue
+                dst = interaction['tool']
+                if dst not in self.machines:
+                    continue
+                # Node indices are positions in `machines`.
+                # NOTE(review): this assumes the i-th raw file describes
+                # `machines[i]`, so that edges and categories agree.
+                src = self.machines.index(product['machine'])
+                dst = self.machines.index(dst)
+                edges[rt].append((src, dst))
+
+        data['machine'].num_nodes = len(categories)
+        data['machine'].category = torch.tensor(categories)
+
+        for rel, pairs in edges.items():
+            # `pairs` is a list of (src, dst) tuples. Transposing gives the
+            # (2, num_edges) layout, and `.contiguous()` materializes it
+            # since `.t()` only returns a non-contiguous view.
+            edge_index = torch.tensor(pairs).t().contiguous()
+            data['machine', rel, 'machine'].edge_index = edge_index
+
+        if self.pre_transform is not None:
+            data = self.pre_transform(data)
+
+        self.save([data], self.processed_paths[0])