Updated GEDDataset documentation (#3501)

* Documented ged_dataset.py More Documentation on the way * clean up * fix intend * Updated Documentation Updated caveats regarding the dataset * clean up * Added more documentation for the dataset I added comments about what each function is doing while extracting the already hosted graphs on google drive. Adding documentation would make it easier for people who are aiming to do a similar kind of pre-processing for graph similarity computation. * Updated GED-Dataset Documentation Added an extensive documentation about the inner working of the GED-Dataset class of PyG including the pre-processing steps to aid practitioners who might want to use similar pre-processing. Detailed explanation for each step is given. * Updated dataset documentation Added an extensive documentation about the inner working of the GED-Dataset class of PyG including the pre-processing steps to aid practitioners who might want to use similar pre-processing. Detailed explanation for each step is given. * Updated Documentation for GEDDataset Added an extensive documentation about the inner working of the GED-Dataset class of PyG including the pre-processing steps to aid practitioners who might want to use similar pre-processing. Detailed explanation for each step is given. * Updated GEDDataset Documentation Added an extensive documentation about the inner working of the GED-Dataset class of PyG including the pre-processing steps to aid practitioners who might want to use similar pre-processing. Detailed explanation for each step is given. * update Co-authored-by: rusty1s <matthias.fey@tu-dortmund.de>
pyg-team · Nov 15, 2021 · 83d929c · 83d929c
1 parent 8830505
commit 83d929c
Showing 1 changed file with 16 additions and 4 deletions.
diff --git a/torch_geometric/datasets/ged_dataset.py b/torch_geometric/datasets/ged_dataset.py
@@ -103,18 +103,22 @@ def __init__(self, root: str, name: str, train: bool = True,
 
     @property
     def raw_file_names(self) -> List[str]:
+        # Returns, e.g., ['LINUX/train', 'LINUX/test']
         return [osp.join(self.name, s) for s in ['train', 'test']]
 
     @property
     def processed_file_names(self) -> List[str]:
+        # Returns, e.g., ['LINUX_training.pt', 'LINUX_test.pt']
         return [f'{self.name}_{s}.pt' for s in ['training', 'test']]
 
     def download(self):
+        # Downloads the .tar/.zip file of the graphs and extracts them:
         name = self.datasets[self.name]['id']
         path = download_url(self.url.format(name), self.raw_dir)
         self.datasets[self.name]['extract'](path, self.raw_dir)
         os.unlink(path)
 
+        # Downloads the pickle file containing pre-computed GEDs:
         name = self.datasets[self.name]['pickle']
         path = download_url(self.url.format(name), self.raw_dir)
         os.rename(path, osp.join(self.raw_dir, self.name, 'ged.pickle'))
@@ -123,16 +127,20 @@ def process(self):
         import networkx as nx
 
         ids, Ns = [], []
+        # Iterating over paths for raw and processed data (train + test):
         for r_path, p_path in zip(self.raw_paths, self.processed_paths):
+            # Find the paths of all raw graphs:
             names = glob.glob(osp.join(r_path, '*.gexf'))
-            # Get the graph IDs given by the file name:
+            # Get sorted graph IDs given filename: 123.gexf -> 123
             ids.append(sorted([int(i.split(os.sep)[-1][:-5]) for i in names]))
 
             data_list = []
             # Convert graphs in .gexf format to a NetworkX Graph:
             for i, idx in enumerate(ids[-1]):
                 i = i if len(ids) == 1 else i + len(ids[0])
+                # Reading the raw `*.gexf` graph:
                 G = nx.read_gexf(osp.join(r_path, f'{idx}.gexf'))
+                # Mapping of nodes in `G` to a contiguous number:
                 mapping = {name: j for j, name in enumerate(G.nodes())}
                 G = nx.relabel_nodes(G, mapping)
                 Ns.append(G.number_of_nodes())
@@ -146,7 +154,7 @@ def process(self):
                 data.num_nodes = Ns[-1]
 
                 # Create a one-hot encoded feature matrix denoting the atom
-                # type for the  AIDS700nef dataset:
+                # type (for the `AIDS700nef` dataset):
                 if self.name == 'AIDS700nef':
                     x = torch.zeros(data.num_nodes, dtype=torch.long)
                     for node, info in G.nodes(data=True):
@@ -167,7 +175,9 @@ def process(self):
         assoc = {idx: i for i, idx in enumerate(ids[0])}
         assoc.update({idx: i + len(ids[0]) for i, idx in enumerate(ids[1])})
 
+        # Extracting ground-truth GEDs from the GED pickle file
         path = osp.join(self.raw_dir, self.name, 'ged.pickle')
+        # Initialize GEDs as float('inf'):
         mat = torch.full((len(assoc), len(assoc)), float('inf'))
         with open(path, 'rb') as f:
             obj = pickle.load(f)
@@ -176,9 +186,11 @@ def process(self):
                 xs += [assoc[x]]
                 ys += [assoc[y]]
                 gs += [g]
+            # The pickle file does not contain GEDs for test graph pairs, i.e.
+            # GEDs for (test_graph, test_graph) pairs are still float('inf'):
             x, y = torch.tensor(xs), torch.tensor(ys)
-            g = torch.tensor(gs, dtype=torch.float)
-            mat[x, y], mat[y, x] = g, g
+            ged = torch.tensor(gs, dtype=torch.float)
+            mat[x, y], mat[y, x] = ged, ged
 
         path = osp.join(self.processed_dir, f'{self.name}_ged.pt')
         torch.save(mat, path)