Skip to content

Commit

Permalink
Updated GEDDataset documentation (#3501)
Browse files Browse the repository at this point in the history
* Documented ged_dataset.py

More Documentation on the way

* clean up

* fix indent

* Updated Documentation

Updated caveats regarding the dataset

* clean up

* Added more documentation for the dataset

I added comments describing what each function does while extracting the graphs that are already hosted on Google Drive. This documentation should make it easier for people who want to do a similar kind of pre-processing for graph similarity computation.

* Updated GED-Dataset Documentation

Added extensive documentation about the inner workings of the GEDDataset class of PyG, including the pre-processing steps, to aid practitioners who might want to use similar pre-processing. A detailed explanation is given for each step.

* Updated dataset documentation

Added extensive documentation about the inner workings of the GEDDataset class of PyG, including the pre-processing steps, to aid practitioners who might want to use similar pre-processing. A detailed explanation is given for each step.

* Updated Documentation for GEDDataset

Added extensive documentation about the inner workings of the GEDDataset class of PyG, including the pre-processing steps, to aid practitioners who might want to use similar pre-processing. A detailed explanation is given for each step.

* Updated GEDDataset Documentation

Added extensive documentation about the inner workings of the GEDDataset class of PyG, including the pre-processing steps, to aid practitioners who might want to use similar pre-processing. A detailed explanation is given for each step.

* update

Co-authored-by: rusty1s <matthias.fey@tu-dortmund.de>
  • Loading branch information
kunind27 and rusty1s committed Nov 15, 2021
1 parent 8830505 commit 83d929c
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions torch_geometric/datasets/ged_dataset.py
Expand Up @@ -103,18 +103,22 @@ def __init__(self, root: str, name: str, train: bool = True,

@property
def raw_file_names(self) -> List[str]:
    """Relative paths of the raw split folders for the selected dataset.

    Each entry is ``self.name`` joined with a split folder name, e.g.
    ``['LINUX/train', 'LINUX/test']`` when ``self.name == 'LINUX'``.
    """
    split_dirs = ('train', 'test')
    return [osp.join(self.name, split) for split in split_dirs]

@property
def processed_file_names(self) -> List[str]:
    """File names of the processed splits for the selected dataset.

    The names follow the pattern ``'<name>_<split>.pt'``, e.g.
    ``['LINUX_training.pt', 'LINUX_test.pt']`` when ``self.name == 'LINUX'``.
    """
    prefix = self.name
    return [f'{prefix}_{split}.pt' for split in ('training', 'test')]

def download(self):
    """Download the raw graphs and the pre-computed GED pickle file.

    Both downloads are resolved through ``self.datasets[self.name]`` and
    ``self.url`` and are placed in ``self.raw_dir``.
    """
    # Fetch the graph archive, unpack it into the raw directory with the
    # dataset-specific extractor, and delete the archive afterwards:
    archive_id = self.datasets[self.name]['id']
    archive_path = download_url(self.url.format(archive_id), self.raw_dir)
    extract_fn = self.datasets[self.name]['extract']
    extract_fn(archive_path, self.raw_dir)
    os.unlink(archive_path)

    # Fetch the pickle file holding the pre-computed GEDs and move it to
    # `<raw_dir>/<name>/ged.pickle`:
    pickle_id = self.datasets[self.name]['pickle']
    pickle_path = download_url(self.url.format(pickle_id), self.raw_dir)
    target_path = osp.join(self.raw_dir, self.name, 'ged.pickle')
    os.rename(pickle_path, target_path)
Expand All @@ -123,16 +127,20 @@ def process(self):
import networkx as nx

ids, Ns = [], []
# Iterating over paths for raw and processed data (train + test):
for r_path, p_path in zip(self.raw_paths, self.processed_paths):
# Find the paths of all raw graphs:
names = glob.glob(osp.join(r_path, '*.gexf'))
# Get the graph IDs given by the file name:
# Get sorted graph IDs given filename: 123.gexf -> 123
ids.append(sorted([int(i.split(os.sep)[-1][:-5]) for i in names]))

data_list = []
# Convert graphs in .gexf format to a NetworkX Graph:
for i, idx in enumerate(ids[-1]):
i = i if len(ids) == 1 else i + len(ids[0])
# Reading the raw `*.gexf` graph:
G = nx.read_gexf(osp.join(r_path, f'{idx}.gexf'))
# Mapping of nodes in `G` to a contiguous number:
mapping = {name: j for j, name in enumerate(G.nodes())}
G = nx.relabel_nodes(G, mapping)
Ns.append(G.number_of_nodes())
Expand All @@ -146,7 +154,7 @@ def process(self):
data.num_nodes = Ns[-1]

# Create a one-hot encoded feature matrix denoting the atom
# type for the AIDS700nef dataset:
# type (for the `AIDS700nef` dataset):
if self.name == 'AIDS700nef':
x = torch.zeros(data.num_nodes, dtype=torch.long)
for node, info in G.nodes(data=True):
Expand All @@ -167,7 +175,9 @@ def process(self):
assoc = {idx: i for i, idx in enumerate(ids[0])}
assoc.update({idx: i + len(ids[0]) for i, idx in enumerate(ids[1])})

# Extracting ground-truth GEDs from the GED pickle file
path = osp.join(self.raw_dir, self.name, 'ged.pickle')
# Initialize GEDs as float('inf'):
mat = torch.full((len(assoc), len(assoc)), float('inf'))
with open(path, 'rb') as f:
obj = pickle.load(f)
Expand All @@ -176,9 +186,11 @@ def process(self):
xs += [assoc[x]]
ys += [assoc[y]]
gs += [g]
# The pickle file does not contain GEDs for test graph pairs, i.e.
# GEDs for (test_graph, test_graph) pairs are still float('inf'):
x, y = torch.tensor(xs), torch.tensor(ys)
g = torch.tensor(gs, dtype=torch.float)
mat[x, y], mat[y, x] = g, g
ged = torch.tensor(gs, dtype=torch.float)
mat[x, y], mat[y, x] = ged, ged

path = osp.join(self.processed_dir, f'{self.name}_ged.pt')
torch.save(mat, path)
Expand Down

0 comments on commit 83d929c

Please sign in to comment.