# install

In [1]:
! pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.0.4.tar.gz (407 kB)
[?25l[K     |▉                               | 10 kB 30.9 MB/s eta 0:00:01[K     |█▋                              | 20 kB 33.1 MB/s eta 0:00:01[K     |██▍                             | 30 kB 16.2 MB/s eta 0:00:01[K     |███▏                            | 40 kB 6.8 MB/s eta 0:00:01[K     |████                            | 51 kB 7.0 MB/s eta 0:00:01[K     |████▉                           | 61 kB 8.3 MB/s eta 0:00:01[K     |█████▋                          | 71 kB 8.8 MB/s eta 0:00:01[K     |██████▍                         | 81 kB 8.3 MB/s eta 0:00:01[K     |███████▎                        | 92 kB 9.3 MB/s eta 0:00:01[K     |████████                        | 102 kB 7.6 MB/s eta 0:00:01[K     |████████▉                       | 112 kB 7.6 MB/s eta 0:00:01[K     |█████████▋                      | 122 kB 7.6 MB/s eta 0:00:01[K     |██████████▌                     | 133 kB 7.6 MB/s eta 0:00:

In [2]:
! pip install torch_sparse

Collecting torch_sparse
  Downloading torch_sparse-0.6.13.tar.gz (48 kB)
[?25l[K     |██████▊                         | 10 kB 39.8 MB/s eta 0:00:01[K     |█████████████▌                  | 20 kB 34.6 MB/s eta 0:00:01[K     |████████████████████▎           | 30 kB 12.7 MB/s eta 0:00:01[K     |███████████████████████████     | 40 kB 6.0 MB/s eta 0:00:01[K     |████████████████████████████████| 48 kB 3.7 MB/s 
Building wheels for collected packages: torch-sparse
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hdone
  Created wheel for torch-sparse: filename=torch_sparse-0.6.13-cp37-cp37m-linux_x86_64.whl size=1669162 sha256=36dee5e4709e752a0796bfcb6bfb17c187480843acb2ca7a93b1af36d6e64e6c
  Stored in directory: /root/.cache/pip/wheels/e0/01/be/6b2966e0ff20bb023ae35e5d17903e6e5b4df46dd5892f6be6
Successfully built torch-sparse
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.13


In [3]:
! pip install torch_scatter

Collecting torch_scatter
  Downloading torch_scatter-2.0.9.tar.gz (21 kB)
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl size=3874186 sha256=7df852bb525b06a4a0e9bbd2cd1e690b24fff0bc12c0fa11d7b592357d4d11ad
  Stored in directory: /root/.cache/pip/wheels/dd/57/a3/42ea193b77378ce634eb9454c9bc1e3163f3b482a35cdee4d1
Successfully built torch-scatter
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9


# Creating Own Datasets 

abstract classes for datasets:
- torch_geometric.data.Dataset
- torch_geometric.data.InMemoryDataset
  - [InMemoryDataset] inherits from [Dataset] and should be used if the whole dataset fits into CPU memory


- torchvision 처럼 각각의 dataset은 'root folder'(dataset 저장되는 폴더)를 받는데 PyG에서는 이걸 'raw_dir' 폴더와 'processed_dir' 폴더로 나눔
  - raw_dir: 데이터셋 다운로드 하는 경로
  - processed_dir: 처리된 데이터셋이 저장되는 경로

- 각각의 dataset은 'transform', 'pre_transform', 'pre_filter' 함수를 인자로 받을 수 있는데 모두 default는 None
  - transform: transforms the data object before it is accessed, so it is best used for data augmentation.
  - pre_transform: applies the transformation before saving the data objects to disk - so it is best used for heavy precomputation which needs to be only done once.
  - pre_filter: filters out data objects before saving

# Creating "In Memory Datasets"

FOUR fundamental methods for creating a 'pyg.data.InMemoryDataset'
 1. torch_geometric.data.InMemoryDataset.raw_file_names()
  : raw_dir에 있는 파일 리스트
 2. torch_geometric.data.InMemoryDataset.processed_file_names()
  : processed_dir에 있는 파일 리스트
 3. torch_geometric.data.InMemoryDataset.download()
  : raw data를 raw_dir에 다운로드
 4. torch_geometric.data.InMemoryDataset.process()
  : raw data를 process하여 processed_dir에 저장

In [4]:
import torch
from torch_geometric.data import InMemoryDataset, download_url

class MyOwnDataset(InMemoryDataset):
  """Template for a dataset whose collated contents are loaded in one piece.

  This is a skeleton: the `...` placeholders and the `url` name used in
  `download()` have to be filled in before the class can actually be used.

  Args:
    root: forwarded to `InMemoryDataset.__init__`.
    transform: forwarded to `InMemoryDataset.__init__` (default None).
    pre_transform: applied to every data object in `process()` (default None).
    pre_filter: predicate used in `process()` to keep data objects (default None).
  """

  def __init__(self, root, transform=None, pre_transform=None,
               pre_filter=None):
    super().__init__(root, transform, pre_transform, pre_filter)
    # The first processed file holds the (data, slices) pair written by process().
    collated_path = self.processed_paths[0]
    self.data, self.slices = torch.load(collated_path)

  @property
  def raw_file_names(self):
    """List of raw file names (placeholder entries)."""
    raw_names = ['some_file_1', 'some_file_2', ...]
    return raw_names

  @property
  def processed_file_names(self):
    """List of processed file names; a single collated file here."""
    processed_names = ['data.pt']
    return processed_names

  def download(self):
    """Download the raw data into 'self.raw_dir'."""
    download_url(url, self.raw_dir)
    ...

  def process(self):
    """Filter, transform, collate and save the list of data objects."""
    # Read the data into one huge list of 'Data' objects.
    graphs = [...]  # a list of Data objects

    # Keep only the objects accepted by the optional pre_filter.
    if self.pre_filter is not None:
      graphs = list(filter(self.pre_filter, graphs))

    # Apply the optional pre_transform to every remaining object.
    if self.pre_transform is not None:
      graphs = list(map(self.pre_transform, graphs))

    # Collate the list into one huge Data object via .collate(), because
    # saving a huge Python list is slow. The collated object concatenates all
    # examples, and the returned slices dictionary is what allows single
    # examples to be reconstructed from it.
    collated, slice_dict = self.collate(graphs)

    torch.save((collated, slice_dict), self.processed_paths[0])



# Creating "Larger" Datasets

- torch_geometric.data.Dataset
  - can be used for creating datasets which do not fit into memory
  - it closely follows the concepts of the torchvision datasets. 
  - It expects the following methods to be implemented in addition:
    - torch_geometric.data.Dataset.len(): Returns the number of examples in your dataset.
    - torch_geometric.data.Dataset.get(): Implements the logic to load a single graph.

- `torch_geometric.data.Dataset.__getitem__()`
  - gets data objects from torch_geometric.data.Dataset.get() and optionally transforms them according to transform.

Let’s see this process in a simplified example:

In [5]:
import os.path as osp

import torch
from torch_geometric.data import Dataset, download_url


class MyOwnDataset(Dataset):
  """Template for a dataset that stores every graph in its own processed file.

  This is a skeleton: the `...` placeholders and the `url` name used in
  `download()` have to be filled in before the class can actually be used.

  Args:
    root: forwarded to `Dataset.__init__`.
    transform: forwarded to `Dataset.__init__` (default None).
    pre_transform: applied to each data object in `process()` (default None).
    pre_filter: predicate used in `process()` to skip data objects (default None).
  """

  def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
    super().__init__(root, transform, pre_transform, pre_filter)

  @property
  def raw_file_names(self):
    """List of raw file names (placeholder entries)."""
    return ['some_file_1', 'some_file_2', ...]

  @property
  def processed_file_names(self):
    """List of processed file names, one per saved graph.

    Numbering starts at 0 so the names match the `data_{idx}.pt` files that
    `process()` writes and `get()` reads.
    """
    return ['data_0.pt', 'data_1.pt', ...]

  def download(self):
    """Download the raw data into `self.raw_dir`."""
    path = download_url(url, self.raw_dir)
    ...

  def process(self):
    """Convert every raw file into a data object and save it individually."""
    # Imported here because the cell-level import only brings in
    # `Dataset` and `download_url`.
    from torch_geometric.data import Data

    idx = 0
    for raw_path in self.raw_paths:
      # Read data from `raw_path`.
      data = Data(...)

      # Skip objects rejected by the optional pre_filter; `idx` is not
      # advanced, so the saved files stay consecutively numbered.
      if self.pre_filter is not None and not self.pre_filter(data):
        continue

      if self.pre_transform is not None:
        data = self.pre_transform(data)

      torch.save(data, osp.join(self.processed_dir, f'data_{idx}.pt'))
      idx += 1

  def len(self):
    """Return the number of entries in `processed_file_names`."""
    return len(self.processed_file_names)

  def get(self, idx):
    """Load and return the data object saved as `data_{idx}.pt`."""
    data = torch.load(osp.join(self.processed_dir, f'data_{idx}.pt'))
    return data

# Here, each graph data object gets saved individually in process(), and is manually loaded in get().

# Exercises

Consider the following InMemoryDataset constructed from a list of Data objects:

In [6]:
class MyDataset(InMemoryDataset):
  """In-memory dataset built from an already constructed list of data objects.

  Args:
    root: forwarded to `InMemoryDataset.__init__`.
    data_list: the list of data objects that `process()` collates and saves.
    transform: forwarded to `InMemoryDataset.__init__` (default None).
  """

  def __init__(self, root, data_list, transform=None):
    # Stored before the base-class initializer runs so that process() can
    # read it if it is invoked from there.
    self.data_list = data_list
    # `super()` must be called: the bare `super` type has no bound __init__
    # for this instance and raises a TypeError.
    super().__init__(root, transform)
    self.data, self.slices = torch.load(self.processed_paths[0])

  @property
  def processed_file_names(self):
    """Name of the single processed file."""
    return 'data.pt'
  
  def process(self):
    """Collate `self.data_list` and save the result to the first processed path."""
    torch.save(self.collate(self.data_list), self.processed_paths[0])


1. What is the output of self.processed_paths[0]?
2. What does collate() do?

## A Beginner’s Guide to Graph Neural Networks Using PyTorch Geometric — Part 1 by R. Teja (일부)

출처: https://towardsdatascience.com/a-beginners-guide-to-graph-neural-networks-using-pytorch-geometric-part-1-d98dc93e7742

In [None]:
# dataset: Zachary's Karate Club
# 34 nodes (students in the club)
# 78 edges (interactions between pairs of members outside the club)
# 2 labels (2 factions)

### preparations

In [45]:
import networkx as nx
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# Load the karate club graph that ships with networkx.
G = nx.karate_club_graph()

# One integer label per node: 0 when the node's 'club' attribute is 'Mr. Hi',
# 1 otherwise.
labels = np.asarray(
    [attrs['club'] != 'Mr. Hi' for _, attrs in G.nodes(data=True)]
).astype(np.int64)
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [46]:
# Show the node ids and the attribute dictionaries of two example nodes.
print(f'G.nodes: {G.nodes}')
print(f'G.nodes[1]: {G.nodes[1]}')
# The label now names the node that is actually printed (33, not 31).
print(f'G.nodes[33]: {G.nodes[33]}')

G.nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
G.nodes[1]: {'club': 'Mr. Hi'}
G.nodes[31]: {'club': 'Officer'}


In [48]:
# Adjacency matrix of G as a SciPy sparse matrix; the bare `adj` displays its summary.
# NOTE(review): `nx.to_scipy_sparse_matrix` is believed to be deprecated in
# networkx 2.7 and removed in 3.0 — confirm against the installed version.
adj = nx.to_scipy_sparse_matrix(G)
adj

<34x34 sparse matrix of type '<class 'numpy.longlong'>'
	with 156 stored elements in Compressed Sparse Row format>

In [56]:
# Create the edge index (shape 2 x num_stored_entries) from the COO form of
# the graph's adjacency matrix.
# `to_scipy_sparse_matrix` was removed in networkx 3.0; use its replacement
# `to_scipy_sparse_array` when the installed networkx provides it and fall
# back to the old function otherwise.
_to_sparse = getattr(nx, 'to_scipy_sparse_array', None) or nx.to_scipy_sparse_matrix
adj = _to_sparse(G).tocoo()

# Cast to int64 in NumPy so that `torch.from_numpy` yields `torch.long`
# tensors directly (no further conversion needed).
row = torch.from_numpy(adj.row.astype(np.int64))
col = torch.from_numpy(adj.col.astype(np.int64))
edge_index = torch.stack([row, col], dim=0)

print(f'adj:{adj}\n\nrow:{row}\n\ncol:{col}\n\nedge_index:{edge_index}')

adj:  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 10)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	1
  (0, 17)	1
  (0, 19)	1
  (0, 21)	1
  (0, 31)	1
  (1, 0)	1
  (1, 2)	1
  (1, 3)	1
  (1, 7)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 21)	1
  (1, 30)	1
  :	:
  (32, 18)	1
  (32, 20)	1
  (32, 22)	1
  (32, 23)	1
  (32, 29)	1
  (32, 30)	1
  (32, 31)	1
  (32, 33)	1
  (33, 8)	1
  (33, 9)	1
  (33, 13)	1
  (33, 14)	1
  (33, 15)	1
  (33, 18)	1
  (33, 19)	1
  (33, 20)	1
  (33, 22)	1
  (33, 23)	1
  (33, 26)	1
  (33, 27)	1
  (33, 28)	1
  (33, 29)	1
  (33, 30)	1
  (33, 31)	1
  (33, 32)	1

row:tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,
         3,  3,  3,  3,  3,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,
         7,  7,  8,  8,  8,  8,  8,  9,  9, 10, 10, 10, 11, 12, 12, 13, 13, 13,
        13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18

In [53]:
# Use each node's degree as its embedding (one value per node).
embeddings = np.array([degree for _, degree in G.degree()])
embeddings

array([16,  9, 10,  6,  3,  4,  4,  4,  5,  2,  3,  1,  2,  5,  2,  2,  2,
        2,  2,  3,  2,  2,  2,  5,  3,  3,  2,  4,  3,  4,  4,  6, 12, 17])

In [58]:
# Standardize the degree values with StandardScaler.
scale = StandardScaler()
# Reshape to a single column, i.e. shape (n, 1), before fitting.
degree_column = embeddings.reshape(-1, 1)
embeddings = scale.fit_transform(degree_column)
print(f'scaled embeddings:\n{embeddings}')

scaled embeddings:
[[ 2.98709092]
 [ 1.15480319]
 [ 1.41655858]
 [ 0.36953702]
 [-0.41572915]
 [-0.15397376]
 [-0.15397376]
 [-0.15397376]
 [ 0.10778163]
 [-0.67748454]
 [-0.41572915]
 [-0.93923993]
 [-0.67748454]
 [ 0.10778163]
 [-0.67748454]
 [-0.67748454]
 [-0.67748454]
 [-0.67748454]
 [-0.67748454]
 [-0.41572915]
 [-0.67748454]
 [-0.67748454]
 [-0.67748454]
 [ 0.10778163]
 [-0.41572915]
 [-0.41572915]
 [-0.67748454]
 [-0.15397376]
 [-0.41572915]
 [-0.15397376]
 [-0.15397376]
 [ 0.36953702]
 [ 1.94006936]
 [ 3.24884631]]


### The Custom Dataset

In [66]:
# Number of nodes in G (displayed as the cell output).
G.number_of_nodes()

34

In [61]:
import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import torch_geometric.transforms as T

# custom dataset
class KarateDataset(InMemoryDataset):
  """Single-graph dataset built in memory from the karate club graph.

  Relies on the notebook-level names `G`, `edge_index`, `embeddings` and
  `labels` defined in earlier cells.

  Args:
    transform: optional transform forwarded to `InMemoryDataset.__init__`.
  """

  def __init__(self, transform=None):
    # Positional arguments: root, transform, pre_transform, pre_filter.
    # root is '.', presumably the current working directory — TODO confirm.
    super(KarateDataset, self).__init__('.', transform, None, None)

    n_nodes = G.number_of_nodes()

    data = Data(edge_index = edge_index) # edge_index is defined in an earlier cell
    data.num_nodes = n_nodes

    # embedding
    data.x = torch.from_numpy(embeddings).type(torch.float32) # embeddings is defined in an earlier cell

    # labels
    y = torch.from_numpy(labels).type(torch.long) # labels is defined in an earlier cell
    data.y = y.clone().detach()

    data.num_classes = 2

    # Split the node ids into train and test parts (70% / 30%); the label
    # halves of the split are not used afterwards.
    X_train, X_test, _y_train, _y_test = train_test_split(pd.Series(list(G.nodes())),
                                                          pd.Series(labels),
                                                          test_size = 0.30,
                                                          random_state = 42)

    # Create boolean train and test masks over the nodes; the Series index
    # gives each selected node's position in the node list.
    train_mask = torch.zeros(n_nodes, dtype = torch.bool)
    test_mask = torch.zeros(n_nodes, dtype = torch.bool)
    train_mask[X_train.index] = True
    test_mask[X_test.index] = True
    data['train_mask'] = train_mask
    data['test_mask'] = test_mask

    # Collate the single graph into the (data, slices) form.
    self.data, self.slices = self.collate([data])

  # The two hooks below are overridden as no-ops — presumably so that the base
  # class neither downloads nor processes any files; verify against the
  # installed torch_geometric version.
  def _download(self):
    return

  def _process(self):
    return
  
  def __repr__(self):
    """Return the class name followed by '()'."""
    # `__name__` (not `name__`) is the attribute that holds the class name.
    return '{}()'.format(self.__class__.__name__)


# Build the dataset and take its element at index 0.
dataset = KarateDataset()
data = dataset[0]

In [65]:
# Display the data object stored at index 0 of the dataset.
dataset[0]

Data(edge_index=[2, 156], num_nodes=34, x=[34, 1], y=[34], num_classes=2, train_mask=[34], test_mask=[34])