In [3]:
from torch_geometric.datasets import OGB_MAG

dataset = OGB_MAG(root='./data', preprocess='metapath2vec')
data = dataset[0]

In [4]:
data

HeteroData(
  [1mpaper[0m={
    x=[736389, 128],
    year=[736389],
    y=[736389],
    train_mask=[736389],
    val_mask=[736389],
    test_mask=[736389]
  },
  [1mauthor[0m={ x=[1134649, 128] },
  [1minstitution[0m={ x=[8740, 128] },
  [1mfield_of_study[0m={ x=[59965, 128] },
  [1m(author, affiliated_with, institution)[0m={ edge_index=[2, 1043998] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 7145660] },
  [1m(paper, cites, paper)[0m={ edge_index=[2, 5416271] },
  [1m(paper, has_topic, field_of_study)[0m={ edge_index=[2, 7505078] }
)

In [7]:
import os
import networkx as nx
import json
from tqdm import tqdm
import torch
import torch_geometric

def nx2hetero(G):
	name2int = dict()
	int2name = dict()
	for i,name in enumerate(list(G.nodes)):
		name2int[name] = i
		int2name[i] = name

	node_types = set([node[1]["node_type"] for node in G.nodes(data=True)])
	nodes_by_type = dict()
	for node_type in node_types:
		nodes_by_type[node_type] = [node[1] for node in list(G.nodes(data=True)) if node[1]["node_type"] == node_type][:10]
	nodes_by_type

	# build node index
	playlists = []
	tracks = []
	num_artists = 0
	num_albums = 0
	for node in G.nodes(data=True):
		t = node[1]["node_type"]
		if t == "playlist":
			playlists += [node[1]["num_followers"]]
		elif t == "track":
			tracks += [node[1]["duration"]]
		elif t == "artist":
			num_artists += 1
		elif t == "album":
			num_albums += 1

	# build edge_index
	playlist_track = []
	album_track = []
	artist_track = []

	for edge in G.edges(data=True):
		if G[edge[0]][edge[1]]["edge_type"] == "track-playlist":
			playlist_track += [(name2int[edge[0]], name2int[edge[1]])]
		elif G[edge[0]][edge[1]]["edge_type"] == "track-album":
			album_track += [(name2int[edge[0]], name2int[edge[1]])]
		elif G[edge[0]][edge[1]]["edge_type"] == "track-artist":
			artist_track += [(name2int[edge[0]], name2int[edge[1]])]
		
		node_start = edge[0].split(":")[1]
		node_end = edge[1].split(":")[1]

		if node_end != "track":
			node_start, node_end = node_end, node_start		

	# construct HeteroData
	hetero = torch_geometric.data.HeteroData()

	# add initial node features
	hetero["playlist"].x = torch.IntTensor(playlists).reshape(-1,1)
	hetero["track"].x = torch.IntTensor(tracks).reshape(-1,1)
	hetero["artist"].x = torch.IntTensor([1 for _ in range(num_artists)]).reshape(-1,1)
	hetero["album"].x = torch.IntTensor([1 for _ in range(num_albums)]).reshape(-1,1)

	# add edge indices
	hetero["playlist", "contains", "track"].edge_index = torch.tensor(playlist_track).t()
	hetero["album", "includes", "track"].edge_index = torch.tensor(album_track).t()
	hetero["artist", "authors", "track"].edge_index = torch.tensor(artist_track).t()

	return hetero


In [11]:
import pickle
base = "spotify_million_playlist_dataset"
pickles = base + "/pickles"
graph_path = os.path.join(pickles, "G_example.pkl")

G = pickle.load(open(graph_path, "rb"))

our_data = nx2hetero(G)

In [12]:
# create training mask for playlist nodes
train_mask = torch.zeros(our_data["playlist"].x.shape[0], dtype=torch.bool)
train_mask[torch.randperm(train_mask.shape[0])[:int(train_mask.shape[0]*0.8)]] = True

our_data["playlist"].train_mask = train_mask

In [13]:
our_data

HeteroData(
  [1mplaylist[0m={
    x=[1000, 1],
    train_mask=[1000]
  },
  [1mtrack[0m={ x=[35289, 1] },
  [1martist[0m={ x=[10091, 1] },
  [1malbum[0m={ x=[20469, 1] },
  [1m(playlist, contains, track)[0m={ edge_index=[2, 66331] },
  [1m(album, includes, track)[0m={ edge_index=[2, 35289] },
  [1m(artist, authors, track)[0m={ edge_index=[2, 35289] }
)

In [20]:
our_data.metadata()

(['playlist', 'track', 'artist', 'album'],
 [('playlist', 'contains', 'track'),
  ('album', 'includes', 'track'),
  ('artist', 'authors', 'track'),
  ('track', 'rev_contains', 'playlist'),
  ('track', 'rev_includes', 'album'),
  ('track', 'rev_authors', 'artist')])

In [15]:
data = our_data

In [16]:
data.is_undirected()

False

In [17]:
homogeneous_data = data.to_homogeneous()
homogeneous_data

Data(edge_index=[2, 136909], x=[66849, 1], train_mask=[66849], node_type=[66849], edge_type=[136909])

In [18]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)
# data = T.NormalizeFeatures()(data)

In [19]:
data.is_undirected()

True

In [21]:
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import SAGEConv, to_hetero
import torch
import torch.nn.functional as F

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=64, out_channels=dataset.num_classes)
model = to_hetero(model, data.metadata(), aggr='sum')
# model = model.to('cuda:0')

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [23]:
data.validate()

ValueError: 'edge_index' of edge type ('playlist', 'contains', 'track') contains larger source indices than the number of nodes (1000) of this node type in 'HeteroData' (found 66694)

In [None]:
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.loader import NeighborLoader

# transform = T.ToUndirected()  # Add reverse edge types.
# data = OGB_MAG(root='./data', preprocess='metapath2vec', transform=transform)[0]

train_loader = NeighborLoader(
    data,
    # Sample 15 neighbors for each node and each edge type for 2 iterations:
    num_neighbors=[15] * 2,
    # num_neighbors = {key: [15] * 2 for key in data.edge_types} will sample different ammounts for each node type
    # Use a batch size of 128 for sampling training nodes of type "paper":
    batch_size=128,
    input_nodes=('playlist', data['playlist'].train_mask),
)

batch = next(iter(train_loader))

: 

: 

In [None]:
batch

HeteroData(
  [1mpaper[0m={
    x=[20796, 128],
    year=[20796],
    y=[20796],
    train_mask=[20796],
    val_mask=[20796],
    test_mask=[20796],
    input_id=[128],
    batch_size=128
  },
  [1mauthor[0m={ x=[4454, 128] },
  [1minstitution[0m={ x=[306, 128] },
  [1mfield_of_study[0m={ x=[2584, 128] },
  [1m(author, affiliated_with, institution)[0m={ edge_index=[2, 0] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 5916] },
  [1m(paper, cites, paper)[0m={ edge_index=[2, 11751] },
  [1m(paper, has_topic, field_of_study)[0m={ edge_index=[2, 10573] },
  [1m(institution, rev_affiliated_with, author)[0m={ edge_index=[2, 821] },
  [1m(paper, rev_writes, author)[0m={ edge_index=[2, 5484] },
  [1m(field_of_study, rev_has_topic, paper)[0m={ edge_index=[2, 10432] }
)

In [None]:
def train():
    model.train()

    total_examples = total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # batch = batch.to('cuda:0')
        batch_size = batch['paper'].batch_size
        out = model(batch.x_dict, batch.edge_index_dict)
        loss = F.cross_entropy(out['paper'][:batch_size],
                               batch['paper'].y[:batch_size])
        loss.backward()
        optimizer.step()

        total_examples += batch_size
        print(f'Loss: {loss:.4f}')
        total_loss += float(loss) * batch_size

    return total_loss / total_examples

In [None]:
train()

ValueError: `MessagePassing.propagate` only supports integer tensors of shape `[2, num_messages]`, `torch_sparse.SparseTensor` or `torch.sparse.Tensor` for argument `edge_index`.