In [1]:
# import torch
import tensorflow as tf
tf.debugging.set_log_device_placement(True)

In [2]:
import os
os.environ['DGLBACKEND'] = 'tensorflow'

In [3]:
import dgl
import networkx as nx
import tqdm.auto as tqdm
import pickle
import numpy as np
import random

import datetime
import socket

# import torch.nn as nn
# import torch.nn.functional as F
import dgl.nn

from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

Using backend: tensorflow


Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0


In [4]:
gpus = tf.config.get_visible_devices('GPU')
is_gpu_available = len(gpus) > 0
if is_gpu_available:
    device = gpus[0]
else:
    cpus = tf.config.get_visible_devices('CPU')
    device = cpus[0]

In [5]:
class Net(tf.keras.Model):
    def __init__(self, in_size, out_size, hidden_size, n_hidden=1, n_heads=1):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        
        self.msg_layers = [dgl.nn.GATConv(in_size, hidden_size, num_heads=self.n_heads, activation=tf.keras.activations.relu)]
        for _ in range(n_hidden - 1):
            self.msg_layers += [
                dgl.nn.GATConv(hidden_size*self.n_heads, hidden_size, num_heads=self.n_heads, activation=tf.keras.activations.relu),
            ]
        
        self.output_layer = tf.keras.layers.Dense(out_size, activation=tf.keras.activations.relu)
        
    def call(self, g, h, training=False):        
        for l in self.msg_layers:
            h = l(g, h)
            h = tf.reshape(h, (-1, self.hidden_size*self.n_heads))
            
        h = self.output_layer(h)
        
        return h

net = Net(18, 2, 64, n_hidden=3, n_heads=2)

Executing op RandomStandardNormal in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0


In [6]:
framework = os.environ['DGLBACKEND']
dgl_graphs, _ = dgl.data.utils.load_graphs(f'../data/dgl_graphs_{framework}.bin')
nx_graphs = nx.read_gpickle(f'../data/nx_graphs_{framework}.pkl')

dataset = list(zip(dgl_graphs, nx_graphs))

train_set, test_set = train_test_split(dataset, train_size=0.7, shuffle=True)
train_set, val_set = train_test_split(train_set, train_size=0.7, shuffle=True)

In [7]:
train_set, _ = map(list, zip(*train_set))
val_set, _ = map(list, zip(*val_set))

In [8]:
net = Net(18, 2, 64, n_hidden=3, n_heads=2)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
criterion = tf.keras.losses.SparseCategoricalCrossentropy(name='loss')

# https://stackoverflow.com/a/8290508/2426888
def batch_iterable(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

Executing op DestroyResourceOp in device /job:localhost/replica:0/task:0/device:GPU:0


In [9]:
print(device)

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [10]:
n_epochs = 100
batch_size = 32

file_name = '{}_{}_{}'.format(datetime.datetime.now().strftime('%b%d_%H-%M-%S'), socket.gethostname(), framework)
writer = tf.summary.create_file_writer(logdir='runs/' + file_name)

pbar = tqdm.trange(n_epochs)
with writer.as_default():
    for epoch in pbar:

        random.shuffle(train_set)
        epoch_loss = 0
        for batch_i, batch in enumerate(batch_iterable(train_set, n=batch_size)):
            batch = dgl.batch(batch).to(device)
            x = batch.ndata['x']
            y = batch.ndata['y']

            pos_weight = len(y)/tf.math.reduce_sum(y) - 1
            w = np.ones(len(y))
            w[tf.squeeze(y) == 1] = pos_weight

            with tf.GradientTape() as tape:
                y_pred = net(batch, x)
                loss = criterion(y, y_pred, sample_weight=w)
            grads = tape.gradient(loss, net.trainable_weights)
            optimizer.apply_gradients(zip(grads, net.trainable_weights))

            epoch_loss += loss

        epoch_loss /= (batch_i + 1)
        tf.summary.scalar("Loss/train", epoch_loss, step=epoch)

        # Validation
        batch = dgl.batch(val_set)

        x = batch.ndata['x']
        y = batch.ndata['y']
        
        pos_weight = len(y)/tf.math.reduce_sum(y) - 1
        w = np.ones(len(y))
        w[tf.squeeze(y) == 1] = pos_weight
        
        y_pred = net(batch, x)
        val_loss = criterion(y, y_pred, sample_weight=w)
        tf.summary.scalar("Loss/validation", val_loss, step=epoch)
        
        pbar.set_postfix({
            'Train Loss': '{:.4f}'.format(epoch_loss),
            'Validation Loss': '{:.4f}'.format(val_loss),
        })
    
        writer.flush()

writer.close()

Executing op Add in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op SummaryWriter in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op CreateSummaryFileWriter in device /job:localhost/replica:0/task:0/device:CPU:0


  0%|          | 0/100 [00:00<?, ?it/s]

Executing op Identity in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ConcatV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ConcatV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ConcatV2 in device /job:localhost/replica:0/task:0/device:GPU:0


DGLError: Cannot assign node feature "e" on device /gpu:0 to a graph on device /cpu:0. Call DGLGraph.to() to copy the graph to the same device.