In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from dgl.data import RedditDataset

Using backend: pytorch


# Read Dataset

In [3]:
dataset = RedditDataset()
g = dataset[0]

In [4]:
print("Number of categories:", dataset.num_classes)
print("Node features")
print(g.ndata)
print("Edge features")
print(g.edata)
print("Edges")
print(g.edges())

Number of categories: 41
Node features
{'label': tensor([30, 17, 18,  ...,  3, 13, 13]), 'feat': tensor([[ 1.2334,  9.0430, -0.9233,  ..., -0.2579,  0.3112, -0.3772],
        [-0.1386, -0.2022,  0.1277,  ...,  0.1563,  0.1048, -0.6534],
        [-0.1330, -0.1962, -0.0296,  ...,  0.0358,  0.2864,  0.2744],
        ...,
        [-0.0614, -0.2022,  0.9698,  ...,  1.1064, -1.4323, -0.2398],
        [-0.1606, -0.2022, -0.0892,  ...,  0.7440, -0.5046, -2.2288],
        [ 0.0929,  0.2822,  0.1768,  ...,  0.2196,  0.5967,  0.5588]]), 'test_mask': tensor([ True, False,  True,  ..., False, False,  True]), 'train_mask': tensor([False,  True, False,  ...,  True,  True, False]), 'val_mask': tensor([False, False, False,  ..., False, False, False])}
Edge features
{}
Edges
(tensor([     0,      0,      0,  ..., 232920, 232931, 232952]), tensor([225202, 177307, 107546,  ..., 232897, 232907, 232910]))


### Assume that we will be reading in the data from csv files

##### First, Save the node and edge data to csv files

In [11]:
node_features = g.ndata['feat'].tolist()
node_labels = g.ndata['label'].tolist()
print("Data count:", len(node_features))

node_df = pd.DataFrame(node_features)
node_df['node_ID'] = range(len(node_features))
node_df['label'] = node_labels
node_df.to_csv('data/reddit_nodes.csv', index=False)

Data count: 232965


In [12]:
from_nodes = g.edges()[0].tolist()
to_nodes = g.edges()[1].tolist()
edge_df = pd.DataFrame()
edge_df['from_node'] = from_nodes
edge_df['to_node'] = to_nodes
edge_df.to_csv('data/reddit_edges.csv', index=True, index_label="edge_ID")

##### Read again from the csv files

Reference: https://github.com/dglai/WWW20-Hands-on-Tutorial/blob/master/basic_tasks/1_load_data.ipynb

In [10]:
node_df = pd.read_csv("../data/reddit_nodes.csv").set_index('node_ID')
edge_df = pd.read_csv("../data/reddit_edges.csv").set_index("edge_ID")

In [11]:
node_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,593,594,595,596,597,598,599,600,601,label
node_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.233415,9.043012,-0.92328,1.054183,-1.112501,-0.02063,0.04254,2.152007,-0.908296,0.713842,...,-0.457901,0.002277,-0.178169,0.095668,-0.497469,-0.443911,-0.257895,0.311193,-0.377212,30
1,-0.138552,-0.202219,0.127716,-0.418801,0.106761,0.302031,-0.936621,-0.980957,-0.098751,0.62978,...,0.238544,0.461295,0.114464,-0.408528,-0.331293,0.854006,0.156271,0.104781,-0.65342,17
2,-0.133042,-0.196239,-0.02956,0.306849,0.07384,1.347005,0.695408,-0.657161,1.141495,-1.337327,...,0.989196,-1.283592,-1.252432,-1.707226,-0.860897,0.809932,0.035802,0.286367,0.274413,18
3,-0.138552,-0.202219,0.183453,0.567036,0.372372,-0.131731,-0.129563,0.387311,1.669948,0.048453,...,-1.396239,0.0101,0.204658,-0.495959,-0.32971,-0.363401,-0.440687,0.006481,0.103306,23
4,-0.155081,0.013065,-0.98674,1.667574,1.596521,-0.296647,-0.056293,0.792982,-0.782269,0.740889,...,-0.517946,-0.264403,0.521236,1.05679,-0.498133,0.213124,-1.222432,-0.916983,0.488467,22


In [12]:
edge_df.head()

Unnamed: 0_level_0,from_node,to_node
edge_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,225202
1,0,177307
2,0,107546
3,0,15240
4,0,176069


In [79]:
node_features = node_df[[col for col in node_df.columns if col != "label"]].values
node_labels = node_df['label'].tolist() if 'label' in node_df.columns else None
edge_features = edge_df[[col for col in edge_df.columns if col not in ['label', 'from_node', 'to_node']]].values
edge_labels = edge_df['label'].tolist() if 'label' in edge_df.columns else None
edge_src = edge_df['from_node'].to_numpy()
edge_dst = edge_df['to_node'].to_numpy()

# Turn them into DGL graph

In [16]:
g = dgl.graph((edge_src, edge_dst))
print(g)

Graph(num_nodes=232965, num_edges=114615892,
      ndata_schemes={}
      edata_schemes={})


In [80]:
# Loading features
g.ndata['feat'] = torch.tensor(node_features)
g.ndata['label'] = torch.tensor(node_labels)
g.edata['feat'] = torch.tensor(edge_features)

In [33]:
print(g)

Graph(num_nodes=232965, num_edges=114615892,
      ndata_schemes={'feat': Scheme(shape=(602,), dtype=torch.float64)}
      edata_schemes={'feat': Scheme(shape=(0,), dtype=torch.float64)})


In [34]:
g.edges()

(tensor([     0,      0,      0,  ..., 232920, 232931, 232952]),
 tensor([225202, 177307, 107546,  ..., 232897, 232907, 232910]))

### Querying graph structures

In [35]:
print("# Nodes", g.number_of_nodes())
print("# Edges", g.number_of_edges())
print("Node 0 has {} degree".format(g.in_degrees(0)))
print("Destinations from Node 0:", g.successors(0))
print("{} nodes presucceed Node 0".format(len(g.predecessors(0))))
print("Node 0 has {} in_edges and {} out_edges".format(len(g.in_edges(0)), len(g.out_edges(0))))
print("Is this multigraph?", g.is_multigraph)
print("Does this graph has node 329?", g.has_nodes(329))
print("Is there an edge between 329 and 324?", g.has_edges_between(329, 324))

# Nodes 232965
# Edges 114615892
Node 0 has 2204 degree
Destinations from Node 0: tensor([225202, 177307, 107546,  ...,  55707,  18371,  72216])
2204 nodes presucceed Node 0
Node 0 has 2 in_edges and 2 out_edges
Is this multigraph? False
Does this graph has node 329? True
Is there an edge between 329 and 324? False


# Visualizing graph

In [36]:
import networkx as nx

In [37]:
nx_g = g.to_networkx().to_undirected()
pos = nx.kamada_kawai_layout(nx_g)
nx.draw(nx_g, pos, with_labels=True, node_color=[[.7, .7, .7]])

KeyboardInterrupt: 

# AI Experiment Preprocessing (Data Split)

In [68]:
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2
node_count = len(node_features)

split_index = np.zeros((node_count), dtype=np.int)
split_index[:round(node_count * train_ratio)] = 1
split_index[round(node_count*train_ratio): round(node_count*(train_ratio+val_ratio))] = 2
split_index[round(node_count*(train_ratio+val_ratio)):] = 3

np.random.shuffle(split_index)

print("Node count in train:\t", sum(split_index == 1))
print("Node count in val:\t", sum(split_index == 2))
print("Node count in test:\t", sum(split_index == 3))
print("Total Sum Check:", sum(split_index == 1) + sum(split_index == 2) + sum(split_index == 3) == node_count)

Node count in train:	 139779
Node count in val:	 46593
Node count in test:	 46593
Total Sum Check: True


In [70]:
g.ndata['train_mask'] = torch.tensor(split_index == 1)
g.ndata['val_mask'] = torch.tensor(split_index == 2)
g.ndata['test_mask'] = torch.tensor(split_index == 3)

In [84]:
print(g.ndata)

{'feat': tensor([[ 1.2334,  9.0430, -0.9233,  ..., -0.2579,  0.3112, -0.3772],
        [-0.1386, -0.2022,  0.1277,  ...,  0.1563,  0.1048, -0.6534],
        [-0.1330, -0.1962, -0.0296,  ...,  0.0358,  0.2864,  0.2744],
        ...,
        [-0.0614, -0.2022,  0.9698,  ...,  1.1064, -1.4323, -0.2398],
        [-0.1606, -0.2022, -0.0892,  ...,  0.7440, -0.5046, -2.2288],
        [ 0.0929,  0.2822,  0.1768,  ...,  0.2196,  0.5967,  0.5588]],
       dtype=torch.float64), 'train_mask': tensor([ True,  True,  True,  ..., False,  True, False]), 'val_mask': tensor([False, False, False,  ...,  True, False,  True]), 'test_mask': tensor([False, False, False,  ..., False, False, False]), 'label': tensor([30, 17, 18,  ...,  3, 13, 13])}


# AI Experiment

In [111]:
# Algorithm 1: GCN
from dgl.nn import GraphConv
class GCN(nn.Module):
    def __init__(self, feat_dim, h_feat_dim, num_classes):
        super(GCN, self).__init__()
        
        self.num_layer = len(h_feat_dim)
        self.h_feat_dim = h_feat_dim
        self.num_classes = num_classes
        
        self.layers = []
        prev_dim = feat_dim
        for i in range(self.num_layer):
            print(prev_dim)
            print(h_feat_dim[i])
            self.layers.append(GraphConv(prev_dim, h_feat_dim[i]))
            prev_dim = h_feat_dim[i]
        self.layers.append(GraphConv(prev_dim, num_classes))
        
    def forward(self, g, in_feat):
        prev_feat = in_feat
        for i in len(self.num_layer-1):
            h = self.layers[i](g, prev_feat)
            h = F.relu(h)
            prev_feat = h
        h = self.layers[-1](g)

In [92]:
feat_dim = len(node_features[0])
num_classes = len(np.unique(node_labels))

# THIS IS THE INPUTS FOR EXPERIMENTATION
#Experimentation Scope

model_types = ['GCN', 'GraphSAGE']
aggregators = ['mean', 'max']
h_feat_dim = [[32], [16], [8], [32, 16], [32, 8], [16, 4], [32, 16, 4]]
activation_func = ['relu', 'sigmoid']
loss_func = ['cross_entropy']
optimization = ['adam', 'sgd']
opt_learning_rate = [0.01, 0.1]
train_iter = 100

In [95]:
# Create an experimentation table
tracking_values = []
exp_ID_counter = 0
for m in model_types:
    for ag in aggregators:
        for h in h_feat_dim:
            for a in activation_func:
                for l in loss_func:
                    for opt in optimization:
                        for opt_lr in opt_learning_rate:
                            for i in range(train_iter):
                                tracking_values.append([exp_ID_counter, i, m, ag, len(h), h, a, l, opt, opt_lr])
                            exp_ID_counter += 1
                            
tracking_df = pd.DataFrame(tracking_values)
tracking_df.columns = ['expID', 'iter', 'model', 'aggregator', 'num_layer', 'layer_dim', 
                        'activation_func', 'loss_func', 'optimization', 'opt_lr']

performance_cols = ['loss', 'acc_train', 'avg_recall_train', 'avg_prec_train', 'avg_f1_train', 
                    'acc_val', 'avg_recall_val', 'avg_prec_val', 'avg_f1_val']

for c in performance_cols:
    tracking_df[c] = None

In [113]:
tracking_df.head()

Unnamed: 0,expID,iter,model,aggregator,num_layer,layer_dim,activation_func,loss_func,optimization,opt_lr,loss,acc_train,avg_recall_train,avg_prec_train,avg_f1_train,acc_val,avg_recall_val,avg_prec_val,avg_f1_val
0,0,0,GCN,mean,1,[32],relu,cross_entropy,adam,0.01,,,,,,,,,
1,0,1,GCN,mean,1,[32],relu,cross_entropy,adam,0.01,,,,,,,,,
2,0,2,GCN,mean,1,[32],relu,cross_entropy,adam,0.01,,,,,,,,,
3,0,3,GCN,mean,1,[32],relu,cross_entropy,adam,0.01,,,,,,,,,
4,0,4,GCN,mean,1,[32],relu,cross_entropy,adam,0.01,,,,,,,,,


In [123]:
def get_optimizer(model, optimizer_type, learning_rate):
    print(model.parameters())
    if optimizer_type == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
    elif optimizer_type == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
    else:
        print(optimizer_type, "is currently not supported!")
        return None
    return optimizer

def run_node_clf_exp(g, train_iter, h_feat_dim, optimizer_type, opt_lr, loss_func):
    model = GCN(g.ndata['feat'].shape[1], h_feat_dim, len(g.ndata['label'].unique()))
    optimizer = get_optimizer(model, optimizer_type, opt_lr)
    
    best_train_acc = 0
    best_val_acc = 0
    best_test_acc = 0
    
    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    
    for e in range(train_iter):
        logits = model(g, features)   # forward
        pred = logits.argmax(1)       # compute prediction
        
        # TODO check the loss function type. Currently only cross entropy is supported
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[train_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[train_mask] == labels[test_mask]).float().mean()
        
        # Save the best validation accuracy
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc
            
        # Backward prop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if e%5 == 0:
            print("In epoch {}, loss: {:.3f}, val acc: {:.3f}, test acc: {.3f} (best{:.3f})".format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_all))

In [125]:
model = GCN(g.ndata['feat'].shape[1], [32], len(g.ndata['label'].unique()))

602
32


In [126]:
model.parameters()

<generator object Module.parameters at 0x7fcc3d748258>

In [127]:
torch.optim.Adam(model.parameters(), lr=0.01)

ValueError: optimizer got an empty parameter list

In [124]:
for ind, row in tracking_df.iterrows():
    m = row['model']
    ag = row['aggregator']
    h = row['layer_dim']
    a = row['activation_func']
    l = row['loss_func']
    opt = row['optimization']
    lr = row['opt_lr']
    
    print(m, ag, h, a, l, opt, lr)
    
    run_node_clf_exp(g, 10, h, opt, lr, l)
    break

GCN mean [32] relu cross_entropy adam 0.01
602
32
<generator object Module.parameters at 0x7fcc3d7481a8>


ValueError: optimizer got an empty parameter list

In [None]:
# train
optimizer = torch.optim.Adam(model.parameters)