## - Read adjacency matrix of COLLAB dataset

In [23]:
from os.path import join as pjoin
import numpy as np

def parse_txt_file(fpath, line_parse_fn=None):
    """Load a text file located under ``data_dir`` as a list of lines.

    Args:
        fpath: File name, joined onto the module-level ``data_dir``.
        line_parse_fn: Optional callable applied to every raw line (the
            trailing newline is still attached when it is called). When
            ``None``, the raw lines are returned unchanged.

    Returns:
        A list with one (optionally parsed) entry per line of the file.

    Side effects:
        Prints the number of entries that were read.
    """
    full_path = pjoin(data_dir, fpath)
    with open(full_path, 'r') as handle:
        raw_lines = handle.readlines()

    if line_parse_fn is None:
        parsed = list(raw_lines)
    else:
        parsed = [line_parse_fn(raw) for raw in raw_lines]

    print(len(parsed))

    return parsed

def read_graph_nodes_relations(fpath):
    """Read a graph-indicator file and build node/graph membership maps.

    Line ``i`` (0-based) of the file holds the id of the graph that node
    ``i`` belongs to.

    Args:
        fpath: File name of the graph-indicator file; it is opened through
            ``parse_txt_file``.

    Returns:
        A ``(nodes, graphs)`` tuple where ``nodes`` maps each 0-based node
        id to its graph id, and ``graphs`` maps each graph id to a NumPy
        array of its node ids in file order.

    Raises:
        ValueError: If a line cannot be converted to ``int``.
    """
    graph_ids = parse_txt_file(fpath, line_parse_fn=lambda s: int(s.rstrip()))

    nodes, members = {}, {}
    for node_id, graph_id in enumerate(graph_ids):
        nodes[node_id] = graph_id
        members.setdefault(graph_id, []).append(node_id)

    # Convert the per-graph membership lists to arrays once grouping is done.
    graphs = {graph_id: np.array(node_ids)
              for graph_id, node_ids in members.items()}
    return nodes, graphs

def read_graph_adj(fpath, nodes, graphs):
    """Read an edge-list file and build one dense adjacency matrix per graph.

    Every line of the file has the form ``"node1, node2"`` with 1-based node
    ids. Each edge is written at ``[row(node1), col(node2)]`` of the matrix
    of the graph that owns both nodes.

    Args:
        fpath: File name of the edge list; it is opened through
            ``parse_txt_file``.
        nodes: Mapping from 0-based node id to graph id.
        graphs: Mapping from graph id to the sequence of node ids in that
            graph. A node's position in the sequence is its row/column
            index. Sequences are assumed to be free of duplicate node ids.

    Returns:
        A list of ``(n, n)`` float arrays, one per graph in ``graphs``,
        ordered by ascending graph id. A graph without any edge yields an
        all-zero matrix.

    Raises:
        AssertionError: If the endpoints of an edge belong to different
            graphs, or an endpoint is not listed in its graph's members.
        KeyError: If an edge references a node id missing from ``nodes``.
    """
    edges = parse_txt_file(fpath, line_parse_fn=lambda s: s.split(','))

    # Allocate a matrix for every graph up front so that graphs without any
    # edge still appear in the result instead of raising KeyError at the end.
    # The per-graph position lookup replaces a linear scan for every edge.
    adj_dict = {}
    index_in_graph = {}
    for graph_id, members in graphs.items():
        n = len(members)
        adj_dict[graph_id] = np.zeros((n, n))
        index_in_graph[graph_id] = {
            int(node_id): position for position, node_id in enumerate(members)
        }

    for edge in edges:
        node1 = int(edge[0].strip()) - 1  # -1 because of zero-indexing in our code
        node2 = int(edge[1].strip()) - 1
        graph_id = nodes[node1]
        assert graph_id == nodes[node2], ('invalid data', graph_id, nodes[node2])
        positions = index_in_graph[graph_id]
        ind1 = positions.get(node1)
        ind2 = positions.get(node2)
        assert ind1 is not None and ind2 is not None, (ind1, ind2)
        adj_dict[graph_id][ind1, ind2] = 1

    return [adj_dict[graph_id] for graph_id in sorted(graphs)]

In [24]:
# Directory holding the dataset text files; parse_txt_file() joins file
# names onto this path.
data_dir = './data/COLLAB/'

In [25]:
import os

# List the entries of data_dir; the dataset files are selected from this
# list by name.
files = os.listdir(data_dir)
print(files)

['COLLAB_A.txt', 'COLLAB_graph_indicator.txt', 'COLLAB_graph_labels.txt~', 'COLLAB_graph_labels.txt']


In [26]:
# Select the graph-indicator and edge-list files by a substring of their
# names (first match wins), then load the membership maps and the per-graph
# adjacency matrices.
nodes, graphs = read_graph_nodes_relations(
    [name for name in files if 'graph_indicator' in name][0])
adj_list = read_graph_adj(
    [name for name in files if '_A' in name][0], nodes, graphs)

372474
49149990


In [27]:
graphs[1]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])

In [28]:
graphs[2]

array([45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
       62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
       79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
       96])

In [29]:
adj_list[0]

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [30]:
len(adj_list[0])

45

In [31]:
adj_list[1]

array([[0., 0., 1., ..., 1., 1., 1.],
       [0., 0., 1., ..., 1., 1., 0.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [32]:
len(adj_list[1])

52

## - Build degree node features list by adjacency matrix

- Node features list structure example   
0: graph 1 / node 1 feature   
1: graph 1 / node 2 feature   
2: graph 1 / node 3 feature   
3: graph 2 / node 1 feature   
...   
100: graph n / node 7 feature   
- The node features list must be used together with the graph indicator file to determine which graph each node feature belongs to

### - Numpy array to NetworkX

In [33]:
import networkx as nx

# Build NetworkX graphs from the first two adjacency matrices.
# adj_list is indexed directly: its matrices have different sizes, so
# wrapping the whole list in np.array() would build a ragged array, which
# recent NumPy versions reject. from_numpy_array is the replacement for
# from_numpy_matrix, which was removed in NetworkX 3.0.
G = nx.from_numpy_array(adj_list[0])
G2 = nx.from_numpy_array(adj_list[1])

### - Get degree of graph using NetworkX

In [34]:
G.degree()

DegreeView({0: 44, 1: 44, 2: 44, 3: 44, 4: 44, 5: 44, 6: 44, 7: 44, 8: 44, 9: 44, 10: 44, 11: 44, 12: 44, 13: 44, 14: 44, 15: 44, 16: 44, 17: 44, 18: 44, 19: 44, 20: 44, 21: 44, 22: 44, 23: 44, 24: 44, 25: 44, 26: 44, 27: 44, 28: 44, 29: 44, 30: 44, 31: 44, 32: 44, 33: 44, 34: 44, 35: 44, 36: 44, 37: 44, 38: 44, 39: 44, 40: 44, 41: 44, 42: 44, 43: 44, 44: 44})

In [35]:
G2.degree()

DegreeView({0: 35, 1: 36, 2: 51, 3: 51, 4: 37, 5: 51, 6: 28, 7: 51, 8: 39, 9: 33, 10: 51, 11: 52, 12: 45, 13: 51, 14: 51, 15: 28, 16: 35, 17: 51, 18: 49, 19: 38, 20: 31, 21: 45, 22: 51, 23: 35, 24: 51, 25: 39, 26: 51, 27: 38, 28: 45, 29: 50, 30: 28, 31: 36, 32: 49, 33: 30, 34: 44, 35: 45, 36: 51, 37: 22, 38: 31, 39: 45, 40: 36, 41: 39, 42: 48, 43: 51, 44: 39, 45: 38, 46: 35, 47: 45, 48: 38, 49: 51, 50: 51, 51: 39})

In [36]:
list(G2.degree())

[(0, 35),
 (1, 36),
 (2, 51),
 (3, 51),
 (4, 37),
 (5, 51),
 (6, 28),
 (7, 51),
 (8, 39),
 (9, 33),
 (10, 51),
 (11, 52),
 (12, 45),
 (13, 51),
 (14, 51),
 (15, 28),
 (16, 35),
 (17, 51),
 (18, 49),
 (19, 38),
 (20, 31),
 (21, 45),
 (22, 51),
 (23, 35),
 (24, 51),
 (25, 39),
 (26, 51),
 (27, 38),
 (28, 45),
 (29, 50),
 (30, 28),
 (31, 36),
 (32, 49),
 (33, 30),
 (34, 44),
 (35, 45),
 (36, 51),
 (37, 22),
 (38, 31),
 (39, 45),
 (40, 36),
 (41, 39),
 (42, 48),
 (43, 51),
 (44, 39),
 (45, 38),
 (46, 35),
 (47, 45),
 (48, 38),
 (49, 51),
 (50, 51),
 (51, 39)]

### - Build degree node features list

In [42]:
def get_node_features_degree(adj_list):
    """Compute the degree of every node, grouped per graph.

    Each adjacency matrix is read as an undirected simple graph, using the
    same convention as a NetworkX ``Graph`` built from the matrix: nodes
    ``i`` and ``j`` are connected when ``adj[i, j]`` or ``adj[j, i]`` is
    non-zero, edge weights are ignored, and a self-loop adds 2 to the
    degree of its node. The degrees are computed directly with NumPy, so
    the function does not depend on ``nx.from_numpy_matrix`` (removed in
    NetworkX 3.0).

    Args:
        adj_list: Iterable of square 2-D adjacency matrices.

    Returns:
        A list with one inner list per matrix; inner list ``g`` holds the
        integer degree of each node of graph ``g`` in row order.

    Raises:
        ValueError: If a matrix is not a square 2-D array.
    """
    node_features_list = []

    for adj in adj_list:
        matrix = np.asarray(adj)
        if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
            raise ValueError(
                'adjacency matrix must be square, got shape {}'.format(matrix.shape))

        # An undirected edge exists if either direction is marked.
        connected = matrix != 0
        connected = connected | connected.T

        # Row sums count each neighbour once; adding the diagonal makes a
        # self-loop count twice.
        degrees = connected.sum(axis=1) + np.diagonal(connected)
        node_features_list.append(degrees.tolist())

    return node_features_list

In [43]:
node_features_lst = get_node_features_degree(adj_list)

In [44]:
len(nodes), len(node_features_lst)

(372474, 5000)

In [47]:
node_features_lst[:5]

[[44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44,
  44],
 [35,
  36,
  51,
  51,
  37,
  51,
  28,
  51,
  39,
  33,
  51,
  52,
  45,
  51,
  51,
  28,
  35,
  51,
  49,
  38,
  31,
  45,
  51,
  35,
  51,
  39,
  51,
  38,
  45,
  50,
  28,
  36,
  49,
  30,
  44,
  45,
  51,
  22,
  31,
  45,
  36,
  39,
  48,
  51,
  39,
  38,
  35,
  45,
  38,
  51,
  51,
  39],
 [37,
  41,
  44,
  47,
  47,
  47,
  44,
  44,
  41,
  41,
  44,
  44,
  41,
  23,
  4,
  44,
  41,
  47,
  47,
  47,
  47,
  47,
  47,
  44,
  47,
  41,
  4,
  47,
  44,
  37,
  45,
  47,
  41,
  23,
  47,
  51,
  41,
  47,
  47,
  41,
  4,
  47,
  23,
  44,
  4,
  41,
  33,
  45,
  47,
  47,
  44,
  47],
 [31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,
  31,