In [170]:
import numpy as np
import scipy.sparse as sp
import torch


def encode_onehot(labels):
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in
                    enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)),
                             dtype=np.int32)
    return labels_onehot
def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

def load_data(path="../data/cora/", dataset="cora"):
    """Load citation network dataset (cora only for now)"""
    print('Loading {} dataset...'.format(dataset))
    
    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    adj = normalize(adj + sp.eye(adj.shape[0]))

    idx_train = range(140)
    idx_val = range(200, 500)
    idx_test = range(500, 1500)

    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test

In [171]:
adj, features, labelsq, idx_train, idx_val, idx_test = load_data()

Loading cora dataset...


In [172]:
labelsq

tensor([5, 0, 2,  ..., 4, 6, 5])

# 单步测试

In [20]:
path="../data/cora/"
dataset="cora"

In [21]:
idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))

In [44]:
idx_features_labels.shape

(2708, 1435)

In [86]:
idx_features_labels

array([['31336', '0', '0', ..., '0', '0', 'Neural_Networks'],
       ['1061127', '0', '0', ..., '0', '0', 'Rule_Learning'],
       ['1106406', '0', '0', ..., '0', '0', 'Reinforcement_Learning'],
       ...,
       ['1128978', '0', '0', ..., '0', '0', 'Genetic_Algorithms'],
       ['117328', '0', '0', ..., '0', '0', 'Case_Based'],
       ['24043', '0', '0', ..., '0', '0', 'Neural_Networks']],
      dtype='<U22')

In [62]:
features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)

In [64]:
features.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [68]:
idx_features_labels[:, 0]

array(['31336', '1061127', '1106406', ..., '1128978', '117328', '24043'],
      dtype='<U22')

In [66]:
idx = np.array(idx_features_labels[:, 0], dtype=np.int32)

In [89]:
idx

array([  31336, 1061127, 1106406, ..., 1128978,  117328,   24043])

In [69]:
idx_map = {j: i for i, j in enumerate(idx)}#对每一个节点的序号进行编码
idx_map

{31336: 0,
 1061127: 1,
 1106406: 2,
 13195: 3,
 37879: 4,
 1126012: 5,
 1107140: 6,
 1102850: 7,
 31349: 8,
 1106418: 9,
 1123188: 10,
 1128990: 11,
 109323: 12,
 217139: 13,
 31353: 14,
 32083: 15,
 1126029: 16,
 1118017: 17,
 49482: 18,
 753265: 19,
 249858: 20,
 1113739: 21,
 48766: 22,
 646195: 23,
 1126050: 24,
 59626: 25,
 340299: 26,
 354004: 27,
 242637: 28,
 1106492: 29,
 74975: 30,
 1152272: 31,
 100701: 32,
 66982: 33,
 13960: 34,
 13966: 35,
 66990: 36,
 182093: 37,
 182094: 38,
 13972: 39,
 13982: 40,
 16819: 41,
 273152: 42,
 237521: 43,
 1153703: 44,
 32872: 45,
 284025: 46,
 218666: 47,
 16843: 48,
 1153724: 49,
 1153728: 50,
 158098: 51,
 8699: 52,
 1134865: 53,
 28456: 54,
 248425: 55,
 1112319: 56,
 28471: 57,
 175548: 58,
 696345: 59,
 28485: 60,
 1139195: 61,
 35778: 62,
 28491: 63,
 310530: 64,
 1153784: 65,
 1481: 66,
 1153786: 67,
 13212: 68,
 1111614: 69,
 5055: 70,
 4329: 71,
 330148: 72,
 1105062: 73,
 4330: 74,
 5062: 75,
 4335: 76,
 158812: 77,
 40124: 78,

In [90]:
len(set(idx))

2708

In [70]:
edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)

In [75]:
edges_unordered

array([[     35,    1033],
       [     35,  103482],
       [     35,  103515],
       ...,
       [ 853118, 1140289],
       [ 853155,  853118],
       [ 954315, 1155073]])

In [76]:
edges_unordered.flatten()

array([     35,    1033,      35, ...,  853118,  954315, 1155073])

In [77]:
edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)

In [78]:
edges

array([[ 163,  402],
       [ 163,  659],
       [ 163, 1696],
       ...,
       [1887, 2258],
       [1902, 1887],
       [ 837, 1686]])

In [97]:
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]), #因为labels.shape[0]恰好就是样本数量，即这个数据集的总节点数目，因此可以直接构建对应的邻接矩阵
                        dtype=np.float32)

In [99]:
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) 

In [100]:
adj.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [133]:
rm = adj

In [134]:
adj.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [135]:
for i in adj.toarray():
    print(sum(i))

5.0
1.0
4.0
3.0
1.0
2.0
2.0
1.0
5.0
4.0
5.0
2.0
2.0
3.0
19.0
10.0
4.0
1.0
7.0
2.0
5.0
4.0
5.0
2.0
2.0
5.0
3.0
2.0
4.0
4.0
5.0
3.0
1.0
3.0
5.0
1.0
9.0
1.0
5.0
2.0
1.0
14.0
4.0
2.0
3.0
4.0
4.0
1.0
4.0
2.0
4.0
3.0
5.0
4.0
5.0
6.0
3.0
5.0
4.0
7.0
5.0
1.0
7.0
4.0
4.0
2.0
16.0
1.0
7.0
1.0
6.0
8.0
4.0
1.0
40.0
7.0
16.0
1.0
6.0
2.0
2.0
1.0
10.0
2.0
12.0
6.0
4.0
5.0
10.0
3.0
1.0
2.0
6.0
2.0
2.0
4.0
4.0
1.0
3.0
10.0
3.0
4.0
8.0
5.0
2.0
3.0
1.0
6.0
6.0
19.0
3.0
1.0
4.0
1.0
1.0
6.0
3.0
4.0
3.0
3.0
4.0
4.0
4.0
1.0
5.0
3.0
2.0
7.0
3.0
15.0
3.0
1.0
1.0
3.0
4.0
2.0
19.0
3.0
7.0
2.0
1.0
3.0
8.0
5.0
3.0
12.0
3.0
1.0
5.0
1.0
2.0
3.0
2.0
3.0
2.0
4.0
4.0
29.0
4.0
6.0
6.0
7.0
2.0
168.0
2.0
1.0
3.0
4.0
4.0
2.0
2.0
2.0
3.0
8.0
4.0
1.0
2.0
8.0
2.0
3.0
2.0
5.0
2.0
4.0
4.0
3.0
4.0
1.0
16.0
6.0
3.0
5.0
8.0
8.0
1.0
3.0
3.0
4.0
2.0
2.0
3.0
5.0
5.0
3.0
9.0
4.0
6.0
4.0
6.0
3.0
7.0
4.0
3.0
3.0
4.0
5.0
2.0
3.0
3.0
12.0
4.0
1.0
2.0
2.0
3.0
3.0
1.0
10.0
8.0
2.0
7.0
3.0
3.0
2.0
2.0
1.0
5.0
3.0
5.0
4.0
3.0
1.0
3.0
3.0
3.0


2.0
6.0
3.0
3.0
2.0
1.0
2.0
10.0
4.0
2.0
5.0
3.0
3.0
5.0
2.0
3.0
3.0
1.0
3.0
3.0
6.0
5.0
3.0
4.0
3.0
4.0
2.0
7.0
3.0
1.0
2.0
1.0
3.0
1.0
1.0
2.0
2.0
4.0
3.0
1.0
6.0
1.0
1.0
5.0
2.0
5.0
3.0
4.0
1.0
2.0
2.0
2.0
4.0
1.0
2.0
3.0
8.0
3.0
1.0
3.0
5.0
5.0
2.0
3.0
2.0
3.0
3.0
2.0
12.0
4.0
12.0
2.0
5.0
9.0
7.0
7.0
6.0
4.0
4.0
1.0
1.0
5.0
2.0
1.0
3.0
2.0
2.0
1.0
3.0
5.0
3.0
2.0
2.0
3.0
2.0
4.0
3.0
3.0
4.0
2.0
2.0
4.0
3.0
4.0
2.0
3.0
2.0
1.0
3.0
2.0
2.0
1.0
3.0
3.0
3.0
1.0
1.0
5.0
1.0
2.0
4.0
3.0
5.0
2.0
1.0
2.0
2.0
5.0
1.0
4.0
3.0
4.0
8.0
2.0
4.0
1.0
4.0
1.0
1.0
4.0
2.0
1.0
4.0
3.0
1.0
6.0
17.0
6.0
2.0
2.0
1.0
3.0
3.0
6.0
4.0
5.0
3.0
7.0
5.0
1.0
5.0
5.0
2.0
1.0
1.0
2.0
1.0
2.0
4.0
1.0
1.0
5.0
2.0
1.0
4.0
1.0
2.0
2.0
1.0
4.0
1.0
1.0
2.0
3.0
4.0
1.0
3.0
4.0
3.0
2.0
7.0
1.0
5.0
3.0
1.0
3.0
4.0
1.0
2.0
2.0
1.0
2.0
1.0
2.0
4.0
2.0
6.0
6.0
5.0
3.0
5.0
4.0
1.0
2.0
2.0
5.0
1.0
4.0
5.0
2.0
1.0
2.0
4.0
2.0
5.0
2.0
2.0
4.0
2.0
5.0
7.0
1.0
4.0
5.0
1.0
1.0
3.0
4.0
4.0
2.0
3.0
2.0
3.0
3.0
2.0
6.0
1.0
4.0
1.0


In [136]:
rowsum = np.array(adj.toarray().sum(1))
rowsum

array([5., 1., 4., ..., 4., 4., 3.], dtype=float32)

In [138]:
np.power(rowsum, -1)

array([0.2       , 1.        , 0.25      , ..., 0.25      , 0.25      ,
       0.33333334], dtype=float32)

In [140]:
r_inv = np.power(rowsum, -1).flatten()
r_inv2 = np.power(rowsum, -1)

In [151]:
r_inv

array([0.2       , 1.        , 0.25      , ..., 0.25      , 0.25      ,
       0.33333334], dtype=float32)

In [142]:
r_inv[np.isinf(r_inv)] = 0.

In [152]:
r_inv

array([0.2       , 1.        , 0.25      , ..., 0.25      , 0.25      ,
       0.33333334], dtype=float32)

In [153]:
r_mat_inv = sp.diags(r_inv)

In [154]:
r_mat_inv.toarray()

array([[0.2       , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.25      , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.25      , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.25      ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.33333334]], dtype=float32)

In [155]:
mx = r_mat_inv.dot(rm)

In [156]:
mx.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [157]:
np.array(mx.toarray().sum(1))

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [158]:
features

<2708x1433 sparse matrix of type '<class 'numpy.float32'>'
	with 3880564 stored elements in Compressed Sparse Row format>

In [159]:
features = normalize(features)

In [161]:
features.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [162]:
np.array(features)

array(<2708x1433 sparse matrix of type '<class 'numpy.float32'>'
	with 49216 stored elements in Compressed Sparse Row format>, dtype=object)

In [163]:
features.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [164]:
labels = encode_onehot(idx_features_labels[:, -1])

In [167]:
labels

array([[0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0]])

In [169]:
np.where(labels)[1]

array([5, 0, 2, ..., 4, 6, 5], dtype=int64)