For the GCN, we need to create an adjacency matrix of the graph. As the size of the graph is massive (1.2 million paper nodes), we need to use a sparse matrix due to its efficiency. It is also a requirement for training the model.

First, let's practice with just the citation graph.


In [273]:
import numpy as np
from scipy.sparse import csr_matrix
import pickle

In [274]:
authors = np.load("test_data/author_array_test.npy")
papers = np.load("test_data/paper_array_test.npy")
paper_to_paper = np.load("test_data/edgelist_cites_test.npy")
author_to_paper = np.load("test_data/edgelist_writes_test.npy")

# Use this to fix the difference
#np.setdiff1d(papers, np.unique(paper_to_paper))


ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    
ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;

ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;

ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

In [275]:
edgelist_reindexed = []
papers_reindexed = []
mapping = {}
for i in range(len(papers)):
    mapping[papers[i]] = i
    papers_reindexed.append(i)

for i in range(len(paper_to_paper.T)):
    edge = paper_to_paper[:,i]
    edgelist_reindexed.append([mapping[edge[0]],mapping[edge[1]]])

paper_to_paper_reindexed = np.array(edgelist_reindexed).T


In [276]:
labels = np.load("test_data/paper_label_test.npy") # 153 classes in the data
train_index = []
test_index = []
j = 0
label_matrix = np.zeros((len(labels),153))
for i in range(len(labels)):
    if not np.isnan(labels[i]):
        label_matrix[i,int(labels[i])] = 1
        if j < 30:
            train_index.append(i)
            j += 1
        else:
            test_index.append(i)

train_labels = label_matrix[train_index,:]
with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.y', 'wb') as handle:
    pickle.dump(train_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

test_labels = label_matrix[test_index,:]
with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.ty', 'wb') as handle:
    pickle.dump(test_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

label_matrix_no_test = np.delete(label_matrix, test_index, axis=0) # remove test index
with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.ally', 'wb') as handle:
    pickle.dump(label_matrix_no_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.test.index', 'w') as f:
    for index in test_index:
        f.write(str(index) + "\n")


ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;

ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;

ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;

In [277]:
feature_matrix = np.load("test_data/feats_array_test.npy")
feature_matrix_no_test = np.delete(feature_matrix, test_index, axis=0) # remove test index
feature_matrix_train = feature_matrix[train_index]
feature_matrix_test = feature_matrix[test_index]

sparse_feature_matrix = csr_matrix(feature_matrix_no_test)
with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.allx', 'wb') as handle:
    pickle.dump(sparse_feature_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

sparse_feature_matrix_train = csr_matrix(feature_matrix_train)
with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.x', 'wb') as handle:
    pickle.dump(sparse_feature_matrix_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

sparse_feature_matrix_test = csr_matrix(feature_matrix_test)
with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.tx', 'wb') as handle:
    pickle.dump(sparse_feature_matrix_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object;

In [278]:
G = dict()
for edge in paper_to_paper_reindexed.T:
    if edge[0] in G:
        G[edge[0]].append(int(edge[1]))
    else:
        G[edge[0]] = [int(edge[1])]

    if edge[1] in G: 
        G[edge[1]].append(int(edge[0]))
    else:
        G[edge[1]] = [int(edge[0])]

for no_edge in np.setdiff1d(papers_reindexed, np.unique(paper_to_paper_reindexed)):
    G[no_edge] = [int(no_edge)]

with open('/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.graph', 'wb') as handle:
    pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [281]:
allx = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.allx')
ally = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.ally')
graph = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.graph')
index = pd.read_csv(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.test.index')
tx = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.tx')
ty = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.ty')
x = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.x')
y = pd.read_pickle(r'/Users/Samu/Desktop/Koulu/Gradu/gcn/gcn/data/ind.mag_prac.y')

print(allx.shape)
print(ally.shape)
print(len(graph))
print(len(index))
print(tx.shape)
print(ty.shape)
print(x.shape)
print(y.shape)

(180226, 768)
(180226, 153)
180241
14
(15, 768)
(15, 153)
(30, 768)
(30, 153)
