In [1]:
from scipy import io
import numpy as np
from scipy.sparse import csr_matrix
# Load the ACM dataset from the MATLAB file 'ACM.mat' (path relative to the working directory).
mat_file = io.loadmat('ACM.mat')

In [2]:
# Show the loaded dictionary to see which matrices the file provides and their shapes.
mat_file

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Mon Aug 08 18:23:50 2011',
 '__version__': '1.0',
 '__globals__': [],
 'TvsP': <1903x12499 sparse matrix of type '<class 'numpy.float64'>'
 	with 972973 stored elements in Compressed Sparse Column format>,
 'PvsA': <12499x17431 sparse matrix of type '<class 'numpy.float64'>'
 	with 37055 stored elements in Compressed Sparse Column format>,
 'PvsV': <12499x196 sparse matrix of type '<class 'numpy.float64'>'
 	with 12499 stored elements in Compressed Sparse Column format>,
 'AvsF': <17431x1804 sparse matrix of type '<class 'numpy.float64'>'
 	with 30424 stored elements in Compressed Sparse Column format>,
 'VvsC': <196x14 sparse matrix of type '<class 'numpy.float64'>'
 	with 196 stored elements in Compressed Sparse Column format>,
 'PvsL': <12499x73 sparse matrix of type '<class 'numpy.float64'>'
 	with 12499 stored elements in Compressed Sparse Column format>,
 'PvsC': <12499x14 sparse matrix of type '<class 'numpy.fl

In [3]:
# Column index (conference id) of every nonzero entry of the 'PvsC' matrix.
# NOTE(review): later cells use positions in this array as paper row ids, which assumes
# exactly one nonzero per paper, returned in row order — confirm.
paper_conf = mat_file['PvsC'].nonzero()[1]

In [4]:
# Seeded local generator: the database subsample below is random, and a fixed seed
# makes the selection reproducible after a kernel restart without touching the
# global NumPy random state.
db_rng = np.random.RandomState(0)

# DataBase (conference ids 1 and 13)
paper_db = np.isin(paper_conf, [1, 13])
paper_db_idx = np.flatnonzero(paper_db)
# Keep a random subset of 994 database papers, stored in ascending order.
paper_db_idx = np.sort(db_rng.choice(paper_db_idx, 994, replace=False))
# Data Mining (conference id 0)
paper_dm = np.isin(paper_conf, [0])
paper_dm_idx = np.flatnonzero(paper_dm)
# Wireless Communication (conference ids 9 and 10)
paper_wc = np.isin(paper_conf, [9, 10])
paper_wc_idx = np.flatnonzero(paper_wc)

In [5]:
# All selected papers (database, data mining, wireless communication) in ascending order.
paper_idx = np.sort(np.concatenate((paper_db_idx, paper_dm_idx, paper_wc_idx)))

In [16]:
# Number of papers kept across the three research areas.
len(paper_idx)

3025

In [6]:
# 0 : database, 1: wireless communication, 2: data mining
# Vectorised labelling: start with every paper as data mining (2), then overwrite
# the entries that belong to the other two areas.
paper_target = np.full(len(paper_idx), 2)
paper_target[np.isin(paper_idx, paper_wc_idx)] = 1
# Applied last so that a paper present in both index sets is labelled database.
paper_target[np.isin(paper_idx, paper_db_idx)] = 0

In [7]:
# One class label per selected paper.
paper_target.shape

(3025,)

## Edges (PAP, PSP)
Conference indices [0, 1, 9, 10, 13] correspond to KDD, SIGMOD, SIGCOMM, MobiCom, VLDB.

In [8]:
# Inspect the 'PvsA' matrix (shape, dtype and sparse format) before indexing it.
mat_file['PvsA']

<12499x17431 sparse matrix of type '<class 'numpy.float64'>'
	with 37055 stored elements in Compressed Sparse Column format>

In [17]:
# Give every author a global node id that follows the paper ids: the k-th distinct
# author (in order of first appearance) becomes node len(paper_idx) + k.
authors = mat_file['PvsA'][paper_idx].nonzero()[1]
author_offset = len(paper_idx)
author_dic = {}
re_authors = []
for author in authors:
    # setdefault stores the freshly computed id only when the author is new.
    re_authors.append(author_dic.setdefault(author, author_offset + len(author_dic)))
re_authors = np.array(re_authors)

In [18]:
# Number of distinct authors among the selected papers.
len(author_dic)

5915

In [19]:
# Give every subject a global node id placed after all paper and author ids,
# numbered in order of first appearance.
subjects = mat_file['PvsL'][paper_idx].nonzero()[1]
subject_offset = len(paper_idx) + len(author_dic)
subject_dic = {}
re_subjects = []
for subject in subjects:
    # setdefault stores the freshly computed id only when the subject is new.
    re_subjects.append(subject_dic.setdefault(subject, subject_offset + len(subject_dic)))
re_subjects = np.array(re_subjects)

In [20]:
# Number of distinct subjects among the selected papers.
len(subject_dic)

56

In [21]:
# Total number of graph nodes: papers, then authors, then subjects.
node_num = sum(len(group) for group in (paper_idx, author_dic, subject_dic))

In [22]:
# Show the total node count (papers + authors + subjects).
node_num

8996

In [23]:
# Row positions (0-based within the selected papers) of every paper-author link,
# plus a matching vector of ones to use as edge weights.
pvsa_selected = mat_file['PvsA'][paper_idx]
papers = pvsa_selected.nonzero()[0]
data = np.ones(papers.shape, dtype=papers.dtype)

In [24]:
# Paper -> author adjacency over the full node id space (rows: papers, columns: author node ids).
A_pa = csr_matrix(
    (data, (papers, re_authors)),
    shape=(node_num, node_num),
)

In [25]:
# Inspect the paper -> author adjacency (shape, dtype, number of stored entries).
A_pa

<8996x8996 sparse matrix of type '<class 'numpy.int32'>'
	with 10001 stored elements in Compressed Sparse Row format>

In [26]:
# Row positions (0-based within the selected papers) of every paper-subject link,
# plus a matching vector of ones to use as edge weights.
pvsl_selected = mat_file['PvsL'][paper_idx]
papers = pvsl_selected.nonzero()[0]
data = np.ones(papers.shape, dtype=papers.dtype)

In [27]:
# Paper -> subject adjacency over the full node id space (rows: papers, columns: subject node ids).
A_ps = csr_matrix(
    (data, (papers, re_subjects)),
    shape=(node_num, node_num),
)

In [28]:
# Inspect the paper -> subject adjacency (shape, dtype, number of stored entries).
A_ps

<8996x8996 sparse matrix of type '<class 'numpy.int32'>'
	with 3025 stored elements in Compressed Sparse Row format>

In [29]:
# Author -> paper edges are the transpose of the paper -> author adjacency.
A_ap = A_pa.T

In [30]:
# Subject -> paper edges are the transpose of the paper -> subject adjacency.
A_sp = A_ps.T

In [31]:
# Edge types in order: paper->author, author->paper, paper->subject, subject->paper.
edges = [A_pa,A_ap,A_ps,A_sp]

# Node Features

In [32]:
# Give every term that occurs in a selected paper an id placed after all paper,
# author and subject ids, numbered in order of first appearance.
terms = mat_file['TvsP'].transpose()[paper_idx].nonzero()[1]
term_offset = len(paper_idx) + len(author_dic) + len(subject_dic)
term_dic = {}
re_terms = []
for term in terms:
    # setdefault stores the freshly computed id only when the term is new.
    re_terms.append(term_dic.setdefault(term, term_offset + len(term_dic)))
re_terms = np.array(re_terms)

In [33]:
# Inspect 'TvsP' transposed, i.e. with one row per paper and one column per term.
mat_file['TvsP'].transpose()

<12499x1903 sparse matrix of type '<class 'numpy.float64'>'
	with 972973 stored elements in Compressed Sparse Row format>

In [34]:
# tmp
# Temporary adjacency matrices over an enlarged id space (papers, authors, subjects,
# then terms). They are only used to derive the term-based node features.
tmp_num_node = node_num + len(term_dic)
papers = mat_file['PvsA'][paper_idx].nonzero()[0]
data = np.ones_like(papers)
A_pa_tmp = csr_matrix((data, (papers, re_authors)), shape=(tmp_num_node,tmp_num_node))
papers = mat_file['PvsL'][paper_idx].nonzero()[0]
data = np.ones_like(papers)
A_ps_tmp = csr_matrix((data, (papers, re_subjects)), shape=(tmp_num_node,tmp_num_node))
# Row ids are taken from the same matrix (TvsP transposed) that produced `terms` and
# `re_terms`, so each row id is paired with its own column id. Reading the rows from
# 'PvsT' instead is only correct if that matrix has exactly the same sparsity pattern
# and ordering.
papers = mat_file['TvsP'].transpose()[paper_idx].nonzero()[0]
data = np.ones_like(papers)
A_pt_tmp = csr_matrix((data, (papers, re_terms)), shape=(tmp_num_node,tmp_num_node))

In [35]:
# Binary term-indicator features for each node type; the last len(term_dic) columns of
# the temporary matrices are the term ids.
# dtype is the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.

# Papers: 1 where the paper contains the term.
paper_feat = np.array(A_pt_tmp[:len(paper_idx),-len(term_dic):].toarray()>0, dtype=int)
# Authors: 1 where at least one of the author's papers contains the term.
author_feat = np.array(A_pa_tmp.transpose().dot(A_pt_tmp)[len(paper_idx):len(paper_idx)+len(author_dic),-len(term_dic):].toarray()>0, dtype=int)
# Subjects: 1 where at least one paper of the subject contains the term.
subject_feat = np.array(A_ps_tmp.transpose().dot(A_pt_tmp)[len(paper_idx)+len(author_dic):len(paper_idx)+len(author_dic)+len(subject_dic),-len(term_dic):].toarray()>0, dtype=int)

In [36]:
# Stack the feature rows in node-id order: papers, then authors, then subjects.
# (The variable name keeps its existing spelling because later cells refer to it.)
node_faeture = np.concatenate([paper_feat, author_feat, subject_feat], axis=0)

In [39]:
# (number of nodes, number of distinct terms)
node_faeture.shape

(8996, 1902)

# Label

In [29]:
# Only papers carry labels: one class id per selected paper.
paper_target.shape

(3025,)

In [40]:
# Train, Valid
train_valid_DB = list(np.random.choice(np.where(paper_target==0)[0],300, replace=False))
train_valid_WC = list(np.random.choice(np.where(paper_target==1)[0],300, replace=False))
train_valid_DM = list(np.random.choice(np.where(paper_target==2)[0],300, replace=False))

train_idx = np.array(train_valid_DB[:200] + train_valid_WC[:200] + train_valid_DM[:200])
train_target = paper_target[train_idx]
train_label = np.vstack((train_idx,train_target)).transpose()
valid_idx = np.array(train_valid_DB[200:] + train_valid_WC[200:] + train_valid_DM[200:])
valid_target = paper_target[valid_idx]
valid_label = np.vstack((valid_idx,valid_target)).transpose()
test_idx = np.array(list((set(np.arange(paper_target.shape[0])) - set(train_idx)) - set(valid_idx)))
test_target = paper_target[test_idx]
test_label = np.vstack((test_idx,test_target)).transpose()

In [41]:
# Label sets in order: train, validation, test; each is an array of [paper index, class id] rows.
labels = [train_label,valid_label,test_label]

In [42]:
# Show the three label arrays (train, validation, test).
labels

[array([[1595,    0],
        [1782,    0],
        [1145,    0],
        ...,
        [ 618,    2],
        [ 207,    2],
        [ 373,    2]]), array([[ 933,    0],
        [1582,    0],
        [1470,    0],
        [1899,    0],
        [ 873,    0],
        [1925,    0],
        [1831,    0],
        [1857,    0],
        [1057,    0],
        [1040,    0],
        [1027,    0],
        [ 801,    0],
        [1921,    0],
        [1268,    0],
        [1728,    0],
        [1416,    0],
        [1732,    0],
        [1717,    0],
        [ 984,    0],
        [1939,    0],
        [1674,    0],
        [ 866,    0],
        [1874,    0],
        [1254,    0],
        [1761,    0],
        [1735,    0],
        [1098,    0],
        [1256,    0],
        [1500,    0],
        [1496,    0],
        [1133,    0],
        [ 973,    0],
        [2370,    0],
        [1699,    0],
        [ 941,    0],
        [2353,    0],
        [1151,    0],
        [1541,    0],
        [1090,    