In [9]:
from ogb.lsc import MAG240MDataset
import numpy as np

In [10]:
dataset = MAG240MDataset(root = "/Volumes/Seagate Backup Plus Drive/Gradu/")

In [None]:
# Basic properties

print(dataset.num_papers) # number of paper nodes
print(dataset.num_authors) # number of author nodes
print(dataset.num_institutions) # number of institution nodes
print(dataset.num_paper_features) # dimensionality of paper features
print(dataset.num_classes) # number of subject area classes

# get i-th paper feature
i = 1234
print(dataset.paper_feat[i]) # only i-th data is loaded into memory

# get the feature matrix storing features of papers in idx_arr
idx_arr = np.array([1,10,100,1000,10000])
print(dataset.paper_feat[idx_arr]) # only the 5 data is loaded into memory

In [4]:
'''
edge_index is numpy.ndarray of shape (2, num_edges).
- first row: indices of source nodes (indexed by source node types)
- second row: indices of target nodes (indexed by target node types)
In other words, i-th edge connects from edge_index[0,i] to edge_index[1,i].
'''

edge_index_writes = dataset.edge_index('author', 'paper')
edge_index_cites = dataset.edge_index('paper', 'paper')
edge_index_affiliated = dataset.edge_index('author', 'institution')

To create the test dataset to run on a regular computer, we begin by selecting an author and retrieving the papers they have contributed in. Next, we get the indexes of those papers, and retrieve both their citations and their authors. This cycle can be continued, and in the end we have an array of authors from which we can retrieve their affiliations. Then, we have an array of authors, an array of papers, three edgelists containing citations, writes and affiliations. We can then retrieve the required paper features from the array of papers and write the result into a file. 

We also need to select an article that actually has a label, since there are many unlabeled articles in the raw data.

In [5]:
print(np.where(dataset.paper_label==1))
print(edge_index_writes[:,edge_index_writes[1]==85]) # retrieve the author of paper 85 and use that as a starting point

(array([       85,      9052,      9166, ..., 121422531, 121494455,
       121634237]),)
[[1080565 1794309 7130696 7130697]
 [     85      85      85      85]]


In [6]:
author_array = edge_index_writes[0,edge_index_writes[0]==1080565]
paper_array = edge_index_writes[1,edge_index_writes[0]==1080565]
mask_cites = np.in1d(edge_index_cites[0],paper_array)
cites_array = edge_index_cites[:,mask_cites]

n = 2

for i in range(n):
    paper_array = np.concatenate((paper_array, np.unique(cites_array[1])))
    mask_author = np.in1d(edge_index_writes[1],paper_array)
    author_array = edge_index_writes[:,mask_author][0]
    mask_cites = np.in1d(edge_index_cites[0],paper_array)
    cites_array = edge_index_cites[:,mask_cites]
    print(paper_array)

mask_cites = np.in1d(edge_index_cites[0],paper_array)
cites_array = edge_index_cites[:,mask_cites]
paper_array = np.concatenate((paper_array, np.unique(cites_array[1])))
mask_author = np.in1d(edge_index_writes[1],paper_array)
author_array = edge_index_writes[:,mask_author][0]
print(paper_array)


[       84        85    490533    490534   2982342   5551868   6283888
   8414954  10013767  13658892  15101186  15101187  19923490  20409947
  25059751  27893775  31536464  32435004  35532256  44857869  47376659
  52622445  59323907  60990971  63550306  65585135  67201622  67201623
  72101074  72236376  73785647  74463918  77349996  77435098  77435099
  78366648  84608505  84961866  93779604  96702807  97170758  98191348
 100807308 100807309 102523993 104040469 105199562 105199563 105735949
 108384525 114772218 115473513 119862569        84     58195    490533
    615919    721392    965536   1123747   1705430   1771075   1779237
   1997869   2091814   2101297   2212608   2319301   2460081   2576935
   2714957   2771297   2779606   2977353   3033593   3107800   3810226
   3828613   3975206   4502727   4565529   4827877   4992264   5007440
   5065999   5397189   5423909   5563439   5691667   5995636   5997796
   6246812   6682996   6794499   7386267   8439253   8558880   8611882
   877

In [7]:
paper_array_test = np.unique(paper_array)
author_array_test = np.unique(author_array)

In [8]:
mask_affiliations = np.in1d(edge_index_affiliated[0],author_array_test)
edgelist_affiliations_test = edge_index_affiliated[:,mask_affiliations]
edgelist_affiliations_test

array([[       99,        99,        99, ..., 121914087, 122130702,
        122233377],
       [      447,      1248,      2652, ...,       112,       163,
              329]])

In [8]:
mask_affiliations = np.in1d(edge_index_affiliated[0],author_array_test)
edgelist_affiliations_test = edge_index_affiliated[:,mask_affiliations]
edgelist_affiliations_test

array([[       99,        99,        99, ..., 121914087, 122130702,
        122233377],
       [      447,      1248,      2652, ...,       112,       163,
              329]])

In [9]:
mask_writes = np.in1d(edge_index_writes[0],author_array_test)
edgelist_writes_test = edge_index_writes[:,mask_writes]
edgelist_writes_test


array([[       99,        99,        99, ..., 122233377, 122319849,
        122332293],
       [   383346,    505333,    972726, ...,  41748765, 107011542,
         90104736]])

In [10]:
edgelist_cites_test = cites_array
edgelist_cites_test

array([[       84,        84,        84, ..., 121712380, 121712380,
        121712380],
       [  2576935,  61873589,  90818614, ..., 114514237, 115059441,
        119900450]])

In [11]:
feats_array_test = dataset.paper_feat[paper_array_test]
paper_label_test = dataset.paper_label[paper_array_test]

In [12]:
print(author_array_test.shape)
print(paper_array_test.shape)
print(edgelist_writes_test.shape)
print(edgelist_cites_test.shape)
print(edgelist_affiliations_test.shape)
print(feats_array_test.shape)
print(paper_label_test.shape)


(332318,)
(180241,)
(2, 17845198)
(2, 316992)
(2, 606196)
(180241, 768)
(180241,)


In [13]:
np.save('test_data/author_array_test', author_array_test)
np.save('test_data/paper_array_test', paper_array_test)
np.save('test_data/edgelist_writes_test', edgelist_writes_test)
np.save('test_data/edgelist_cites_test', edgelist_cites_test)
np.save('test_data/edgelist_affiliations_test', edgelist_affiliations_test)
np.save('test_data/feats_array_test', feats_array_test)
np.save('test_data/paper_label_test', paper_label_test)

Now, we have a smaller part of MAG-dataset that should be much easier to train with, before attempting anything with the dataset as a whole. 