# Load Data and import packages

In [304]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch_geometric.utils import from_scipy_sparse_matrix

In [332]:
# Open the `cora_ml.npz` archive; its arrays are accessed by key below.
# NOTE(review): allow_pickle=True permits unpickling of object arrays, which can
# execute arbitrary code -- only load this file from a trusted source.
cora = np.load('cora_ml.npz', allow_pickle = True)

In [334]:
# List the names of the arrays stored in the archive.
print(cora.files)

['idx_to_attr', 'attr_indices', 'attr_shape', 'idx_to_node', 'adj_shape', 'adj_indptr', 'adj_data', 'labels', 'attr_data', 'adj_indices', 'attr_indptr', 'idx_to_class', 'attr_text']


In [335]:

# Author's notes on the arrays stored in the archive (sizes as recorded when the
# notebook was written).
# NOTE(review): if the *_indptr arrays are CSR row pointers for 2995 rows, the
# lengths noted for them below look suspicious -- TODO confirm.
#   cora['idx_to_attr']   mapping id to string (2879)
#   cora['attr_indices']  all indices
#   cora['attr_shape']    (2995, 2879)
#   cora['idx_to_node']   (2995)
#   cora['adj_shape']     (2995, 2995) adjacency shape
#   cora['adj_indptr']    (8416)
#   cora['adj_data']      8416
#   cora['labels']        2995
#   cora['attr_data']     shape 151171
#   cora['adj_indices']   8416
#   cora['attr_indptr']   151171
#   cora['idx_to_class']  class strings

# Length of the first entry of attr_text.
print(len(cora['attr_text'][0]))


1381


# Graph Creation

In [339]:
def get_graph_from_document(doc_str, isdirected, isweighted, size_window):
    """Build a graph-of-words for a single document.

    The document is split on whitespace. Every token becomes a node, and each
    token is linked to the (size_window - 1) tokens that precede it.

    Args:
        doc_str: the document as a string.
        isdirected: if true, build an ``nx.DiGraph`` whose edges point from the
            earlier token to the later one; otherwise build an ``nx.Graph``.
        isweighted: if true, every repeated co-occurrence of a pair increments
            the edge's ``weight``; otherwise the weight stays at 1.
        size_window: number of consecutive tokens that count as co-occurring.

    Returns:
        The networkx graph. Every edge carries a ``weight`` attribute.

    Notes:
        * Nodes are inserted in order of first occurrence in the document.
        * A token repeated inside the window produces a self-loop.
        * Tokens are registered as nodes even when they get no edge (one-token
          document, or size_window <= 1), so the node set always equals the set
          of distinct tokens.
    """
    import networkx as nx

    doc_array = doc_str.split()

    G = nx.DiGraph() if isdirected else nx.Graph()

    # Without this, nodes only exist as a side effect of add_edge, and a
    # document that yields no edges would come back with zero nodes.
    G.add_nodes_from(doc_array)

    for j, target in enumerate(doc_array):
        # Preceding tokens that fall inside the sliding window ending at j.
        for source in doc_array[max(j - size_window + 1, 0):j]:
            if G.has_edge(source, target):
                if isweighted:
                    # we added this one before, just increase the weight by one
                    G[source][target]['weight'] += 1
            else:
                # new edge. add with weight=1
                G.add_edge(source, target, weight=1)

    return G

def get_gow(corpus, isdirected, isweighted, size_window):
    """Build one graph-of-words per document.

    Args:
        corpus: iterable of document strings (a NumPy array or a plain list).
        isdirected, isweighted, size_window: forwarded unchanged to
            ``get_graph_from_document`` for every document.

    Returns:
        dict mapping the position of each document in ``corpus`` to its graph.
    """
    # enumerate() instead of range(corpus.shape[0]) so that inputs without a
    # .shape attribute (lists, tuples) work as well.
    return {
        idx: get_graph_from_document(doc, isdirected, isweighted, size_window)
        for idx, doc in enumerate(corpus)
    }

In [340]:
# Undirected, weighted graphs-of-words with a window of 3 tokens.
dict_graph_of_words = get_gow(
    cora['attr_text'],
    isdirected=False,
    isweighted=True,
    size_window=3,
)

In [319]:
# Sanity check: show the container type returned by get_gow.
print(type(dict_graph_of_words))

<class 'dict'>


In [341]:
## transform to torch_geometric and attach labels

from torch_geometric.utils import from_networkx
import torch

# Fetch the label array once instead of looking it up in the archive on every
# loop iteration.
graph_labels = cora['labels']

# One torch_geometric graph per document, in the dictionary's insertion order.
torch_geometric_graphlist = [
    from_networkx(nx_graph) for nx_graph in dict_graph_of_words.values()
]

# Attach the document's class label to its graph as `y`.
for i, graph in enumerate(torch_geometric_graphlist):
    graph.y = torch.tensor(graph_labels[i])

# Word2Vec for Node features

In [342]:
import gensim.downloader

# Print the names of all models listed in gensim-data
# (iterating the 'models' mapping yields its keys).
print(list(gensim.downloader.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [143]:
import gensim.downloader as api
# Load the 'glove-wiki-gigaword-50' model through gensim's downloader; `wv` is
# indexed by word in get_node_features to obtain the node feature vectors.
wv = api.load('glove-wiki-gigaword-50')

In [321]:
def get_node_features(X):
  """Look up a word vector for every distinct token of every document.

  Args:
    X: iterable of document strings.

  Returns:
    A list with one entry per document. Each entry is a list of vectors, one
    per distinct token in order of first occurrence (see get_unique_words).
    Tokens missing from the global `wv` model get an all-zero vector.
  """
  # Take the fallback size from the loaded model rather than hard-coding 50, so
  # swapping in a model of another dimension keeps all rows the same length.
  embedding_dim = wv.vector_size
  full_array = []
  for text in X:
    text_array = []
    for word in get_unique_words(text):
      try:
        text_array.append(wv[word])
      except KeyError:
        # Out-of-vocabulary token: use a zero vector as placeholder.
        text_array.append(np.zeros(embedding_dim))
    full_array.append(text_array)
  return full_array

def get_unique_words(text):
  """Return the distinct whitespace-separated tokens of `text`.

  Tokens are returned in order of first occurrence. An empty or
  whitespace-only string yields an empty list.
  """
  # dict keys keep insertion order, so this de-duplicates in linear time
  # instead of scanning a growing list for every token.
  return list(dict.fromkeys(text.split()))

In [343]:
# One list of word vectors per document in attr_text; indexed by document
# position when the OGB graph dictionaries are assembled.
node_features_list = get_node_features(cora['attr_text'])

# Steps to convert dataset to OGB object

In [367]:
#1. Constructor
!pip install ogb
from ogb.io import DatasetSaver
import numpy as np
import networkx as nx
import os

# constructor
dataset_name = 'ogbg-CoraAbstracts'
saver = DatasetSaver(dataset_name = dataset_name, is_hetero = False, version = 1)

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Found an existing submission directory at submission_ogbg_CoraAbstracts/. 
Will you remove it? (y/N)
y
Removed existing submission directory


In [369]:
# 2. Saving graph list
# Build one graph dictionary per document: word-vector node features, the edge
# index of the torch_geometric graph, and the edge weights as edge features.
graph_list = []
for i, g in enumerate(torch_geometric_graphlist):

    graph = dict()

    # GloVe word-vector node features (one row per distinct token)
    graph['node_feat'] = np.array(node_features_list[i])
    graph['num_nodes'] = len(node_features_list[i])

    graph['edge_index'] = np.array(g.edge_index)
    # Edge weights become a single integer feature column.
    graph['edge_feat'] = np.expand_dims(np.array(g.weight), axis=1).astype(np.int64)

    graph_list.append(graph)

# saving a list of graphs
saver.save_graph_list(graph_list)


dict_keys(['node_feat', 'num_nodes', 'edge_index', 'edge_feat'])
Saving edge_index
Saving all the files!
Validating...
Reading saved files
Loading necessary files...
This might take a while.


100%|██████████| 2995/2995 [00:00<00:00, 460144.34it/s]
 29%|██▉       | 870/2995 [00:00<00:00, 8694.93it/s]

Processing graphs...
Checking read graphs and given graphs are the same


100%|██████████| 2995/2995 [00:00<00:00, 7366.06it/s]


In [370]:
# helper to transform labels
def scalar_to_vector(scalar, num_classes):
  """One-hot encode a zero-based class index.

  Args:
    scalar: class index in the range 0 .. num_classes - 1.
    num_classes: length of the returned vector.

  Returns:
    A float NumPy array of length `num_classes` with a 1 at position `scalar`
    and zeros elsewhere.

  Raises:
    IndexError: if `scalar` lies outside 0 .. num_classes - 1.

  NOTE(review): assumes the labels passed in are zero-based, like the raw
  labels stored on the graphs as `y` -- confirm with np.unique(cora['labels']).
  The previous `t[scalar - 1]` sent label 0 to the *last* position through
  negative indexing, shifting every class by one slot.
  """
  # Reject negative values explicitly: NumPy would silently wrap them around.
  if not 0 <= scalar < num_classes:
    raise IndexError(
        "class index %r is out of range for %r classes" % (scalar, num_classes))
  t = np.zeros(num_classes)
  t[scalar] = 1
  return t



In [371]:
# 3. Saving target labels
num_classes = 7

# Show the last converted graph (previously printed through the loop variable
# `g` that leaked out of the graph-list cell).
print(torch_geometric_graphlist[-1])

# One one-hot row per document; use num_classes instead of repeating the literal.
labels = np.array([scalar_to_vector(l, num_classes) for l in cora['labels']])
print(labels.shape)
saver.save_target_labels(labels)

Data(edge_index=[2, 433], weight=[433], y=3)
(2995, 7)


In [372]:
# 4. Saving dataset split
split_idx = dict()

# Size the split from the graphs that were actually handed to the saver
# (the previously referenced `bigram_count_adjacency_list` is not defined
# anywhere in this notebook and fails on a fresh kernel).
num_data = len(graph_list)

# Seeded generator so the random 80/10/10 split is reproducible across runs.
rng = np.random.default_rng(0)
perm = rng.permutation(num_data)

train_end = int(0.8*num_data)
valid_end = int(0.9*num_data)
split_idx['train'] = perm[:train_end]
split_idx['valid'] = perm[train_end:valid_end]
split_idx['test'] = perm[valid_end:]
saver.save_split(split_idx, split_name = 'random')

In [380]:
# 5. Copying mapping directory
mapping_path = 'mapping/'

# prepare mapping information first and store it under this directory (empty below).
# exist_ok=True and append-mode open keep this cell re-runnable; os.mknod is
# not available on every platform and fails when the file already exists.
os.makedirs(mapping_path, exist_ok=True)
with open(os.path.join(mapping_path, 'README.md'), 'a'):
    pass

saver.copy_mapping_dir(mapping_path)

In [374]:
# 6. Saving task information
# Register the task type, evaluation metric and class count with the saver.
saver.save_task_info(
    task_type='classification',
    eval_metric='rocauc',
    num_classes=num_classes,
)

classification
7


In [381]:
# 7. Getting meta information dictionary
# The dictionary is passed to GraphPropPredDataset when the saved dataset is loaded back.
meta_dict = saver.get_meta_dict()

In [376]:
# 8. Testing the dataset object
from ogb.graphproppred import GraphPropPredDataset
# Load the saved dataset by name, using the meta information from the saver.
dataset = GraphPropPredDataset(dataset_name, meta_dict = meta_dict)

# see if it is working properly
# NOTE(review): printing dataset[0] dumps whole arrays into the output;
# consider printing only shapes.
print(dataset[0])
print(dataset.get_idx_split())

Loading necessary files...
This might take a while.


100%|██████████| 2995/2995 [00:00<00:00, 548724.09it/s]

Processing graphs...
Saving...





({'edge_index': array([[  0,   0,   1, ..., 137, 138, 138],
       [  1,   2,   0, ..., 136,   1,   2]]), 'edge_feat': array([[1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [5],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [4],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
     

In [383]:
# Shape of the edge index array of the first graph in the dataset.
print(dataset[0][0]['edge_index'].shape)

(2, 777)


In [378]:
# Shape of the edge feature array of the first graph in the dataset.
print(dataset[0][0]['edge_feat'].shape)

(777, 1)


In [379]:
# Shape of the node feature array of the first graph in the dataset.
print(dataset[0][0]['node_feat'].shape)

(139, 50)


In [215]:
# Scratch check: shape of edge_feat after inserting an extra axis at position 1.
# NOTE(review): the recorded output comes from an earlier run (lower execution
# count) and may not match the current dataset.
print(np.expand_dims(dataset[0][0]['edge_feat'], axis=1).shape)

(192, 1, 1)


In [217]:
# Size of the second axis of edge_feat for the first graph.
print(dataset[0][0]['edge_feat'].shape[1])

1
