In [1]:
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
from utils import *
from models import GCN, MLP

# Set random seed
# The same value seeds NumPy's global RNG (used by load_data's random draws)
# and TensorFlow, so repeated runs start from the same random state.
# NOTE(review): `np` is not imported explicitly in this notebook; presumably
# it is provided by `from utils import *` - confirm.
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)


def load_data(dataset_str, max_sample, data_path='../data/', train_ratio=0.8, val_ratio=0.2):
    """Load several pickled graphs and merge them into one block-diagonal dataset.

    Each file in ``data_path`` whose name starts with ``dataset_str`` is read as
    three consecutive pickles: an adjacency matrix, a dense node-feature matrix
    (one row per node) and a label vector that is shared by every node of that
    graph.  The selected graphs are stacked into one big graph without edges
    between the individual graphs, and the graphs (not the single nodes) are
    split into train / validation / test sets.

    Args:
        dataset_str: File-name prefix identifying the dataset.
        max_sample: Maximum number of graph files to draw (without replacement,
            using NumPy's global RNG).  If fewer matching files exist, all of
            them are used.
        data_path: Directory that holds the pickle files.
        train_ratio: Fraction of the graphs drawn for training + validation;
            all remaining graphs form the test set.
        val_ratio: Fraction of the drawn training graphs moved to validation.

    Returns:
        Tuple ``(adj_mat, features_mat, train_labels_mat, test_labels_mat,
        val_labels_mat, train_mask_mat, test_mask_mat, val_mask_mat)``:
        ``adj_mat`` is the sparse block-diagonal adjacency matrix,
        ``features_mat`` the CSR matrix of all node features, the ``*_mask_mat``
        arrays are float 0/1 matrices with one row per node and one column per
        class, and the ``*_labels_mat`` arrays are the per-node labels
        multiplied element-wise by the matching mask.

    Raises:
        ValueError: If no matching file is found, ``max_sample`` is smaller
            than 1, an adjacency matrix does not match its feature matrix, or
            one of the consistency checks on masks / labels fails.
    """
    # os.listdir() returns names in arbitrary order; sorting makes the seeded
    # random draw below select the same files on every machine.
    candidates = sorted(f for f in os.listdir(data_path) if f.startswith(dataset_str))
    if not candidates:
        raise ValueError("no file starting with %r found in %r" % (dataset_str, data_path))
    n_graphs = min(max_sample, len(candidates))
    if n_graphs < 1:
        raise ValueError("max_sample must be at least 1, got %r" % (max_sample,))
    files = np.random.choice(candidates, n_graphs, False)

    adj_lst = []
    features_lst = []
    labels_lst = []
    node_counts = []

    # load all graph data
    for name in files:
        # SECURITY: unpickling can execute arbitrary code - only load trusted files.
        with open(os.path.join(data_path, name), 'rb') as handle:
            adj = pkl.load(handle)
            feature = pkl.load(handle)
            label = pkl.load(handle)

        n_nodes = feature.shape[0]
        if tuple(np.shape(adj)) != (n_nodes, n_nodes):
            raise ValueError("%s: adjacency shape %r does not match %d feature rows"
                             % (name, tuple(np.shape(adj)), n_nodes))

        adj_lst.append(adj)
        features_lst.append(feature)
        labels_lst.append(label)
        node_counts.append(n_nodes)

    # labels matrix: the graph label is repeated for every node of the graph
    labels_mat = np.repeat(np.vstack(labels_lst), node_counts, axis=0)
    n_classes = labels_mat.shape[1]

    # feature matrix
    features_mat = sp.csr_matrix(np.vstack(features_lst))

    # adjacency: one diagonal block per graph, built without dense zero padding
    adj_mat = sp.block_diag(adj_lst)

    # split the graphs: first draw the training pool, then move part of it to
    # validation; every graph outside the pool is a test graph
    graph_ids = np.arange(n_graphs)
    train_pool = np.random.choice(graph_ids, int(n_graphs * train_ratio), False)
    val_ids = np.random.choice(train_pool, int(len(train_pool) * val_ratio), False)

    in_pool = np.isin(graph_ids, train_pool)
    in_val = np.isin(graph_ids, val_ids)
    in_train = in_pool & ~in_val
    in_test = ~in_pool

    def _per_node(graph_flags):
        """Expand one flag per graph to a float (total_nodes, n_classes) matrix."""
        node_flags = np.repeat(graph_flags.astype(float), node_counts)
        return np.tile(node_flags[:, np.newaxis], (1, n_classes))

    train_mask_mat = _per_node(in_train)
    val_mask_mat = _per_node(in_val)
    test_mask_mat = _per_node(in_test)

    # every node must belong to exactly one of the three sets
    if not np.all(train_mask_mat + val_mask_mat + test_mask_mat == 1):
        raise ValueError("train/val/test masks do not partition the nodes")

    # use masks for labels masking
    train_labels_mat = np.multiply(labels_mat, train_mask_mat)
    test_labels_mat = np.multiply(labels_mat, test_mask_mat)
    val_labels_mat = np.multiply(labels_mat, val_mask_mat)

    # the label entries must add up to the number of nodes (true for one-hot labels)
    label_total = np.sum(train_labels_mat + test_labels_mat + val_labels_mat)
    if label_total != labels_mat.shape[0]:
        raise ValueError("label entries sum to %r, expected one per node (%d)"
                         % (label_total, labels_mat.shape[0]))

    return (adj_mat, features_mat, train_labels_mat, test_labels_mat, val_labels_mat,
            train_mask_mat, test_mask_mat, val_mask_mat)


In [2]:
# Load 20 randomly chosen graph files of the "txt_graph2216_21012020" dataset and
# unpack the merged adjacency / feature matrices, the masked labels and the masks.
(adj_mat, features_mat,
 train_labels_mat, test_labels_mat, val_labels_mat,
 train_mask_mat, test_mask_mat, val_mask_mat) = load_data("txt_graph2216_21012020", 20)

In [16]:
# Show the shape of every array returned by load_data, one shape per line.
print(*(arr.shape for arr in (adj_mat, features_mat,
                              train_labels_mat, test_labels_mat, val_labels_mat,
                              train_mask_mat, test_mask_mat, val_mask_mat)),
      sep='\n')

(3923, 3923)
(3923, 100)
(3923, 5)
(3923, 5)
(3923, 5)
(3923, 5)
(3923, 5)
(3923, 5)


In [152]:
# Remove every flag that is already registered before declaring new ones
# (presumably so that this cell can be executed more than once).
del_all_flags(tf.flags.FLAGS)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS

# String flags named after kernel arguments; all default to ''.
flags.DEFINE_string('mode', '', 'kernel')  # for line by line mode
flags.DEFINE_string('port', '', 'kernel')  # for line by line mode
flags.DEFINE_string('f', '', 'kernel')  # for jupyter notebook

# Training hyper-parameters
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')

# Some preprocessing: the feature and adjacency matrices from load_data are run
# through the project helpers; a single support matrix is used with the GCN model.
features = preprocess_features(features_mat)
support = [preprocess_adj(adj_mat)]
num_supports = 1
model_func = GCN


AttributeError: 'numpy.ndarray' object has no attribute 'tocoo'

In [None]:
# Define placeholders
# `features` is indexed like a tuple below: features[2] is used as the dense
# shape of the sparse feature matrix and features[2][1] as the input dimension
# (presumably the (coords, values, shape) tuple from preprocess_features - confirm).
# The label width comes from train_labels_mat, the array returned by load_data;
# the previously used name `y_train` is never defined in this notebook.
placeholders = {
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    'labels': tf.placeholder(tf.float32, shape=(None, train_labels_mat.shape[1])),
    'labels_mask': tf.placeholder(tf.int32),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
}

# Create model
model = model_func(placeholders, input_dim=features[2][1], logging=True)

In [None]:
# Initialize session
# The session is kept in the notebook-global `sess`; the training cell runs the
# model's ops through it.
sess = tf.Session()
# Init variables
# Run the initializer op for the global variables of the graph built so far.
sess.run(tf.global_variables_initializer())

In [None]:
# Train model
# Labels and masks are the arrays returned by load_data (train_labels_mat,
# train_mask_mat, val_labels_mat, val_mask_mat); the names y_train / train_mask /
# y_val / val_mask used before are never defined in this notebook.
# NOTE(review): load_data builds masks with one column per class, i.e. shape
# (n_nodes, n_classes); confirm that construct_feed_dict and the model's masked
# loss/accuracy accept that shape (a per-node mask would be train_mask_mat[:, 0]).
# NOTE(review): `evaluate` is not defined in this notebook; presumably it is
# provided by `from utils import *` - confirm.
cost_val = []

for epoch in range(FLAGS.epochs):

    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(features, support, train_labels_mat, train_mask_mat, placeholders)
    # Dropout is only switched on for the training step (placeholder default is 0.)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})

    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

    # Validation
    cost, acc, duration = evaluate(features, support, val_labels_mat, val_mask_mat, placeholders)
    cost_val.append(cost)

    # Print results
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
          "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
          "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))

    # Stop once the latest validation loss exceeds the mean of the
    # `early_stopping` validation losses recorded just before it
    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")

In [None]:
# Testing
# Uses test_labels_mat / test_mask_mat as returned by load_data; the names
# y_test / test_mask used before are never defined in this notebook.
# NOTE(review): the mask has shape (n_nodes, n_classes); confirm that `evaluate`
# (not defined in this notebook, presumably from `utils`) accepts that shape.
test_cost, test_acc, test_duration = evaluate(features, support, test_labels_mat, test_mask_mat, placeholders)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
      "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))