# Data Familiarity

Downloaded the gcn binary features for cora from [this repository](https://github.com/tkipf/gcn/tree/master/gcn/data) into `/data` folder. The following code is taken from the utils and changed to match our project.

The adjacency and features matrices are scipy sparse matrices so we do `.A` to convert to numpy ndarrays, which we need for pystruct. Also the adjacency matrix is NxN but we want Nx2 so we get all the indices of nonzero entries and stack the indices in an Nx2 matrix.

In [52]:
# Reference: https://github.com/tkipf/gcn/blob/master/gcn/utils.py
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys

def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

def load_data(dataset_str):
    """
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
#     y_train[train_mask, :] = labels[train_mask, :]
#     y_val[val_mask, :] = labels[val_mask, :]
#     y_test[test_mask, :] = labels[test_mask, :]

    y_train = labels[train_mask]
    y_val = labels[val_mask]
    y_test = labels[test_mask]
    # y must be int for pystruct
    return adj.A, features.A, y_train.astype(int), y_val.astype(int), y_test.astype(int), train_mask, val_mask, test_mask

In [175]:
def in_range(val1, val2, low, high):
    return (low <= val1 <= high) and (low <= val2 <= high)

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data("cora")
adj = np.vstack(np.nonzero(adj))

# print(np.dstack(np.nonzero(adj))[0])
# print(adj.shape) # (2708, 2708)
# print(features.shape) # (2708, 1433)

features_train, features_test = features[train_mask], features[test_mask]
nodes_train = np.nonzero(train_mask)[0]
nodes_test = np.nonzero(test_mask)[0]

"""
 Keep edges in training / testing (but ignore edges to nodes outside the set)
 Also remap the node #, according to the index in nodes_train/test
 This is because originally we have 2708 nodes, but now for training 
 we  only use  140 nodes. We could pick a node that could have 
 been the 633rd node in the original graph but now it's one of 
 the 140, so the index changes.
"""
train_edges = [[], []]
test_edges = [[], []]
for i in range(adj.shape[1]):
    node1 = adj[0][i]
    node2 = adj[1][i]
    # Check if edge pair is in range
    if in_range(node1, node2, nodes_train[0], 
                nodes_train[len(nodes_train) - 1]):
        # This does the index mapping for the adj for the subset graph
        index1 = np.where(nodes_train==node1)[0][0]
        index2 = np.where(nodes_train==node2)[0][0]
        train_edges[0].append(index1)
        train_edges[1].append(index2)
    # Same thing but now for testing
    elif in_range(node1, node2, nodes_test[0], 
                nodes_test[len(nodes_test) - 1]):
        index1 = np.where(nodes_test==node1)[0][0]
        index2 = np.where(nodes_test==node2)[0][0]
        test_edges[0].append(index1)
        test_edges[1].append(index2)
        
# train_edges = np.vstack([np.arange(features_train.shape[0] - 1), np.arange(1, features_train.shape[0])])
# test_edges = np.vstack([np.arange(features_test.shape[0] - 1), np.arange(1, features_test.shape[0])])

X_train = [(features_train, np.array(train_edges))]
X_test = [(features_test, np.array(test_edges))]

# pystruct is expecting it the other way around
y_train = y_train.transpose()
y_test = y_test.transpose()

In [177]:
from time import time
import numpy as np

from pystruct.models import GraphCRF
from pystruct.learners import NSlackSSVM, FrankWolfeSSVM, SubgradientSSVM

# Asymmetric pairwise potentials directed=True, linear programming inference
asymmetric = GraphCRF(inference_method='max-product', directed=True)
symmetric = GraphCRF(inference_method='max-product', directed=False)

asymmetriclp = GraphCRF(inference_method='lp', directed=True)
symmetriclp = GraphCRF(inference_method='lp', directed=False)

models = [(asymmetric, "asymmetric, max-product"), (symmetric, "symmetric, max-product"),
         (asymmetriclp, "asymmetric, linear programming"), (symmetriclp, "symmetric, linear programming")]
for model, modeln in models:
    ssvm = SubgradientSSVM(model=model, C=0.1, max_iter=10)
    fwsvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
    nsvm = NSlackSSVM(model, C=100)
    
    print("*"*10)
    print(modeln)
    start = time()
    ssvm.fit(X_train, y_train)
    time_svm = time() - start
    y_pred = np.vstack(ssvm.predict(X_test))
    print("\tScore with pystruct crf subgradient svm: %f (took %f seconds)"
          % (np.mean(y_pred == y_test), time_svm))
    print("\tScore and time reported by API %f (%f seconds)"  % (ssvm.score(X_test,y_test), sum(ssvm.timestamps_[1:])))
    
    print("\t"+ "*"*5)
    start = time()
    fwsvm.fit(X_train, y_train)
    time_svm = time() - start
    y_pred = np.vstack(fwsvm.predict(X_test))
    print("\tScore with pystruct crf frankwolfe svm: %f (took %f seconds)"
          % (np.mean(y_pred == y_test), time_svm))
    print("\tScore and time reported by API %f (%f seconds)"  % (fwsvm.score(X_test,y_test), sum(fwsvm.timestamps_[1:])))

    print("\t" +"*"*5)
    start = time()
    nsvm.fit(X_train, y_train)
    time_svm = time() - start
    y_pred = np.vstack(nsvm.predict(X_test))
    print("\tScore with pystruct crf nslack svm: %f (took %f seconds)"
          % (np.mean(y_pred == y_test), time_svm))
    print("\tScore and time reported by API %f (%f seconds)"  % (nsvm.score(X_test,y_test), sum(nsvm.timestamps_[1:])))


**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.819286 (took 0.078039 seconds)
	Score and time reported by API 0.981000 (0.456822 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.810714 (took 0.028948 seconds)
	Score and time reported by API 0.981571 (0.030324 seconds)
	*****
	Score with pystruct crf nslack svm: 0.796429 (took 0.192461 seconds)
	Score and time reported by API 0.978714 (3.510830 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.819286 (took 0.048861 seconds)
	Score and time reported by API 0.981000 (0.275885 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.810000 (took 0.030812 seconds)
	Score and time reported by API 0.981429 (0.033145 seconds)
	*****
	Score with pystruct crf nslack svm: 0.800714 (took 0.169075 seconds)
	Score and time reported by API 0.978429 (2.565343 seconds)
**********
asymmetric, linear programming
	Score with pystruct crf subgradient svm: 0.802143 (took 0.4565