# Data Familiarity

Downloaded the gcn binary features for cora from [this repository](https://github.com/tkipf/gcn/tree/master/gcn/data) into `/data` folder. The following code is taken from the utils and changed to match our project.

The adjacency and features matrices are scipy sparse matrices so we do `.A` to convert to numpy ndarrays, which we need for pystruct. Also the adjacency matrix is NxN but we want Nx2 so we get all the indices of nonzero entries and stack the indices in an Nx2 matrix.

In [167]:
# Reference: https://github.com/tkipf/gcn/blob/master/gcn/utils.py
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys

def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

def load_data(dataset_str):
    """
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
#     y_train[train_mask, :] = labels[train_mask, :]
#     y_val[val_mask, :] = labels[val_mask, :]
#     y_test[test_mask, :] = labels[test_mask, :]

    y_train = labels[train_mask]
    y_val = labels[val_mask]
    y_test = labels[test_mask]
    # y must be int for pystruct
    print(labels[train_mask].shape)
    return adj.A, features.A, y_train.astype(int), y_val.astype(int), y_test.astype(int), train_mask, val_mask, test_mask

In [169]:
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data("cora")
adj = np.vstack(np.nonzero(adj))[0]

# print(np.dstack(np.nonzero(adj))[0])
# print(adj.shape) # (2708, 2708)
# print(features.shape) # (2708, 1433)

# X_train, X_test = (features[train_mask], adj[train_mask]), (features[test_mask], adj[test_mask])
features_train, features_test = features[train_mask], features[test_mask]
# X_train, X_test = [(features_train, adj)], [(features_test, adj)]
# X_train, X_test = [(features_train, adj)], [(features_test, adj)]
X_train = [(features_train, np.vstack([np.arange(features_train.shape[0] - 1), np.arange(1, features_train.shape[0])]))]
X_test = [(features_test, np.vstack([np.arange(features_test.shape[0] - 1), np.arange(1, features_test.shape[0])]))]

y_train = y_train.transpose()
y_test = y_test.transpose()

print(len([x for x in train_mask if x == True])) # num nodes in training # 140
print(X_train[0][0].shape) # 140, 1433
print(y_train.shape) # 2708, 7

(140, 7)
140
(140, 1433)
(7, 140)


# Feature Selection

In [172]:
from time import time
import numpy as np

from pystruct.models import GraphCRF
from pystruct.learners import NSlackSSVM, FrankWolfeSSVM

# Asymmetric pairwise potentials directed=True, linear programming inference
pbl = GraphCRF(inference_method='max-product', directed=True)
ssvm = FrankWolfeSSVM(model=pbl, C=.1, max_iter=10)
# svm = NSlackSSVM(pbl, C=100)

start = time()
ssvm.fit(X_train, y_train)
# svm.fit(X_train, y_train)
time_svm = time() - start
y_pred = np.vstack(ssvm.predict(X_test))
print("Score with pystruct crf svm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_svm))

Score with pystruct crf svm: 0.000000 (took 0.028184 seconds)


