# Data Familiarity

Downloaded the GCN binary feature files for Cora from [this repository](https://github.com/tkipf/gcn/tree/master/gcn/data) into the `data/` folder. The following code is taken from that repository's `utils.py` and adapted to match our project.

The adjacency and feature matrices are scipy sparse matrices, so we call `.A` (shorthand for `.toarray()`) to convert them to dense numpy ndarrays, which we need for pystruct. Also, the adjacency matrix is N×N (N = number of nodes), but we want an edge list with two columns, so we take the indices of all nonzero entries and stack them into an E×2 matrix (E = number of edges, one row per edge).

In [13]:
# Reference: https://github.com/tkipf/gcn/blob/master/gcn/utils.py
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys
import random
from time import time
import numpy as np

from pystruct.models import GraphCRF
from pystruct.learners import NSlackSSVM, FrankWolfeSSVM, SubgradientSSVM

def parse_index_file(filename):
    """Parse an index file that stores one integer per line.

    :param filename: Path of the text file to read.
    :return: List of the integers found, in file order.
    :raises ValueError: If a line (after stripping whitespace) is not an integer.
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file]

def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at the given indices.

    :param idx: Indices to select (anything numpy accepts as an index, e.g. a
        list or a ``range``).
    :param l: Length of the mask.
    :return: 1-D boolean ``numpy.ndarray`` of length ``l``.
    """
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
    # from NumPy; allocating the mask as bool also avoids a float round-trip.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

def load_data(dataset_str, size=None):
    """
    Loads input data from the local ``data`` directory (files from gcn/data).

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :param size: Optional number of training nodes to draw uniformly at random
        (via ``random.sample``) from all nodes; every node not drawn becomes a
        test node. When falsy, the split stored in the files is used: the
        first ``len(y)`` rows are training nodes and the rows listed in the
        test index file are test nodes.
    :return: Tuple ``(adj, features, labels, y_train, y_val, y_test,
        train_mask, val_mask, test_mask)``. ``adj`` and ``features`` are dense
        numpy arrays, ``labels`` is the stacked label matrix, the ``y_*``
        arrays are the label rows selected by the matching mask cast to int,
        and the masks are boolean arrays with one entry per label row.
    :raises ValueError: If ``size`` is larger than the number of nodes
        (raised by ``random.sample``).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            # NOTE: unpickling executes arbitrary code; only load dataset
            # files obtained from a trusted source.
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    # The test rows are stored in the order of the index file; move each one
    # to the row given by its index so rows line up with the graph node ids.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Derive the node count from the data instead of hard-coding Cora's 2708.
    n_nodes = labels.shape[0]

    idx_test = test_idx_range.tolist()
    if size:
        idx_train = random.sample(range(n_nodes), size)
        # Set membership keeps the complement linear in the number of nodes.
        chosen = set(idx_train)
        idx_test = [i for i in range(n_nodes) if i not in chosen]
    else:
        idx_train = range(len(y))
    # The validation rows are always the 500 rows following the labeled
    # training rows, independent of ``size``.
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, n_nodes)
    val_mask = sample_mask(idx_val, n_nodes)
    test_mask = sample_mask(idx_test, n_nodes)

    y_train = labels[train_mask]
    y_val = labels[val_mask]
    y_test = labels[test_mask]
    # y must be int for pystruct.
    # ``.toarray()`` is used instead of ``.A`` because it is available on both
    # scipy sparse matrices and the sparse arrays newer networkx returns.
    return (adj.toarray(), features.toarray(), labels,
            y_train.astype(int), y_val.astype(int), y_test.astype(int),
            train_mask, val_mask, test_mask)

In [14]:
def _chain_edges(n_nodes):
    """Return the 2 x (n_nodes - 1) array [[0..n-2], [1..n-1]] linking consecutive rows."""
    return np.vstack([np.arange(n_nodes - 1), np.arange(1, n_nodes)])


def get_train_test(size= None, model=None):
    """Build single-sample pystruct train/test inputs from the Cora data.

    :param size: Forwarded to ``load_data``; number of randomly sampled
        training nodes, or ``None`` for the split stored in the data files.
    :param model: Unused; kept so existing callers keep working.
    :return: ``(X_train, y_train, X_test, y_test)`` where each ``X_*`` is a
        one-element list holding a ``(features, edges)`` tuple and each
        ``y_*`` is the transposed int label matrix of the matching split.
    """
    _adj, features, _labels, y_train, _y_val, y_test, train_mask, _val_mask, test_mask = load_data("cora", size)

    # The citation graph returned by load_data is not used below: each split
    # is wired as a simple chain over its own rows, so the (previously
    # computed and discarded) nonzero scan of the dense adjacency is dropped.
    features_train, features_test = features[train_mask], features[test_mask]

    # NOTE(review): pystruct's GraphCRF documents edges as shape (n_edges, 2);
    # this array is (2, n - 1) -- confirm whether a transpose is intended.
    X_train = [(features_train, _chain_edges(features_train.shape[0]))]
    X_test = [(features_test, _chain_edges(features_test.shape[0]))]

    # pystruct is expecting it the other way around
    y_train = y_train.transpose()
    y_test = y_test.transpose()

    return X_train, y_train, X_test, y_test



# Code for comparing to GCN results

In [15]:
%%time
# Train and score every CRF variant with every SSVM learner on the default
# split returned by get_train_test().
X_train, y_train, X_test, y_test = get_train_test()

print(X_train[0][0].shape)
print(X_test[0][0].shape)

# directed=True gives asymmetric pairwise potentials, directed=False symmetric
# ones; each is paired with max-product and with linear-programming inference.
asymmetric = GraphCRF(inference_method='max-product', directed=True)
symmetric = GraphCRF(inference_method='max-product', directed=False)

asymmetriclp = GraphCRF(inference_method='lp', directed=True)
symmetriclp = GraphCRF(inference_method='lp', directed=False)

model_names = (
    "asymmetric, max-product",
    "symmetric, max-product",
    "asymmetric, linear programming",
    "symmetric, linear programming",
)
models = list(zip((asymmetric, symmetric, asymmetriclp, symmetriclp), model_names))

for model, modeln in models:
    # All three learners are created up front and share the same CRF instance.
    ssvm = SubgradientSSVM(model=model, C=0.1, max_iter=10)
    fwsvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
    nsvm = NSlackSSVM(model, C=100)
    learners = [
        (ssvm, "\tScore with pystruct crf subgradient svm: %f (took %f seconds)"),
        (fwsvm, "\tScore with pystruct crf frankwolfe svm: %f (took %f seconds)"),
        (nsvm, "\tScore with pystruct crf nslack svm: %f (took %f seconds)"),
    ]

    print("*"*10)
    print(modeln)
    for position, (learner, report) in enumerate(learners):
        # A short separator line goes between learners, not before the first.
        if position:
            print("\t" + "*"*5)
        start = time()
        learner.fit(X_train, y_train)
        time_svm = time() - start
        y_pred = np.vstack(learner.predict(X_test))
        print(report % (np.mean(y_pred == y_test), time_svm))


(140, 1433)
(1000, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.833571 (took 0.033032 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.810714 (took 0.021192 seconds)
	*****
	Score with pystruct crf nslack svm: 0.801429 (took 0.125874 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.832857 (took 0.036799 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.809286 (took 0.017745 seconds)
	*****
	Score with pystruct crf nslack svm: 0.795000 (took 0.107786 seconds)
**********
asymmetric, linear programming
	Score with pystruct crf subgradient svm: 0.825000 (took 0.195695 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.810714 (took 0.106027 seconds)
	*****
	Score with pystruct crf nslack svm: 0.801429 (took 0.438528 seconds)
**********
symmetric, linear programming
	Score with pystruct crf subgradient svm: 0.805000 (took 0.236681 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0

# Results on randomly sampled training sets of varying size, 10 runs per size. Quick-and-dirty code written under a deadline, but it works

In [16]:
%%time
# Training-set sizes to evaluate; each size is repeated for 10 random splits.
datasizes = [68, 135 , 271, 677, 1354, 2031]

# Per learner: {data_size: {model name: [{"score": ..., "time": ...}, ...]}}
fwsvm_result = {}
nsvm_result = {}
ssvm_result = {}

for data_size in datasizes:
    ssvm_result[data_size] = {}
    fwsvm_result[data_size] = {}
    nsvm_result[data_size] = {}
    print("*"*5 + str(data_size) + "*"*5)
    for i in range(10):
        print("*"*3 + str(i) + "*"*3)
        X_train, y_train, X_test, y_test = get_train_test(data_size)

        print(X_train[0][0].shape)
        print(X_test[0][0].shape)

        # directed=True gives asymmetric pairwise potentials, directed=False
        # symmetric ones; both are run with max-product and LP inference.
        asymmetric = GraphCRF(inference_method='max-product', directed=True)
        symmetric = GraphCRF(inference_method='max-product', directed=False)

        asymmetriclp = GraphCRF(inference_method='lp', directed=True)
        symmetriclp = GraphCRF(inference_method='lp', directed=False)

        model_names = (
            "asymmetric, max-product",
            "symmetric, max-product",
            "asymmetric, linear programming",
            "symmetric, linear programming",
        )
        models = list(zip((asymmetric, symmetric, asymmetriclp, symmetriclp), model_names))

        for model, modeln in models:
            # All three learners are created up front and share the CRF instance.
            ssvm = SubgradientSSVM(model=model, C=0.1, max_iter=10)
            fwsvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
            nsvm = NSlackSSVM(model, C=100)
            learners = [
                (ssvm, ssvm_result[data_size],
                 "\tScore with pystruct crf subgradient svm: %f (took %f seconds)"),
                (fwsvm, fwsvm_result[data_size],
                 "\tScore with pystruct crf frankwolfe svm: %f (took %f seconds)"),
                (nsvm, nsvm_result[data_size],
                 "\tScore with pystruct crf nslack svm: %f (took %f seconds)"),
            ]

            print("*"*10)
            print(modeln)
            for position, (learner, history, report) in enumerate(learners):
                # A short separator line goes between learners, not before the first.
                if position:
                    print("\t" + "*"*5)
                start = time()
                learner.fit(X_train, y_train)
                time_svm = time() - start
                y_pred = np.vstack(learner.predict(X_test))
                score = np.mean(y_pred == y_test)
                print(report % (score, time_svm))
                # One record per run, grouped under the model's display name.
                history.setdefault(modeln, []).append({"score": score, "time": time_svm})
    


*****68*****
***0***
(68, 1433)
(2640, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.826569 (took 0.019546 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.812500 (took 0.011456 seconds)
	*****
	Score with pystruct crf nslack svm: 0.800054 (took 0.087755 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.828193 (took 0.023925 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.812500 (took 0.012613 seconds)
	*****
	Score with pystruct crf nslack svm: 0.795996 (took 0.132731 seconds)
**********
asymmetric, linear programming
	Score with pystruct crf subgradient svm: 0.833333 (took 0.092662 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.812500 (took 0.060717 seconds)
	*****
	Score with pystruct crf nslack svm: 0.800054 (took 0.235886 seconds)
**********
symmetric, linear programming
	Score with pystruct crf subgradient svm: 0.833604 (took 0.091580 seconds)
	*****
	Score with pystruct c

## Now the only thing left to do is to average the scores and times, put them in a nice little table, and report it in the paper