# Data Familiarity

Downloaded the gcn binary features for cora from [this repository](https://github.com/tkipf/gcn/tree/master/gcn/data) into `/data` folder. The following code is taken from the utils and changed to match our project.

The adjacency and features matrices are scipy sparse matrices so we do `.A` to convert to numpy ndarrays, which we need for pystruct. Also the adjacency matrix is NxN but we want Nx2 so we get all the indices of nonzero entries and stack the indices in an Nx2 matrix.

In [1]:
# Reference: https://github.com/tkipf/gcn/blob/master/gcn/utils.py
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys
import random
from time import time
import numpy as np

from pystruct.models import GraphCRF
from pystruct.learners import NSlackSSVM, FrankWolfeSSVM, SubgradientSSVM

def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

def load_data(dataset_str, size=None):
    """
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    if size:
        idx_train = random.sample(range(0, 2708), size)
        idx_test = [i for i in range(0,2708) if i not in idx_train]
    else:
        idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
#     y_train[train_mask, :] = labels[train_mask, :]
#     y_val[val_mask, :] = labels[val_mask, :]
#     y_test[test_mask, :] = labels[test_mask, :]

    y_train = labels[train_mask]
    y_val = labels[val_mask]
    y_test = labels[test_mask]
    # y must be int for pystruct
    return adj.A, features.A, labels, y_train.astype(int), y_val.astype(int), y_test.astype(int), train_mask, val_mask, test_mask

In [2]:
def get_train_test(size= None):
    adj, features, labels, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data("cora",size)
    adj = np.vstack(np.nonzero(adj))
    
    # print(np.dstack(np.nonzero(adj))[0])
    # print(adj.shape) # (2708, 2708)
    # print(features.shape) # (2708, 1433)

    features_train, features_test = features[train_mask], features[test_mask]
    nodes_train = np.nonzero(train_mask)[0]
    nodes_test = np.nonzero(test_mask)[0]
    
    edges_train = [[],[]]
    edges_test = [[],[]]
    for i in range(adj.shape[1]):
        node1 = adj[0][i]
        node2 = adj[1][i]
        if node1 in nodes_train and node2 in nodes_train:
            #print("**")
            #print(np.where(nodes_train==node1)[0][0])
            #print("*")
            #print(np.where(nodes_train==node2)[0][0])
            edges_train[0].append(np.where(nodes_train==node1)[0][0])
            edges_train[1].append(np.where(nodes_train==node2)[0][0])
        elif node1 in nodes_test and node2 in nodes_test:
            edges_test[0].append(np.where(nodes_test==node1)[0][0])
            edges_test[1].append(np.where(nodes_test==node2)[0][0])
    
    if len(edges_train[0]) == 0 or len(edges_train[1]) == 0 or len(edges_test[0]) == 0 or len(edges_test[1]) == 0:
        return get_train_test(size)
            
    X_train = [(features_train,  np.array(edges_train))]
    X_test = [(features_test,  np.array(edges_test))]
    
#     X_train = [(features_train, np.vstack([np.arange(features_train.shape[0] - 1), np.arange(1, features_train.shape[0])]))]
#     X_test = [(features_test, np.vstack([np.arange(features_test.shape[0] - 1), np.arange(1, features_test.shape[0])]))]

    # pystruct is expecting it the other way around
    y_train = y_train.transpose()
    y_test = y_test.transpose()

    #print(len([x for x in train_mask if x == True])) # num nodes in training # 140
    #print(X_train[0][0].shape) # 140, 1433
    #print(X_test[0][0].shape)
    return X_train, y_train, X_test, y_test

# Code for comapring to GCN results

In [3]:
%%time
X_train, y_train, X_test, y_test = get_train_test()

def get_y(y):
    y_final = []
    for i in range(len(y[0])):
        for j in range(len(y)):
            if y[j][i] == 1:
                y_final.append(j)
                break
    return y_final

y_train = np.array([get_y(y_train)])
y_test = np.array([get_y(y_test)])

# Asymmetric pairwise potentials directed=True, linear programming inference
asymmetric = GraphCRF(inference_method='max-product', directed=True)
symmetric = GraphCRF(inference_method='max-product', directed=False)

asymmetriclp = GraphCRF(inference_method='lp', directed=True)
symmetriclp = GraphCRF(inference_method='lp', directed=False)

models = [(asymmetric, "asymmetric, max-product"), (symmetric, "symmetric, max-product"),
         (asymmetriclp, "asymmetric, linear programming"), (symmetriclp, "symmetric, linear programming")]

for model, modeln in models:
    ssvm = SubgradientSSVM(model=model, C=0.1, max_iter=10)
    fwsvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
    nsvm = NSlackSSVM(model, C=100)

    print("*"*10)
    print(modeln)
    start = time()
    ssvm.fit(X_train, y_train)
    time_svm = time() - start
    y_pred = np.vstack(ssvm.predict(X_test))
    print("\tScore with pystruct crf subgradient svm: %f (took %f seconds)"
          % (np.mean(y_pred == y_test), time_svm))

    print("\t"+ "*"*5)
    start = time()
    fwsvm.fit(X_train, y_train)
    time_svm = time() - start
    y_pred = np.vstack(fwsvm.predict(X_test))
    print("\tScore with pystruct crf frankwolfe svm: %f (took %f seconds)"
          % (np.mean(y_pred == y_test), time_svm))

    print("\t" +"*"*5)
    start = time()
    nsvm.fit(X_train, y_train)
    time_svm = time() - start
    y_pred = np.vstack(nsvm.predict(X_test))
    print("\tScore with pystruct crf nslack svm: %f (took %f seconds)"
          % (np.mean(y_pred == y_test), time_svm))


**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.560000 (took 0.068659 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.603000 (took 0.037092 seconds)
	*****
	Score with pystruct crf nslack svm: 0.564000 (took 1.184892 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.560000 (took 0.087508 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.603000 (took 0.060608 seconds)
	*****
	Score with pystruct crf nslack svm: 0.563000 (took 1.237662 seconds)
**********
asymmetric, linear programming
	Score with pystruct crf subgradient svm: 0.576000 (took 0.835623 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.588000 (took 0.419812 seconds)
	*****
	Score with pystruct crf nslack svm: 0.569000 (took 4.573227 seconds)
**********
symmetric, linear programming
	Score with pystruct crf subgradient svm: 0.576000 (took 0.735637 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.585000 (took 0.536061 se

# Random dataset for varying size for 10 runs results. Shitty code but works under a deadline

In [4]:
%%time
datasizes = [68, 135 , 271, 677, 1354, 2031]

fwsvm_result = {}
nsvm_result = {}
ssvm_result = {}

for data_size in datasizes:
    ssvm_result[data_size] = {}
    fwsvm_result[data_size] = {}
    nsvm_result[data_size] = {}
    print("*"*5 + str(data_size) + "*"*5)
    for i in range(10):
        print("*"*3 + str(i) + "*"*3)
        X_train, y_train, X_test, y_test = get_train_test(data_size)

        y_train = np.array([get_y(y_train)])
        y_test = np.array([get_y(y_test)])
        print(X_train[0][0].shape)
        print(X_test[0][0].shape)

        # Asymmetric pairwise potentials directed=True, linear programming inference
        asymmetric = GraphCRF(inference_method='max-product', directed=True)
        symmetric = GraphCRF(inference_method='max-product', directed=False)

        asymmetriclp = GraphCRF(inference_method='lp', directed=True)
        symmetriclp = GraphCRF(inference_method='lp', directed=False)

        models = [(asymmetric, "asymmetric, max-product"), (symmetric, "symmetric, max-product")]
                 #(asymmetriclp, "asymmetric, linear programming"), (symmetriclp, "symmetric, linear programming")]
        
        for model, modeln in models:
            ssvm = SubgradientSSVM(model=model, C=0.1, max_iter=10)
            fwsvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
            nsvm = NSlackSSVM(model, C=100)

            print("*"*10)
            print(modeln)
            start = time()
            ssvm.fit(X_train, y_train)
            time_svm = time() - start
            y_pred = np.vstack(ssvm.predict(X_test))
            print("\tScore with pystruct crf subgradient svm: %f (took %f seconds)"
                  % (np.mean(y_pred == y_test), time_svm))
            if modeln not in ssvm_result[data_size]:
                ssvm_result[data_size][modeln] = []
            ssvm_result[data_size][modeln].append({"score": np.mean(y_pred == y_test), "time": time_svm})
            
            print("\t"+ "*"*5)
            start = time()
            fwsvm.fit(X_train, y_train)
            time_svm = time() - start
            y_pred = np.vstack(fwsvm.predict(X_test))
            print("\tScore with pystruct crf frankwolfe svm: %f (took %f seconds)"
                  % (np.mean(y_pred == y_test), time_svm))
            if modeln not in fwsvm_result[data_size]:
                fwsvm_result[data_size][modeln] = []
            fwsvm_result[data_size][modeln].append({"score": np.mean(y_pred == y_test), "time": time_svm})

            print("\t" +"*"*5)
            start = time()
            nsvm.fit(X_train, y_train)
            time_svm = time() - start
            y_pred = np.vstack(nsvm.predict(X_test))
            print("\tScore with pystruct crf nslack svm: %f (took %f seconds)"
                  % (np.mean(y_pred == y_test), time_svm))
            if modeln not in nsvm_result[data_size]:
                nsvm_result[data_size][modeln] = []
            nsvm_result[data_size][modeln].append({"score": np.mean(y_pred == y_test), "time": time_svm})
    


*****68*****
***0***
(68, 1433)
(2640, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.539394 (took 0.027119 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.542803 (took 0.013619 seconds)
	*****
	Score with pystruct crf nslack svm: 0.541288 (took 0.416411 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.539394 (took 0.030939 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.543182 (took 0.017142 seconds)
	*****
	Score with pystruct crf nslack svm: 0.534848 (took 0.418065 seconds)
***1***
(68, 1433)
(2640, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.478788 (took 0.023286 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.482955 (took 0.013365 seconds)
	*****
	Score with pystruct crf nslack svm: 0.482197 (took 0.412217 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.478788 (took 0.023862 seconds)
	*****
	Sco

	Score with pystruct crf nslack svm: 0.563933 (took 0.881590 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.589973 (took 0.036518 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.588030 (took 0.021562 seconds)
	*****
	Score with pystruct crf nslack svm: 0.563933 (took 0.918010 seconds)
***5***
(135, 1433)
(2573, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.612903 (took 0.037962 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.614069 (took 0.020818 seconds)
	*****
	Score with pystruct crf nslack svm: 0.575981 (took 0.877689 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.612903 (took 0.036721 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.614069 (took 0.021434 seconds)
	*****
	Score with pystruct crf nslack svm: 0.574427 (took 0.886330 seconds)
***6***
(135, 1433)
(2573, 1433)
**********
asymmetric, max-product
	Score with pystruct crf su

	Score with pystruct crf nslack svm: 0.607304 (took 0.962079 seconds)
***9***
(271, 1433)
(2437, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.630283 (took 0.063660 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.653673 (took 0.033857 seconds)
	*****
	Score with pystruct crf nslack svm: 0.608945 (took 0.958982 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.630693 (took 0.060192 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.652442 (took 0.034848 seconds)
	*****
	Score with pystruct crf nslack svm: 0.604842 (took 0.969379 seconds)
*****677*****
***0***
(677, 1433)
(2031, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.697194 (took 0.226591 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.708026 (took 0.112385 seconds)
	*****
	Score with pystruct crf nslack svm: 0.682915 (took 1.594468 seconds)
**********
symmetric, max-product
	Score with p

	Score with pystruct crf frankwolfe svm: 0.740030 (took 0.209375 seconds)
	*****
	Score with pystruct crf nslack svm: 0.706056 (took 2.455589 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.691285 (took 0.387498 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.740030 (took 0.209526 seconds)
	*****
	Score with pystruct crf nslack svm: 0.706056 (took 2.481842 seconds)
***4***
(1354, 1433)
(1354, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.685377 (took 0.373472 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.735598 (took 0.237005 seconds)
	*****
	Score with pystruct crf nslack svm: 0.741507 (took 2.434413 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.685377 (took 0.367884 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.735598 (took 0.210554 seconds)
	*****
	Score with pystruct crf nslack svm: 0.739291 (took 2.447020 seconds)
***5***
(1354

	Score with pystruct crf frankwolfe svm: 0.745938 (took 0.324833 seconds)
	*****
	Score with pystruct crf nslack svm: 0.729690 (took 3.486784 seconds)
***8***
(2031, 1433)
(677, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.675037 (took 0.488282 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.731167 (took 0.262826 seconds)
	*****
	Score with pystruct crf nslack svm: 0.714919 (took 2.957563 seconds)
**********
symmetric, max-product
	Score with pystruct crf subgradient svm: 0.675037 (took 0.591465 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.731167 (took 0.331806 seconds)
	*****
	Score with pystruct crf nslack svm: 0.714919 (took 3.262534 seconds)
***9***
(2031, 1433)
(677, 1433)
**********
asymmetric, max-product
	Score with pystruct crf subgradient svm: 0.722304 (took 0.510983 seconds)
	*****
	Score with pystruct crf frankwolfe svm: 0.744461 (took 0.277273 seconds)
	*****
	Score with pystruct crf nslack svm: 0.722304 (too

## Now only thing left to do is average score and time and form a nice little table and report it in the paper

In [5]:

def print_data(result):
    for size in result.keys():
        for model in result[size]:
            _score = []
            _time = []
            for run in result[size][model]:
                _score.append(run['score'])
                _time.append(run['time'])
            print("data size: " + str(size) + "\t\t" + model + ": \t" + str(sum(_score)/10*100) + "\t%.2fs" % (sum(_time)/10) )

In [6]:
print("Data for fwsvm")
print_data(fwsvm_result)
print("Data for nsvm")
print_data(nsvm_result)
print("Data for ssvm")
print_data(ssvm_result)

Data for fwsvm
data size: 68		asymmetric, max-product: 	50.465909090909086	0.03s
data size: 68		symmetric, max-product: 	50.465909090909086	0.03s
data size: 135		asymmetric, max-product: 	59.97668091721725	0.02s
data size: 135		symmetric, max-product: 	60.01943256898563	0.02s
data size: 271		asymmetric, max-product: 	65.25235945835044	0.03s
data size: 271		symmetric, max-product: 	65.21132540008206	0.04s
data size: 677		asymmetric, max-product: 	71.06351550960119	0.12s
data size: 677		symmetric, max-product: 	71.08321024126047	0.12s
data size: 1354		asymmetric, max-product: 	73.86262924667652	0.22s
data size: 1354		symmetric, max-product: 	73.86262924667652	0.21s
data size: 2031		asymmetric, max-product: 	73.79615952732644	0.28s
data size: 2031		symmetric, max-product: 	73.79615952732644	0.29s
Data for nsvm
data size: 68		asymmetric, max-product: 	48.3219696969697	0.60s
data size: 68		symmetric, max-product: 	48.151515151515156	0.62s
data size: 135		asymmetric, max-product: 	57.0695685