In [1]:
from collections import defaultdict
import glob
from time import time

import numpy as np
import pandas as pd
import networkx as nx
from new_SPOC import *
import old_SPOC
from generate_SPOC_model import *
import matplotlib.pyplot as plt
import matlab.engine
from tqdm import tqdm_notebook
from IPython.display import clear_output, display

%matplotlib inline

import random
random.seed(42)
np.random.seed(42)

eng = matlab.engine.start_matlab()

In [2]:
def load_data(adjacency_filepath, community_filepath):
    G=nx.read_edgelist(adjacency_filepath, nodetype=int)
    nodelist = list(G.nodes())
    node2indx = {n: i for i, n in enumerate(nodelist)}
    A = nx.to_scipy_sparse_matrix(G, nodelist=nodelist)
    comms = {}

    with open(community_filepath) as f:
        for line in f:
            args = line.split()
            k = int(args[1]) - 1
            if k not in comms:
                comms[k] = ([], [])
            comms[k][0].append(node2indx[int(args[0])])
            comms[k][1].append(float(args[2]))

    comm_count = len(comms)
    comms_array = np.zeros((A.shape[0], comm_count))
    for i in range(comm_count):
        comms_array[comms[i][0], i] = comms[i][1]
    return nodelist, A, comms_array

In [3]:
data_paths = glob.glob('./data/coauthorship/*_adjacency.txt')
data_paths = [(adj, adj.replace("adjacency", "community"))for adj in data_paths]
print("\n".join("{}, {}".format(*data) for data in data_paths))

./data/coauthorship/DBLP1_adjacency.txt, ./data/coauthorship/DBLP1_community.txt
./data/coauthorship/DBLP2_adjacency.txt, ./data/coauthorship/DBLP2_community.txt
./data/coauthorship/DBLP3_adjacency.txt, ./data/coauthorship/DBLP3_community.txt
./data/coauthorship/DBLP4_adjacency.txt, ./data/coauthorship/DBLP4_community.txt
./data/coauthorship/DBLP5_adjacency.txt, ./data/coauthorship/DBLP5_community.txt
./data/coauthorship/MAG1_adjacency.txt, ./data/coauthorship/MAG1_community.txt
./data/coauthorship/MAG2_adjacency.txt, ./data/coauthorship/MAG2_community.txt


#### SPOC methods from A matrix

In [5]:
methods = {#"SPOC": lambda A, n_clusters: SPOC(A, n_clusters, use_ellipsoid=False, use_cvxpy=False),
           "GeoNMF": lambda A, n_clusters: eng.GeoNMF(A, n_clusters, 0.25, 0.95, nargout=2),
           #"SPOC_bootstrap": lambda A, n_clusters: SPOC_bootstrap(A, n_clusters, n_repetitions=150, std_num=3),
          }

results = []
for method_name in tqdm_notebook(methods):
    print('\t', method_name)
    for data in tqdm_notebook(data_paths):
        print(time())
        nodelist, A, comms = load_data(*data)
        n_clusters = comms.shape[1]
        A_mat = matlab.double(A.todense().tolist())
        print(data, '...', 'clusters:', n_clusters, "nodes: ", A.shape[0])
        time_start = time()
        theta, b = methods[method_name](A_mat, n_clusters)
        time_end = time()
        res = {"relative MSE": find_permutation_Theta(comms, theta)[0], 
               "Mean Spearman coefficient": find_permutation_spearmanr(comms, theta)[0],
               "Method": method_name,
               "Dataset": data[0].split("/")[-1].split("_")[0],
               "cluster #": n_clusters, 
               "node #": A.shape[0],
               "time (sec)": time_end - time_start,
              }
        results.append(res)
        print(res)

('\t', 'GeoNMF')


1505642823.32



KeyboardInterrupt: 

In [11]:
results

[]

In [14]:
A.tode

array(<30566x30566 sparse matrix of type '<type 'numpy.int64'>'
	with 272124 stored elements in Compressed Sparse Row format>, dtype=object)