In [12]:
import sys
sys.path.append('../../sparsedense/')

In [13]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import importlib
from statsmodels.nonparametric.kde import KDEUnivariate
from collections import Counter
import pandas as pd

In [14]:
import optim as opt
import helpertests as tst
import sparsedense as spd

In [15]:
test = 'enron'

In [16]:
links = np.genfromtxt('{}.tsv'.format(test), delimiter='\t', dtype=int)
g = nx.Graph()
g.add_edges_from(links)
net = nx.adjacency_matrix(g).toarray().astype(int)
for i in range(net.shape[0]):
    net[i, i] = 1
    

In [17]:
# with open('{}_clique_init.pkl'.format(test, suffix), 'rb') as f:
#     runs = pickle.load(f)

In [26]:
alpha, sigma, c, num_cliques = runs['alphas'][-1], runs['sigmas'][-1], runs['cs'][-1], runs['Ks'][-1]
alpha, sigma, c, num_cliques

(2.8250965584052055, 0.3216353225668911, 50.037607173268064, 2773)

In [7]:
def gen_rcc_samples(nreps, alpha, sigma, c, num_cliques):
    nreps = 25
    for k in range(nreps):
        Z = opt.sample_from_ibp(num_cliques, alpha=alpha, sigma=sigma, c=c)
        net = Z.transpose() @ Z
        K, N = Z.shape
        links = []
        for i in range(N - 1):
            for j in range(i + 1, N):
                if net[i, j] >= 1:
                    links.append([i, j])
        np.savetxt('rcc_runs_clique_init/{}.tsv'.format(k), np.array(links), delimiter='\t', fmt='%d')

In [8]:
# nreps = 25
# gen_rcc_samples(nreps, alpha, sigma, c, num_cliques)

In [22]:
def test_samples(path, n=25, mc_size=500, verbose=True, offset=0):
    
    num_nodes, num_edges, edge_node, density, deg, max_clique_node, clust_coeff, conn_comp_largest, short_paths, triang_node = \
        [], [], [], [], [], [], [], [], [], []

    for i in range(1, n):
        links = np.genfromtxt(path + '{}.tsv'.format(i+offset), delimiter='\t', dtype=int)
        g = nx.Graph()
        g.add_edges_from(links - 1)

        num_nodes_i, num_edges_i, edge_node_ratio_i, density_i, deg_i, max_clique_node_i, clust_coeff_i, conn_comp_i, short_paths_i, triang_node_i = \
            tst.test_stats(g, mc_size=mc_size, verbose=False, return_results=True)

        num_nodes.append(num_nodes_i)
        num_edges.append(num_edges_i)
        edge_node.append(edge_node_ratio_i)
        density.append(density_i)
        deg.append(deg_i)
        max_clique_node.append(max_clique_node_i)
        clust_coeff.append(clust_coeff_i)
        conn_comp_largest.append(conn_comp_i / num_nodes_i)
        short_paths.append(short_paths_i)
        triang_node.append(triang_node_i)

    if verbose:
        print("- num nodes: {:f}, {:f}".format(np.mean(num_nodes), np.std(num_nodes)))
        print("- num edges: {:f}, {:f}".format(np.mean(num_edges), np.std(num_edges)))
        print("- edge node ratio: {:2.2f}, {:f}".format(np.mean(edge_node), np.std(edge_node)))
        print("- triang node ratio: {:f}, {:f}".format(np.mean(triang_node), np.std(triang_node)))
        print("- density: {:2.6f}, {:f}".format(np.mean(density), np.std(density)))
        print("- mean degree: {:2.2f}, {:f}".format(np.mean(deg), np.std(deg)))
        print("- mean maximal clique containing node: {:2.2f}, {:f}".format(np.mean(max_clique_node), np.std(max_clique_node)))
        print("- clustering coefficient: {:2.2f}, {:f}".format(np.mean(clust_coeff), np.std(clust_coeff)))
        print("- connected component sizes (largest): {}, {:f}".format(np.mean(conn_comp_largest), np.std(conn_comp_largest)))
        print("- mean distance between nodes (largest conn. comp.): {:2.2f}, {:f}".format(np.mean(short_paths), np.std(short_paths)))  

In [23]:
tst.test_stats(g)

- num nodes: 1172
- num edges: 4293
- edge node ratio: 3.66
- triang node ratio: 8.29
- density: 0.006256
- mean degree: 7.33
- clustering coefficient: 0.23
- mean maximal clique containing node: 3.22
- connected component sizes (top 5): [1166, 2, 2, 2]
- mean distance between nodes (largest conn. comp.): 3.47


In [24]:
test_samples('rcc_runs_clique_init/')

- num nodes: 1139.041667, 37.584881
- num edges: 9478.125000, 1002.417341
- edge node ratio: 8.32, 0.778312
- triang node ratio: 104.675150, 27.593268
- density: 0.014621, 0.001356
- mean degree: 16.63, 1.556624
- mean maximal clique containing node: 5.78, 0.456343
- clustering coefficient: 0.58, 0.026178
- connected component sizes (largest): 0.9929751960361939, 0.004076
- mean distance between nodes (largest conn. comp.): 2.74, 0.078138


In [32]:
test_samples('bnpgraph_runs/enron_', offset=1)

- num nodes: 1181.958333, 51.632902
- num edges: 4531.625000, 555.018754
- edge node ratio: 3.82, 0.341685
- density: 0.006472, 0.000443
- mean degree: 7.65, 0.683370
- mean maximal clique containing node: 2.58, 0.089446
- clustering coefficient: 0.05, 0.009877
- connected component sizes (largest): 0.9855017520039464, 0.005060
- mean distance between nodes (largest conn. comp.): 3.42, 0.093297


In [33]:
def fit_kde(x, grid):
    resol = len(grid)
    d = np.zeros(resol)
    kde = KDEUnivariate(x)
    kde.fit()
    d = kde.evaluate(grid)    
    return d

def fit_count(x, grid):
    cnt = Counter(x)
    d = np.array([cnt[y] for y in grid]) / len(x)
    return d

def degree_clique_density_runs(path, deg_grid, clique_grid, n = 25, offset=0):
    deg_dens = np.zeros(len(deg_grid))
    clique_dens = np.zeros(len(clique_grid))
    
    for k in range(n):
        links = np.genfromtxt(path + '{}.tsv'.format(k + offset), delimiter='\t', dtype=int)
        g = nx.Graph()
        g.add_edges_from(links - 1)


        degs = np.array([nx.degree(g, i) for i in g.nodes()], dtype=float)
        clique = np.array([nx.node_clique_number(g, i) for i in g.nodes()], dtype=float)
        
        deg_dens += fit_kde(degs, deg_grid) / n  
        clique_dens += fit_count(clique, clique_grid) / n  
        
    return deg_dens, clique_dens

def degree_clique_density_graph(g, deg_grid, clique_grid):
    degs = np.array([nx.degree(g, i) for i in g.nodes()], dtype=float)
    clique = np.array([nx.node_clique_number(g, i) for i in g.nodes()], dtype=float)

    deg_dens = fit_kde(degs, deg_grid)
    clique_dens = fit_count(clique, clique_grid) 
        
    return deg_dens, clique_dens

In [37]:
deg_grid = np.linspace(1, 45, 20)
clique_grid = np.arange(2, 20)

deg_dens_rcc, clique_dens_rcc = degree_clique_density_runs('rcc_runs_clique_init/', deg_grid, clique_grid)
deg_dens_bnp, clique_dens_bnp = degree_clique_density_runs('bnpgraph_runs/', deg_grid, clique_grid, offset=1)

OSError: bnpgraph_runs/1.tsv not found.

In [None]:
deg_dens_orig, clique_dens_orig = degree_clique_density_graph(g, deg_grid, clique_grid)

In [None]:
deg_df = pd.DataFrame({
    'Truth': deg_dens_orig, 
    'RCC': deg_dens_rcc, 
    'BNPGraph': deg_dens_bnp}, index=deg_grid)
maxclique_df = pd.DataFrame({
    'Truth': clique_dens_orig, 
    'RCC': clique_dens_rcc,
    'BNPGraph': clique_dens_bnp}, index=clique_grid)

In [None]:
style = ['b--','r-','c-.']
deg_df.plot(style=style, markersize=5, linewidth=2, figsize = (6, 4))
plt.legend(fontsize=14) # using a size in points
plt.xlim(1, 45)
plt.xlabel("Degree", fontsize=14)
# plt.ylabel("Density")
# plt.title("Degree distribution")
plt.savefig('enron_clique_init_degree_dist.png', format="PNG")
plt.show()

In [None]:
style = ['b--','r-','c-.']
maxclique_df.plot(style=style, markersize=5, linewidth=2, figsize = (6, 4))
plt.legend(fontsize=14) # using a size in points
plt.xlim(2, 15)
plt.xlabel("Maximal clique size per node", fontsize=14)
# plt.ylabel("Density")
# plt.title("Degree distribution")
plt.savefig('enron_clique_init_maximal_clique_dist.png', format="PNG")
plt.show()