In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [15]:
import networkx as nx
import numpy as np
import sys; sys.path.append('../')
from anytree import RenderTree
from anytree.cachedsearch import find
import logging
import pickle
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path

In [2]:
from VRG.src.Tree import TreeNode, create_tree, dasgupta_cost
from VRG.runner import get_clustering

In [3]:
%matplotlib inline

In [4]:
def load_pickle(fname):
    """
    Load and return the object stored in the pickle file `fname`.

    The file is opened in binary mode and closed again as soon as the object
    has been read, even if unpickling raises.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this project.
    """
    with open(fname, 'rb') as fp:
        return pickle.load(fp)

In [21]:
def get_graph(gname: str = 'sample'):
    """
    Build or load the graph identified by `gname`.

    Supported values:
      * 'sample'   -- hand-built 9-node graph with a 'color' node attribute
      * 'karate'   -- Zachary's karate club (node attribute 'club')
      * 'BA'       -- Barabasi-Albert graph with n=10, m=2, seed=42
      * '*.gpickle' -- any path ending in '.gpickle', loaded as-is
      * anything else is read from disk (gpickle, GML or edge list, depending
        on the name), stripped of self-loops, reduced to its largest connected
        component and relabelled with consecutive integers; the original
        labels are kept in the 'orig_label' node attribute.

    :param gname: name of a known dataset, or a path ending in '.gpickle'
    :return: tuple `(g, attr_name)`; `attr_name` is the node attribute
        associated with the dataset, or '' when the branch does not set one
    """
    attr_name = ''
    if gname == 'sample':
        g = nx.Graph()
        g.add_nodes_from(range(5), color='blue')
        g.add_nodes_from(range(5, 9), color='red')

        g.add_edges_from([(0, 1), (0, 3), (0, 4),
                          (1, 2), (1, 4), (1, 5),
                          (2, 3), (2, 4), (2, 8),
                          (3, 4),
                          (5, 6), (5, 7), (5, 8),
                          (6, 7), (6, 8),
                          (7, 8)])  # properly labeled
        g.name = 'sample'
        attr_name = 'color'
    elif gname == 'karate':
        g = nx.karate_club_graph()
        attr_name = 'club'
        g.name = 'karate'
    elif gname == 'BA':
        g = nx.barabasi_albert_graph(10, 2, seed=42)
        # g = nx.MultiGraph(g)
        # Wrap the generated graph; calling nx.Graph() with no argument here
        # would throw it away and return an empty graph.
        g = nx.Graph(g)
        g.name = 'BA'
    elif gname.endswith('.gpickle'):
        # NOTE(review): nx.read_gpickle was removed in NetworkX 3.0 -- confirm
        # the pinned NetworkX version before upgrading.
        g = nx.read_gpickle(gname)
        g.name = Path(gname).stem
    else:
        if gname in ('waterloo', 'grenoble', 'uppsala'):
            g = nx.read_gpickle(f'../snap_data/cleaned/{gname}_lcc_attr.gpickle')
        elif gname in ('polblogs', 'polbooks', 'football', 'bipartite-10-10', 'cora', 'citeseer', 'pubmed'):
            g = nx.read_gml(f'../VRG/input/{gname}.gml')
            attr_name = 'value'
        else:
            path = f'../VRG/input/{gname}.g'
            g = nx.read_edgelist(path, nodetype=int, create_using=nx.Graph())

        g.remove_edges_from(nx.selfloop_edges(g))
        if not nx.is_connected(g):
            # keep only the largest connected component
            nodes_lcc = max(nx.connected_components(g), key=len)
            g = g.subgraph(nodes_lcc).copy()
        # Remember the name across the relabelling below and restore it
        # afterwards; if the file did not provide one, fall back to the
        # requested dataset name so that g.name is always usable downstream.
        name = g.name or gname
        g = nx.convert_node_labels_to_integers(g, label_attribute='orig_label')
        g.name = name

    dl = -1 # graph_dl(g)
    end_time = 0
    logging.error(f'Graph: {gname}, n = {g.order():_d}, m = {g.size():_d}, dl = {dl:_g} bits read in {round(end_time, 3):_g}s.')

    return g, attr_name

In [27]:
# Double check all pickles: every stored tree must have exactly one leaf per
# node of the graph it was built from. Mismatches are printed, not raised.
# NOTE: the tree dumps are read from a hardcoded absolute path.
names = ['karate', 'football', 'polbooks', 'polblogs', 'citeseer', 'cora', 'pubmed']
clusterings = ['cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop', 'consensus']

# File stems are expected to look like '<clustering>_<suffix>'.
pattern = r'(\w+)\_(\w+)'

for name in names:
    g, attr_name = get_graph(name)
    for fname in glob.glob(f'/data/ssikdar/attributed-vrg/dumps/trees/{name}/*'):
        path = Path(fname)
        m = re.match(pattern, path.stem)
        if m is None:
            # An unexpected file name should not abort the whole check.
            print(f'{name} skipping {path.name}: file name does not match {pattern!r}')
            continue
        clustering, _ = m.groups()
        obj = load_pickle(fname)
        # Trees are stored either as a nested list or as an already-built root node.
        root = create_tree(obj) if isinstance(obj, list) else obj
        if len(root.leaves) != g.order():
            print(f'{name} {g.order()} error {clustering} {len(root.leaves)}')

Graph: karate, n = 34, m = 78, dl = -1 bits read in 0s.
Graph: football, n = 115, m = 613, dl = -1 bits read in 0s.
Graph: polbooks, n = 105, m = 441, dl = -1 bits read in 0s.
Graph: polblogs, n = 1_222, m = 16_714, dl = -1 bits read in 0s.
Graph: citeseer, n = 2_110, m = 3_668, dl = -1 bits read in 0s.
Graph: cora, n = 2_485, m = 5_069, dl = -1 bits read in 0s.
Graph: pubmed, n = 19_717, m = 44_324, dl = -1 bits read in 0s.


In [None]:
# Run every clustering algorithm on the graph currently bound to `g` (defined
# in an earlier cell); results go under a per-graph directory named after g.name.
clustering_algs = ('leiden', 'louvain', 'cond', 'spectral', 'infomap', 'labelprop', 'random')
for alg in clustering_algs:
    clustering = get_clustering(
        g,
        outdir=f'../VRG/dumps/trees/{g.name}/',
        clustering=alg,
        use_pickle=True,
    )

## Compute the Dasgupta cost of the trees

In [None]:
# List the pickled trees available for the karate graph.
glob.glob('../VRG/dumps/trees/karate/*.pkl')

In [None]:
def get_dasgupta_df():
    """
    Return a DataFrame with the Dasgupta cost of every stored clustering tree.

    The table has one row per (graph, tree pickle) with the columns
    'name', 'clustering' and 'cost'. It is cached in './dasgupta.csv': when
    that file exists it is read back and returned directly, otherwise the
    costs are computed from the pickles under '../VRG/dumps/trees/<name>/'
    and the resulting table is written to the cache before being returned.

    :return: pandas DataFrame of Dasgupta costs
    """
    df_path = Path('./dasgupta.csv')
    if df_path.exists():
        return pd.read_csv(df_path)

    names = 'karate', 'football', 'polbooks', 'eucore', 'flights', 'polblogs'
    rows = []
    for name in names:
        # get_graph returns (graph, attribute name); only the graph is needed.
        g, _attr_name = get_graph(name)
        # Key the directory off the dataset name rather than g.name, which is
        # not guaranteed to be set for graphs read from disk.
        for pickle_file in glob.glob(f'../VRG/dumps/trees/{name}/*.pkl'):
            # file names start with the clustering algorithm, e.g. 'leiden_...'
            clustering = Path(pickle_file).name.split('_')[0]
            print(clustering, end=' ', flush=True)
            pkl = load_pickle(pickle_file)
            # Trees are stored either as a nested list or as a built root node.
            root = create_tree(pkl) if isinstance(pkl, list) else pkl
            cost = dasgupta_cost(g=g, root=root)
            rows.append({'name': name, 'clustering': clustering, 'cost': cost})

    df = pd.DataFrame(rows)
    df.to_csv(df_path, index=False)
    return df

In [None]:
# Load the cached cost table (the file get_dasgupta_df() writes); this
# requires ./dasgupta.csv to exist already.
df = pd.read_csv('./dasgupta.csv')

In [None]:
# Default figure size (width, height) in inches for the plots below.
plt.rcParams['figure.figsize'] = (15, 10)

In [None]:
# Grouped bar chart of the cost per graph, one bar per clustering algorithm.
# The log scale is set on the current axes, which seaborn then draws into
# because no explicit `ax` is passed.
ax = plt.gca();
ax.set_yscale('log');
sns.barplot(x='name', y='cost', hue='clustering', data=df);
ax.set_ylabel('Dasgupta Cost');

## Plot dendrograms for the 3-comm graphs

In [None]:
# 21 evenly spaced fractions in [0, 1], expressed as integer percentages
# (0, 5, 10, ..., 100).
frac = (np.linspace(0, 1, 21, endpoint=True) * 100).astype(int)

In [None]:
# Show the percentage grid.
frac

In [None]:
# NOTE(review): read_gexf is called without a path argument, so this cell
# looks unfinished -- supply the GEXF file to read or remove the cell.
nx.read_gexf()

In [None]:
# TODO: the path below stops at the trees directory -- the string literal was
# left unterminated and the pickle file name is missing. Append the tree file
# to load before running this cell.
pkl = load_pickle('/data/ssikdar/attributed-vrg/dumps/trees/')
# Trees are stored either as a nested list or as an already-built root node.
root = create_tree(pkl) if isinstance(pkl, list) else pkl
root

In [None]:
# Print the tree as indented text: one line per node, prefixed with the
# branch-drawing characters supplied by RenderTree.
for prefix, _fill, tree_node in RenderTree(root):
    print(prefix, tree_node.name, sep='')

In [None]:
# NOTE(review): the path ends in '/', so it presumably names a directory
# rather than a single pickle file -- confirm which tree file(s) should be
# loaded here.
trees = [load_pickle('/data/ssikdar/attributed-vrg/dumps/trees/3-comm-0-attrs/')]