# MDL computations

Double-check the MDL (description length) computations for the VRG and AVRG grammars.

In [1]:
import networkx as nx
import igraph as ig
import numpy as np
import glob
import math
import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns; sns.set_style('white')
import sys; sys.path.append('../')
from time import time
import logging
from anytree import LevelOrderIter, RenderTree
from statistics import mean
import os
from collections import Counter
import pprint

In [2]:
from VRG.src.utils import load_pickle, nx_to_igraph, check_file_exists
from VRG.src.graph_stats import GraphStats
from VRG.src.graph_comparison import GraphPairCompare
from VRG.runner import get_clustering
from VRG.src.Tree import create_tree, dasgupta_cost
from VRG.src.MDL import graph_dl as graph_mdl, find_lu
from VRG.src.partitions import approx_min_conductance_partitioning, spectral_kmeans

sys path:  ['/Users/satyaki/PycharmProjects/Attributed-VRG/notebooks', '/Users/satyaki/miniconda3/envs/VRG/lib/python37.zip', '/Users/satyaki/miniconda3/envs/VRG/lib/python3.7', '/Users/satyaki/miniconda3/envs/VRG/lib/python3.7/lib-dynload', '', '/Users/satyaki/miniconda3/envs/VRG/lib/python3.7/site-packages', '/Users/satyaki/miniconda3/envs/VRG/lib/python3.7/site-packages/IPython/extensions', '/Users/satyaki/.ipython', '../', './../', './../../']


In [52]:
def get_graph(gname: str = 'sample'):
    """Build or load the graph identified by ``gname``.

    Recognised values of ``gname``:

    * ``'sample'`` -- a hard-coded 9-node graph; nodes 0-4 have
      ``color='blue'`` and nodes 5-8 have ``color='red'``.
    * ``'karate'`` -- ``nx.karate_club_graph()``.
    * ``'BA'`` -- ``nx.barabasi_albert_graph(10, 2, seed=42)``.
    * a path ending in ``.gpickle`` -- read with ``nx.read_gpickle`` and
      named after the file stem.
    * anything else -- read from ``../snap_data/cleaned`` (gpickle),
      ``../VRG/input/<gname>.gml`` or ``../VRG/input/<gname>.g`` (edge list
      with integer node ids). Only graphs loaded through this branch are
      cleaned up: self-loops are removed, the graph is restricted to its
      largest connected component, and nodes are relabelled to consecutive
      integers with the previous label kept in the ``orig_label`` node
      attribute.

    A graph that ends up without a name is named ``gname`` so that every
    branch returns a named graph.

    :param gname: dataset name or path to a ``.gpickle`` file.
    :return: ``(g, attr_name)`` where ``attr_name`` is the node attribute
        associated with the dataset (``'color'``, ``'club'`` or ``'value'``),
        or ``''`` when none is set for that dataset.
    """
    start_time = time()
    attr_name = ''

    if gname == 'sample':
        g = nx.Graph()
        g.add_nodes_from(range(5), color='blue')
        g.add_nodes_from(range(5, 9), color='red')

        g.add_edges_from([(0, 1), (0, 3), (0, 4),
                          (1, 2), (1, 4), (1, 5),
                          (2, 3), (2, 4), (2, 8),
                          (3, 4),
                          (5, 6), (5, 7), (5, 8),
                          (6, 7), (6, 8),
                          (7, 8)])  # properly labeled
        g.name = 'sample'
        attr_name = 'color'
    elif gname == 'karate':
        g = nx.karate_club_graph()
        attr_name = 'club'
        g.name = 'karate'
    elif gname == 'BA':
        g = nx.barabasi_albert_graph(10, 2, seed=42)
    elif gname.endswith('.gpickle'):
        g = nx.read_gpickle(gname)
        g.name = Path(gname).stem
    else:
        if gname in ('waterloo', 'grenoble', 'uppsala'):
            g = nx.read_gpickle(f'../snap_data/cleaned/{gname}_lcc_attr.gpickle')
        elif gname in ('polblogs', 'polbooks', 'football', 'bipartite-10-10'):
            g = nx.read_gml(f'../VRG/input/{gname}.gml')
            attr_name = 'value'
        else:
            path = f'../VRG/input/{gname}.g'
            g = nx.read_edgelist(path, nodetype=int, create_using=nx.Graph())

        g.remove_edges_from(nx.selfloop_edges(g))
        if not nx.is_connected(g):
            nodes_lcc = max(nx.connected_components(g), key=len)
            g = g.subgraph(nodes_lcc).copy()
        # Save the name and restore it after relabelling, so the relabelled
        # graph keeps the name of the graph that was loaded.
        name = g.name
        g = nx.convert_node_labels_to_integers(g, label_attribute='orig_label')
        g.name = name

    # Edge-list, GML and generated graphs may carry no name at all; fall back
    # to the requested dataset name so callers can always rely on g.name.
    if not g.name:
        g.name = gname

    elapsed = round(time() - start_time, 2)
    # NOTE(review): logged at ERROR level, presumably so the line is shown
    # under the default logging configuration -- confirm before lowering it.
    logging.error(f'Graph: {gname}, n = {g.order():_d}, m = {g.size():_d}, read in {elapsed:_g}s.')

    return g, attr_name

In [53]:
# Load the football graph; get_graph reads it from ../VRG/input/football.gml
# and returns 'value' as the attribute name for the GML datasets.
g, attr_name = get_graph('football')

Graph: football, n = 115, m = 613, read in 0.06s.


In [55]:
# Load the pickled VRG grammar stored for the karate graph and print its summary line.
vrg = load_pickle('../VRG/dumps/grammars/karate/VRG-all-tnodes_cond_3_0.pkl')
print(vrg)

graph: 'karate', mu: 3, type: 'all_tnodes' clustering: 'cond' rules: 27(33) mdl: 1_167.49 bits


In [None]:
vrg.

In [60]:
# Look at the first rule of the VRG grammar and at the per-node data stored on its graph.
rule = vrg.rule_list[0]
print(rule, rule.graph.nodes(data=True))

(1) 5 → (n = 2, m = 1) [(10, {'attr_dict': {'club': 'Mr. Hi'}, 'b_deg': 2}), (5, {'attr_dict': {'club': 'Mr. Hi'}, 'b_deg': 3})]


In [61]:
rule.cost

30.169925001442312

In [56]:
# Load the pickled AVRG grammar stored for the karate graph (same cond_3_0 file suffix
# as the VRG dump) and print its summary line.
avrg = load_pickle('../VRG/dumps/grammars/karate/AVRG-all-tnodes_cond_3_0.pkl')
print(avrg)

graph: 'karate', mu: 3, type: 'A-VRG' clustering: 'cond' rules: 22(33) mdl: 1_019.22 bits


In [59]:
# Look at the first rule of the AVRG grammar and at the per-node data stored on its graph.
arule = avrg.rule_list[0]
print(arule, arule.graph.nodes(data=True))

(1) 8 → (n = 2, m = 1) [(3, {'attr_dict': {'club': 'Mr. Hi'}, 'b_deg': 5}), (7, {'attr_dict': {'club': 'Mr. Hi'}, 'b_deg': 3})]


In [67]:
rule, arule

(<5 → (2, 1)>, <8 → (2, 1)>)

In [68]:
rule.cost, arule.cost

(30.169925001442312, 39.529325012980806)

In [62]:
arule.cost

39.529325012980806

In [34]:
vrg.calculate_cost()

In [35]:
vrg.cost

5857.822604228463

In [37]:
vrg.rule_list[10]

<20 → (2, 1)>

In [38]:
# For every rule in the grammar record the size of its graph:
# node counts go to rhs_n, edge counts to rhs_m.
# NOTE: at notebook scope the loop variable `rule` stays bound to the last rule afterwards.
rhs_n, rhs_m = [], []
for rule in vrg.rule_list:
    rhs_n.append(rule.graph.order())
    rhs_m.append(rule.graph.size())

In [39]:
# Tally how often each node count and each edge count occurs across the rules.
counter_n = Counter(rhs_n)
counter_m = Counter(rhs_m)

In [41]:
counter_n, counter_m

(Counter({8: 2, 10: 1, 6: 3, 3: 4, 2: 8, 9: 3, 5: 1, 11: 1, 7: 2}),
 Counter({28: 2,
          40: 1,
          15: 3,
          18: 5,
          36: 2,
          30: 1,
          1: 1,
          6: 1,
          7: 1,
          8: 1,
          31: 1,
          44: 1,
          17: 1,
          21: 1,
          16: 1,
          10: 1,
          89: 1}))

Tree construction

In [9]:
name = 'eucore'
# get_graph returns a (graph, attr_name) pair; the attribute name is not needed here.
g, _ = get_graph(name)

# Alternative ways of computing the clustering directly instead of loading a stored one:
# lst_of_lst = approx_min_conductance_partitioning(g)
# lst_of_lst = spectral_kmeans(g, int(math.sqrt(g.order() // 2)))
lst_of_lst = load_pickle(f'../VRG/dumps/trees/{name}/consensus_list.pkl')

# A list is turned into a tree with create_tree; any other object is used as the root as-is
# (presumably an already-built tree -- confirm against how the pickle is written).
root = create_tree(lst_of_lst) if isinstance(lst_of_lst, list) else lst_of_lst

Graph: eucore, n = 986, m = 16_064, dl = 1.03746e+06 bits read in 0.28s.


In [45]:
# Load the pickled AVRG grammar stored for the football graph and print its summary line.
avrg = load_pickle('../VRG/dumps/grammars/football/AVRG-all-tnodes_cond_3_0.pkl')
print(avrg)

graph: 'football', mu: 3, type: 'A-VRG' clustering: 'cond' rules: 62(114) mdl: 3_845.01 bits


In [46]:
# Load the pickled VRG grammar stored for the football graph and print its summary line.
vrg = load_pickle('../VRG/dumps/grammars/football/VRG-all-tnodes_cond_3_0.pkl')
print(vrg)

graph: 'football', mu: 3, type: 'all_tnodes' clustering: 'cond' rules: 67(114) mdl: 4_086.3 bits


In [49]:
# First rule of the football AVRG grammar: display the per-node data stored on its graph.
arule = avrg.rule_list[0]
arule.graph.nodes(data=True)

NodeDataView({97: {'attr_dict': {'value': 10, 'orig_label': 'LouisianaLafayette'}, 'b_deg': 7}, 98: {'attr_dict': {'value': 3, 'orig_label': 'Texas'}, 'b_deg': 10}})

In [50]:
# First rule of the football VRG grammar: display the per-node data stored on its graph.
rule = vrg.rule_list[0]
rule.graph.nodes(data=True)

NodeDataView({99: {'attr_dict': {'value': 6, 'orig_label': 'Marshall'}, 'b_deg': 9}, 14: {'attr_dict': {'value': 6, 'orig_label': 'WesternMichigan'}, 'b_deg': 9}})