In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import os

from vigor import predicates
from vigor.evaluation import *

  from pandas.core import (


device cpu


# Generate Graphs and Statistics

1. Generate graphs using the fast_gnp_random_graph function from networkx

2. Calculate statistics for each graph

    * `s_metric`, `sigma`, and `is_planar` are prohibitively expensive to compute for a large number of graphs, so they are left empty (NaN) in this dataset

In [8]:
# Base name (without extension) of the cached graph-statistics table.
file_path = 'graph_data'

# Prefer the cached CSV; only fall back to generating graphs when it is absent.
# NOTE(review): this cell never writes the CSV itself -- presumably
# generate_graphs persists its result; confirm, otherwise the cache is never hit.
if os.path.exists(f'{file_path}.csv'):
    print('loading graph data')
    df = pd.read_csv(f'{file_path}.csv')
else:
    print('generating graph data')
    # NOTE(review): meaning of (300, 2, 200) is not visible here -- looks like
    # (number of graphs, min nodes, max nodes); confirm against generate_graphs.
    df = generate_graphs(300, 2, 200)

loading graph data


In [9]:
# Preview the per-graph statistics table (one row per graph).
df

Unnamed: 0,graph_type,is_directed_int,has_spatial_attributes,has_temporal_attributes,is_planar,is_bipartite,n_components,avg_betweenness_centrality,avg_closeness_centrality,avg_eigenvector_centrality,...,sigma,n_nodes,node_types,node_attributes,number_of_isolates,density,edge_types,edge_attributes,n_parallel_edges,n_self_loops
0,4,0,0,0,,0,1,0.000534,0.947804,0.097103,...,,106,0,0.0,0,0.944474,0,0.0,0,0
1,4,0,0,0,,0,1,0.004174,0.583721,0.075489,...,,173,0,0.0,0,0.286194,0,0.0,0,0
2,4,0,0,0,,0,1,0.004833,0.641326,0.091558,...,,118,0,0.0,0,0.439374,0,0.0,0,0
3,4,0,0,0,,0,2,0.102564,0.376141,0.240887,...,,13,0,0.0,1,0.179487,0,0.0,0,0
4,4,0,0,0,,0,1,0.000759,0.947525,0.114659,...,,76,0,0.0,0,0.943860,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,4,0,0,0,,0,1,0.001242,0.856071,0.085069,...,,138,0,0.0,0,0.831059,0,0.0,0,0
293,3,0,0,0,,0,8,0.050950,0.140664,0.065252,...,,72,0,0.0,6,0.029343,0,0.0,0,0
294,4,0,0,0,,0,1,0.004470,0.575750,0.076719,...,,167,0,0.0,0,0.262463,0,0.0,0,0
295,4,0,0,0,,0,1,0.001171,0.815435,0.071373,...,,196,0,0.0,0,0.772894,0,0.0,0,0


# Prepare Data for Predicate Learning

In [10]:
# Column-wise sample standard deviation (pandas defaults spelled out);
# used below to drop attributes that never vary.
attr_std = df.std(axis=0, ddof=1)

In [11]:
# Inspect the spread of each attribute; 0 or NaN means it carries no signal here.
attr_std

graph_type                     0.396574
is_directed_int                0.000000
has_spatial_attributes         0.000000
has_temporal_attributes        0.000000
is_planar                           NaN
is_bipartite                   0.189171
n_components                   1.400327
avg_betweenness_centrality     0.029138
avg_closeness_centrality       0.190156
avg_eigenvector_centrality     0.091201
avg_degree                    40.999756
std_degree                     1.722291
clustering_coefficient         0.295832
transitivity                   0.295636
modularity                     0.104869
communities                    1.401977
avg_shortest_path_length       0.699630
radius                         0.856768
diameter                       1.055290
assortativity                  0.112348
vertex_connectivity           38.332860
eccentricity_avg               0.922103
s_metric                            NaN
sigma                               NaN
n_nodes                       56.939827


In [12]:
# True for every column that has at least one missing value.
attr_null = df.isna().any()

In [13]:
# Inspect which attributes contain missing values.
attr_null

graph_type                    False
is_directed_int               False
has_spatial_attributes        False
has_temporal_attributes       False
is_planar                      True
is_bipartite                  False
n_components                  False
avg_betweenness_centrality    False
avg_closeness_centrality      False
avg_eigenvector_centrality    False
avg_degree                    False
std_degree                    False
clustering_coefficient        False
transitivity                  False
modularity                    False
communities                    True
avg_shortest_path_length      False
radius                        False
diameter                      False
assortativity                  True
vertex_connectivity           False
eccentricity_avg              False
s_metric                       True
sigma                          True
n_nodes                       False
node_types                    False
node_attributes               False
number_of_isolates          

**Only use attributes that do not contain null values and have a standard deviation > 0:**

In [14]:
# An attribute is usable when it varies across graphs (std > 0, with a NaN std
# counted as 0) and none of its values are missing.
attr_use = attr_std.fillna(0).gt(0) & ~attr_null

In [15]:
# Inspect the keep/drop decision for each attribute.
attr_use

graph_type                     True
is_directed_int               False
has_spatial_attributes        False
has_temporal_attributes       False
is_planar                     False
is_bipartite                   True
n_components                   True
avg_betweenness_centrality     True
avg_closeness_centrality       True
avg_eigenvector_centrality     True
avg_degree                     True
std_degree                     True
clustering_coefficient         True
transitivity                   True
modularity                     True
communities                   False
avg_shortest_path_length       True
radius                         True
diameter                       True
assortativity                 False
vertex_connectivity            True
eccentricity_avg               True
s_metric                      False
sigma                         False
n_nodes                        True
node_types                    False
node_attributes               False
number_of_isolates          

**Excluded Attributes:**

In [16]:
# Names of the attributes that failed the usability filter.
attr_use.loc[~attr_use].index

Index(['is_directed_int', 'has_spatial_attributes', 'has_temporal_attributes',
       'is_planar', 'communities', 'assortativity', 's_metric', 'sigma',
       'node_types', 'node_attributes', 'edge_types', 'edge_attributes',
       'n_parallel_edges', 'n_self_loops'],
      dtype='object')

In [17]:
# Names of the attributes that passed the filter; these feed predicate learning.
attr = attr_use.index[attr_use.to_numpy()]

In [18]:
# Inspect the selected attribute names.
attr

Index(['graph_type', 'is_bipartite', 'n_components',
       'avg_betweenness_centrality', 'avg_closeness_centrality',
       'avg_eigenvector_centrality', 'avg_degree', 'std_degree',
       'clustering_coefficient', 'transitivity', 'modularity',
       'avg_shortest_path_length', 'radius', 'diameter', 'vertex_connectivity',
       'eccentricity_avg', 'n_nodes', 'number_of_isolates', 'density'],
      dtype='object')

In [19]:
# Number of attributes kept for predicate learning.
len(attr)

19

# Load Expert Predicates

In [20]:
# Apply the expert predicates from `vigor.predicates` to the graph table.
# The result is indexed below with the keys 'predicates' and 'labels'.
# NOTE(review): not to be confused with get_predicates_labels (plural
# "predicates"), which is the learning routine called later.
predicates_labels = get_predicate_labels(df, predicates)

**List of predicates for each vis type:**

In [21]:
# Expert predicates keyed by vis type; each entry maps an attribute to a
# two-element list (presumably [lower, upper] bounds -- confirm in vigor).
predicates_labels['predicates']

{'NODELINK': [{'density': [0, 0.1]},
  {'avg_degree': [1, 3]},
  {'clustering_coefficient': [0.1, 0.4]},
  {'node_types': [1, 3]},
  {'edge_types': [1, 2]}],
 'MATRIX': [{'density': [0.1, 1]},
  {'avg_degree': [10, 50]},
  {'modularity': [0.3, 0.7]},
  {'node_attributes': [2, 10]},
  {'edge_attributes': [1, 5]}],
 'NODETRIX': [{'communities': [4, 10]},
  {'clustering_coefficient': [0.5, 1]},
  {'density': [0.1, 0.5]},
  {'node_types': [2, 5]},
  {'modularity': [0.3, 0.8]},
  {'avg_degree': [5, 15]},
  {'node_attributes': [3, 10]},
  {'edge_types': [1, 3]}],
 'NODELINK_MAP': [{'avg_degree': [1, 5]}],
 'PAOHVIS': [{'n_nodes': [50, 500]},
  {'node_types': [3, 6]},
  {'edge_types': [2, 5]},
  {'density': [0.05, 0.2]},
  {'avg_degree': [5, 10]},
  {'transitivity': [0.2, 0.6]}],
 'CHORD_DIAGRAM': [{'n_nodes': [0, 6]},
  {'edge_types': [1, 3]},
  {'clustering_coefficient': [0.3, 0.7]},
  {'avg_degree': [2, 4]}],
 'TREEMAP': [{'graph_type': [0.5, 1.5]},
  {'modularity': [0.5, 1]},
  {'n_nodes'

**Vis type label for each graph:**

In [22]:
# Vis-type label assigned to each graph by the expert predicates.
predicates_labels['labels']

0        MATRIX
1        MATRIX
2       PAOHVIS
3      NODELINK
4        MATRIX
         ...   
292      MATRIX
293    NODELINK
294      MATRIX
295      MATRIX
296      MATRIX
Length: 297, dtype: object

For each graph, a vis type earns one point for each of its predicates that the graph satisfies. The vis type with the most points (i.e., the most matching predicates) is chosen as the graph's label.

# Learn Predicates

In [24]:
# Learn predicates over the selected attributes, using the expert-derived
# labels as the target.
# NOTE(review): label_names appears to restrict learning to the listed vis
# types and n_iter to set the number of optimisation steps -- confirm in vigor.
# NOTE(review): no random seed is set in this notebook, so the logged losses
# may differ between runs.
learned_predicates = get_predicates_labels(
    df,
    attr,
    predicates_labels['labels'],
    label_names=['NODELINK'],
    n_iter=1000,
)

[   0] loss 3.3214240074157715
[ 100] loss 9.225895881652832
[ 200] loss 9.427505493164062
[ 300] loss 0.22784195840358734
[ 400] loss 0.1697160005569458
[ 500] loss 0.19681721925735474
[ 600] loss 0.24632522463798523
[ 700] loss 0.3140135109424591
[ 800] loss 0.3935486078262329
[ 900] loss 0.47320684790611267

brush = 0
accuracy = 0.7811447811447811
precision = 0.2261904761904762
recall = 1.0
f1 = 0.3689320388349515

[   0] loss 2.5393917560577393
[ 100] loss 0.6756349205970764
[ 200] loss 0.2764913737773895
[ 300] loss 0.21806229650974274
[ 400] loss 0.19411863386631012
[ 500] loss 0.18646377325057983
[ 600] loss 0.19039420783519745
[ 700] loss 0.19553092122077942
[ 800] loss 0.199416846036911
[ 900] loss 0.20200425386428833

brush = 0
accuracy = 0.9663299663299664
precision = 1.0
recall = 0.9640287769784173
f1 = 0.9816849816849818



In [25]:
# Print each vis type followed by its learned predicate(s), one per line.
for vistype, predicate in learned_predicates.items():
    print(vistype, predicate, sep='\n')

NODELINK
({'avg_degree': [0.003195298575763901, 0.02762993474337021], 'vertex_connectivity': [0.0, 0.011834319526627219]}, {'n_components': [0.0, 0.10526315789473684], 'avg_betweenness_centrality': [0.0, 0.15302101895213127], 'eccentricity_avg': [0.23305794596672058, 0.7996454536914825], 'number_of_isolates': [0.0, 0.18181818181818182]})
