In [1]:
import graph_tool.all as gt
import pandas as pd
import glob
import itertools
import collections
import matplotlib
import math

In [2]:
# Directory that holds the input *.tsv files.
# NOTE(review): this is an absolute, machine-specific path -- adjust it (or make
# it configurable) before running the notebook anywhere else.
DATA_DIR = '/home/aahu/Dropbox/black-market-recommender-systems/data/'

# glob.glob returns matches in arbitrary order; sorting makes the row order of
# df_raw reproducible from one run to the next.
tsv_paths = sorted(glob.glob(DATA_DIR+'*.tsv'))
if not tsv_paths:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError('no .tsv files found in {0}'.format(DATA_DIR))

# Read every tab-separated file and stack all rows into one frame.
df_raw = pd.concat([pd.read_csv(path, sep='\t') for path in tsv_paths])

In [3]:
# Columns that identify one (category, vendor) pairing.
key_cols = ['category', 'vendor']

# Work on an independent copy so df_raw itself is never modified.
vendor_pairs = df_raw[key_cols].copy(deep=True)

# Keep only the text after the last '/' of the vendor value and strip any '#'.
# NOTE(review): str(x) turns a missing vendor (NaN) into the literal string
# 'nan', so such rows would all be attributed to one vendor named 'nan' --
# confirm the data has no missing vendors, or drop them before this step.
vendor_pairs['vendor'] = vendor_pairs['vendor'].map(
    lambda x: str(x).split('/')[-1].replace('#', ''))

# Collapse duplicates: one row per (category, vendor) pair, with the number of
# occurrences in 'count'. Naming the column directly avoids depending on the
# auto-generated column label 0 that a bare reset_index() produces.
dfc = vendor_pairs.groupby(key_cols).size().reset_index(name='count')

# Build the final column list explicitly rather than appending to the list
# that was just used as the group key.
cols = key_cols + ['count']
df = dfc[cols].copy(deep=True)

In [33]:
def _shared_vendor_counts(df):
    """Count how many vendors each unordered pair of category ids has in common.

    `df` must carry a 'vendor' column and an integer 'id' column. Returns a
    collections.Counter keyed by (low_id, high_id) tuples of plain ints.
    """
    pair_counts = collections.Counter()
    for _, group in df.groupby('vendor'):
        # Sorting the distinct ids first makes every combination come out
        # already ordered, so (a, b) and (b, a) always share one key.
        ids = sorted(int(i) for i in group['id'].drop_duplicates())
        pair_counts.update(itertools.combinations(ids, 2))
    return pair_counts


def _label_rotation(x, y):
    """Return atan(y / x), extended to x == 0 where the plain quotient fails.

    The result stays within [-pi/2, pi/2]. For x == 0 the angle is +/- pi/2
    following the sign of y, or 0.0 when y is 0 as well.
    """
    if x == 0:
        if y == 0:
            return 0.0
        return math.copysign(math.pi / 2, y)
    return math.atan(y / x)


def build_cat_cat_net(df_in, n_nodes=100, min_shared_vendors=3, output=None):
    """Draw the category/category co-vendor network and write it to an image.

    Two categories are joined by an edge when at least `min_shared_vendors`
    vendors appear with both of them. The edge weight is that vendor count.
    The graph is passed to gt.minimize_nested_blockmodel_dl and drawn with a
    radial layout of the resulting hierarchy tree, vertices coloured by their
    level-0 block.

    Parameters
    ----------
    df_in : DataFrame with at least 'category' and 'vendor' columns. It is
        not modified. Rows with a missing category are not expected.
    n_nodes : int
        Keep only the `n_nodes` categories with the most rows in `df_in`.
    min_shared_vendors : int
        Minimum number of common vendors for an edge to be kept.
    output : str or None
        Path handed to gt.graph_draw. None keeps the historical default,
        '/home/aahu/Desktop/all_min_edgew=<min_shared_vendors>.png'.

    Returns
    -------
    None. The function prints the graph size and 'done!' and writes the image.

    Raises
    ------
    ValueError
        If no pair of categories reaches `min_shared_vendors`.
    """
    # Most frequent categories; their rank doubles as the graph-tool vertex id.
    cats = [cat for cat, _ in
            collections.Counter(df_in['category']).most_common(n_nodes)]
    node_lbs = {cat: idx for idx, cat in enumerate(cats)}

    # Boolean indexing already yields a new frame; the explicit copy makes the
    # column assignment below unambiguous and leaves df_in untouched.
    df = df_in[df_in['category'].isin(cats)].copy()
    df['id'] = df['category'].map(node_lbs)

    # Filter edges by number of shared vendors.
    c = _shared_vendor_counts(df)
    edge_list = [pair for pair, n_shared in c.items()
                 if n_shared >= min_shared_vendors]
    if not edge_list:
        raise ValueError(
            'no category pair shares at least {0} vendors'.format(min_shared_vendors))

    # Vertices are created implicitly by add_edge_list, as before.
    g = gt.Graph(directed=False)
    g.add_edge_list(edge_list)

    g.vertex_properties['label'] = g.new_vertex_property('string')
    for v in g.vertices():
        g.vertex_properties['label'][v] = cats[int(g.vertex_index[v])]
    print('g vert/edges: ',g.num_vertices(), g.num_edges())

    # Edge weight (shared-vendor count) and an RGBA colour whose alpha grows
    # with the weight.
    g.edge_properties['weight'] = g.new_edge_property('double')
    g.edge_properties['color'] = g.new_edge_property('vector<double>')
    max_count = max(c.values())  # loop-invariant, so computed once
    for e in g.edges():
        # Convert the vertex descriptors to plain ints so the lookup key has
        # exactly the same form as the Counter keys.
        lo, hi = sorted((int(e.source()), int(e.target())))
        w = c[(lo, hi)]
        g.edge_properties['weight'][e] = w
        # The +.025 floor keeps the lightest edges faintly visible; the clamp
        # keeps alpha inside [0, 1] for the heaviest edge.
        alpha = min(1.0, float(w) / max_count + .025)
        g.edge_properties['color'][e] = [103/255.0, 134/255.0, 239/255.0, alpha]

    state = gt.minimize_nested_blockmodel_dl(g, deg_corr=False,
                                             eweight=g.ep['weight'])
    t = gt.get_hierarchy_tree(state)[0]
    tpos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)
    cts = gt.get_hierarchy_control_points(g, t, tpos, beta=.86)
    pos = g.own_property(tpos)
    b = state.levels[0].b

    # Text rotation and text position for every vertex label.
    text_rot = g.new_vertex_property('double')
    g.vertex_properties['text_rot'] = text_rot
    text_pos = g.new_vertex_property('double')
    g.vertex_properties['text_pos'] = text_pos
    for v in g.vertices():
        x, y = pos[v][0], pos[v][1]
        # _label_rotation also covers x == 0, where y / x would raise
        # ZeroDivisionError.
        text_rot[v] = _label_rotation(x, y)
        if x <= 0:
            # Vertices with x <= 0 get text position 10; the others keep the
            # property's default value.
            text_pos[v] = 10

    if output is None:
        output = '/home/aahu/Desktop/all_min_edgew={0}.png'.format(min_shared_vendors)

    # gt.draw_hierarchy(state, ...) was tried earlier as an alternative
    # rendering of the same state; graph_draw is the one in use.
    gt.graph_draw(g, pos=pos, vertex_fill_color=b,
                  edge_control_points=cts,
                  vertex_size=20,
                  vertex_text=g.vertex_properties['label'],
                  vertex_text_rotation=g.vertex_properties['text_rot'],
                  vertex_text_position=g.vp['text_pos'],
                  vertex_font_size=20,
                  vertex_font_family='mono',
                  vertex_anchor=0,
                  vertex_color=b,
                  vcmap=matplotlib.cm.Set1,
                  edge_color=g.edge_properties['color'],
                  bg_color=[0,0,0,1],
                  output_size=[2*1024,2*1024],
                  output=output)
    print('done!')

    return

# Render the network for the 100 most common categories (the function's default).
build_cat_cat_net(df_in=df, n_nodes=100)

g vert/edges:  100 3224
done!
