In [1]:
import collections
import itertools
import logging
import math
import os

import dateutil
import graph_tool as gt
import graph_tool.community
import graph_tool.draw
import matplotlib.cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Notebook-wide logger: timestamped, level-tagged messages on the stream handler.
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
# Re-running this cell must not stack handlers (each extra handler repeats
# every message), so drop whatever a previous run attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
# Without this, records also bubble up to the root logger's handler and every
# message is printed a second time in the "INFO:__main__:..." format.
logger.propagate = False

In [2]:
# Directory holding the tab-separated listing files; every *.tsv in it is loaded.
DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'

# os.listdir returns entries in arbitrary order; sort so that the row order of
# the concatenated frame is the same on every run.
tsv_names = sorted(fname for fname in os.listdir(DATA_DIR) if fname.endswith('.tsv'))
if not tsv_names:
    # pd.concat([]) would fail with a far less helpful message.
    raise FileNotFoundError('no .tsv files found in {0}'.format(DATA_DIR))

frames = [
    pd.read_csv(os.path.join(DATA_DIR, fname), sep='\t', parse_dates=['scrape_date'])
    for fname in tsv_names
]
df = pd.concat(frames)
logger.info(df.columns)
logger.info(df.shape)

May 16 21:08:14 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 16 21:08:14 INFO   (1773538, 7)
INFO:__main__:(1773538, 7)


In [8]:
# Drop rows whose category is one of the meta-categories listed below.
meta_cats = [
    'Other',
    'Drugs',
    'Guides & Tutorials',
    'Fraud Related',
    'Services',
    'Digital Goods',
    'Electronics',
    'Custom Listings',
    'Pills',
]
df = df[~df['category'].isin(meta_cats)]
logger.info(df.shape)

# Listing counts for the 50 most common remaining categories.
df['category'].value_counts().head(50)


May 16 21:11:50 INFO   (1621645, 7)
INFO:__main__:(1621645, 7)


Weed                  209911
Prescription          102483
Benzos                 97789
Cocaine                94635
MDMA                   91953
Pills                  90374
Steroids               77417
RCs                    56010
Watches                53163
LSD                    47886
Hash                   46623
Concentrates           42281
Speed                  41430
Meth                   38145
eBooks                 31954
Synthetics             28986
Heroin                 28845
Guides                 23747
Edibles                23189
Smoked                 23185
NB                     23146
Pirated                23105
Money                  22361
2C                     19233
Accounts               16643
Seeds                  14835
Physical documents     13723
Mushrooms              13638
Oxycodone              12981
DMT                    11871
Scans/Photos           11834
Fentanyl                9706
Opioids                 9040
Software                8853
Jewelry       

In [3]:
#takes too long
def build_category_category_graph(df, min_shared_vendors):
    """
    Build an undirected graph-tool graph linking categories that share vendors.

    Every distinct value of df['category'] is numbered in order of first
    appearance. Two categories are joined by an edge when at least
    `min_shared_vendors` vendors (distinct values of df['vendor']) appear
    with both of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'vendor' and 'category' columns. It is not modified.
    min_shared_vendors : int
        Minimum number of vendors two categories must share to get an edge.

    Returns
    -------
    graph_tool.Graph
        Undirected graph with a 'label' vertex property (category name) and
        'weight' (shared-vendor count) and 'color' (RGBA, alpha growing with
        weight) edge properties.
    """
    # category <-> integer id, ids assigned in order of first appearance
    node_lbs = {}
    rev_node_lbs = {}
    for idx, category in enumerate(df['category'].drop_duplicates()):
        node_lbs[category] = idx
        rev_node_lbs[idx] = category

    # Work on a small derived frame instead of writing an 'id' column into the
    # caller's DataFrame.
    vendor_ids = df[['vendor']].assign(
        id=df['category'].map(lambda category: node_lbs[category]))

    # Count, for every unordered category pair, how many vendors have both.
    # Keys are (low_id, high_id) tuples of plain ints.
    c = collections.Counter(
        tuple(sorted((int(a), int(b))))
        for _, ids in vendor_ids.groupby('vendor')['id']
        for a, b in itertools.combinations(ids.drop_duplicates(), 2)
    )

    #filter edges by num shared vendor
    edge_list = [pair for pair, n_shared in c.items() if n_shared >= min_shared_vendors]

    #build graph
    g = gt.Graph(directed=False)
    if edge_list:
        g.add_edge_list(edge_list)
    label = g.new_vertex_property('string')
    g.vertex_properties['label'] = label
    for v in g.vertices():
        label[v] = rev_node_lbs[g.vertex_index[v]]
    print('g vert/edges: ',g.num_vertices(), g.num_edges())

    #add edge weight property
    weight = g.new_edge_property('double')
    color = g.new_edge_property('vector<double>')
    g.edge_properties['weight'] = weight
    g.edge_properties['color'] = color
    # Loop-invariant: the largest pair count over all pairs (not only kept ones).
    max_count = max(c.values()) if c else 1
    for e in g.edges():
        # Look the count up by integer vertex index: vertex descriptors are not
        # the ints the Counter was keyed with, and a miss would silently be 0.
        w = c[tuple(sorted((int(e.source()), int(e.target()))))]
        weight[e] = w
        # Heavier edges are more opaque; the .07 floor keeps weak edges visible.
        alpha = (float(w)/max_count) + .07
        color[e] = [103/255.0,134/255.0,239/255.0,alpha]
    return g



def block_model_plot(df, min_shared_vendors=3, output=None):
    """
    Fit a nested stochastic block model to the category graph and save a plot.

    The graph comes from build_category_category_graph(df, min_shared_vendors).
    Vertices are placed with a radial layout of the block hierarchy tree,
    coloured by their lowest-level block, and labelled with the category name.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed through to build_category_category_graph.
    min_shared_vendors : int
        Edge threshold passed through to build_category_category_graph; also
        used in the default output file name.
    output : str, optional
        Path of the image to write. Defaults to
        '/home/aahu/Desktop/ago_nvends=<min_shared_vendors>.png'.

    Returns
    -------
    None
    """
    g = build_category_category_graph(df,min_shared_vendors)

    logger.info(g)
    logger.info('begin stochastic block model')
    state = gt.community.minimize_nested_blockmodel_dl(g,deg_corr=True,
                                                    eweight=g.ep['weight'])
    bstack = state.get_bstack()
    t = gt.community.get_hierarchy_tree(bstack)[0]
    tpos = gt.draw.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)
    cts = gt.draw.get_hierarchy_control_points(g, t, tpos,beta=.86)
    pos = g.own_property(tpos)
    b = bstack[0].vp["b"]

    #text rotation
    text_rot = g.new_vertex_property('double')
    g.vertex_properties['text_rot'] = text_rot
    text_pos = g.new_vertex_property('double')
    g.vertex_properties['text_pos'] = text_pos
    for v in g.vertices():
        x, y = pos[v][0], pos[v][1]
        if x == 0:
            # atan(y/x) is undefined here; use the value it approaches from
            # the x < 0 side, which is the branch x == 0 belongs to below.
            text_rot[v] = -math.copysign(math.pi/2, y)
        else:
            text_rot[v] = math.atan(y/x)
        if not x > 0:
            # Vertices not in the right half-plane get a fixed label position.
            text_pos[v] = 10#len(g.vp['label'][v].strip())

    if output is None:
        # Was formatted with the undefined name MIN_SHARED_VENDORS (NameError).
        output = '/home/aahu/Desktop/ago_nvends={0}.png'.format(min_shared_vendors)

    logger.info('saving to disk...')
    gt.draw.graph_draw(g, pos=pos, vertex_fill_color=b,
                edge_control_points=cts,
                vertex_size=20,
                vertex_text=g.vertex_properties['label'],
                vertex_text_rotation=g.vertex_properties['text_rot'],
                vertex_text_position=g.vp['text_pos'],
                vertex_font_size=20,
                vertex_font_family='mono',
                vertex_anchor=0,
                vertex_color=b,
                vcmap=matplotlib.cm.Spectral,
                edge_color=g.edge_properties['color'],
                bg_color=[0,0,0,1],
                output_size=[1024*2,1024*2],
                output=output)

    return

In [None]:
# Only the distinct (vendor, category) pairs are passed to the plot.
vendor_category_pairs = df[['vendor', 'category']].drop_duplicates()
block_model_plot(vendor_category_pairs, min_shared_vendors=10)

May 16 15:25:15 INFO   <Graph object, undirected, with 93 vertices and 535 edges at 0x7f8938f95eb8>
INFO:__main__:<Graph object, undirected, with 93 vertices and 535 edges at 0x7f8938f95eb8>
May 16 15:25:15 INFO   begin stochastic block model
INFO:__main__:begin stochastic block model
