In [1]:
import pandas as pd
import dateutil
import os
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import graph_tool as gt
import graph_tool.draw
import graph_tool.community
import itertools
import collections

import logging
# Log line layout: timestamp, left-aligned level name, then the message.
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
# Re-running this cell would otherwise stack one more handler per run and
# print every message once per handler, so drop handlers left over from a
# previous run before attaching the fresh one.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(handler)
# Stop records from also travelling up to the root logger's handlers;
# without this each message is printed a second time in the root format
# ("INFO:__main__:...").
logger.propagate = False
logger.setLevel(logging.INFO)

In [2]:
# Directory that is scanned for the .tsv listing files to load.
DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'
l=[]
skipped = []
# Sorted so the row order of the combined frame does not depend on the
# arbitrary order in which os.listdir returns directory entries.
for fname in sorted(os.listdir(DATA_DIR)):
    if fname.endswith('.tsv'):
        try:
            df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scraped_date'])
            l.append(df0)
        except ValueError:
            # Best-effort load: a file pandas cannot parse (e.g. one with no
            # data) is left out, but remembered so the omission is reported
            # once below instead of vanishing silently.
            skipped.append(fname)
if skipped:
    logger.warning('skipped %d unparseable .tsv file(s), e.g. %s',
                   len(skipped), skipped[:5])
df = pd.concat(l)
# 'cat' is read as the text of a Python literal; evaluate it back into the
# object it describes (literal_eval only accepts literals, not arbitrary code).
df['cat'] = df['cat'].map(ast.literal_eval)
logger.info(df.columns)
logger.info(df.shape)

May 12 16:05:27 INFO   Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scraped_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scraped_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 12 16:05:27 INFO   (92407, 9)
INFO:__main__:(92407, 9)


In [3]:
# Frequency of each category path that has exactly two levels; paths of any
# other length are excluded from the count.
df.loc[df['cat'].map(len) == 2, 'cat'].value_counts()

(Drugs, Prescription)          7312
(Other, Books)                 7090
(Drugs, Opioids)               5045
(Services, Money)              4780
(Drugs, Benzos)                3046
(Services, Other)              2221
(Other, Software)              1335
(Services, Sex)                1310
(Tobacco, Cigarettes)          1157
(Services, Hacking)            1015
(Other, Electronics)            940
(Drugs, Paraphernalia)          814
(Drugs, Steroids)               813
(Other, Accounts)               768
(Drugs, Other)                  767
(Counterfeits, Other)           730
(Drugs, RCs)                    665
(Drugs, Dissociatives)          565
(Tobacco, Paraphernalia)        564
(Other, Shipping Materials)     383
(Counterfeits, Watches)         278
(Counterfeits, Accessories)     272
(Drugs, Wholesale)              235
(Drugs, Supplements)            207
(Other, Precious Metals)        205
(Services, Training)            166
(Services, Logistics)           110
(Drugs, Weight Loss)        

In [25]:
def build_cat_tree(df, output='/home/aahu/Desktop/drug_dag.pdf'):
    """Build, draw and DAG-check the directed graph of category labels.

    Every distinct label found in ``df['cat']`` becomes one vertex, and each
    pair of consecutive labels inside a category path becomes a directed
    edge from the earlier label to the later one.  Vertices are keyed by the
    label text alone, so a label that occurs in several paths is a single
    shared vertex.

    Parameters
    ----------
    df : object supporting ``df['cat']``
        ``df['cat']`` must iterate over category paths, each a sliceable
        sequence (tuple/list) of mutually orderable labels such as strings.
    output : str, optional
        File the drawing is written to.  Defaults to the path that used to
        be hard-coded in this function.

    Prints the vertex and edge counts and whether the graph is a DAG;
    returns nothing.
    """
    # Sorting pins the label -> vertex id assignment; iterating the raw set
    # would hand out different ids from one interpreter run to the next.
    labels = sorted(set(itertools.chain.from_iterable(df['cat'])))
    node_ids = {label: idx for idx, label in enumerate(labels)}

    # Collect each distinct consecutive-label link once.
    edges = set()
    for cat_branch in df['cat']:
        for parent, child in zip(cat_branch, cat_branch[1:]):
            if parent != child:  # a repeated label would be a self-loop
                edges.add((node_ids[parent], node_ids[child]))

    g = graph_tool.Graph(directed=True)
    # Create one vertex per label up front.  add_edge_list alone only creates
    # vertices up to the largest id that appears in an edge, so a label that
    # takes part in no edge would be kept or dropped depending on its id.
    g.add_vertex(len(labels))
    g.add_edge_list(sorted(edges))

    label_prop = g.new_vertex_property('string')
    for v in g.vertices():
        label_prop[v] = labels[g.vertex_index[v]]
    g.vertex_properties['label'] = label_prop
    print('g vert/edges: ',g.num_vertices(), g.num_edges())

    pos = graph_tool.draw.arf_layout(g)
    graph_tool.draw.graph_draw(g,pos=pos,
                               vertex_text=label_prop,
                               vertex_text_position=.1,
                               output_size=(1024,1024),
                               output=output)
    print(graph_tool.topology.is_DAG(g))

# Build the category graph from the loaded listings, write the drawing to
# disk, and print its vertex/edge counts and the DAG check.
build_cat_tree(df)

g vert/edges:  79 79
True
