In [11]:
import pandas as pd
import dateutil
import os
import ast

import logging
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [12]:
META_CATS = ['Other','Drugs','Services', 'Custom Listings', 'DRUGS & MORE',
             'other service','other drugs','others', 'digital', 'drug']
META_CATS = [s.lower() for s in META_CATS]

In [13]:
def load_agora():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
            l.append(df0)
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    return df

In [14]:
def load_pandora():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/pandora/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                #logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    return df

In [15]:
def load_cloudnine():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)
    df['cat'] = df['cat'].map(lambda x: ast.literal_eval(x))
    df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

In [16]:
def load_hydra():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/hydra/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)
    df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))
    df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

In [17]:
def load_evolution():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/evolution/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    #df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)
    #df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))
    #df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

In [18]:
def postprocess(df):
    """
    standardized postprocessing
    """
    #normalize
    df['category'] = df['category'].map(lambda x:x.lower())
    
    #discard meta-categories
    df = df[df['category'].map(lambda x:x not in META_CATS)]
    logger.info(df.shape)
    
    #discard non-string categories
    def isfloat(value):
        try:
            float(value)
            return True
        except ValueError:
            return False
    df = df[df['category'].map(lambda x:not isfloat(x))]
    
    return df

In [19]:
def load_silkroad2():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    df['cat'] = df['category'].map(lambda x: x.split('-'))
    df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

sr = load_silkroad2()
sr['cat'].map(lambda x:x[0]).value_counts()

May 20 21:32:05 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 20 21:32:05 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 20 21:32:05 INFO   (663912, 7)
May 20 21:32:05 INFO   (663912, 7)
INFO:__main__:(663912, 7)


drugs           541071
digital          20048
books            17697
apparel          15615
drug             15433
money            12064
custom            9945
services          7971
forgeries         7086
erotica           4177
jewelry           2830
electronics       2096
packaging         1636
computer          1273
writing           1192
lotteries          956
hardware           763
lab                692
medical            538
art                531
herbs              133
biotic              83
collectibles        82
dtype: int64

In [20]:
# cn = load_cloudnine()
# cn = postprocess(cn)
# cn.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine.tsv',sep='\t',index=False)

# ag = load_agora()
# ag = postprocess(ag)
# ag.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/agora.tsv',sep='\t',index=False)

#pa = load_pandora()
#pa = postprocess(pa)
#pa.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/pandora.tsv',sep='\t',index=False)

#hy = load_hydra()
#hy = postprocess(hy)
#hy.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/hydra.tsv',sep='\t',index=False)

# ev = load_evolution()
# ev = postprocess(ev)
# ev.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/evolution.tsv',sep='\t',index=False)

sr2 = load_silkroad2()
sr2 = postprocess(sr2)
sr2.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2.tsv',sep='\t',index=False)


May 20 21:32:10 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 20 21:32:10 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 20 21:32:10 INFO   (663912, 7)
May 20 21:32:10 INFO   (663912, 7)
INFO:__main__:(663912, 7)
May 20 21:32:12 INFO   (616980, 8)
May 20 21:32:12 INFO   (616980, 8)
INFO:__main__:(616980, 8)


In [21]:
ev

NameError: name 'ev' is not defined

In [None]:
ev['vendor'].value_counts()