In [3]:
import pandas as pd
import dateutil
import os
import ast

import logging
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [4]:
META_CATS = ['Other','Drugs','Services', 'Custom Listings', 'DRUGS & MORE','other service','other drugs','others']
META_CATS = [s.lower() for s in META_CATS]

In [3]:
def load_agora():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
            l.append(df0)
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    return df

In [4]:
def load_pandora():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/pandora/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                #logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    return df

In [5]:
def load_cloudnine():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)
    df['cat'] = df['cat'].map(lambda x: ast.literal_eval(x))
    df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

In [6]:
def load_hydra():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/hydra/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)
    df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))
    df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

In [7]:
def load_evolution():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/evolution/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    #df.rename(columns={'scraped_date':'scrape_date'}, inplace=True)
    #df['cat'] = df['category'].map(lambda x: ast.literal_eval(x))
    #df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

In [8]:
def postprocess(df):
    """
    standardized postprocessing
    """
    #normalize
    df['category'] = df['category'].map(lambda x:x.lower())
    
    #discard meta-categories
    df = df[df['category'].map(lambda x:x not in META_CATS)]
    logger.info(df.shape)
    
    #discard non-string categories
    def isfloat(value):
        try:
            float(value)
            return True
        except ValueError:
            return False
    df = df[df['category'].map(lambda x:not isfloat(x))]
    
    return df

In [7]:
def load_silkroad2():
    DATA_DIR='/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2/'
    l=[]
    for fname in os.listdir(DATA_DIR):
        if fname.endswith('.tsv'):
            try:
                df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
                l.append(df0)
            except ValueError:
                logger.exception('no data')
                pass
    df = pd.concat(l)
    logger.info(df.columns)
    logger.info(df.shape)
    
    #be consistent
    df['cat'] = df['category'].map(lambda x: x.split('-'))
    df['category'] = df['cat'].map(lambda x: x[-1])
    
    return df

sr = load_silkroad2()
sr

May 20 21:29:41 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 20 21:29:41 INFO   (663912, 7)
INFO:__main__:(663912, 7)


ValueError: malformed node or string: <_ast.BinOp object at 0x7f55aa712a90>

In [9]:
# cn = load_cloudnine()
# cn = postprocess(cn)
# cn.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine.tsv',sep='\t',index=False)

# ag = load_agora()
# ag = postprocess(ag)
# ag.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/agora.tsv',sep='\t',index=False)

#pa = load_pandora()
#pa = postprocess(pa)
#pa.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/pandora.tsv',sep='\t',index=False)

#hy = load_hydra()
#hy = postprocess(hy)
#hy.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/hydra.tsv',sep='\t',index=False)

# ev = load_evolution()
# ev = postprocess(ev)
# ev.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/evolution.tsv',sep='\t',index=False)

sr2 = load_silkroad2()
sr2 = postprocess(sr2)
sr2.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/silkroad2.tsv',sep='\t',index=False)


May 18 18:50:45 INFO   Index(['category', 'listing', 'scrape_date', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'listing', 'scrape_date', 'vendor'], dtype='object')
May 18 18:50:45 INFO   (3702353, 4)
INFO:__main__:(3702353, 4)
May 18 18:50:48 INFO   (3001716, 4)
INFO:__main__:(3001716, 4)


In [10]:
ev

Unnamed: 0,category,listing,scrape_date,vendor
0,stimulants,"Welcome back, gwern 0 0 0 BTC 0.0000 Hom...",2014-07-21,gwern
1,stimulants,,2014-07-21,NORWEGIANcom
2,stimulants,,2014-07-21,drzheng
3,stimulants,,2014-07-21,drzheng
4,stimulants,,2014-07-21,godfatherNL
5,stimulants,,2014-07-21,godfatherNL
6,stimulants,,2014-07-21,godfatherNL
7,stimulants,,2014-07-21,spencerhill
8,stimulants,,2014-07-21,Zable
9,stimulants,,2014-07-21,Zable


In [11]:
ev['vendor'].value_counts()

fake                113056
rc4me                42827
need4weed            42588
profesorhouse        37186
gwern                35980
RepAAA               35560
optiman              34522
sexyhomer            30600
Magicalthings123     28045
FoxyGirl             23205
DrPlatypus           21013
etimbuk              20863
cerberus             20190
theben               19920
AlbertHeijn          18801
...
fraudster           1
artash              1
EVERYTHING_CHEAP    1
tornado888          1
ViBE                1
firepower           1
franktors           1
Kastell             1
ambra               1
5cent               1
DonkeySausage       1
freshvybz           1
Bossen              1
TheDruMonSer        1
thecornershop       1
Length: 4139, dtype: int64