In [1]:
import pandas as pd
import dateutil
import os
import ast

import logging
# Notebook-wide logger: timestamped, level-tagged lines on the stream handler.
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
# Re-running this cell would otherwise attach one more handler per run and
# print every record once per handler, so detach whatever is already there.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
logger.addHandler(handler)
# Do not hand records on to the root logger: when the root logger has its own
# handler each message is printed a second time as "INFO:__main__:...".
logger.propagate = False
logger.setLevel(logging.INFO)

In [2]:
# Catch-all category labels that carry no information about the product.
# Stored lower-cased so they can be compared against normalized categories.
META_CATS = [
    label.lower()
    for label in (
        'Other',
        'Drugs',
        'Services',
        'Custom Listings',
        'DRUGS & MORE',
        'other service',
        'other drugs',
        'others',
    )
]

In [3]:
def load_agora(data_dir='/home/aahu/Dropbox/black-market-recommender-systems/data/agora/'):
    """
    Load every Agora scrape found in ``data_dir`` into a single DataFrame.

    Each file whose name ends in ``.tsv`` is read as tab-separated text with
    the ``scrape_date`` column parsed as dates. The per-file frames are
    concatenated in file-name order and keep their per-file row index.

    Parameters:
        data_dir: directory holding the ``.tsv`` scrapes; defaults to the
            original hard-coded location.

    Returns:
        The concatenated DataFrame.

    Raises:
        ValueError: if ``data_dir`` contains no ``.tsv`` file. Errors raised
            by ``pd.read_csv`` for an unreadable file are not caught here.
    """
    frames = []
    # os.listdir returns names in arbitrary order; sort for a reproducible row order
    for fname in sorted(os.listdir(data_dir)):
        if fname.endswith('.tsv'):
            frames.append(
                pd.read_csv(os.path.join(data_dir, fname), sep='\t', parse_dates=['scrape_date'])
            )
    if not frames:
        # pd.concat([]) would fail with a message that does not say where we looked
        raise ValueError('no .tsv files found in %s' % data_dir)
    df = pd.concat(frames)
    logger.info(df.columns)
    logger.info(df.shape)
    return df

In [4]:
def load_pandora(data_dir='/home/aahu/Dropbox/black-market-recommender-systems/data/pandora/'):
    """
    Load every Pandora scrape found in ``data_dir`` into a single DataFrame.

    Each file whose name ends in ``.tsv`` is read as tab-separated text with
    the ``scrape_date`` column parsed as dates. A file for which
    ``pd.read_csv`` raises ``ValueError`` is skipped. The remaining frames are
    concatenated in file-name order and keep their per-file row index.

    Parameters:
        data_dir: directory holding the ``.tsv`` scrapes; defaults to the
            original hard-coded location.

    Returns:
        The concatenated DataFrame.

    Raises:
        ValueError: if no ``.tsv`` file in ``data_dir`` could be loaded.
    """
    frames = []
    # os.listdir returns names in arbitrary order; sort for a reproducible row order
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.tsv'):
            continue
        try:
            frames.append(
                pd.read_csv(os.path.join(data_dir, fname), sep='\t', parse_dates=['scrape_date'])
            )
        except ValueError as exc:
            # Best effort: skip the file, but leave a one-line trace of which
            # one was dropped instead of discarding the error silently.
            logger.warning('skipping %s: %s', fname, exc)
    if not frames:
        # pd.concat([]) would fail with a message that does not say where we looked
        raise ValueError('no loadable .tsv files found in %s' % data_dir)
    df = pd.concat(frames)
    logger.info(df.columns)
    logger.info(df.shape)
    return df

In [5]:
def load_cloudnine(data_dir='/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine/'):
    """
    Load every Cloud Nine scrape found in ``data_dir`` into a single DataFrame.

    Each file whose name ends in ``.tsv`` is read as tab-separated text with
    the ``scrape_date`` column parsed as dates. A file for which
    ``pd.read_csv`` raises ``ValueError`` is logged and skipped. The remaining
    frames are concatenated in file-name order and keep their per-file row
    index. The ``cat`` column, read as the text of a Python list literal, is
    parsed into a list, and its last element is stored as ``category``.

    Parameters:
        data_dir: directory holding the ``.tsv`` scrapes; defaults to the
            original hard-coded location.

    Returns:
        The concatenated DataFrame with ``cat`` parsed and ``category`` added.

    Raises:
        ValueError: if no ``.tsv`` file in ``data_dir`` could be loaded.
    """
    frames = []
    # os.listdir returns names in arbitrary order; sort for a reproducible row order
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.tsv'):
            continue
        try:
            frames.append(
                pd.read_csv(os.path.join(data_dir, fname), sep='\t', parse_dates=['scrape_date'])
            )
        except ValueError:
            logger.exception('no data')
    if not frames:
        # pd.concat([]) would fail with a message that does not say where we looked
        raise ValueError('no loadable .tsv files found in %s' % data_dir)
    df = pd.concat(frames)
    logger.info(df.columns)
    logger.info(df.shape)

    # be consistent with the other markets' column names
    df = df.rename(columns={'scraped_date': 'scrape_date'})
    # literal_eval only accepts Python literals, so the cell text is never executed
    df['cat'] = df['cat'].map(ast.literal_eval)
    # the last entry of the parsed list becomes the category
    df['category'] = df['cat'].map(lambda path: path[-1])

    return df

In [13]:
def load_hydra(data_dir='/home/aahu/Dropbox/black-market-recommender-systems/data/hydra/'):
    """
    Load every Hydra scrape found in ``data_dir`` into a single DataFrame.

    Each file whose name ends in ``.tsv`` is read as tab-separated text with
    the ``scrape_date`` column parsed as dates. A file for which
    ``pd.read_csv`` raises ``ValueError`` is logged and skipped. The remaining
    frames are concatenated in file-name order and keep their per-file row
    index. The raw ``category`` column, read as the text of a Python list
    literal, is parsed into a list stored as ``cat``; ``category`` is then
    overwritten with the last element of that list.

    Parameters:
        data_dir: directory holding the ``.tsv`` scrapes; defaults to the
            original hard-coded location.

    Returns:
        The concatenated DataFrame with ``cat`` added and ``category`` replaced.

    Raises:
        ValueError: if no ``.tsv`` file in ``data_dir`` could be loaded.
    """
    frames = []
    # os.listdir returns names in arbitrary order; sort for a reproducible row order
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith('.tsv'):
            continue
        try:
            frames.append(
                pd.read_csv(os.path.join(data_dir, fname), sep='\t', parse_dates=['scrape_date'])
            )
        except ValueError:
            logger.exception('no data')
    if not frames:
        # pd.concat([]) would fail with a message that does not say where we looked
        raise ValueError('no loadable .tsv files found in %s' % data_dir)
    df = pd.concat(frames)
    logger.info(df.columns)
    logger.info(df.shape)

    # be consistent with the other markets' column names
    df = df.rename(columns={'scraped_date': 'scrape_date'})
    # literal_eval only accepts Python literals, so the cell text is never executed
    df['cat'] = df['category'].map(ast.literal_eval)
    # the last entry of the parsed list becomes the category
    df['category'] = df['cat'].map(lambda path: path[-1])

    return df

May 17 00:17:59 ERROR  no data
Traceback (most recent call last):
  File "<ipython-input-13-ce57e3d15e45>", line 7, in load_hydra
    df0 = pd.read_csv(os.path.join(DATA_DIR,fname), sep='\t', parse_dates=['scrape_date'])
  File "/usr/local/lib/python3.4/dist-packages/pandas/io/parsers.py", line 465, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/usr/local/lib/python3.4/dist-packages/pandas/io/parsers.py", line 241, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/usr/local/lib/python3.4/dist-packages/pandas/io/parsers.py", line 557, in __init__
    self._make_engine(self.engine)
  File "/usr/local/lib/python3.4/dist-packages/pandas/io/parsers.py", line 694, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/usr/local/lib/python3.4/dist-packages/pandas/io/parsers.py", line 1061, in __init__
    self._reader = _parser.TextReader(src, **kwds)
  File "pandas/parser.pyx", line 512, in pandas.parser.TextReader.__cinit_

Unnamed: 0,category,listing,price_usd,scrape_date,ships_from,ships_to,vendor,cat
0,Heroin,,$825.00,2014-10-24,United States,Worldwide,NODDING4YOU 4.4 25,"[Opioids, Heroin]"
1,Heroin,,$150.00,2014-10-24,United States,Worldwide,NODDING4YOU 4.4 25,"[Opioids, Heroin]"
2,Heroin,,$45.00,2014-10-24,United States,Worldwide,NODDING4YOU 4.4 25,"[Opioids, Heroin]"
3,Heroin,,$45.00,2014-10-24,United States,Worldwide,NODDING4YOU 4.4 25,"[Opioids, Heroin]"
4,Heroin,,$3200.00,2014-10-24,China,Worldwide,alchemy 5.0 3,"[Opioids, Heroin]"
5,Heroin,,$800.00,2014-10-24,China,Worldwide,alchemy 5.0 3,"[Opioids, Heroin]"
6,Heroin,,$600.00,2014-10-24,China,Worldwide,alchemy 5.0 3,"[Opioids, Heroin]"
7,Heroin,,$300.00,2014-10-24,China,Worldwide,alchemy 5.0 3,"[Opioids, Heroin]"
8,Heroin,,$160.00,2014-10-24,China,Worldwide,alchemy 5.0 3,"[Opioids, Heroin]"
9,Heroin,,$25.00,2014-10-24,China,Worldwide,alchemy 5.0 3,"[Opioids, Heroin]"


In [6]:
def postprocess(df):
    """
    standardized postprocessing

    Returns a new frame in which ``category`` is lower-cased and rows are
    dropped when their category is
      * missing or not a string,
      * one of the meta-categories listed in ``META_CATS``, or
      * text that ``float()`` can parse (this also covers strings such as
        ``'nan'`` and ``'inf'``).

    The frame passed in is left unmodified. The shape remaining after the
    meta-category filter is logged at INFO level.
    """
    def _parses_as_float(text):
        try:
            float(text)
        except ValueError:
            return False
        return True

    #normalize; anything that is not a string becomes missing instead of
    #raising AttributeError on .lower()
    lowered = df['category'].map(
        lambda value: value.lower() if isinstance(value, str) else None
    )
    # assign() builds a new frame, so the caller's data is not overwritten
    df = df.assign(category=lowered)

    #discard non-string categories
    df = df[df['category'].notna()]

    #discard meta-categories
    df = df[~df['category'].isin(META_CATS)]
    logger.info(df.shape)

    #discard categories that are really numbers
    numeric_mask = df['category'].map(_parses_as_float).astype(bool)
    df = df[~numeric_mask]

    return df

In [7]:
# One load -> postprocess -> write-TSV step per marketplace. The steps for
# markets that are not being regenerated in this run are left commented out.

# cn = load_cloudnine()
# cn = postprocess(cn)
# cn.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/cloudnine.tsv',sep='\t',index=False)

# ag = load_agora()
# ag = postprocess(ag)
# ag.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/agora.tsv',sep='\t',index=False)

# pa = load_pandora()
# pa = postprocess(pa)
# pa.to_csv('/home/aahu/Dropbox/black-market-recommender-systems/data/pandora.tsv',sep='\t',index=False)

# Hydra: load the raw scrapes, clean them, and write the result without the index.
hy = postprocess(load_hydra())
hy.to_csv(
    '/home/aahu/Dropbox/black-market-recommender-systems/data/hydra.tsv',
    sep='\t',
    index=False,
)


May 17 00:04:28 INFO   Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['cat', 'listing', 'price', 'quantity_available', 'quantity_sold', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 17 00:04:28 INFO   (92407, 9)
INFO:__main__:(92407, 9)
May 17 00:04:29 INFO   (85559, 10)
INFO:__main__:(85559, 10)
May 17 00:04:36 INFO   Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'listing', 'price_btc', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
May 17 00:04:36 INFO   (1773538, 7)
INFO:__main__:(1773538, 7)
May 17 00:04:38 INFO   (1639069, 7)
INFO:__main__:(1639069, 7)
May 17 00:05:30 INFO   Index(['category', 'item', 'price_usd', 'scrape_date', 'ships_from', 'ships_to', 'vendor'], dtype='object')
INFO:__main__:Index(['category', 'item', 'pri