In [1]:
import os
import sys
import time
import json
import gzip
import string
from datetime import datetime
import unicodedata
import calendar

# Optional cap on how many reviews to scan; None means "no cap".
# NOTE(review): presumably meant as the parse_limit argument of
# min_max_dates (see the commented-out call further down) -- confirm.
PARSE_LIMIT = None

# The gzipped review dumps are looked up in a "data" folder under the
# current working directory.
DATA_PATH = os.path.join(os.getcwd(), 'data')
BEERADVOCATE_FILENAME = os.path.join(DATA_PATH, 'Beeradvocate.txt.gz')
RATEBEER_FILENAME = os.path.join(DATA_PATH, 'Ratebeer.txt.gz')

In [2]:
def convert_fraction_string(val_str):
    """
    Turn a string of the form "numerator/denominator" into a float.

    Both sides of the slash are parsed as integers. When there is no slash
    (no denominator) or the denominator is zero, 0.0 is returned instead.
    A side that is not a valid integer still raises ValueError.
    """
    pieces = val_str.split('/')
    try:
        numerator = int(pieces[0])
        denominator = int(pieces[1])
        # float() first so the division is never an integer division.
        return float(numerator) / denominator
    except (IndexError, ZeroDivisionError):
        return 0.0

In [3]:
# Rating fields converted to floats, in the order they are processed.
_RATING_KEYS = (
    'review/appearance',
    'review/taste',
    'review/overall',
    'review/palate',
    'review/aroma',
)

# Names given to the nine positional fields of the tuple returned by time.gmtime().
_TIME_STRUCT_FIELDS = ("year", "mon", "mday", "hour", "min", "sec", "wday", "yday", "isdst")


def _sanitize_review(e, parse_rating):
    """
    Shared in-place clean-up of one raw review dict.

    Steps, in order:
      * every rating field is replaced by ``parse_rating(raw_value)``;
      * 'review/time' is replaced by the integer 'review/timeUnix';
      * 'beer/ABV' becomes a float, or is dropped when missing/unparseable;
      * 'review/profileName' is renamed to 'user/profileName';
      * 'review/timeStruct' is added: the UTC breakdown of the timestamp
        as a plain dict.

    Any failure is printed and swallowed, so ``e`` may be left partially
    converted (fields processed before the failure keep their new values).
    An empty dict is left untouched without printing anything.
    """
    if not e:
        # Nothing to sanitize; avoids a noisy KeyError print for empty records.
        return
    try:
        for key in _RATING_KEYS:
            e[key] = parse_rating(e[key])

        # Convert first, remove afterwards: if int() fails the raw value stays.
        e['review/timeUnix'] = int(e['review/time'])
        e.pop('review/time', None)

        try:
            e['beer/ABV'] = float(e['beer/ABV'])
        except (KeyError, TypeError, ValueError):
            # ABV is optional: drop it rather than keep an unusable value.
            e.pop('beer/ABV', None)

        e['user/profileName'] = e.pop('review/profileName')

        utc_parts = time.gmtime(e['review/timeUnix'])
        e['review/timeStruct'] = dict(zip(_TIME_STRUCT_FIELDS, utc_parts))
    except Exception as err:
        # Best effort: report the problem and leave the entry as far as we got.
        print(err)


def sanitize_ba(e):
    """
    Sanitize, in place, a review whose ratings are plain decimal strings.

    Selected by sanitizer_switch for source 'BA' (presumably BeerAdvocate --
    confirm). Ratings are converted with float().
    """
    _sanitize_review(e, float)


def sanitize_rb(e):
    """
    Sanitize, in place, a review whose ratings are "x/y" fraction strings.

    Selected by sanitizer_switch for source 'RB' (presumably RateBeer --
    confirm). Ratings are converted with convert_fraction_string().
    """
    _sanitize_review(e, convert_fraction_string)

In [4]:
def sanitizer_switch(e, source):
    """
    Run the sanitizer matching ``source`` on ``e`` and hand ``e`` back.

    'BA' selects sanitize_ba and 'RB' selects sanitize_rb. Any other value
    (including None) leaves the entry untouched. The same dict object that
    was passed in is always returned.
    """
    if source == 'BA':
        sanitize_ba(e)
        return e
    if source == 'RB':
        sanitize_rb(e)
    return e

In [5]:
def parse_beer(filename, source=None):
    """
    Lazily yield review dicts parsed from a gzipped text file.

    Each line containing a colon contributes one field: the text before the
    first colon is the key, and the value starts two characters after that
    colon (i.e. a one-character separator after the colon is assumed). Any
    line without a colon ends the current record.

    Every completed record is passed through sanitizer_switch(entry, source)
    before being yielded. Records with no fields (for example a separator
    line at the very end of the file, or consecutive separator lines) are
    skipped instead of being yielded as empty dicts.

    The archive is closed once the generator is exhausted or closed.
    """
    entry = {}
    with gzip.open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            colon_pos = line.find(':')
            if colon_pos == -1:
                # Record separator: emit what was collected, if anything.
                if entry:
                    yield sanitizer_switch(entry, source)
                entry = {}
                continue
            entry[line[:colon_pos]] = line[colon_pos + 2:]
    # The file may end without a trailing separator line.
    if entry:
        yield sanitizer_switch(entry, source)

In [6]:
def min_max_dates(beer_iter, parse_limit=None):
    """
    Find the reviews with the smallest and largest 'review/timeUnix'.

    Parameters:
        beer_iter:   iterable of review dicts.
        parse_limit: when not None, stop after this many items have been
                     taken from ``beer_iter`` (every item counts, including
                     ones that are skipped).

    Returns:
        (first_review, last_review). On ties the earliest-seen review wins.
        Both are None when no review carried a timestamp.

    Falsy reviews and reviews without a 'review/timeUnix' value are skipped,
    so a missing timestamp can no longer be compared against a number.
    """
    min_date, max_date = None, None
    first_review, last_review = None, None

    for i, review in enumerate(beer_iter):
        if parse_limit is not None and i >= parse_limit:
            break

        timestamp = review.get('review/timeUnix') if review else None
        if timestamp is None:
            continue

        if min_date is None or timestamp < min_date:
            first_review, min_date = review, timestamp
        if max_date is None or timestamp > max_date:
            last_review, max_date = review, timestamp

    return first_review, last_review

In [7]:
def filter_data_date(start, end, beer_iter, filename):
    """
    Write the reviews whose timestamp lies in [start, end] to ``filename``.

    Parameters:
        start, end: inclusive bounds compared against 'review/timeUnix'.
        beer_iter:  iterable of review dicts.
        filename:   output path; it is overwritten. Each kept review is
                    written as one JSON document per line.

    Falsy reviews and reviews without a 'review/timeUnix' value are skipped.
    """
    with open(filename, 'w') as f:
        for review in beer_iter:
            if not review:
                continue
            timestamp = review.get('review/timeUnix')
            # Skip explicitly rather than comparing None against a number.
            if timestamp is None or not (start <= timestamp <= end):
                continue
            json.dump(review, f, ensure_ascii=False)
            f.write('\n')

In [8]:
def clean_unicode(instr):
    """
    Reduce a unicode string to its ASCII-only form.

    The text is first decomposed with NFKD normalization and then encoded as
    ASCII; anything that cannot be encoded is dropped. The encoded result is
    returned.
    """
    decomposed = unicodedata.normalize('NFKD', instr)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only

In [9]:
# first, last = min_max_dates(data_iter, PARSE_LIMIT)

In [10]:
def parse_json(filename, normalize=True):
    """
    Lazily yield one decoded JSON object per line of ``filename``.

    Parameters:
        filename:  path of a file holding one JSON document per line.
        normalize: when true, every top-level key and value is passed
                   through clean_unicode; anything clean_unicode cannot
                   handle (numbers, nested dicts, ...) is kept unchanged.

    The file is closed when the generator is exhausted or closed.
    """
    def _ascii_if_possible(value):
        # Best effort on purpose: values clean_unicode rejects pass through as-is.
        try:
            return clean_unicode(value)
        except Exception:
            return value

    with open(filename) as f:
        for line in f:
            # NOTE(review): the ``encoding`` keyword is only accepted by older
            # json.loads implementations (it was removed in Python 3.9).
            out = json.loads(line, encoding='latin-1')
            if normalize:
                temp = {}
                for key in out:
                    temp[_ascii_if_possible(key)] = _ascii_if_possible(out[key])
                out = temp
            yield out

In [11]:
# One-off preprocessing step: scan the whole RateBeer archive and write the
# reviews that fall inside the date window to a smaller file. Only needs to
# be run once.
# Window bounds are midnight 2008-05-01 and midnight 2010-05-01, turned into
# Unix timestamps with calendar.timegm (which treats the tuple as UTC).
start_timestamp = calendar.timegm(datetime(2008, 5, 1).timetuple())
end_timestamp = calendar.timegm(datetime(2010, 5, 1).timetuple())
data_iter = parse_beer(RATEBEER_FILENAME, 'RB')
filter_data_date(start_timestamp, end_timestamp, data_iter, 'data/reduced_data.txt')

'review/appearance'


In [12]:
# Materialize every review from the reduced file into an in-memory list.
parsed_red = parse_json('data/reduced_data.txt')
dset = list(parsed_red)

In [13]:
# Peek at the first two loaded reviews.
dset[:2]

[{'beer/ABV': 8.0,
  'beer/beerId': '91592',
  'beer/brewerId': '3228',
  'beer/name': 'Barley Island Barrel-Aged Count Hopula',
  'beer/style': 'Imperial/Double IPA',
  'review/appearance': 0.8,
  'review/aroma': 0.8,
  'review/overall': 0.75,
  'review/palate': 0.8,
  'review/taste': 0.8,
  'review/text': 'Handbottled from trade wth Sprinkle. Pours a nice dark copper color with medium size off white head. Aroma of bourbon, malt , hops and oak. Slight smokey flavor with a bourbon taste in the initial sip. Flavors of malt, vanilla and hops still remain although none dominate the brew. Taste is still very enjoyable with a smooth and balanced finish.',
  'review/timeStruct': {u'hour': 0,
   u'isdst': 0,
   u'mday': 3,
   u'min': 0,
   u'mon': 8,
   u'sec': 0,
   u'wday': 0,
   u'yday': 215,
   u'year': 2009},
  'review/timeUnix': 1249257600,
  'user/profileName': 'JJClark'},
 {'beer/ABV': 8.0,
  'beer/beerId': '91592',
  'beer/brewerId': '3228',
  'beer/name': 'Barley Island Barrel-Aged 