In [67]:
import os
import sys
import time
import json
import gzip
import string
from datetime import datetime

# Directory holding the input datasets: a 'data' folder under the current
# working directory (so results depend on where the kernel was started).
DATA_PATH = os.path.join(os.getcwd(), 'data')
# Gzip-compressed review dumps read by parse_beer().
RATEBEER_FILENAME = os.path.join(DATA_PATH, 'Ratebeer.txt.gz')
BEERADVOCATE_FILENAME = os.path.join(DATA_PATH, 'Beeradvocate.txt.gz')
# Maximum number of parsed entries to scan per dataset in the loops below;
# None means "no limit".
PARSE_LIMIT = None

In [68]:
def convert_fraction_string(val_str):
    """Convert a rating written as 'numerator/denominator' into a float.

    Args:
        val_str: string such as '4/5' or '14/20'. Only the first two
            '/'-separated parts are used.

    Returns:
        numerator / denominator as a float, or 0.0 when the string is
        malformed: no '/', a zero denominator, or a part that is not an
        integer literal (e.g. 'N/A' or an empty string).
    """
    parts = val_str.split('/')
    try:
        return float(int(parts[0])) / int(parts[1])
    except (ZeroDivisionError, IndexError, ValueError):
        # ValueError covers non-numeric parts; treat them like the other
        # malformed inputs instead of aborting the caller's sanitization.
        return 0.0

In [69]:
# Rating fields converted to floats, in the order they are processed.
_RATING_KEYS = (
    'review/appearance',
    'review/taste',
    'review/overall',
    'review/palate',
    'review/aroma',
)

# Names given to the nine fields of the struct returned by time.gmtime().
_TIME_FIELDS = ("year", "mon", "mday", "hour", "min", "sec", "wday", "yday", "isdst")


def _sanitize_review(e, parse_rating):
    """Normalize one raw review dict in place (shared by both sources).

    Steps, in order:
      1. convert every rating field with ``parse_rating``;
      2. replace 'review/time' with the integer 'review/timeUnix';
      3. convert 'beer/ABV' to float, or drop the key if that fails;
      4. rename 'review/profileName' to 'user/profileName';
      5. add 'review/timeStruct', the UTC breakdown of 'review/timeUnix'.

    Any error outside step 3 is printed and stops processing, leaving the
    entry with whatever steps completed before the failure.

    Args:
        e: dict of raw string fields for one review; mutated in place.
        parse_rating: callable turning a raw rating string into a float.
    """
    try:
        for key in _RATING_KEYS:
            e[key] = parse_rating(e[key])
        e['review/timeUnix'] = int(e['review/time'])
        e.pop('review/time', None)
        try:
            e['beer/ABV'] = float(e['beer/ABV'])
        except Exception:
            # ABV is optional: missing or unparseable values are removed
            # rather than failing the whole entry.
            e.pop('beer/ABV', None)
        e['user/profileName'] = e['review/profileName']
        e.pop('review/profileName', None)
        e['review/timeStruct'] = dict(zip(_TIME_FIELDS, time.gmtime(e['review/timeUnix'])))
    except Exception as q:
        # Best-effort: report the problem and keep going with the next entry.
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print(q)


def sanitize_ba(e):
    """Sanitize a BeerAdvocate review in place; ratings are parsed with float()."""
    _sanitize_review(e, float)


def sanitize_rb(e):
    """Sanitize a RateBeer review in place; ratings go through convert_fraction_string()."""
    _sanitize_review(e, convert_fraction_string)

In [70]:
def pick_sanitize(e, source):
    """Run the sanitizer matching ``source`` on ``e`` and return ``e``.

    'BA' selects sanitize_ba and 'RB' selects sanitize_rb; any other value
    leaves the entry untouched. The entry is modified in place and the same
    object is returned.
    """
    if source not in ('BA', 'RB'):
        return e
    cleaner = sanitize_ba if source == 'BA' else sanitize_rb
    cleaner(e)
    return e

In [71]:
def parse_beer(filename, source=None):
    """Yield one sanitized review dict per record in a gzip-compressed dump.

    Each record is a run of 'key: value' lines; any line without a ':'
    (after stripping whitespace) ends the current record. Values are taken
    from two characters past the colon, i.e. a ': ' separator is assumed.

    Args:
        filename: path to the gzip file.
        source: passed to pick_sanitize() to select the sanitizer
            ('BA', 'RB', or None for no sanitization).

    Yields:
        The dict for each non-empty record, after pick_sanitize(). Empty
        records (consecutive separator lines, or a trailing separator at
        end of file) are skipped instead of being sanitized and yielded.
    """
    f = gzip.open(filename, 'r')
    try:
        entry = {}
        for l in f:
            l = l.strip()
            colonPos = l.find(':')
            if colonPos == -1:
                # Separator line: flush the record collected so far, if any.
                if entry:
                    yield pick_sanitize(entry, source)
                    entry = {}
                continue
            entry[l[:colonPos]] = l[colonPos+2:]
        # The file may end without a trailing separator line.
        if entry:
            yield pick_sanitize(entry, source)
    finally:
        # Also runs when the consumer stops iterating early and the
        # generator is closed, so the file handle is never leaked.
        f.close()

In [73]:
def find_review_date_range(reviews, limit=None):
    """Find the earliest and latest reviews by 'review/timeUnix'.

    Args:
        reviews: iterable of review dicts (e.g. from parse_beer()).
        limit: stop after this many items have been read from ``reviews``;
            None reads everything.

    Returns:
        (first_review, last_review, min_date, max_date). All four are None
        when no review with a timestamp was seen. Reviews that are empty or
        have no 'review/timeUnix' (sanitization failed) are ignored, so a
        missing timestamp can never displace the real minimum or maximum.
    """
    first_review, last_review = None, None
    min_date, max_date = None, None
    for i, review in enumerate(reviews):
        if limit is not None and i >= limit:
            break
        timestamp = review.get('review/timeUnix') if review else None
        if timestamp is None:
            continue
        if min_date is None or timestamp < min_date:
            first_review, min_date = review, timestamp
        if max_date is None or timestamp > max_date:
            last_review, max_date = review, timestamp
    return first_review, last_review, min_date, max_date


start = time.time()
first_review, last_review, min_date, max_date = find_review_date_range(
    parse_beer(BEERADVOCATE_FILENAME, source='BA'), PARSE_LIMIT)

print(json.dumps(first_review, sort_keys=True, indent=4))
print(json.dumps(last_review, sort_keys=True, indent=4))
print('BA Parser finished in %.3f seconds' % (time.time() - start))

print('')
print('')
# Restart the timer so the RB figure does not include the BA pass.
start = time.time()
first_review, last_review, min_date, max_date = find_review_date_range(
    parse_beer(RATEBEER_FILENAME, source='RB'), PARSE_LIMIT)

print(json.dumps(first_review, sort_keys=True, indent=4))
print(json.dumps(last_review, sort_keys=True, indent=4))
print('RB Parser finished in %.3f seconds' % (time.time() - start))

'review/appearance'
{
    "beer/ABV": 5.3, 
    "beer/beerId": "93", 
    "beer/brewerId": "33", 
    "beer/name": "Steel Rail Extra Pale Ale", 
    "beer/style": "American Pale Ale (APA)", 
    "review/appearance": 3.5, 
    "review/aroma": 3.5, 
    "review/overall": 4.0, 
    "review/palate": 4.0, 
    "review/taste": 4.0, 
    "review/text": "Presentation: Pint size, on tap from the Eastside Grill, Northampton, MA. \t\tAppearance: Slightly cloudy, with a smooth white head that left rings around the glass as the level went down.\t\tSmell: This brew had a nice clean balance of malt and hop aromas, with hints of a pleasing citrus smell.\t\tTaste: A very light ale, with an extremely refreshing amount of carbonation. Lovely honey, lemon, and nut overtones followed by light sweet malt afters.\t\tNotes: By far on of the best American Pale Ales in the NE on tap. The chicken, shrimp, and andouille jambalaya complemented this extrodianry brew. Mexican and other Cajun meals would also be reco