# Beer Data Analysis
#### jhh283, sjh293

### Abstract
Preece and Shneiderman's Reader-to-Leader Framework [1] details a number of usability and sociability factors that sites can deploy to help promote contributions of user-generated content (UGC). In particular, the paper describes "visibility for...contributions" as a motivating factor for ongoing contributions of UGC, along with a "recognition of quality and quantity of contributions." This notebook's analysis focuses on RateBeer.com, one of the leading beer review websites, and in particular on site and user profile redesigns that provided additional visibility for users' contributions, and better recognition of the quantity and completeness of each user's reviews. Based on arguments in the Reader-to-Leader Framework, we hypothesized that these redesigns would result in a measurable increase in the quantity and diversity of contributions of beer reviews.

Before & After:
* Contribution rate, before & after redesign
* Contribution rate per (active) user
* Diversity of contributions (i.e., beer type, breweries, beers)

TODO:
* Repeat analysis around launch of the mobile app

[1]: Preece, Jennifer and Ben Shneiderman (2009). “The Reader-to-Leader Framework: Motivating Technology-Mediated Social Participation,” in Transactions on Human-Computer Interaction.

### Notebook helpers

In [None]:
import os

from IPython.display import HTML
from IPython.display import display, Image
from PIL import Image as PILImage

def files_at_relpath(rel_path):
    """Return the paths of all entries directly inside *rel_path*, sorted by name.

    Each returned path is ``rel_path`` joined with the entry name.
    ``os.listdir`` returns names in arbitrary order, so they are sorted here
    to make the result (and anything displayed from it) reproducible.
    """
    return [os.path.join(rel_path, name) for name in sorted(os.listdir(rel_path))]


def display_images(img_path, **kwargs):
    """Display every image found directly under *img_path* inline.

    Parameters
    ----------
    img_path : str
        Directory whose entries are listed with ``files_at_relpath``.
    **kwargs
        Forwarded to ``IPython.display.Image``. Two keys are consumed here and
        never forwarded: ``scale`` (multiplier applied to each image's native
        pixel size to derive ``width``/``height``; ``None`` means "do not
        resize") and ``key``.

    Entries that cannot be opened or embedded as an image are skipped, so a
    stray non-image file in the folder does not abort the whole cell.
    """
    # pop() rather than get(): an explicit scale=None must not leak through to
    # Image(), which does not accept a `scale` argument.
    scale = kwargs.pop('scale', None)
    kwargs.pop('key', None)

    for filename in files_at_relpath(img_path):
        try:
            # Fresh per-file options so one image's size never carries over
            # into the next call.
            options = dict(kwargs)
            if scale is not None:
                with PILImage.open(filename) as imgfile:
                    width, height = imgfile.size
                # Image documents width/height as integer pixel counts.
                options['width'] = int(round(width * scale))
                options['height'] = int(round(height * scale))
            display(Image(filename=filename, **options))
        except (ValueError, IOError, OSError):
            # ValueError: IPython refuses to embed an unsupported format.
            # IOError/OSError: PIL could not open or identify the file.
            continue

## RateBeer Profiles

### Profile for user 'MiP' as of April 2009 and August 2009 (respectively)
* April 2009: https://web.archive.org/web/20090425001049/http://www.ratebeer.com/View-User-10185.htm
* August 2009: https://web.archive.org/web/20090822202711/http://www.ratebeer.com/View-User-10185.htm

In [None]:
# Show every image in the MiP profile folder at 70% of its native pixel size.
display_images('images/ratebeer/profiles-MiP/', scale=0.7)

## Analysis

In [None]:
%matplotlib inline

import json
from itertools import groupby
from collections import defaultdict
from datetime import date

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

import numpy as np

In [None]:
# Path of the review dump that parse_json reads (one JSON object per line).
# The commented-out assignment is an alternative input file; swap the two
# lines to analyse it instead.
# SOURCE_FILE = 'data/reduced_data.txt'
SOURCE_FILE = 'data/mobile-data.txt'

### Data helpers

In [None]:
def _clean_or_keep(value):
    """Return ``clean_unicode(value)``, or *value* unchanged if cleaning fails.

    NOTE(review): ``clean_unicode`` is not defined in this part of the
    notebook. If it is missing at run time the resulting NameError is
    swallowed here and values pass through untouched, as in the original
    best-effort code.
    """
    try:
        return clean_unicode(value)
    except Exception:
        return value


def parse_json(filename, normalize=True):
    """Yield one decoded JSON object per non-blank line of *filename*.

    Parameters
    ----------
    filename : str
        Path to a text file holding one JSON object per line, read as latin-1.
    normalize : bool
        When true, every key and value of each object is passed through
        ``clean_unicode`` on a best-effort basis (failures keep the raw value).

    Yields
    ------
    dict
        The decoded (and optionally normalized) object for each line.

    Raises
    ------
    ValueError
        If a non-blank line is not valid JSON.
    """
    import io  # local import keeps this cell runnable on its own

    # The file is decoded as latin-1 on read, so json.loads() needs no
    # `encoding` keyword (removed in Python 3.9). io.open() accepts
    # `encoding` on both Python 2 and Python 3. The `with` block closes the
    # file, so no explicit close() is needed.
    with io.open(filename, encoding='latin-1') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            out = json.loads(line)
            if normalize:
                cleaned = {}
                for key in out:
                    cleaned[_clean_or_keep(key)] = _clean_or_keep(out[key])
                out = cleaned
            yield out

    
# via http://matplotlib.org/examples/pylab_examples/finance_work2.html
def moving_average(x, n, type='simple'):
    """Compute an *n*-period trailing moving average of *x*.

    Parameters
    ----------
    x : sequence of numbers
        Input series.
    n : int
        Window length (must be >= 1). If the series is shorter than *n*, the
        window shrinks to the series length instead of raising IndexError.
    type : str
        ``'simple'`` for equal weights; any other value selects the
        exponential weights ``exp(linspace(-1, 0, n))``. The name shadows the
        builtin but is kept for backward compatibility.

    Returns
    -------
    numpy.ndarray
        Array with the same length as *x*. The warm-up region, where the
        window is not yet full, is overwritten with the first value past it
        (``a[n]``), or with the last value when the series has at most *n*
        points. An empty input yields an empty array.

    Raises
    ------
    ValueError
        If *n* is smaller than 1.

    NOTE(review): ``np.convolve`` flips the kernel, so with the exponential
    weights the largest weight falls on the oldest sample in each window.
    Left unchanged to keep previously plotted results reproducible.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got %r' % (n,))

    x = np.asarray(x)
    if x.size == 0:
        return np.zeros(0)

    # Never use a window longer than the data we actually have.
    window = min(n, len(x))
    if type == 'simple':
        weights = np.ones(window)
    else:
        weights = np.exp(np.linspace(-1., 0., window))
    weights /= weights.sum()

    a = np.convolve(x, weights, mode='full')[:len(x)]
    # Original behaviour is a[:n] = a[n]; clamp the index so short series
    # fall back to the last (full-window) value instead of raising.
    fill_from = min(n, len(a) - 1)
    a[:fill_from] = a[fill_from]
    return a

### Load reviews and group by date

In [None]:
def timestamp_keyfunc(r):
    """Sort key: the review's raw ``review/timeUnix`` value (None if absent)."""
    return r.get('review/timeUnix')


def date_keyfunc(r):
    """Grouping key: the local calendar date of the review's ``review/timeUnix``."""
    return date.fromtimestamp(r.get('review/timeUnix'))

In [None]:
# Read the whole dump into memory, ordered oldest-first by timestamp; groupby
# below relies on this ordering to see each date as one contiguous run.
reviews_iter = parse_json(SOURCE_FILE)
sorted_reviews = sorted(reviews_iter, key=timestamp_keyfunc)

# Number of reviews written on each calendar date.
review_by_date = dict(
    (day, sum(1 for _ in day_reviews))
    for day, day_reviews in groupby(sorted_reviews, date_keyfunc)
)

In [None]:
# Headline figure: total number of reviews loaded, followed by a blank line.
# Written as a single-argument print() call so the cell produces the same
# output on Python 2 and Python 3.
print('Total reviews: {:,}\n'.format(len(sorted_reviews)))
# print(sorted_reviews[0:2])

### Plot of total RateBeer reviews between November 1, 2008 and November 1, 2009

In [None]:
# Chronological x axis and the matching daily review counts.
dates = sorted(review_by_date)
review_counts = [review_by_date[day] for day in dates]

# Smoothed series: exponentially weighted moving average over ma_days days.
ma_days = 45
ma = moving_average(review_counts, ma_days, 'exponential')

# Polynomial trend fitted against the day index (0, 1, 2, ...), not the date.
polynomial_order = 3
day_index = range(len(dates))
trend_line = np.polyfit(day_index, review_counts, polynomial_order)

In [None]:
# Ten-colour seaborn "Paired" palette; the plots index into it for line and
# fill colours.
current_palette = sns.color_palette("Paired", 10)

In [None]:
# One large axes for the daily review time series.
fig, axes = plt.subplots(1, figsize=(16, 10))

# Raw daily review counts: outline plus a filled area down to zero
axes.plot(dates, review_counts, color=current_palette[1], lw=1, label='Daily reviews')
axes.fill_between(dates, 0, review_counts, facecolor=current_palette[0], alpha=0.8)
# Moving average of the daily counts (window of ma_days days)
axes.plot(dates, ma, color=current_palette[3], lw=4, alpha=0.75, label=('%d day moving avg.' % ma_days))
# Fitted polynomial trend (order polynomial_order), evaluated at each day index
axes.plot(dates, np.polyval(trend_line, range(len(dates))), color=current_palette[5], lw=4, alpha=0.75, label=('Order %d linear model' % polynomial_order))

# Labels and axes formatting
axes.set_title('RateBeer Reviews by Date')
axes.set_xlabel('Dates')
axes.set_ylabel('Reviews')
fig.autofmt_xdate()
# fmt_xdata only controls how x values appear in the interactive cursor readout
axes.fmt_xdata = mdates.DateFormatter('%Y-%m-%d')
# Build the legend from the labelled artists plotted above
handles, labels = axes.get_legend_handles_labels()
axes.legend(handles, labels)

plt.show()

### Unique users and beer styles by month

In [None]:
def user_keyfunc(r):
    """Grouping key: the reviewer's ``user/profileName`` (None if absent)."""
    return r.get('user/profileName')


def month_keyfunc(review):
    """Map a review to the first day of the month in which it was written."""
    written_on = date.fromtimestamp(review.get('review/timeUnix'))
    return written_on.replace(day=1)

In [None]:
# Distinct reviewers per month: within each month, sort the reviews by user so
# that groupby yields exactly one run per user, then count the runs.
monthly_user_count = dict(
    (
        month,
        sum(1 for _ in groupby(sorted(month_reviews, key=user_keyfunc), user_keyfunc)),
    )
    for month, month_reviews in groupby(sorted_reviews, month_keyfunc)
)

In [None]:
# Split the month -> count mapping into two parallel, chronologically ordered lists.
monthly_pairs = sorted(monthly_user_count.items())
months = [month for month, _ in monthly_pairs]
user_counts = [count for _, count in monthly_pairs]

In [None]:
# One large axes for the monthly unique-reviewer series.
fig, axes = plt.subplots(1, figsize=(16, 10))

# Unique users per month: outline plus a filled area down to zero
axes.plot(months, user_counts, color=current_palette[3], lw=1, label='Unique users')
axes.fill_between(months, 0, user_counts, facecolor=current_palette[2], alpha=0.8)

# Labels and axes formatting
fig.autofmt_xdate()
axes.set_title('RateBeer Unique Users by Month')
axes.set_xlabel('Months')
axes.set_ylabel('Users')
# fmt_xdata only controls how x values appear in the interactive cursor readout
axes.fmt_xdata = mdates.DateFormatter('%Y-%m')
plt.show()

In [None]:
# Render the monthly unique-user counts as a two-column HTML table.
users_html = ["""<table>
<tr>
<th>Date</th>
<th>User Counts</th>
</tr>"""]
users_html.extend(
    """<tr><td>%s</td><td>%s</td></tr>""" % (month, "{:,}".format(count))
    for month, count in zip(months, user_counts)
)
users_html.append("""</table>""")

# A bare expression on the last line makes the notebook render the table.
h = HTML(''.join(users_html))
h

In [None]:
def beer_style_keyfunc(r):
    """Grouping key: the review's ``beer/style`` value (None if absent)."""
    return r.get(u'beer/style')

In [None]:
# monthly_beer_count: month -> {style: reviews of that style in the month}
# all_styles:         style -> reviews of that style across all months
monthly_beer_count = {}
all_styles = {}
for key_month, group in groupby(sorted_reviews, month_keyfunc):
    # groupby only merges adjacent items, so order the month's reviews by style first.
    by_style = sorted(group, key=beer_style_keyfunc)
    per_style = dict(
        (key_style, sum(1 for _ in subgroup))
        for key_style, subgroup in groupby(by_style, beer_style_keyfunc)
    )
    monthly_beer_count[key_month] = per_style
    for key_style, style_count in per_style.items():
        all_styles[key_style] = all_styles.get(key_style, 0) + style_count

In [None]:
# Keep the N most-reviewed styles overall; everything else is pooled as 'Other'.
N_styles = 10
top_N_styles = sorted(all_styles.items(), key=lambda s: s[1], reverse=True)[:N_styles]
top_styles = [name for name, _ in top_N_styles]

months = sorted(monthly_beer_count)

# style -> list of monthly review counts, aligned with `months`.
# .get(style, 0): a top style may have no reviews at all in some month.
style_counts = {}
for style in top_styles:
    style_counts[style] = [monthly_beer_count[month].get(style, 0) for month in months]

# Count of reviews for beers outside top_N_styles
# (appended in chronological order so the series lines up with `months`;
# inserting at the front would reverse it).
other_counts = []
for i, month in enumerate(months):
    total_reviews = sum(monthly_beer_count[month].values())
    top_style_reviews = sum(style_counts[s][i] for s in top_styles)
    other_counts.append(total_reviews - top_style_reviews)
style_counts['Other'] = other_counts
# 'Other' goes first so it is drawn at the bottom of the stacked plot.
top_styles.insert(0, 'Other')

In [None]:
# Rebind the palette: one "hls" colour per stacked series in top_styles
# (the top styles plus 'Other').
current_palette = sns.color_palette("hls", len(top_styles))
# sns.palplot(current_palette)

In [None]:
fig, axes = plt.subplots(1, figsize=(16, 10))

# Stacked area chart: each style is drawn on top of the running total of the
# styles before it, in top_styles order.
area_bottoms = [0] * len(months)
for cp_index, style in enumerate(top_styles):
    series_colour = current_palette[cp_index]
    area_tops = [count + base for count, base in zip(style_counts[style], area_bottoms)]

    axes.plot(months, area_tops, color=series_colour, lw=1, label=('%s' % style))
    axes.fill_between(months, area_bottoms, area_tops, facecolor=series_colour, alpha=0.8)

    # The top edge of this band is the baseline of the next one.
    area_bottoms = area_tops

# Labels, axes formatting and legend
axes.set_title('RateBeer Style Reviews by Date')
axes.set_xlabel('Dates')
axes.set_ylabel('Reviews')
fig.autofmt_xdate()
axes.fmt_xdata = mdates.DateFormatter('%Y-%m-%d')
handles, labels = axes.get_legend_handles_labels()
axes.legend(handles, labels, loc=8, ncol=2, frameon=True, fontsize='large', title='Beer Styles')

plt.show()

In [None]:
# Render the per-style monthly review counts as an HTML table:
# one column per month, one row per style.
# NOTE(review): style names come straight from the data and are not
# HTML-escaped here.
styles_html = ['<table><tr>']
styles_html.append('<th></th>')  # Blank upper left corner
styles_html.extend('<th>%s</th>' % month.strftime('%b %Y') for month in months)
styles_html.append('</tr>')

for style in top_styles:
    styles_html.append('<tr><td>%s</td>' % style)
    styles_html.extend('<td>%s</td>' % '{:,}'.format(count) for count in style_counts[style])
    styles_html.append('</tr>')
# Close the table element, which was previously left open.
styles_html.append('</table>')

# A bare expression on the last line makes the notebook render the table.
h = HTML(''.join(styles_html))
h