In [388]:
# Make all plots inline.
%matplotlib inline 

# Import dependencies
import folium
from folium.plugins import HeatMap
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from IPython.core.display import display
import folium.colormap as cm

# Loading datasets

In [76]:
# Load brewery data.
beer_data = pd.read_csv('./wrk_tables/beer_eda.csv')
pop_data = pd.read_csv('./wrk_tables/zip_features.csv')
review_data = pd.read_csv('./src_tables/review.csv')
yelp_data = pd.read_csv('./wrk_tables/yelp_brewery.csv')

In [634]:
beer_data = pd.read_csv('./wrk_tables/beer_eda.csv')

In [635]:
# Remove first column
beer_data = beer_data.iloc[:,1:len(beer_data.iloc[1])+1]

In [637]:
beer_data['id'] = beer_data['id'].str.replace(',', '')

In [None]:
pop_data.head()

In [None]:
review_data.head()

In [None]:
yelp_data.head()

In [640]:
# Replacing missing IBU values with mean value for corresponding beer style
for i in range(0, len(beer_data['id'])):
    if np.isnan(beer_data['ibu'].iloc[i]):
        beer_data['ibu'].iloc[i] = np.nanmean(beer_data['ibu'].loc[beer_data['beer_style_norm'] == beer_data['beer_style_norm'].iloc[i]])

In [641]:
# None of the Sakes have an IBU, so replace with 0
beer_data['ibu'] = beer_data['ibu'].fillna(0)

In [642]:
# Replacing missing calorie values with mean value for corresponding beer style
for i in range(0, len(beer_data['id'])):
    if np.isnan(beer_data['calories'].iloc[i]):
        beer_data['calories'].iloc[i] = np.nanmean(beer_data['calories'].loc[beer_data['beer_style_norm'] == beer_data['beer_style_norm'].iloc[i]])

In [643]:
# Replacing missing ABV values with mean value for corresponding beer style
for i in range(0, len(beer_data['id'])):
    if np.isnan(beer_data['abv'].iloc[i]):
        beer_data['abv'].iloc[i] = np.nanmean(beer_data['abv'].loc[beer_data['beer_style_norm'] == beer_data['beer_style_norm'].iloc[i]])

# Storing User Beer

In [644]:
# User searches for beer in database (with autocomplete)
# Here use 'Dogfish Head 90 Minute Imperial IPA' as example (beer_id = 1)
USER_BEER = beer_data.iloc[577]
USER_BEER

id                                                                1195
brewery_id                                                       4,327
beer_style_id                                                       30
brewery_type_id                                                      4
post_code                                                       24,767
loc_id                                                           17899
name                               Dogfish Head 90 Minute Imperial IPA
description          Esquire Magazine calls our 90 Minute I.I.PA., ...
abv                                                                  9
calories                                                           270
ibu                                                                 90
num_ratings                                                       4646
overall_rating                                                     100
beer_style_rating                                                   99
mean_r

In [645]:
USER_BEER_ID = USER_BEER['id']

# Calculating Style Similarity Score

In [646]:
# Calculate standard deviations of ABV, Calories, IBU, Style Score and Style Rating
abv_std = np.std(beer_data['abv'])
cal_std = np.std(beer_data['calories'])
ibu_std = np.std(beer_data['ibu'])
style_std = np.std(beer_data['style_score'])
style_rat_std = np.std(beer_data['beer_style_rating'])

In [647]:
# Determine relative weights
# I think similarity of style and ABV should be most important
# Then style rating, calories and IBUs (because many IBUs are just the mean)

style_wt = 0.25
abv_wt = 0.3
cal_wt = 0.15
ibu_wt = 0.15
style_rat_wt = 0.15

In [649]:
# Function running the algorithm for getting style similarity scores
# Runs quickly, so no need to pre-compute I think
def style_similarity(user_beer):
    sim_list = [0] * len(beer_data['id'])
    for i in range(0,len(beer_data['id'])):
        if user_beer['id'] == beer_data['id'].iloc[i]:
            sim_list[i] = np.nan
        else:
            dev = np.sqrt(style_wt*((user_beer['style_score'] - beer_data['style_score'].iloc[i])/style_std)**2 + 
                          abv_wt*((user_beer['abv'] - beer_data['abv'].iloc[i])/abv_std)**2 + 
                          cal_wt*((user_beer['calories'] - beer_data['calories'].iloc[i])/cal_std)**2 + 
                          ibu_wt*((user_beer['ibu'] - beer_data['ibu'].iloc[i])/ibu_std)**2 + 
                          style_rat_wt*((user_beer['beer_style_rating'] - beer_data['beer_style_rating'].iloc[i])/style_rat_std)**2)
            sim_score = 100*((1.25-dev)/1.25) # Aiming to have ~80% of beers above 0, depending on style
            sim_list[i] = sim_score
    return sim_list

In [650]:
# Generate Style similarity scores for the test case
SIMILARITY_SCORES = style_similarity(USER_BEER)

In [651]:
# Using this to return index of max value
import operator
index, value = max(enumerate(SIMILARITY_SCORES), key=operator.itemgetter(1))
print(index, value)

(1731, 93.349516765925159)


In [652]:
# Best similarity match
SIMILARITY_SCORES[1731]

93.349516765925159

In [653]:
beer_data.iloc[1731]

id                                                                3963
brewery_id                                                      12,507
beer_style_id                                                       81
brewery_type_id                                                      3
post_code                                                       12,182
loc_id                                                           17514
name                          Stone Sublimely Self Righteous Black IPA
description          We're brewers whose substantial mettle and idi...
abv                                                                8.7
calories                                                           261
ibu                                                                 90
num_ratings                                                       2253
overall_rating                                                     100
beer_style_rating                                                  100
mean_r

# Rating Component

In [655]:
# Function calculating the Bayesian weighted average overall rating
# S = wR + (1-w)C
# w = v/(v+m), where v = num ratings for each individual beer, and 
# m = avg num ratings for beer style
# R = overall_rating for each individual beer
# C = average overall_rating for the beer style
def weighted_rating(rtg_measure):
    rating_list = [0] * len(beer_data['id'])
    for i in range(0,len(beer_data['id'])):
        v = beer_data['num_ratings'].iloc[i]
        m = np.mean(beer_data['num_ratings'].loc[beer_data['beer_style_norm'] == beer_data['beer_style_norm'].iloc[i]])
        w = v/(v+m)
        R = beer_data[rtg_measure].iloc[i]
        C = np.mean(beer_data[rtg_measure].loc[beer_data['beer_style_norm'] == beer_data['beer_style_norm'].iloc[i]])
        rating_list[i] = w*R + (1-w)*C
    return rating_list

In [656]:
# Compile ratings based on overall_rating
RATING_SCORES = weighted_rating('overall_rating')

In [657]:
RATING_SCORES[1731]

99.850175046194352

In [658]:
RATING_SCORES[577]

99.920880063593316

# Calculate Preference Match Component

In [695]:
# Rohan's preferencing matching table loaded here
# containing top 50 matches for each beer_id
# all others would receive a score of 0

PREF_TABLE = pd.read_csv('./wrk_tables/beer_similarities.csv')


In [789]:
# Function to calculate similarity scores for user input beer
def review_match(user_beer):
    pref_list = [0] * len(beer_data['id'])
    pref_list[0] = 0
    pref_scores = PREF_TABLE.loc[PREF_TABLE['beer_id'] == int(user_beer['id'])]
    for i in range(1,len(beer_data)):
        if beer_data['id'].iloc[i] == user_beer['id']:
            pref_list[i] = np.nan
        elif int(beer_data['id'].iloc[i]) not in pref_scores['similar_beer_id'].values:
            pref_list[i] = 0
        else:
            x = pref_scores['similarity_score'].loc[pref_scores['similar_beer_id'] == int(beer_data['id'].iloc[i])].values
            pref_list[i] = x[0]
    pref_list = [i * 100 for i in pref_list]
    return pref_list

In [790]:
PREF_SCORES = review_match(USER_BEER)

In [793]:
# Find the most comparable beer based on reviews
index, value = max(enumerate(PREF_SCORES), key=operator.itemgetter(1))
print(index, value)

(586, 97.152757600000001)


In [794]:
# Check if this makes sense
beer_data.iloc[586]
# Based on a quick glance over RateBeer descriptions, I think this is a pretty good match
# Despite completely different styles (Imperial IPA vs. Doppelbock), there are similarities

id                                                                1210
brewery_id                                                         667
beer_style_id                                                       66
brewery_type_id                                                      4
post_code                                                       34,355
loc_id                                                            6960
name                 Ballast Point Navigator Doppelbock - Brandy Ba...
description          Navigator Doppelbock is our interpretation of ...
abv                                                                 10
calories                                                           300
ibu                                                               33.6
num_ratings                                                         43
overall_rating                                                      99
beer_style_rating                                                  100
mean_r

# Calculate Final Ranking Score (out of 100)

In [891]:
# Create a new dataframe specifically for this test (Dogfish Head 90 min)
beer_data_dfh90 = beer_data

In [892]:
# Appending the rating scores to the existing dataframe
beer_data_dfh90['review_sim'] = PREF_SCORES
beer_data_dfh90['style_sim'] = SIMILARITY_SCORES
beer_data_dfh90['weighted_rating'] = RATING_SCORES

In [893]:
# Placeholders for User-input weights between review similarity, style similarity, and overall rating
REVIEW_WEIGHT = 0.40
STYLE_WEIGHT = 0.40
RATING_WEIGHT = 0.20

In [894]:
# Applying the weights to each of the score categories
beer_data_dfh90['review_sim'] = [i * REVIEW_WEIGHT for i in beer_data_dfh90['review_sim']] 
beer_data_dfh90['style_sim'] = [i * STYLE_WEIGHT for i in beer_data_dfh90['style_sim']]
beer_data_dfh90['weighted_rating'] = [i * RATING_WEIGHT for i in beer_data_dfh90['weighted_rating']] 

In [895]:
# Calculate fina score for ranking beers
beer_data_dfh90['final_score'] = beer_data_dfh90['style_sim'] + beer_data_dfh90['review_sim'] + beer_data_dfh90['weighted_rating']

In [898]:
# Create new dataframe to sort the top scores and display
ranking_dfh90 = beer_data_dfh90.sort_values(by = 'final_score', ascending=False)
ranking_dfh90[['id','name','beer_style_norm','abv','ibu','calories','overall_rating','beer_style_rating','review_sim','style_sim','weighted_rating','final_score']].head(25)

Unnamed: 0,id,name,beer_style_norm,abv,ibu,calories,overall_rating,beer_style_rating,review_sim,style_sim,weighted_rating,final_score
1066,2283,Southampton Abbot 12,Strong Ale,10.0,38.061224,300.0,99,95,38.673899,4.344072,19.618284,62.636256
863,1834,Flossmoor Station Killer Kowalski Baltic Porter,Porter,8.0,42.307692,240.0,98,94,36.845834,4.628179,19.692925,61.166938
1063,2279,Lost Abbey Judgment Day,Strong Ale,10.5,38.061224,315.0,99,96,36.145332,3.511813,19.695775,59.35292
1022,2286,Iron Hill Oak Aged Quad,Strong Ale,10.5,38.061224,315.0,99,97,35.989325,3.637469,19.485654,59.112448
1068,2289,Sierra Nevada Ovila Quad - Plums,Strong Ale,10.2,38.061224,306.0,99,95,34.815302,3.986234,19.599791,58.401328
1731,3963,Stone Sublimely Self Righteous Black IPA,IPA,8.7,90.0,261.0,100,100,0.0,37.339807,19.970035,57.309842
578,1196,Alchemist The Crusher,IPA,9.0,84.1875,270.0,100,99,0.0,36.179439,19.874192,56.053631
572,1189,Trillium Upper Case,IPA,9.0,84.1875,270.0,100,100,0.0,35.946913,19.845335,55.792249
154,299,Alpine Beer Company Keene Idea,IPA,8.88,84.1875,266.0,100,100,0.0,35.836772,19.832414,55.669186
1733,3969,Hill Farmstead Society & Solitude #2,IPA,9.2,84.1875,276.0,100,100,0.0,35.668836,19.849116,55.517952


In [899]:
ranking_dfh90.to_csv('./rank_tables/ranking_dfh90.csv')

# Brewery Mapping (TBU)

In [6]:
# Creating the map object to hold the different layers.

# Starting coordinates to load map view.
start_coordinates = (39.965299, -98.266951)

# Create Map object.
map = folium.Map(location=start_coordinates,
                 zoom_start=5,
                tiles = 'Cartodb Positron',
                control_scale = 'True')

# Create layer group for breweries
col_group = folium.FeatureGroup(name='Breweries').add_to(map)

# Add marker clusters to the feature group for collisions
brewery_cluster = folium.MarkerCluster().add_to(col_group)
for row in brewery_data[0:2226].iterrows():
    # Only plot point if lat/long is available.
    if (not np.isnan(row[1]['lat']) and not np.isnan(row[1]['long'])):
        brewery_metadata = """
                <ul>
                    <li><strong>Brewery Name</strong>: {0}</li>
                    <li><strong>Brewery Type</strong>: {1}</li>
                    <li><strong>City</strong>: {2}</li>
                    <li><strong>State</strong>: {3}</li>
                </ul>""".format(
            str(row[1]['brewery_name']), str(row[1]['brewery_type']),
            str(row[1]['city']), str(row[1]['state']))
        iframe = folium.element.IFrame(html=brewery_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        mark_color = 'red'
        if row[1]['brewery_type'] == 'Microbrewery':
            mark_color = 'blue'
        folium.Marker(
                location = [row[1]['lat'], row[1]['long']],
                icon = folium.Icon(color = mark_color, icon='asterisk'),
                popup=popup).add_to(brewery_cluster)

# Cluster Map

In [7]:
folium.LayerControl().add_to(map)
map

AttributeError: 'DataFrame' object has no attribute 'items'

# Heatmap

In [23]:
# Starting heatmap coordinates.
start_coordinates_HM = start_coordinates
radius = 15
blur = 15
min_opacity = 0.1
max_zoom = 13
max_val = 0.8


# Heatmap
map2 = folium.Map(location=start_coordinates_HM,
                 zoom_start=4,
                tiles = 'Cartodb Positron',
                control_scale = 'True')

data = []

for i in range(0,2226):
    lat = brewery_data.iloc[i]['lat']
    lng = brewery_data.iloc[i]['long']
    rtg = brewery_data.iloc[i]['mean_rating']
    
    row = [lat,lng,rtg]
    data.append(row)
    
# Can adjust radius and max_val to change the heatmap concentrations
HeatMap(data = data, name='Beer Rating', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)

folium.LayerControl().add_to(map2)
map2

In [24]:
data

[[45.459109000000005, -123.80389, nan],
 [26.182160999999997, -80.133409999999998, 4.1500000000000004],
 [39.284607000000001, -76.556960000000004, nan],
 [42.108090000000004, -86.418009999999995, nan],
 [42.347974000000001, -71.044629999999998, nan],
 [45.459109000000005, -123.80389, nan],
 [42.108090000000004, -86.418009999999995, nan],
 [45.459109000000005, -123.80389, nan],
 [45.459109000000005, -123.80389, nan],
 [42.816359999999996, -89.640749999999997, nan],
 [41.551456999999999, -87.501429999999999, nan],
 [39.731285999999997, -104.98306000000001, 4.04],
 [40.645099000000002, -73.945031999999998, nan],
 [37.580345999999999, -77.488309999999998, 3.9700000000000002],
 [42.816359999999996, -89.640749999999997, nan],
 [39.284607000000001, -76.556960000000004, nan],
 [33.348593000000001, -111.80833, 4.0800000000000001],
 [38.927343999999998, -105.18746000000002, 3.9199999999999999],
 [45.459109000000005, -123.80389, 3.9399999999999999],
 [41.428162999999998, -73.127290000000002, 3.97

In [9]:
# Playing around with legend.
colormap = cm.LinearColormap(['blue', 'cyan', 'lime', 'yellow', 'red'],
                           vmin=0, vmax=10)
colormap.caption = 'Bucketed Score (Heatmaps)'
colormap