In [1]:
# a simple content-based recommendation engine using beer and brewery data from Kaggle

In [2]:
import pandas as pd
import numpy as np
import csv
import statsmodels
import sklearn
from random import randint
import redis
from sklearn.metrics.pairwise import linear_kernel
from sklearn import linear_model
import math

In [133]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
beers = pd.read_csv('beers.csv')
breweries = pd.read_csv('breweries.csv')

In [25]:
del beers["Unnamed: 0"]
del breweries["Unnamed: 0"]

In [26]:
beers.head()

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [27]:
breweries.head()

Unnamed: 0,name,city,state
0,NorthGate Brewing,Minneapolis,MN
1,Against the Grain Brewery,Louisville,KY
2,Jack's Abby Craft Lagers,Framingham,MA
3,Mike Hess Brewing Company,San Diego,CA
4,Fort Point Beer Company,San Francisco,CA


In [28]:
beers_table = beers.join(breweries, on="brewery_id", lsuffix = "_beer")

In [29]:
beers_table.head()

Unnamed: 0,abv,ibu,id,name_beer,style,brewery_id,ounces,name,city,state
0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0,10 Barrel Brewing Company,Bend,OR
1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN
2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0,18th Street Brewery,Gary,IN
3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0,18th Street Brewery,Gary,IN
4,0.075,,2262,Sex and Candy,American IPA,177,12.0,18th Street Brewery,Gary,IN


In [30]:
#get the most common words from beer name
pd.Series(' '.join(beers_table['name_beer']).lower().split()).value_counts()[45:60]

(2009)         16
light          16
oktoberfest    16
barrel         15
(2014)         14
island         14
mountain       14
kölsch         14
pumpkin        14
beach          14
honey          14
special        13
dark           13
&              13
cream          13
dtype: int64

In [31]:
#get all the styles to control for colinearity
pd.Series.unique(beers_table['style'])

array(['American Pale Lager', 'American Pale Ale (APA)', 'American IPA',
       'American Double / Imperial IPA', 'Oatmeal Stout',
       'American Porter', 'Saison / Farmhouse Ale', 'Belgian IPA', 'Cider',
       'Baltic Porter', 'Tripel', 'American Barleywine', 'Winter Warmer',
       'American Stout', 'Fruit / Vegetable Beer', 'English Strong Ale',
       'American Black Ale', 'Belgian Dark Ale', 'American Blonde Ale',
       'American Amber / Red Ale', 'Berliner Weissbier',
       'American Brown Ale', 'American Pale Wheat Ale',
       'Belgian Strong Dark Ale', 'K\xc3\xb6lsch', 'English Pale Ale',
       'American Amber / Red Lager', 'English Barleywine',
       'Milk / Sweet Stout', 'German Pilsener', 'Pumpkin Ale',
       'Belgian Pale Ale', 'American Pilsner', 'American Wild Ale',
       'English Brown Ale', 'Altbier', 'California Common / Steam Beer',
       'Gose', 'Cream Ale', 'Vienna Lager', 'Witbier',
       'American Double / Imperial Stout', 'Munich Helles Lager',
      

In [32]:
len(pd.Series.unique(beers_table['style']))

100

In [33]:
#pick some non-trivial, non-1:1 words
key_words = [
    'Amber',
    'Stout',
    'Blonde',
    'Wheat',
    'Brown',
    'Red',
    'Hop',
    'Summer',
    'Black',
    'White',
    'American',
    'Point',
    'Session',
    'Old',
    'Big',
    'Winter',
    'Belgian',
    'River',
    'Saison',
    "Dale's",
    'Great',
    'Light',
    'Oktoberfest',
    'Barrel',
    'Island',
    'Mountain'
]

In [34]:
len(beers_table)

2410

In [35]:
#create dummy variables
df_styles = pd.get_dummies(beers_table['style'])
df_state = pd.get_dummies(beers_table['state'])

df_new = pd.concat([beers_table, df_styles, df_state], axis=1)

In [36]:
df_new.head()

Unnamed: 0,abv,ibu,id,name_beer,style,brewery_id,ounces,name,city,state,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0,10 Barrel Brewing Company,Bend,OR,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0,18th Street Brewery,Gary,IN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0,18th Street Brewery,Gary,IN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.075,,2262,Sex and Candy,American IPA,177,12.0,18th Street Brewery,Gary,IN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
#create dummy variables for beer key words
def kwTest(kw):
    def kwCondition(row):
        if kw in row['name_beer']:
            val = 1
        else:
            val = 0
        return val
    return kwCondition

for word in key_words:
    df_new["keyword_" + word] = df_new.apply(kwTest(word), axis=1)

In [38]:
df_new.head()

Unnamed: 0,abv,ibu,id,name_beer,style,brewery_id,ounces,name,city,state,...,keyword_Belgian,keyword_River,keyword_Saison,keyword_Dale's,keyword_Great,keyword_Light,keyword_Oktoberfest,keyword_Barrel,keyword_Island,keyword_Mountain
0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0,10 Barrel Brewing Company,Bend,OR,...,0,0,0,0,0,0,0,0,0,0
1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN,...,0,0,0,0,0,0,0,0,0,0
2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0,18th Street Brewery,Gary,IN,...,0,0,0,0,0,0,0,0,0,0
3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0,18th Street Brewery,Gary,IN,...,0,0,0,0,0,0,0,0,0,0
4,0.075,,2262,Sex and Candy,American IPA,177,12.0,18th Street Brewery,Gary,IN,...,0,0,0,0,0,0,0,0,0,0


In [158]:
#test keyword assignments
kwTest = df_new["keyword_Mountain"] == 1
df_new[kwTest].head()

Unnamed: 0,abv,ibu,id,name_beer,style,brewery_id,ounces,name,city,state,...,keyword_Belgian,keyword_River,keyword_Saison,keyword_Dale's,keyword_Great,keyword_Light,keyword_Oktoberfest,keyword_Barrel,keyword_Island,keyword_Mountain
285,0.046,20.0,2484,Bomber Mountain Amber Ale,American Amber / Red Ale,79,12.0,Black Tooth Brewing Company,Sheridan,WY,...,0,0,0,0,0,0,0,0,0,1
312,0.053,22.0,1545,Blue Mountain Classic Lager,Euro Pale Lager,382,12.0,Blue Mountain Brewery,Afton,VA,...,0,0,0,0,0,0,0,0,0,1
659,0.052,25.0,356,Crazy Mountain Amber Ale,American Amber / Red Ale,63,12.0,Crazy Mountain Brewing Company,Edwards,CO,...,0,0,0,0,0,0,0,0,0,1
913,0.055,40.0,2205,Mountain Rescue Pale Ale,American Pale Ale (APA),194,12.0,Goodlife Brewing Co.,Bend,OR,...,0,0,0,0,0,0,0,0,0,1
2146,0.046,20.0,1200,Bomber Mountain Amber Ale (2013),American Amber / Red Ale,457,12.0,The Black Tooth Brewing Company,Sheridan,WY,...,0,0,0,0,0,0,0,0,0,1


In [52]:
#more data cleaning

#just ibu >= 0
df_new = df_new[df_new['ibu'] >= 0]
#need abv
df = df_new[df_new.abv >= 0]
#make index column
#df.reset_index(level=0, inplace=True)

In [58]:
df.reset_index(level=0, inplace=True)

In [56]:
len(df)

1405

In [172]:
#define profile
likes = []
dislikes = []

#randomly generated example
i = 0
while i < 10:
    new_number = randint(0,1000)
    if new_number in likes:
        continue
    else:
        likes.append(new_number)
        i += 1
print likes
i = 0
while i < 10:
    new_number = randint(0,1000)
    if new_number in likes:
        continue
    if new_number in dislikes:
        continue
    else:
        dislikes.append(new_number)
        i += 1
print dislikes

profile = (likes,dislikes)

[805, 280, 837, 679, 11, 636, 908, 49, 491, 720]
[783, 887, 782, 771, 731, 197, 263, 405, 696, 186]


In [173]:
#organize training data and test data
training_data = df.copy()
test_data = df.copy()
def likeTest(a,b):
    def likeCondition(row):
        if row['level_0'] in a:
            #likes get a 1
            #print row['index']
            val = 1
        elif row['level_0'] in b:
            #dislikes get a 0
            val = 0
        else:
            val = None
        return val
    return likeCondition
training_data["like"] = training_data.apply(likeTest(likes,dislikes), axis=1)
test_data["like"] = test_data.apply(likeTest(likes,dislikes), axis=1)


# test condition in brackets to separate into training and test data
training_data = training_data[~training_data['like'].isnull()]
test_data = test_data[test_data['like'].isnull()]


#df = df[~df['data'].isnull()]

In [174]:
#validating training data vs. test data

print len(likes), " should be 10"
print len(dislikes), " should be 10"
print len(training_data), "should be 20"
print len(training_data[training_data["like"] == 1]), "should be 10"
print len(training_data[training_data["like"] == 0]), "should be 10"

10  should be 10
10  should be 10
20 should be 20
10 should be 10
10 should be 10


In [175]:
len(training_data) + len(test_data) == len(df)

True

In [176]:
del training_data['index']
del test_data['index']

In [177]:
#run classifier
x = training_data[[1,2] + range(11,187)].as_matrix()
y = training_data["like"].values
clf = MultinomialNB()
clf.fit(x, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [178]:
score = []

test_data.iterrows()

for index, row in test_data.iterrows():
    numbers = []
    for column_index in [1,2] + range(11,187):
        numbers.append(row[column_index])
    score.append(clf.predict_proba([numbers]))

In [179]:
score_list = []

for w in score:
    score_list.append(w.tolist()[0][0])

In [180]:
#average score
sum(score_list)/len(score_list)

0.8320453535875383

In [181]:
test_data['score'] = score_list

In [182]:
#top 10 results

test_data[['name_beer','style','name','score']].sort_values(['score'],ascending=False).head(10)

Unnamed: 0,name_beer,style,name,score
88,Bitter Bitch Imperial IPA,American Double / Imperial IPA,Astoria Brewing Company,0.999865
784,Blazing World,American Amber / Red Ale,Modern Times Beer,0.999755
1253,Heady Topper,American Double / Imperial IPA,The Alchemist,0.999729
1254,Heady Topper,American Double / Imperial IPA,The Alchemist,0.999729
1123,Red Cockaded Ale (2013),American Double / Imperial IPA,Southern Star Brewing Company,0.999577
1121,Red Cockaded Ale,American Double / Imperial IPA,Southern Star Brewing Company,0.999577
242,More Cowbell,American Double / Imperial IPA,Buffalo Bayou Brewing Company,0.99954
310,Bay of Bengal Double IPA (2014),American Double / Imperial IPA,Christian Moerlein Brewing Company,0.999437
1393,Troopers Alley IPA,American IPA,Wolf Hills Brewing Company,0.999355
269,Dead-Eye DIPA,American Double / Imperial IPA,Cape Ann Brewing Company,0.99912


In [185]:
#compare to liked beers
training_data[['name_beer','style','name']][training_data['like']==1].head(10)

Unnamed: 0,name_beer,style,name
11,Hop Crisis,American Double / Imperial IPA,21st Amendment Brewery
49,The Brown Note,English Brown Ale,Against the Grain Brewery
280,Mutiny IPA,American IPA,Capital Brewery
491,Beaver Logger,American Pale Lager,Gore Range Brewery
636,House Lager,Keller Bier / Zwickel Bier,Jack's Abby Craft Lagers
679,Lancaster German Style Kölsch,Kölsch,Lancaster Brewing Company
720,Summer Paradise,American Pale Wheat Ale,Manayunk Brewing Company
805,Narragansett Bock,Bock,Narragansett Brewing Company
837,Wall's End,English Brown Ale,NorthGate Brewing
908,Cache La Porter,American Porter,Pateros Creek Brewing Company
