In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

from sklearn.metrics.pairwise import cosine_similarity #if using text based similarity

Data for this project can be found here: https://www.kaggle.com/nickhould/craft-cans?select=beers.csv 

In [2]:
beer_df = pd.read_csv('beers.csv', na_values=['none'])
brewery_df = pd.read_csv('breweries.csv', na_values=['none'])

In [3]:
beer_df.head()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [4]:
brewery_df.head()

Unnamed: 0.1,Unnamed: 0,name,city,state
0,0,NorthGate Brewing,Minneapolis,MN
1,1,Against the Grain Brewery,Louisville,KY
2,2,Jack's Abby Craft Lagers,Framingham,MA
3,3,Mike Hess Brewing Company,San Diego,CA
4,4,Fort Point Beer Company,San Francisco,CA


In [5]:
#Checking to see that brewery_id matches with 'Unnamed:0' in brewery dataframe
beer_df.loc[beer_df.brewery_id == 1]

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
69,69,0.08,68.0,2686,Citra Ass Down,American Double / Imperial IPA,1,16.0
70,70,0.125,80.0,2685,London Balling,English Barleywine,1,16.0
71,71,0.077,25.0,2684,35 K,Milk / Sweet Stout,1,16.0
72,72,0.042,42.0,2683,A Beer,American Pale Ale (APA),1,16.0
73,73,0.05,25.0,2682,Rules are Rules,German Pilsener,1,16.0
74,74,0.066,21.0,2681,Flesh Gourd'n,Pumpkin Ale,1,16.0
75,75,0.04,13.0,2680,Sho'nuff,Belgian Pale Ale,1,16.0
76,76,0.055,17.0,2679,Bloody Show,American Pilsner,1,16.0
77,77,0.076,68.0,2678,Rico Sauvin,American Double / Imperial IPA,1,16.0
78,78,0.051,38.0,2677,Coq de la Marche,Saison / Farmhouse Ale,1,16.0


In [6]:
#Rename column so that we can merge the two datasets
brewery_df = brewery_df.rename(columns={'Unnamed: 0':'brewery_id'})

In [7]:
#Merging dataframes
df = pd.merge(beer_df, brewery_df, on='brewery_id')

#Renaming some columns for clarity
df = df.rename(columns={'name_x':'beer_name'})
df = df.rename(columns={'name_y':'brewery_name'})
df = df.rename(columns={'style': 'beer_style'})

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,beer_name,beer_style,brewery_id,ounces,brewery_name,city,state
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0,10 Barrel Brewing Company,Bend,OR
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0,18th Street Brewery,Gary,IN
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0,18th Street Brewery,Gary,IN
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0,18th Street Brewery,Gary,IN


In [9]:
#Check for missing values
print(df.isnull().sum())

#print(len(df.ibu))

Unnamed: 0         0
abv               62
ibu             1005
id                 0
beer_name          0
beer_style         5
brewery_id         0
ounces             0
brewery_name       0
city               0
state              0
dtype: int64


In [10]:
#Dropping ibu due to high number of missing values. Dropping Unnamed: 0 as no helpful information.
df = df.drop(columns=['ibu','Unnamed: 0'])

#Saving dataframe to csv so that can used in additional modeling
#df = df.to_csv('beer_data.csv')

In [11]:
df_cross_beer_style = pd.crosstab(df.beer_name, df.beer_style)

In [12]:
df_cross_beer_style.head(5)

beer_style,Abbey Single Ale,Altbier,American Adjunct Lager,American Amber / Red Ale,American Amber / Red Lager,American Barleywine,American Black Ale,American Blonde Ale,American Brown Ale,American Dark Wheat Ale,...,Schwarzbier,Scotch Ale / Wee Heavy,Scottish Ale,Shandy,Smoked Beer,Tripel,Vienna Lager,Wheat Ale,Winter Warmer,Witbier
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#001 Golden Amber Lager,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#002 American I.P.A.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#003 Brown & Robust Porter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#004 Session I.P.A.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#Testing w/local Austin beer
pearl_snap_cross = df_cross_beer_style[df_cross_beer_style.index == 'Pearl-Snap']
#print(pearl_snap_cross)

In [14]:
#Calcuate distance between all beers

#Calculate jaccard metric
jaccard_distance = pdist(df_cross_beer_style.values, metric='jaccard')

#Squareform for processing in square matrix
squared_jaccard_distance = squareform(jaccard_distance)

#Find similarity
similarity_array = 1- squared_jaccard_distance

#Make dataframe
beer_style_similarity_df = pd.DataFrame(similarity_array, index=df_cross_beer_style.index, columns=df_cross_beer_style.index)

In [15]:
beer_style_similarity_df.head(5)

beer_name,#001 Golden Amber Lager,#002 American I.P.A.,#003 Brown & Robust Porter,#004 Session I.P.A.,#9,077XX,10 Degrees of Separation,10 Ton,113 IPA,11th Hour IPA,...,Yoshi's Nectar,"You're My Boy, Blue",Yunkin' Punkin',Zaison,Zaison (2012),Zen,Zombie Monkie,Zonker Stout,oSKAr the G'Rauch,"the Kimmie, the Yink and the Holy Gose"
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#001 Golden Amber Lager,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#002 American I.P.A.,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#003 Brown & Robust Porter,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
#004 Session I.P.A.,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#Testing recommendations for an OKC craft beer
arjuna_similarity = beer_style_similarity_df['Arjuna']

arjuna_similarity = pd.DataFrame(arjuna_similarity.sort_values(ascending = False))

In [17]:
#15 recommendations for Arjuna
arjuna_similarity.head(15)

Unnamed: 0_level_0,Arjuna
beer_name,Unnamed: 1_level_1
Great Crescent Belgian Style Wit,1.0
Upland Wheat Ale,1.0
Arjuna,1.0
Pinata Protest,1.0
Plum Island Belgian White,1.0
Cold Snap,1.0
Lost Meridian Wit,1.0
Cotton Mouth,1.0
What the Butler Saw,1.0
White (2015),1.0
