In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data for this project can be found here: https://www.kaggle.com/nickhould/craft-cans?select=beers.csv 

In [2]:
#Load the saved beer dataset; cells holding the literal text 'none' are read as missing values
df = pd.read_csv(
    'beer_data.csv',
    na_values=['none'],
)

In [4]:
#df.head()

In [5]:
#Remove the leftover saved-index column from the CSV
df = df.drop(columns='Unnamed: 0')

#Creating column to be used as index
df['beer_info'] = df['beer_name']
df = df.set_index('beer_info')

#Columns whose text is concatenated into the description used for vectorizing
description_cols = ['abv', 'beer_style', 'brewery_name', 'city', 'state']

#Remember which fields are genuinely missing BEFORE the string cast below:
#astype(str) turns NaN into the literal text 'nan', which would otherwise end up
#in the description and act as a spurious shared token between unrelated beers
is_present = df[description_cols].notna().to_numpy()

#Converting everything to string for concat
df = df.astype(str)

#Create description column: join the available fields with single spaces,
#leaving out missing ones (split/join collapses any repeated whitespace)
df['description'] = (
    df[description_cols]
    .where(is_present, '')
    .agg(' '.join, axis=1)
    .str.split()
    .str.join(' ')
)

In [6]:
#Preview the first five rows, including the new description column
df.head()

Unnamed: 0_level_0,abv,id,beer_name,beer_style,brewery_id,ounces,brewery_name,city,state,description
beer_info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Pub Beer,0.05,1436,Pub Beer,American Pale Lager,408,12.0,10 Barrel Brewing Company,Bend,OR,0.05 American Pale Lager 10 Barrel Brewing Com...
Devil's Cup,0.066,2265,Devil's Cup,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN,0.066 American Pale Ale (APA) 18th Street Brew...
Rise of the Phoenix,0.071,2264,Rise of the Phoenix,American IPA,177,12.0,18th Street Brewery,Gary,IN,0.071 American IPA 18th Street Brewery Gary IN
Sinister,0.09,2263,Sinister,American Double / Imperial IPA,177,12.0,18th Street Brewery,Gary,IN,0.09 American Double / Imperial IPA 18th Stree...
Sex and Candy,0.075,2262,Sex and Candy,American IPA,177,12.0,18th Street Brewery,Gary,IN,0.075 American IPA 18th Street Brewery Gary IN


In [7]:
#Keep only the beer name and its description, and replace the beer_info index
#with a fresh 0..n-1 positional index for the vectorizing step
df_vect = df.loc[:, ['beer_name', 'description']].reset_index(drop=True)

In [8]:
#Configure the TF-IDF vectorizer
tfidfvec = TfidfVectorizer(
    min_df=2,    #ignore terms that appear in fewer than 2 descriptions
    max_df=0.8,  #ignore terms that appear in more than 80% of descriptions
)

In [9]:
#Fit the vectorizer on the descriptions, transform them, and densify the
#resulting sparse matrix into a regular array in one pass
vectorized_df = tfidfvec.fit_transform(df_vect['description']).toarray()

In [10]:
#Look up the vocabulary terms for the column labels.
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
#prefer get_feature_names_out() and only fall back on older installations.
if hasattr(tfidfvec, 'get_feature_names_out'):
    feature_names = tfidfvec.get_feature_names_out()
else:
    feature_names = tfidfvec.get_feature_names()

#Create dataframe and re-set index
tfidf_df = pd.DataFrame(vectorized_df, columns=feature_names)

#Label each row with its beer name (rows are in the same order as df_vect)
tfidf_df.index = df_vect['beer_name']

In [11]:
#Preview the first five rows of the TF-IDF matrix
tfidf_df.head(5)

Unnamed: 0_level_0,027000000000000003,032,035,037000000000000005,038,039,04,040999999999999995,042,043,...,woodstock,works,worth,worthy,wv,wy,wynkoop,yakima,york,zwickel
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pub Beer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Devil's Cup,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rise of the Phoenix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sinister,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sex and Candy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#Pairwise cosine similarity between every pair of rows (beers) of the tfidf dataframe
cosine_similarity_array = cosine_similarity(X=tfidf_df)

In [13]:
#Wrap the similarity matrix in a dataframe labelled by beer name on both axes
cosine_sim_df = pd.DataFrame(
    data=cosine_similarity_array,
    index=tfidf_df.index,
    columns=tfidf_df.index,
)

In [14]:
#Preview the first five rows of the beer-by-beer similarity table
cosine_sim_df.head()

beer_name,Pub Beer,Devil's Cup,Rise of the Phoenix,Sinister,Sex and Candy,Black Exodus,Lake Street Express,Foreman,Jade,Cone Crusher,...,Worthy Pale,Patty's Chile Beer,Colorojo Imperial Red Ale,Wynkoop Pumpkin Ale,Rocky Mountain Oyster Stout,Belgorado,Rail Yard Ale,B3K Black Lager,Silverback Pale Ale,Rail Yard Ale (2009)
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pub Beer,1.0,0.074297,0.018016,0.016634,0.018534,0.0,0.077515,0.018259,0.078608,0.015817,...,0.567431,0.034705,0.057794,0.040192,0.061436,0.038921,0.060409,0.039153,0.133133,0.060409
Devil's Cup,0.074297,1.0,0.72608,0.670403,0.746977,0.62927,0.851652,0.735877,0.863664,0.637488,...,0.148148,0.0,0.035858,0.023216,0.014517,0.0,0.03748,0.0,0.162108,0.03748
Rise of the Phoenix,0.018016,0.72608,1.0,0.732375,0.816028,0.656224,0.757527,0.767398,0.768211,0.696418,...,0.014239,0.0,0.014242,0.0,0.015139,0.039197,0.014886,0.0,0.01558,0.014886
Sinister,0.016634,0.670403,0.732375,1.0,0.753453,0.605904,0.699438,0.708552,0.709303,0.798636,...,0.013147,0.0,0.01315,0.0,0.013978,0.036191,0.013745,0.0,0.014386,0.013745
Sex and Candy,0.018534,0.746977,0.816028,0.753453,1.0,0.675111,0.779329,0.789484,0.790321,0.716461,...,0.014648,0.0,0.014652,0.0,0.193775,0.040325,0.015315,0.0,0.016029,0.015315


In [15]:
#Sanity check with Arjuna, a local OKC beer: pull its row of similarity scores
cosine_sim_series = cosine_sim_df.loc['Arjuna', :]

#Rank every beer from most to least similar
beers_simliar_to_Arjuna = cosine_sim_series.sort_values(ascending=False)

In [17]:
#Show the ten highest-scoring beers (the top entry is Arjuna itself)
beers_simliar_to_Arjuna.head(n=10)

beer_name
Arjuna                       1.000000
Uroboros                     0.775140
Golden One                   0.765317
F5 IPA                       0.466419
Gran Sport                   0.458117
Horny Toad Cerveza (2013)    0.445524
Horny Toad Cerveza           0.445524
Native Amber (2013)          0.432591
Native Amber                 0.432591
RoughTail IPA                0.291590
Name: Arjuna, dtype: float64

In [18]:
#Basic recommender based on user input
beer_drinker = input()

#Try the name exactly as typed first; if that fails, retry without stray
#leading/trailing whitespace before giving up
if beer_drinker not in cosine_sim_df.index:
    beer_drinker = beer_drinker.strip()

if beer_drinker not in cosine_sim_df.index:
    #Unknown name: explain instead of surfacing a raw KeyError
    print(f"No beer named {beer_drinker!r} found; check the spelling and capitalization.")
else:
    cosine_sim_bd = cosine_sim_df.loc[beer_drinker]

    #When several beers share the same name, .loc returns one row per match
    #(a DataFrame), which sort_values() cannot rank without a 'by' argument;
    #use the first matching beer's scores
    if isinstance(cosine_sim_bd, pd.DataFrame):
        cosine_sim_bd = cosine_sim_bd.iloc[0]

    #Rank all beers from most to least similar and show the top five
    #(the queried beer itself comes first with a score of 1.0)
    bd_recommend = cosine_sim_bd.sort_values(ascending=False)

    print(bd_recommend.head(5))

F5 IPA
beer_name
F5 IPA                       1.000000
Gran Sport                   0.845909
Horny Toad Cerveza (2013)    0.838829
Horny Toad Cerveza           0.838829
Native Amber (2013)          0.814478
Name: F5 IPA, dtype: float64
