In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data for this project can be found here: https://www.kaggle.com/nickhould/craft-cans?select=beers.csv 

In [20]:
#Read saved dataset 
df = pd.read_csv('beer_data.csv', na_values=['none'])

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,abv,id,beer_name,beer_style,brewery_id,ounces,brewery_name,city,state
0,0,0.05,1436,Pub Beer,American Pale Lager,408,12.0,10 Barrel Brewing Company,Bend,OR
1,1,0.066,2265,Devil's Cup,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN
2,2,0.071,2264,Rise of the Phoenix,American IPA,177,12.0,18th Street Brewery,Gary,IN
3,3,0.09,2263,Sinister,American Double / Imperial IPA,177,12.0,18th Street Brewery,Gary,IN
4,4,0.075,2262,Sex and Candy,American IPA,177,12.0,18th Street Brewery,Gary,IN


In [22]:
df = df.drop(columns='Unnamed: 0')

#For this test, checking on Texas beers
df_tx = df.loc[df.state == ' TX']

#Creating column to be used as index
df_tx['beer_info'] = df_tx['beer_name']
df_tx = df_tx.set_index('beer_info')

#Converting everything to string for concat
df_tx = df_tx.astype(str)

#Create description column
df_tx['description'] = df_tx['abv'] + ' ' + df_tx['beer_style'] + ' ' + df_tx['brewery_name'] + ' ' + df_tx['city']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tx['beer_info'] = df_tx['beer_name']


In [23]:
df_tx.head()

Unnamed: 0_level_0,abv,id,beer_name,beer_style,brewery_id,ounces,brewery_name,city,state,description
beer_info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Quakertown Stout,0.092,1333,Quakertown Stout,American Double / Imperial Stout,426,12.0,Armadillo Ale Works,Denton,TX,0.092 American Double / Imperial Stout Armadil...
Greenbelt Farmhouse Ale,0.051,1332,Greenbelt Farmhouse Ale,Saison / Farmhouse Ale,426,12.0,Armadillo Ale Works,Denton,TX,0.051 Saison / Farmhouse Ale Armadillo Ale Wor...
Heavy Machinery IPA Series #1: Heavy Fist,0.07,1409,Heavy Machinery IPA Series #1: Heavy Fist,American Black Ale,413,16.0,Austin Beerworks,Austin,TX,0.07 American Black Ale Austin Beerworks Austin
Fire Eagle IPA,0.062,343,Fire Eagle IPA,American IPA,413,12.0,Austin Beerworks,Austin,TX,0.062 American IPA Austin Beerworks Austin
Peacemaker,0.051,342,Peacemaker,American Pale Ale (APA),413,12.0,Austin Beerworks,Austin,TX,0.051 American Pale Ale (APA) Austin Beerworks...


In [24]:
#Dropping everything but beer_name and description for next step
df_tx_vect = df_tx[['beer_name', 'description']]

df_tx_vect = df_tx_vect.reset_index(drop = True)

In [25]:
#Set up vectorizer
tfidfvec = TfidfVectorizer(min_df=2, max_df=0.8)

In [26]:
#Apply and convert to array
vectorized_df = tfidfvec.fit_transform(df_tx_vect.description)

vectorized_df = vectorized_df.toarray()

In [27]:
#Create dataframe and re-set index
tfidf_df = pd.DataFrame(vectorized_df, columns=tfidfvec.get_feature_names())

tfidf_df.index = df_tx_vect['beer_name']

In [28]:
tfidf_df.head()

Unnamed: 0_level_0,042,044000000000000004,046,047,048,049,05,051,052000000000000005,053,...,station,stout,texas,texian,twisted,uncle,wheat,witbier,works,worth
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Quakertown Stout,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.349532,0.0,0.0,0.0,0.0,0.0,0.0,0.408864,0.0
Greenbelt Farmhouse Ale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319097,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438326,0.0
Heavy Machinery IPA Series #1: Heavy Fist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fire Eagle IPA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peacemaker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.398,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#Set up cosine similarity on tfidf dataframe
cosine_similarity_array = cosine_similarity(tfidf_df)

In [30]:
cosine_sim_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)

In [31]:
cosine_sim_df.head()

beer_name,Quakertown Stout,Greenbelt Farmhouse Ale,Heavy Machinery IPA Series #1: Heavy Fist,Fire Eagle IPA,Peacemaker,Pearl-Snap,Black Thunder,La Frontera Premium IPA,Tejas Lager,Number 22 Porter,...,Bombshell Blonde,First Stand,Battle LIne,Broken Bridge,Brutus,Cow Creek,Chupahopra,Twisted X,The Green Room,Humbucker Helles
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Quakertown Stout,1.0,0.58747,0.051601,0.028643,0.054441,0.0,0.0,0.025752,0.0,0.024019,...,0.059602,0.027486,0.053234,0.0,0.028336,0.01991,0.020827,0.021499,0.01804,0.0
Greenbelt Farmhouse Ale,0.58747,1.0,0.063145,0.0,0.193621,0.0,0.0,0.0,0.0,0.0,...,0.072936,0.296116,0.065143,0.0,0.060756,0.0,0.0,0.116831,0.0,0.0
Heavy Machinery IPA Series #1: Heavy Fist,0.051601,0.063145,1.0,0.662641,0.580038,0.450723,0.598983,0.032638,0.0,0.030441,...,0.075539,0.034836,0.067468,0.0,0.035913,0.025234,0.026396,0.027247,0.140199,0.104002
Fire Eagle IPA,0.028643,0.0,0.662641,1.0,0.699116,0.582821,0.774533,0.170908,0.0,0.039363,...,0.04193,0.0,0.03745,0.0,0.106519,0.03263,0.138225,0.035232,0.271452,0.134482
Peacemaker,0.054441,0.193621,0.580038,0.699116,1.0,0.475532,0.631954,0.034434,0.0,0.032117,...,0.079697,0.036753,0.071182,0.0,0.12219,0.026623,0.027849,0.174466,0.147916,0.109726


In [32]:
#Testing for Little Boss, local Austin beer
cosine_sim_series = cosine_sim_df.loc['Little Boss']

beers_simliar_to_Little_Boss = cosine_sim_series.sort_values(ascending= False)

In [33]:
beers_simliar_to_Little_Boss.head(15)

beer_name
Little Boss                         1.000000
Spirit Animal                       0.846100
Professor Black                     0.742843
Van Dayum!                          0.721066
Pete's Wicked Summer Brew (2002)    0.358466
Pete's Wicked Summer Brew (1996)    0.358466
Pete's Wicked Summer Brew (1997)    0.358466
Pete's Wicked Summer Brew (1995)    0.358466
Pete's Wicked Pale Ale (1997)       0.357433
Peacemaker                          0.305928
Monarch Classic American Wheat      0.278968
Special Release                     0.274249
Power & Light                       0.269154
Slow Ride                           0.233216
Fire Eagle IPA                      0.220659
Name: Little Boss, dtype: float64