In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

%matplotlib inline
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

import gensim, logging, warnings
import gensim.corpora as corpora
from gensim import corpora, models
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [19]:
lemmatizer = WordNetLemmatizer()

In [3]:
stop_words.extend(['look', 'smell', 'taste', 'feel', 'overall', 'rdev', 'beer', 'beers','nice','adams', 'jan', 'feb', 'mar',
                  'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'ml', 'oz', 'date'])

In [12]:
def to_string(list_):
    return " ".join(list_)

In [20]:
def clean_review(review):
    """takes in a review and does the following:
    1. removes the '\xa0'
    2. removes the '\n'
    3. performs the simple preprocess from gensim"""
    review = review.replace(u'\\xa0', '')
    review = review.replace('\\n', '')
    review = re.sub(r'(characters){1}\S*', '', review)
    review = gensim.utils.simple_preprocess(str(review), deacc=True)
    review = [word for word in review if word not in stop_words]
    meta_lemmed = [lemmatizer.lemmatize(word, pos='v') for word in review]
#     c = " ".join(str(x) for x in meta_lemmed)
#     cleaned.append(c)
    return meta_lemmed

In [4]:
df = pd.read_csv('mvp_df.csv', index_col=0)

In [5]:
df.head()

Unnamed: 0,abv,avail,avg_score,ba_score,brewery,img,location,name,no_ratings,no_reviews,review,style,url,taste_avg,look_avg,smell_avg,feel_avg,clean_review
0,5.6,Winter,3.6,82.0,Boston Beer Company (Samuel Adams),https://cdn.beeradvocate.com/im/beers/101.jpg,Massachusetts,Samuel Adams Winter Lager,6210,2048,['3.31/5\xa0\xa0rDev -8.1%look: 4 | smell: 3.5...,German Bock,https://www.beeradvocate.com/beer/profile/35/101/,3.56,3.76,3.48,3.48,"['alright', 'one', 'depend', 'great', 'deep', ..."
1,4.4,Year-round,3.19,73.0,Spoetzl Brewery,https://cdn.beeradvocate.com/im/beers/101.jpg,Texas,Shiner Bock,4331,1163,"[""3.19/5\xa0\xa0rDev 0%look: 3.75 | smell: 2.7...",German Bock,https://www.beeradvocate.com/beer/profile/143/...,3.47,3.76,3.31,3.31,"['first', 'sample', 'visit', 'father', 'texas'..."
2,5.8,Rotating,3.66,83.0,Boston Beer Company (Samuel Adams),https://cdn.beeradvocate.com/im/beers/101.jpg,Massachusetts,Samuel Adams Chocolate Bock,3101,1225,['3.78/5\xa0\xa0rDev +3.3%look: 3.75 | smell: ...,German Bock,https://www.beeradvocate.com/beer/profile/35/1...,3.65,3.64,3.59,3.59,"['black', 'color', 'little', 'light', 'penetra..."
3,5.2,Year-round,2.89,68.0,Anheuser-Busch,https://cdn.beeradvocate.com/im/beers/101.jpg,Missouri,Michelob AmberBock,1906,707,"[""3.09/5\xa0\xa0rDev +6.9%look: 3.5 | smell: 2...",German Bock,https://www.beeradvocate.com/beer/profile/29/1...,3.24,3.62,3.27,3.27,"['bottle', 'abv', 'purchase', 'cold', 'single'..."
4,5.8,Rotating,3.5,80.0,Boston Beer Company (Samuel Adams),https://cdn.beeradvocate.com/im/beers/101.jpg,Massachusetts,Samuel Adams Cherry Chocolate Bock,873,187,['3/5\xa0\xa0rDev -14.3%look: 3 | smell: 3 | t...,German Bock,https://www.beeradvocate.com/beer/profile/35/6...,3.28,3.69,3.62,3.62,"['initial', 'whiff', 'dr', 'pepper', 'cherry',..."


In [21]:
df['clean_review'] = df['review'].apply(clean_review)

In [22]:
df['key_words'] = df['clean_review'].apply(to_string)

In [23]:
df.head()

Unnamed: 0,abv,avail,avg_score,ba_score,brewery,img,location,name,no_ratings,no_reviews,review,style,url,taste_avg,look_avg,smell_avg,feel_avg,clean_review,key_words
0,5.6,Winter,3.6,82.0,Boston Beer Company (Samuel Adams),https://cdn.beeradvocate.com/im/beers/101.jpg,Massachusetts,Samuel Adams Winter Lager,6210,2048,['3.31/5\xa0\xa0rDev -8.1%look: 4 | smell: 3.5...,German Bock,https://www.beeradvocate.com/beer/profile/35/101/,3.56,3.76,3.48,3.48,"[alright, one, depend, great, deep, copper, mo...",alright one depend great deep copper molasses ...
1,4.4,Year-round,3.19,73.0,Spoetzl Brewery,https://cdn.beeradvocate.com/im/beers/101.jpg,Texas,Shiner Bock,4331,1163,"[""3.19/5\xa0\xa0rDev 0%look: 3.75 | smell: 2.7...",German Bock,https://www.beeradvocate.com/beer/profile/143/...,3.47,3.76,3.31,3.31,"[first, sample, visit, father, texas, earlier,...",first sample visit father texas earlier year k...
2,5.8,Rotating,3.66,83.0,Boston Beer Company (Samuel Adams),https://cdn.beeradvocate.com/im/beers/101.jpg,Massachusetts,Samuel Adams Chocolate Bock,3101,1225,['3.78/5\xa0\xa0rDev +3.3%look: 3.75 | smell: ...,German Bock,https://www.beeradvocate.com/beer/profile/35/1...,3.65,3.64,3.59,3.59,"[black, color, little, light, penetration, fin...",black color little light penetration fingernai...
3,5.2,Year-round,2.89,68.0,Anheuser-Busch,https://cdn.beeradvocate.com/im/beers/101.jpg,Missouri,Michelob AmberBock,1906,707,"[""3.09/5\xa0\xa0rDev +6.9%look: 3.5 | smell: 2...",German Bock,https://www.beeradvocate.com/beer/profile/29/1...,3.24,3.62,3.27,3.27,"[bottle, abv, purchase, cold, single, pour, fl...",bottle abv purchase cold single pour flute gob...
4,5.8,Rotating,3.5,80.0,Boston Beer Company (Samuel Adams),https://cdn.beeradvocate.com/im/beers/101.jpg,Massachusetts,Samuel Adams Cherry Chocolate Bock,873,187,['3/5\xa0\xa0rDev -14.3%look: 3 | smell: 3 | t...,German Bock,https://www.beeradvocate.com/beer/profile/35/6...,3.28,3.69,3.62,3.62,"[initial, whiff, dr, pepper, cherry, die, flav...",initial whiff dr pepper cherry die flavor give...


In [26]:
df.columns

Index(['abv', 'avail', 'avg_score', 'ba_score', 'brewery', 'img', 'location',
       'name', 'no_ratings', 'no_reviews', 'review', 'style', 'url',
       'taste_avg', 'look_avg', 'smell_avg', 'feel_avg', 'clean_review',
       'key_words'],
      dtype='object')

In [27]:
df.drop(['abv', 'avail', 'avg_score', 'ba_score', 'brewery', 'img', 'location',
         'no_ratings', 'no_reviews', 'review', 'style', 'url',
       'taste_avg', 'look_avg', 'smell_avg', 'feel_avg'], inplace=True, axis=1)

In [33]:
df.drop(['clean_review'], axis=1, inplace=True)

In [36]:
df.set_index(['name'], inplace=True, drop=True)

In [38]:
"Shiner Bock"

'Shiner Bock'

In [43]:
df.sample(20)

Unnamed: 0_level_0,key_words
name,Unnamed: 1_level_1
Dead Mans Revenge Black IPA,bottle pour cola like colour big beige colour ...
Contrebandier,bottle cl taste appearance brown color tan foa...
Wet,pour golden bronze red tint layer puffy white ...
German Rauch Bier,serve taper pub pint emmetts west dundee color...
Table Table Beer,sample glass vermont tap house williston vt ap...
Hopper Texas Brown,pour dark brown thick moussey tan colour nose ...
50 Shades of Bray,canblack brown tall foam greasy leaf oily coff...
Redheaded Rauchstar,crisp smokey tobacco dry slight ginger bite mi...
Airbräu Festbier (Jet A-1),accompany daughter caitlin jamesw iain friend ...
Invader Dopplebock,pour dark amber color head finger white frothy...


# Putting this aside for now

In [29]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words=stop_words)
tfidf_matrix = tf.fit_transform(df['key_words'])

In [30]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Skip to here

In [39]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['key_words'])

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [40]:
cosine_sim

array([[1.        , 0.57243905, 0.49726859, ..., 0.4337387 , 0.41315144,
        0.50843727],
       [0.57243905, 1.        , 0.53924372, ..., 0.4046681 , 0.3588685 ,
        0.47653144],
       [0.49726859, 0.53924372, 1.        , ..., 0.3474918 , 0.34840618,
        0.39846679],
       ...,
       [0.4337387 , 0.4046681 , 0.3474918 , ..., 1.        , 0.37905668,
        0.52604679],
       [0.41315144, 0.3588685 , 0.34840618, ..., 0.37905668, 1.        ,
        0.55420416],
       [0.50843727, 0.47653144, 0.39846679, ..., 0.52604679, 0.55420416,
        1.        ]])

In [41]:
# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use in the function to match the indexes
indices = pd.Series(df.index)

#  defining the function that takes in beer name 
# as input and returns the top 10 recommended beers
def recommendations(title, cosine_sim = cosine_sim):
    
    # initializing the empty list of recommended beers
    recommended_beers = []
    
    # gettin the index of the beer that matches the name
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching beers
    for i in top_10_indexes:
        recommended_beers.append(list(df.index)[i])
        
    return recommended_beers

In [46]:
recommendations(title="German Rauch Bier")

['Rauchbier',
 'Göller Rauchbier',
 'Aecht Schlenkerla Kräusen',
 'Lunar Rauchbier',
 'Cains Dark Mild',
 'Aecht Schlenkerla Rauchbier Weizen',
 "Bishop Bob's Holy Smoke",
 'Punkrauch',
 'Wildfire Rauchbier',
 'Rail Dog Smoked Black Lager']

In [57]:
def get_avg_taste(string):
    """takes in the review and pulls out the rating for the taste and returns an average"""
    list_list = [re.findall("\d{1}\.*\d*", item) for item in re.findall("taste: \d{1}\.*\d*", string)]
    sum_feel = 0
    tot = len(list_list)
    for list_item in list_list:
        for item in list_item:
            sum_feel += float(item)
    if not tot == 0:
        return sum_feel/tot
    else:
        return None


def get_avg_look(string):
    list_list = [re.findall("\d{1}\.*\d*", item) for item in re.findall("look: \d{1}\.*\d*", string)]
    sum_feel = 0
    tot = len(list_list)
    for list_item in list_list:
        for item in list_item:
            sum_feel += float(item)
    if not tot == 0:
        return sum_feel/tot
    else:
        return None

def get_avg_smell(string):
    list_list = [re.findall("\d{1}\.*\d*", item) for item in re.findall("smell: \d{1}\.*\d*", string)]
    sum_feel = 0
    tot = len(list_list)
    for list_item in list_list:
        for item in list_item:
            sum_feel += float(item)
    if not tot == 0:
        return sum_feel/tot
    else:
        return None


def get_avg_feel(string):
    list_list = [re.findall("\d{1}\.*\d*", item) for item in re.findall("feel: \d{1}\.*\d*", string)]
    sum_feel = 0
    tot = len(list_list)
    for list_item in list_list:
        for item in list_item:
            sum_feel += float(item)
    if not tot == 0:
        return sum_feel/tot
    else:
        return None

In [58]:
df['taste_avg'] = df['review'].apply(get_avg_taste)
df['look_avg'] = df['review'].apply(get_avg_look)
df['smell_avg'] = df['review'].apply(get_avg_smell)
df['feel_avg'] = df['review'].apply(get_avg_feel)

ValueError: could not convert string to float: '3..25'

In [53]:
df['no_reviews'] = df['no_reviews'].str.replace(',', '')
df['no_reviews'] = df['no_reviews'].astype(int)
# change ratings to int
df['no_ratings'] = df['no_ratings'].str.replace(',', '')
df['no_ratings'] = df['no_ratings'].astype(int)
# change image to none if it's a placeholder
df.loc[df['img'] == "https://cdn.beeradvocate.com/im/placeholder-beer.jpg", 'img'] = None
# change ABV to float
df['abv'] = df['abv'].str.replace('%', '')
df['abv'] = df['abv'].astype(float)

ValueError: cannot convert float NaN to integer