## Beer Recommender: A Content-Based Recommender System

In [38]:
# Dependencies and packages
%reload_ext lab_black
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
from sklearn.metrics import accuracy_score

pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()

In [8]:
# Load the review data into the notebook-wide working frame.
# NOTE(review): beer names printed further down contain sequences such as
# "Ã¤" and "Â°", which usually means UTF-8 bytes decoded as latin-1 — confirm
# the file's real encoding before relying on non-ASCII names.
df = pd.read_csv(
    filepath_or_buffer="../data/csv/reviews_NLTK1686.csv",
    encoding="latin-1",
)

In [9]:
df.head()

Unnamed: 0,beer_id,username,text,score,name,style
0,5,bry5o5,"Carmel-y, malty, tasty! Very well balanced beer.",4.89,Amber,Vienna Lager
1,6,LeonardFournette,"Chocolatey, pretty strong hoppiness but anothe...",5.0,Turbodog,English Brown Ale
2,7,Sreyn2,Smooth,5.0,Purple Haze,Fruit and Field Beer
3,10,RBorsato,"Burgundy, mahogany with a nice light brown hea...",4.92,Dubbel Ale,Belgian Dubbel
4,17,jschalch08,Really good tasting wheat beer,5.0,Widmer Hefeweizen,German Hefeweizen


In [10]:
# Report how many rows were loaded (first component of the frame's shape).
print("We have", df.shape[0], "beers in the data")

We have 1686 beers in the data


In [11]:
def print_description(index, column="text"):
    """Print one review and the name of the beer it belongs to.

    Parameters
    ----------
    index : label
        Index label of the row in the global ``df`` to show.
    column : str, default "text"
        Column of ``df`` holding the review text to print.

    Prints a short notice instead of raising when no row carries ``index``.
    """
    rows = df.loc[df.index == index, [column, "name"]]
    # Test for a match *before* taking the first row: ``.values[0]`` on an
    # empty selection raises IndexError, which made the old guard unreachable.
    if rows.empty:
        print("No row with index", index)
        return
    review, beer_name = rows.values[0]
    print(review)
    print("Name:", beer_name)

In [12]:
print_description(10)

Now this is a true Scottish Wee Heavy.. Very beautiful looking beer with a nice smooth smelling malt and the taste is unreal with grate tasting malts and no hop taste to speak of, Which is how a Wee Heavy is supposed to be....
Name: Traquair House Ale


In [13]:
print_description(200)

This is my absolute favorite beer of all time. I think it tastes great, even if it does not have a reputation of being an excellent beer. To me, it is just a higher quality, slightly more sophisticated version of domestic party and game-day beers like Budweiser and Miller, and if that does't sound appealing to you, then I don't know what to tell you.
Name: St. Pauli Girl


### Beer Review Text Length Distribution

In [14]:
# Whitespace-delimited token count per review; str() guards non-string cells.
df["word_count"] = [len(str(review).split()) for review in df["text"]]

In [15]:
desc_lengths = list(df["word_count"])

# Each label is immediately followed by its value; the leading "\n" in a
# label starts a new line in the printed summary.
print(
    *(
        part
        for label, value in (
            ("Number of descriptions:", len(desc_lengths)),
            ("\nAverage word count", np.average(desc_lengths)),
            ("\nMinimum word count", min(desc_lengths)),
            ("\nMaximum word count", max(desc_lengths)),
        )
        for part in (label, value)
    )
)

Number of descriptions: 1686 
Average word count 72.96975088967972 
Minimum word count 1 
Maximum word count 690


### Preprocessing review description text 


In [16]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sheetalbongale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
df.dtypes

beer_id         int64
username       object
text           object
score         float64
name           object
style          object
word_count      int64
dtype: object

In [18]:
# Force every review to ``str`` so that clean_text, which calls ``.lower()``
# on its argument, never receives a non-string cell.
df["text"] = df["text"].astype(str)

In [19]:
# Characters treated as word separators; each occurrence becomes one space.
# Raw string on purpose: "\[", "\]" and "\|" are invalid escape sequences in
# an ordinary string literal and trigger a warning when the cell is compiled.
REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;]")
# Whitelist pattern: any character NOT listed here is deleted outright.
# NOTE(review): clean_text lower-cases its input before applying this
# pattern, so the upper-case "Â" entry cannot occur in that input — confirm
# whether it is still needed.
BAD_SYMBOLS_RE = re.compile("[^0-9a-z #+_Â]")
# NLTK's English stop-word list, held as a set for fast membership tests.
STOPWORDS = {*stopwords.words("english")}


def clean_text(text):
    """Normalise one review string for vectorisation.

    text: a string

    return: the lower-cased string with separator symbols turned into
    spaces, every other non-whitelisted character removed, and stop words
    dropped; the remaining words are joined by single spaces.
    """
    lowered = text.lower()
    # Separator punctuation becomes a space so neighbouring words stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(" ", lowered)
    # Everything else outside the whitelist is removed without replacement.
    stripped = BAD_SYMBOLS_RE.sub("", spaced)
    # Stop-word filtering runs on the already-stripped tokens.
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return " ".join(kept)


# Store the normalised version of every review next to the raw text.
df["desc_clean"] = df["text"].map(clean_text)

  REPLACE_BY_SPACE_RE = re.compile("[/(){}\[\]\|@,;]")


In [20]:
def print_description(index, column="desc_clean"):
    """Print one cleaned review and the name of the beer it belongs to.

    Parameters
    ----------
    index : label
        Index label of the row in the global ``df`` to show.
    column : str, default "desc_clean"
        Column of ``df`` holding the text to print; pass ``"text"`` to see
        the raw review instead.

    Prints a short notice instead of raising when no row carries ``index``.
    """
    rows = df.loc[df.index == index, [column, "name"]]
    # Test for a match *before* taking the first row: ``.values[0]`` on an
    # empty selection raises IndexError, which made the old guard unreachable.
    if rows.empty:
        print("No row with index", index)
        return
    review, beer_name = rows.values[0]
    print(review)
    print("Name:", beer_name)


print_description(0)

carmely malty tasty well balanced beer
Name: Amber


In [21]:
print_description(200)

absolute favorite beer time think tastes great even reputation excellent beer higher quality slightly sophisticated version domestic party gameday beers like budweiser miller doest sound appealing dont know tell
Name: St. Pauli Girl


In [29]:
# Use the beer name as the row label so beers can be looked up by name.
# Guarded so that re-running the cell is a no-op: after the first run "name"
# is no longer a column and an unguarded set_index would raise KeyError.
if "name" in df.columns:
    df.set_index("name", inplace=True)

In [55]:
# Unigram TF-IDF over the cleaned reviews.
# ngram_range=(1, 1) and min_df=1 mean "single words, no document-frequency
# cut-off". The previous min_df=0 is rejected by scikit-learn's parameter
# validation in recent releases, and a lower n-gram bound of 0 is not a
# meaningful n-gram size.
tf = TfidfVectorizer(
    analyzer="word", ngram_range=(1, 1), min_df=1, stop_words="english"
)
tfidf_matrix = tf.fit_transform(df["desc_clean"])
# TfidfVectorizer L2-normalises each row by default, so the plain dot
# product computed by linear_kernel is the cosine similarity.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [56]:
# Positional lookup table: row position -> beer name (the frame's index).
indices = df.index.to_series().reset_index(drop=True)

In [57]:
indices[:50]

0                                                 Amber
1                                              Turbodog
2                                           Purple Haze
3                                            Dubbel Ale
4                                     Widmer Hefeweizen
5                             Mackeson Triple XXX Stout
6                                        Trois Pistoles
7                                    Blanche De Chambly
8                                               Maudite
9                                       La Fin Du Monde
10                                   Traquair House Ale
11                                           Alpha King
12                                            Grand Cru
13                                                White
14                                        Anchor Porter
15                                    Anchor Steam Beer
16                                            Budweiser
17                       Young's Double Chocolat

In [58]:
def recommendations(name, cosine_similarities=cosine_similarities, top_n=10):
    """Return the beers whose reviews are most similar to that of ``name``.

    Parameters
    ----------
    name : str
        Beer name as it appears in the global ``indices`` series. If several
        rows share the name, the first one is used.
    cosine_similarities : 2-D array-like
        Pairwise similarity matrix whose row/column order matches ``indices``.
    top_n : int, default 10
        Number of beers to return.

    Returns
    -------
    list
        Up to ``top_n`` beer names, most similar first.

    Raises
    ------
    IndexError
        If ``name`` is not present in ``indices``.
    """
    # Row position(s) of the beer that matches the name.
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError(f"Beer {name!r} not found in the review data")
    idx = matches[0]

    score_series = pd.Series(cosine_similarities[idx])

    # Remove the query beer by position rather than assuming it sorts first:
    # with tied scores (identical reviews, or an all-zero row) it may not.
    # A stable sort keeps tied beers in their original row order.
    ranked = score_series.drop(idx).sort_values(ascending=False, kind="stable")
    top_indexes = ranked.iloc[:top_n].index

    # Translate row positions back to beer names in one indexing operation.
    return list(df.index[top_indexes])

In [59]:
recommendations("Hazy Little Thing IPA")

["Short's Soft Parade",
 'Hottenroth',
 'Bourbon County Brand Coffee Stout',
 'Bud Light Lime',
 'Prima Pils',
 "Hamm's",
 'Lucky 13 Mondo Large Red Ale',
 'Premium Beer',
 "O'Hara's Irish Stout",
 'Aecht Schlenkerla Rauchbier Urbock']

In [68]:
recommendations("Samuel Adams Summer Ale")

["Leinenkugel's Summer Shandy",
 'Andygator',
 'Otter Creek Citra Mantra',
 'Summer Love',
 'Scrimshaw Pilsner',
 'The Kimmie, The Yink & The Holy Gose',
 'Wailua Wheat',
 'Dead Guy Ale',
 'Otra Vez',
 'Twilight Summer Ale']

In [63]:
recommendations("Corona Extra")

['Samuel Adams White Christmas',
 'Blue Moon Agave Nectar Ale',
 'Arcadia HopMouth Double IPA',
 'White Rascal',
 'Darkness',
 'Third Shift Amber Lager',
 'Old Rasputin',
 'Gordon Biersch MÃ¤rzen',
 'Black Sheep Ale (Special)',
 'Simtra Triple IPA']

### These are not exactly great recommendations for a Corona lover

In [74]:
cosine_similarities

array([[1.        , 0.01667019, 0.        , ..., 0.        , 0.0484561 ,
        0.        ],
       [0.01667019, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.0484561 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [69]:
recommendations("Shiner Bock")

['Salvator Doppel Bock',
 'Samuel Adams Chocolate Bock',
 'Brooklyn Pilsner',
 'Noble Rot',
 'Bud Ice',
 'Biere De Mars',
 'LongShot Double IPA',
 "Terrapin Monk's Revenge",
 'Modelo Especial',
 'Urbock 23Â°']

In [70]:
recommendations("Salvator Doppel Bock")

['Shiner Bock',
 'Samuel Adams Chocolate Bock',
 'Cane And Ebel',
 'Urbock 23Â°',
 "G'Knight",
 'HofbrÃ¤u Maibock (Urbock)',
 'Weizenbock - Beer Camp #37 (Best Of Beer Camp)',
 'Strawberry Harvest Lager',
 'Samuel Adams Cold Snap']