## Developing a Beer recommendation system using crowdsourced data. 

In [1]:
import sys
# Prepend the chromedriver location to the module search path.
# NOTE(review): this entry points at a chromedriver path rather than a Python package
# directory — confirm it is still required in the environment this notebook runs in.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
# Collect the Chrome launch flags for the scraper in a single options object.
# NOTE(review): no visible cell passes `chrome_options` to webdriver.Chrome(), so as written
# these flags do not influence the browser sessions started further down.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--dns-prefetch-disable")
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
import re
import warnings
from string import punctuation

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from scipy.spatial import ConvexHull
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from sklearn import manifold
from sklearn.cluster import KMeans
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')
%matplotlib inline

## Libraries for Tokenizer and Stopwords

# Download the NLTK data packages: 'punkt' backs word_tokenize and 'stopwords' backs
# stopwords.words("english"), both of which are used in the word-frequency cell.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/sharang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sharang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# NOTE(review): this cell repeats the first two statements of the first cell; running both
# inserts the same path at the front of sys.path twice.
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

### Task A
Extract about 10k reviews of beers from Beeradvocate.com. I would suggest choosing the following link, which shows the top 250 beers sorted by ratings: https://www.beeradvocate.com/beer/top-rated/

We have already saved the data in a csv and attached it to our submission. For your convenience, please skip to <b>Task B</b>.

In [None]:
#Create a DataFrame to store all the scraped data (filled by the scraping loop for reviews 1-25).

reviews = pd.DataFrame(columns = ['Beer Name','User Name','User Review','Rating'])

In [None]:
#Start a Chrome session through Selenium and open the top-rated beers list.
# NOTE(review): webdriver.Chrome() is called with no arguments, so the chrome_options object
# configured in the first cell is not applied here — confirm that this is intended.

url = 'https://www.beeradvocate.com/beer/top-rated/'
driver = webdriver.Chrome()
driver.get(url)

In [None]:
#Scraping the 1-25 reviews of the 250 beers.
#Element lookups use find_elements(By.XPATH, ...): the find_elements_by_xpath helper was
#removed in Selenium 4.3, while the By-based form works on both Selenium 3 and 4.

#Loop to go through all the beers (j runs 2..251; j-1 is printed as the beer number)
for j in range(2,252):
    #Get and click on the link for every beer
    link = driver.find_elements(By.XPATH, '//*[@id="ba-content"]/table/tbody/tr['+str(j)+']/td[2]/a')[0]
    link.click()

    #Get the name of the beer (first line of the page heading)
    beername = driver.find_elements(By.XPATH, '//*[@id="content"]/div/div/div[3]/div/div/div[1]/h1')[0]
    Beer = beername.text.split('\n')[0]

    #loop to go through all the reviews for the beer
    for i in range(25):
        try:
            #Get the entire HTML table row.
            fullrow = driver.find_elements(By.XPATH, '//*[@id="rating_fullview_content_2"]')[i]
            rowtext = fullrow.text

            #Split it into the ratings, review and username
            splitcomment = rowtext.split('\n')
            rating = splitcomment[0].split(' ')[0]
            #The review body is kept as a list of lines (everything between the header and footer lines)
            review = splitcomment[3:-3]
            userfull = splitcomment[-1].split(',')
            user = userfull[0]
            print("Beer number",j-1,": ",Beer + "\tReview Number:",i)

            #Saving it to a DataFrame
            reviews.loc[len(reviews)] = [Beer,user,review,rating]
        except Exception as err:
            #Best-effort scraping: skip the review but show why. `except Exception` (instead of a
            #bare except) lets Ctrl-C / kernel interrupt stop the run.
            print("Skipping this comment, Error thrown:", repr(err))

    #Go back to the main page. The one with the top 250 beers list.
    driver.execute_script("window.history.go(-1)")

In [None]:
# Preview the first five scraped rows as a quick sanity check.
reviews.head(5)

<br>As an <b>added</b> feature for our assignment, we have also scraped reviews 25-50 for each of our beers. This is the code.
We will be performing pagination.

In [None]:
#Create a DataFrame to store all the scraped data (filled by the scraping loop for reviews 25-50).

reviews2 = pd.DataFrame(columns = ['Beer Name','User Name','User Review','Rating'])

In [None]:
#Start a fresh Chrome session and open the top-rated beers list again for the second pass.
# NOTE(review): this rebinds `driver`; no visible cell calls quit() on the session opened for
# the first pass, so that browser presumably stays open — confirm and close it if so.

url = 'https://www.beeradvocate.com/beer/top-rated/'
driver = webdriver.Chrome()
driver.get(url)

In [None]:
#Scraping the 25-50 reviews of the 250 beers.
#Because this scraper has to click on 2 links and not just 1, it's not very reliable.
#It's glitchy and can sometimes cause errors.
#Element lookups use find_elements(By.XPATH, ...): the find_elements_by_xpath helper was
#removed in Selenium 4.3, while the By-based form works on both Selenium 3 and 4.
# NOTE(review): range(2,4) only visits two beers, whereas the first pass uses range(2,252).
# This looks like a chunked/debug run — confirm the intended range before re-scraping.

#Loop to go through all the beers
for j in range(2,4):
    try:
        #Get and click on the link for every beer
        link = driver.find_elements(By.XPATH, '//*[@id="ba-content"]/table/tbody/tr['+str(j)+']/td[2]/a')[0]
        link.click()

        #This is the main line for this loop. We click on the 25-50 link on the Beer's webpage.
        #This will take us to the next page of the reviews section
        #We are accomplishing pagination this way
        link2 = driver.find_elements(By.XPATH, '//*[@id="ba-content"]/div[11]/span/a[1]')[0]
        link2.click()

        #Get the name of the beer (first line of the page heading)
        beername = driver.find_elements(By.XPATH, '//*[@id="content"]/div/div/div[3]/div/div/div[1]/h1')[0]
        Beer = beername.text.split('\n')[0]
        print(Beer)

        #loop to go through all the reviews for the beer
        for i in range(25):
            try:
                #Get the entire HTML table row.
                fullrow = driver.find_elements(By.XPATH, '//*[@id="rating_fullview_content_2"]')[i]
                rowtext = fullrow.text

                #Split it into the ratings, review and username
                splitcomment = rowtext.split('\n')
                rating = splitcomment[0].split(' ')[0]
                #The review body is kept as a list of lines (everything between the header and footer lines)
                review = splitcomment[3:-3]
                userfull = splitcomment[-1].split(',')
                user = userfull[0]
                #This page holds reviews 25-50, so the running review number is i+25
                print("Beer:",Beer + "\tReview Number:",i+25)

                #Saving it to a DataFrame
                reviews2.loc[len(reviews2)] = [Beer,user,review,rating]
            except Exception as err:
                #Best-effort scraping: skip the review but show why. `except Exception` (instead
                #of a bare except) lets Ctrl-C / kernel interrupt stop the run.
                print("Skipping this comment, Error thrown:", repr(err))

        #Go back to the main page. The one with the top 250 beers list.
        #This extra line of code is also important, since now we're going back two pages and not just one.
        driver.execute_script("window.history.go(-1)")
        driver.execute_script("window.history.go(-1)")
    except Exception as err:
        print("Skipping this Beer, Error thrown:", repr(err))
        #A failure can leave the browser on a beer page; reload the list page so the next
        #beer's link can still be located instead of every remaining beer failing too.
        driver.get(url)

In [None]:
# Preview the first five rows scraped from the second review page.
reviews2.head(5)

In [None]:
#Appending 1-25 review dataframe with the 25-50 Dataframe.
#pd.concat is used because DataFrame.append was removed in pandas 2.0; like append, it
#returns a new frame and keeps each input's original row labels.
finalreviews = pd.concat([reviews, reviews2])

#As you can see, the ratings in our DataFrame are of the 4.55/5 format.
#Changing that to one number: keep only the text before the slash (vectorised over the column).
finalreviews['Rating'] = finalreviews['Rating'].str.split('/').str[0]
finalreviews

In [None]:
#Saving the file in a CSV (row labels are not written).
# NOTE(review): the file written here is "BeerReviewsMiddle33.csv" while Task B reads
# 'BeerReviews.csv' — presumably several partial files were merged by hand; confirm.
finalreviews.to_csv("BeerReviewsMiddle33.csv",index=False)

## Task B 

In [2]:
#Loading the scraped data into a dataframe.
#The path is relative, so the CSV is looked up in the notebook's working directory.
comments = pd.read_csv(r'BeerReviews.csv')
comments.head()

Unnamed: 0,Beer Name,User Name,User Review,Rating
0,Kentucky Brunch Brand Stout,Ciocanelu,"['2016 Silver Wax. Aroma has whiskey, maple, t...",4.8
1,Kentucky Brunch Brand Stout,Phineasco,['The beer pours Pitch Black with a frothy tan...,4.74
2,Kentucky Brunch Brand Stout,Slack,"[""Probably the smoothest beer I have ever had....",4.68
3,Kentucky Brunch Brand Stout,miniditka77,"['Dark black, very thick, a little bit of tan ...",5.0
4,Kentucky Brunch Brand Stout,DoctorZombies,['Poured black as ink with thin ruby edges at ...,4.97


In [3]:
#Loading the word replacement file which has all the replacements to be made into a dataframe
#The word-frequency cell reads its 'word' column (term to look for) and 'attribute' column
#(canonical attribute that replaces it).
df_replacements = pd.read_excel("BeerAttributes.xls")
df_replacements.head()

Unnamed: 0,attribute,word
0,aggressive,bold
1,aggressive,assertive
2,aggressive,heavy
3,aggressive,strong
4,balanced,proportion


In [4]:
# Word frequency count

# Clean every review and count in how many reviews each word appears
# (a word is counted at most once per review).
filteredComments = comments  # alias, not a copy: the new column is also visible on `comments`

# Lookup table word -> attribute. drop_duplicates keeps the first attribute listed for a word,
# the same row the earlier per-word `.values[0]` lookup picked. A dict lookup avoids scanning
# the whole replacement table for every token.
replacement_map = (
    df_replacements.drop_duplicates(subset='word')
    .set_index('word')['attribute']
    .to_dict()
)

# Raw string so that `\.` is not an invalid string escape; the pattern itself is unchanged.
non_word_chars = re.compile(r'[^a-zA-Z0-9 \n\.]')

# Loop-invariant: build the stop-word set once rather than once per review.
stop_words = set(stopwords.words("english"))

master_list = []
filtered_texts = []
for comment in comments['User Review']:

    #break sentence to words
    words = word_tokenize(str(comment))

    #Lower case
    words = [word.lower() for word in words]

    #Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    #Remove special character sequences like ...,''''''
    words = [word for word in words if word.isalnum()]

    #Replace words: look up the sanitised token and fall back to the original token.
    #The old code tested the sanitised token but indexed the table with the raw one (IndexError
    #when they differ) and rewrote the list it was iterating over via words.index().
    words = [replacement_map.get(non_word_chars.sub('', word), word) for word in words]

    #Remove numbers
    words = [word for word in words if not word.isnumeric()]

    #Remove duplicates (we want to count frequency of a word as 1 even if it has been used more than once in a single comment)
    #dict.fromkeys keeps first-seen order, like the manual "append if not seen" loop.
    unique_words = list(dict.fromkeys(words))

    #Remove stop words
    filtered_words = [word for word in unique_words if word not in stop_words]

    master_list.extend(filtered_words)
    filtered_texts.append(' '.join(filtered_words))

#One cleaned string per review, in row order
filteredComments['Filtered_Tweet'] = filtered_texts
wordfreq = FreqDist(master_list)
wordfreq

FreqDist({'head': 5595, 'malty': 4806, 'beer': 4710, 'crisp': 4702, 'taste': 3353, 'fruity': 2851, 'hoppy': 2770, 'like': 2708, 'one': 2680, 'finish': 2599, ...})

In [5]:
# List every (word, count) pair from most to least frequent; called without a limit, so the
# whole vocabulary is printed.
wordfreq.most_common()

[('head', 5595),
 ('malty', 4806),
 ('beer', 4710),
 ('crisp', 4702),
 ('taste', 3353),
 ('fruity', 2851),
 ('hoppy', 2770),
 ('like', 2708),
 ('one', 2680),
 ('finish', 2599),
 ('nose', 2595),
 ('dark', 2576),
 ('chocolate', 2554),
 ('nice', 2467),
 ('well', 2396),
 ('good', 2389),
 ('body', 2246),
 ('lacing', 2196),
 ('light', 2154),
 ('robust', 2139),
 ('notes', 2119),
 ('flavor', 2115),
 ('aroma', 2102),
 ('bottle', 2085),
 ('black', 2045),
 ('bourbon', 1975),
 ('white', 1971),
 ('bit', 1968),
 ('vanilla', 1946),
 ('color', 1938),
 ('glass', 1926),
 ('mouthfeel', 1917),
 ('little', 1911),
 ('great', 1880),
 ('medium', 1873),
 ('really', 1854),
 ('pours', 1843),
 ('orange', 1813),
 ('flavors', 1770),
 ('coffee', 1748),
 ('thick', 1724),
 ('aggressive', 1678),
 ('hoppy ', 1619),
 ('barrel', 1528),
 ('overall', 1523),
 ('much', 1518),
 ('bitterness', 1501),
 ('complex', 1488),
 ('oak', 1487),
 ('brown', 1468),
 ('feel', 1467),
 ('creamy', 1441),
 ('sweetness', 1423),
 ('hazy', 1304),


In [6]:
# Load the spaCy English pipeline and turn the user's attribute words into a Doc (`doc2`)
# that each review is compared against in Task C.
# NOTE(review): the `sm` pipelines do not ship static word vectors, so Doc.similarity() is
# presumably less reliable than with `en_core_web_md`/`lg` — confirm against the spaCy docs.
# NOTE(review): `words` is also the token-list name used inside the word-frequency loop; this
# cell rebinds it to the raw input string.
nlp = spacy.load('en_core_web_sm')
print("Enter three beer attributes as space-separated words") 
words = input()
doc2 = nlp(words)

Enter three beer attributes as space-separated words
malty crisp aggressive


### Task C
Performing a similarity analysis with the 3-attribute set and the reviews. Extracting 300 reviews that have the highest similarity scores with the attribute set.

In [7]:
#Calculating similarity analysis with the 3 chosen attribute set and the reviews.
#One similarity score per review, in row order. str() matches how the word-frequency and
#sentiment cells read the review text, so non-string values are handled the same way here.
similarity = [nlp(str(comment)).similarity(doc2) for comment in comments['User Review']]

#Build the result from the three review columns ('Beer Name' renamed to 'BeerName'), then add
#the score and the attribute string the user entered (same value on every row).
spacy_similarity = (
    comments[['User Review','Beer Name','Rating']]
    .rename(columns={'Beer Name': 'BeerName'})
    .assign(spacy_similarity=similarity, attributes=words)
)

In [8]:
#Keep only the 300 reviews with the highest similarity scores (sorted high to low)
spacy_similarity = spacy_similarity.sort_values('spacy_similarity', ascending=False).head(300)

In [9]:
# Display the retained reviews, ordered by similarity score (highest first).
spacy_similarity

Unnamed: 0,User Review,BeerName,Rating,spacy_similarity,attributes
5259,['Bright copper orange in color with perfect c...,Hop JuJu Imperial IPA,4.37,0.666506,malty crisp aggressive
8363,['2016 bottle. Craziest part was experiencing ...,The Abyss,4.47,0.653602,malty crisp aggressive
3455,['Very cloudy straw appearance with slight whi...,3rd Anniversary Imperial IPA,4.73,0.644649,malty crisp aggressive
8353,['Missed out first release....tried hunting an...,Hold On To Sunshine,4.79,0.642710,malty crisp aggressive
8244,['Pours a golden hue color with a i finger egg...,Permanent Funeral,4.25,0.638598,malty crisp aggressive
3320,['Pitch black and nearly opaque with dark choc...,Barrel-Aged Sump Coffee Stout,4.39,0.631427,malty crisp aggressive
9034,['Fresh growler from Alpine. Poured a beautifu...,Bad Boy,4.72,0.631308,malty crisp aggressive
1753,['11/17/18 (Chicago): 2 oz 15.0% ABV pour at 2...,Chemtrailmix,4.38,0.631216,malty crisp aggressive
1468,['Pours a yellow and hazy color with small yel...,Nectarine Premiere,4.83,0.626157,malty crisp aggressive
4351,['Pour a nice smooth dark color smells of huge...,Maple Bacon Coffee Porter,4.82,0.621444,malty crisp aggressive


### Task D
Performing sentiment analysis on the top 300 reviews.

In [10]:
# Performing Sentiment Analysis on the top 300 reviews
sid_obj = SentimentIntensityAnalyzer()


def _label_sentiment(score):
    """Turn a compound polarity score into 'Positive', 'Negative' or 'Neutral'.

    Scores >= 0.05 are Positive, scores <= -0.05 are Negative, anything in between is Neutral.
    """
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"


# One compound score per review, in row order. Computing them up front avoids seeding the
# columns with empty strings (which made sentiment_score an object column) and avoids
# row-by-row .loc writes inside iterrows().
compound_scores = [
    sid_obj.polarity_scores(str(review))['compound']
    for review in spacy_similarity['User Review']
]

# Same column names and order as before: overall_sentiment first, then sentiment_score.
top300 = spacy_similarity.assign(
    overall_sentiment=[_label_sentiment(score) for score in compound_scores],
    sentiment_score=compound_scores,
)

In [11]:
# Sorting the reviews (high to low) by the sentiment 
# The sorted frame is only displayed; `top300` itself keeps its similarity ordering.
top300.sort_values(by=['sentiment_score'],ascending=False)

Unnamed: 0,User Review,BeerName,Rating,spacy_similarity,attributes,overall_sentiment,sentiment_score
7144,['Officially one of the top beers I’ve had. Po...,Green,4.84,0.549736,malty crisp aggressive,Positive,0.9977
7386,['750mL 2002 Yellow Label thanks to OldStyleCu...,Oude Geuze Vintage,4.67,0.577835,malty crisp aggressive,Positive,0.9976
1936,['A- 500ml bottle pours a motor oil black brew...,Triple Shot,4.58,0.561672,malty crisp aggressive,Positive,0.9974
3320,['Pitch black and nearly opaque with dark choc...,Barrel-Aged Sump Coffee Stout,4.39,0.631427,malty crisp aggressive,Positive,0.9965
12,"[""Pouring with a thick viscosity that's damn n...",Kentucky Brunch Brand Stout,4.87,0.553011,malty crisp aggressive,Positive,0.9945
9070,['Healthy two plus finger head of eggshell whi...,Crusher,4.05,0.557274,malty crisp aggressive,Positive,0.9893
5558,['Smell: this super barrel forward huge oak an...,Jackie O's / Side Project - Appervation,4.90,0.552786,malty crisp aggressive,Positive,0.9892
661,"['16oz can, dated 08.01.19, poured into a Tree...",JJJuliusss,4.07,0.548436,malty crisp aggressive,Positive,0.9874
4902,['5/2/19 (Chicago): $12.47 12 oz. can prepaid ...,Clare's Thirsty Ale,4.59,0.619732,malty crisp aggressive,Positive,0.9874
9466,['5/2/19 (Chicago): $12.47 12 oz. can prepaid ...,Clare's Thirsty Ale,4.59,0.619732,malty crisp aggressive,Positive,0.9874


### Task E
Top 3 beer recommendations based on spacy similarity and sentiment score

In [12]:
#Make sure both score columns are float-typed before they are averaged
top300 = top300.astype({'sentiment_score': float, 'spacy_similarity': float})

In [13]:
# Calculating average rating, sentiment and similarity scores for each beer,
# ordered by mean sentiment score (highest first).
# The columns are selected with a list: tuple-style selection after groupby
# (['a','b','c'] without the inner list) raises an error in pandas >= 2.0.
avg = (
    top300.groupby('BeerName')[['Rating','sentiment_score','spacy_similarity']]
    .mean()
    .sort_values(by='sentiment_score', ascending=False)
)
avg

Unnamed: 0_level_0,Rating,sentiment_score,spacy_similarity
BeerName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Green,4.8400,0.997700,0.549736
Oude Geuze Vintage,4.6700,0.997600,0.577835
Clare's Thirsty Ale,4.5900,0.987400,0.619732
DFPF,4.7650,0.986000,0.560869
Bodhi,3.7400,0.985500,0.595301
Ten FIDY - Bourbon Barrel Aged,4.1900,0.979400,0.553606
Black Gold,4.6900,0.979200,0.577443
Coconut B-Bomb,4.6600,0.977750,0.570449
It Was All A Dream,4.9900,0.977600,0.559540
Speedway Stout - Vietnamese Coffee,4.5525,0.976000,0.575256


In [14]:
# Announce the attribute set the user entered, then show the three beers with the
# highest mean sentiment score.
print(f"Recommended beers to the customer based on the given preferences: {words}")
avg.head(3)

Recommended beers to the customer based on the given preferences: malty crisp aggressive


Unnamed: 0_level_0,Rating,sentiment_score,spacy_similarity
BeerName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Green,4.84,0.9977,0.549736
Oude Geuze Vintage,4.67,0.9976,0.577835
Clare's Thirsty Ale,4.59,0.9874,0.619732


### Task F
Beer recommendations if we ignored the similarity and sentiment scores.
Only considering the average user ratings from dataset.

In [47]:
# Mean rating per beer over the whole dataset, keeping the three highest means.
top3 = comments.groupby(['Beer Name'])['Rating'].mean().sort_values(ascending=False).head(3)

In [48]:
#Three beers with the highest mean rating across the whole dataset
print("The top three highest rated beers from whole dataset:")
mean_rating_by_beer = comments.groupby('Beer Name')['Rating'].mean()
mean_rating_by_beer.sort_values(ascending=False).head(3)

The top three highest rated beers from whole dataset:


Beer Name
Kentucky Brunch Brand Stout    4.815882
Chemtrailmix                   4.811176
It Was All A Dream             4.785000
Name: Rating, dtype: float64

### Supporting Analysis
Differences in our recommendation vs. the top-rated beers

In [None]:
#Finding similarity scores for the reviews of the top 3 rated beers we found from Task F

In [31]:
# Three highest mean-rated beers over the whole dataset; the index of this Series holds the
# beer names and is used in the next cell to filter `top300`.
threebeers = comments.groupby(['Beer Name'])['Rating'].mean().sort_values(ascending=False).head(3)

In [33]:
# Rows of the top-300 similar reviews that belong to one of the three highest-rated beers
is_top_rated_beer = top300['BeerName'].isin(threebeers.index)
topthreecomments = top300.loc[is_top_rated_beer].copy()
topthreecomments

Unnamed: 0,User Review,BeerName,Rating,spacy_similarity,attributes,overall_sentiment,sentiment_score
1753,['11/17/18 (Chicago): 2 oz 15.0% ABV pour at 2...,Chemtrailmix,4.38,0.631216,malty crisp aggressive,Positive,0.9859
1754,['L: Dark mocha and chestnut hues with a rust-...,Chemtrailmix,4.74,0.577062,malty crisp aggressive,Positive,0.4854
4,['Poured black as ink with thin ruby edges at ...,Kentucky Brunch Brand Stout,4.97,0.563839,malty crisp aggressive,Positive,0.9622
2442,['L: Rich and thick mahogany mocha hues with a...,It Was All A Dream,4.99,0.55954,malty crisp aggressive,Positive,0.9776
6096,"['2016 batch', 'L: Pours an insanely thick dar...",Kentucky Brunch Brand Stout,4.89,0.55466,malty crisp aggressive,Positive,0.9398
12,"[""Pouring with a thick viscosity that's damn n...",Kentucky Brunch Brand Stout,4.87,0.553011,malty crisp aggressive,Positive,0.9945


In [42]:
# Mean rating, sentiment and similarity per beer for the three highest-rated beers.
# The columns are selected with a list: tuple-style selection after groupby raises an
# error in pandas >= 2.0.
topthreecomments.groupby('BeerName')[['Rating','sentiment_score','spacy_similarity']].mean()

Unnamed: 0_level_0,Rating,sentiment_score,spacy_similarity
BeerName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chemtrailmix,4.56,0.73565,0.604139
It Was All A Dream,4.99,0.9776,0.55954
Kentucky Brunch Brand Stout,4.91,0.9655,0.55717


### Analysis

Following are the recommendations we extracted from Task E and Task F respectively. Task E contains the beers that we recommended using the sentiment scores and similarities. Task F simply contains the top three highest rated beers. 
<br>The highest rated beers will not meet the requirements of the user because they all have lower sentiment scores than the ones from our recommendations. We speculate that this is because overall ratings do not take the user's preferred attributes into consideration.
<br>Any user looking for recommendations has specific traits of beer that they prefer which may not align with the general public's taste and/or rating. It is likely that this user does not just want to be suggested the overall highest rated beers but instead wants more personalized recommendations which may not be mainstream.

<table align="left" style="border: 1px solid black">
    <tr>
        <th style="border: 1px solid black">Our Recommendations - Task E</th>
        <th style="border: 1px solid black">Highest Rated - Task F</th>
    </tr>
    <tr>
        <td style="border: 1px solid black"><img src='our-recs.png' height="300" width="400"></td>
        <td style="border: 1px solid black"><img src='top-rated.png' height="300" width="400"></td>
    </tr>
</table>

