In [None]:
# Project team: Arjun,Achal, Shipra, Eduardo, Sruthi

In [2]:
#Importing all the libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import spacy
nlp = spacy.load('en_core_web_lg')
from spacy.lang.en import English
import string
stop_words = spacy.lang.en.stop_words.STOP_WORDS

import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules, apriori


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\I068117\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Task A: Extract about 3–5k reviews from Beer Advocate review forum.

In [3]:
review_list = []
beer_list=[]
base_url = 'https://www.beeradvocate.com/beer/'
url_pages = ['top-rated', 'trending', 'top-new', 'fame', 'popular']
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}

for url_page in url_pages:
    url = f'{base_url}{url_page}'
    response = requests.get(url, headers=headers)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    rows = soup.find('table').find_all('tr')
    
    for row in rows:
        review_link = row.find_all('td', class_='hr_bottom_light')
        try:
            review_list.append(review_link[1].find('a').get('href'))
            beer_list.append(review_link[1].find('a').text)
        except:
            pass

In [4]:
username = []
user_rating = []
user_review = []
beer_name=[]

for i,link in enumerate(review_list):
    beer_response = requests.get(f'https://www.beeradvocate.com/{link}', headers=headers)
    beer_soup = BeautifulSoup(beer_response.content, 'html.parser')
    
    beer_reviews = beer_soup.find_all('div', id = 'rating_fullview_container')
    #product_name = beer_soup.find('div', class_ = 'titleBar').text.split('\n')[1]
   
    for review in beer_reviews:
        user_box = review.find('div', id ='rating_fullview_content_2')
        username.append(user_box.find('a', class_ ='username').text)
        beer_name.append(beer_list[i])
    
        try:
            user_rating.append(user_box.find('span', class_='BAscore_norm').text)    
        except:
            user_rating.append(user_box.text.split()[1])
    
        try:
            user_review.append(user_box.find('div').text)
        except:
            user_review.append('')

In [5]:
df = pd.DataFrame({'username': username,
                   'beername':beer_name,
                   'user_rating': user_rating,
                   'user_review': user_review
                  })

reviews_df = df[df['user_review'] != '']

#removing duplicate values
reviews_df=reviews_df.drop_duplicates(keep='first').reset_index(drop=True)
reviews_df.to_csv('Reviews.csv')

In [6]:
reviews_df

Unnamed: 0,username,beername,user_rating,user_review
0,MonDak_Joe1953,Kentucky Brunch Brand Stout,4.71,Tap pour at the taproom. Served chilled in a s...
1,Rristow,Kentucky Brunch Brand Stout,5,The perfect barrel aged stout. Not overly swee...
2,BEERchitect,Kentucky Brunch Brand Stout,4.8,The flirtation with maple comes to a crescendo...
3,cdinardo21,Kentucky Brunch Brand Stout,4.98,On tap at TG for part of KBBS release day - ra...
4,GreenBayBA,Kentucky Brunch Brand Stout,4.7,"Perfect black color. I didn't see much foam, o..."
...,...,...,...,...
5852,TurboMon_911,Rare Vos Amber Ale,4.14,"12 oz. bottle, (Best by 05/11/23); Poured a cl..."
5853,Davidstan,Rare Vos Amber Ale,4.39,Just now becoming a Belgian fan. This beer cut...
5854,Bluerabbitbell,Rare Vos Amber Ale,4.05,"Enjoyed at home, 12/1/22, from a Kroger pick 6..."
5855,BubbleBobble,Rare Vos Amber Ale,4.02,"Pours a dark, ruby amber, big 2"" tan head that..."


### Extracting the most frequent words/ features from all the review based on count/ term frequency

### Task B: Use the above attributes as examples only, for a word frequency analysis of beer reviews is a better way to find important attributes in the actual data.

In [3]:
reviews_df=pd.read_csv('Reviews.csv')
All_review=reviews_df['user_review'].tolist()

#CountVectorizer attributes remove the stop words, punctuations etc. 
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(All_review)

# Get the feature names (words)
feature_names = count_vectorizer.get_feature_names_out()

# Sum up the word frequencies
word_frequencies = count_matrix.sum(axis=0).A1

# Create a DataFrame to display the word frequencies
df = pd.DataFrame({'Word': feature_names, 'Frequency': word_frequencies})

# Sort the DataFrame by frequency in descending order
df_sorted = df.sort_values(by='Frequency', ascending=False)

# Display the words with the highest frequency
df_sorted.head(20)

Unnamed: 0,Word,Frequency
1580,beer,3727
6355,head,3647
12944,taste,3054
3749,dark,2670
7686,light,2233
12783,sweet,2125
5989,good,2000
2790,chocolate,1998
7701,like,1953
8202,medium,1902


As countvectorizer takes into account the count and higher the count of words, higher the weights. It can be deceiving to attach high importance to the count. Instead we use tf-idf to down weight the impact of most frequent words in the document and assign higher weights for the rare words. See below

## Building a beer recommender system based on the features selected by a user

### Task C: Bag of words: Similarity analysis using cosine similarity (without word embeddings) the attributes specified by the customer and the reviews

In [61]:
reviews_df=pd.read_csv('Reviews.csv')
All_review=reviews_df['user_review'].tolist()

#User can select as many attributes as they want in a beer. e.g. 'sweet','dark','coffee','chocolate','malt'. 
#Below is the example for input sweet dark chocolate 
selected_attribute = input("Enter the beer attributes like dark/sweet/light/coffee/malt: ")

tfidf_vectorizer = TfidfVectorizer(vocabulary=selected_attribute.split(' '))
tfidf_matrix=tfidf_vectorizer.fit_transform(All_review)
sim=cosine_similarity(tfidf_matrix,[[ 1 for x in selected_attribute.split(' ')]])

flattened_list = [item for sublist in sim for item in sublist]
reviews_df['WOB similarity score'] = flattened_list

reviews_df.drop('Unnamed: 0',axis=1,inplace=True)
reviews_df.sort_values(by=['WOB similarity score','user_rating'],ascending = False).iloc[0:3,1:]

Enter the beer attributes like dark/sweet/light/coffee/malt: sweet dark chocolate


Unnamed: 0,beername,user_rating,user_review,WOB similarity score
10,Vanilla Bean Assassin,5.0,12oz bottle…2022 version. Drank 12/25/22.\nPou...,0.998554
4358,Vintage Ale,5.0,Review 3000!\n\nEnjoyed at the end of 2022\nBo...,0.998554
2370,"Term Oil Vanilla, Pistachio & Granola",4.94,"Term Oil Vanilla, Pistachio & Granola pours bl...",0.998554


Interpretation: Bag of words similarity is based on the tf-idf values & doesn't take in any context. If a review has all 3 words ( sweet dark chocolate) then it will have high similarity score if any of the words were missing. It will consider  'not overly sweet' as in second review as 'sweet' and hence it has a high similary score of .81 despite not being similar.

In [5]:
# viewing the count of words in the review based on the features selected.
count_vectorizer = CountVectorizer(stop_words='english', vocabulary= selected_attribute.split(' '))
count_matrix1 = count_vectorizer.fit_transform(All_review)
count_matrix1.todense()

matrix([[1, 3, 2],
        [1, 0, 0],
        [3, 1, 4],
        ...,
        [0, 0, 0],
        [0, 3, 0],
        [0, 0, 0]], dtype=int64)

### Task D: For every review, perform a sentiment analysis (using VADER). In case you have to change the default values of words in the VADER lexicon, use this article:

In [62]:
# Initialize VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment score
def calculate_sentiment(review):
    return sia.polarity_scores(review)['compound']

# Apply sentiment analysis to each review
reviews_df['sentiment_score'] = reviews_df['user_review'].apply(calculate_sentiment)

# Display the DataFrame with the new sentiment scores
reviews_df.head()

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,sentiment_score
0,MonDak_Joe1953,Kentucky Brunch Brand Stout,4.71,Tap pour at the taproom. Served chilled in a s...,0.929577,0.9052
1,Rristow,Kentucky Brunch Brand Stout,5.0,The perfect barrel aged stout. Not overly swee...,0.57735,0.2216
2,BEERchitect,Kentucky Brunch Brand Stout,4.8,The flirtation with maple comes to a crescendo...,0.892974,0.9831
3,cdinardo21,Kentucky Brunch Brand Stout,4.98,On tap at TG for part of KBBS release day - ra...,0.0,0.9616
4,GreenBayBA,Kentucky Brunch Brand Stout,4.7,"Perfect black color. I didn't see much foam, o...",0.762652,0.9481


### Task E: Create an evaluation score for each beer that uses both similarity and sentiment scores [e.g., total score = average of (similarity score + sentiment score) or a multiplicative model, depending on what you consider to be more appropriate]. Now recommend three products to the customer.

In [63]:
# Calculate the evaluation score as an average of similarity and sentiment scores
reviews_df['evaluation_score_WOB'] = (reviews_df['WOB similarity score'] + reviews_df['sentiment_score']) / 2


# Now, to recommend three products to the customer, sort the DataFrame by 'evaluation_score' and select the top three beers
reviews_df.sort_values(by='evaluation_score_WOB', ascending=False).drop_duplicates(subset='beername',keep='first').head(3)


Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,sentiment_score,evaluation_score_WOB
4369,StonedTrippin,Big Black Voodoo Daddy,4.24,easily the best beer ive had from these guys s...,0.998554,0.9946,0.996577
287,defunksta,Bourbon County Brand Stout,4.0,2022 Version (bottled 10/2022 1.5 years old)\n...,0.998554,0.9888,0.993677
2329,Scotchboy,Firestone 27 - Anniversary Ale,4.25,Always destined to be a classic every year. Th...,0.998554,0.9867,0.992627


### Task F : Perform a similarity analysis using cosine similarity (with word embeddings) with the three attributes specified by the customer and the reviews.

#### Word2Vector: Similarity analysis using cosine similarity in Spacy (with word embeddings)based on the attributes specified by the customer and the reviews

In [64]:
#spacy has 300 vector word embeddings 

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    mytokens = nlp(sentence)
    punctuations = string.punctuation
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    return " ".join(mytokens)

reviews_df["word2vector similarity"] = reviews_df['user_review'].apply(lambda text: 
    nlp(selected_attribute).similarity(nlp(spacy_tokenizer(text))) 
    if text.strip() else 0.0)

#reviews_df.drop('Unnamed: 0',axis=1,inplace=True)
reviews_df.sort_values(by=['word2vector similarity'],ascending = False).iloc[0:3,:]

  nlp(selected_attribute).similarity(nlp(spacy_tokenizer(text)))


Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,sentiment_score,evaluation_score_WOB,word2vector similarity
3970,atigerlife,Edmund Fitzgerald,4.56,Pours dark with a 1-finger dark brown head tha...,0.742658,0.9022,0.822429,0.883249
1898,dafla67,For Cory,4.6,Pours very dark brown. Aroma is dominated by c...,0.815224,0.25,0.532612,0.882092
5209,dylandrinksbeers,Old Chub,4.29,"Dark brown color, sweet chocolate taste. Try a...",0.998554,0.4588,0.728677,0.880664


Interpretation: The spacy similarity are based on cosine similarity of the entire review vector & selected attribute vector in 300 dimension space. The shorter the review vector, better is the similarity scores. Higher length vectors will have low similarity scores. Also this takes into account the context, so coffee, chocolate (0.65) will have similar weights or are similar compared to say coffee & vanilla, nlp('coffee').similarity(nlp('vanilla')) = 0.55 

We can also improvise further by removing the stop words, punctuations & lemmatize the words for better similarity based on our problem statement.  

In [65]:
# Calculate the evaluation score as an average of similarity and sentiment scores
reviews_df['evaluation_score_Spacy'] = (reviews_df['word2vector similarity'] + reviews_df['sentiment_score']) / 2

# Now, to recommend three products to the customer, sort the DataFrame by 'evaluation_score'
# and select the top three beers

reviews_df.sort_values(by='evaluation_score_Spacy', ascending=False).drop_duplicates(subset='beername',keep='first').head(3)

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,sentiment_score,evaluation_score_WOB,word2vector similarity,evaluation_score_Spacy
2232,Beersnake,Mother of All Bricks,4.49,Poured at fridge temp. Pours a gorgeous dark b...,0.816497,0.9856,0.901048,0.851357,0.918479
2327,puck1225,Firestone 27 - Anniversary Ale,4.35,Had this beer from draft at Churchkey in DC. P...,0.816497,0.957,0.886748,0.872208,0.914604
1865,Brutaltruth,Maple Bourbon Barrel Paradise,4.43,From the 12 oz bottle in a snifter. This solid...,0.822808,0.9929,0.907854,0.831472,0.912186


#### How would your recommendation change if you use word vectors (e.g., the spaCy package with medium-sized pre-trained word vectors) instead of the plain vanilla Bag of Words cosine similarity? 

Yes, we get different recommendations with both WOB & Spacy similarity approaches. However we get one same recommendation Firestone 27 - Anniversary Ale for the beer attributes sweet dark chocolate 

With WOB model, our beer recommendations system recommends, Beer 
1. Big Black Voodoo Daddy
2. Bourbon County Brand Stout	
3. Firestone 27 - Anniversary Ale	

With Spacy word embeddings, the top 3 recommendations are:
1. Mother of All Bricks
2. Firestone 27 - Anniversary Ale	
3. Maple Bourbon Barrel Paradise

Clearly, the word vector approach differs to the BoW approach due to word vectors abilty to capture semantic info. This allows for a more nuanced undestanding of the text as BoW doesnt take into the account the sequence and context of the words. Word2Vector is usually more accurate as it taken into account semantic meaning also ( for e.g. chocolate is similar to coffee than orange). However, In countVector approach/WOB, if these 3 attributes exist in a review, then we would get a perfect score of 1 with WOB/token approach with countvector which may be higher than similarity score with Spacy as Spacy takes the averages of all the words vectors & compares with the word vectors of the 3 attributes.

### Task G: How would your recommendations differ if you ignored the similarity and feature sentiment scores and simply chose the three highest-rated products from your entire dataset? Would these products meet the requirements of the user looking for recommendations? Why or why not? Justify your answer with analysis. Use the similarity and sentiment scores as well as overall ratings to answer this question.

In [15]:
# Group the DataFrame by 'product_name' and calculate the average rating for each beer
average_ratings = reviews_df.groupby('beername')['user_rating'].mean().reset_index()

# Display the top 3 highest-rated beers
average_ratings.sort_values(by='user_rating', ascending=False).head(3)

Unnamed: 0,beername,user_rating
687,Saint Lamvinus,5.0
528,M.J.K.,4.994
487,Kentucky Brunch Brand Stout - Double Barrel Re...,4.94


In [17]:
reviews_df[reviews_df['beername']=='M.J.K.']

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,word2vector similarity,sentiment_score,evaluation_score_WOB,evaluation_score_Spacy
51,TonyLuvsBeer,M.J.K.,5.0,Absolute phenomenal brew. Super complex. Perfe...,0.0,0.445366,0.8225,0.41125,0.633933
52,cdinardo21,M.J.K.,4.98,4 oz bottle pour at the brewery for the 10th A...,0.0,0.595938,0.2263,0.11315,0.411119
53,BeerRay,M.J.K.,5.0,Unbelievable poor at side projection,0.0,0.220659,-0.3182,-0.1591,-0.04877
54,dontknojack,M.J.K.,5.0,This was by far not only the best barleywine I...,0.0,0.506324,-0.1744,-0.0872,0.165962
55,SadMachine,M.J.K.,4.99,2021 vintage bottle pour on site.\n\nA- Pours ...,0.57735,0.719121,0.9958,0.786575,0.85746


 If we only chose the 3 highest rated products and ignored the sentiment/similarity scores, these recommendations would only be based on overall ratings, and wouldnt factor in the relevance of the product, or the associated semantics. As such, we are simply prioritizing popularity, rather than user desires for specific attributes. Additionally, sometimes products with high rating but mixed or negative reviews may be incorrectly captured as highest rated, while those with slightly lower ratings but stellar reviews are ignored. As seen from the above, the 3 "highest rated" beers are drastically different than those that are shown as most recommended depending on our features. 
 
For e.g. for beer M.J.K, it's highest rated but the evaluation scores are low & mixed(some are positives while one is negative) and it may not be a good recommendation solely based on user ratings which can be a fake /flawed review/entry error.

### Task H: Using the top four attributes of beer (from word frequency analysis), calculate the lifts between these attributes and any 10 beers in your data. Choose one beer and find the most similar beer (among the remaining nine) using the lift values. Explain your method.

In [21]:
def strip_quotations_newline(text):
    text = text.replace('"', '').replace("'", '')
    
    # Remove newline characters
    text = text.replace('\n', ' ').replace('\r', '')
    return text

def expand_around_chars(text, characters):
    for char in characters:
        text = text.replace(char, " "+char+" ")
    return text

def split_text(text):
    text = strip_quotations_newline(text)
    text = expand_around_chars(text, '!@#$%^&*()_-+={[}]|;:\"<>?/.,')
    splitted_text = text.split(" ")
    text_lowercase = [x.lower() for x in splitted_text]
    cleaned_text = [x for x in text_lowercase if (len(x)>1) & (x not in stop_words)]
    return list(set(cleaned_text))

reviews_df['token'] = reviews_df['user_review'].apply(lambda x:split_text(x))

In [33]:
# Assume top_four_attributes is a list of the top four attributes obtained from word frequency analysis
top4_attributes = ['dark', 'light', 'sweet', 'chocolate']

#Selecting the 10 beers based on frequency
top_10_beers = reviews_df['beername'].value_counts()[:10].index
    
def filter_top_4attribues(words_list):
    matching_brands = [word for word in words_list if word in top4_attributes]
    return matching_brands if matching_brands else ['none']

# Apply the function to the DataFrame column
reviews_df['filtered_attributes'] = reviews_df['token'].apply(filter_top_4attribues)


Approach : Flatten the beername with each attribute for a review and then cacluate the lift

In [39]:
reviews_df[reviews_df['beername'].isin(top_10_beers)].groupby('beername').agg({'filtered_attributes':list})

Unnamed: 0_level_0,filtered_attributes
beername,Unnamed: 1_level_1
120 Minute IPA,"[[chocolate, dark], [light], [light], [none], ..."
Aecht Schlenkerla Weichsel,"[[dark], [sweet], [dark], [light], [light], [n..."
Anchor Porter,"[[light, chocolate, dark], [none], [none], [ch..."
Cuvée Delphine,"[[chocolate, dark], [light, dark], [none], [ch..."
IPA,"[[sweet], [light], [none], [sweet], [sweet], [..."
Imperial Stout,"[[none], [chocolate, light, dark], [chocolate,..."
Porter,"[[none], [sweet], [none], [chocolate, light, d..."
Spiced Imperial Stout Barrel-Aged,"[[chocolate, light, dark], [sweet, dark], [cho..."
Ten FIDY - Bourbon Barrel-Aged,"[[sweet, dark], [chocolate, sweet, dark], [non..."
Vanilla Porter,"[[none], [none], [chocolate, dark], [chocolate..."


In [58]:
new_df = pd.DataFrame(reviews_df[(reviews_df['beername'].isin(top_10_beers)) & (reviews_df['filtered_attributes'].apply(lambda x: x != ['none']))])
new_df['association'] = new_df.apply(lambda x: x['filtered_attributes'] + [x['beername']], axis=1)

new_df_association = new_df['association'].tolist()

In [59]:
#Assosciations between top 10 beers
beertrans = TransactionEncoder()
beertrans_ary = beertrans.fit(new_df_association).transform(new_df_association)
beertrans_df = pd.DataFrame(beertrans_ary, columns=beertrans.columns_)

# Compute the support
support = beertrans_df.mean()
support = pd.DataFrame(support, columns=['support']).sort_values('support',ascending=False)
support.head(20)

#Compute lift with minimum threshold of 1
beerrules = apriori(beertrans_df, min_support=0.05, use_colnames=True,max_len=2)
beerrules = association_rules(beerrules, metric="lift", min_threshold=1)
beerrules.sort_values(by=['lift'],ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(chocolate),(Anchor Porter),0.478261,0.072464,0.072464,0.151515,2.090909,0.037807,1.093168,1.0
3,(Anchor Porter),(chocolate),0.072464,0.478261,0.072464,1.0,2.090909,0.037807,inf,0.5625
0,(light),(Aecht Schlenkerla Weichsel),0.42029,0.07971,0.057971,0.137931,1.730408,0.02447,1.067536,0.728125
1,(Aecht Schlenkerla Weichsel),(light),0.07971,0.42029,0.057971,0.727273,1.730408,0.02447,2.125604,0.458661
15,(Imperial Stout),(chocolate),0.115942,0.478261,0.094203,0.8125,1.698864,0.038752,2.782609,0.465322
14,(chocolate),(Imperial Stout),0.478261,0.115942,0.094203,0.19697,1.698864,0.038752,1.100902,0.788462
31,(Ten FIDY - Bourbon Barrel-Aged),(chocolate),0.086957,0.478261,0.065217,0.75,1.568182,0.023629,2.086957,0.396825
30,(chocolate),(Ten FIDY - Bourbon Barrel-Aged),0.478261,0.086957,0.065217,0.136364,1.568182,0.023629,1.057208,0.694444
4,(Anchor Porter),(dark),0.072464,0.586957,0.065217,0.9,1.533333,0.022684,4.130435,0.375
5,(dark),(Anchor Porter),0.586957,0.072464,0.065217,0.111111,1.533333,0.022684,1.043478,0.842105


Interpretation: We see Anchor Porter is highly associated with Chocolate with a lift of 2. Now, among all the other beers, Imperial Stout is also associated with Chocolate with a lift of 1.69. So, we can say that Anchor Porter is similar to Imperial Stout as they are both positively associated with chocolate.