In [88]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [89]:
# Collect, for every beer listed on the BeerAdvocate ranking pages below, the
# relative link to its review page (`review_list`) and its display name
# (`beer_list`). The two lists are index-aligned: review_list[i] and
# beer_list[i] always describe the same beer.
review_list = []
beer_list = []
base_url = 'https://www.beeradvocate.com/beer/'
url_pages = ['top-rated', 'trending', 'top-new', 'fame', 'popular']
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}

# A beer can be listed on several ranking pages; remember the links already
# collected so each beer is queued (and later scraped) only once.
seen_links = set()

for url_page in url_pages:
    url = f'{base_url}{url_page}'
    # timeout: never hang forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        print(f'No ranking table found on {url}; skipping this page')
        continue

    for row in table.find_all('tr'):
        cells = row.find_all('td', class_='hr_bottom_light')
        # Header/spacer rows do not carry the beer cell; skip them explicitly
        # rather than swallowing every exception.
        if len(cells) < 2:
            continue

        beer_anchor = cells[1].find('a')
        if beer_anchor is None:
            continue

        href = beer_anchor.get('href')
        if not href or href in seen_links:
            continue

        seen_links.add(href)
        review_list.append(href)
        beer_list.append(beer_anchor.text)

In [90]:
from urllib.parse import urljoin

# Visit every beer page collected in `review_list` and pull out, per review,
# the reviewer name, the numeric rating and the review text. The four lists
# are index-aligned (one entry per review) and feed the DataFrame built in the
# next cell.
username = []
user_rating = []
user_review = []
beer_name = []

for link, beer in zip(review_list, beer_list):
    # urljoin yields a clean URL whether or not `link` starts with '/'
    # (plain string concatenation produced '...com//beer/...' in that case).
    beer_url = urljoin('https://www.beeradvocate.com/', link)

    # One unreachable beer page should not abort the whole scrape.
    try:
        beer_response = requests.get(beer_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f'Skipping {beer_url}: {exc}')
        continue
    if not beer_response.ok:
        print(f'Skipping {beer_url}: HTTP {beer_response.status_code}')
        continue

    beer_soup = BeautifulSoup(beer_response.content, 'html.parser')

    for review in beer_soup.find_all('div', id='rating_fullview_container'):
        user_box = review.find('div', id='rating_fullview_content_2')
        if user_box is None:
            continue

        user_link = user_box.find('a', class_='username')
        if user_link is None:
            continue

        # Preferred source of the rating is the BAscore_norm span; when it is
        # absent fall back to the second whitespace-separated token of the box
        # text, and to None when even that is missing.
        score_tag = user_box.find('span', class_='BAscore_norm')
        if score_tag is not None:
            rating = score_tag.text
        else:
            box_tokens = user_box.text.split()
            rating = box_tokens[1] if len(box_tokens) > 1 else None

        # The review body is read from the first <div> inside the box; rating-only
        # entries have none and get an empty string (filtered out downstream).
        text_div = user_box.find('div')
        review_text = text_div.text if text_div is not None else ''

        # Append all four fields together, only after every value was
        # extracted, so the lists can never get out of step.
        username.append(user_link.text)
        beer_name.append(beer)
        user_rating.append(rating)
        user_review.append(review_text)

In [132]:
# One row per scraped review, built from the four index-aligned lists.
review_columns = {
    'username': username,
    'beername': beer_name,
    'user_rating': user_rating,
    'user_review': user_review,
}
df = pd.DataFrame(review_columns)

# Keep only rows that carry review text, drop exact duplicate rows (first
# occurrence wins) and renumber the index from 0.
has_text = df['user_review'] != ''
reviews_df = (
    df[has_text]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

# Persist the cleaned reviews. The index is written as the first CSV column,
# which pandas names 'Unnamed: 0' when the file is read back.
reviews_df.to_csv('Reviews.csv')

In [133]:
# Preview the cleaned review table (pandas truncates the display to the first and last rows).
reviews_df

Unnamed: 0,username,beername,user_rating,user_review
0,MonDak_Joe1953,Kentucky Brunch Brand Stout,4.71,Tap pour at the taproom. Served chilled in a s...
1,Rristow,Kentucky Brunch Brand Stout,5,The perfect barrel aged stout. Not overly swee...
2,BEERchitect,Kentucky Brunch Brand Stout,4.8,The flirtation with maple comes to a crescendo...
3,cdinardo21,Kentucky Brunch Brand Stout,4.98,On tap at TG for part of KBBS release day - ra...
4,GreenBayBA,Kentucky Brunch Brand Stout,4.7,"Perfect black color. I didn't see much foam, o..."
...,...,...,...,...
5791,TurboMon_911,Rare Vos Amber Ale,4.14,"12 oz. bottle, (Best by 05/11/23); Poured a cl..."
5792,Davidstan,Rare Vos Amber Ale,4.39,Just now becoming a Belgian fan. This beer cut...
5793,Bluerabbitbell,Rare Vos Amber Ale,4.05,"Enjoyed at home, 12/1/22, from a Kroger pick 6..."
5794,BubbleBobble,Rare Vos Amber Ale,4.02,"Pours a dark, ruby amber, big 2"" tan head that..."


In [146]:
from sklearn.feature_extraction.text import CountVectorizer

# Reload the persisted reviews so the analysis can start from the CSV without
# re-running the scraper.
reviews_df = pd.read_csv('Reviews.csv')

# read_csv converts cells whose whole text is e.g. "NA" or "null" into NaN, and
# CountVectorizer refuses NaN documents. Coerce every review to a (possibly
# empty) string, both in the frame and in the list used for vectorising.
reviews_df['user_review'] = reviews_df['user_review'].fillna('').astype(str)
All_review = reviews_df['user_review'].tolist()

# Bag-of-words counts over all reviews, ignoring English stop words.
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(All_review)

# Vocabulary terms, in the same order as the columns of count_matrix.
feature_names = count_vectorizer.get_feature_names_out()

# Column sums = total occurrences of each term across all reviews
# (.A1 flattens the 1 x n matrix returned by the sparse sum).
word_frequencies = count_matrix.sum(axis=0).A1

# Term / frequency table.
# NOTE: this rebinds `df`, which previously held the raw scraped reviews; the
# name is kept unchanged for compatibility with the rest of the notebook.
df = pd.DataFrame({'Word': feature_names, 'Frequency': word_frequencies})

# Most frequent terms first.
df_sorted = df.sort_values(by='Frequency', ascending=False)

# Display the 50 most frequent words.
df_sorted.head(50)

Unnamed: 0,Word,Frequency
1580,beer,3727
6355,head,3647
12944,taste,3054
3749,dark,2670
7686,light,2233
12783,sweet,2125
5989,good,2000
2790,chocolate,1998
7701,like,1953
8202,medium,1902


### Task C: Perform a similarity analysis using cosine similarity (without word embeddings) with the three attributes specified by the customer and the reviews.



In [149]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Candidate attributes per category:
#   taste (flavour): sweet / roasty / orange / chocolate / malt
#   colour:          dark / light / brown / white / creamy
#   aroma:           sour / fruity / coffee
# NOTE(review): the spaCy comparison later in this notebook queries
# 'sweet dark chocolate', whereas this cell uses 'coffee' as the third
# attribute; confirm which attribute set the customer actually specified.
selected_features = ['sweet','dark','coffee']

# TF-IDF restricted to the selected attributes: one column per attribute, in
# the order given above.
tfidf_vectorizer = TfidfVectorizer(vocabulary=selected_features)
tfidf_matrix = tfidf_vectorizer.fit_transform(All_review)

# The customer's "ideal" review weights every attribute equally. Size the query
# vector from the feature list so the two can never drift apart when the
# attributes are changed.
query_vector = [[1] * len(selected_features)]
sim = cosine_similarity(tfidf_matrix, query_vector)

# cosine_similarity returns an (n_reviews, 1) array; flatten it to one score
# per review. Reviews containing none of the attributes score 0.
flattened_list = sim.ravel().tolist()
reviews_df['WOB similarity score'] = flattened_list

In [151]:
# The CSV round trip brought the saved index back as an 'Unnamed: 0' column;
# remove it. Rebinding with errors='ignore' (instead of an inplace drop) keeps
# the cell re-runnable: a second execution no longer raises KeyError once the
# column is gone.
reviews_df = reviews_df.drop(columns='Unnamed: 0', errors='ignore')
reviews_df

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score
0,MonDak_Joe1953,Kentucky Brunch Brand Stout,4.71,Tap pour at the taproom. Served chilled in a s...,0.730235
1,Rristow,Kentucky Brunch Brand Stout,5.00,The perfect barrel aged stout. Not overly swee...,0.809192
2,BEERchitect,Kentucky Brunch Brand Stout,4.80,The flirtation with maple comes to a crescendo...,0.895796
3,cdinardo21,Kentucky Brunch Brand Stout,4.98,On tap at TG for part of KBBS release day - ra...,0.577350
4,GreenBayBA,Kentucky Brunch Brand Stout,4.70,"Perfect black color. I didn't see much foam, o...",0.745166
...,...,...,...,...,...
5791,TurboMon_911,Rare Vos Amber Ale,4.14,"12 oz. bottle, (Best by 05/11/23); Poured a cl...",0.577350
5792,Davidstan,Rare Vos Amber Ale,4.39,Just now becoming a Belgian fan. This beer cut...,0.000000
5793,Bluerabbitbell,Rare Vos Amber Ale,4.05,"Enjoyed at home, 12/1/22, from a Kroger pick 6...",0.000000
5794,BubbleBobble,Rare Vos Amber Ale,4.02,"Pours a dark, ruby amber, big 2"" tan head that...",0.577350


### Task F: Recommend three products to the customer. Will the recommendations change between the bag-of-words and word-vector approaches?

In [152]:
import string

# `spacy` itself must be imported: `from spacy.lang.en import ...` alone does
# not bind the name `spacy` used by spacy.load below.
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# en_core_web_lg provides 300-dimensional word vectors.
nlp = spacy.load('en_core_web_lg')

# The customer's desired attributes, embedded once and compared to every review.
doc_1 = nlp('sweet dark chocolate ')
reviews_df["spacy similarity"] = reviews_df['user_review'].apply(lambda text: doc_1.similarity(nlp(text)))

# Second variant: same similarity, but on reviews that were lemmatised and
# stripped of stop words and punctuation first.
stop_words = STOP_WORDS
punctuation_chars = set(string.punctuation)
parser = English()  # not used in this cell; kept in case other cells reference it


def spacy_tokenizer(sentence):
    """Return `sentence` as a space-joined string of cleaned lemmas.

    Each token is replaced by its lower-cased, stripped lemma (the lower-cased
    surface form is used when the lemma is the '-PRON-' pronoun placeholder
    emitted by older spaCy versions). Empty strings, stop words and tokens made
    up entirely of punctuation characters are dropped.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces ('' if nothing is left).
    """
    lemmas = (
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in nlp(sentence)
    )
    # Test character by character: a plain `lemma not in string.punctuation`
    # is a substring check, so multi-character tokens such as '...' or '--'
    # would slip through.
    kept = [
        lemma for lemma in lemmas
        if lemma
        and lemma not in stop_words
        and not all(char in punctuation_chars for char in lemma)
    ]
    return " ".join(kept)


reviews_df["spacy no_stop_pun similarity"] = reviews_df['user_review'].apply(lambda text: doc_1.similarity(nlp(spacy_tokenizer(text))))


  reviews_df["spacy similarity"] = reviews_df['user_review'].apply(lambda text: doc_1.similarity(nlp(text)))


In [155]:
# First rows with all three similarity columns side by side, for comparison.
reviews_df.head()

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,spacy similarity,spacy no_stop_pun similarity
0,MonDak_Joe1953,Kentucky Brunch Brand Stout,4.71,Tap pour at the taproom. Served chilled in a s...,0.730235,0.43576,0.822693
1,Rristow,Kentucky Brunch Brand Stout,5.0,The perfect barrel aged stout. Not overly swee...,0.809192,0.591384,0.687496
2,BEERchitect,Kentucky Brunch Brand Stout,4.8,The flirtation with maple comes to a crescendo...,0.895796,0.441235,0.788287
3,cdinardo21,Kentucky Brunch Brand Stout,4.98,On tap at TG for part of KBBS release day - ra...,0.57735,0.367552,0.68836
4,GreenBayBA,Kentucky Brunch Brand Stout,4.7,"Perfect black color. I didn't see much foam, o...",0.745166,0.471919,0.796873


Interpretation: Bag-of-words similarity is based on the TF-IDF values and does not take context into account. A review that contains all 3 words (sweet, dark, chocolate) gets a higher similarity score than one in which any of the words is missing. It treats 'not overly sweet' in the second review as simply 'sweet', so that review gets a high similarity score of 0.81 despite not actually being similar.

spaCy, on the other hand, represents each word as a 300-dimensional embedding trained on a large general corpus and therefore takes semantic similarity into account; e.g. dark/rich/brown are treated as related. It gives the 2nd review a more appropriate similarity score of 0.59, because 'not overly sweet' is not the same as 'sweet'.

Depending on the problem statement, we can improve this further by removing stop words and punctuation and lemmatizing the words. The results are shown in the 'spacy no_stop_pun similarity' column.

#### Top 3 recommendations for beer based on bag-of-words similarity

In [157]:
# Rank reviews by bag-of-words similarity (ties broken by user rating), then
# keep only the best-ranked review of each beer so that the three rows shown
# are three distinct products rather than possibly three reviews of one beer.
(
    reviews_df
    .sort_values(by=['WOB similarity score', 'user_rating'], ascending=False)
    .drop_duplicates(subset='beername', keep='first')
    .head(3)
)

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,spacy similarity,spacy no_stop_pun similarity
1275,mwilbur,Bourbon County Brand Backyard Stout (2023),4.49,2023 variant. Best by 11OCT28.\nPoured from 50...,0.998081,0.501352,0.781098
2371,BEERchitect,"Term Oil Vanilla, Pistachio & Granola",4.44,"When hitting the road, there's a need that's b...",0.992272,0.422162,0.776996
736,Rug,Canuckley,4.34,2022 vintage\n\nI picked this one up on a whim...,0.992227,0.520374,0.794323


#### Top 3 recommendations for beer based on word-embedding (spaCy) similarity

In [160]:
# Top 3 beer recommendations based on spaCy word-embedding similarity.
# Rank reviews by 'spacy similarity' (ties broken by user rating), then keep
# only the best-ranked review of each beer so that the three rows shown are
# three distinct products rather than possibly three reviews of one beer.
(
    reviews_df
    .sort_values(by=['spacy similarity', 'user_rating'], ascending=False)
    .drop_duplicates(subset='beername', keep='first')
    .head(3)
)

Unnamed: 0,username,beername,user_rating,user_review,WOB similarity score,spacy similarity,spacy no_stop_pun similarity
1389,Argail,Rested Reserve,4.79,tasty well made stout! 67F cherries wood acidi...,0.816497,0.720092,0.869745
4862,tyden46,Dirty Bastard,4.0,Strong dark fruit flavors with roasted bittern...,0.57735,0.71902,0.808849
2000,drdiesel9483,Space Between Truths,4.9,Look - black thick very slight head and rimmin...,0.960166,0.696933,0.814474
