# Beer Recommender System

In [None]:
import sys
sys.path.insert(0,'chromedriver')
import os 
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# importing packages for NLP
import re
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk import word_tokenize, FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# importing packages to scrape web
from selenium import webdriver
from selenium.webdriver.common.by import By
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

In [2]:
stop_words = stopwords.words('english') 

# Web scraping

We scarped all the data (total ~13k rows) and kept the reviews for the products which had atlead 15 reviews which resulted in 6.8k reviews which we are using for subsequent analysis 

In [25]:
driver = webdriver.Chrome('chromedriver', options=chrome_options)
url_list = ['https://www.beeradvocate.com/beer/popular/','https://www.beeradvocate.com/beer/worst/',\
            'https://www.beeradvocate.com/beer/top-rated/','https://www.beeradvocate.com/beer/fame/','https://www.beeradvocate.com/beer/top-styles/','https://www.beeradvocate.com/beer/trending/']

correct_links = []
for a in url_list:
    driver.get(a)
    links_list = []
    links = driver.find_elements(By.TAG_NAME,'a')
    for lnk in links:
       links_list.append(lnk.get_attribute('href'))
        
    for i in links_list:
        if "/profile/" in str(i) and i.count('/') == 7:
            correct_links.append(i)
correct_links_nd = list(set(correct_links))

driver.quit()

In [None]:
comments = []
product_name = []
driver = webdriver.Chrome('chromedriver',options=chrome_options)

for j in range(len(correct_links_nd)):
    driver.get(str(correct_links_nd[j]))
    beer = driver.find_element(By.CLASS_NAME, 'titleBar').text.split('\n')[0]
    review = driver.find_elements(By.CLASS_NAME,'user-comment')
    for i in review:
        comment = i.find_element(By.ID,'rating_fullview_content_2').text
        comments.append(comment)
        product_name.append(beer)
    print(j)
driver.quit()

In [13]:
# removes extra data that comes with each comment
def split_comment(s):
    split_result = s.split('\n')
    comment_body = split_result[5:-2]
    return comment_body

#selecting only beer profile links not company/place profiles
def altLinks(l):
    return l[::2]

# converts comments to a string and removes brackets
def list_to_string(x):
    string = str(x)
    return string.rstrip(']').lstrip('[')

# Function to replace \n and "," with blanks('')
def replacenewline(x):
    return(str(x).replace('\n', ' ').replace(",", ""))

# Function to replace "\'" with blanks('')
def replaceapostrophe(z):
    return(str(z).replace("'", "").replace('"', ""))

In [34]:
#Cleaning the scraped data

df = pd.DataFrame(columns = ["product_name", "user_rating","raw_comments", "product_review"])
df['product_name'] = product_name
df['raw_comments'] = comments
string = 'Rated: '
df['user_rating'] = df['raw_comments'].map(lambda x: x.split('\n')[2].split('/')[0] if(string not in x) else np.nan)
df['product_review'] = df['raw_comments'].map(lambda x: (' '.join(x.split('\n')[4:-1])).lower() if ('Rated: ' not in x) else np.nan)
df.drop(columns = ['raw_comments'], inplace = True)
df.dropna(inplace = True)
df.reset_index(drop = True, inplace = True)
df['product_review'] = df['product_review'].map(lambda x: re.sub(r'[^\w\s]', '', x.replace("\n", " ")))
df.to_csv('Comments.csv')
df

Unnamed: 0,product_name,user_rating,product_review
0,Dinner,4.78,been waiting over a decade to try this beer an...
1,Dinner,4.47,pours a dirty darker straw color cloudy but se...
2,Dinner,4.14,ive never come across this highly regarded bee...
3,Dinner,4.36,from a 169oz bottle dated 072922 served in a s...
4,Dinner,4.79,just an amazing step up from lunch another cla...
...,...,...,...
13275,Interlude,4.56,cloudy copper not much head to speak of no ret...
13276,Interlude,3.84,20180708 750ml bottle served in a pair of snif...
13277,Interlude,4.34,tastes more like a mild sour but thats not a b...
13278,Interlude,4.25,vinous light tartness plum earthy brett grape ...


# Start the code from here

In [15]:
df = pd.read_csv("Comments.csv", index_col=0)
df.head()

In [17]:
#Filter for brands with greater than 15 reviews
x = (df[['product_name']].value_counts() > 15).reset_index()

df  = df[df['product_name'].isin(x[x[0] == True]['product_name'])]

df.head()

Unnamed: 0,product_name,user_rating,product_review,cleaned_review
18,Lagerbier Hell,4.33,usually the first sip is decisive and this bee...,usually the first sip is decisive and this bee...
19,Lagerbier Hell,5.00,favourite less bitter than pilsner typical mun...,favourite less bitter than pilsner typical mun...
20,Lagerbier Hell,3.78,drank straight from the bottle at the font bar...,drank straight from the bottle at the font bar...
21,Lagerbier Hell,3.65,untappd backlog,untappd backlog
22,Lagerbier Hell,4.43,pours a gorgeous crystal clear yellow with thi...,pours a gorgeous crystal clear yellow with thi...
...,...,...,...,...
13236,Tripel Karmeliet,4.29,honeypeachgold colored with a tall foamy head ...,honeypeachgold colored with a tall foamy head ...
13237,Tripel Karmeliet,4.33,poured from a 330ml bottle into a boulevard sm...,poured from a ml bottle into a boulevard smoke...
13238,Tripel Karmeliet,4.75,wonderfully balanced rich flavor not overpowering,wonderfully balanced rich flavor not overpowering
13239,Tripel Karmeliet,4.82,112oz bottle life can be an utter nightmare d...,oz bottle life can be an utter nightmare devo...


# EDA

Look at the top adjectives and assess frequency to identify attributes. Also, look at complete word-frequency df to identify additional attributes that are not adjectives

In [18]:
# cleaning the reviews for analysis
reviews = df.product_review.copy()
regex_pattern = "[^a-zA-Z\s]" # Regex to match everything that is not a character or white space.
df.loc[:,'cleaned_review'] = df.product_review.str.lower().str.replace(pat=regex_pattern,repl='',regex=True) # lower case and strip out stuff

In [19]:
# Overall Word Frequency
entire_corpus = df.cleaned_review.str.cat(sep=' ') # Entire corpus in one big string
all_tokens = nltk.word_tokenize(entire_corpus) # Tokenize everything
tokens_no_stop_words = [token for token in all_tokens if token not in stop_words] # Remove stop words from all tokens
word_counts_all = pd.DataFrame(data=nltk.FreqDist(tokens_no_stop_words).most_common(), columns=['word','frequency'])
#word_counts_all.head()

# Overall Adjective Frequency
tagged_tokens = nltk.pos_tag(tokens_no_stop_words) # Here, we get the parts of speech for each token, this is needed to filter by adjectives in a minute
adjectives_only = [word for word, tag in tagged_tokens if tag in ['JJ','JJR','JJS']] # Filter for adjectives
word_counts_adjectives = pd.DataFrame(data=nltk.FreqDist(adjectives_only).most_common(), columns=['word','frequency']) 
#word_counts_adjectives.head()

We eyeballed the Adjective frequency and the overall word frequency to identify important attributes. Please refer the list below. 

['light','sweet','clear','dark','golden','strong','black','brown','bitter','smooth','moderate','floral','earthy','creamy','pale','yellow','fresh','bready','malt','mild','bourbon','chocolate','citrus','coffee','mouthfeel','hazy','vanilla','orange','roasted','tropical','fruit','thick','caramel','maple','hops']

In [20]:
#List of Attributes identified for this analysis 
attributes = ['light','sweet','clear','dark','golden','strong','black','brown','bitter','smooth','moderate','floral','earthy','creamy','pale','yellow','fresh','bready','malt','mild','bourbon','chocolate','citrus','coffee','mouthfeel','hazy','vanilla','orange','roasted','tropical','fruit','thick','caramel','maple','hops']
word_counts_adjectives[word_counts_adjectives['word'].isin(attributes)].sort_values(by='frequency', ascending=False)[:20]

Unnamed: 0,word,frequency
1,light,2162
2,sweet,2032
8,clear,1213
10,dark,1076
14,golden,837
16,strong,768
17,black,751
18,brown,701
20,bitter,673
21,smooth,662


# Recommender System using bag-of-words and sentiment analysis

In [45]:
count_vectorizer = CountVectorizer()

def cosine_bow(data):
    final_data=[data,required_attributes]
    vector_matrix = count_vectorizer.fit_transform(final_data)
    tokens = count_vectorizer.get_feature_names()
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    return cosine_similarity_matrix[0][1]

def remove_stop_words(input_text):
    text_tokens = word_tokenize(input_text)
    tokens_without_sw =' '.join([word for word in text_tokens if not word in stopwords.words('english')])
    return tokens_without_sw

In [26]:
#df = pd.read_csv('Comments.csv')
df_2 = df[~df["cleaned_review"].isna()].copy()

# removing strop words
df_2['cleaned_review_wo_stopwords'] = df_2['cleaned_review'].apply(remove_stop_words)


### User Input needed below - Input 3 attributes for recommendation

We are using the following atatributes - dark, strong and bitter

In [27]:
# user can add inputs per their choice
attribute_list = ['dark', 'strong', 'bitter']
required_attributes = ' '.join(attribute_list)

In [28]:

df_2['cosine_similarity_score'] = df_2['cleaned_review_wo_stopwords'].apply(cosine_bow)

df_3 = df_2[['product_name','product_review','cosine_similarity_score']].copy()

# writing a csv
df_3.to_csv("bow_similarity_score.csv")

# preview
df_3.sort_values(by=['cosine_similarity_score'], ascending = False)



Unnamed: 0,product_name,product_review,cosine_similarity_score
5338,Gulden Draak,went all the way to grace in growlers in oahu ...,0.458831
6179,Icelandic Toasted Porter,l dark brown with mahogany highlights with a ...,0.455104
4003,Arrogant Bastard Ale,pours dark ruby light tan head floral hoppy sm...,0.433013
12286,Export Stout London 1890,very dark brown with a mocha head aroma is ver...,0.423207
12024,Big Bad Baptist,dark dark thick back strap molasses colorwhisk...,0.421637
...,...,...,...
5261,Vintage Ale,2011,0.000000
5260,Vintage Ale,2000 vintage cellar aged deep copper color exc...,0.000000
5259,Vintage Ale,2014 vintage appearance pours hazy and burnt...,0.000000
5258,Vintage Ale,2011,0.000000


### Sentiment analysis using Vader

In [29]:
vader = SentimentIntensityAnalyzer()
df_2['polarity_scores'] = df_2['cleaned_review'].apply(lambda review: vader.polarity_scores(review)) # note here we are not removing stop words
df_2['overall_sentiment_score']  = df_2['polarity_scores'].apply(lambda score_dict: score_dict['compound']) #extracting the compound score
df_2[['product_name','product_review','cosine_similarity_score','overall_sentiment_score']]

Unnamed: 0,product_name,product_review,cosine_similarity_score,overall_sentiment_score
18,Lagerbier Hell,usually the first sip is decisive and this bee...,0.000000,0.9116
19,Lagerbier Hell,favourite less bitter than pilsner typical mun...,0.166667,0.8655
20,Lagerbier Hell,drank straight from the bottle at the font bar...,0.000000,0.2263
21,Lagerbier Hell,untappd backlog,0.000000,0.0000
22,Lagerbier Hell,pours a gorgeous crystal clear yellow with thi...,0.000000,0.9421
...,...,...,...,...
13236,Tripel Karmeliet,honeypeachgold colored with a tall foamy head ...,0.000000,0.9329
13237,Tripel Karmeliet,poured from a 330ml bottle into a boulevard sm...,0.000000,0.8973
13238,Tripel Karmeliet,wonderfully balanced rich flavor not overpowering,0.000000,0.8176
13239,Tripel Karmeliet,112oz bottle life can be an utter nightmare d...,0.000000,0.8385


### Calculating a compound score using cosine similarity and sentiment analysis

We have considered a multiplicative model here - becuase if we take an additive model - the reviews that have no similarity but some sentiment will have some score associated with them.

Hence, we multiplied the cosine similarity with sentiment score and took average of that score at a product level\

Then, the top beers with highest score will be recommended to the customer based on this score

In [30]:
df_2['final_score'] = df_2['cosine_similarity_score'] * df_2['overall_sentiment_score']
df_bow_beer_score = df_2.groupby('product_name')['user_rating',"cosine_similarity_score",'overall_sentiment_score','final_score'].mean().reset_index().sort_values(by = ['final_score'], ascending=False)
df_bow_beer_score[:3]

  df_bow_beer_score = df_2.groupby('product_name')['user_rating',"cosine_similarity_score",'overall_sentiment_score','final_score'].mean().reset_index().sort_values(by = ['final_score'], ascending=False)


Unnamed: 0,product_name,user_rating,cosine_similarity_score,overall_sentiment_score,final_score
314,Trois Pistoles,4.382353,0.143197,0.865829,0.127207
98,Darkness,4.535,0.144346,0.6974,0.126762
18,Anchor Porter,4.166667,0.145855,0.7668,0.116442


In [31]:
# list of beers that we are recommending per bow cosine similarity
beer_reco_bow = df_bow_beer_score[:3]['product_name'].to_list()
beer_reco_bow

['Trois Pistoles', 'Darkness', 'Anchor Porter']

# Recommender System using word embeddings

In [32]:
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Downloading the small model containing tensors.
!python -m spacy download en_core_web_sm

# Downloading over 1 million word vectors.
!python -m spacy download en_core_web_lg

In [34]:
def similarity_word_embeddings(review):
    review_nlp = nlp(review)
    score = review_nlp.similarity(attributes_nlp)
    return score

nlp = spacy.load('en_core_web_lg')
attributes_nlp = nlp(required_attributes)
df_2['word_embedding_similarity'] = df_2['cleaned_review_wo_stopwords'].apply(similarity_word_embeddings)
df_2.head()

  score = review_nlp.similarity(attributes_nlp)


Unnamed: 0,product_name,user_rating,product_review,cleaned_review,cleaned_review_wo_stopwords,cosine_similarity_score,polarity_scores,overall_sentiment_score,final_score,word_embedding_similarity
18,Lagerbier Hell,4.33,usually the first sip is decisive and this bee...,usually the first sip is decisive and this bee...,usually first sip decisive beer made shout who...,0.0,"{'neg': 0.021, 'neu': 0.846, 'pos': 0.133, 'co...",0.9116,0.0,0.713669
19,Lagerbier Hell,5.0,favourite less bitter than pilsner typical mun...,favourite less bitter than pilsner typical mun...,favourite less bitter pilsner typical munich b...,0.166667,"{'neg': 0.087, 'neu': 0.523, 'pos': 0.39, 'com...",0.8655,0.14425,0.714128
20,Lagerbier Hell,3.78,drank straight from the bottle at the font bar...,drank straight from the bottle at the font bar...,drank straight bottle font bar manchester city...,0.0,"{'neg': 0.06, 'neu': 0.831, 'pos': 0.108, 'com...",0.2263,0.0,0.634419
21,Lagerbier Hell,3.65,untappd backlog,untappd backlog,untappd backlog,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.096756
22,Lagerbier Hell,4.43,pours a gorgeous crystal clear yellow with thi...,pours a gorgeous crystal clear yellow with thi...,pours gorgeous crystal clear yellow thick fluf...,0.0,"{'neg': 0.036, 'neu': 0.704, 'pos': 0.26, 'com...",0.9421,0.0,0.749954


In [35]:
df_we_similarity = df_2[['product_name','user_rating',"cosine_similarity_score",'overall_sentiment_score','word_embedding_similarity']].copy()
beer_reco_word_embedding = df_we_similarity.groupby(by='product_name').mean().sort_values(by='word_embedding_similarity', ascending=False)[:3].index.to_list()
df_we_similarity.groupby(by='product_name').mean().sort_values(by='word_embedding_similarity', ascending=False)[:3]

Unnamed: 0_level_0,user_rating,cosine_similarity_score,overall_sentiment_score,word_embedding_similarity
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Black Chocolate Stout,4.316,0.147544,0.642375,0.796689
Dark,4.195556,0.058727,0.886867,0.788704
Kalamazoo Stout,3.9525,0.136247,0.647694,0.78632


As we can see that the top 3 beers have changed and this makes more sense as all the beers are dark and bitter beers (Stouts)

In [36]:
def check_presence_of_word(text):
    a = text.find(attribute)
    if a==-1:
        b = 0
    else:
        b=1
    return b

# Comparing both the methods

In [37]:
# let's look into % of reviews that contain the attributes for bow recommended beers
df_bow = df_2.loc[df_2['product_name'].isin(beer_reco_bow),['product_name','cleaned_review']].copy()
col_list = ['product_name']
col_list.extend(attribute_list)
df_bow_per_attribute = pd.DataFrame(columns=col_list)
df_bow_per_attribute['product_name'] = beer_reco_bow

for attribute in attribute_list:
    df_bow[attribute] = df_bow['cleaned_review'].apply(check_presence_of_word)

df_bow_2 = df_bow[col_list].groupby(by='product_name').aggregate(["sum","count"]).reset_index()

for attribute in attribute_list:
    df_bow_per_attribute [attribute] = df_bow_2[attribute]['sum']/df_bow_2['dark']['count']

df_bow_per_attribute

Unnamed: 0,product_name,dark,strong,bitter
0,Trois Pistoles,0.761905,0.047619,0.380952
1,Darkness,0.65,0.2,0.45
2,Anchor Porter,0.764706,0.294118,0.235294


In [38]:
# let's look into % of reviews that contain the attributes for word embedding recommended beers
df_we = df_2.loc[df_2['product_name'].isin(beer_reco_word_embedding),['product_name','cleaned_review']].copy()
col_list = ['product_name']
col_list.extend(attribute_list)
df_we_per_attribute = pd.DataFrame(columns=col_list)
df_we_per_attribute['product_name'] = beer_reco_word_embedding

for attribute in attribute_list:
    df_we[attribute] = df_we['cleaned_review'].apply(check_presence_of_word)

df_we_2 = df_we[col_list].groupby(by='product_name').aggregate(["sum","count"]).reset_index()

for attribute in attribute_list:
    df_we_per_attribute [attribute] = df_we_2[attribute]['sum']/df_we_2['dark']['count']

df_we_per_attribute

Unnamed: 0,product_name,dark,strong,bitter
0,Black Chocolate Stout,0.6,0.1,0.7
1,Dark,0.5,0.0,0.444444
2,Kalamazoo Stout,0.625,0.1875,0.5


From the above two tables, we can see that the BoW model has more % of reviews in each attribute for all the beer that it recommends than word embeddings. This will happen in most of the cases because the bow model does the exact match so the reviews of top product it recommends should contain those exact words. So it might not help us conclusively prove if one method is performing better than other for the attributes that we have selected.

For the attributes that we considered, both word embedding and bow are giving good results. However, one thing we noticed is that in word embedding similarity scores, there isn't much difference amongst beers. So if the attributes we have are more obscure, we should go with bow models.

We qualitatively looked into the beers that met this criteria and found that recommendations from both word embeddings and bow cosine similarity made sense for the attributes that we selected.

# Evaluating advantage of recommender system over user ratings

In [39]:
top_3_rated_beers = df.groupby('product_name')['user_rating'].mean().sort_values(ascending=False).reset_index()['product_name'][:3]
df.groupby('product_name')['user_rating'].mean().sort_values(ascending=False)[:3]

product_name
Trappist Westvleteren 12 (XII)    4.683913
Black Tuesday                     4.681875
Kentucky Brunch Brand Stout       4.661875
Name: user_rating, dtype: float64

The top 3 recommendations as per Average User Rating (filtered for beers with atleast 15 reviews) are - 

1. Trappist Westvleteren 12 (XII)
2. Black Tuesday
3. Kentucky Brunch Brand Stout 

In [40]:
df_2[df_2['product_name'].isin(top_3_rated_beers)].groupby('product_name')['cosine_similarity_score','overall_sentiment_score','final_score','word_embedding_similarity'].mean()

  df_2[df_2['product_name'].isin(top_3_rated_beers)].groupby('product_name')['cosine_similarity_score','overall_sentiment_score','final_score','word_embedding_similarity'].mean()


Unnamed: 0_level_0,cosine_similarity_score,overall_sentiment_score,final_score,word_embedding_similarity
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Black Tuesday,0.073625,0.745481,0.067101,0.705261
Kentucky Brunch Brand Stout,0.008197,0.662919,0.00776,0.684135
Trappist Westvleteren 12 (XII),0.040979,0.859713,0.037005,0.702696


As we can see from the above table, the Cosine Similarity Scores are very low (less than <10%) for the 3 top beers basis avg. user rating. 

The sentiment rating is high, however on its own it is not a reliable score to use basis input attributes as it doesn't take into consideration the attributes but uses the overall review. Additionally, since these are top 3 rated beers they are bound to have a high sentiment score.

The Evaluation Score (final_score) is also very low as it is a multiplication of cosine similarity and sentiment score, which again shows that these 3 beers are not recommendations for the 3 selected attributes.

Word embedding similarity scores are relatively better but are still low, i.e. all 3 beers have only approximately 60% similarity.

Hence, we can conclude that recommending just the top 3 beers from the entire database without considering the attribute relevance of the product will lead to incorrect/ sub-optimal recommendations

# Association (Lift analysis)

The top 4 attributes identified previously in Task B are - 'light','malt','sweet','dark'

In [41]:
top_attr = ['light','malt','sweet','dark']

In [42]:
# Top 10 beers with highest comments

top_10_beer = df.groupby('product_name')['product_name'].count().sort_values(ascending=False)[:10]
top_10_beer = top_10_beer.index.to_list()
top_10_beer

['Oktoberfest',
 'IPA',
 'Festbier',
 'Tripel',
 'Imperial Stout',
 'Porter',
 'Pale Ale',
 'Pliny The Elder',
 'AleSchmidt Oktoberfest',
 'Storm King']

In [None]:
df_5 = df_2.copy()

def lw(y):
    comment_tokens = word_tokenize(y)
    new_comment = []
    
    for i in comment_tokens:
        new_comment.append(i)
    
    return new_comment

df_5['new_comment_tokens'] = df_5['cleaned_review_wo_stopwords'].apply(lw)

def lift(n, a, b, ab):
    l = ((n*ab)/(a*b))
    return (l)

lift_db = df_5.copy()
lift_db = lift_db[['product_name','product_review','new_comment_tokens']].apply(pd.Series.explode).set_index(['product_name','product_review']).reset_index().drop_duplicates().copy()

lift_values = pd.DataFrame(columns=['word_1','word_2','lift_val'])

for x1 in top_10_beer:
    for x2 in top_attr:
        lift_db['x1'] = 0
        lift_db['x2'] = 0
        
        lift_db['x1'][lift_db['product_name'] == x1] = 1
        lift_db['x2'][lift_db['new_comment_tokens'] == x2] = 1
        
        c = lift_db.groupby(['product_name','product_review'])[['x1','x2']].sum().reset_index()   
        
        a = lift_db[lift_db['product_name']==x1]['product_review'].drop_duplicates().count()
        b = lift_db['x2'].sum()
        ab = c[(c['x1']>0) & (c['x2']==1)]['x2'].count()
        n = df_2['product_review'].count()
        
        lift_val = lift(n, a, b, ab)
        
        app_dict = {}
        app_dict['word_1'] = x1
        app_dict['word_2'] = x2
        app_dict['lift_val'] = lift_val
        
        lift_values = lift_values.append(app_dict, ignore_index=True)
        

In [44]:
similarity = pd.crosstab(lift_values['word_1'], lift_values['word_2'], lift_values['lift_val'], aggfunc=np.mean)
similarity

# Below is the Similarity Matrix

word_2,dark,light,malt,sweet
word_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AleSchmidt Oktoberfest,0.781973,0.610486,2.059483,1.818854
Festbier,0.832971,1.560721,1.504318,1.107129
IPA,0.228075,0.712234,1.16132,0.571307
Imperial Stout,3.508853,0.788936,0.887162,1.205394
Oktoberfest,0.85407,1.260805,1.472312,1.500331
Pale Ale,0.331746,0.621586,0.698976,0.474852
Pliny The Elder,0.18246,0.569787,1.025165,1.305844
Porter,2.958817,1.478366,1.350724,1.058792
Storm King,2.736906,0.488389,1.510288,0.699559
Tripel,0.0,0.874557,0.894039,1.548792


Based on the above Similarity matrix, we can conclude that Festbier is most similar to Oktoberfest. For these 2 beers, the lifts for dark, light and malt attributes the lifts are almost comparable; only for the attribute "sweet" there is some difference. Otherwise, they are mostly similar.

