<a href="https://colab.research.google.com/github/sophielouie/beer-recommendation-system/blob/main/Node2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install surprise



In [72]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split
import numpy as np
from scipy import spatial
import statistics as stats
import math
from networkx.algorithms.dag import topological_sort
import networkx as nx
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from node2vec import Node2Vec
import pickle

In [73]:
from node2vec import Node2Vec as n2v

In [74]:
import node2vec

In [6]:
!pip install node2vec



In [None]:
pd.set_option('display.max_columns', None)

# Data Collection

In [4]:
def clean_beer_reviews():
    # storing beer review dataset
    beer_reviews = pd.read_csv("beer_reviews.csv", encoding="utf-8")
    # creating a unique identifier for each beer using brewery name and beer name
    beer_reviews['Unique Beer Name'] = beer_reviews['brewery_name'] + ' ' + beer_reviews['beer_name']
    # storing beer profile dataset
    beer_profile = pd.read_csv("beer_profile_ratings.csv", encoding="utf-8")
    # columns to drop from beer reviews
    drop_cols = ['brewery_id', 'brewery_name',  'beer_name', 'beer_abv', 'beer_beerid']
    # dropping columns from beer reviews
    beer_reviews.drop(columns = drop_cols, inplace = True)
    # columns to drop from  beer profile
    drop_cols = ['Name', 'Style', 'Brewery', 'Description',
                 'Min IBU', 'Max IBU', 'Alcohol', 'review_aroma', 'review_appearance', 'review_palate',
                 'review_taste', 'review_overall', 'number_of_reviews']
    # dropping columns from beer profile
    beer_profile.drop(columns = drop_cols, inplace = True)
    # combining beer review and beer profile datasets to have profile of each beer attached to every review
    df_beer = pd.merge(beer_reviews, beer_profile, left_on = 'Unique Beer Name', right_on = 'Beer Name (Full)', how = 'inner')
    # isolating the numerical columns that need to be scaled
    need_scaling = df_beer.drop(columns = ['review_time', 'review_profilename', 'beer_style', 'Unique Beer Name', 'Beer Name (Full)'])
    # storing the informational portion of the dataset that does not need scaling
    informational = df_beer[['review_time', 'review_profilename', 'beer_style', 'Unique Beer Name', 'Beer Name (Full)']]
    # renaming beer name column
    informational.rename(columns = {'Beer Name (Full)': 'Beer Name'}, inplace = True)

    # scaling the data
    scaler = MinMaxScaler()
    scaler.fit(need_scaling)
    need_scaling = pd.DataFrame(scaler.transform(need_scaling), columns = need_scaling.columns)

    # recombining the informational data and scaled data
    df = pd.concat([informational, need_scaling], axis = 1)
    return df

In [9]:
df = clean_beer_reviews()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 275: invalid start byte

#Defining Parameters before Testing

We need to select users to isolate for our test set. The criteria for these users is that they should have reviewed enough beers that there is some likelihood that they would have tried something that has been recommended to them. Starting with binary cumulative gain, we will assess the performance of our recommendation system by determining if the recommended beers have been tried by the user or exceeded a threshold.

- recommend x beers to 10 users
- traverse the recommended beer list, see how many have been rated highly
- compare the accuracy ratings of the three test users
- determine threshold for positive review by finding percentiles of ratings (top 1/4 review would be considered a successful recommendation)

## Choosing users for our test set

In [None]:
freq_reviews = df.groupby('review_profilename').count().sort_values(by=['review_time'], ascending = False)
freq_reviews.rename(columns={'review_time': 'Number of Reviews'}, inplace = True)
freq_reviews['Number of Reviews']

The ten users with the most beer reviews

In [10]:
test_users = freq_reviews.index[:10]

NameError: name 'freq_reviews' is not defined

##Find threshold for what makes a good rating

In [None]:
len(df.loc[df.review_overall >= .875])

219603

In [None]:
THRESHOLD = 0.875

Only 219,603 / 744,251 reviews (29.5%) were given a score of .875 or greater, so this will be our threshold for a successful recommendation

## Remove all ratings by the test users from the training set except for the beer they rated the highest

In [5]:
def create_train_test_split(frac_rem=1):
    #frac_rem: fraction of each user in test set to remain in train set
    # train_set is copy of df --> df has review_profilename and Unique Beer Name as indicies
    # we are preserving user information by reseting indicies, so when we .loc[] we still have access to user and beer info
    train_set = df.copy().reset_index()
    test_parameters = []
    test_set = pd.DataFrame(columns = train_set.columns)


    for user in test_users: 
        # all reviews for a the test user
        user_reviews = train_set.loc[train_set.review_profilename == user]

        # sorted reviews
        user_reviews = user_reviews.sort_values(by = 'review_overall', ascending = False)

        # store highest reviewed beer
        highest_reviewed_beer = user_reviews.iloc[0]
        test_parameters.append((user, highest_reviewed_beer["Beer Name"]))

        # calculating the last index to remove from the train set
        last_idx = int((len(user_reviews) - 1) * frac_rem)

        # concatenate the removed user-beer pairs to test set
        test_set = pd.concat([test_set, user_reviews.iloc[1:last_idx]])

        # remove all beers from training set, add back in highest_reviewed_beer
        train_set.drop(user_reviews.iloc[1:last_idx].index, axis = 0, inplace = True)

    return train_set, test_set, test_parameters

In [12]:
train_set, test_set, test_parameters = create_train_test_split(.5)

NameError: name 'df' is not defined

In [6]:
def calc_score(metric, rec_dict, n_recs, test):
    # test is a dataframe consisting of the user reviews to be used to determine relevancy
    score = 0
    index = 1
    num_rel = 0
    for user, recs in rec_dict.items():
        
        for rec in recs:

            # we need to include Raise And Catch Exception when beer is not in test set
            row = test.loc[(test.review_profilename == user) & (test["Unique Beer Name"] == rec)]

            # if rec in test_set and above threshold
            if len(row) > 0:
                if row["review_overall"].iloc[0] >= THRESHOLD:
                    num_rel += 1

                    if metric == "CG":
                        score += 1

                    elif metric in ["DCG", 'NDCG']:
                        score += (1 / math.log2(index + 1))

                    elif metric == "MAP":
                        score += num_rel / index

                    else:
                        raise Exception("Metric type provided is not valid.")

            if metric == 'NDCG':
                ideal = 0
                for num in range(num_rel):
                    # num starts at 0, so we add two to mimic the starting index of 1
                    ideal += (1 / math.log2(num + 2))
                score = score / ideal

            if metric == "MAP":
                score = score / n_recs

            index += 1
            print(f"{user} : {score}")

# Node2Vec

In [None]:
drop_cols = ['Name', 'Brewery', 'Description', 'Min IBU', 'Max IBU', 'Alcohol', 'review_aroma', 
             'review_appearance', 'review_palate', 'review_taste', 'review_overall', 
             'number_of_reviews', 'ABV']
beer_profile = pd.read_csv("beer_profile_ratings.csv", encoding="utf-8")
bp = beer_profile.drop(columns = drop_cols)
bp.head()

Unnamed: 0,Style,Beer Name (Full),Astringency,Body,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty
0,Altbier,Alaskan Brewing Co. Alaskan Amber,13,32,47,74,33,0,33,57,8,111
1,Altbier,Long Trail Brewing Co. Double Bag,12,57,33,55,16,0,24,35,12,84
2,Altbier,Long Trail Brewing Co. Long Trail Ale,14,37,42,43,11,0,10,54,4,62
3,Altbier,Uerige Obergärige Hausbrauerei GmbH / Zum Ueri...,13,55,47,101,18,1,49,40,16,119
4,Altbier,Ninkasi Brewing Company Sleigh'r Dark Doüble A...,25,51,44,45,9,1,11,51,20,95


Come back and add titles

In [None]:
histo_bp = bp.drop(columns = ['Style', 'Beer Name (Full)'])
fig = make_subplots(rows=round(len(histo_bp.columns) / 2), cols=2)
# count used for positioning plots
pos = 1
for col in histo_bp.columns:
  fig.append_trace(go.Histogram(x=list(histo_bp[col]), name = col), row=(math.ceil(pos / 2)), col=((pos % 2) + 1))
  pos+=1

fig.update_layout(height=1000, width=1000, title_text="Beer Attribute Frequency Distributions")
fig.show()

1
2
3
4
5
6
7
8
9
10


From these plots, we have decided that there is too little variability in the 'Salty' attribute for the buckets to have meaning. The reason for this is that a beer could be labeled as having significantly more spices than another while being less than 10 points different in its spices rating.

This function that will bin by each column and identify links between beers. 

We have decided to exclude salty as a bin feature because the distribution is skewed such that the vast majority of beers are not salty at all or are minimally salty, so it is not a valuable metric in distinguishing beers

In [None]:
bp = bp.drop(columns = 'Salty')

In [7]:
#iterate through each column; in each column we want to qcut on that column by x bins
#Make a new column that describes which buckets the beer falls into for each descriptor
#We should have a DF where each beer has a column describing the profile buckets it is in 
#These will be used make links between beers
def bucket_me(x, df):
  for col in df.columns:
    if col != 'Style':
      #Not passing labels in this step to ensure # of labels < bin edges
      categories, bins = pd.qcut(df[col], q=x, duplicates = 'drop', retbins = True)
      labels = []
      for i in range(len(bins) - 1):
        labels.append(col + str(i))
      df[col] = pd.qcut(df[col], q=x,labels=labels,duplicates='drop')
  return df

In [None]:
new_bp = bucket_me(4, bp.set_index('Beer Name (Full)'))

In [None]:
new_bp["buckets"] = new_bp.apply(lambda x: ','.join(x.astype(str)), axis=1)

In [8]:
def str_to_list(input):
  return input.split(',')

In [None]:
new_bp["buckets"] = new_bp["buckets"].apply(str_to_list)

In [None]:
new_bp

Unnamed: 0_level_0,Style,Astringency,Body,Bitter,Sweet,Sour,Fruits,Hoppy,Spices,Malty,buckets
Beer Name (Full),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alaskan Brewing Co. Alaskan Amber,Altbier,Astringency1,Body1,Bitter2,Sweet2,Sour2,Fruits2,Hoppy3,Spices1,Malty3,"[Altbier, Astringency1, Body1, Bitter2, Sweet2..."
Long Trail Brewing Co. Double Bag,Altbier,Astringency1,Body2,Bitter2,Sweet2,Sour1,Fruits1,Hoppy2,Spices2,Malty2,"[Altbier, Astringency1, Body2, Bitter2, Sweet2..."
Long Trail Brewing Co. Long Trail Ale,Altbier,Astringency1,Body1,Bitter2,Sweet1,Sour0,Fruits0,Hoppy2,Spices0,Malty1,"[Altbier, Astringency1, Body1, Bitter2, Sweet1..."
Uerige Obergärige Hausbrauerei GmbH / Zum Uerige Uerige Doppelsticke,Altbier,Astringency1,Body2,Bitter2,Sweet3,Sour1,Fruits2,Hoppy2,Spices2,Malty3,"[Altbier, Astringency1, Body2, Bitter2, Sweet3..."
Ninkasi Brewing Company Sleigh'r Dark Doüble Alt Ale,Altbier,Astringency3,Body2,Bitter2,Sweet1,Sour0,Fruits0,Hoppy2,Spices2,Malty2,"[Altbier, Astringency3, Body2, Bitter2, Sweet1..."
...,...,...,...,...,...,...,...,...,...,...,...
Cisco Brewers Inc. Winter Shredder,Winter Warmer,Astringency2,Body1,Bitter2,Sweet1,Sour1,Fruits1,Hoppy2,Spices3,Malty2,"[Winter Warmer, Astringency2, Body1, Bitter2, ..."
RJ Rockers Brewing Company First Snow Ale,Winter Warmer,Astringency2,Body1,Bitter0,Sweet1,Sour3,Fruits2,Hoppy0,Spices3,Malty1,"[Winter Warmer, Astringency2, Body1, Bitter0, ..."
Natty Greene's Pub & Brewing Co. Red Nose Winter Ale,Winter Warmer,Astringency0,Body2,Bitter1,Sweet1,Sour1,Fruits1,Hoppy1,Spices3,Malty2,"[Winter Warmer, Astringency0, Body2, Bitter1, ..."
Fish Brewing Company / Fishbowl Brewpub Fish Tale Winterfish,Winter Warmer,Astringency1,Body1,Bitter3,Sweet2,Sour3,Fruits3,Hoppy3,Spices2,Malty1,"[Winter Warmer, Astringency1, Body1, Bitter3, ..."


###Node Graph

This function will generate a article to article network given an input DataFrame. It will do so by creating an edge_dictionary where each key is going to be a node referenced by unique values in node_col and the values will be a list of other nodes connected to the key through the edge_col.

In [8]:
def generate_network(df, edge_col = "buckets", shared_att = 3):
    edge_dct = {}

    # iterating for each unique beer in the df
    for beer in list(df.index):
        # get  "all topic" of the beer
        beer_topics = df.loc[beer][edge_col]

        # creating a list of all the beers that are not the current one and share X attributes
        edge_df = df[(df.index != beer) & (df[edge_col].apply(beer_comparer, args = (beer_topics, shared_att, )))]
        edge_dct[beer] = edge_df.index
    
    # create nx network
    g = nx.Graph(edge_dct, create_using = nx.MultiGraph)
    return g

In [10]:
def beer_comparer(comp_list, beer_list, shared_att):
  # compares two lists and returns True if number of shared values is greater than or equal to shared_att
  return len([i for i, j in zip(comp_list, beer_list) if i == j]) >= shared_att

In [60]:
g = generate_network(new_bp, shared_att = 7)
g

NameError: name 'new_bp' is not defined

In [None]:
# Printing out the number of nodes and edges in our graph
print(nx.info(g))

Graph with 3197 nodes and 71482 edges


In [None]:
#nx.draw(g)

In [1]:
!pip install node2vec



In [None]:
embedding = n2v(g, dimensions = 16)
# *****Look into walk length, q, other relevant parameters

Computing transition probabilities:   0%|          | 0/3197 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [03:02<00:00, 18.20s/it]


In [None]:
model = embedding.fit(window = 1, min_count = 1, batch_words = 4)

In [59]:
emb_df = (pd.DataFrame([model.wv.get_vector(str(n)) for n in g.nodes()],
                       index = g.nodes))
emb_df

NameError: name 'g' is not defined

In [68]:
def predict_links(g, df, beer_name, num_rec):
    #dataframe with just row of given beer name
    this_beer = df[df.index == beer_name]

    #getting beers which are not already linked to the given beer
    
    all_nodes = g.nodes()
    #list of all beer names that are not adjacent to the beer
    all_other_nodes = [n for n in all_nodes if n not in list(g.adj[beer_name]) + [beer_name]]
    #DataFrame that contains non-adjacent nodes
    other_nodes = df[df.index.isin(all_other_nodes)]
    #find the cosine similarity between the given beer and all beers that are not already neighbors
    similar = dict()
    for beer in other_nodes.iterrows():
        similar[beer[0]] = (1 - spatial.distance.cosine(beer[1], np.array(this_beer)))
    #sort the dictionary by highest cosine similarity
    similar = pd.DataFrame(similar.items(), columns = ['beer', 'cos sim'])
    sorted_sim = similar.sort_values(by = 'cos sim', ascending = False)
    return sorted_sim['beer'].iloc[0:num_rec]

In [56]:
#getting beers which are not already linked to the given beer
all_nodes = g.nodes()
#list of all beer names that are not adjacent to the beer
all_other_nodes = [n for n in all_nodes if n not in list(g.adj['Alaskan Brewing Co. Alaskan Amber']) + ['Alaskan Brewing Co. Alaskan Amber']]
#DataFrame that contains non-adjacent nodes
other_nodes = emb_df[emb_df.index.isin(all_other_nodes)]

In [None]:
# predicting where there should be additonal edges for Alaskan Brewing Co. Alaskan Amber
predict_links(g, emb_df, 'Alaskan Brewing Co. Alaskan Amber', 5)

99                 Fuller Smith & Turner PLC Fuller's ESB
727     Williams Brothers Brewing Company Alba Scots P...
1319    Kulmbacher Brauerei AG Mönchshof Kellerbier (K...
922                     Dogfish Head Brewery Burton Baton
215      Brouwerij Duvel Moortgat NV Maredsous 6 - Blonde
Name: beer, dtype: object

In [14]:
#defining buckets and shared attributes values for Node2Vec algorithm
BUCKETS, SHARED_ATTRIBUTES = 4, 7

In [11]:
# storing graph 'g' and 'emb_df' in pkl and csv files
# Unnecessary columns in beer profile dataset
drop_cols = ['Name', 'Brewery', 'Description', 'Min IBU', 'Max IBU', 'Alcohol', 'review_aroma', 
         'review_appearance', 'review_palate', 'review_taste', 'review_overall', 
         'number_of_reviews', 'ABV', 'Salty']
# Loading in the beer profile dataset
beer_profile = pd.read_csv("beer_profile_and_ratings.csv", encoding="utf-8")
bp = beer_profile.drop(columns = drop_cols)
# Bucketing the traits of each beer to measure similarity
new_bp = bucket_me(BUCKETS, bp.set_index('Beer Name (Full)'))
# Make buckets column of strings
new_bp["buckets"] = new_bp.apply(lambda x: ','.join(x.astype(str)), axis=1)
# Turn buckets into list
new_bp["buckets"] = new_bp["buckets"].apply(str_to_list)
# generating network based on threshold for shared attributes
g = generate_network(new_bp, shared_att = SHARED_ATTRIBUTES)

filehandler = open('beer_network.pkl', 'wb')
pickle.dump(g, filehandler)

NameError: name 'BUCKETS' is not defined

In [25]:
# creating matrix used for predicting new links
embedding = n2v(g, dimensions = 16)
model = embedding.fit(window = 1, min_count = 1, batch_words = 4)
emb_df = (pd.DataFrame([model.wv.get_vector(str(n)) for n in g.nodes()],
                   index = g.nodes))

# embedding dataframe
emb_df.to_csv('embedding_df.csv')

Computing transition probabilities:   0%|          | 0/3197 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 10/10 [00:33<00:00,  3.39s/it]


In [75]:
def node2vec(beer, n_recs):
    # Loading in pkl file storing graph
    filehandler = open('beer_network.pkl', 'rb') 
    g = pickle.load(filehandler)
    # Reading in the embedding dataframe
    emb_df = pd.read_csv('embedding_df.csv')
    # Set embedding index to beer names
    emb_df.set_index(['Unnamed: 0'], inplace=True)
    
    return predict_links(g, emb_df, beer, n_recs)

In [77]:
node2vec('Thomas Creek Brewery River Falls Red Ale', 10)

1725               Nelson Brewing Company After Dark Mild
2843             Wychwood Brewery Company Ltd King Goblin
3041    Erdinger Weissbräu Erdinger Weissbier Kristall...
2064       Williams Brothers Brewing Company Midnight Sun
316     Gordon Biersch Brewery Restaurant Gordon Biers...
195                       Ska Brewing Co. True Blonde Ale
1878    Genesee Brewing Co. / Dundee Brewing Co. Dunde...
1760          T & R Theakston Ltd. Theakston Old Peculier
1735             Coopers Brewery Limited Coopers Mild Ale
0                       Alaskan Brewing Co. Alaskan Amber
Name: beer, dtype: object

calcscore, df, test_parameters

## Testing Node2Vec
- Iterate through test parameters (most frequent reviewers and their top reviewed beers), recommend them each ten beers, and see how they rated those beers

In [20]:
# Using the beer_profile dataframe, we want to compare 
def compile_rec_list_n2v(n_recs):
  rec_dict = {}
  # iterate through test users, get X recommendations for their only beer review, compare recommendations to actual ratings stored in test_set using DCG
  for user, beer in test_parameters[0:5]:
    recommendations = predict_links(g, emb_df, beer, n_recs)
    rec_dict[user] = list(recommendations)

  return rec_dict

In [None]:
recs = compile_rec_list_n2v(10)

In [None]:
calc_score('CG', recs, 10, df.copy().reset_index())

BuckeyeNation : 1
mikesgroove : 2
northyorksammy : 5
brentk56 : 5
BEERchitect : 6
