<a href="https://colab.research.google.com/github/sophielouie/beer-recommendation-system/blob/main/HybridModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive 
drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [None]:
def print_bruh():
  print('bruh')

In [2]:
! pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 7.6 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633737 sha256=35641e1a441cddc8d490b295f60ba0ef68c5e4bd1f0b7b7f130c0c043998ffe6
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [3]:
! pip install node2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting node2vec
  Downloading node2vec-0.4.3.tar.gz (4.6 kB)
Building wheels for collected packages: node2vec
  Building wheel for node2vec (setup.py) ... [?25l[?25hdone
  Created wheel for node2vec: filename=node2vec-0.4.3-py3-none-any.whl size=5980 sha256=249cbd829098687aac94e42356722d2d1237b913635e0a32e6c9b035e40fb436
  Stored in directory: /root/.cache/pip/wheels/07/62/78/5202cb8c03cbf1593b48a8a442fca8ceec2a8c80e22318bae9
Successfully built node2vec
Installing collected packages: node2vec
Successfully installed node2vec-0.4.3


In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split
import numpy as np
from scipy import spatial
import statistics as stats
import math
from node2vec import Node2Vec as n2v
from networkx.algorithms.dag import topological_sort
import networkx as nx
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [5]:
pd.set_option('display.max_columns', None)

In [None]:
def print_test():
  print("test")

# Data Collection

In [6]:
def clean_beer_reviews():
  # storing beer review dataset
  beer_reviews = pd.read_csv("gdrive/MyDrive/Recommender System/beer_reviews.csv", encoding="utf-8")
  # creating a unique identifier for each beer using brewery name and beer name
  beer_reviews['Unique Beer Name'] = beer_reviews['brewery_name'] + ' ' + beer_reviews['beer_name']
  # storing beer profile dataset
  beer_profile = pd.read_csv("gdrive/MyDrive/Recommender System/beer_profile_and_ratings.csv", encoding="utf-8")
  # columns to drop from beer reviews
  drop_cols = ['brewery_id', 'brewery_name',  'beer_name', 'beer_abv', 'beer_beerid']
  # dropping columns from beer reviews
  beer_reviews.drop(columns = drop_cols, inplace = True)
  # columns to drop from  beer profile
  drop_cols = ['Name', 'Style', 'Brewery', 'Description',
       'Min IBU', 'Max IBU', 'Alcohol', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste',
       'review_overall', 'number_of_reviews']
  # dropping columns from beer profile
  beer_profile.drop(columns = drop_cols, inplace = True)
  # combining beer review and beer profile datasets to have profile of each beer attached to every review
  df_beer = pd.merge(beer_reviews, beer_profile, left_on = 'Unique Beer Name', right_on = 'Beer Name (Full)', how = 'inner')
  # isolating the numerical columns that need to be scaled
  need_scaling = df_beer.drop(columns = ['review_time', 'review_profilename', 'beer_style', 'Unique Beer Name', 'Beer Name (Full)'])
  # storing the informational portion of the dataset that does not need scaling
  informational = df_beer[['review_time', 'review_profilename', 'beer_style', 'Unique Beer Name', 'Beer Name (Full)']]
  # renaming beer name column
  informational.rename(columns = {'Beer Name (Full)': 'Beer Name'}, inplace = True)

  # scaling the data
  scaler = MinMaxScaler()
  scaler.fit(need_scaling)
  need_scaling = pd.DataFrame(scaler.transform(need_scaling), columns = need_scaling.columns)

  # recombining the informational data and scaled data
  df = pd.concat([informational, need_scaling], axis = 1)
  return df

In [7]:
df = clean_beer_reviews()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


# Hybrid Model

Conceptualize the hybrid model

In [8]:
def hybrid(user, beer, n_recs, df, svd_model):

  values = ['review_aroma',	'review_appearance',	'review_palate',	'review_taste', 'ABV', 'Astringency', 'Body', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty']

  target_beer_vector = df.loc[(user, beer)][values]

  # list of top 50 similar beers
  similar = []
  for beer_name in df["Beer Name"].unique():
    beer_vectors = df.loc[df["Beer Name"] == beer_name][values]
    cos_sim = list()
    for beer_vector in np.array(beer_vectors):
      cos_sim.append(1 - spatial.distance.cosine(beer_vector, target_beer_vector))
    similar.append((beer_name, stats.mean(cos_sim)))

  # sort in decreasing order
  similar = sorted(similar, key = lambda x: x[1], reverse = True)
  sim = similar[1:50]

  # get metadata for each of 50 similar beers
  beer_idx = [i[0] for i in sim]

  # we are trying to make a list of the top 50 similar beers and its metadata
  # there are multiple reviews for a single beer by different users, so we will average review_overall for each similar beer --> groupby and average
  beers = pd.DataFrame()
  for idx in beer_idx:
    average_rating = df[df['Beer Name'] == idx].groupby('Beer Name').mean()[['review_overall']]
    beers = pd.concat([beers, average_rating])
  
  beers = beers.reset_index()

  # create an "est" column and apply SVD.predict() to each book
  # predict using the svd_model
  beers['est'] = beers.apply(lambda x: svd_model.predict(user, x['Beer Name'], x['review_overall']).est, axis = 1)

  # sort predictions in decreasing order
  beers = beers.sort_values(by = 'est', ascending = False)

  # return top n recommendations
  return beers[:n_recs]

In [9]:
df.set_index(['review_profilename', 'Unique Beer Name'], inplace = True)

#Hybrid Testing

We need to select users to isolate for our test set. The criteria for these users is that they should have reviewed enough beers that there is some likelihood that they would have tried something that has been recommended to them. Starting with binary cumulative gain, we will assess the performance of our recommendation system by determining if the recommended beers have been tried by the user or exceeded a threshold.

- recommend x beers to 10 users
- traverse the recommended beer list, see how many have been rated highly
- compare the accuracy ratings of the three test users
- determine threshold for positive review by finding percentiles of ratings (top 1/4 review would be considered a successful recommendation)

## Choosing users for our test set

In [10]:
freq_reviews = df.groupby('review_profilename').count().sort_values(by=['review_time'], ascending = False)
freq_reviews.rename(columns={'review_time': 'Number of Reviews'}, inplace = True)
freq_reviews['Number of Reviews']

review_profilename
BuckeyeNation     1505
mikesgroove       1422
northyorksammy    1348
brentk56          1225
BEERchitect       1201
                  ... 
cheapdark            1
cheath               1
chefguy              1
chefmonty            1
zzajjber             1
Name: Number of Reviews, Length: 26631, dtype: int64

The ten users with the most beer reviews

In [11]:
test_users = freq_reviews.index[:10]

##Find threshold for what makes a good rating

In [12]:
len(df.loc[df.review_overall >= .875])

219603

In [13]:
THRESHOLD = 0.875

Only 219,603 / 744,251 reviews (29.5%) were given a score of .875 or greater, so this will be our threshold for a successful recommendation

## Remove all ratings by the test users from the training set except for the beer they rated the highest

In [66]:
def create_train_test_split(df, test_users, frac_rem=1):
  #frac_rem: fraction of each user in test set to remain in train set
  # train_set is copy of df --> df has review_profilename and Unique Beer Name as indicies
  # we are preserving user information by reseting indicies, so when we .loc[] we still have access to user and beer info
  train_set = df.copy().reset_index()
  test_parameters = []
  test_set = pd.DataFrame(columns = train_set.columns)


  for user in test_users: 
    # all reviews for a the test user
    user_reviews = train_set.loc[train_set.review_profilename == user]

    # sorted reviews
    sorted_user_reviews = user_reviews.sort_values(by = 'review_overall', ascending = False)

    # store highest reviewed beer
    highest_reviewed_beer = sorted_user_reviews.iloc[0]
    test_parameters.append((user, highest_reviewed_beer["Beer Name"]))

    # removing highest reviewed beer from user_reviews so that it remains in the train set
    user_reviews.drop(highest_reviewed_beer.name, axis = 0, inplace = True)

    # calculating the last index to remove from the train set
    last_idx = int((len(user_reviews) - 1) * frac_rem)

    # concatenate the removed user-beer pairs to test set
    test_set = pd.concat([test_set, user_reviews.iloc[0:last_idx]])

    # remove all beers from training set
    train_set.drop(user_reviews.iloc[0:last_idx].index, axis = 0, inplace = True)

  return train_set, test_set, test_parameters

In [67]:
train_set, test_set, test_parameters = create_train_test_split(df, test_users, .5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [70]:
def train_svd(train_set, test_set):
  # creating and training SVD on train_set
  reader = Reader()

  # train_set = train_set.reset_index()
  # test_set = test_set.reset_index()

  test_data = Dataset.load_from_df(test_set[['review_profilename', 'Beer Name', 'review_overall']], reader)
  train_data = Dataset.load_from_df(train_set[['review_profilename', 'Beer Name', 'review_overall']], reader)

  # NEED TO TURN OUR TRAIN_SET INTO surprise TRAINSET TYPE
  surprise_train_set = train_data.build_full_trainset()

  # train
  svd = SVD()
  svd.fit(surprise_train_set)
  return svd
# NEED TO FIGURE OUT IF SVD USES USER INFORMATION FOR CALCULATIONS --> can this model perform well without significant historical review data

In [71]:
svd = train_svd(train_set, test_set)

In [72]:
def hybrid_model(user, beer, n_recs, test_frac = 1):
  # Collecting and cleaning the dataset
  df = clean_beer_reviews()
  # Setting the index for easier querying
  df.set_index(['review_profilename', 'Unique Beer Name'], inplace = True)
  # Identifying the top 10 most frequent reviewers
  freq_reviews = df.groupby('review_profilename').count().sort_values(by=['review_time'], ascending = False)
  freq_reviews.rename(columns={'review_time': 'Number of Reviews'}, inplace = True)
  test_users = freq_reviews.index[:10]
  # Divying up the training and test sets
  train_set, test_set, test_parameters = create_train_test_split(test_frac, df, test_users)
  # Train the SVD model
  svd = train_svd(train_set, test_set)
  # Resetting the index to cooperate with hybrid function
  train_set.set_index(["review_profilename", "Unique Beer Name"], inplace = True)
  return hybrid(user, beer, n_recs, train_set, svd)

Separate Testing Function

In [73]:
# setting train_set index to be compatible with hybrid()
train_set.set_index(["review_profilename", "Unique Beer Name"], inplace = True)

In [76]:
def compile_rec_list(n_recs, test_parameters):
  rec_dict = {}
  # iterate through test users, get X recommendations for their only beer review, compare recommendations to actual ratings stored in test_set using DCG
  for user, beer in test_parameters[0:5]:
    recommendations = hybrid(user, beer, n_recs, train_set, svd)
    rec_dict[user] = list(recommendations["Beer Name"])

  return rec_dict

In [77]:
# pass test_parameters in as parameter !!
recs = compile_rec_list(10, test_parameters)

  """


In [78]:
def calc_score(metric, rec_dict, n_recs, test):
  # test is a dataframe consisting of the user reviews to be used to determine relevancy
  score = 0
  index = 1
  num_rel = 0
  for user, recs in rec_dict.items():

    for rec in recs:

      # we need to include Raise And Catch Exception when beer is not in test set
      row = test.loc[(test.review_profilename == user) & (test["Unique Beer Name"] == rec)]

      # if rec in test_set and above threshold
      if len(row) > 0:
        if row["review_overall"].iloc[0] >= THRESHOLD:
          num_rel += 1

          if metric == "CG":
            score += 1

          elif metric in ["DCG", 'NDCG']:
            score += (1 / math.log2(index + 1))

          elif metric == "MAP":
            score += num_rel / index

          else:
            raise Exception("Metric type provided is not valid.")

    if metric == 'NDCG':
      ideal = 0
      for num in range(num_rel):
        # num starts at 0, so we add two to mimic the starting index of 1
        ideal += (1 / math.log2(num + 2))
      score = score / ideal

    if metric == "MAP":
      score = score / n_recs

    index += 1
    print(f"{user} : {score}")

In [85]:
calc_score("CG", recs, 10, test_set)

BuckeyeNation : 1
mikesgroove : 1
northyorksammy : 1
brentk56 : 1
BEERchitect : 1
