In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [23]:
# show all columns 
pd.set_option('display.max_columns', None)

In [24]:
# open beer_df.csv
beer_df = pd.read_csv('data/beer_df.csv', low_memory=False)
beer_df.head()

Unnamed: 0,address,categories,city,country,key,lat,long,brewery_name,phones,postalCode,province,websites,index,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,review_year,review_month
0,2010 Williams St,Brewery,San Leandro,US,us/ca/sanleandro/2010williamsst,37.711807,-122.177658,21st Amendment Brewery,5105952111,94577,CA,http://21st-amendment.com,1495017,735,2011-03-01 00:49:43,3.5,3.5,4.0,illidurit,American Double / Imperial IPA,3.5,3.5,21 Rock,9.7,66190,2011,3
1,2010 Williams St,Brewery,San Leandro,US,us/ca/sanleandro/2010williamsst,37.711807,-122.177658,21st Amendment Brewery,5105952111,94577,CA,http://21st-amendment.com,1495350,735,2008-12-04 19:03:15,4.0,4.0,4.0,magictrokini,American IPA,3.0,4.0,Harvest Moon,6.4,45648,2008,12
2,2010 Williams St,Brewery,San Leandro,US,us/ca/sanleandro/2010williamsst,37.711807,-122.177658,21st Amendment Brewery,5105952111,94577,CA,http://21st-amendment.com,1495733,735,2010-01-23 20:55:46,4.0,4.0,3.5,HapWifeHapLife,American IPA,4.0,4.0,21st Amendment IPA,7.0,20781,2010,1
3,2010 Williams St,Brewery,San Leandro,US,us/ca/sanleandro/2010williamsst,37.711807,-122.177658,21st Amendment Brewery,5105952111,94577,CA,http://21st-amendment.com,1501253,735,2010-04-08 18:58:54,4.0,3.5,4.5,pwoody11,Belgian Strong Dark Ale,4.0,4.0,Monk's Blood,8.3,52510,2010,4
4,2010 Williams St,Brewery,San Leandro,US,us/ca/sanleandro/2010williamsst,37.711807,-122.177658,21st Amendment Brewery,5105952111,94577,CA,http://21st-amendment.com,1501262,735,2010-03-14 16:30:10,4.0,3.5,4.0,metter98,Belgian Strong Dark Ale,4.0,4.5,Monk's Blood,8.3,52510,2010,3


## Baseline Model

In [25]:
# create baseline model of beer_df with normal_predictor and train test split from surprise
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

trainset, testset = train_test_split(data, test_size=.25)

algo = NormalPredictor()
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.mae(predictions)
accuracy.mse(predictions)

MAE:  0.6883
MSE: 0.7644


0.7644282543439512

## Running model with Surprise Package

After running a baseline model, we will now run an SVD algoritm to improve our rmse and mae. 

In [28]:
# Running a model with SVD
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

trainset, testset = train_test_split(data, test_size=.25)

algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.mse(predictions)
accuracy.mae(predictions)

MSE: 0.3174
MAE:  0.4198


0.41978312609914936

In [27]:
#Running a model with KNNBasic
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

trainset, testset = train_test_split(data, test_size=.25)

sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
MSE: 0.3405
MAE:  0.4317


0.43173069978490525

### Tuned Model running GridSearchCV and RandomizedSearchCV

In [15]:
# run model svd using gridsearchcv
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

param_grid = {'n_factors': [50, 100, 150], 'n_epochs': [20, 40, 60], 'lr_all': [0.002, 0.005, 0.008], 'reg_all': [0.4, 0.6, 0.8]}
gs = GridSearchCV(SVD, param_grid, measures=['mse', 'mae'], cv=3)

gs.fit(data)

print(gs.best_score['mse'])
print(gs.best_params['mse'])

print(gs.best_score['mae'])
print(gs.best_score['mae'])


0.5616435071805842
{'n_factors': 50, 'n_epochs': 60, 'lr_all': 0.002, 'reg_all': 0.4}


In [31]:
# Running a tuned SVD model with tuned parameters
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

trainset, testset = train_test_split(data, test_size=.25)

algo = SVD(n_factors= 50, n_epochs= 60, lr_all= 0.002, reg_all= 0.4)
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.mse(predictions)
accuracy.mae(predictions)

MSE: 0.3151
MAE:  0.4139


0.41391307209987543

In [16]:
# run model svd using randomizedsearchcv
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import RandomizedSearchCV

reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

param_grid = {'n_factors': [50, 100, 150], 'n_epochs': [20, 40, 60], 'lr_all': [0.002, 0.005, 0.008], 'reg_all': [0.4, 0.6, 0.8]}
gs = RandomizedSearchCV(SVD, param_grid, measures=['mse', 'mae'], cv=3)

gs.fit(data)

print(gs.best_score['mse'])
print(gs.best_params['mse'])

print(gs.best_score['mae'])
print(gs.best_score['mae'])


0.5618820518744397
{'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.4}


In [32]:
# Running a tuned SVD model with tuned parameters
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

trainset, testset = train_test_split(data, test_size=.25)

algo = SVD(n_factors= 150, n_epochs= 40, lr_all= 0.005, reg_all= 0.4)
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.mse(predictions)
accuracy.mae(predictions)

MSE: 0.3147
MAE:  0.4159


0.41586875392250766

## Running model with SciKit-Learn

In [35]:
# run model using sci-kit learn and truncatedSVD


### Using the best model to create recommendation system

In [33]:
# use model to create recommendation system for user 
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(beer_df[['review_profilename', 'beer_name', 'review_overall']], reader)

trainset, testset = train_test_split(data, test_size=.25)

algo = SVD(n_factors= 150, n_epochs= 40, lr_all= 0.005, reg_all= 0.4)
algo.fit(trainset)
predictions = algo.test(testset)

In [40]:
# create function to recommend beers to a user based on a specific beer 
def recommend_beers_from_beer(user_id, beer_name, n_recommendations, algo):
    # get the user's ratings
    user_ratings = beer_df[beer_df['review_profilename'] == user_id]
    # get the beers the user has already rated
    user_beers = user_ratings['beer_name'].unique()
    # get the beers that the user has not rated
    beers_to_recommend = beer_df[~beer_df['beer_name'].isin(user_beers)]
    # get the top n recommendations
    recommendations = beers_to_recommend.groupby('beer_name').agg({'review_overall': 'mean'}).sort_values('review_overall', ascending=False).head(n_recommendations)
    return recommendations

# get recommendations for user 'northyorksammy' based on the beer 'Sierra Nevada Pale Ale'
recommend_beers_from_beer('northyorksammy', 'Sierra Nevada Pale Ale', 10, algo)

Unnamed: 0_level_0,review_overall,predicted_rating,user_rating
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Au Ciel,5.0,4.050786,
Best Bitter Ale With Cascade And Chinook Dry Hops,5.0,3.893398,
St. Patrick O'Sullivan's Irish Red,5.0,4.076184,
Stone Old Guardian Barley Wine Style Ale 1999,5.0,3.893398,
Kaldi Kreme,5.0,4.049588,
Sparnfarkel Smoked Porter,5.0,3.893398,
Bourbon Barley Wine,5.0,4.097954,
Cauldron Brew,5.0,4.041862,
Kona Belgian Triple,5.0,3.893398,
Anniversary Ale 2003,5.0,3.995765,


In [70]:
# create function to predict the rating of a beer for a user
def predict_rating(user_id, beer_name, algo):
    beer_id = beer_df[beer_df['beer_name'] == beer_name]['beer_beerid'].unique()[0]
    user_id = beer_df[beer_df['review_profilename'] == user_id]['review_profilename'].unique()[0]
    return algo.predict(user_id, beer_id).est

# print both the predicted rating and the actual rating for user 'northyorksammy' for beer 'Sierra Nevada Pale Ale'
print('Predicted Rating: ')
print(predict_rating('northyorksammy', 'Sierra Nevada Pale Ale', algo))
print(' ')
print('Actual Rating: ')
print(beer_df[(beer_df['review_profilename'] == 'northyorksammy') & (beer_df['beer_name'] == 'Sierra Nevada Pale Ale')]['review_overall'].unique()[0])


Predicted Rating: 
3.893397511749264
 
Actual Rating: 
4.0


In [36]:
# create a function to recommend beers to a user only from a specific brewery
def recommend_beers_from_brewery(user_id, brewery_name, n_recommendations, algo):
    # get the user's ratings
    user_ratings = beer_df[beer_df['review_profilename'] == user_id]
    # get the beers the user has already rated
    user_beers = user_ratings['beer_name'].unique()
    # get the beers that the user has not rated
    beers_to_recommend = beer_df[~beer_df['beer_name'].isin(user_beers)]
    # get the beers from the brewery
    beers_from_brewery = beers_to_recommend[beers_to_recommend['brewery_name'] == brewery_name]
    # get the top n recommendations
    recommendations = beers_from_brewery.groupby('beer_name').agg({'review_overall': 'mean'}).sort_values('review_overall', ascending=False).head(n_recommendations)
    return recommendations

# get recommendations for user 'northyorksammy' based on the brewery 'Sierra Nevada Brewing Co.'
recommend_beers_from_brewery('northyorksammy', 'Sierra Nevada Brewing Co.', 10, algo)

Unnamed: 0_level_0,review_overall
beer_name,Unnamed: 1_level_1
Best Bitter Ale With Cascade And Chinook Dry Hops,5.0
Rhymes Wit - Beer Camp #20,4.666667
22 Bines Blonde IPA - Beer Camp #9,4.5
Red Perle Red Ale - Beer Camp #11,4.5
Liquid Sourdough (LSD) - Beer Camp #41,4.5
Knightro – Celtic Festival Beer,4.5
Knightro ESB - Beer Camp #23,4.5
"Que Syrah, Syrah!",4.5
Sierra Nevada Kölsch Style Ale,4.444444
Brewer's Blackbird Black IPA - Beer Camp #27,4.423077
