In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import dump
import csv
from surprise import accuracy
from pprint import pprint

In [2]:
csv_path = os.path.join("../data/csv/reviews_cleaned_reduced_500.csv")
TextFileReader = pd.read_csv(csv_path, chunksize=1000)  # the number of rows per chunk

dfList = []
for df in TextFileReader:
    dfList.append(df)

df = pd.concat(dfList,sort=False)


In [29]:
# load beers
csv_path = os.path.join("../data/csv/beers.csv")
beers_df = pd.read_csv(csv_path)

In [30]:
beers_df = beers_df.rename(columns={'id': 'beer_id'})

In [31]:
# Lets combine the dataframe
merge_df = pd.merge(df,
                 beers_df[['beer_id', 'name', 'style', 'brewery_id']],
                 on='beer_id')

In [None]:
merge_df.to_csv("../data/csv/beer_reviews_clean_500.csv",index=False)

In [32]:
# sample random trainset and testset method using Pearson similarity
# test set is made of 25% of the ratings. we are looking at similarities between items (user_based=false)
reader=Reader(rating_scale=(0,5))
data = Dataset.load_from_df(merge_df[['username', 'beer_id', 'score']], reader)

trainset, testset = train_test_split(data, test_size=.25)

sim_options = {'name': 'pearson',
               'user_based': False
               }

# We'll use KNN.
algo = KNNBasic(min_k = 10, sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.fit(trainset).test(testset)

# Then compute RMSE
accuracy.rmse(predictions)
accuracy.mae(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.4935
MAE:  0.3584


0.3584073010077838

In [33]:
# Lets serialize and save this prediction algorithm
# Dump algorithm and reload it
file_name_algo = os.path.join('../data/dump/algo_knn_pearson_500dump_file')
dump.dump(file_name_algo, algo=algo)
file_name_pred = os.path.join('../data/dump/pred_knn_pearson_500dump_file')
dump.dump(file_name_pred, predictions=predictions)

In [34]:
# Code below identifes the top 10 best and worst predictions based upon code from this 
# notebook:https://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/KNNBasic_analysis.ipynb
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
def get_inner_ids(riid):
    inner_ids = []
    for riid in riids:
        inner_ids.append(trainset.to_inner_iid(riid))
    return inner_ids
        
    
df_predict = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_predict['Iu'] = df_predict.uid.apply(get_Iu)
df_predict['Ui'] = df_predict.iid.apply(get_Ui)
df_predict['err'] = abs(df_predict.est - df_predict.rui)
best_predictions = df_predict.sort_values(by='err')[:10]
worst_predictions = df_predict.sort_values(by='err')[-10:]

In [35]:
# Best Predictions:
print(best_predictions)

                 uid     iid  rui  est  \
56563       thuglife   27992  5.0  5.0   
253010      thuglife  172420  5.0  5.0   
248913      thuglife   76421  5.0  5.0   
140043  DrunkyBuddha   76008  5.0  5.0   
27973   DrunkyBuddha   38180  5.0  5.0   
312511  DrunkyBuddha  104699  5.0  5.0   
82806       thuglife  211516  5.0  5.0   
7503        bmcduff2    1503  1.0  1.0   
174413  DrunkyBuddha   33127  5.0  5.0   
164230      thuglife   51116  5.0  5.0   

                                          details  Iu    Ui  err  
56563   {'actual_k': 21, 'was_impossible': False}  25   876  0.0  
253010  {'actual_k': 25, 'was_impossible': False}  25   431  0.0  
248913  {'actual_k': 25, 'was_impossible': False}  25   384  0.0  
140043  {'actual_k': 40, 'was_impossible': False}  91   384  0.0  
27973   {'actual_k': 40, 'was_impossible': False}  91  1900  0.0  
312511  {'actual_k': 40, 'was_impossible': False}  91   557  0.0  
82806   {'actual_k': 25, 'was_impossible': False}  25   432  0.0  
7

In [36]:
# Worst Predictions:
print(worst_predictions)

                      uid     iid  rui       est  \
169961       4thelvofbeer  100421  1.0  4.348330   
187915     Craftonly23121   20014  1.0  4.370297   
171658        cclark67789     645  1.0  4.379122   
42274        vinylrooster   86626  1.0  4.385865   
218210           flipdog0    1545  1.0  4.421936   
31494          diamondlie  100421  1.0  4.422277   
212800           Warfarin   33183  1.0  4.427231   
95319      nickapalooza86    1558  1.0  4.439960   
216304             DevinK   16814  1.0  4.456608   
299573  MarshallBirdhouse   86621  1.0  4.464881   

                                          details   Iu    Ui       err  
169961  {'actual_k': 32, 'was_impossible': False}   34  1623  3.348330  
187915  {'actual_k': 25, 'was_impossible': False}   28   543  3.370297  
171658  {'actual_k': 40, 'was_impossible': False}   73  2323  3.379122  
42274   {'actual_k': 18, 'was_impossible': False}   22   454  3.385865  
218210  {'actual_k': 40, 'was_impossible': False}   76  1626  

In [37]:
def get_beer_name (beer_raw_id):
    beer_name = beers_df.loc[beers_df.beer_id==beer_raw_id,'name'].values[0]
    return beer_name

def get_beer_style (beer_raw_id):
    beer_style = beers_df.loc[beers_df.beer_id==beer_raw_id,'style'].values[0]
    return beer_style

def get_beer_score_mean (beer_raw_id):
    score_mean = mean_score.loc[mean_score.beer_id==beer_raw_id,'score'].values[0]
    return score_mean

def get_beer_neighbors (beer_raw_id):
    beer_inner_id = algo.trainset.to_inner_iid(beer_raw_id)
    beer_neighbors = algo.get_neighbors(beer_inner_id, k=5)
    beer_neighbors = (algo.trainset.to_raw_iid(inner_id)
                  for inner_id in beer_neighbors)
    return(beer_neighbors)

def get_beer_recc_df (beer_raw_id):
    beer_inner_id = algo.trainset.to_inner_iid(beer_raw_id)
    beer_neighbors = algo.get_neighbors(beer_inner_id, k=10)
    beer_neighbors = (algo.trainset.to_raw_iid(inner_id)
                      for inner_id in beer_neighbors)
    beers_id_recc = []
    beer_name_recc =[]
    beer_style_recc = []
    beer_score_mean = []
    for beer in beer_neighbors:
        beers_id_recc.append(beer)
        beer_name_recc.append(get_beer_name(beer))
        beer_style_recc.append(get_beer_style(beer))
        beer_score_mean.append(get_beer_score_mean(beer))
    beer_reccomendations_df = pd.DataFrame(list(zip(beers_id_recc,beer_name_recc,beer_style_recc,beer_score_mean)),
                                       columns=['beer_id', 'name', 'style', 'score_mean'])
    return beer_reccomendations_df
    
def get_inner_ids(riids):
    inner_ids = []
    for riid in riids:
        inner_ids.append(trainset.to_inner_iid(riid))
    return inner_ids
        

In [39]:
#Create mean score dataframe and get riids (riids = raw beers ids)
mean_score = merge_df.groupby('beer_id', as_index=False)[['score']].mean()
riids = mean_score['beer_id']
riids = riids.to_list()
#Get the beer inner ids
inner_ids = get_inner_ids(riids)

In [40]:
# Save the df_predict and df_ids for later use
df_predict.to_csv("../data/csv/df_predict_pearson_500.csv",index=False)
df_ids = pd.DataFrame(list(zip(riids,inner_ids)), columns=['beer_id', 'inner_ids'])
df_ids.to_csv("../data/csv/df_ids_pearson_500.csv",index=False)

In [41]:
# Lets test some beers.  Enter a beer and use the predictions model to return 5 nearest neighbors
# K=10

beer_raw_id =  232
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Corona Extra, American Adjunct Lager, score = 2.2471089363098145 are:


Unnamed: 0,beer_id,name,style,score_mean
0,233,Corona Light,American Light Lager,1.884826
1,75086,11.11.11 Vertical Epic Ale,Chile Beer,3.780256
2,2755,Cerveza Pacifico Clara,American Adjunct Lager,2.647262
3,2803,Sol,American Adjunct Lager,2.391426
4,122703,Stone / Beachwood / Heretic - Unapologetic IPA,American Imperial IPA,3.935283
5,1341,Keystone Light,American Light Lager,1.716901
6,246,Heineken Lager Beer,European Pale Lager,2.641871
7,1321,Modelo Especial,American Adjunct Lager,2.763934
8,3734,Michelob Ultra,American Light Lager,1.795151
9,26049,Wild Blue,Fruit and Field Beer,2.156789


In [42]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  412
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Old Rasputin, Russian Imperial Stout, score = 4.270477771759033 are:


Unnamed: 0,beer_id,name,style,score_mean
0,140,Corsendonk Christmas Ale,Belgian Strong Dark Ale,4.134585
1,127730,Tater Ridge: Scottish Ale (Beer Camp Across Am...,Scottish Ale,3.602797
2,48224,Flying Mouflan,American Barleywine,4.122644
3,37113,Old Stock Cellar Reserve (Aged In Bourbon Barr...,English Old Ale,4.384061
4,104466,Global Warmer,American Amber / Red Ale,3.886712
5,1160,Imperial Russian Stout,Russian Imperial Stout,4.314868
6,62722,White Rajah,American IPA,4.265573
7,410,Ruedrich's Red Seal Ale,American Amber / Red Ale,3.875684
8,132,Ayinger Bräu Weisse,German Hefeweizen,4.139741
9,1493,Sweetwater IPA,American IPA,3.922648


In [44]:
   # Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors

beer_raw_id =  26049
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Wild Blue, Fruit and Field Beer, score = 2.1567888259887695 are:


Unnamed: 0,beer_id,name,style,score_mean
0,79488,Frangelic Mountain Brown,American Brown Ale,4.003379
1,88880,Narwhal Imperial Stout - Barrel Aged,Russian Imperial Stout,4.321918
2,172420,No Rules,American Imperial Porter,4.471152
3,76899,Bud Light Platinum,American Light Lager,2.112869
4,46987,Beer Geek Brunch Weasel,American Imperial Stout,4.326587
5,52752,Fyodor,Russian Imperial Stout,4.454628
6,63224,Blue Moon Winter Abbey Ale,Belgian Dubbel,3.003898
7,95230,The Illinois,American Imperial IPA,3.884577
8,81072,Summit Sága IPA,American IPA,3.916811
9,232,Corona Extra,American Adjunct Lager,2.247109


In [45]:
# Lets test some beers.  Enter a beer and use the predictions model to return 5 nearest neighbors

beer_raw_id =  6108
print(f'The 5 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 5 nearest neighbors of 60 Minute IPA, American IPA, score = 4.019886493682861 are:


Unnamed: 0,beer_id,name,style,score_mean
0,2093,90 Minute IPA,American Imperial IPA,4.220053
1,12068,Exponential Hoppiness,American Imperial IPA,4.395072
2,402,Skull Splitter,Scotch Ale / Wee Heavy,4.065929
3,46385,Yakima Glory,American Black Ale,3.91918
4,18721,CascaZilla,American Amber / Red Ale,3.801036
5,5948,Terrapin Rye Pale Ale,American Pale Ale (APA),3.772735
6,74123,Snow Day Winter Ale,American Black Ale,3.668375
7,28318,Palate Shifter Imperial IPA,American Imperial IPA,4.13325
8,186,Smuttynose Shoals Pale Ale,English Pale Ale,3.800835
9,88298,Sap,New England IPA,4.307752


In [46]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  332
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Miller Lite, American Light Lager, score = 2.06624174118042 are:


Unnamed: 0,beer_id,name,style,score_mean
0,2280,Miller Genuine Draft,American Adjunct Lager,2.183389
1,1341,Keystone Light,American Light Lager,1.716901
2,580,Miller High Life,American Adjunct Lager,2.571435
3,837,Coors Light,American Light Lager,1.92867
4,1276,Coors Banquet,American Adjunct Lager,2.518194
5,41821,Bud Light Lime,American Light Lager,2.078862
6,2435,Beck's,German Pilsner,2.713063
7,1503,Sam Adams Light,American Light Lager,3.210688
8,653,Natural Ice,American Adjunct Lager,1.712514
9,63224,Blue Moon Winter Abbey Ale,Belgian Dubbel,3.003898


In [47]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  2512
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Chimay Grande Réserve (Blue), Belgian Strong Dark Ale, score = 4.253742694854736 are:


Unnamed: 0,beer_id,name,style,score_mean
0,672,Chimay Première (Red),Belgian Dubbel,4.123422
1,575,Organic Pale Ale,English Pale Ale,3.858218
2,2269,N'Ice Chouffe,Belgian Strong Dark Ale,4.063882
3,156,Piraat,Belgian Strong Pale Ale,4.027381
4,87109,Terrapin Liquid Bliss,American Porter,3.788704
5,28165,Bender,American Brown Ale,4.043412
6,127730,Tater Ridge: Scottish Ale (Beer Camp Across Am...,Scottish Ale,3.602797
7,1346,Chimay Tripel (White),Belgian Tripel,4.109288
8,674,Westmalle Trappist Dubbel,Belgian Dubbel,4.165857
9,189272,Ten FIDY - Bourbon Barrel Aged,Russian Imperial Stout,4.465606


In [48]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  1352
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Shiner Bock, German Bock, score = 3.087606430053711 are:


Unnamed: 0,beer_id,name,style,score_mean
0,5,Amber,Vienna Lager,3.403365
1,104466,Global Warmer,American Amber / Red Ale,3.886712
2,1798,Boddingtons Pub Ale,English Pale Ale,3.101113
3,126806,Mars (The Bringer Of War),American Imperial IPA,4.082902
4,32432,Total Domination IPA,American IPA,3.640893
5,79488,Frangelic Mountain Brown,American Brown Ale,4.003379
6,5096,Conway's Irish Ale,Irish Red Ale,3.747557
7,1312,Molson Canadian Lager,American Adjunct Lager,2.755008
8,123062,Swish,New England IPA,4.555908
9,570,Brooklyn Oktoberfest,German Märzen / Oktoberfest,3.641016


In [49]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  607
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Fat Tire Belgian Style Ale, American Amber / Red Ale, score = 3.5244343280792236 are:


Unnamed: 0,beer_id,name,style,score_mean
0,37966,Tricerahops Double IPA,American Imperial IPA,3.98686
1,58048,Stone / Dogfish Head / Victory - Saison Du BUFF,Belgian Saison,3.795838
2,33245,Lager of The Lakes,Bohemian Pilsener,3.639069
3,76086,Gingerbread Stout,English Sweet / Milk Stout,4.242263
4,74123,Snow Day Winter Ale,American Black Ale,3.668375
5,152542,Beer Camp: Hoppy Lager (2015),American Lager,3.900902
6,145155,Proprietor's Bourbon County Brand Stout (2014),American Imperial Stout,4.687116
7,32631,Brooklyn Summer Ale,English Pale Ale,3.538762
8,1426,Grolsch Premium Lager,European Pale Lager,3.052222
9,48933,UFO White,Belgian Witbier,3.526161


In [50]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  246
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Heineken Lager Beer, European Pale Lager, score = 2.6418709754943848 are:


Unnamed: 0,beer_id,name,style,score_mean
0,436,Amstel Light,American Light Lager,2.39925
1,1312,Molson Canadian Lager,American Adjunct Lager,2.755008
2,37112,Le Merle,Belgian Saison,3.878926
3,232,Corona Extra,American Adjunct Lager,2.247109
4,1053,Moosehead Lager,American Lager,2.918225
5,96436,Fortunate Islands,American Pale Wheat Ale,4.077514
6,2619,Pale Ale,American Pale Ale (APA),3.800148
7,2958,Leinenkugel's Honey Weiss,German Kristalweizen,2.975604
8,2280,Miller Genuine Draft,American Adjunct Lager,2.183389
9,50764,Lucky U IPA,American IPA,3.434723


In [51]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id = 2755 
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Cerveza Pacifico Clara, American Adjunct Lager, score = 2.6472620964050293 are:


Unnamed: 0,beer_id,name,style,score_mean
0,2803,Sol,American Adjunct Lager,2.391426
1,87109,Terrapin Liquid Bliss,American Porter,3.788704
2,34804,Landshark Lager,American Adjunct Lager,2.507511
3,20732,Budweiser Select,American Light Lager,2.007452
4,232,Corona Extra,American Adjunct Lager,2.247109
5,3734,Michelob Ultra,American Light Lager,1.795151
6,1321,Modelo Especial,American Adjunct Lager,2.763934
7,74778,Rodenbach Caractère Rouge,Flanders Red Ale,4.451386
8,27143,Barrel-Aged Blackout Stout,Russian Imperial Stout,4.331574
9,23459,Thumbprint Enigma,Flanders Oud Bruin,4.14066


In [52]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id = 131
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Celebrator, German Doppelbock, score = 4.299962997436523 are:


Unnamed: 0,beer_id,name,style,score_mean
0,27339,Berserker Imperial Stout,American Imperial Stout,4.311373
1,66195,Café Racer 15,American Imperial IPA,4.215812
2,78660,Assassin,American Imperial Stout,4.659036
3,127563,Double Latte: Coffee Milk Stout (Beer Camp Acr...,English Sweet / Milk Stout,4.147673
4,875,Old Engine Oil Black Ale,English Porter,4.097507
5,6368,Masala Mama India Pale Ale,American IPA,4.290054
6,773,Goudenband,Flanders Oud Bruin,4.163876
7,6715,Budweiser Budvar B:Original,Bohemian Pilsener,3.574975
8,77648,Row 2/Hill 56,American Pale Ale (APA),4.255475
9,224,Schneider Weisse Tap 6 Unser Aventinus,German Weizenbock,4.293004
