In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import dump
import csv
from surprise import accuracy
from pprint import pprint

In [2]:
csv_path = os.path.join("../data/csv/reviews_cleaned_reduced_500.csv")
TextFileReader = pd.read_csv(csv_path, chunksize=1000)  # the number of rows per chunk

dfList = []
for df in TextFileReader:
    dfList.append(df)

df = pd.concat(dfList,sort=False)


In [32]:
# load beers
csv_path = os.path.join("../data/csv/beers.csv")
beers_df = pd.read_csv(csv_path)

In [33]:
beers_df = beers_df.rename(columns={'id': 'beer_id'})

In [34]:
# Lets combine the dataframe
merge_df = pd.merge(df,
                 beers_df[['beer_id', 'name', 'style', 'brewery_id']],
                 on='beer_id')

In [6]:
# sample random trainset and testset method Using Cosine similarity
# test set is made of 25% of the ratings. we are looking at similarities between items (user_based=false)
reader=Reader(rating_scale=(0,5))
data = Dataset.load_from_df(merge_df[['username', 'beer_id', 'score']], reader)

trainset, testset = train_test_split(data, test_size=.25)

sim_options = {'name': 'cosine',
               'user_based': False
               }

# We'll use KNN.
algo = KNNBasic(min_k = 5, sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.fit(trainset).test(testset)

# Then compute RMSE
accuracy.rmse(predictions)
accuracy.mae(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.5617
MAE:  0.3948


0.3947948934387641

In [7]:
# Lets serialize and save this prediction algorithm
# Dump algorithm and reload it
file_name_algo = os.path.join('../data/dump/algo_knn_cosine_500dump_file')
dump.dump(file_name_algo, algo=algo)
file_name_pred = os.path.join('../data/dump/pred_knn_cosine_500dump_file')
dump.dump(file_name_pred, predictions=predictions)

In [19]:
# Code below identifes the top 10 best and worst predictions based upon code from this 
# notebook:https://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/KNNBasic_analysis.ipynb
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

    
df_predict = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_predict['Iu'] = df_predict.uid.apply(get_Iu)
df_predict['Ui'] = df_predict.iid.apply(get_Ui)
df_predict['err'] = abs(df_predict.est - df_predict.rui)
best_predictions = df_predict.sort_values(by='err')[:10]
worst_predictions = df_predict.sort_values(by='err')[-10:]   

In [9]:
# Best Predictions:
print(best_predictions)

                 uid     iid  rui  est  \
221419      bmcduff2      59  1.0  1.0   
5530    DrunkyBuddha     402  5.0  5.0   
78697       thuglife  211516  5.0  5.0   
192036     100200300     680  3.0  3.0   
239355  DrunkyBuddha  145496  5.0  5.0   
96415      100200300   56738  3.0  3.0   
213965      thuglife  115317  5.0  5.0   
121924   IAMTEAMPONY   50697  3.0  3.0   
171023      thuglife   88969  5.0  5.0   
102531      bmcduff2     436  1.0  1.0   

                                          details  Iu    Ui  err  
221419  {'actual_k': 37, 'was_impossible': False}  37  1427  0.0  
5530    {'actual_k': 40, 'was_impossible': False}  84   662  0.0  
78697   {'actual_k': 24, 'was_impossible': False}  24   444  0.0  
192036  {'actual_k': 40, 'was_impossible': False}  58  1927  0.0  
239355  {'actual_k': 40, 'was_impossible': False}  84   529  0.0  
96415   {'actual_k': 40, 'was_impossible': False}  58   764  0.0  
213965  {'actual_k': 24, 'was_impossible': False}  24   549  0.0  
1

In [10]:
# Worst Predictions:
print(worst_predictions)

                      uid    iid  rui       est  \
240579               Sak3   1320  1.0  4.472478   
248485           SkiBum22   1390  1.0  4.490060   
128147  MarshallBirdhouse  86621  1.0  4.499298   
267529           dogfish7    639  1.0  4.501057   
243955          Pete27lax  95386  1.0  4.537452   
284631              J_Dub   1361  1.0  4.561757   
254159      Rochefort10nh    436  1.0  4.600629   
24945                ucis  53728  1.0  4.668918   
246243            Schimms   1320  1.0  4.768464   
59322          MaddDogg84  55019  1.0  4.784031   

                                          details   Iu    Ui       err  
240579  {'actual_k': 13, 'was_impossible': False}   13  1373  3.472478  
248485  {'actual_k': 40, 'was_impossible': False}  186   437  3.490060  
128147  {'actual_k': 40, 'was_impossible': False}  120   802  3.499298  
267529  {'actual_k': 22, 'was_impossible': False}   22  1240  3.501057  
243955  {'actual_k': 40, 'was_impossible': False}   95  1140  3.537452  


In [31]:
def get_beer_name (beer_raw_id):
    beer_name = beers_df.loc[beers_df.beer_id==beer_raw_id,'name'].values[0]
    return beer_name

def get_beer_style (beer_raw_id):
    beer_style = beers_df.loc[beers_df.beer_id==beer_raw_id,'style'].values[0]
    return beer_style

def get_beer_score_mean (beer_raw_id):
    score_mean = mean_score.loc[mean_score.beer_id==beer_raw_id,'score'].values[0]
    return score_mean

def get_beer_neighbors (beer_raw_id):
    beer_inner_id = algo.trainset.to_inner_iid(beer_raw_id)
    beer_neighbors = algo.get_neighbors(beer_inner_id, k=5)
    beer_neighbors = (algo.trainset.to_raw_iid(inner_id)
                  for inner_id in beer_neighbors)
    return(beer_neighbors)

def get_beer_recc_df (beer_raw_id):
    beer_inner_id = algo.trainset.to_inner_iid(beer_raw_id)
    beer_neighbors = algo.get_neighbors(beer_inner_id, k=10)
    beer_neighbors = (algo.trainset.to_raw_iid(inner_id)
                      for inner_id in beer_neighbors)
    beers_id_recc = []
    beer_name_recc =[]
    beer_style_recc = []
    beer_score_mean = []
    for beer in beer_neighbors:
        beers_id_recc.append(beer)
        beer_name_recc.append(get_beer_name(beer))
        beer_style_recc.append(get_beer_style(beer))
        beer_score_mean.append(get_beer_score_mean(beer))
    beer_reccomendations_df = pd.DataFrame(list(zip(beers_id_recc,beer_name_recc,beer_style_recc,beer_score_mean)),
                                       columns=['beer_id', 'name', 'style', 'score_mean'])
    return beer_reccomendations_df
    
def get_inner_ids(riids):
    inner_ids = []
    for riid in riids:
        inner_ids.append(trainset.to_inner_iid(riid))
    return inner_ids
        

In [26]:
#Create mean score dataframe and get riids (riids = raw beers ids)
mean_score = merge_df.groupby('beer_id', as_index=False)[['score']].mean()
riids = mean_score['beer_id']
riids = riids.to_list()
#Get the beer inner ids
inner_ids = get_inner_ids(riids)

In [27]:
# Save the df_predict and df_ids for later use
df_predict.to_csv("../data/csv/df_predict_cosine_500.csv",index=False)
df_ids = pd.DataFrame(list(zip(riids,inner_ids)), columns=['beer_id', 'inner_ids'])
df_ids.to_csv("../data/csv/df_ids_cosine_500.csv",index=False)

In [30]:
# Lets test some beers.  Enter a beer and use the predictions model to return 5 nearest neighbors
# K=10

beer_raw_id =  232
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Corona Extra, American Adjunct Lager, score = 2.2471090047393374 are:


Unnamed: 0,beer_id,name,style,score_mean
0,20604,Hell Hath No Fury Ale,Belgian Dubbel,3.789254
1,75898,"Hop, Drop 'n Roll",American IPA,4.239664
2,79387,Drie Fonteinen Oude Geuze Golden Blend,Belgian Gueuze,4.478362
3,69522,Double Citra,New England IPA,4.594883
4,84078,White Chocolate,American Wheatwine Ale,4.199346
5,942,Full Sail IPA,American IPA,3.558979
6,13826,Cantillon Lou Pepe - Gueuze,Belgian Gueuze,4.511007
7,113674,King Sue,New England IPA,4.584517
8,84772,West Ashley,American Wild Ale,4.546653
9,2479,Petrus Oud Bruin,Flanders Oud Bruin,3.681499


In [35]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  412
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Old Rasputin, Russian Imperial Stout, score = 4.270477977977977 are:


Unnamed: 0,beer_id,name,style,score_mean
0,99011,Mastermind,American Imperial IPA,4.39691
1,189272,Ten FIDY - Bourbon Barrel Aged,Russian Imperial Stout,4.465606
2,90101,Surette Provision Saison,Belgian Saison,4.1854
3,122334,Bengali,American IPA,3.98447
4,83008,Society & Solitude #4,New England IPA,4.586907
5,62397,Rare Bourbon County Brand Stout (2010),American Imperial Stout,4.806029
6,64550,Sumatra Mountain Brown,American Brown Ale,4.275013
7,182256,Beer Camp: Tropical IPA (2016),New England IPA,3.920873
8,37113,Old Stock Cellar Reserve (Aged In Bourbon Barr...,English Old Ale,4.384061
9,123062,Swish,New England IPA,4.555908


In [37]:
# Lets test some beers.  Enter a beer and use the predictions model to return 5 nearest neighbors

beer_raw_id =  26049
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Wild Blue, Fruit and Field Beer, score = 2.1567888307155303 are:


Unnamed: 0,beer_id,name,style,score_mean
0,76421,Mornin' Delight,American Imperial Stout,4.697608
1,211516,Doppelganger,New England IPA,4.56824
2,111176,Rogue Farms 7 Hop IPA,American IPA,3.922445
3,40628,Big Swell IPA,American IPA,3.759861
4,88298,Sap,New England IPA,4.307752
5,49374,Colette Farmhouse Ale,Belgian Saison,3.828932
6,84194,16th Anniversary IPA,American Imperial IPA,3.806564
7,88169,Prairie Standard,Belgian Saison,3.995584
8,45541,Imperial Stout,American Imperial Stout,4.159929
9,42349,Marshmallow Handjee,Russian Imperial Stout,4.745615


In [38]:
# Lets test some beers.  Enter a beer and use the predictions model to return 5 nearest neighbors

beer_raw_id =  6108
print(f'The 5 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 5 nearest neighbors of 60 Minute IPA, American IPA, score = 4.019886399495109 are:


Unnamed: 0,beer_id,name,style,score_mean
0,154542,Mosaic Dry Hopped Fort Point,American Pale Ale (APA),4.498179
1,76008,Box Set Track #10 - Bat Out Of Hell,American Imperial Stout,4.395156
2,178857,ReDANKulous,American Imperial IPA,4.121856
3,89586,Pompeii,American IPA,4.299824
4,56199,Tweak,American Imperial Stout,4.432399
5,133621,Born Yesterday Pale Ale,American Pale Ale (APA),4.390237
6,74778,Rodenbach Caractère Rouge,Flanders Red Ale,4.451386
7,90473,Imperial Doughnut Break,American Imperial Porter,4.186481
8,114804,Crusher,New England IPA,4.473434
9,66245,Elevated IPA,American IPA,4.316832


In [39]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  332
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Miller Lite, American Light Lager, score = 2.06624165554072 are:


Unnamed: 0,beer_id,name,style,score_mean
0,37774,Schneider Weisse Tap 5 Schneider & Brooklyner ...,German Weizenbock,4.074547
1,56199,Tweak,American Imperial Stout,4.432399
2,772,Hacker-Pschorr Hefe Weisse Naturtrüb,German Hefeweizen,4.069036
3,99699,Accumulation,Belgian IPA,3.713689
4,127725,Electric Ray: India Pale Lager (Beer Camp Acro...,American Lager,3.812417
5,7284,Double IPA,American Imperial IPA,4.171067
6,27571,Hoppyum IPA,American IPA,3.753883
7,58048,Stone / Dogfish Head / Victory - Saison Du BUFF,Belgian Saison,3.795838
8,52752,Fyodor,Russian Imperial Stout,4.454628
9,170124,Boomsauce,New England IPA,3.930437


In [40]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  2512
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Chimay Grande Réserve (Blue), Belgian Strong Dark Ale, score = 4.2537427801309216 are:


Unnamed: 0,beer_id,name,style,score_mean
0,168971,Ruination Double IPA 2.0,American Imperial IPA,4.199918
1,176650,Alter Ego,New England IPA,4.516871
2,68269,The Waldos' Special Ale,American Imperial IPA,4.292115
3,182256,Beer Camp: Tropical IPA (2016),New England IPA,3.920873
4,211516,Doppelganger,New England IPA,4.56824
5,67722,Bourbon Barrel Aged Siberian Night Imperial Stout,Russian Imperial Stout,4.230686
6,23459,Thumbprint Enigma,Flanders Oud Bruin,4.14066
7,84140,Lil' B,American Imperial Porter,4.164918
8,18093,Arctic Devil Barley Wine,British Barleywine,4.354669
9,77708,Ninja Vs. Unicorn,American Imperial IPA,4.264877


In [41]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  1352
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Shiner Bock, German Bock, score = 3.0876065065840432 are:


Unnamed: 0,beer_id,name,style,score_mean
0,55401,B-Bomb (Bourbon Abominable Winter Ale),American Strong Ale,4.429721
1,63724,Oude Tart - Cherries,Flanders Red Ale,4.412661
2,78660,Assassin,American Imperial Stout,4.659036
3,83008,Society & Solitude #4,New England IPA,4.586907
4,8954,Cantillon Saint Lamvinus,Belgian Fruit Lambic,4.530538
5,107838,The Substance,American IPA,4.3075
6,172420,No Rules,American Imperial Porter,4.471153
7,67722,Bourbon Barrel Aged Siberian Night Imperial Stout,Russian Imperial Stout,4.230686
8,31548,Big Black Voodoo Daddy,Russian Imperial Stout,4.171649
9,154542,Mosaic Dry Hopped Fort Point,American Pale Ale (APA),4.498179


In [42]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  607
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Fat Tire Belgian Style Ale, American Amber / Red Ale, score = 3.524434306569337 are:


Unnamed: 0,beer_id,name,style,score_mean
0,123062,Swish,New England IPA,4.555908
1,182256,Beer Camp: Tropical IPA (2016),New England IPA,3.920873
2,74467,Boat Beer,American Pale Ale (APA),4.070037
3,18093,Arctic Devil Barley Wine,British Barleywine,4.354669
4,56875,BPA,Belgian Pale Ale,3.981579
5,34129,Sierra Nevada Bigfoot Barleywine Style Ale - B...,American Barleywine,4.179134
6,79898,Framinghammer - Bourbon Barrel-Aged,Baltic Porter,4.297917
7,139521,Barrel Aged Bomb!,American Imperial Stout,4.478484
8,74065,077XX,American Imperial IPA,4.208741
9,89586,Pompeii,American IPA,4.299824


In [43]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id =  246
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Heineken Lager Beer, European Pale Lager, score = 2.6418710089399737 are:


Unnamed: 0,beer_id,name,style,score_mean
0,69702,Seizoen Bretta,Belgian Saison,4.260015
1,35284,Sexual Chocolate,Russian Imperial Stout,4.103513
2,127725,Electric Ray: India Pale Lager (Beer Camp Acro...,American Lager,3.812417
3,74123,Snow Day Winter Ale,American Black Ale,3.668375
4,1611,Warsteiner Premium Dunkel,Munich Dunkel Lager,3.468836
5,127727,Alt Route: Altbier (Beer Camp Across America),German Altbier,3.554654
6,129674,18th Anniversary IPA,American Imperial IPA,3.877831
7,37774,Schneider Weisse Tap 5 Schneider & Brooklyner ...,German Weizenbock,4.074547
8,2566,Maredsous 10 - Triple,Belgian Tripel,3.915321
9,65113,80-Acre Hoppy Wheat,American Pale Wheat Ale,3.886248


In [44]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id = 2755 
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Cerveza Pacifico Clara, American Adjunct Lager, score = 2.647262210796916 are:


Unnamed: 0,beer_id,name,style,score_mean
0,3659,Cantillon Lou Pepe - Kriek,Belgian Fruit Lambic,4.619039
1,87246,Barrel-Aged Abraxas,American Imperial Stout,4.736365
2,172420,No Rules,American Imperial Porter,4.471153
3,75898,"Hop, Drop 'n Roll",American IPA,4.239664
4,81072,Summit Sága IPA,American IPA,3.916811
5,42349,Marshmallow Handjee,Russian Imperial Stout,4.745615
6,42203,Apricot Ale,American Wild Ale,4.332906
7,79405,Citra Ass Down!,American Imperial IPA,3.897207
8,62328,Estate Homegrown Wet Hop Ale,American IPA,4.088647
9,27571,Hoppyum IPA,American IPA,3.753883


In [45]:
# Lets test some beers.  Enter a beer and use the predictions model to return 10 nearest neighbors
# K=10, pearson

beer_raw_id = 131
print(f'The 10 nearest neighbors of {get_beer_name(beer_raw_id)}, {get_beer_style(beer_raw_id)},\
 score = {get_beer_score_mean (beer_raw_id)} are:')
df = get_beer_recc_df (beer_raw_id)
df.head(10)

The 10 nearest neighbors of Celebrator, German Doppelbock, score = 4.2999630390143775 are:


Unnamed: 0,beer_id,name,style,score_mean
0,176650,Alter Ego,New England IPA,4.516871
1,179482,IPA,American IPA,3.941913
2,95068,Another One,American IPA,4.292244
3,21363,Odyssey,Belgian Strong Dark Ale,4.242478
4,182256,Beer Camp: Tropical IPA (2016),New England IPA,3.920873
5,175577,CitruSinensis (OneHitter Series),American Pale Ale (APA),4.058003
6,90473,Imperial Doughnut Break,American Imperial Porter,4.186481
7,91358,Space Station Middle Finger,American Pale Ale (APA),4.160258
8,69290,The Corruption,American IPA,3.84671
9,127866,Hop Knife Harvest Ale,American IPA,4.037496
