## Imports

In [1]:
#!pip install texthero
#!pip install --upgrade numpy
#!pip install surprise

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import texthero as hero
from texthero import preprocessing

# Import from Surprise 
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, SVD
import warnings

---
## EDA

In [4]:
def eda(df):
    """Function to perform some basic EDA on my datasets"""
    
    #Inspect the first 5 rows
    display(df.head())
    print("\n")
    
    # Count of non-null values, datatypes, and total entries
    display(df.info())
    print("\n")
    
    # Check descriptive statistics
    display(df.describe())
    print("\n")
    
    # Check value counts
    for c in df.columns:
        print ("---- %s ----" % c)
        print (df[c].value_counts())
        print("\n")
    
    # Print null values
    display(df.isna().sum())
    print('Total Null Count:', df.isna().sum().sum())

> ### First Data Set
>
> [Beer Reviews](https://www.kaggle.com/rdoume/beerreviews)
>
>Not combining this data for now, it's proving to be too complicated 

In [5]:
#beer_reviews = pd.read_csv('data/beer_reviews.csv')
#eda(beer_reviews)

In [6]:
# dropping irrelevant columns
#beer_reviews = beer_reviews.drop(columns=['review_time','review_profilename'])

# renaming columns for consistency among all dataframes
#beer_reviews = beer_reviews.rename(columns={'review_overall': 'overall', 'review_aroma': 'smell', 'review_appearance':'look',
#                                           'beer_style':'style','review_palate':'feel','review_taste':'taste','beer_abv':'abv',
#                                           'beer_beerid':'beer_id'})

In [7]:
#beer_reviews.head()

---
> ### Second Data Set
>
> [Beers, Breweries, and Beer Reviews](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews?select=beers.csv)

In [8]:
beer = pd.read_csv('/content/drive/MyDrive/capstone/beers.csv')

In [9]:
eda(beer)

Unnamed: 0,id,name,brewery_id,state,country,style,availability,abv,notes,retired
0,202522,Olde Cogitator,2199,CA,US,English Oatmeal Stout,Rotating,7.3,No notes at this time.,f
1,82352,Konrads Stout Russian Imperial Stout,18604,,NO,Russian Imperial Stout,Rotating,10.4,No notes at this time.,f
2,214879,Scottish Right,44306,IN,US,Scottish Ale,Year-round,4.0,No notes at this time.,t
3,320009,MegaMeow Imperial Stout,4378,WA,US,American Imperial Stout,Winter,8.7,Every time this year,f
4,246438,Peaches-N-Cream,44617,PA,US,American Cream Ale,Rotating,5.1,No notes at this time.,f




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358873 entries, 0 to 358872
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            358873 non-null  int64  
 1   name          358873 non-null  object 
 2   brewery_id    358873 non-null  int64  
 3   state         298147 non-null  object 
 4   country       358719 non-null  object 
 5   style         358872 non-null  object 
 6   availability  358873 non-null  object 
 7   abv           320076 non-null  float64
 8   notes         358827 non-null  object 
 9   retired       358873 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 27.4+ MB


None





Unnamed: 0,id,brewery_id,abv
count,358873.0,358873.0,320076.0
mean,189241.027199,24597.365051,6.528504
std,107675.746916,16442.976904,2.085403
min,3.0,1.0,0.01
25%,96422.0,7318.0,5.0
50%,189909.0,28383.0,6.0
75%,282546.0,37452.0,7.5
max,374406.0,54144.0,100.0




---- id ----
2047      1
252462    1
236070    1
234023    1
256552    1
         ..
95620     1
97669     1
91526     1
93575     1
2049      1
Name: id, Length: 358873, dtype: int64


---- name ----
Oktoberfest                 755
IPA                         633
Pale Ale                    620
Hefeweizen                  477
Oatmeal Stout               443
                           ... 
Prion Porter                  1
Fifty Fifty Sorachi Pale      1
Spaltfest                     1
Kalifornia Uncommon           1
San Juan Pale Ale             1
Name: name, Length: 298567, dtype: int64


---- brewery_id ----
1550     1229
28383    1225
17981    1043
1146      922
13307     898
         ... 
42996       1
24813       1
18853       1
25889       1
37054       1
Name: brewery_id, Length: 16569, dtype: int64


---- state ----
CA    33649
PA    17083
NY    14572
CO    14182
MI    12690
      ...  
MB      171
PE       98
YT       86
NL       77
NT        4
Name: state, Length: 67, dtype:

id                  0
name                0
brewery_id          0
state           60726
country           154
style               1
availability        0
abv             38797
notes              46
retired             0
dtype: int64

Total Null Count: 99724


In [10]:
# dropping irrelevant columns
beer = beer.drop(columns=['country','notes'])

# renaming columns for consistency among all dataframes
beer = beer.rename(columns={'id': 'beer_id', 'name': 'beer_name'})

#removing rows without state data
beer = beer.dropna(subset=['state'])

# cleaning the availability column 
beer['availability'] = beer['availability'].str.strip()

In [11]:
beer.head()

Unnamed: 0,beer_id,beer_name,brewery_id,state,style,availability,abv,retired
0,202522,Olde Cogitator,2199,CA,English Oatmeal Stout,Rotating,7.3,f
2,214879,Scottish Right,44306,IN,Scottish Ale,Year-round,4.0,t
3,320009,MegaMeow Imperial Stout,4378,WA,American Imperial Stout,Winter,8.7,f
4,246438,Peaches-N-Cream,44617,PA,American Cream Ale,Rotating,5.1,f
6,108605,Icon Sender,22598,CA,American Lager,Year-round,5.6,f


In [12]:
eda(beer)

Unnamed: 0,beer_id,beer_name,brewery_id,state,style,availability,abv,retired
0,202522,Olde Cogitator,2199,CA,English Oatmeal Stout,Rotating,7.3,f
2,214879,Scottish Right,44306,IN,Scottish Ale,Year-round,4.0,t
3,320009,MegaMeow Imperial Stout,4378,WA,American Imperial Stout,Winter,8.7,f
4,246438,Peaches-N-Cream,44617,PA,American Cream Ale,Rotating,5.1,f
6,108605,Icon Sender,22598,CA,American Lager,Year-round,5.6,f




<class 'pandas.core.frame.DataFrame'>
Int64Index: 298147 entries, 0 to 358872
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   beer_id       298147 non-null  int64  
 1   beer_name     298147 non-null  object 
 2   brewery_id    298147 non-null  int64  
 3   state         298147 non-null  object 
 4   style         298146 non-null  object 
 5   availability  298147 non-null  object 
 6   abv           263558 non-null  float64
 7   retired       298147 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 20.5+ MB


None





Unnamed: 0,beer_id,brewery_id,abv
count,298147.0,298147.0,263558.0
mean,194002.156272,25343.706101,6.608174
std,106328.032039,16261.618864,2.064542
min,3.0,2.0,0.01
25%,103642.5,9819.0,5.2
50%,196192.0,29238.0,6.1
75%,286387.5,37625.0,7.6
max,374406.0,54144.0,100.0




---- beer_id ----
2047      1
250671    1
33605     1
39750     1
60232     1
         ..
136424    1
138473    1
134379    1
144620    1
2049      1
Name: beer_id, Length: 298147, dtype: int64


---- beer_name ----
Oktoberfest                              724
IPA                                      516
Pale Ale                                 477
Hefeweizen                               443
Oatmeal Stout                            416
                                        ... 
The Angel and The Sword                    1
Cambridge House Stonehenge Ale             1
Rudy's Dark                                1
Side Trail Series : Beard Of Paradise      1
San Juan Pale Ale                          1
Name: beer_name, Length: 247134, dtype: int64


---- brewery_id ----
28383    1225
17981    1043
1146      922
147       736
16866     723
         ... 
50863       1
30393       1
49473       1
41242       1
2344        1
Name: brewery_id, Length: 10011, dtype: int64


---- state ----


beer_id             0
beer_name           0
brewery_id          0
state               0
style               1
availability        0
abv             34589
retired             0
dtype: int64

Total Null Count: 34590


---
> ### Third Data Set
>
> [Beers, Breweries, and Beer Reviews](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews?select=beers.csv)

In [13]:
# reading in the dataframe containing the breweries
breweries = pd.read_csv('/content/drive/MyDrive/capstone/breweries.csv')

In [14]:
# eda on that data
eda(breweries)

Unnamed: 0,id,name,city,state,country,notes,types
0,19730,Brouwerij Danny,Erpe-Mere,,BE,No notes at this time.,Brewery
1,32541,Coachella Valley Brewing Co,Thousand Palms,CA,US,No notes at this time.,"Brewery, Bar, Beer-to-go"
2,44736,Beef 'O' Brady's,Plant City,FL,US,No notes at this time.,"Bar, Eatery"
3,23372,Broadway Wine Merchant,Oklahoma City,OK,US,No notes at this time.,Store
4,35328,Brighton Beer Dispensary (DUPLICATE),Brighton,GB2,GB,Duplicate of https://www.beeradvocate.com/beer...,"Bar, Eatery"




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50347 entries, 0 to 50346
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       50347 non-null  int64 
 1   name     50347 non-null  object
 2   city     50289 non-null  object
 3   state    39076 non-null  object
 4   country  50341 non-null  object
 5   notes    50262 non-null  object
 6   types    50347 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.7+ MB


None





Unnamed: 0,id
count,50347.0
mean,27870.513874
std,15270.53593
min,1.0
25%,15487.5
50%,28313.0
75%,40922.5
max,54156.0




---- id ----
2047     1
48413    1
36123    1
34074    1
40217    1
        ..
37479    1
39526    1
33381    1
35428    1
2049     1
Name: id, Length: 50347, dtype: int64


---- name ----
Whole Foods Market                      162
Total Wine & More                       147
Cost Plus World Market                  118
Mellow Mushroom                         114
Trader Joe's                             88
                                       ... 
Sparrow Wine & Liquor Co. - Downtown      1
Coconuts Bar & Grill                      1
Craftbeers                                1
Mosaic Cafe & Lounge                      1
The Portcullis                            1
Name: name, Length: 45245, dtype: int64


---- city ----
Chicago                     512
Philadelphia                505
New York                    432
Portland                    370
London                      358
                           ... 
Piazzola sul Brenta (PD)      1
Villarodin-Bourget            1
Ate         

id             0
name           0
city          58
state      11271
country        6
notes         85
types          0
dtype: int64

Total Null Count: 11420


In [15]:
# dropping irrelevant columns
breweries = breweries.drop(columns=['notes'])

# renaming columns for consistency among all dataframes
breweries = breweries.rename(columns={'id': 'brewery_id', 'name': 'brewery_name'})

#removing rows without state data
beer = beer.dropna(subset=['state'])

In [16]:
# making sure everything looks right
breweries.head()

Unnamed: 0,brewery_id,brewery_name,city,state,country,types
0,19730,Brouwerij Danny,Erpe-Mere,,BE,Brewery
1,32541,Coachella Valley Brewing Co,Thousand Palms,CA,US,"Brewery, Bar, Beer-to-go"
2,44736,Beef 'O' Brady's,Plant City,FL,US,"Bar, Eatery"
3,23372,Broadway Wine Merchant,Oklahoma City,OK,US,Store
4,35328,Brighton Beer Dispensary (DUPLICATE),Brighton,GB2,GB,"Bar, Eatery"


---
> ### Fourth Data Set
>
> [Beers, Breweries, and Beer Reviews](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews?select=beers.csv)

In [17]:
reviews = pd.read_csv('/content/drive/MyDrive/capstone/reviews.csv')

In [18]:
eda(reviews)

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9073128 entries, 0 to 9073127
Data columns (total 10 columns):
 #   Column    Dtype  
---  ------    -----  
 0   beer_id   int64  
 1   username  object 
 2   date      object 
 3   text      object 
 4   look      float64
 5   smell     float64
 6   taste     float64
 7   feel      float64
 8   overall   float64
 9   score     float64
dtypes: float64(6), int64(1), object(3)
memory usage: 692.2+ MB


None





Unnamed: 0,beer_id,look,smell,taste,feel,overall,score
count,9073128.0,5283110.0,5283110.0,5283110.0,5283110.0,5283110.0,9073128.0
mean,77306.55,3.952155,3.887863,3.920704,3.88435,3.91628,3.889815
std,79293.45,0.5517191,0.611591,0.6333365,0.5999279,0.6054554,0.6127417
min,3.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,11554.0,3.75,3.5,3.5,3.5,3.5,3.57
50%,56545.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,107667.0,4.25,4.25,4.25,4.25,4.25,4.25
max,373128.0,5.0,5.0,5.0,5.0,5.0,5.0




---- beer_id ----
11757     17160
2093      15947
7971      14927
1093      14915
29619     14292
          ...  
203777        1
126326        1
109950        1
282050        1
8188          1
Name: beer_id, Length: 309542, dtype: int64


---- username ----
Sammy            13798
kylehay2004      12221
acurtis          12016
StonedTrippin    11859
jaydoc           11800
                 ...  
mgerwien             1
rss                  1
robpod               1
leviticus7           1
BeermanLIc           1
Name: username, Length: 164934, dtype: int64


---- date ----
2011-11-15    17648
2011-11-14    17010
2011-11-16     9571
2014-07-20     9066
2011-11-18     8739
              ...  
2000-12-21        1
2000-11-11        1
1998-06-10        1
2000-09-19        1
1998-02-18        1
Name: date, Length: 6560, dtype: int64


---- text ----
                                                                                                                                                    

beer_id           0
username       3815
date              0
text              0
look        3790018
smell       3790018
taste       3790018
feel        3790018
overall     3790018
score             0
dtype: int64

Total Null Count: 18953905


In [19]:
# dropping irrelevant columns
reviews = reviews.drop(columns=['date','look','smell','taste','feel','overall'])

# dropping reviews without review data
reviews = reviews.dropna(subset=['username'])

In [20]:
# making everything looks right
reviews.head()

Unnamed: 0,beer_id,username,text,score
0,271781,bluejacket74,"750 ml bottle, 2016 vintage, bottle #304 of...",4.03
1,125646,_dirty_,,4.5
2,125646,CJDUBYA,,4.75
3,125646,GratefulBeerGuy,0% 16 oz can. Funny story: As I finally wal...,4.58
4,125646,LukeGude,Classic TH NEIPA. Overflowing head and bouq...,4.31


---

## Combining Data

In [21]:
beer_reviews = beer.merge(reviews, on=['beer_id'])

In [22]:
eda(beer_reviews)

Unnamed: 0,beer_id,beer_name,brewery_id,state,style,availability,abv,retired,username,text,score
0,214879,Scottish Right,44306,IN,Scottish Ale,Year-round,4.0,t,warpig372,,2.5
1,320009,MegaMeow Imperial Stout,4378,WA,American Imperial Stout,Winter,8.7,f,NickThePyro,,4.0
2,246438,Peaches-N-Cream,44617,PA,American Cream Ale,Rotating,5.1,f,rolltide8425,Exactly what it says it is,3.67
3,108605,Icon Sender,22598,CA,American Lager,Year-round,5.6,f,Smarty_Pints,,3.75
4,108605,Icon Sender,22598,CA,American Lager,Year-round,5.6,f,SocalKicks,,3.75




<class 'pandas.core.frame.DataFrame'>
Int64Index: 7936808 entries, 0 to 7936807
Data columns (total 11 columns):
 #   Column        Dtype  
---  ------        -----  
 0   beer_id       int64  
 1   beer_name     object 
 2   brewery_id    int64  
 3   state         object 
 4   style         object 
 5   availability  object 
 6   abv           float64
 7   retired       object 
 8   username      object 
 9   text          object 
 10  score         float64
dtypes: float64(2), int64(2), object(7)
memory usage: 726.6+ MB


None





Unnamed: 0,beer_id,brewery_id,abv,score
count,7936808.0,7936808.0,7775221.0,7936808.0
mean,82700.0,10918.91,7.422089,3.909407
std,79891.68,13687.96,2.471093,0.5951969
min,3.0,2.0,0.01,1.0
25%,19960.0,193.0,5.5,3.62
50%,62984.0,1337.0,7.0,4.0
75%,113858.0,22928.0,8.9,4.25
max,373128.0,54080.0,100.0,5.0




---- beer_id ----
11757     17137
2093      15926
7971      14909
1093      14896
29619     14285
          ...  
227086        1
231184        1
239380        1
141092        1
182179        1
Name: beer_id, Length: 259803, dtype: int64


---- beer_name ----
IPA                                   30640
Porter                                19024
Breakfast Stout                       17552
Imperial Stout                        17129
Pale Ale                              16293
                                      ...  
Hatchery: Strawberry Milkshake IPA        1
Radlier                                   1
Cranberry Peak Ale                        1
Gronk                                     1
Dewey's Porter                            1
Name: beer_name, Length: 218678, dtype: int64


---- brewery_id ----
140      175085
147      173216
1199     156665
35       152576
64       152227
          ...  
31295         1
50335         1
44502         1
51764         1
39365         1
Name: bre

beer_id              0
beer_name            0
brewery_id           0
state                0
style                2
availability         0
abv             161587
retired              0
username             0
text                 0
score                0
dtype: int64

Total Null Count: 161589


In [23]:
plus_brew = beer_reviews.merge(breweries, on=['brewery_id'])

In [24]:
eda(plus_brew)

Unnamed: 0,beer_id,beer_name,brewery_id,state_x,style,availability,abv,retired,username,text,score,brewery_name,city,state_y,country,types
0,214879,Scottish Right,44306,IN,Scottish Ale,Year-round,4.0,t,warpig372,,2.5,Byway Brewing,Hammond,IN,US,"Brewery, Bar, Eatery, Beer-to-go"
1,356585,Bingo & Snorky,44306,IN,American IPA,Rotating,6.9,f,uncleotis,,4.4,Byway Brewing,Hammond,IN,US,"Brewery, Bar, Eatery, Beer-to-go"
2,284962,Raspberry Beret,44306,IN,Fruit and Field Beer,Limited (brewed once),4.5,t,Prager62,On tap at the brewery on 06/04/17 served in...,3.61,Byway Brewing,Hammond,IN,US,"Brewery, Bar, Eatery, Beer-to-go"
3,356572,Salted Caramel imperial brown ale,44306,IN,American Brown Ale,Rotating,9.0,f,uncleotis,,4.18,Byway Brewing,Hammond,IN,US,"Brewery, Bar, Eatery, Beer-to-go"
4,278791,Bean White Stout,44306,IN,American Stout,Rotating,7.5,f,Hackattack,A very tasty stout. The color is very unusu...,4.0,Byway Brewing,Hammond,IN,US,"Brewery, Bar, Eatery, Beer-to-go"




<class 'pandas.core.frame.DataFrame'>
Int64Index: 7936808 entries, 0 to 7936807
Data columns (total 16 columns):
 #   Column        Dtype  
---  ------        -----  
 0   beer_id       int64  
 1   beer_name     object 
 2   brewery_id    int64  
 3   state_x       object 
 4   style         object 
 5   availability  object 
 6   abv           float64
 7   retired       object 
 8   username      object 
 9   text          object 
 10  score         float64
 11  brewery_name  object 
 12  city          object 
 13  state_y       object 
 14  country       object 
 15  types         object 
dtypes: float64(2), int64(2), object(12)
memory usage: 1.0+ GB


None





Unnamed: 0,beer_id,brewery_id,abv,score
count,7936808.0,7936808.0,7775221.0,7936808.0
mean,82700.0,10918.91,7.422089,3.909407
std,79891.68,13687.96,2.471093,0.5951969
min,3.0,2.0,0.01,1.0
25%,19960.0,193.0,5.5,3.62
50%,62984.0,1337.0,7.0,4.0
75%,113858.0,22928.0,8.9,4.25
max,373128.0,54080.0,100.0,5.0




---- beer_id ----
11757     17137
2093      15926
7971      14909
1093      14896
29619     14285
          ...  
227086        1
231184        1
239380        1
141092        1
182179        1
Name: beer_id, Length: 259803, dtype: int64


---- beer_name ----
IPA                         30640
Porter                      19024
Breakfast Stout             17552
Imperial Stout              17129
Pale Ale                    16293
                            ...  
Maggie Claus                    1
Fuzzy Nectar                    1
Broken Tail - Dry-Hopped        1
Bravo, Warrior!                 1
Dewey's Porter                  1
Name: beer_name, Length: 218678, dtype: int64


---- brewery_id ----
140      175085
147      173216
1199     156665
35       152576
64       152227
          ...  
31295         1
50335         1
44502         1
51764         1
39365         1
Name: brewery_id, Length: 9503, dtype: int64


---- state_x ----
CA     1468142
CO      512926
MI      509092
NY      4

beer_id              0
beer_name            0
brewery_id           0
state_x              0
style                2
availability         0
abv             161587
retired              0
username             0
text                 0
score                0
brewery_name         0
city                 0
state_y              0
country              0
types                0
dtype: int64

Total Null Count: 161589


In [25]:
# dropping duplicate column
plus_brew = plus_brew.drop(columns=['state_x'])

# renaming column to reflect
plus_brew = plus_brew.rename(columns={'state_y': 'state'})

In [26]:
# selecting only US beers
df = plus_brew.loc[(plus_brew['state'] == 'WA') & (plus_brew['retired'] == 'f')]


In [27]:
df = df.drop_duplicates()

In [28]:
# selecting only currently available beers
# df = plus_brew.loc[plus_brew['retired'] == 'f']

In [29]:
eda(df)

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,username,text,score,brewery_name,city,state,country,types
44,320009,MegaMeow Imperial Stout,4378,American Imperial Stout,Winter,8.7,f,NickThePyro,,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery
45,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,gandres,Georgetown Brewing [email protected] Seattl...,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery
46,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,DamienI,,3.75,Georgetown Brewing Company,Seattle,WA,US,Brewery
47,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,gandres,Glass from tap @ Georgetown Brewing [email ...,3.72,Georgetown Brewing Company,Seattle,WA,US,Brewery
48,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,dshansen1184,,3.85,Georgetown Brewing Company,Seattle,WA,US,Brewery




<class 'pandas.core.frame.DataFrame'>
Int64Index: 81006 entries, 44 to 7936805
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   beer_id       81006 non-null  int64  
 1   beer_name     81006 non-null  object 
 2   brewery_id    81006 non-null  int64  
 3   style         81006 non-null  object 
 4   availability  81006 non-null  object 
 5   abv           78408 non-null  float64
 6   retired       81006 non-null  object 
 7   username      81006 non-null  object 
 8   text          81006 non-null  object 
 9   score         81006 non-null  float64
 10  brewery_name  81006 non-null  object 
 11  city          81006 non-null  object 
 12  state         81006 non-null  object 
 13  country       81006 non-null  object 
 14  types         81006 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 9.9+ MB


None





Unnamed: 0,beer_id,brewery_id,abv,score
count,81006.0,81006.0,78408.0,81006.0
mean,88666.820433,12442.17732,7.178454,3.813942
std,85705.067718,13724.965852,2.182743,0.529421
min,946.0,351.0,0.05,1.0
25%,18443.0,700.0,5.7,3.5
50%,65499.0,1586.0,6.7,3.85
75%,124768.0,20680.0,8.1,4.15
max,372954.0,53727.0,16.8,5.0




---- beer_id ----
84045     2641
7077      1354
2018      1319
55401     1227
25649     1109
          ... 
168393       1
208275       1
340413       1
329110       1
293271       1
Name: beer_id, Length: 6256, dtype: int64


---- beer_name ----
Space Dust IPA                            2641
Night Owl Pumpkin Ale                     1354
The Immortal IPA                          1319
B-Bomb (Bourbon Abominable Winter Ale)    1227
Pyramid Apricot Ale                       1109
                                          ... 
The Landscape                                1
Brett Lab No. 12                             1
Tom Flanders Flanders Red                    1
Nillasicle                                   1
Chai Jones (Nitro)                           1
Name: beer_name, Length: 5780, dtype: int64


---- brewery_id ----
700      15199
20680     8930
403       5259
365       3745
684       2823
         ...  
49533        1
1061         1
50892        1
53565        1
45302        1
Na

beer_id            0
beer_name          0
brewery_id         0
style              0
availability       0
abv             2598
retired            0
username           0
text               0
score              0
brewery_name       0
city               0
state              0
country            0
types              0
dtype: int64

Total Null Count: 2598


In [30]:
# checking to see if some of these beers are actually named IPA and looking at each brewery's website I can confirm they are not creative people
df.loc[df['beer_name'] == 'IPA']

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,username,text,score,brewery_name,city,state,country,types
3534559,118068,IPA,32830,American IPA,Year-round,7.0,f,Donkster46,,3.61,Menace Brewing,Bellingham,WA,US,"Brewery, Bar"
3534560,118068,IPA,32830,American IPA,Year-round,7.0,f,beertunes,Served in Nonic. Pored a very nice light co...,3.67,Menace Brewing,Bellingham,WA,US,"Brewery, Bar"
3973407,96659,IPA,32084,American IPA,Rotating,8.0,f,Mattymystique,,3.75,Narrows Brewing Co.,Tacoma,WA,US,"Brewery, Bar, Eatery, Beer-to-go"
3973408,96659,IPA,32084,American IPA,Rotating,8.0,f,Sound_Explorer,0%,3.71,Narrows Brewing Co.,Tacoma,WA,US,"Brewery, Bar, Eatery, Beer-to-go"
3973409,96659,IPA,32084,American IPA,Rotating,8.0,f,bondjedi,,3.87,Narrows Brewing Co.,Tacoma,WA,US,"Brewery, Bar, Eatery, Beer-to-go"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7867428,146784,IPA,37641,American IPA,Year-round,7.0,f,VashonGuy,,3.83,Burwood Brewing Company,Walla Walla,WA,US,"Brewery, Bar, Beer-to-go"
7867429,146784,IPA,37641,American IPA,Year-round,7.0,f,Peter_ogrady,Very drinkable ipa - not as hoppy as I woul...,4.06,Burwood Brewing Company,Walla Walla,WA,US,"Brewery, Bar, Beer-to-go"
7867430,146784,IPA,37641,American IPA,Year-round,7.0,f,Thatoneguyssidekick,Pours a darker color than what would be exp...,4.17,Burwood Brewing Company,Walla Walla,WA,US,"Brewery, Bar, Beer-to-go"
7924089,60421,IPA,19721,American IPA,Year-round,,f,John_M,On tap at the brewpub yesterday. The MC IPA...,3.03,Mill Creek Brewpub,Walla Walla,WA,US,"Brewery, Bar, Eatery, Beer-to-go"


In [31]:
# dropping rows with missing ibu data 
df = df.dropna(subset=['abv'])

In [32]:
df.head()

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,username,text,score,brewery_name,city,state,country,types
44,320009,MegaMeow Imperial Stout,4378,American Imperial Stout,Winter,8.7,f,NickThePyro,,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery
45,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,gandres,Georgetown Brewing [email protected] Seattl...,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery
46,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,DamienI,,3.75,Georgetown Brewing Company,Seattle,WA,US,Brewery
47,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,gandres,Glass from tap @ Georgetown Brewing [email ...,3.72,Georgetown Brewing Company,Seattle,WA,US,Brewery
48,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,dshansen1184,,3.85,Georgetown Brewing Company,Seattle,WA,US,Brewery


In [33]:
df['beer_id'].nunique()

5494

In [34]:
grouped = df.groupby(['beer_id'], as_index = False).agg({'text': ' '.join})

In [35]:
eda(grouped)

Unnamed: 0,beer_id,text
0,948,Darkly rich and grainy black brew....
1,949,Rich clear amber pour wit...
2,950,0% One of the worst beers...
3,951,"On tap at Bitterroot, Seattle. Hazy bronze ..."
4,952,Pint at Cutters in downtown Seattle. A s...




<class 'pandas.core.frame.DataFrame'>
Int64Index: 5494 entries, 0 to 5493
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   beer_id  5494 non-null   int64 
 1   text     5494 non-null   object
dtypes: int64(1), object(1)
memory usage: 128.8+ KB


None





Unnamed: 0,beer_id
count,5494.0
mean,209586.767019
std,101739.793056
min,948.0
25%,126191.25
50%,216201.5
75%,298675.5
max,372954.0




---- beer_id ----
233470    1
267185    1
156325    1
283303    1
191144    1
         ..
252653    1
300376    1
9562      1
295122    1
122880    1
Name: beer_id, Length: 5494, dtype: int64


---- text ----
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

beer_id    0
text       0
dtype: int64

Total Null Count: 0


In [36]:
#grouped.to_csv("beerdata.csv", index=False)

In [37]:
df = df.dropna(subset=['text'])

In [38]:
#df.to_csv('fulldf.csv', index=False)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78408 entries, 44 to 7936805
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   beer_id       78408 non-null  int64  
 1   beer_name     78408 non-null  object 
 2   brewery_id    78408 non-null  int64  
 3   style         78408 non-null  object 
 4   availability  78408 non-null  object 
 5   abv           78408 non-null  float64
 6   retired       78408 non-null  object 
 7   username      78408 non-null  object 
 8   text          78408 non-null  object 
 9   score         78408 non-null  float64
 10  brewery_name  78408 non-null  object 
 11  city          78408 non-null  object 
 12  state         78408 non-null  object 
 13  country       78408 non-null  object 
 14  types         78408 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 9.6+ MB


In [40]:
df.isna().sum().sum()

0

In [41]:
cleandf = df.copy()

In [42]:
cleandf['clean_text'] = hero.clean(cleandf['text'])

In [43]:
cleandf.head()

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,username,text,score,brewery_name,city,state,country,types,clean_text
44,320009,MegaMeow Imperial Stout,4378,American Imperial Stout,Winter,8.7,f,NickThePyro,,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery,
45,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,gandres,Georgetown Brewing [email protected] Seattl...,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery,georgetown brewing email protected seattle wa ...
46,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,DamienI,,3.75,Georgetown Brewing Company,Seattle,WA,US,Brewery,
47,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,gandres,Glass from tap @ Georgetown Brewing [email ...,3.72,Georgetown Brewing Company,Seattle,WA,US,Brewery,glass tap georgetown brewing email protected s...
48,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,dshansen1184,,3.85,Georgetown Brewing Company,Seattle,WA,US,Brewery,


In [44]:
cleandf['clean_text'] = hero.stem(cleandf['clean_text'])

In [45]:
cleandf.head()

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,username,text,score,brewery_name,city,state,country,types,clean_text
44,320009,MegaMeow Imperial Stout,4378,American Imperial Stout,Winter,8.7,f,NickThePyro,,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery,
45,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,gandres,Georgetown Brewing [email protected] Seattl...,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery,georgetown brew email protect seattl wa sight ...
46,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,DamienI,,3.75,Georgetown Brewing Company,Seattle,WA,US,Brewery,
47,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,gandres,Glass from tap @ Georgetown Brewing [email ...,3.72,Georgetown Brewing Company,Seattle,WA,US,Brewery,glass tap georgetown brew email protect seattl...
48,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,dshansen1184,,3.85,Georgetown Brewing Company,Seattle,WA,US,Brewery,


In [46]:
cleandf['tfidf'] = hero.tfidf(cleandf['clean_text'], max_features = 200)

In [47]:
cleandf.head()

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,username,text,score,brewery_name,city,state,country,types,clean_text,tfidf
44,320009,MegaMeow Imperial Stout,4378,American Imperial Stout,Winter,8.7,f,NickThePyro,,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
45,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,gandres,Georgetown Brewing [email protected] Seattl...,4.0,Georgetown Brewing Company,Seattle,WA,US,Brewery,georgetown brew email protect seattl wa sight ...,"[0.0, 0.23632224698134519, 0.0, 0.0, 0.0, 0.0,..."
46,262748,Sparklepuss,4378,American IPA,Rotating,6.0,f,DamienI,,3.75,Georgetown Brewing Company,Seattle,WA,US,Brewery,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
47,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,gandres,Glass from tap @ Georgetown Brewing [email ...,3.72,Georgetown Brewing Company,Seattle,WA,US,Brewery,glass tap georgetown brew email protect seattl...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1978775505554..."
48,298624,"Boots, Berry & Murray",4378,Leipzig Gose,Summer,3.1,f,dshansen1184,,3.85,Georgetown Brewing Company,Seattle,WA,US,Brewery,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [48]:
features = hero.tfidf(cleandf['clean_text'], max_features = 200, return_feature_names = True)


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [49]:
features

(44         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 45         [0.0, 0.23632224698134519, 0.0, 0.0, 0.0, 0.0,...
 46         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 47         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1978775505554...
 48         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                                  ...                        
 7936651    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 7936721    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 7936778    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 7936804    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 7936805    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 Length: 78408, dtype: object,
 ['abv',
  'aftertast',
  'alcohol',
  'ale',
  'almost',
  'also',
  'amber',
  'amount',
  'appear',
  'apricot',
  'aroma',
  'around',
  'averag',
  'back',
  'bad',
  'balanc',
  'beer',
  'best',
  'better',
  'big',
  'bit',
  'bitter',
  'black',
  'bodi',
  'bomber',
  'bottl',

##Modeling

### Collaborative Filtering

In [50]:
reader = Reader(rating_scale =(1, 5) ) 
colab_df = Dataset.load_from_df(cleandf[['username', 'beer_id', 'score']], reader)

In [103]:
codf = cleandf[['username', 'beer_id', 'score']]

In [52]:
#Train test split with test size of 20% 
trainset, testset = train_test_split(colab_df, test_size=0.2)

In [55]:
print(len(testset))
print(testset[0])

15682
('vfgccp', 85085, 3.73)


In [56]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  11593 

Number of items:  4999 



In [57]:
sim_cos = {'name':'cosine', 'user_based':False}

In [58]:
basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f253ecc2890>

In [59]:
predictions = basic.test(testset)

In [60]:
print(accuracy.rmse(predictions))

RMSE: 0.5036
0.5036011158320134


In [61]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(trainset)
predictions = basic_pearson.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.5061
0.5061399770180597


In [62]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(trainset)
predictions = knn_baseline.test(testset)
print(accuracy.rmse(predictions))

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.4332
0.43321921404330566


In [74]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.4287
0.42866398888771334


In [75]:
user_34_prediction = svd.predict('34', '25')
user_34_prediction

Prediction(uid='34', iid='25', r_ui=None, est=3.820096451232344, details={'was_impossible': False})

In [76]:
user_34_prediction[3]

3.820096451232344

In [77]:
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(colab_df)

In [78]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.41191129339583055, 'mae': 0.29094365241660436}
{'rmse': {'n_factors': 20, 'reg_all': 0.05}, 'mae': {'n_factors': 20, 'reg_all': 0.05}}


In [82]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, colab_df) 

Computing the pearson similarity matrix...



invalid value encountered in sqrt



Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [83]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([0.46835315, 0.47364874, 0.46207167, 0.46774276, 0.47035057]))
('test_mae', array([0.33381937, 0.33865308, 0.33371864, 0.33519796, 0.33716137]))
('fit_time', (23.136934995651245, 21.237473964691162, 21.044249534606934, 21.64932131767273, 21.070987701416016))
('test_time', (4.07350754737854, 4.052576303482056, 4.113367319107056, 3.965907335281372, 4.088876247406006))
-----------------------
0.4684333763475137


In [84]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,colab_df)

Estimating biases using als...
Computing the pearson similarity matrix...



invalid value encountered in sqrt



Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [85]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([0.42239053, 0.41245731, 0.43130728, 0.42252806, 0.42077982]))
('test_mae', array([0.29766869, 0.29426188, 0.30373699, 0.29835208, 0.29745451]))
('fit_time', (22.353923559188843, 21.239930152893066, 22.045680284500122, 21.212291717529297, 21.29622197151184))
('test_time', (4.3296520709991455, 4.277353048324585, 5.209131240844727, 4.1706459522247314, 4.243609189987183))


0.42189259985454264

In [107]:
def beer_rater(df,num, style=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if style:
            beer = df[df['style'].str.contains(style)].sample(1)
        else:
            beer = df.sample(1)
        print(beer)
        rating = input('How do you rate this beer on a scale of 1-5, press n if you have not had :\n')
        if rating == 'n':
            continue
        else:
            rating_one_beer = {'username':userID,'beer_id':beer['beer_id'].values[0],'score':rating}
            rating_list.append(rating_one_beer) 
            num -= 1
    return rating_list   

In [108]:
simpledf = df[['beer_name','style','username','score','beer_id','brewery_name']]

In [109]:
user_rating = beer_rater(simpledf, 4, 'Stout')

                   beer_name           style  ... beer_id           brewery_name
3794049  Jive Espresso Stout  American Stout  ...   48071  Two Beers Brewing Co.

[1 rows x 6 columns]
How do you rate this beer on a scale of 1-5, press n if you have not had :
5
                                             beer_name  ...             brewery_name
1335469  Bourbon Barrel Aged Dark Star: Coffee Edition  ...  Fremont Brewing Company

[1 rows x 6 columns]
How do you rate this beer on a scale of 1-5, press n if you have not had :
3
                          beer_name  ...             brewery_name
1014081  Dragonstooth Oatmeal Stout  ...  Elysian Brewing Company

[1 rows x 6 columns]
How do you rate this beer on a scale of 1-5, press n if you have not had :
5
                              beer_name  ...                                brewery_name
7396775  Black Frog Nitro Oatmeal Stout  ...  Snoqualmie Falls Brewing Company & Taproom

[1 rows x 6 columns]
How do you rate this beer on a scale of

In [110]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = codf.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [111]:
new_ratings_df

Unnamed: 0,username,beer_id,score
0,NickThePyro,320009,4
1,gandres,262748,4
2,DamienI,262748,3.75
3,gandres,298624,3.72
4,dshansen1184,298624,3.85
...,...,...,...
78407,jmccraney,168379,3.25
78408,1000,48071,5
78409,1000,143753,3
78410,1000,2023,5


In [112]:
svd_ = SVD(n_factors= 50, reg_all=0.05)
svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2538f8f050>

In [114]:
list_of_beer = []
for beer_id in codf['beer_id'].unique():
    list_of_beer.append( (beer_id,svd_.predict(1000,beer_id)[3]))

In [115]:
ranked_beers = sorted(list_of_beer, key=lambda x:x[1], reverse=True)

In [123]:
# trying to fix this
def recommended_beer(user_ratings,df,n):
        for idx, rec in enumerate(user_ratings):
            name = df.loc[df['beer_name'] == int(rec[0])]['beer_name']
            print('Recommendation # ', idx+1, ': ', name, '\n')
            n-= 1
            if n == 0:
                break
            
recommended_beer(ranked_beers,simpledf,5)

Recommendation #  1 :  Series([], Name: beer_name, dtype: object) 

Recommendation #  2 :  Series([], Name: beer_name, dtype: object) 

Recommendation #  3 :  Series([], Name: beer_name, dtype: object) 

Recommendation #  4 :  Series([], Name: beer_name, dtype: object) 

Recommendation #  5 :  Series([], Name: beer_name, dtype: object) 



## Content-Based Recommendation

In [120]:
#coming soon 