## Imports

In [55]:
# !pip install texthero
# !pip install --upgrade numpy
# !pip install surprise

In [56]:
# from google.colab import drive
# drive.mount('/content/drive')

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import texthero as hero
from texthero import preprocessing

# Import from Surprise 
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, SVD

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.neighbors import NearestNeighbors

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import PCA

import warnings

---
## EDA

In [58]:
def eda(df):
    """Function to perform some basic EDA on my datasets"""
    
    #Inspect the first 5 rows
    display(df.head())
    print("\n")
    
    # Count of non-null values, datatypes, and total entries
    display(df.info())
    print("\n")
    
    # Check descriptive statistics
    display(df.describe())
    print("\n")
    
    # Check value counts
    for c in df.columns:
        print ("---- %s ----" % c)
        print (df[c].value_counts())
        print("\n")
    
    # Print null values
    display(df.isna().sum())
    print('Total Null Count:', df.isna().sum().sum())

---
> ### Second Data Set
>
> [Beers, Breweries, and Beer Reviews](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews?select=beers.csv)

In [59]:
beer = pd.read_csv('data/beers.csv')

In [60]:
eda(beer)

Unnamed: 0,id,name,brewery_id,state,country,style,availability,abv,notes,retired
0,202522,Olde Cogitator,2199,CA,US,English Oatmeal Stout,Rotating,7.3,No notes at this time.,f
1,82352,Konrads Stout Russian Imperial Stout,18604,,NO,Russian Imperial Stout,Rotating,10.4,No notes at this time.,f
2,214879,Scottish Right,44306,IN,US,Scottish Ale,Year-round,4.0,No notes at this time.,t
3,320009,MegaMeow Imperial Stout,4378,WA,US,American Imperial Stout,Winter,8.7,Every time this year,f
4,246438,Peaches-N-Cream,44617,PA,US,American Cream Ale,Rotating,5.1,No notes at this time.,f




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358873 entries, 0 to 358872
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            358873 non-null  int64  
 1   name          358873 non-null  object 
 2   brewery_id    358873 non-null  int64  
 3   state         298147 non-null  object 
 4   country       358719 non-null  object 
 5   style         358872 non-null  object 
 6   availability  358873 non-null  object 
 7   abv           320076 non-null  float64
 8   notes         358827 non-null  object 
 9   retired       358873 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 27.4+ MB


None





Unnamed: 0,id,brewery_id,abv
count,358873.0,358873.0,320076.0
mean,189241.027199,24597.365051,6.528504
std,107675.746916,16442.976904,2.085403
min,3.0,1.0,0.01
25%,96422.0,7318.0,5.0
50%,189909.0,28383.0,6.0
75%,282546.0,37452.0,7.5
max,374406.0,54144.0,100.0




---- id ----
2047      1
252462    1
236070    1
234023    1
256552    1
         ..
95620     1
97669     1
91526     1
93575     1
2049      1
Name: id, Length: 358873, dtype: int64


---- name ----
Oktoberfest                                         755
IPA                                                 633
Pale Ale                                            620
Hefeweizen                                          477
Oatmeal Stout                                       443
                                                   ... 
Brekeriet Sour White                                  1
Volcano IPA                                           1
Brewer's ExperimentALE Series #12: Oakmeal Stout      1
Entry Level Snob                                      1
Rumbler Red Scottish Ale                              1
Name: name, Length: 298567, dtype: int64


---- brewery_id ----
1550     1229
28383    1225
17981    1043
1146      922
13307     898
         ... 
42996       1
24813       1
18853

id                  0
name                0
brewery_id          0
state           60726
country           154
style               1
availability        0
abv             38797
notes              46
retired             0
dtype: int64

Total Null Count: 99724


In [61]:
# dropping irrelevant columns
beer = beer.drop(columns=['country','notes'])

# renaming columns for consistency among all dataframes
beer = beer.rename(columns={'id': 'beer_id', 'name': 'beer_name'})

#removing rows without state data
beer = beer.dropna(subset=['state'])

# cleaning the availability column 
beer['availability'] = beer['availability'].str.strip()

In [62]:
styledict= {'German Doppelbock':'Bock',
'German Maibock': 'Bock',
'German Bock': 'Bock',
'German Weizenbock': 'Bock',
'German Eisbock': 'Bock', 
'German Altbier':'Brown Ale',
'American Brown Ale':'Brown Ale',
'English Brown Ale':'Brown Ale',
'English Dark Mild Ale':'Brown Ale', 
'Belgian Dubbel':'Dark Ale',
'German Roggenbier':'Dark Ale',
'Scottish Ale':'Dark Ale',
'Winter Warmer':'Dark Ale', 
'American Amber / Red':'Dark Lager',
'American Amber / Red Ale':'Dark Lager',
'European Dark Lager':'Dark Lager',
'German Märzen / Oktoberfest':'Dark Lager',
'Munich Dunkel Lager':'Dark Lager',
'German Rauchbier':'Dark Lager',
'German Schwarzbier':'Dark Lager',
'Vienna Lager':'Dark Lager', 
'American IPA':'India Pale Ale',
'Belgian IPA':'India Pale Ale',
'American Brut IPA':'India Pale Ale',
'English India Pale Ale (IPA)':'India Pale Ale',
'American Imperial IPA':'India Pale Ale',
'New England IPA':'India Pale Ale', 
'English Bitter':'Pale Ale',
'English Extra Special / Strong Bitter (ESB)':'Pale Ale',
'Belgian Blonde Ale':'Pale Ale',
'American Blonde Ale':'Pale Ale',
'French Bière de Garde':'Pale Ale',
'Belgian Saison':'Pale Ale',
'German Kölsch':'Pale Ale',
'English Pale Ale':'Pale Ale',
'American Pale Ale (APA)':'Pale Ale',
'Belgian Pale Ale':'Pale Ale',
'American Amber / Red Lager':'Pale Ale',
'Irish Red Ale':'Pale Ale',
'American Adjunct Lager':'Pale Lager',
'American Light Lager ':'Pale Lager',
'European Export / Dortmunder':'Pale Lager',
'European Pale Lager':'Pale Lager',
'European Strong Lager':'Pale Lager',
'German Helles':'Pale Lager',
'German Kellerbier / Zwickelbier':'Pale Lager',
'American Light Lager':'Pale Lager',
'American Malt Liquor':'Pale Lager',
'Bohemian Pilsener':'Pale Lager',
'German Pilsner':'Pale Lager',
'American Imperial Pilsner':'Pale Lager',
'American Lager':'Pale Lager',
'American Porter':'Porter',
'English Porter':'Porter',
'Baltic Porter':'Porter',
'American Imperial Porter':'Porter',
'Smoke Porter':'Porter',
'Robust Porter':'Porter',
'American Imperial Stout':'Stout',
'American Stout':'Stout',
'English Sweet / Milk Stout':'Stout',
'Russian Imperial Stout':'Stout',
'English Oatmeal Stout':'Stout',
'Irish Dry Stout':'Stout',
'English Stout':'Stout',
'Foreign / Export Stout':'Stout',
'American Barleywine':'Strong Ale',
'British Barleywine':'Strong Ale',
'English Old Ale':'Strong Ale',
'Belgian Quadrupel (Quad)':'Strong Ale',
'American Imperial Red Ale':'Strong Ale',
'Scotch Ale / Wee Heavy':'Strong Ale',
'American Strong Ale':'Strong Ale',
'Belgian Dark Ale':'Strong Ale',
'Belgian Strong Dark Ale':'Strong Ale',
'Belgian Strong Pale Ale':'Strong Ale',
'English Strong Ale':'Strong Ale',
'Belgian Tripel':'Strong Ale',
'American Wheatwine Ale':'Strong Ale',
'American Dark Wheat Ale':'Wheat Beer',
'American Pale Wheat Ale':'Wheat Beer',
'German Dunkelweizen':'Wheat Beer',
'German Kristalweizen':'Wheat Beer',
'German Hefeweizen':'Wheat Beer',
'Belgian Witbier':'Wheat Beer',
'American Brett':'Wild/Sour Beer',
'Belgian Faro':'Wild/Sour Beer',
'Belgian Fruit Lambic':'Wild/Sour Beer',
'Belgian Gueuze':'Wild/Sour Beer',
'Belgian Lambic':'Wild/Sour Beer',
'Berliner Weisse':'Wild/Sour Beer',
'Flanders Oud Bruin':'Wild/Sour Beer',
'Flanders Red Ale':'Wild/Sour Beer',
'Leipzig Gose':'Wild/Sour Beer',
'American Wild Ale':'Wild/Sour Beer',
'Wild/Sour Beers':'Wild/Sour Beer',
'American Black Ale':'Black Ale',
'American Cream Ale':'Cream Ale'}

In [63]:
beer['broad_style'] = beer['style'].replace(styledict)

In [64]:
eda(beer)

Unnamed: 0,beer_id,beer_name,brewery_id,state,style,availability,abv,retired,broad_style
0,202522,Olde Cogitator,2199,CA,English Oatmeal Stout,Rotating,7.3,f,Stout
2,214879,Scottish Right,44306,IN,Scottish Ale,Year-round,4.0,t,Dark Ale
3,320009,MegaMeow Imperial Stout,4378,WA,American Imperial Stout,Winter,8.7,f,Stout
4,246438,Peaches-N-Cream,44617,PA,American Cream Ale,Rotating,5.1,f,Cream Ale
6,108605,Icon Sender,22598,CA,American Lager,Year-round,5.6,f,Pale Lager




<class 'pandas.core.frame.DataFrame'>
Int64Index: 298147 entries, 0 to 358872
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   beer_id       298147 non-null  int64  
 1   beer_name     298147 non-null  object 
 2   brewery_id    298147 non-null  int64  
 3   state         298147 non-null  object 
 4   style         298146 non-null  object 
 5   availability  298147 non-null  object 
 6   abv           263558 non-null  float64
 7   retired       298147 non-null  object 
 8   broad_style   298146 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 22.7+ MB


None





Unnamed: 0,beer_id,brewery_id,abv
count,298147.0,298147.0,263558.0
mean,194002.156272,25343.706101,6.608174
std,106328.032039,16261.618864,2.064542
min,3.0,2.0,0.01
25%,103642.5,9819.0,5.2
50%,196192.0,29238.0,6.1
75%,286387.5,37625.0,7.6
max,374406.0,54144.0,100.0




---- beer_id ----
2047      1
250671    1
33605     1
39750     1
60232     1
         ..
136424    1
138473    1
134379    1
144620    1
2049      1
Name: beer_id, Length: 298147, dtype: int64


---- beer_name ----
Oktoberfest                                 724
IPA                                         516
Pale Ale                                    477
Hefeweizen                                  443
Oatmeal Stout                               416
                                           ... 
Hit Me With Your Best Hops - Galaxy           1
All The Action (Quadruple Dry Hopped)         1
Barrelman’s Brown                             1
Red Racer Berry Colada Infused White Ale      1
Rumbler Red Scottish Ale                      1
Name: beer_name, Length: 247134, dtype: int64


---- brewery_id ----
28383    1225
17981    1043
1146      922
147       736
16866     723
         ... 
50863       1
30393       1
49473       1
41242       1
2344        1
Name: brewery_id, Length: 10011

beer_id             0
beer_name           0
brewery_id          0
state               0
style               1
availability        0
abv             34589
retired             0
broad_style         1
dtype: int64

Total Null Count: 34591


---
> ### Third Data Set
>
> [Beers, Breweries, and Beer Reviews](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews?select=beers.csv)

In [65]:
# reading in the dataframe containing the breweries
breweries = pd.read_csv('data/breweries.csv')

In [66]:
# eda on that data
eda(breweries)

Unnamed: 0,id,name,city,state,country,notes,types
0,19730,Brouwerij Danny,Erpe-Mere,,BE,No notes at this time.,Brewery
1,32541,Coachella Valley Brewing Co,Thousand Palms,CA,US,No notes at this time.,"Brewery, Bar, Beer-to-go"
2,44736,Beef 'O' Brady's,Plant City,FL,US,No notes at this time.,"Bar, Eatery"
3,23372,Broadway Wine Merchant,Oklahoma City,OK,US,No notes at this time.,Store
4,35328,Brighton Beer Dispensary (DUPLICATE),Brighton,GB2,GB,Duplicate of https://www.beeradvocate.com/beer...,"Bar, Eatery"




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50347 entries, 0 to 50346
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       50347 non-null  int64 
 1   name     50347 non-null  object
 2   city     50289 non-null  object
 3   state    39076 non-null  object
 4   country  50341 non-null  object
 5   notes    50262 non-null  object
 6   types    50347 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.7+ MB


None





Unnamed: 0,id
count,50347.0
mean,27870.513874
std,15270.53593
min,1.0
25%,15487.5
50%,28313.0
75%,40922.5
max,54156.0




---- id ----
2047     1
48413    1
36123    1
34074    1
40217    1
        ..
37479    1
39526    1
33381    1
35428    1
2049     1
Name: id, Length: 50347, dtype: int64


---- name ----
Whole Foods Market          162
Total Wine & More           147
Cost Plus World Market      118
Mellow Mushroom             114
Trader Joe's                 88
                           ... 
Altitude Brewing              1
Pivovar Klášter               1
Liquor Depot - Brentwood      1
Blake's Orchard Ales          1
Les 400 Coups                 1
Name: name, Length: 45245, dtype: int64


---- city ----
Chicago         512
Philadelphia    505
New York        432
Portland        370
London          358
               ... 
Otradnoe          1
Pokrov            1
Diekirch          1
Winklarn          1
Beernem           1
Name: city, Length: 11664, dtype: int64


---- state ----
CA     3638
PA     2454
NY     2284
GB2    2152
FL     1595
       ... 
MB       36
NL       23
PE        7
YT        3
NT

id             0
name           0
city          58
state      11271
country        6
notes         85
types          0
dtype: int64

Total Null Count: 11420


In [67]:
# dropping irrelevant columns
breweries = breweries.drop(columns=['notes'])

# renaming columns for consistency among all dataframes
breweries = breweries.rename(columns={'id': 'brewery_id', 'name': 'brewery_name'})

#removing rows without state data
beer = beer.dropna(subset=['state'])

In [68]:
# making sure everything looks right
breweries.head()

Unnamed: 0,brewery_id,brewery_name,city,state,country,types
0,19730,Brouwerij Danny,Erpe-Mere,,BE,Brewery
1,32541,Coachella Valley Brewing Co,Thousand Palms,CA,US,"Brewery, Bar, Beer-to-go"
2,44736,Beef 'O' Brady's,Plant City,FL,US,"Bar, Eatery"
3,23372,Broadway Wine Merchant,Oklahoma City,OK,US,Store
4,35328,Brighton Beer Dispensary (DUPLICATE),Brighton,GB2,GB,"Bar, Eatery"


---
> ### Fourth Data Set
>
> [Beers, Breweries, and Beer Reviews](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews?select=beers.csv)

In [69]:
reviews = pd.read_csv('data/reviews.csv')

In [70]:
eda(reviews)

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9073128 entries, 0 to 9073127
Data columns (total 10 columns):
 #   Column    Dtype  
---  ------    -----  
 0   beer_id   int64  
 1   username  object 
 2   date      object 
 3   text      object 
 4   look      float64
 5   smell     float64
 6   taste     float64
 7   feel      float64
 8   overall   float64
 9   score     float64
dtypes: float64(6), int64(1), object(3)
memory usage: 692.2+ MB


None





Unnamed: 0,beer_id,look,smell,taste,feel,overall,score
count,9073128.0,5283110.0,5283110.0,5283110.0,5283110.0,5283110.0,9073128.0
mean,77306.55,3.952155,3.887863,3.920704,3.88435,3.91628,3.889815
std,79293.45,0.5517191,0.6115909,0.6333365,0.5999279,0.6054554,0.6127417
min,3.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,11554.0,3.75,3.5,3.5,3.5,3.5,3.57
50%,56545.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,107667.0,4.25,4.25,4.25,4.25,4.25,4.25
max,373128.0,5.0,5.0,5.0,5.0,5.0,5.0




---- beer_id ----
11757     17160
2093      15947
7971      14927
1093      14915
29619     14292
          ...  
203777        1
126326        1
109950        1
282050        1
8188          1
Name: beer_id, Length: 309542, dtype: int64


---- username ----
Sammy            13798
kylehay2004      12221
acurtis          12016
StonedTrippin    11859
jaydoc           11800
                 ...  
Scheco               1
Tseliso              1
holowac              1
JeremyC              1
SJBeto               1
Name: username, Length: 164934, dtype: int64


---- date ----
2011-11-15    17648
2011-11-14    17010
2011-11-16     9571
2014-07-20     9066
2011-11-18     8739
              ...  
1998-06-23        1
2001-03-05        1
1999-02-23        1
1999-03-22        1
1998-04-03        1
Name: date, Length: 6560, dtype: int64


---- text ----
                                                                                                                                                    

4.00    1404549
3.50     784523
4.50     731599
4.25     652189
3.75     586989
3.00     340131
5.00     194129
4.75     179160
3.25     153232
2.50     100771
2.00      63560
2.75      40045
1.50      18330
1.00      13560
2.25      12336
1.75       5457
1.25       2550
Name: smell, dtype: int64


---- taste ----
4.00    1331969
4.50     853299
3.50     709595
4.25     682144
3.75     554833
3.00     291585
5.00     220820
4.75     212331
3.25     150945
2.50     103068
2.00      63884
2.75      43873
1.50      22609
1.00      19311
2.25      13482
1.75       6245
1.25       3117
Name: taste, dtype: int64


---- feel ----
4.00    1540880
3.50     749577
4.50     697844
4.25     623751
3.75     583744
3.00     348173
5.00     181624
4.75     161107
3.25     150760
2.50      96137
2.00      59189
2.75      39254
1.50      16895
1.00      13851
2.25      12406
1.75       5179
1.25       2739
Name: feel, dtype: int64


---- overall ----
4.00    1467309
4.50     798500
3.50     694298
4.25

beer_id           0
username       3815
date              0
text              0
look        3790018
smell       3790018
taste       3790018
feel        3790018
overall     3790018
score             0
dtype: int64

Total Null Count: 18953905


In [71]:
# dropping irrelevant columns
reviews = reviews.drop(columns=['date','look','smell','taste','feel','overall'])

# setting blank reviews as nan so they can be dropped
reviews['text'] = reviews['text'].str.strip()
reviews['text'] = reviews['text'].replace({"":np.nan,"0%":np.nan})

# dropping reviews without review text data
reviews = reviews.dropna(subset=['username','text'])

In [72]:
# making everything looks right
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2906027 entries, 0 to 9073114
Data columns (total 4 columns):
 #   Column    Dtype  
---  ------    -----  
 0   beer_id   int64  
 1   username  object 
 2   text      object 
 3   score     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 110.9+ MB


---

## Combining Data

In [73]:
beer_reviews = beer.merge(reviews, on=['beer_id'])

In [74]:
eda(beer_reviews)

Unnamed: 0,beer_id,beer_name,brewery_id,state,style,availability,abv,retired,broad_style,username,text,score
0,246438,Peaches-N-Cream,44617,PA,American Cream Ale,Rotating,5.1,f,Cream Ale,rolltide8425,Exactly what it says it is,3.67
1,29556,Warrior's Bock,8203,SK,German Bock,Rotating,7.5,t,Bock,markaberrant,Pours a clear reddish-brown (mahogany?) into m...,3.25
2,29556,Warrior's Bock,8203,SK,German Bock,Rotating,7.5,t,Bock,Noxious26,341ml bottle poured in a pint glass. Label has...,3.04
3,15271,Belgian Style Wit,1345,WA,Belgian Witbier,Year-round,4.5,t,Wheat Beer,zacharius,Beer poured a light pale yellow. Smell was goo...,2.5
4,69212,Réserve No. 3,18796,QC,American Strong Ale,Limited (brewed once),11.8,t,Strong Ale,papat444,***2010 Vintage*** Bought this directly at the...,2.4




<class 'pandas.core.frame.DataFrame'>
Int64Index: 2429919 entries, 0 to 2429918
Data columns (total 12 columns):
 #   Column        Dtype  
---  ------        -----  
 0   beer_id       int64  
 1   beer_name     object 
 2   brewery_id    int64  
 3   state         object 
 4   style         object 
 5   availability  object 
 6   abv           float64
 7   retired       object 
 8   broad_style   object 
 9   username      object 
 10  text          object 
 11  score         float64
dtypes: float64(2), int64(2), object(8)
memory usage: 241.0+ MB


None





Unnamed: 0,beer_id,brewery_id,abv,score
count,2429919.0,2429919.0,2366915.0,2429919.0
mean,67937.88,8443.941,7.293552,3.860247
std,78286.31,12514.34,2.407513,0.5921705
min,3.0,2.0,0.01,1.0
25%,7971.0,147.0,5.5,3.58
50%,44149.0,784.0,6.8,3.95
75%,88352.0,16315.0,8.7,4.25
max,373128.0,54080.0,100.0,5.0




---- beer_id ----
11757     4259
2093      4203
7971      4094
1093      4004
412       3955
          ... 
261618       1
117404       1
212458       1
220646       1
64487        1
Name: beer_id, Length: 171290, dtype: int64


---- beer_name ----
IPA                                         9356
Porter                                      6504
Imperial Stout                              6484
Pale Ale                                    6270
Oatmeal Stout                               5113
                                            ... 
Bullen Saison                                  1
House Ales / Le Trou Du Diable Hopscotch       1
Blackest Of Black IIPA                         1
Guilty Conscience                              1
Honey Don't                                    1
Name: beer_name, Length: 147720, dtype: int64


---- brewery_id ----
35       55909
140      55399
147      51241
64       47555
1199     40562
         ...  
52619        1
53549        1
17216        1
44419 

Name: score, Length: 401, dtype: int64




beer_id             0
beer_name           0
brewery_id          0
state               0
style               2
availability        0
abv             63004
retired             0
broad_style         2
username            0
text                0
score               0
dtype: int64

Total Null Count: 63008


In [75]:
plus_brew = beer_reviews.merge(breweries, on=['brewery_id'])

In [76]:
eda(plus_brew)

Unnamed: 0,beer_id,beer_name,brewery_id,state_x,style,availability,abv,retired,broad_style,username,text,score,brewery_name,city,state_y,country,types
0,246438,Peaches-N-Cream,44617,PA,American Cream Ale,Rotating,5.1,f,Cream Ale,rolltide8425,Exactly what it says it is,3.67,Mad Princes Brewing,Doylestown,PA,US,"Brewery, Bar, Beer-to-go"
1,246443,Clinging And Bitter,44617,PA,English Bitter,Rotating,3.8,f,Pale Ale,rolltide8425,A no-nonsense British style bitter; just how I...,3.74,Mad Princes Brewing,Doylestown,PA,US,"Brewery, Bar, Beer-to-go"
2,29556,Warrior's Bock,8203,SK,German Bock,Rotating,7.5,t,Bock,markaberrant,Pours a clear reddish-brown (mahogany?) into m...,3.25,Paddock Wood Brewing Co.,Saskatoon,SK,CA,"Brewery, Beer-to-go"
3,29556,Warrior's Bock,8203,SK,German Bock,Rotating,7.5,t,Bock,Noxious26,341ml bottle poured in a pint glass. Label has...,3.04,Paddock Wood Brewing Co.,Saskatoon,SK,CA,"Brewery, Beer-to-go"
4,35000,Bête Noire,8203,SK,English Oatmeal Stout,Year-round,5.4,f,Stout,CalgaryFMC,Got this lovely offering because the beer guy ...,3.57,Paddock Wood Brewing Co.,Saskatoon,SK,CA,"Brewery, Beer-to-go"




<class 'pandas.core.frame.DataFrame'>
Int64Index: 2429919 entries, 0 to 2429918
Data columns (total 17 columns):
 #   Column        Dtype  
---  ------        -----  
 0   beer_id       int64  
 1   beer_name     object 
 2   brewery_id    int64  
 3   state_x       object 
 4   style         object 
 5   availability  object 
 6   abv           float64
 7   retired       object 
 8   broad_style   object 
 9   username      object 
 10  text          object 
 11  score         float64
 12  brewery_name  object 
 13  city          object 
 14  state_y       object 
 15  country       object 
 16  types         object 
dtypes: float64(2), int64(2), object(13)
memory usage: 333.7+ MB


None





Unnamed: 0,beer_id,brewery_id,abv,score
count,2429919.0,2429919.0,2366915.0,2429919.0
mean,67937.88,8443.941,7.293552,3.860247
std,78286.31,12514.34,2.407513,0.5921705
min,3.0,2.0,0.01,1.0
25%,7971.0,147.0,5.5,3.58
50%,44149.0,784.0,6.8,3.95
75%,88352.0,16315.0,8.7,4.25
max,373128.0,54080.0,100.0,5.0




---- beer_id ----
11757     4259
2093      4203
7971      4094
1093      4004
412       3955
          ... 
261618       1
117404       1
212458       1
220646       1
64487        1
Name: beer_id, Length: 171290, dtype: int64


---- beer_name ----
IPA                            9356
Porter                         6504
Imperial Stout                 6484
Pale Ale                       6270
Oatmeal Stout                  5113
                               ... 
Bout-A-Hunerd                     1
Brandy Barrel Aged Dark Rye       1
Zeroth Anniversary                1
Orange Zest Lager                 1
South Shore                       1
Name: beer_name, Length: 147720, dtype: int64


---- brewery_id ----
35       55909
140      55399
147      51241
64       47555
1199     40562
         ...  
52619        1
53549        1
17216        1
44419        1
51642        1
Name: brewery_id, Length: 8531, dtype: int64


---- state_x ----
CA     427130
CO     160911
NY     160298
MI     14861

4.00    94392
4.25    35869
4.10    33045
4.20    32814
4.50    32621
        ...  
1.02       53
1.05       51
1.04       16
1.07       14
1.01        3
Name: score, Length: 401, dtype: int64


---- brewery_name ----
Boston Beer Company (Samuel Adams)             55909
Sierra Nevada Brewing Co.                      55399
Stone Brewing                                  51241
Dogfish Head Craft Brewery                     47555
Founders Brewing Company                       40562
                                               ...  
Les Gueux                                          1
Yïsst                                              1
Campervan Brewery                                  1
Bear's Lair                                        1
The Flowerpots Brewery / The Flowerpots Inn        1
Name: brewery_name, Length: 8388, dtype: int64


---- city ----
Escondido        60775
Jamaica Plain    55925
Chico            55399
Portland         55167
Chicago          51865
                 ...

beer_id             0
beer_name           0
brewery_id          0
state_x             0
style               2
availability        0
abv             63004
retired             0
broad_style         2
username            0
text                0
score               0
brewery_name        0
city                0
state_y             0
country             0
types               0
dtype: int64

Total Null Count: 63008


In [77]:
# dropping duplicate column
plus_brew = plus_brew.drop(columns=['state_x'])

# renaming column to reflect
plus_brew = plus_brew.rename(columns={'state_y': 'state'})

In [78]:
# selecting only available US beers
df = plus_brew.loc[(plus_brew['country'] == 'US') & (plus_brew['retired'] == 'f')]

df = df.drop_duplicates()

# dropping rows with missing ibu data 
df = df.dropna(subset=['abv'])

In [79]:
eda(df)

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,broad_style,username,text,score,brewery_name,city,state,country,types
0,246438,Peaches-N-Cream,44617,American Cream Ale,Rotating,5.1,f,Cream Ale,rolltide8425,Exactly what it says it is,3.67,Mad Princes Brewing,Doylestown,PA,US,"Brewery, Bar, Beer-to-go"
1,246443,Clinging And Bitter,44617,English Bitter,Rotating,3.8,f,Pale Ale,rolltide8425,A no-nonsense British style bitter; just how I...,3.74,Mad Princes Brewing,Doylestown,PA,US,"Brewery, Bar, Beer-to-go"
433,12661,Lava Rock Porter,1345,American Porter,Year-round,7.0,f,Porter,DrMullet,Pours a very dark brown with moderate head and...,3.5,Dick's Brewing Company,Centralia,WA,US,"Brewery, Bar, Eatery"
434,12661,Lava Rock Porter,1345,American Porter,Year-round,7.0,f,Porter,Karibourgeois,Dark brown pour with a tan head. Aroma of choc...,3.44,Dick's Brewing Company,Centralia,WA,US,"Brewery, Bar, Eatery"
435,12661,Lava Rock Porter,1345,American Porter,Year-round,7.0,f,Porter,Jeffsta1,"Beautiful dark brown/black, with a pretty, pal...",3.44,Dick's Brewing Company,Centralia,WA,US,"Brewery, Bar, Eatery"




<class 'pandas.core.frame.DataFrame'>
Int64Index: 1686227 entries, 0 to 2429918
Data columns (total 16 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   beer_id       1686227 non-null  int64  
 1   beer_name     1686227 non-null  object 
 2   brewery_id    1686227 non-null  int64  
 3   style         1686227 non-null  object 
 4   availability  1686227 non-null  object 
 5   abv           1686227 non-null  float64
 6   retired       1686227 non-null  object 
 7   broad_style   1686227 non-null  object 
 8   username      1686227 non-null  object 
 9   text          1686227 non-null  object 
 10  score         1686227 non-null  float64
 11  brewery_name  1686227 non-null  object 
 12  city          1686227 non-null  object 
 13  state         1686227 non-null  object 
 14  country       1686227 non-null  object 
 15  types         1686227 non-null  object 
dtypes: float64(2), int64(2), object(12)
memory usage: 218.7+ MB


None





Unnamed: 0,beer_id,brewery_id,abv,score
count,1686227.0,1686227.0,1686227.0,1686227.0
mean,65882.09,8922.693,7.275945,3.886465
std,79372.49,13009.61,2.349416,0.5910458
min,5.0,3.0,0.05,1.0
25%,6088.0,147.0,5.5,3.61
50%,40674.0,784.0,6.8,3.98
75%,86970.0,17960.0,8.5,4.26
max,373122.0,54080.0,100.0,5.0




---- beer_id ----
11757     4259
2093      4203
7971      4094
1093      4004
412       3955
          ... 
337430       1
320985       1
323032       1
300503       1
302737       1
Name: beer_id, Length: 82036, dtype: int64


---- beer_name ----
IPA                                             8448
Porter                                          6130
Breakfast Stout                                 4357
90 Minute IPA                                   4203
Pliny The Elder                                 4094
                                                ... 
Bourbon Barrel-Aged Save Some Room For Later       1
Pine On You Hazy Diamond                           1
This Name Change is Driving Me Nuts                1
Liquid Swords 11th Chamber                         1
Hyperborea                                         1
Name: beer_name, Length: 73149, dtype: int64


---- brewery_id ----
35       46082
64       44230
140      35462
1199     32661
287      31389
         ...  
46614    

Jamaica Plain     46098
Portland          44357
Milton            44230
San Diego         43884
Chicago           40139
                  ...  
Mystic                1
Hermitage             1
Cherokee              1
North Wales           1
Kelleys Island        1
Name: city, Length: 2253, dtype: int64


---- state ----
CA    313012
MI    123812
CO    123100
NY    121212
MA     99284
PA     91616
OR     84857
IL     56591
WI     55667
DE     47526
MO     47448
OH     44411
VT     39952
ME     35868
IN     30335
NC     29799
TX     29349
MD     29157
WA     27182
MN     26663
FL     25234
NH     24540
VA     21694
GA     17022
CT     13224
NJ     12143
UT     10064
OK      9957
LA      9952
AK      9702
SC      7688
IA      7507
HI      6840
MT      6533
ID      5611
KY      5050
AZ      4664
TN      4573
RI      4512
NM      4033
NE      3019
AL      2861
NV      2419
DC      2322
KS      1994
WY      1936
MS      1787
AR      1077
WV       562
SD       489
ND       377
Name: state, dty

beer_id         0
beer_name       0
brewery_id      0
style           0
availability    0
abv             0
retired         0
broad_style     0
username        0
text            0
score           0
brewery_name    0
city            0
state           0
country         0
types           0
dtype: int64

Total Null Count: 0


In [80]:
# checking to see if some of these beers are actually named IPA and looking at each brewery's website I can confirm they are not creative people
df.loc[df['beer_name'] == 'IPA']

Unnamed: 0,beer_id,beer_name,brewery_id,style,availability,abv,retired,broad_style,username,text,score,brewery_name,city,state,country,types
106363,91469,IPA,29573,American IPA,Year-round,7.2,f,India Pale Ale,mjbachma,"Classic bottle and labeling, titled by style, ...",3.94,pFriem Family Brewers,Hood River,OR,US,"Brewery, Bar, Eatery, Beer-to-go"
106364,91469,IPA,29573,American IPA,Year-round,7.2,f,India Pale Ale,CraftBeerRunner,Sweet Jammy fruity and bright. Drank fresh whi...,4.00,pFriem Family Brewers,Hood River,OR,US,"Brewery, Bar, Eatery, Beer-to-go"
106365,91469,IPA,29573,American IPA,Year-round,7.2,f,India Pale Ale,Hopheadjeffery,Tasted in a Spiegelau IPA glass from a bottle ...,4.04,pFriem Family Brewers,Hood River,OR,US,"Brewery, Bar, Eatery, Beer-to-go"
106366,91469,IPA,29573,American IPA,Year-round,7.2,f,India Pale Ale,BeerAndGasMasks,"From the bottle, it pours a nice lightly hazy ...",4.00,pFriem Family Brewers,Hood River,OR,US,"Brewery, Bar, Eatery, Beer-to-go"
106367,91469,IPA,29573,American IPA,Year-round,7.2,f,India Pale Ale,NickSMpls,Can these guys do anything that's not great? O...,4.18,pFriem Family Brewers,Hood River,OR,US,"Brewery, Bar, Eatery, Beer-to-go"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2429005,138894,IPA,36717,American IPA,Year-round,6.0,f,India Pale Ale,JonAdams,"Poured from tap at brewery. Deep copper, clear...",4.30,Lizard Tail Brewing,Albuquerque,NM,US,"Brewery, Bar, Beer-to-go"
2429495,322903,IPA,47202,American IPA,Year-round,5.9,f,India Pale Ale,stevoj,Bottle from BevMo. Gusher at opening. Dark gol...,3.55,Dagny Brewing Company,Modesto,CA,US,Brewery
2429694,275547,IPA,48593,American IPA,Year-round,6.6,f,India Pale Ale,JacobusFavier23,L- Straw colored IPA with a slight amber tint....,3.66,Haint Blue Brewery,Mobile,AL,US,Brewery
2429712,349972,IPA,49155,American IPA,Year-round,6.5,f,India Pale Ale,brett808ola,"Smells very fruity, great for tropical weather...",4.98,Hawaiian Ola Brewing Corporation,Kailua Kona,HI,US,"Brewery, Bar, Beer-to-go"


In [81]:
# creating a dataframe grouped by beer for content based recommendations
cont_df = df.groupby(['beer_id'], as_index = False).agg({'text': ' '.join, 'beer_id' : 'first','beer_name': 'first'	,
                                                         'brewery_id': 'first'	,'style': 'first'	,'broad_style': 'first'	,
                                                         'availability': 'first'	,'abv': 'first'	,'retired'	: 'first',
                                                         'text'	: 'first','score'	: 'first','brewery_name'	: 'first',
                                                         'city'	: 'first','state' : 'first'})

In [82]:
# setting the index to be unique beer id
cont_df = cont_df.set_index('beer_id')

In [83]:
eda(cont_df)

Unnamed: 0_level_0,text,beer_name,brewery_id,style,broad_style,availability,abv,retired,score,brewery_name,city,state
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5,Beautiful beer. Light and tasty.,Amber,3,Vienna Lager,Dark Lager,Year-round,4.5,f,3.93,Abita Brewing Co.,Abita Springs,LA
6,great brown ale...one of my favorites.,Turbodog,3,English Brown Ale,Brown Ale,Year-round,5.6,f,4.9,Abita Brewing Co.,Abita Springs,LA
7,The labeling with the purple haze guy with the...,Purple Haze,3,Fruit and Field Beer,Fruit and Field Beer,Year-round,4.2,f,3.9,Abita Brewing Co.,Abita Springs,LA
9,Poured chilled in large glass stein. A: cloudy...,Golden,3,American Lager,Pale Lager,Year-round,4.2,f,2.44,Abita Brewing Co.,Abita Springs,LA
10,Pours slightly hazy deep amber/brown. 1 finger...,Dubbel Ale,4,Belgian Dubbel,Dark Ale,Year-round,7.0,f,3.8,Allagash Brewing Company,Portland,ME




<class 'pandas.core.frame.DataFrame'>
Int64Index: 82036 entries, 5 to 373122
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text          82036 non-null  object 
 1   beer_name     82036 non-null  object 
 2   brewery_id    82036 non-null  int64  
 3   style         82036 non-null  object 
 4   broad_style   82036 non-null  object 
 5   availability  82036 non-null  object 
 6   abv           82036 non-null  float64
 7   retired       82036 non-null  object 
 8   score         82036 non-null  float64
 9   brewery_name  82036 non-null  object 
 10  city          82036 non-null  object 
 11  state         82036 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 8.1+ MB


None





Unnamed: 0,brewery_id,abv,score
count,82036.0,82036.0,82036.0
mean,26160.217002,6.722944,3.84014
std,15483.810761,2.0129,0.528591
min,3.0,0.05,1.0
25%,15504.0,5.3,3.59
50%,29854.0,6.3,3.9
75%,37475.0,7.8,4.16
max,54080.0,100.0,5.0




---- text ----
On tap at the brewery                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

Oktoberfest                                          252
IPA                                                  178
Pale Ale                                             130
Hefeweizen                                           115
Oatmeal Stout                                        104
                                                    ... 
Kidnapped By Vikings                                   1
Citra Kitties & Sunsets                                1
Rick - Rustic Farmhouse Saison                         1
Vinatta Russian Imperial Stout (Port Barrel Aged)      1
Old Ellicott Ale                                       1
Name: beer_name, Length: 73149, dtype: int64


---- brewery_id ----
28383    324
33510    293
41018    225
28178    217
9629     215
        ... 
41717      1
11492      1
44340      1
42357      1
49128      1
Name: brewery_id, Length: 5453, dtype: int64


---- style ----
American IPA               12785
American Imperial IPA       6682
American Pale Ale (APA)     5275


text            0
beer_name       0
brewery_id      0
style           0
broad_style     0
availability    0
abv             0
retired         0
score           0
brewery_name    0
city            0
state           0
dtype: int64

Total Null Count: 0


In [84]:
cont_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82036 entries, 5 to 373122
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text          82036 non-null  object 
 1   beer_name     82036 non-null  object 
 2   brewery_id    82036 non-null  int64  
 3   style         82036 non-null  object 
 4   broad_style   82036 non-null  object 
 5   availability  82036 non-null  object 
 6   abv           82036 non-null  float64
 7   retired       82036 non-null  object 
 8   score         82036 non-null  float64
 9   brewery_name  82036 non-null  object 
 10  city          82036 non-null  object 
 11  state         82036 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 8.1+ MB


In [85]:
cont_df['clean_text'] = hero.clean(cont_df['text'])

In [86]:
cont_df.head()

Unnamed: 0_level_0,text,beer_name,brewery_id,style,broad_style,availability,abv,retired,score,brewery_name,city,state,clean_text
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5,Beautiful beer. Light and tasty.,Amber,3,Vienna Lager,Dark Lager,Year-round,4.5,f,3.93,Abita Brewing Co.,Abita Springs,LA,beautiful beer light tasty
6,great brown ale...one of my favorites.,Turbodog,3,English Brown Ale,Brown Ale,Year-round,5.6,f,4.9,Abita Brewing Co.,Abita Springs,LA,great brown ale one favorites
7,The labeling with the purple haze guy with the...,Purple Haze,3,Fruit and Field Beer,Fruit and Field Beer,Year-round,4.2,f,3.9,Abita Brewing Co.,Abita Springs,LA,labeling purple haze guy face paint looks awes...
9,Poured chilled in large glass stein. A: cloudy...,Golden,3,American Lager,Pale Lager,Year-round,4.2,f,2.44,Abita Brewing Co.,Abita Springs,LA,poured chilled large glass stein cloudy golden...
10,Pours slightly hazy deep amber/brown. 1 finger...,Dubbel Ale,4,Belgian Dubbel,Dark Ale,Year-round,7.0,f,3.8,Allagash Brewing Company,Portland,ME,pours slightly hazy deep amber brown finger he...


In [87]:
cont_df['clean_text'] = hero.stem(cont_df['clean_text'])

In [88]:
cont_df.head()

Unnamed: 0_level_0,text,beer_name,brewery_id,style,broad_style,availability,abv,retired,score,brewery_name,city,state,clean_text
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5,Beautiful beer. Light and tasty.,Amber,3,Vienna Lager,Dark Lager,Year-round,4.5,f,3.93,Abita Brewing Co.,Abita Springs,LA,beauti beer light tasti
6,great brown ale...one of my favorites.,Turbodog,3,English Brown Ale,Brown Ale,Year-round,5.6,f,4.9,Abita Brewing Co.,Abita Springs,LA,great brown ale one favorit
7,The labeling with the purple haze guy with the...,Purple Haze,3,Fruit and Field Beer,Fruit and Field Beer,Year-round,4.2,f,3.9,Abita Brewing Co.,Abita Springs,LA,label purpl haze guy face paint look awesom un...
9,Poured chilled in large glass stein. A: cloudy...,Golden,3,American Lager,Pale Lager,Year-round,4.2,f,2.44,Abita Brewing Co.,Abita Springs,LA,pour chill larg glass stein cloudi golden yell...
10,Pours slightly hazy deep amber/brown. 1 finger...,Dubbel Ale,4,Belgian Dubbel,Dark Ale,Year-round,7.0,f,3.8,Allagash Brewing Company,Portland,ME,pour slight hazi deep amber brown finger head ...


In [89]:
cont_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82036 entries, 5 to 373122
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text          82036 non-null  object 
 1   beer_name     82036 non-null  object 
 2   brewery_id    82036 non-null  int64  
 3   style         82036 non-null  object 
 4   broad_style   82036 non-null  object 
 5   availability  82036 non-null  object 
 6   abv           82036 non-null  float64
 7   retired       82036 non-null  object 
 8   score         82036 non-null  float64
 9   brewery_name  82036 non-null  object 
 10  city          82036 non-null  object 
 11  state         82036 non-null  object 
 12  clean_text    82036 non-null  object 
dtypes: float64(2), int64(1), object(10)
memory usage: 8.8+ MB


In [90]:
cont_df = cont_df.drop(columns=['text','brewery_id','availability','retired'])

In [91]:
# cont_df.to_csv('data/cont_df.csv', index_label = 'beer_id')

In [92]:
tf = TfidfVectorizer(max_features=500, ngram_range=(1,2))
dtm = tf.fit_transform(cont_df['clean_text'])
dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names(), index = cont_df.index)

In [93]:
dtm

Unnamed: 0_level_0,abv,acid,actual,ad,add,aftertast,age,alcohol,ale,almost,...,work,worth,would,year,yeast,yeasti,yellow,yellow color,yet,zest
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
6,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.414496,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
7,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
9,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.209970,0.0,0.201687,0.000000,0.0,0.0
10,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.207682,0.000000,0.0,...,0.0,0.249621,0.0,0.0,0.205169,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373101,0.10215,0.0,0.000000,0.0,0.0,0.106862,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.095899,0.130761,0.0,0.0
373105,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
373108,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
373112,0.00000,0.0,0.483203,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0


## Modeling

### Content-Based Recommendation

In [94]:
model_df = cont_df.merge(dtm, left_index=True, right_index=True)

In [95]:
model_df = model_df.rename(columns={'style_x':'style','abv_x':'abv'})
model_df.head()

Unnamed: 0_level_0,beer_name,style,broad_style,abv,score,brewery_name,city,state,clean_text,abv_y,...,work,worth,would,year,yeast,yeasti,yellow,yellow color,yet,zest
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,Amber,Vienna Lager,Dark Lager,4.5,3.93,Abita Brewing Co.,Abita Springs,LA,beauti beer light tasti,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Turbodog,English Brown Ale,Brown Ale,5.6,4.9,Abita Brewing Co.,Abita Springs,LA,great brown ale one favorit,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Purple Haze,Fruit and Field Beer,Fruit and Field Beer,4.2,3.9,Abita Brewing Co.,Abita Springs,LA,label purpl haze guy face paint look awesom un...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Golden,American Lager,Pale Lager,4.2,2.44,Abita Brewing Co.,Abita Springs,LA,pour chill larg glass stein cloudi golden yell...,0.0,...,0.0,0.0,0.0,0.0,0.20997,0.0,0.201687,0.0,0.0,0.0
10,Dubbel Ale,Belgian Dubbel,Dark Ale,7.0,3.8,Allagash Brewing Company,Portland,ME,pour slight hazi deep amber brown finger head ...,0.0,...,0.0,0.249621,0.0,0.0,0.205169,0.0,0.0,0.0,0.0,0.0


In [96]:
model_df = model_df.drop(columns=['style','beer_name','brewery_name','city','state','clean_text'])

In [97]:
model_df.head()

Unnamed: 0_level_0,broad_style,abv,score,abv_y,acid,actual,ad,add,aftertast,age,...,work,worth,would,year,yeast,yeasti,yellow,yellow color,yet,zest
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,Dark Lager,4.5,3.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Brown Ale,5.6,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Fruit and Field Beer,4.2,3.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Pale Lager,4.2,2.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.20997,0.0,0.201687,0.0,0.0,0.0
10,Dark Ale,7.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.249621,0.0,0.0,0.205169,0.0,0.0,0.0,0.0,0.0


In [98]:
result_df = cont_df[['beer_name','style','brewery_name','city','state']]
result_df = result_df.rename(columns={'beer_name':'Name','style':'Style',
                                      'brewery_name':'Brewery','city':'City',
                                      'state':'State'})

In [99]:
# result_df.to_csv('data/result_df.csv', index_label = 'beer_id')

In [100]:
model_df.head()

Unnamed: 0_level_0,broad_style,abv,score,abv_y,acid,actual,ad,add,aftertast,age,...,work,worth,would,year,yeast,yeasti,yellow,yellow color,yet,zest
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,Dark Lager,4.5,3.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Brown Ale,5.6,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Fruit and Field Beer,4.2,3.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Pale Lager,4.2,2.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.20997,0.0,0.201687,0.0,0.0,0.0
10,Dark Ale,7.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.249621,0.0,0.0,0.205169,0.0,0.0,0.0,0.0,0.0


Arrange DF by stlye value counts?????

In [101]:
style_OHE = pd.get_dummies(model_df['broad_style'], sparse=True)
style_OHE.head()

Unnamed: 0_level_0,Bière de Champagne / Bière Brut,Black Ale,Bock,Braggot,Brown Ale,California Common / Steam Beer,Chile Beer,Cream Ale,Dark Ale,Dark Lager,...,Porter,Pumpkin Beer,Russian Kvass,Rye Beer,Scottish Gruit / Ancient Herbed Ale,Smoke Beer,Stout,Strong Ale,Wheat Beer,Wild/Sour Beer
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
col_names = ['abv', 'score']

features = model_df[col_names]

features = MinMaxScaler().fit_transform(features.values)

model_df[col_names] = features

final_df = model_df.join(style_OHE)

final_df = final_df.drop(columns=['broad_style'])

In [103]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82036 entries, 5 to 373122
Columns: 531 entries, abv to Wild/Sour Beer
dtypes: Sparse[uint8, 0](29), float64(502)
memory usage: 317.7 MB


In [104]:
final_df.head()

Unnamed: 0_level_0,abv,score,abv_y,acid,actual,ad,add,aftertast,age,alcohol,...,Porter,Pumpkin Beer,Russian Kvass,Rye Beer,Scottish Gruit / Ancient Herbed Ale,Smoke Beer,Stout,Strong Ale,Wheat Beer,Wild/Sour Beer
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.044522,0.7325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,0.055528,0.975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.041521,0.725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,0.041521,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,0.069535,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207682,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# superfuzz 90993
y = np.array(final_df.loc[90993])
y = y.reshape(1, -1)
cos_sim = cosine_similarity(final_df, y)
cos_sim = pd.DataFrame(data=cos_sim, index=final_df.index)
results = cos_sim.sort_values(by = 0, ascending=False)
results.head(6)

Unnamed: 0_level_0,0
beer_id,Unnamed: 1_level_1
90993,1.0
249115,0.786087
365685,0.777723
182322,0.773225
74273,0.771552
71293,0.768874


In [106]:
result_df.loc[90993]

Name       Superfuzz Blood Orange Pale Ale
Style              American Pale Ale (APA)
Brewery            Elysian Brewing Company
City                               Seattle
State                                   WA
Name: 90993, dtype: object

In [149]:
result_df.loc[249115]

Name         Oh My Darlin, Saisontine
Style                  Belgian Saison
Brewery    Mt. Carmel Brewing Company
City                       Cincinnati
State                              OH
Name: 249115, dtype: object

In [108]:
def cos_beer():
    
    style_input = input('Style: ')
    beer_input = input('Beer Name: ')
    n_recs = int(input('How many recommendations? '))
    
    style_df = cont_df[cont_df['broad_style'] == style_input]
    
    tf = TfidfVectorizer(max_features=500, ngram_range=(1,3))
    dtm = tf.fit_transform(style_df['clean_text'])
    dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names(), index = style_df.index)
    style_df = style_df.merge(dtm, left_index=True, right_index=True)
    style_df = style_df.rename(columns={'style_x':'style','abv_x':'abv'})
    style_df = style_df.drop(columns=['broad_style','style','beer_name','brewery_name',
                                      'city','state','clean_text'])

    col_names = ['abv', 'score']
    features = style_df[col_names]
    features = MinMaxScaler().fit_transform(features.values)
    style_df[col_names] = features
         
    beerix = cont_df.loc[cont_df['beer_name'] == beer_input].index.values
    y = np.array(style_df.loc[beerix[0]])
    y = y.reshape(1, -1)
    
    cos_sim = cosine_similarity(style_df, y)
    cos_sim = pd.DataFrame(data=cos_sim, index=style_df.index)
    results = cos_sim.sort_values(by = 0, ascending=False)
    nresultsid = results.head(n_recs+1).index.values[1:]
    nresults_df = result_df.loc[nresultsid]
    return nresults_df.style.hide_index()

In [109]:
cos_beer()

Style: Pale Ale
Beer Name: Superfuzz Blood Orange Pale Ale
How many recommendations? 5


Name,Style,Brewery,City,State
"Oh My Darlin, Saisontine",Belgian Saison,Mt. Carmel Brewing Company,Cincinnati,OH
Ron And The Beast Ryan,Belgian Saison,Evil Twin Brewing,Brooklyn,NY
Hopop Hetbot,Belgian Pale Ale,Cumberland Brews,Louisville,KY
Krohn Grown,Belgian Pale Ale,The Woodburn Brewery,Cincinnati,OH
Chair Lift,Belgian Saison,Mile Wide Beer Co.,Louisville,KY


In [110]:
def lin_beer():
    
    style_input = input('Style: ')
    beer_input = input('Beer Name: ')
    n_recs = int(input('How many recommendations? '))
    
    style_df = cont_df[cont_df['broad_style'] == style_input]
    
    tf = TfidfVectorizer(max_features=500, ngram_range=(1,3))
    dtm = tf.fit_transform(style_df['clean_text'])
    dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names(), index = style_df.index)
    style_df = style_df.merge(dtm, left_index=True, right_index=True)
    style_df = style_df.rename(columns={'style_x':'style','abv_x':'abv'})
    style_df = style_df.drop(columns=['broad_style','style','beer_name','brewery_name',
                                      'city','state','clean_text'])

    col_names = ['abv', 'score']
    features = style_df[col_names]
    features = MinMaxScaler().fit_transform(features.values)
    style_df[col_names] = features
         
    beerix = cont_df.loc[cont_df['beer_name'] == beer_input].index.values
    y = np.array(style_df.loc[beerix[0]])
    y = y.reshape(1, -1)
    
    lin_sim = linear_kernel(style_df, y)
    lin_sim = pd.DataFrame(data=lin_sim, index=style_df.index)
    results = lin_sim.sort_values(by = 0, ascending=False)
    nresultsid = results.head(n_recs+1).index.values[1:]
    nresults_df = result_df.loc[nresultsid]
    return nresults_df.style.hide_index()

In [111]:
lin_beer()

Style: Pale Ale
Beer Name: Superfuzz Blood Orange Pale Ale
How many recommendations? 5


Name,Style,Brewery,City,State
"Oh My Darlin, Saisontine",Belgian Saison,Mt. Carmel Brewing Company,Cincinnati,OH
Parabolic,American Pale Ale (APA),Woodstock Brewing,Phoenicia,NY
Oak Aged Saison,Belgian Saison,Breckenridge Brewery,Littleton,CO
Funked Up Series #32: Sint Whosit,Belgian Saison,Bent Brewstillery,Roseville,MN
Hot Mess,Belgian Saison,Southern Tier Brewing Company,Lakewood,NY


In [112]:
def knn_beer():
    
    style_input = input('Style: ')
    beer_input = input('Beer Name: ')
    n_recs = int(input('How many recommendations? '))
    
    style_df = cont_df[cont_df['broad_style'] == style_input]
    
    tf = TfidfVectorizer(max_features=500, ngram_range=(1,3))
    dtm = tf.fit_transform(style_df['clean_text'])
    dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names(), index = style_df.index)
    style_df = style_df.merge(dtm, left_index=True, right_index=True)
    style_df = style_df.rename(columns={'style_x':'style','abv_x':'abv'})
    style_df = style_df.drop(columns=['broad_style','style','beer_name','brewery_name',
                                      'city','state','clean_text'])

    col_names = ['abv', 'score']
    features = style_df[col_names]
    features = MinMaxScaler().fit_transform(features.values)
    style_df[col_names] = features
         
    beerix = cont_df.loc[cont_df['beer_name'] == beer_input].index.values
    x = np.array(style_df.loc[beerix[0]])
    x = x.reshape(1, -1)
    
    knn = NearestNeighbors(algorithm='brute', n_neighbors=5)
    knn.fit(style_df)
    
    results = knn.kneighbors(x, n_recs+1, return_distance=False).flatten()
    resultsids = style_df.iloc[results].index.values[1:]
    nresults_df = result_df.loc[resultsids]
    return nresults_df.style.hide_index()

In [113]:
knn_beer()

Style: Pale Ale
Beer Name: Superfuzz Blood Orange Pale Ale
How many recommendations? 5


Name,Style,Brewery,City,State
Extra Special Bitter,English Extra Special / Strong Bitter (ESB),Souris River Brewing,Minot,ND
Private Rye Bière De Garde,French Bière de Garde,Jack's Abby Brewing,Framingham,MA
Eagle Skull Saison,Belgian Saison,Monnik Beer Co.,Louisville,KY
Broken Rival,American Pale Ale (APA),BuckleDown Brewing,Lyons,IL
The Wise ESB,English Extra Special / Strong Bitter (ESB),Elysian Brewing Company,Seattle,WA


### Collaborative Filtering ???????????????

In [114]:
# reader = Reader(rating_scale =(1, 5) ) 
# collab_df = Dataset.load_from_df(df[['username', 'beer_id', 'score']], reader)

In [115]:
# cleandf = df[['username', 'beer_id', 'score']]

In [116]:
# #Train test split with test size of 20% 
# trainset, testset = train_test_split(collab_df, test_size=0.2)

In [117]:
# print(len(testset))
# print(testset[0])

In [118]:
# print('Number of users: ', trainset.n_users, '\n')
# print('Number of items: ', trainset.n_items, '\n')

In [119]:
# sim_cos = {'name':'cosine', 'user_based':False}

In [120]:
# basic = knns.KNNBasic(sim_options=sim_cos)
# basic.fit(trainset)

In [121]:
# predictions = basic.test(testset)

In [122]:
# print(accuracy.rmse(predictions))

In [123]:
# sim_pearson = {'name':'pearson', 'user_based':False}
# basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
# basic_pearson.fit(trainset)
# predictions = basic_pearson.test(testset)
# print(accuracy.rmse(predictions))

In [124]:
# sim_pearson = {'name':'pearson', 'user_based':False}
# knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
# knn_baseline.fit(trainset)
# predictions = knn_baseline.test(testset)
# print(accuracy.rmse(predictions))

In [125]:
# svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
# svd.fit(trainset)
# predictions = svd.test(testset)
# print(accuracy.rmse(predictions))

In [126]:
# user_34_prediction = svd.predict('34', '25')
# user_34_prediction

In [127]:
# user_34_prediction[3]

In [128]:
# params = {'n_factors': [20, 50, 100],
#          'reg_all': [0.02, 0.05, 0.1]}
# g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
# g_s_svd.fit(colab_df)

In [129]:
# print(g_s_svd.best_score)
# print(g_s_svd.best_params)

In [130]:
# # cross validating with KNNBasic
# knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
# cv_knn_basic = cross_validate(knn_basic, colab_df) 

In [131]:
# for i in cv_knn_basic.items():
#     print(i)
# print('-----------------------')
# print(np.mean(cv_knn_basic['test_rmse']))

In [132]:
# # cross validating with KNNBaseline
# knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
# cv_knn_baseline = cross_validate(knn_baseline,colab_df)

In [133]:
# for i in cv_knn_baseline.items():
#     print(i)

# np.mean(cv_knn_baseline['test_rmse'])

In [134]:
# def beer_rater(df,num, style=None):
#     userID = 1000
#     rating_list = []
#     while num > 0:
#         if style:
#             beer = df[df['style'].str.contains(style)].sample(1)
#         else:
#             beer = df.sample(1)
#         print(beer)
#         rating = input('How do you rate this beer on a scale of 1-5, press n if you have not had :\n')
#         if rating == 'n':
#             continue
#         else:
#             rating_one_beer = {'username':userID,'beer_id':beer['beer_id'].values[0],'score':rating}
#             rating_list.append(rating_one_beer) 
#             num -= 1
#     return rating_list   

In [135]:
# simpledf = df[['beer_name','style','username','score','beer_id','brewery_name']]

In [136]:
# user_rating = beer_rater(simpledf, 4, 'Stout')

In [137]:
# ## add the new ratings to the original ratings DataFrame
# new_ratings_df = codf.append(user_rating,ignore_index=True)
# new_data = Dataset.load_from_df(new_ratings_df,reader)

In [138]:
# new_ratings_df

In [139]:
# svd_ = SVD(n_factors= 50, reg_all=0.05)
# svd_.fit(new_data.build_full_trainset())

In [140]:
# list_of_beer = []
# for beer_id in codf['beer_id'].unique():
#     list_of_beer.append( (beer_id,svd_.predict(1000,beer_id)[3]))

In [141]:
# ranked_beers = sorted(list_of_beer, key=lambda x:x[1], reverse=True)

In [142]:
# # trying to fix this
# def recommended_beer(user_ratings,df,n):
#         for idx, rec in enumerate(user_ratings):
#             name = df.loc[df['beer_name'] == int(rec[0])]['beer_name']
#             print('Recommendation # ', idx+1, ': ', name, '\n')
#             n-= 1
#             if n == 0:
#                 break
            
# recommended_beer(ranked_beers,simpledf,5)

In [143]:
# model_df.head()

In [144]:
# plt.figure(figsize=(20,20))

# ax = sns.scatterplot(data=model_df, x='abv', y='score', 
#                      hue='style', palette='rainbow',
#                      size='score', sizes=(10,800), 
#                      alpha=0.7)

# ax.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0., fontsize=12, ncol=2)

In [145]:
# sns.catplot(x='score', y='style', data=model_df, kind='bar',
#             palette='mako', height=20, aspect=1.5, ci=None)
# plt.xticks(fontsize=13)
# plt.yticks(fontsize=12.7)
# plt.xlabel('Score', fontsize=16)
# plt.ylabel('Style', fontsize=16, labelpad=15)
# plt.title('Mean Score by Style', fontsize=21,
#           pad=10, fontweight='bold');

In [146]:
# final_df.head()

In [147]:
# final_df.isna().sum().sum()

In [148]:
# pca = PCA(n_components=.9, random_state=42)

# beer_pca = pca.fit_transform(final_df)
