In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the scraped beer-style records (one row per style) into a DataFrame.
# The path is relative to this notebook's working directory.
styles = pd.read_json(
    "../../ba_scrape/scrape_extract_data/styles_final.json"
)

In [6]:
# Preview the first rows to check the columns and spot encoding artefacts.
styles.head()

Unnamed: 0,abvRange,baLink,description,glassware,ibuRange,name
0,6.0â€“9.0%,https://www.beeradvocate.com/beer/styles/57/,"The Belgian Dubbel is a rich, malty beer with ...",Goblet (or Chalice),15â€“30,Belgian Dubbel
1,5.5â€“7.5%,https://www.beeradvocate.com/beer/styles/116/,Today's American IPA is a different soul from ...,Tulip,50â€“70,American IPA
2,4.6-5.3%,https://www.beeradvocate.com/beer/styles/41/,A classic German-style Pilsner is straw to pal...,Flute,25-40,German Pilsner
3,4.5â€“6.5%,https://www.beeradvocate.com/beer/styles/97/,"Originally British in origin, this style is no...","Pint Glass (or Becker, Nonic, Tumbler)",25â€“50,American Pale Ale (APA)
4,7.0â€“12.0%,https://www.beeradvocate.com/beer/styles/140/,We have west coast American brewers to thank f...,Tulip,65â€“100,American Imperial IPA


In [7]:
def clean_text(corpus):
    """Replace known mis-encoded character sequences with the intended characters.

    Each garbled sequence listed in the table below is swapped for the
    character it stands for; all other text is returned unchanged.
    NOTE(review): the sequences look like UTF-8 text that was decoded with a
    Windows-1252-style code page -- confirm against the scraper.

    Parameters
    ----------
    corpus : str
        Text that may contain the garbled sequences.

    Returns
    -------
    str
        ``corpus`` with every listed sequence replaced.
    """
    replace_dict = {
        "â€œ": "\"",
        "â€�": "\"",
        "Ã¨": "è",
        "Ã¶": "ö",
        # Key is the bare "Ã¼" sequence. The previous key "Ã¼s" swallowed the
        # following "s" ("DÃ¼sseldorf" -> "Düseldorf") and missed "Ã¼" before
        # any other letter.
        "Ã¼": "ü",
        "â€“": "-",
        "â€”": "-",
        "Ã©": "é",
        "â€™": "'",
    }
    for term_to_replace, replacement_value in replace_dict.items():
        corpus = corpus.replace(term_to_replace, replacement_value)
    return corpus

In [8]:
# Swap the garbled en dash for a plain hyphen in the ABV range column.
styles["abvRange"] = styles["abvRange"].str.replace("â€“", "-")

# Same dash fix for the IBU range column. The trailing Series.replace matches
# whole cell values, so only a range that is exactly "0-0" becomes "Unknown".
styles["ibuRange"] = (
    styles["ibuRange"]
    .str.replace("â€“", "-")
    .replace("0-0", "Unknown")
)

# Run every description through the character-repair helper.
styles["description"] = styles["description"].map(clean_text)

In [9]:
# Count 1- to 5-word n-grams in each description, using scikit-learn's
# built-in English stop-word list.
count_vect = CountVectorizer(
    ngram_range=(1, 5),
    stop_words="english",
)

# Learn the vocabulary and build the document-term count matrix in one pass.
word_freq = count_vect.fit_transform(styles.description)

In [10]:
# Fitted vocabulary: maps each extracted term to an integer index.
vocab = count_vect.vocabulary_
# Displays the whole mapping, which is large; the recorded output is truncated.
vocab

{'belgian': 2846,
 'dubbel': 7539,
 'rich': 17152,
 'malty': 13714,
 'beer': 2280,
 'spicy': 18556,
 'phenolic': 15597,
 'mild': 14170,
 'alcoholic': 725,
 'characteristics': 5254,
 'fruitiness': 9415,
 'strong': 18920,
 'dark': 6684,
 'ale': 753,
 'fruit': 9361,
 'aromas': 1736,
 'flavors': 9059,
 'present': 16033,
 'medium': 13915,
 'amber': 1160,
 'deep': 6903,
 'brown': 4423,
 'style': 19043,
 'recognizable': 16716,
 'hop': 10707,
 'bitterness': 3172,
 'lingering': 12796,
 'likely': 12777,
 'signs': 17912,
 'sweeter': 19749,
 'caramel': 4707,
 'flavor': 8790,
 'use': 20918,
 'crystal': 6629,
 'malt': 13291,
 'candi': 4691,
 'sugar': 19556,
 'look': 12941,
 'body': 3664,
 'expressive': 8389,
 'carbonation': 4791,
 'dryness': 7525,
 'finish': 8687,
 'traditionally': 20420,
 'trappist': 20492,
 'breweries': 4117,
 'make': 13241,
 'similar': 17925,
 'abbey': 128,
 'dubbels': 7554,
 'try': 20600,
 'emulate': 7728,
 'monastic': 14360,
 'originals': 15026,
 'westvleteren': 21652,
 'westma

In [131]:
# Convert the count matrix to a dense array for inspection.
word_freq.toarray()

array([[1]], dtype=int64)

  (0, 322)	1
  (0, 327)	1
  (0, 307)	1
  (0, 234)	1
  (0, 229)	1
  (0, 133)	1
  (0, 312)	1
  (0, 128)	1
  (0, 4)	1
  (0, 265)	1
  (0, 196)	1
  (0, 57)	1
  (0, 14)	1
  (0, 300)	1
  (0, 295)	1
  (0, 143)	1
  (0, 112)	1
  (0, 77)	1
  (0, 138)	1
  (0, 52)	1
  (0, 215)	1
  (0, 191)	1
  (0, 285)	1
  (0, 67)	1
  (0, 98)	1
  :	:
  (0, 168)	2
  (0, 246)	1
  (0, 276)	1
  (0, 58)	1
  (0, 103)	1
  (0, 19)	1
  (0, 207)	2
  (0, 241)	1
  (0, 149)	2
  (0, 24)	1
  (0, 158)	1
  (0, 10)	2
  (0, 90)	3
  (0, 271)	1
  (0, 163)	1
  (0, 78)	1
  (0, 5)	1
  (0, 216)	2
  (0, 235)	1
  (0, 266)	1
  (0, 29)	1
  (0, 202)	1
  (0, 251)	1
  (0, 113)	3
  (0, 34)	2
  (0, 183)	1
  (0, 219)	1
  (0, 139)	1
  (0, 104)	1
  (0, 196)	1
  (0, 21)	1
  (0, 36)	1
  (0, 214)	1
  (0, 41)	1
  (0, 59)	1
  (0, 77)	1
  (0, 54)	1
  (0, 224)	1
  (0, 169)	1
  (0, 229)	1
  (0, 259)	1
  (0, 308)	1
  (0, 323)	1
  (0, 209)	1
  (0, 49)	1
  (0, 87)	1
  (0, 264)	1
  (0, 200)	1
  (0, 129)	1
  (0, 99)	1
  :	:
  (0, 235)	1
  (0, 160)	1
  (0, 105)	1
 