In [2]:
import os
import random
import re
import string
from multiprocessing import Pool

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

import sklearn.cluster as sk

from sklearn.mixture import GaussianMixture
from sklearn.cluster import MiniBatchKMeans, SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score

import json
import api
import gensim.downloader

import langdetect

from keybert import KeyBERT

# Reproducibility: one shared seed for the stdlib and NumPy global generators.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)  # only picked up by interpreters started after this point
random.seed(SEED)
np.random.seed(SEED)

# Fetch the NLTK stopword corpus used to build the stopword list further down.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Paul-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_text(text, tokenizer, stopwords):
    """Normalise a raw text and split it into filtered tokens.

    Args:
        text: Text to tokenize; it is converted with ``str()`` first.
        tokenizer: Callable that receives the cleaned string and returns
            an iterable of string tokens.
        stopwords: Container of tokens to discard (checked with ``in``).

    Returns:
        List of tokens that are not stopwords, contain no digits and are
        longer than one character.
    """
    cleaned = str(text).lower()

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop "[+XYZ chars]" style markers
        (r"\s+", " "),  # collapse whitespace runs
        (r"\w+…|…", ""),  # drop an ellipsis and the word it truncates
        (r"(?<=\w)-(?=\w)", " "),  # split hyphenated words
        (f"[{re.escape(string.punctuation)}]", ""),  # strip punctuation
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for token in tokenizer(cleaned):
        if token in stopwords:
            continue
        # Discard pure numbers as well as words that contain any digit.
        if token.isdigit() or re.search('[0-9]+', token):
            continue
        if len(token) > 1:
            kept.append(token)
    return kept

In [4]:
# English stopwords plus review-specific words that carry no signal for
# clustering. Every entry must be followed by a comma: adjacent string
# literals are silently concatenated by Python (a missing comma previously
# produced "overalltap" and "nicedrinking" instead of four separate words).
# NOTE(review): "resturant" looks like a misspelling of "restaurant" — confirm
# whether the correctly spelled word should be listed as well.
custom_stopwords = list(set(stopwords.words("english") + [
    "news", "new", "top", "aroma", "draft", "brew", "beer", "taste",
    "brewery", "bottle", "straw", "flavour", "flavors", "pours", "overall",
    "tap", "can", "flavor", "complex", "similar", "drinkable", "tastes",
    "colour", "notes", "nice",
    "drinking", "drink", "choice", "sample", "nose", "oz", "ml", "keg",
    "taster", "poured", "pour", "liked", "like", "love", "loved", "style",
    "palate", "enjoyable", "pleasant", "birthday", "apa", "uk", "us", "york",
    "nothing", "special", "resturant", "bar", "house", "much", "better",
    "though", "central",
]))
#custom_stopwords.extend(big_list_stop_words)

In [5]:
def tokenize_beer_reviews(beer_type: str, beer: str):
    """Extract keywords from one beer's English reviews and save them as tokens.

    Reads ``reviews.json`` from ``beer_data/<beer_type>/<beer>``, skips every
    review that langdetect does not classify as English, extracts up to 10
    single-word KeyBERT keywords per remaining review, cleans them with
    ``clean_text`` and writes ``{"tokens": [...]}`` to ``token_words.json`` in
    the same directory.

    Args:
        beer_type: type of beer directory
        beer: specific beer in beer type directory
    """
    beer_dir = os.path.join("beer_data", beer_type, beer)

    # Context manager so the input handle is closed even if parsing fails.
    with open(os.path.join(beer_dir, "reviews.json")) as infile:
        data = json.load(infile)

    kw_model = None  # created once, on the first review that needs it
    keywords_list = []
    for review in data["reviews"]:
        try:
            if langdetect.detect(review) != "en":
                continue
        except Exception:
            # Best effort: a review whose language cannot be detected is skipped.
            continue

        if kw_model is None:
            kw_model = KeyBERT()
        review_keywords = kw_model.extract_keywords(
            review,
            keyphrase_ngram_range=(1, 1),
            stop_words=custom_stopwords,
            top_n=10,
        )
        # Each result item is indexed as (keyword, ...); keep only the keyword.
        review_words = [item[0] for item in review_keywords]
        keywords_list.append(" ".join(review_words))

    tokenized_docs = [
        clean_text(review, word_tokenize, custom_stopwords)
        for review in keywords_list
    ]

    token_dict = {
        "tokens": tokenized_docs
    }

    with open(os.path.join(beer_dir, "token_words.json"), "w") as outfile:
        json.dump(token_dict, outfile)
    print(f"Processed {beer_type}-{beer}")

In [6]:
# Collect one (beer_type, beer) pair per entry found under beer_data/<beer_type>/.
list_tuples_beers = []
for beer_type in os.listdir("beer_data"):
    # os.path.join keeps the lookup portable instead of hard-coding "\\".
    beer_type_dir = os.path.join("beer_data", beer_type)
    if not os.path.isdir(beer_type_dir):
        continue  # a stray file here would make os.listdir() raise
    for beer in os.listdir(beer_type_dir):
        list_tuples_beers.append((beer_type, beer))

In [None]:
# TODO: multiprocess this at some point; for now it runs sequentially (slow).
for beer_type, beer in list_tuples_beers:
    tokenize_beer_reviews(beer_type, beer)

In [None]:
# Count how often each token occurs across all tokenized documents.
# NOTE(review): in the cells shown here `tokenized_docs` is only assigned
# inside tokenize_beer_reviews — confirm it exists at notebook scope before
# this cell runs on a fresh kernel.
dict_words = {}
for doc in tokenized_docs:
    for word in doc:
        dict_words[word] = dict_words.get(word, 0) + 1

In [None]:
# Show the word counts ordered from least to most frequent.
dict(sorted(dict_words.items(), key=lambda item: item[1]))

{'bottles': 1,
 'beersniffers': 1,
 'saltburn': 1,
 'peppery': 1,
 'miller': 1,
 'bubbles': 1,
 'moderate': 1,
 'snifter': 1,
 'starinise': 1,
 'drunk': 1,
 'varnavas': 1,
 'zambon': 1,
 'fellow': 1,
 'inactive': 1,
 'fan': 1,
 'analyst': 1,
 'flavoursome': 1,
 'stone': 1,
 'sours': 1,
 'bière': 1,
 'curious': 1,
 'kind': 1,
 'school': 1,
 'freshest': 1,
 'orangeish': 1,
 'copperish': 1,
 'isolation': 1,
 'lemongrass': 1,
 'hard': 1,
 'proper': 1,
 'still': 1,
 'six': 1,
 'pack': 1,
 'drop': 1,
 'lovely': 1,
 'stomach': 1,
 'imperial': 1,
 'pleasand': 1,
 'fun': 1,
 'bite': 1,
 'lingers': 1,
 'portland': 1,
 'august': 1,
 'co': 1,
 'byfleet': 1,
 'undertone': 1,
 'makes': 1,
 'combination': 1,
 'soda': 1,
 'original': 1,
 'lots': 1,
 'march': 1,
 'overlly': 1,
 'straight': 1,
 'hoppier': 1,
 'supersmooth': 1,
 'damian': 1,
 'withe': 1,
 'tampere': 1,
 'finland': 1,
 'haze': 1,
 'malted': 1,
 'caramellized': 1,
 'oldschool': 1,
 'bb': 1,
 'lil': 1,
 'maybe': 1,
 'bbe': 1,
 'foggy': 1,
 