In [1]:
import pandas as pd
from scipy.stats import entropy
import queue
import numpy as np
import gensim
from scipy.spatial.distance import euclidean
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from nltk import tokenize
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import euclidean_distances
from sklearn import manifold
from sklearn.cluster import DBSCAN

from source.Topic.Utils import *
from source.Topic.Model import *
import re

from sentence_transformers import SentenceTransformer

Using TensorFlow backend.


In [2]:
# Load the review table and discard its "index" column in one expression, so
# no frame is mutated in place.
meta = pd.read_csv("kaffee_reviews.csv").drop(columns=["index"])

In [3]:
# Run configuration. The author's noted alternative for `method` is "LDA_BERT".
method = "BERT_AE"
ntopic = 10

# Work on every review: the sample size is the length of the review column.
rws = meta.review
samp_size = len(rws)

# `preprocess` is not defined in this notebook (presumably it arrives through
# the star imports from source.Topic -- confirm). Later cells use `token_lists`
# as one list of tokens per document.
sentences, token_lists, idx_in = preprocess(rws, samp_size=samp_size)

Preprocessing raw texts ...
Preprocessing raw texts. Done!


In [4]:
# Sentence encoder used below to embed each vocabulary word; built from the
# checkpoint named "distiluse-base-multilingual-cased".
model = SentenceTransformer("distiluse-base-multilingual-cased")

In [5]:
# Vocabulary: the distinct tokens over all documents, plus how often each one
# occurs in the flattened token stream (words_counts[i] belongs to words[i]).
words, words_counts = np.unique(np.concatenate(token_lists), return_counts=True)
# Embed every vocabulary word. Later cells treat position i of the result as
# the embedding of words[i].
embeddings = model.encode(words)

In [26]:
def synonyme(unique_words_count, embeddings, eps=0.3, min_samples=2):
    """Partition words into synonym groups by running DBSCAN on their embeddings.

    Each word that DBSCAN labels as noise (-1) becomes a group of its own, and
    each DBSCAN cluster becomes one group. Group ids are consecutive integers
    starting at 0: the noise words come first (in index order), followed by
    the clusters in label order.

    Parameters
    ----------
    unique_words_count : array-like
        Occurrence count per word; entry i must belong to row i of
        ``embeddings``. Any sequence is accepted (it is converted with
        ``np.asarray``).
    embeddings : array-like
        Passed unchanged to ``DBSCAN.fit``; one row per word.
    eps, min_samples :
        Forwarded unchanged to ``DBSCAN``.

    Returns
    -------
    syn : dict
        Word index -> group id.
    syn_i : dict
        Group id -> index of the group's representative word. For a cluster
        this is the member with the highest count (``argmax``, so the first
        such member on ties); for a noise word it is the word itself.
    count : int
        Total number of groups.

    Also prints the number of clusters and of noise points.
    """
    # Index-array lookups below need an ndarray; a plain list would raise.
    counts = np.asarray(unique_words_count)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(embeddings).labels_

    is_noise = labels == -1
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if is_noise.any() else 0)
    n_noise_ = int(is_noise.sum())

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    syn = {}
    syn_i = {}
    count = 0

    # Noise words: one singleton group each, represented by the word itself.
    for word_idx in np.flatnonzero(is_noise):
        syn[word_idx] = count
        syn_i[count] = word_idx
        count += 1

    # Real clusters: all members share one group id.
    for label in range(n_clusters_):
        cluster = np.flatnonzero(labels == label)
        for word_idx in cluster:
            syn[word_idx] = count
        # Representative = most frequent member of the cluster.
        syn_i[count] = cluster[counts[cluster].argmax()]
        count += 1

    return syn, syn_i, count

# syn: word index -> synonym-group id; syn_i: group id -> index of the group's
# representative word; n_syn: total number of groups.
syn, syn_i, n_syn = synonyme(words_counts, embeddings)

Estimated number of clusters: 44
Estimated number of noise points: 762
['Abpackung', 'Verpackung']
['Anblick', 'Hinblick']
['Aroma', 'aroma', 'aromatisch']
['Art', 'art']
['Beigeschmack', 'geschmackssach', 'geschmacksscal']
['Benutzung', 'Verwendung']
['Bohnenform', 'bohnenform']
['Bohnenkaffee', 'bohnenkaffee']
['Cafe', 'Café', 'cafe']
['Cappuccino', 'cappuccino']
['Discounter', 'discounter']
['Einkauf', 'einkaufen']
['Espressomaschine', 'Nespressomaschine', 'espressomaschine']
['Geschmack', 'Geschmackssache']
['Herstellung', 'Produktion']
['Info', 'Information']
['Kaffe', 'Kaffee', 'Kaffeebohnen', 'Kaffeebörse', 'Kaffeeduft', 'Kaffeegenuss', 'Kaffeemaschine', 'Kaffeemühle', 'Kaffeeröstereien', 'Kaffeesort', 'Kaffeesorte', 'Kaffeetasse', 'Kaffeetrinker', 'cafés', 'getreidekaffee', 'kaffe', 'kaffebohn', 'kaffeearomen', 'kaffeebauern', 'kaffeebohn', 'kaffeegenießer', 'kaffeegenuss', 'kaffeekonsum', 'kaffeemaschine', 'kaffeereihen', 'kaffees', 'kaffeesorte', 'kaffeesorten', 'kaffeetrink'

In [36]:
# Document-level co-occurrence between synonym groups.
#   mat[a, b] : number of documents that contain both group a and group b
#   mat[a, a] : number of documents that contain group a at all
# Each group is counted at most once per document.

# Resolve every vocabulary word to its group once, instead of scanning `words`
# with np.where for each token of each document.
word_to_syn = {word: syn[idx] for idx, word in enumerate(words)}

mat = np.zeros((n_syn, n_syn))
for token_list in token_lists:
    # Distinct groups present in this document (intp so an empty document
    # still yields a valid, empty index array).
    syn_tokens = np.unique([word_to_syn[token] for token in token_list]).astype(np.intp)
    # syn_tokens has no duplicates, so this adds exactly 1 to every ordered
    # pair (a, b) of groups in the document, including a == b.
    mat[np.ix_(syn_tokens, syn_tokens)] += 1

# Row-normalise by the diagonal:
#   mat_p[a, b] = share of the documents containing a that also contain b.
# Rows of groups that never occur (zero diagonal) stay all-zero.
doc_freq = mat.diagonal()
occurs = doc_freq != 0
mat_p = np.zeros((n_syn, n_syn))
mat_p[occurs] = mat[occurs] / doc_freq[occurs, np.newaxis]

In [38]:
# Resolve the query group from its word instead of hard-coding the group id:
# ids are assigned from the DBSCAN result and change with the data or with
# eps/min_samples.
# (Recorded run: "Kaffee" = word 239 -> group 778, "Geschmack" = word 178 -> group 775.)
q = syn[np.where(words == "Kaffee")[0][0]]

# Ten groups that most often share a document with group q, as
# (representative word, percentage of q's documents that also contain it).
q_row = mat_p[q]
[(words[syn_i[i]], np.round(q_row[i] * 100, decimals=2)) for i in np.argsort(q_row)[::-1][:10]]

[('Kaffee', 100.0),
 ('Geschmack', 28.37),
 ('Aroma', 14.54),
 ('preis', 14.54),
 ('Bio', 13.48),
 ('Leck', 11.35),
 ('Produkt', 9.93),
 ('Bohne', 9.57),
 ('Sorte', 9.57),
 ('Qualität', 8.16)]

In [35]:
# Ids of the ten groups with the largest row totals in the co-occurrence matrix.
np.argsort(mat.sum(axis=1))[::-1][:10]

array([778, 775, 804,  62, 764, 344, 400, 265,  70, 413], dtype=int64)

In [37]:
# Column view for one group: which groups' documents most often also contain
# group p? The group id is looked up from its word rather than hard-coded,
# because ids depend on the DBSCAN result.
p = syn[np.where(words == "Geschmack")[0][0]]
threshold = 10

p_share = mat_p[:, p]   # p_share[i]: share of group i's documents that also contain p
p_joint = mat[:, p]     # p_joint[i]: number of documents containing both i and p
# Keep only groups seen together with p more than `threshold` times, ranked by
# share, and show the first ten as (word, share in %, joint count).
[(words[syn_i[i]], np.round(p_share[i] * 100, decimals=2), p_joint[i]) for i in np.argsort(p_share)[::-1] if p_joint[i] > threshold][:10]

[('Geschmack', 100.0, 116.0),
 ('preis', 41.18, 21.0),
 ('Cafe', 40.74, 11.0),
 ('Bio', 31.48, 17.0),
 ('Aroma', 28.57, 16.0),
 ('Kaffee', 28.37, 80.0),
 ('Leck', 27.66, 13.0)]

In [40]:
# For each of the ten groups with the largest row totals, print its ten
# strongest co-occurring groups as (representative word, percentage).
bar = np.argsort(mat.sum(axis=1))[::-1][:10]
for k in bar:
    strongest = np.argsort(mat_p[k])[::-1][:10]
    foo = [(words[syn_i[i]], np.round(mat_p[k][i] * 100, decimals=2)) for i in strongest]
    print(foo)

[('Kaffee', 100.0), ('Geschmack', 28.37), ('Aroma', 14.54), ('preis', 14.54), ('Bio', 13.48), ('Leck', 11.35), ('Produkt', 9.93), ('Bohne', 9.57), ('Sorte', 9.57), ('Qualität', 8.16)]
[('Geschmack', 100.0), ('Kaffee', 68.97), ('preis', 18.1), ('Bio', 14.66), ('Aroma', 13.79), ('Leck', 11.21), ('Cafe', 9.48), ('fairtrade', 8.62), ('Produkt', 8.62), ('Sorte', 8.62)]
[('preis', 100.0), ('Kaffee', 80.39), ('Geschmack', 41.18), ('Sorte', 21.57), ('Aroma', 17.65), ('Qualität', 15.69), ('Cafe', 15.69), ('Bio', 15.69), ('Leck', 15.69), ('Bohne', 13.73)]
[('Bio', 100.0), ('Kaffee', 70.37), ('Geschmack', 31.48), ('Aroma', 18.52), ('Produkt', 16.67), ('Qualität', 14.81), ('preis', 14.81), ('fairtrade', 12.96), ('Sorte', 9.26), ('Magen', 9.26)]
[('Aroma', 100.0), ('Kaffee', 73.21), ('Geschmack', 28.57), ('Bio', 17.86), ('preis', 16.07), ('Bohne', 14.29), ('Cafe', 12.5), ('Produkt', 10.71), ('Duft', 10.71), ('kaffeevollautomat', 8.93)]
[('Produkt', 100.0), ('Kaffee', 75.68), ('Geschmack', 27.03), (