In [44]:
import os
import collections
import numpy as np
import pandas as pd
from time import time

import plotly.graph_objects as go
from plotly.subplots import make_subplots

data_dir = "/home/stavros/DATA/TripAdvisorReviews"

In [14]:
hotel_dir = os.path.join(data_dir, "kresten_royal/the_kresten_royal_villas_1747reviews_withaspects")
data = containers.DataAspects.load(data_dir=hotel_dir)

In [7]:
import gensim
google_vec_file = os.path.join("/home/stavros/DATA/GoogleNews-vectors-negative300.bin.gz")
word2vec = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True, limit=200000)

In [8]:
categories = ["location", "cleanliness", "service", "value"]

In [9]:
for cat in categories:
    print(cat in word2vec)

True
True
True
True


In [32]:
single_words = set(word for word in data.container.words if len(word.split(" ")) == 1)
more_words = set(word for word in data.container.words if len(word.split(" ")) > 1)
print(len(single_words), len(more_words))

1829 890


In [33]:
# Map from single words to phrase
single_words_map = {}
for phrase in data.container.words:
    for word in phrase.split(" "):
        single_words_map[word] = phrase
print(len(single_words_map))

2203


In [35]:
ordered_words = [word for word in single_words_map.keys() if word in word2vec]
print(len(ordered_words), len(single_words_map))
print(len(ordered_words) / len(single_words_map))

1911 2203
0.867453472537449


In [38]:
categorical_distances = np.array([word2vec.distances(word, categories) for word in ordered_words])
categorical_distances.shape

(1911, 4)

In [51]:
np.unique(categorical_distances.argmin(axis=-1), return_counts=True)

(array([0, 1, 2, 3]), array([491, 632, 392, 396]))

In [52]:
word2cat = {word: categories[dist.argmin()] for word, dist in zip(ordered_words, categorical_distances)}
print(len(word2cat))
phrase2cat = {single_words_map[word]: cat for word, cat in word2cat.items()}
print(len(phrase2cat))

1911
1777


In [68]:
single_words_map["location"]

'hotels location'

In [53]:
groups = {cat: collections.Counter({}) for cat in categories}
for phrase, cat in phrase2cat.items():
    groups[cat][phrase] = data.container.appearances[phrase]

In [65]:
for k, group in groups.items():
    print(k, "location" in group)

location False
cleanliness False
service False
value False


In [63]:
groups["service"].most_common()

[('people', 82),
 ('massage', 45),
 ('job', 44),
 ('music', 28),
 ('upgrade', 25),
 ('meals', 21),
 ('work', 16),
 ('working', 11),
 ('reception area', 10),
 ('pizza', 8),
 ('hours', 7),
 ('sandwiches', 7),
 ('country', 7),
 ('helpful', 6),
 ('bread', 6),
 ('life', 6),
 ('system', 6),
 ('mobility problems', 6),
 ('bar tender', 6),
 ('voucher', 5),
 ('activities', 5),
 ('car hire', 5),
 ('songs', 5),
 ('reach', 5),
 ('greeting', 5),
 ('gentleman', 5),
 ('flight', 4),
 ('journey', 4),
 ('trips', 4),
 ('treatments', 4),
 ('policy', 4),
 ('gesture', 4),
 ('flights', 4),
 ('group', 4),
 ('courses', 4),
 ('vouchers', 4),
 ('thanks', 4),
 ('ride', 4),
 ('company', 4),
 ('credit', 4),
 ('reports', 4),
 ('buffet meals', 3),
 ('cuts', 3),
 ('effort', 3),
 ('supplies', 3),
 ('advice', 3),
 ('drive', 3),
 ('session', 3),
 ('worked', 3),
 ('scheme', 3),
 ('faces', 3),
 ('singles', 3),
 ('checkout', 3),
 ('bell boy', 3),
 ('second', 3),
 ('beverages', 3),
 ('wait', 3),
 ('workers', 3),
 ('customers'