In [1]:
import os
import numpy as np
import pandas as pd
from time import time
from utils import custom_preprocessing, dependencies, aspect_tools

# Root directory of the data: one sub-folder per city plus the word2vec file.
# Set the AIRBNB_DATA_DIR environment variable (e.g. to "D:/") to point the
# notebook at another machine's data instead of editing this cell; the
# original path stays the default so existing set-ups keep working.
data_dir = os.environ.get("AIRBNB_DATA_DIR", "/home/stavros/DATA/AirbnbReviews")

In [4]:
# New York reviews: load the pickled table, then keep only the rows that carry
# at least one aspect. The shape is printed after every step to show how many
# rows each filter removes.
area = "nyc"
area_dir = os.path.join(data_dir, area)

# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
nyc_reviews = pd.read_pickle(
    os.path.join(area_dir, "reviews_with_aspects_200000samples.pkl")
)
print(nyc_reviews.shape)

# Step 1: drop rows whose "aspects" entry is missing.
nyc_reviews = nyc_reviews[nyc_reviews["aspects"].notna()]
print(nyc_reviews.shape)

# Step 2: drop rows whose "aspects" container is empty.
nyc_reviews = nyc_reviews[
    nyc_reviews["aspects"].map(lambda aspects: len(aspects) > 0)
]
print(nyc_reviews.shape)

(200000, 8)
(200000, 8)
(177490, 8)


In [3]:
# Munich reviews: load the pickled table, then keep only the rows that carry
# at least one aspect. The shape is printed after every step to show how many
# rows each filter removes.
area = "munich"
area_dir = os.path.join(data_dir, area)

# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
muc_reviews = pd.read_pickle(
    os.path.join(area_dir, "reviews_with_aspects_169940samples.pkl")
)
print(muc_reviews.shape)

# Step 1: drop rows whose "aspects" entry is missing.
muc_reviews = muc_reviews[muc_reviews["aspects"].notna()]
print(muc_reviews.shape)

# Step 2: drop rows whose "aspects" container is empty.
muc_reviews = muc_reviews[
    muc_reviews["aspects"].map(lambda aspects: len(aspects) > 0)
]
print(muc_reviews.shape)

(169940, 8)
(112886, 8)
(99119, 8)


In [5]:
# Gather the aspects of every review into one collection per city and print
# the size of each collection.
nyc_aspects = aspect_tools.collect_aspects(nyc_reviews["aspects"])
print(len(nyc_aspects))
muc_aspects = aspect_tools.collect_aspects(muc_reviews["aspects"])
print(len(muc_aspects))

12862
8203


In [6]:
import gensim

# Pre-trained GoogleNews word2vec vectors, stored next to the review data.
google_vec_file = os.path.join(data_dir, "GoogleNews-vectors-negative300.bin.gz")

# NOTE(review): limit presumably caps how many vectors are read from the file
# (keeps load time and memory down) - confirm against the gensim docs.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    google_vec_file,
    binary=True,
    limit=200000,
)
word2vec

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f0481b49710>

In [7]:
# Cut-off value passed to word_replacement_map for both cities.
# NOTE(review): its exact meaning is defined in aspect_tools - confirm there.
word_merge_cut_off = 0.5

# One distance matrix per city, computed from the word2vec vectors and the
# collected aspects of that city.
nyc_matrix = aspect_tools.DistanceMatrix.calculate(word2vec, nyc_aspects)
muc_matrix = aspect_tools.DistanceMatrix.calculate(word2vec, muc_aspects)

# Word -> replacement word maps; used afterwards to merge aspect counts.
nyc_word_map = nyc_matrix.word_replacement_map(cut_off=word_merge_cut_off)
muc_word_map = muc_matrix.word_replacement_map(cut_off=word_merge_cut_off)

Calculating matrix with 3977 words.
Calculating matrix with 2658 words.


In [8]:
# Size of each word-replacement map (New York first, then Munich).
print(len(nyc_word_map), len(muc_word_map))

3977 2658


In [25]:
import collections
def merge_words_in_counter(counter: collections.Counter, word_map: dict,
                           keep_unmapped: bool = False) -> collections.Counter:
    """Re-key a counter through ``word_map``, summing counts that collide.

    Args:
        counter: Counts keyed by word.
        word_map: Maps a word to the word that should replace it. Several
            words may share one replacement; their counts are added up.
        keep_unmapped: What to do with words that are not keys of
            ``word_map``. ``False`` (default, the original behaviour) drops
            them from the result; ``True`` keeps them under their own key.

    Returns:
        A new ``collections.Counter``; ``counter`` itself is not modified.
    """
    merged = collections.Counter()
    for word, count in counter.items():
        if word in word_map:
            merged[word_map[word]] += count
        elif keep_unmapped:
            # Opt-in only: by default words without a mapping are discarded.
            merged[word] += count
    return merged
        

# Merge the aspect counts of each city through that city's replacement map.
nyc_aspects_merged = merge_words_in_counter(counter=nyc_aspects, word_map=nyc_word_map)
muc_aspects_merged = merge_words_in_counter(counter=muc_aspects, word_map=muc_word_map)

In [26]:
# Normalize aspects
nyc_aspects_merged = collections.Counter({k: v * 100.0 / len(nyc_reviews) for k, v in nyc_aspects_merged.items()})
muc_aspects_merged = collections.Counter({k: v * 100.0 / len(muc_reviews) for k, v in muc_aspects_merged.items()})

In [27]:
import plotly.graph_objects as go

# Ranks (0-based, end exclusive) of the aspects to plot, taken from the
# combined New York + Munich ranking: the 20 most common are skipped and the
# following 30 are shown.
bar_plot_word_index0 = 20
bar_plot_word_index1 = 50

all_aspects_merged = nyc_aspects_merged + muc_aspects_merged

bar_plot_words = all_aspects_merged.most_common()[bar_plot_word_index0:bar_plot_word_index1]
bar_plot_words = [word for word, _ in bar_plot_words]

# One horizontal bar series per city, both over the same list of words.
fig = go.Figure()
for city_name, city_aspects in (("New York", nyc_aspects_merged),
                                ("Munich", muc_aspects_merged)):
    fig.add_trace(go.Bar(
        y=bar_plot_words,
        x=[city_aspects[word] for word in bar_plot_words],
        orientation="h",
        name=city_name
    ))

# Grouped bars, fixed figure size, and a transparent legend placed inside the
# plotting area.
fig.update_layout(barmode="group",
                  width=900,
                  height=1000,
                  font_size=16,
                  xaxis_title="Aspect occurrence per review (%)",
                  bargap=0.2, # gap between bars of adjacent location coordinates.
                  bargroupgap=0.1, # gap between bars of the same location coordinate.
                  legend=dict(x=0.8, y=1.0, bgcolor='rgba(255, 255, 255, 0)',
                              bordercolor='rgba(255, 255, 255, 0)', font_size=20)
                  )
fig.show()

In [22]:
# Peek at the pre-processed New York review texts.
nyc_reviews["processed_comments"]

0         Molly s place is quiet clean and comfortable. ...
2         Hakim is an amazing host Hakim was more than w...
3         Fantastic! Great location and great stay! Than...
4         The stay was great. A lot of restaurants aroun...
5         Clean nicely decorated place. Easy check in. T...
                                ...                        
199995    We were very happy We chose Host and Host s pl...
199996    Five stars across the board does not seem to b...
199997    We had a perfect time at Host place ! He has a...
199998    Host s place is fantastic! Host s place s an i...
199999    I was stuck at Laguardia Airport after a misse...
Name: processed_comments, Length: 177490, dtype: object

In [31]:
import re
def remove_special_characters(review):
    """Return ``review`` lower-cased with only ASCII letters and single spaces.

    Every character that is neither an ASCII letter nor whitespace is replaced
    by a space; runs of whitespace are then collapsed to one space and leading
    and trailing whitespace is removed.

    Args:
        review: The review text (``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    # Raw string, and "A-Z" rather than "A-z": the wider range also covered
    # the ASCII characters between "Z" and "a" (brackets, backslash, caret,
    # underscore, backtick), so those were never removed.
    text = re.sub(r"[^a-zA-Z\s]", " ", review)
    # Substitute any run of white space characters with a single space
    text = " ".join(text.split())
    return text.lower()

# Clean every pre-processed New York review text.
nyc_comments = nyc_reviews["processed_comments"].map(remove_special_characters)

In [32]:
from sklearn import feature_extraction
# TF-IDF vectorizer created with the library's default settings.
tfidf = feature_extraction.text.TfidfVectorizer()

In [33]:
# Fit the vectorizer on the cleaned New York comments and transform them in
# the same call.
nyc_tfidf_matrix = tfidf.fit_transform(nyc_comments)

In [34]:
# Show the matrix summary (shape, dtype, number of stored elements).
nyc_tfidf_matrix

<177490x45001 sparse matrix of type '<class 'numpy.float64'>'
	with 6907455 stored elements in Compressed Sparse Row format>

In [51]:
# Average TF-IDF weight of every vocabulary term over all New York comments.
# mean(axis=0) yields a single row; flatten it so it can be indexed directly
# with the column index stored in the vectorizer's vocabulary.
mean_tfidf_per_term = np.asarray(nyc_tfidf_matrix.mean(axis=0)).ravel()
word_scores = collections.Counter(
    {term: mean_tfidf_per_term[column] for term, column in tfidf.vocabulary_.items()}
)

In [56]:
# The 50 terms with the highest mean TF-IDF score.
word_scores.most_common(50)

[('the', 0.09777273742166412),
 ('and', 0.09733486040061426),
 ('host', 0.07901457329709424),
 ('to', 0.06583809889875307),
 ('was', 0.06499328488304465),
 ('is', 0.05944692950485614),
 ('great', 0.0521466467365233),
 ('very', 0.04900140365535534),
 ('in', 0.04641388565093847),
 ('place', 0.04602691590203144),
 ('we', 0.0445874203054972),
 ('stay', 0.03614375512617354),
 ('for', 0.03505924852936403),
 ('location', 0.033558252500996553),
 ('apartment', 0.03311486129956851),
 ('of', 0.03171560525757919),
 ('clean', 0.03010846992094095),
 ('with', 0.029242392535786046),
 ('you', 0.02850672238599397),
 ('nice', 0.028463582537996652),
 ('it', 0.02477633299917145),
 ('this', 0.023972324711325065),
 ('would', 0.022280868469834663),
 ('good', 0.021772500141203893),
 ('room', 0.021709913587632763),
 ('my', 0.021677548383012276),
 ('had', 0.021540670089421707),
 ('our', 0.021119372557413356),
 ('at', 0.02049680345123053),
 ('comfortable', 0.019620520234492695),
 ('recommend', 0.01946070106792220