In [2]:
import os
import collections
import re
import json
import spacy
import pandas as pd
# Per-hotel directory that holds the review files read and written by the cells below.
# NOTE(review): the trailing apostrophe looks like a typo, but the same spelling is used in
# every file name in this notebook, so it is presumably part of the real name — confirm
# before "fixing" it.
data_dir = "/home/stavros/DATA/TripAdvisorReviews/kresten_palace_hotel_&_wellness'"

In [2]:
# Load the raw review export for this hotel and show its size and column names.
data = pd.read_csv(os.path.join(data_dir, "kresten_palace_hotel_&_wellness'_365reviews.csv"))
print(data.shape)
print(data.columns)

# Year part of the "YYYY-MM-DD" publishedDate strings. The vectorised .str accessor
# is used instead of a per-row lambda: it gives the same values for valid dates and
# yields NaN (rather than raising AttributeError) when a date is missing.
published_year = data.publishedDate.str.split("-").str[0]
published_year.value_counts()

(365, 24)
Index(['id', 'absoluteUrl', 'createdDate', 'publishedDate', 'locationId',
       'originalLanguage', 'language', 'stayDate', 'tripType', 'helpfulVotes',
       'title', 'text', 'rating', 'additionalRatings', 'username', 'userId',
       'user_hometownId', 'user_hometownName', 'response_id',
       'response_publishedDate', 'response_language', 'response_username',
       'response_connectionToSubject', 'response_text'],
      dtype='object')


2019    86
2016    48
2018    47
2017    45
2015    41
2014    29
2012    20
2011    14
2013    13
2010     8
2005     5
2006     3
2007     2
2008     1
2009     1
2004     1
2003     1
Name: publishedDate, dtype: int64

In [35]:
# Check which scalar type pandas gave the review ids (a numpy integer in the recorded run).
type(data.id[0])

numpy.int64

Expand contractions

In [3]:
# NOTE: despite its name, _CMAP_DIR is the path of the contractions JSON file itself.
_CMAP_DIR = os.path.join("/home/stavros/GitHub/Review-Aspects-App/nlptools", "contractions.txt")
# JSON text is UTF-8 by specification; naming the encoding keeps the load independent
# of the machine's locale settings.
with open(_CMAP_DIR, "r", encoding="utf-8") as file:
  _CMAP = json.load(file)


def expand_contractions(text, contraction_mapping=None):
  """Expand contractions in ``text`` and strip the apostrophes that remain.

  Args:
    text: String to clean.
    contraction_mapping: Dict mapping a contraction (e.g. "can't") to its
      expansion (e.g. "cannot"). ``None`` (the default) uses the module-level
      ``_CMAP``, looked up at call time so that re-loading the mapping takes
      effect without re-defining this function.

  Returns:
    ``text`` with every mapped contraction replaced (matching is
    case-insensitive and the first character of each match is kept exactly as
    written), followed by removal of every remaining "'s" and "'".
  """
  if contraction_mapping is None:
    contraction_mapping = _CMAP

  # Longest keys first: regex alternation takes the first alternative that
  # matches, so "can't" must not be tried before "can't've". Empty keys are
  # dropped because they would match the empty string at every position.
  keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)

  expanded_text = text
  if keys:
    # Lower-cased lookup so a key such as "I'm" also resolves a match on "i'm".
    lowered = {}
    for key in keys:
      lowered.setdefault(key.lower(), contraction_mapping[key])

    # Keys are escaped so that characters like "." are matched literally.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE)

    def expand_match(contraction):
      match = contraction.group(0)
      expanded_contraction = contraction_mapping.get(match)
      if not expanded_contraction:
        expanded_contraction = lowered.get(match.lower())
      if expanded_contraction is None:
        # Nothing to expand to: leave the text as it was instead of raising.
        return match
      # Keep the capitalisation the writer used for the first character.
      return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)

  # Drop possessive endings first, then any apostrophe that is still left.
  expanded_text = re.sub("'s", "", expanded_text)
  expanded_text = re.sub("'", "", expanded_text)
  return expanded_text

Define stop word and special character sets

In [4]:
# Lower-case words that are excluded from the word counts further down
# (INVALID_TOKENS is built from this set). The text block is split on whitespace;
# "very" is listed twice and "just" sits outside its alphabetical group, neither of
# which matters once the words are collected into a set.
STOP_WORDS = set(
    """
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough even ever every
everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full
further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred
i if in indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such
take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split()
)
# Number of distinct stop words.
len(STOP_WORDS)

305

In [5]:
# Single punctuation / whitespace characters that never count as words.
SPECIAL_CHARS = set('.,!?/\n;" ()-_&:')
# Runs of 0-4 newlines, then runs of 0-4 spaces; the zero-length run adds the empty string.
SPECIAL_CHARS.update(count * '\n' for count in range(5))
SPECIAL_CHARS.update(count * ' ' for count in range(5))
SPECIAL_CHARS.add("...")
print(SPECIAL_CHARS)

# Everything the word counter should skip: stop words plus the characters above.
INVALID_TOKENS = STOP_WORDS.union(SPECIAL_CHARS)

{'', ';', ' ', '\n\n\n\n', '\n', '!', '\n\n\n', ',', '/', '&', '    ', '-', '  ', '_', '"', '?', ':', '   ', ')', '(', '...', '\n\n', '.'}


Apply spacy

In [6]:
# Load the small English pipeline, expand contractions in every review text,
# then parse all reviews in one batched pass.
nlp = spacy.load('en_core_web_sm')
expanded_text = data.text.apply(expand_contractions)
processed_text = [doc for doc in nlp.pipe(expanded_text)]

Test spacy files

In [3]:
# Reload the reviews from the pickle that also carries the parsed spaCy docs.
# This rebinds `data`, replacing the frame that was read from the CSV.
# NOTE(review): unpickling can execute arbitrary code — only load pickles written by this notebook.
data = pd.read_pickle(os.path.join(data_dir, "kresten_palace_hotel_&_wellness'_365reviews_spacy.pkl"))
data.shape

(365, 25)

In [4]:
# Print the Python type stored in each column, sampling the second row.
for column in data.columns:
    print(column, type(data.iloc[1][column]))

id <class 'numpy.int64'>
absoluteUrl <class 'str'>
createdDate <class 'str'>
publishedDate <class 'str'>
locationId <class 'numpy.int64'>
originalLanguage <class 'str'>
language <class 'str'>
stayDate <class 'str'>
tripType <class 'str'>
helpfulVotes <class 'numpy.int64'>
title <class 'str'>
text <class 'str'>
rating <class 'numpy.int64'>
additionalRatings <class 'str'>
username <class 'str'>
userId <class 'str'>
user_hometownId <class 'numpy.float64'>
user_hometownName <class 'float'>
response_id <class 'numpy.float64'>
response_publishedDate <class 'str'>
response_language <class 'str'>
response_username <class 'str'>
response_connectionToSubject <class 'str'>
response_text <class 'str'>
spacy_text <class 'spacy.tokens.doc.Doc'>


In [34]:
# Read every row of the app database's `review` table, newest publishedDate first.
query = "SELECT * FROM review ORDER BY publishedDate DESC"
data_sql = pd.read_sql(query, "sqlite:////home/stavros/DATA/TripAdvisorReviews/app_storage/app.db")

In [32]:
# First review as loaded from the pickle, to compare with the first database row.
data.iloc[0]

id                                                                      724363847
absoluteUrl                     https://www.tripadvisor.com/ShowUserReviews-g1...
createdDate                                                            2019-11-06
publishedDate                                                          2019-11-06
locationId                                                                 234314
originalLanguage                                                               en
language                                                                       en
stayDate                                                               2019-10-31
tripType                                                                     NONE
helpfulVotes                                                                    0
title                                                       Average and expensive
text                            Paid for a sea view or garden view, got a grou...
rating          

In [35]:
# First row of the SQL result; in the recorded output its spacy_text is a bytes value, not a Doc.
data_sql.iloc[0]

id                                                           724363847
absoluteUrl          https://www.tripadvisor.com/ShowUserReviews-g1...
createdDate                                                 2019-11-06
stayDate                                                    2019-10-31
publishedDate                                               2019-11-06
rating                                                               2
username                                                      LJPhoto1
userId                                C97A5EA1F916B46ED399D5D04DEBF4F2
user_hometownId                                                    NaN
user_hometownName                                                 None
title                                            Average and expensive
text                 Paid for a sea view or garden view, got a grou...
spacy_text           b'\x85\xa4text\xda\x05<Paid for a sea view or ...
hotelId                               kresten_palace_hotel_&_wellness'
Name: 

In [36]:
# Load the spaCy vocabulary stored for this hotel in the app storage directory;
# it is passed to Doc(...) below when rebuilding a doc from bytes.
vocab = spacy.vocab.Vocab().from_disk("/home/stavros/DATA/TripAdvisorReviews/app_storage/vocab_kresten_palace_hotel_&_wellness'")
vocab

<spacy.vocab.Vocab at 0x7feb2e1dbac8>

In [12]:
# First sentence of the first review's parsed doc.
sentence = list(data.spacy_text[0].sents)[0]

In [19]:
# Turn that sentence span into a standalone Doc object.
sentence_doc = sentence.as_doc()

In [39]:
# Rebuild a Doc from the bytes stored in the first database row, using the vocabulary loaded from disk.
text_load = spacy.tokens.Doc(vocab).from_bytes(data_sql.iloc[0]["spacy_text"])

In [41]:
# Walk the pickled doc and the doc restored from the database in lockstep and print
# each token with its coarse POS tag. zip stops at the shorter of the two docs.
for pickled_token, restored_token in zip(data.iloc[0]["spacy_text"], text_load):
    for token in (pickled_token, restored_token):
        print(token, token.pos_)
    print()

Paid VERB
Paid VERB

for ADP
for ADP

a DET
a DET

sea NOUN
sea NOUN

view NOUN
view NOUN

or CCONJ
or CCONJ

garden NOUN
garden NOUN

view NOUN
view NOUN

, PUNCT
, PUNCT

got VERB
got VERB

a DET
a DET

ground NOUN
ground NOUN

floor NOUN
floor NOUN

room NOUN
room NOUN

facing VERB
facing VERB

trees NOUN
trees NOUN

with ADP
with ADP

a DET
a DET

road NOUN
road NOUN

behind ADV
behind ADV

. PUNCT
. PUNCT

One NUM
One NUM

of ADP
of ADP

the DET
the DET

beds NOUN
beds NOUN

had AUX
had AUX

springs NOUN
springs NOUN

sticking VERB
sticking VERB

out ADP
out ADP

and CCONJ
and CCONJ

air NOUN
air NOUN

con PROPN
con PROPN

was AUX
was AUX

not PART
not PART

working VERB
working VERB

. PUNCT
. PUNCT

Would VERB
Would VERB

not PART
not PART

move VERB
move VERB

us PRON
us PRON

to ADP
to ADP

another DET
another DET

room NOUN
room NOUN

because SCONJ
because SCONJ

they PRON
they PRON

said VERB
said VERB

they PRON
they PRON

had AUX
had AUX

closed VERB
closed VERB

down ADP


Try serializing spaCy docs

In [7]:
# Confirm what kind of object the pipeline produced for the first review.
type(processed_text[0])

spacy.tokens.doc.Doc

In [8]:
# Serialize the first parsed review.
text_ser = processed_text[0].to_bytes()

In [13]:
# Verify the serialized form is a bytes object (isinstance is the idiomatic type check
# and, unlike an exact type comparison, also accepts bytes subclasses).
isinstance(text_ser, bytes)

True

In [15]:
# The Vocab object attached to the first parsed doc.
processed_text[0].vocab

<spacy.vocab.Vocab at 0x7feb48ca1248>

In [22]:
# Save the pipeline's vocabulary inside the hotel's data directory.
nlp.vocab.to_disk(os.path.join(data_dir, "spacy_vocab"))

In [23]:
# Read the saved vocabulary back into a fresh Vocab object.
loaded_vocab = spacy.vocab.Vocab().from_disk(os.path.join(data_dir, "spacy_vocab"))

In [28]:
# Deserialize the bytes into a new Doc built on the reloaded vocabulary.
text_load = spacy.tokens.Doc(loaded_vocab).from_bytes(text_ser)

In [29]:
# First ten tokens of the original parse, each with its coarse POS tag.
for original_token in processed_text[0][:10]:
    print(original_token, original_token.pos_)

Paid VERB
for ADP
a DET
sea NOUN
view NOUN
or CCONJ
garden NOUN
view NOUN
, PUNCT
got VERB


In [30]:
# First ten tokens of the doc restored from bytes, each with its coarse POS tag.
for restored_token in text_load[:10]:
    print(restored_token, restored_token.pos_)

Paid VERB
for ADP
a DET
sea NOUN
view NOUN
or CCONJ
garden NOUN
view NOUN
, PUNCT
got VERB


In [36]:
# Size of the serialized doc, in bytes.
len(text_ser)

129635

Test loading and saving

In [26]:
# Attach the parsed docs as a new column (this mutates `data` in place), then pickle the whole frame.
data["spacy_text"] = processed_text
data.to_pickle(os.path.join(data_dir, "kresten_palace_hotel_&_wellness'_365reviews_spacy.pkl"))

In [29]:
# Type of the in-memory parsed review, for comparison with the reloaded column below.
type(processed_text[0])

spacy.tokens.doc.Doc

In [30]:
# Read the pickle back and list its columns; the recorded output includes the added spacy_text column.
# NOTE(review): unpickling can execute arbitrary code — only load pickles written by this notebook.
loaded_data = pd.read_pickle(os.path.join(data_dir, "kresten_palace_hotel_&_wellness'_365reviews_spacy.pkl"))
loaded_data.columns

Index(['id', 'absoluteUrl', 'createdDate', 'publishedDate', 'locationId',
       'originalLanguage', 'language', 'stayDate', 'tripType', 'helpfulVotes',
       'title', 'text', 'rating', 'additionalRatings', 'username', 'userId',
       'user_hometownId', 'user_hometownName', 'response_id',
       'response_publishedDate', 'response_language', 'response_username',
       'response_connectionToSubject', 'response_text', 'spacy_text'],
      dtype='object')

In [31]:
# Types of the raw text and of the parsed doc after the pickle round trip.
print(type(loaded_data.text[0]))
print(type(loaded_data.spacy_text[0]))

<class 'str'>
<class 'spacy.tokens.doc.Doc'>


In [36]:
# Print the sentence split of the first review before and after the pickle round trip,
# separated by blank lines, so the two can be compared by eye.
print(list(processed_text[0].sents))
print("\n\n\n")
print(list(loaded_data.spacy_text[0].sents))

[Paid for a sea view or garden view, got a ground floor room facing trees with a road behind., One of the beds had springs sticking out and air con was not working., Would not move us to another room because they said they had closed down most of the hotel as it was end of the season., They were quick to sort out the bed and and air con., The room was cleaned well every day. 

, We travelled at the end of Oct into first week of Nov, everything is closing, no reps on hand to help and receptionists unhelpful., They actually closed the hotel the day before we left and transferred us to their other hotel but the room was not ready!, Really inconvenient when you have to pack early and have wet clothes to dry and took up valuable daylight time as the sun sets at 5:30pm. 

, Apart from the receptionists, the staff were really friendly and nice., Food and drinks are expensive at the hotel, and you need to either get their shuttle bus, the normal bus or hire a car, , there is not anything aroun

Find most common words

1. Simple counting of most common words 

In [12]:
# Group every kept token under its lower-cased text, then rank the texts by
# how many tokens each one collected.
text_to_token = {}  # Dict[str, List[spacy.Token]]
for doc in processed_text:
    for token in doc:
        word = token.text.lower()
        if word in INVALID_TOKENS:
            continue
        text_to_token.setdefault(word, []).append(token)

# Same keys, in the same order, mapped to the size of their token lists.
text_score = collections.Counter(
    dict(zip(text_to_token, map(len, text_to_token.values())))
)
text_score.most_common(10)

[('hotel', 1059),
 ('room', 542),
 ('pool', 512),
 ('staff', 453),
 ('good', 430),
 ('food', 416),
 ('great', 290),
 ('nice', 264),
 ('day', 258),
 ('bar', 251)]

In [17]:
# Text of the sentence that contains the first counted occurrence of "bar".
text_to_token["bar"][0].sent.text

'The pool was just the pecfect spot,  bar right on the side,  food and ice cream.  '

Classify sentiment using `NLTK`

In [8]:
# NOTE(review): these imports sit mid-notebook rather than in the import cell at the top.
import nltk
# Fetch the lexicon used by the VADER sentiment analyzer created below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared analyzer instance used by the sentiment cells that follow.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/stavros/nltk_data...


In [20]:
# VADER polarity scores for the sentence around every occurrence of "hotel".
# A sentence that mentions the word more than once is scored once per mention.
scores = [
    sid.polarity_scores(sentence_text)
    for sentence_text in (occurrence.sent.text for occurrence in text_to_token["hotel"])
]

In [19]:
# For every occurrence of "bar": its sentence, the VADER scores, then a blank line.
for occurrence in text_to_token["bar"]:
    sentence = occurrence.sent.text
    print(sentence, sid.polarity_scores(sentence), "", sep="\n")

The pool was just the pecfect spot,  bar right on the side,  food and ice cream.  
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

The evening bar did table service and a good selection of cocktails.  
{'neg': 0.0, 'neu': 0.775, 'pos': 0.225, 'compound': 0.4404}

The bar tender at the poolside, Irini is also very nice 

{'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'compound': 0.4754}

There are no restaurants or bar near it so you will have to just go to the hotel bar where drinks are really expensive. 


{'neg': 0.087, 'neu': 0.913, 'pos': 0.0, 'compound': -0.296}

There are no restaurants or bar near it so you will have to just go to the hotel bar where drinks are really expensive. 


{'neg': 0.087, 'neu': 0.913, 'pos': 0.0, 'compound': -0.296}

when eating everybody outside , staff in bar fantastic tho best thing about kresten palace , we will not be returning
{'neg': 0.0, 'neu': 0.685, 'pos': 0.315, 'compound': 0.8316}

Not complaining it’s our own fault for not looking into it 

Classify sentiment using `TextBlob`

In [78]:
from textblob import TextBlob
# One TextBlob per occurrence of "food", wrapping the text of the sentence it appears in.
sentences = [
    TextBlob(sentence_text)
    for sentence_text in (occurrence.sent.text for occurrence in text_to_token["food"])
]

In [79]:
# For every blob: its text, its TextBlob sentiment, then a blank line.
for blob in sentences:
    print(blob, blob.sentiment, "", sep="\n")

Food and drinks are expensive at the hotel
Sentiment(polarity=-0.5, subjectivity=0.7)

The food was average and there was not any greek food apart from greek corner which had dolmades, olives and a few other bits.


Sentiment(polarity=-0.095, subjectivity=0.175)

The food was average and there was not any greek food apart from greek corner which had dolmades, olives and a few other bits.


Sentiment(polarity=-0.095, subjectivity=0.175)

The pool was just the pecfect spot,  bar right on the side,  food and ice cream.  
Sentiment(polarity=0.2857142857142857, subjectivity=0.5357142857142857)

We found the food rather boring and not very warm all week.
Sentiment(polarity=-0.6153846153846154, subjectivity=0.7307692307692307)

I found that along of the food had an unpleasant distinctive taste to it. 


Sentiment(polarity=-0.6499999999999999, subjectivity=0.95)

The food is delicious with various of dishes to choose from 
The receptionists were helpful in telling us information on how to get 

Sentiment(polarity=-0.3662109375, subjectivity=0.5)

The food was varied and plentyful with the dining staff working very hard to ensure the smooth running of the  restaurant, it was helpful that the staff were usulaly fluent in several languages, as the clients came from several different countries, the restaurant manager was always welcoming with a smile and a hand shake.


Sentiment(polarity=0.05347222222222222, subjectivity=0.31736111111111115)

The food was hit
Sentiment(polarity=0.0, subjectivity=0.0)

Food has lots of choice but isn’t good quality, made on mass cafeteria buffet-
Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

On the whole the food was really good.  
Sentiment(polarity=0.44999999999999996, subjectivity=0.5)

If you want to try something local there is always Greek food on offer, along with German, Italian, cakes, Turkish delight (which is to die for, the best I have ever tasted).  
Sentiment(polarity=0.2, subjectivity=0.06)

There was never a queue, or 

Sentiment(polarity=0.2333333333333333, subjectivity=0.4527777777777778)

The food is good.
Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

Some people say the food is not good.
Sentiment(polarity=-0.35, subjectivity=0.6000000000000001)

The food is a bit hit and miss
Sentiment(polarity=0.0, subjectivity=0.0)

The food was good, had always many choices & tasty as well.
Sentiment(polarity=0.6, subjectivity=0.55)

Food was also very nice, with quite a lot of variety.
Sentiment(polarity=0.78, subjectivity=1.0)

Food was fine - buffet style.
Sentiment(polarity=0.4166666666666667, subjectivity=0.5)

Eden Roc has the edge in terms of food.

Sentiment(polarity=0.0, subjectivity=0.0)

The entertainment was good, and there was a good variety of BBQ food but we did not feel it was worth the extra.

Sentiment(polarity=0.425, subjectivity=0.3500000000000001)

The food was buffet-style and good, compared to what you can find in Rhodes city and regardless of previous comments, there was a c

In [86]:
# Check whether the second and third "food" occurrences belong to the same sentence.
# True in the recorded run, matching the sentence that is printed twice in a row in the
# TextBlob output: it mentions "food" twice.
text_to_token["food"][1].sent == text_to_token["food"][2].sent

True

In [37]:
# Parse a single word on its own to see how the pipeline tags it without context.
test = nlp("cleanliness")

In [41]:
# Coarse POS tag assigned to that lone token.
test[0].pos_

'PROPN'