In [2]:
import os
import collections
import re
import json
import spacy
import pandas as pd
# Directory holding the review data; it is joined with the CSV file name when loading.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
# The trailing apostrophe sits inside the string literal, so it is part of the directory name.
data_dir = "/home/stavros/DATA/TripAdvisorReviews/kresten_palace_hotel_&_wellness'"

In [3]:
# Read the reviews CSV from the data directory and show its shape and columns.
data = pd.read_csv(
    os.path.join(data_dir, "kresten_palace_hotel_&_wellness'_365reviews.csv")
)
print(data.shape)
print(data.columns)

# The year is whatever precedes the first "-" of each publishedDate string.
published_year = data["publishedDate"].apply(lambda date: date.partition("-")[0])
published_year.value_counts()

(365, 24)
Index(['id', 'absoluteUrl', 'createdDate', 'publishedDate', 'locationId',
       'originalLanguage', 'language', 'stayDate', 'tripType', 'helpfulVotes',
       'title', 'text', 'rating', 'additionalRatings', 'username', 'userId',
       'user_hometownId', 'user_hometownName', 'response_id',
       'response_publishedDate', 'response_language', 'response_username',
       'response_connectionToSubject', 'response_text'],
      dtype='object')


2019    86
2016    48
2018    47
2017    45
2015    41
2014    29
2012    20
2011    14
2013    13
2010     8
2005     5
2006     3
2007     2
2008     1
2009     1
2004     1
2003     1
Name: publishedDate, dtype: int64

Expand contractions

In [4]:
# Path of the JSON file whose keys are contractions and whose values are
# their expansions (consumed by expand_contractions as its default mapping).
_CMAP_DIR = os.path.join("/home/stavros/GitHub/Review-Aspects-App/nlptools", "contractions.txt")
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding.
with open(_CMAP_DIR, "r", encoding="utf-8") as file:
  _CMAP = json.load(file)


def expand_contractions(text, contraction_mapping=_CMAP):
  """Expand contractions in ``text`` and strip the remaining apostrophes.

  Every key of ``contraction_mapping`` found in ``text`` (case-insensitively)
  is replaced by its mapped expansion; the first character of the matched
  text is kept so the original capitalisation survives. Afterwards any
  remaining ``'s`` is dropped and every other apostrophe is removed.

  Args:
    text: The string to process.
    contraction_mapping: Dict mapping a contraction to its expansion.

  Returns:
    The expanded string.
  """
  # Lower-cased view of the mapping, used when the matched text differs in
  # case from the stored key (e.g. key "I'm" matched as "i'm").
  lowered_mapping = {key.lower(): value
                     for key, value in contraction_mapping.items()}

  def expand_match(contraction):
    match = contraction.group(0)
    # Exact-case entry wins; otherwise fall back to the case-insensitive one.
    expanded_contraction = (contraction_mapping.get(match)
                            or lowered_mapping.get(match.lower()))
    if expanded_contraction is None:
      # No usable entry: leave the matched text untouched instead of failing.
      return match
    # Keep the first character as written in the text to preserve its case.
    return match[0] + expanded_contraction[1:]

  # Longest keys first, so a key that is a prefix of another one (e.g.
  # "can't" vs "can't've") cannot shadow the longer alternative. Empty keys
  # are skipped because they would match at every position.
  keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)

  expanded_text = text
  if keys:
    # Keys are escaped so that they are matched literally.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE|re.DOTALL)
    expanded_text = contractions_pattern.sub(expand_match, text)

  expanded_text = re.sub("'s", "", expanded_text)
  expanded_text = re.sub("'", "", expanded_text)
  return expanded_text

In [5]:
# Run expand_contractions over the text of every review.
expanded_text = data["text"].apply(expand_contractions)

Define stop word and special character sets

In [6]:
# Stop words, written as a whitespace-separated block of text and split into
# a set of individual words.
STOP_WORDS = {
    word
    for word in """
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough even ever every
everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full
further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred
i if in indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such
take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split()
}
len(STOP_WORDS)

305

In [7]:
# Start from the individual punctuation / whitespace characters.
SPECIAL_CHARS = set('.,!?/\n;" ()-_&:')
# Add runs of 0 to 4 newlines and 0 to 4 spaces (n == 0 adds the empty string).
for n in range(5):
    SPECIAL_CHARS.update((n * '\n', n * ' '))
SPECIAL_CHARS.add("...")
print(SPECIAL_CHARS)

# Tokens to ignore when counting: every stop word plus every special character string.
INVALID_TOKENS = STOP_WORDS.union(SPECIAL_CHARS)

{'', '  ', ' ', '\n\n\n\n', '\n\n\n', '    ', '-', '\n\n', '/', '!', '&', ';', ',', '(', ')', '?', '   ', '\n', '_', ':', '.', '...', '"'}


Apply spaCy

In [8]:
# Load the 'en_core_web_sm' spaCy pipeline and run it over every expanded review text.
nlp = spacy.load('en_core_web_sm')
processed_text = [doc for doc in nlp.pipe(expanded_text)]

In [14]:
# For the first review, print every token tagged as a verb next to its subtree.
for token in processed_text[0]:
    if token.pos_ != "VERB":
        continue
    print(token.text, list(token.subtree))

Paid [Paid, for, a, sea, view, or, garden, view, ,, got, a, ground, floor, room, facing, trees, with, a, road, behind, .]
got [got, a, ground, floor, room, facing, trees, with, a, road, behind]
facing [facing, trees, with, a, road, behind]
sticking [sticking, out, and, air, con]
working [was, not, working]
Would [Would]
move [Would, not, move, us, to, another, room, because, they, said, they, had, closed, down, most, of, the, hotel, as, it, was, end, of, the, season, .]
said [because, they, said, they, had, closed, down, most, of, the, hotel, as, it, was, end, of, the, season]
closed [they, had, closed, down, most, of, the, hotel, as, it, was, end, of, the, season]
sort [to, sort, out, the, bed, and, and, air, con]
cleaned [The, room, was, cleaned, well, every, day, ., 

]
travelled [We, travelled, at, the, end, of, Oct, into, first, week, of, Nov, ,, everything, is, closing]
closing [,, everything, is, closing]
help [to, help, and, receptionists, unhelpful]
closed [They, actually, clo

In [15]:
# Display the first processed review.
processed_text[0]

Paid for a sea view or garden view, got a ground floor room facing trees with a road behind. One of the beds had springs sticking out and air con was not working. Would not move us to another room because they said they had closed down most of the hotel as it was end of the season. They were quick to sort out the bed and and air con. The room was cleaned well every day. 

We travelled at the end of Oct into first week of Nov, everything is closing, no reps on hand to help and receptionists unhelpful. They actually closed the hotel the day before we left and transferred us to their other hotel but the room was not ready! Really inconvenient when you have to pack early and have wet clothes to dry and took up valuable daylight time as the sun sets at 5:30pm. 

Apart from the receptionists, the staff were really friendly and nice. Food and drinks are expensive at the hotel and you need to either get their shuttle bus, the normal bus or hire a car, there is not anything around there apart f

In [13]:
# NOTE(review): `test_token` is not defined in any cell visible here — presumably a
# token picked by hand from a processed review; define it explicitly so this cell
# runs on a fresh kernel.
list(test_token.subtree)

[sea, view, or]

Find most common words

1. Simple counting of most common words 

In [18]:
# Group every valid token of every review under its lower-cased text.
text_to_token = {}  # Dict[str, List[spacy.Token]]
for review in processed_text:
    for token in review:
        text = token.text.lower()
        if text in INVALID_TOKENS:
            continue
        text_to_token.setdefault(text, []).append(token)

# Score of a text = number of tokens collected for it.
text_score = collections.Counter(
    {word: len(occurrences) for word, occurrences in text_to_token.items()}
)
text_score.most_common(10)

[('hotel', 1059),
 ('room', 542),
 ('pool', 512),
 ('staff', 453),
 ('good', 430),
 ('food', 416),
 ('great', 290),
 ('nice', 264),
 ('day', 258),
 ('bar', 251)]

2. Find sentences using verbs

In [9]:
class EnhancedToken:
    """A token together with the verbs it has been associated with.

    Besides the token and the list of verbs, the instance remembers the
    longest verb subtree seen so far, exposed as ``sentence``.
    """

    def __init__(self, token: spacy.tokens.Token, verb: spacy.tokens.Token):
        self.token = token
        self.verbs = [verb]
        # The first verb's subtree is the initial "largest sentence".
        self._largest_sentence = list(verb.subtree)

    def append(self, verb: spacy.tokens.Token):
        """Record another verb; keep its subtree if it is strictly longer."""
        self.verbs.append(verb)
        candidate = list(verb.subtree)
        if len(self._largest_sentence) < len(candidate):
            self._largest_sentence = candidate

    @property
    def sentence(self):
        """Returns the largest sentence generated from verb subtrees."""
        return self._largest_sentence

    @property
    def sentence_str(self):
        """The sentence's token texts joined by single spaces."""
        words = [tok.text for tok in self.sentence]
        return " ".join(words)

In [20]:
# Index, per lower-cased text, one EnhancedToken for each distinct token that
# appears in the subtree of at least one verb.
text_to_enhtoken = {}  # Dict[str, List[EnhancedToken]]
for review in processed_text:
    # Tokens of the current review that already own an EnhancedToken.
    token_to_enhtoken = {}
    for verb in review:
        if verb.pos_ != "VERB":
            continue
        for token in verb.subtree:
            text = token.text.lower()
            if text in INVALID_TOKENS:
                continue
            if token in token_to_enhtoken:
                # Token already registered: just attach this extra verb.
                token_to_enhtoken[token].append(verb)
                continue
            # First time this token shows up: register it under its text.
            enh_token = EnhancedToken(token, verb)
            token_to_enhtoken[token] = enh_token
            text_to_enhtoken.setdefault(text, []).append(enh_token)

# Score of a text = number of EnhancedTokens collected for it.
text_score = collections.Counter(
    {word: len(enh_tokens) for word, enh_tokens in text_to_enhtoken.items()}
)
text_score.most_common(10)

[('hotel', 582),
 ('room', 316),
 ('pool', 209),
 ('day', 185),
 ('staff', 185),
 ('food', 163),
 ('rhodes', 147),
 ('good', 130),
 ('night', 125),
 ('kresten', 124)]

In [25]:
# Show the 20 entries of text_score with the highest counts.
text_score.most_common(20)

[('hotel', 582),
 ('room', 316),
 ('pool', 209),
 ('day', 185),
 ('staff', 185),
 ('food', 163),
 ('rhodes', 147),
 ('good', 130),
 ('night', 125),
 ('kresten', 124),
 ('bar', 119),
 ('like', 119),
 ('stay', 110),
 ('recommend', 104),
 ('beach', 103),
 ('time', 102),
 ('went', 102),
 ('restaurant', 101),
 ('stayed', 100),
 ('palace', 99)]

In [26]:
# Print the sentence of every "restaurant" entry, each followed by a blank line.
for enhtoken in text_to_enhtoken["restaurant"]:
    print(enhtoken.sentence_str, end="\n\n")

guided by the restaurant manager Georgios and an always perfect service by Margarita , Antonius and Filip

, the cost of beers changed , pool 3.6 Euro , restaurant 4 Euro and Bar 4.2 Euro .

Special mention goes to Anna in the restaurant who looked after us all really well all week , making sure we could all sit together at breakfast and dinner .  

The main restaurant as lovely , lots of staff , kept immaculately clean but had a great atmosphere , never too crowded , did not feel like a cattle market like some hotels can .

lunches Equally the main bar and restaurant staff

PS you can wear shorts at the restaurant , even dinner now , as the rules have been relaxed as its so hot there , so Thank you Giorgos for that ! 



the restaurant staff , always busy , if you talk to them they will talk to you .. 


not forgetting the Restaurant Manager Giorgos Mauros , firm but nice , a good man always motivating his staff and welcoming guests and keeping this complicated vast catering machine g

In [16]:
# NOTE(review): this cell carries execution count 16, lower than the In [20] cell
# that fills text_to_enhtoken, so the recorded `{}` output looks stale — re-run
# after In [20] to confirm.
text_to_enhtoken

{}