In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

sns.set()
%matplotlib inline

In [2]:
import spacy
from langdetect import detect
from textblob import TextBlob

spacy_eng = spacy.load('en_core_web_sm')

In [24]:
DATAFILE = 'London_hotel_reviews.csv'
data = pd.read_csv(DATAFILE,encoding='latin-1') 

In [25]:
data.head()

Unnamed: 0,Property Name,Review Rating,Review Title,Review Text,Location Of The Reviewer,Date Of Review
0,Apex London Wall Hotel,5,Ottima qualità prezzo,Siamo stati a Londra per un week end ed abbiam...,"Casale Monferrato, Italy",10/20/2012
1,Corinthia Hotel London,5,"By far, my best hotel in the world",I had a pleasure of staying in this hotel for ...,"Savannah, Georgia",3/23/2016
2,The Savoy,5,First visit to the American Bar at the Savoy,A very lovely first visit to this iconic hotel...,London,7/30/2013
3,Rhodes Hotel,4,Nice stay,3 of us stayed at the Rhodes Hotel for 4 night...,"Maui, Hawaii",6/2/2012
4,The Savoy,5,Perfection,Form the moment we arrived until we left we ex...,"London, United Kingdom",11/24/2017


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27330 entries, 0 to 27329
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Property Name             27330 non-null  object
 1   Review Rating             27330 non-null  int64 
 2   Review Title              27330 non-null  object
 3   Review Text               27330 non-null  object
 4   Location Of The Reviewer  23377 non-null  object
 5   Date Of Review            27329 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


Number of reviews per hotel

In [27]:
data['Property Name'].value_counts()

The Savoy                                                         5417
Mondrian London at Sea Containers                                 4330
The Rembrandt                                                     3028
Corinthia Hotel London                                            2820
Apex London Wall Hotel                                            2205
The Dorchester                                                    1720
Rhodes Hotel                                                      1554
Hotel Xenia, Autograph Collection                                 1524
Ridgemount Hotel                                                  1365
Mandarin Oriental Hyde Park, London                               1215
Bulgari Hotel, London                                              473
The Wellesley Knightsbridge, a Luxury Collection Hotel, London     354
The Lanesborough                                                   324
London Guest House                                                 263
Newham

<h3> Filter Non English Reviews <h3>

In [7]:
def filter_non_english(sentence):
    return detect(sentence) == 'en'

non_english_indices = [index for index in tqdm(range(len(data)))
                       if filter_non_english(data.iloc[index]['Review Text']) == False
                      ]

HBox(children=(FloatProgress(value=0.0, max=27330.0), HTML(value='')))




In [8]:
len(non_english_indices)

3758

In [9]:
non_english_df = data.iloc[non_english_indices]
non_english_df

Unnamed: 0,Property Name,Review Rating,Review Title,Review Text,Location Of The Reviewer,Date Of Review
0,Apex London Wall Hotel,5,Ottima qualità prezzo,Siamo stati a Londra per un week end ed abbiam...,"Casale Monferrato, Italy",10/20/2012
9,Rhodes Hotel,5,Ottima scelta!,"Ottimo rapporto qualità - prezzo, ottima la po...",Cagliari,2/7/2011
23,A To Z Hotel,5,Excellent rapport qualité prix,"A 15 mn à pied, 5 en bus (passage fréquent) de...","Paris, France",1/5/2016
27,Ridgemount Hotel,5,Excellent,Nous avons pris l'habitude de descendre dans c...,"Zurich, Switzerland",10/30/2011
38,Apex London Wall Hotel,5,Freundliches & zukommendes Personal sowie sehr...,Wir waren im Rahmen einer Städtereise für 3 Üb...,"Roedermark, Germany",9/15/2012
...,...,...,...,...,...,...
27317,"Mandarin Oriental Hyde Park, London",4,tolles Gebäude von aussen,schönes Haus von aussen und im Lobby/bar Berei...,"Zurich, Switzerland",5/29/2012
27322,The Savoy,5,Traumhafter Service,Wir haben gerade 4 Tage im The Savoy in London...,Hamburg,11/18/2014
27323,Corinthia Hotel London,4,Il mio primo afternoon tea........ al Corinthi...,"Io e mio marito, abbiamo passato le vacanze di...",,1/25/2015
27326,A To Z Hotel,3,Mala Estadia,En cuarto que nos tocó no había toallas y habí...,"Mexico City, Mexico",9/29/2015


In [28]:
data = data.drop(non_english_indices,axis = 0)
data = data.drop(['Location Of The Reviewer','Date Of Review'],axis=1)
data.head()

Unnamed: 0,Property Name,Review Rating,Review Title,Review Text
1,Corinthia Hotel London,5,"By far, my best hotel in the world",I had a pleasure of staying in this hotel for ...
2,The Savoy,5,First visit to the American Bar at the Savoy,A very lovely first visit to this iconic hotel...
3,Rhodes Hotel,4,Nice stay,3 of us stayed at the Rhodes Hotel for 4 night...
4,The Savoy,5,Perfection,Form the moment we arrived until we left we ex...
5,Corinthia Hotel London,1,Staff stole from me!!,Well I am no strange to London's 5star hotels ...


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23572 entries, 1 to 27329
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Property Name  23572 non-null  object
 1   Review Rating  23572 non-null  int64 
 2   Review Title   23572 non-null  object
 3   Review Text    23572 non-null  object
dtypes: int64(1), object(3)
memory usage: 920.8+ KB


In [31]:
# case text as lowercase, remove punctuation, remove extra whitespace in string and on both sides of string

data['review_cleaned'] = data['Review Text'].str.lower().str.replace("'", '').str.replace('[^\w\s]', ' ').str.replace(" \d+", " ").str.replace(' +', ' ').str.strip()


In [32]:
data

Unnamed: 0,Property Name,Review Rating,Review Title,Review Text,review_cleaned
1,Corinthia Hotel London,5,"By far, my best hotel in the world",I had a pleasure of staying in this hotel for ...,i had a pleasure of staying in this hotel for ...
2,The Savoy,5,First visit to the American Bar at the Savoy,A very lovely first visit to this iconic hotel...,a very lovely first visit to this iconic hotel...
3,Rhodes Hotel,4,Nice stay,3 of us stayed at the Rhodes Hotel for 4 night...,3 of us stayed at the rhodes hotel for nights ...
4,The Savoy,5,Perfection,Form the moment we arrived until we left we ex...,form the moment we arrived until we left we ex...
5,Corinthia Hotel London,1,Staff stole from me!!,Well I am no strange to London's 5star hotels ...,well i am no strange to londons star hotels an...
...,...,...,...,...,...
27321,Mondrian London at Sea Containers,5,Birthday Brunch,Recently came here for a friends birthday. We ...,recently came here for a friends birthday we d...
27324,"Mandarin Oriental Hyde Park, London",4,We love it every time!!!,Greatly located with lovely views of the park ...,greatly located with lovely views of the park ...
27325,Corinthia Hotel London,5,The Best 5 star Luxury Hotel in London,I come to London often but since I stayed in t...,i come to london often but since i stayed in t...
27327,The Rembrandt,4,Excellent hotel in outstanding location,This is a quality quiet hotel located in an ex...,this is a quality quiet hotel located in an ex...


In [41]:
for sentence in data['review_cleaned'].head(3).tolist():
    doc = spacy_eng(sentence)
    
    descriptive_terms = []
    
    for token in doc:
        if token.pos_ == 'ADJ':
            descriptive_terms.append(token)
    print('Sentence: \n============================================')
    print(sentence)
    print('Descriptive Terms: \n====================================')
    print(descriptive_terms)
    print('-'*50)

Sentence 
i had a pleasure of staying in this hotel for nights recently this hotel was perfect in every way communication with the hotel before staying was prompt and very efficient checking in was a breeze you go through the spectacular lobby with modern glass chandeliers and take the elevator to your room my room they gave me an upgrade to junior suite was spectacular we had a walk in closet of the size where you could have put a small bed in there it served us nicely for the seven day stay the decor was very refined and oh the bathroom carrera marble floor was heated throughout rain shower was to die for location as it turned out was as good as it can be we were minutes walk to trafalgar square but it was very quiet right outside was embankment tube stop we would walk to theater area and to numerous restaurants and many major sites such as london eye or westminster abbey were within walking distance we had buffet breakfast or room service every morning it was pricy but my rate inclu

Extracting intensifiers of the adjectives

In [42]:
for sentence in data['review_cleaned'].head(3).tolist():
    doc = spacy_eng(sentence)
    
    descriptive_terms = []
    
    for token in doc:
        if token.pos_ == 'ADJ':
            intensifier = ''
            for child in token.children:
                if child.pos_ != 'ADV':
                    continue
                intensifier += child.text + ' '
            descriptive_term = intensifier + token.text
            descriptive_terms.append(descriptive_term)
    print('Sentence: \n============================================')
    print(sentence)
    print('Descriptive Terms: \n====================================')
    print(descriptive_terms)
    print('-'*50)

Sentence: 
i had a pleasure of staying in this hotel for nights recently this hotel was perfect in every way communication with the hotel before staying was prompt and very efficient checking in was a breeze you go through the spectacular lobby with modern glass chandeliers and take the elevator to your room my room they gave me an upgrade to junior suite was spectacular we had a walk in closet of the size where you could have put a small bed in there it served us nicely for the seven day stay the decor was very refined and oh the bathroom carrera marble floor was heated throughout rain shower was to die for location as it turned out was as good as it can be we were minutes walk to trafalgar square but it was very quiet right outside was embankment tube stop we would walk to theater area and to numerous restaurants and many major sites such as london eye or westminster abbey were within walking distance we had buffet breakfast or room service every morning it was pricy but my rate incl

In [97]:
def extract_aspects(sentence):
    doc = spacy_eng(sentence)
    descriptive_term = ''
    target = ''
    temp_aspects = []
    
    for token in doc:
        if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
            target = token.text
            
        if token.pos_ == 'ADJ':
                intensifier = ''
                
                for child in token.children:
                    if child.pos_ != 'ADV':
                        continue
                    intensifier += child.text + ' '
                descriptive_term = intensifier + token.text
                temp_aspects.append({'aspect': target ,
                               'description': descriptive_term})
                
    return temp_aspects

In [98]:
extract_aspects(data['Review Text'].iloc[0])

[{'aspect': 'hotel', 'description': 'perfect'},
 {'aspect': 'Communication', 'description': 'prompt'},
 {'aspect': 'Communication', 'description': 'very efficient'},
 {'aspect': 'Communication', 'description': 'spectacular'},
 {'aspect': 'Communication', 'description': 'modern'},
 {'aspect': 'Communication', 'description': 'junior'},
 {'aspect': 'Communication', 'description': 'spectacular'},
 {'aspect': 'Communication', 'description': 'small'},
 {'aspect': 'decor', 'description': 'very refined'},
 {'aspect': 'shower', 'description': 'as good'},
 {'aspect': 'shower', 'description': 'very quiet'},
 {'aspect': 'shower', 'description': 'numerous'},
 {'aspect': 'shower', 'description': 'many'},
 {'aspect': 'shower', 'description': 'major'},
 {'aspect': 'sites', 'description': 'such'},
 {'aspect': 'sauna', 'description': 'weary'},
 {'aspect': 'sauna', 'description': 'many'},
 {'aspect': 'hotel', 'description': 'next'}]

In [154]:
aspects = {'aspect':list(),'description':list()}

for i in tqdm(range(len(data))):
    
    for curr_sent_aspect in extract_aspects(data['Review Text'].iloc[i]):
        curr_aspect = curr_sent_aspect['aspect']
        if aspect == '':
            continue
        
        aspects['aspect'].append(curr_aspect)
        aspects['description'].append(curr_sent_aspect['description'])
        
aspects = pd.DataFrame(aspects)

HBox(children=(FloatProgress(value=0.0, max=23572.0), HTML(value='')))




In [155]:
aspects

Unnamed: 0,aspect,description
0,hotel,perfect
1,Communication,prompt
2,Communication,very efficient
3,Communication,spectacular
4,Communication,modern
...,...,...
318912,staff,excellent
318913,savoy,favourite
318914,savoy,new
318915,savoy,welcome


In [156]:
polarity = []
subjectivity = []
for i in tqdm(range(len(aspects))):
    sentiment = TextBlob(aspects['description'].iloc[i]).sentiment
    polarity.append(sentiment.polarity)
    subjectivity.append(sentiment.subjectivity)

HBox(children=(FloatProgress(value=0.0, max=318917.0), HTML(value='')))




In [157]:
aspects['polarity'] = polarity
aspects['subjectivity'] = subjectivity

In [158]:
aspects

Unnamed: 0,aspect,description,polarity,subjectivity
0,hotel,perfect,1.000000,1.000000
1,Communication,prompt,0.000000,0.000000
2,Communication,very efficient,0.200000,0.300000
3,Communication,spectacular,0.600000,0.900000
4,Communication,modern,0.200000,0.300000
...,...,...,...,...
318912,staff,excellent,1.000000,1.000000
318913,savoy,favourite,0.000000,0.000000
318914,savoy,new,0.136364,0.454545
318915,savoy,welcome,0.800000,0.900000


In [187]:
counts = aspects['aspect'].value_counts()

In [198]:
spurious_aspects = [asp for asp,val in counts.items() if val < 5]

In [202]:
spurious_aspects[60:100]

['rep',
 'Steps',
 'Foyer',
 'Plumbing',
 'Door',
 'hardware',
 'goal',
 'outpost',
 'princess',
 'grandchildren',
 'cosies',
 'proof',
 'assistant',
 'themes',
 'tape',
 'yogamat',
 'privacy',
 'enthusiast',
 'reference',
 'myriad',
 'labeling',
 'bills',
 'ballrooms',
 'Gold',
 'oils',
 'Jaime',
 'habit',
 'pain',
 'irritations',
 'tasting',
 'granddaughters',
 '.It',
 '.service',
 'wings',
 'tunes',
 'conceige',
 'elite',
 'Trains',
 'rooftop',
 'grail']

In [178]:
def get_sentiment(polarity):
    if polarity > 0.1:
        return 1
    elif polarity >-0.1 and polarity < 0.1:
        return 0
    elif polarity < -0.1:
        return -1

In [179]:
aspects['sentiment'] = aspects['polarity'].apply(get_sentiment)

In [180]:
aspects

Unnamed: 0,aspect,description,polarity,subjectivity,sentiment
0,hotel,perfect,1.000000,1.000000,1.0
1,Communication,prompt,0.000000,0.000000,0.0
2,Communication,very efficient,0.200000,0.300000,1.0
3,Communication,spectacular,0.600000,0.900000,1.0
4,Communication,modern,0.200000,0.300000,1.0
...,...,...,...,...,...
318912,staff,excellent,1.000000,1.000000,1.0
318913,savoy,favourite,0.000000,0.000000,0.0
318914,savoy,new,0.136364,0.454545,1.0
318915,savoy,welcome,0.800000,0.900000,1.0


In [181]:
aspects['sentiment'].value_counts()

 1.0    181969
 0.0     95529
-1.0     33678
Name: sentiment, dtype: int64