# **Group 1**
# **Project 339 Hotel Reviews**
## **Part 1 - EDA**

In [3]:
pip install swifter

Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 968.5 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py): started
  Building wheel for swifter (setup.py): finished with status 'done'
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16519 sha256=b5cb25f33e190f728f885d2be34ef36f9266e25c36b4a0a4c2fc1cd830953be3
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\43\a7\a3\1194ca51c35c2a0c0041c97e4a9c1f0ed82a20cb3b1b08d610
Successfully built swifter
Installing collected packages: swifter
Successfully installed swifter-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
from os import cpu_count
import re
import pickle
from collections import defaultdict, Counter


import swifter
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from wordcloud import WordCloud
import plotly.io as pio
pio.templates.default = "plotly"

import json
import spacy

from nltk import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages this notebook relies on.
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
nltk.download('punkt')  # NOTE(review): not referenced directly by the visible cells - confirm it is still needed
nltk.download('vader_lexicon')  # lexicon for SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Shared resources used by the preprocessing helpers defined below.
stop_words = stopwords.words("english")  # NLTK's English stop-word list
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used for lemmatization
CPU_COUNT = cpu_count()  # passed to swifter as the number of partitions
# Negation words that remove_stopwords() keeps even when they are stop words.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open('../Data/negations.json', encoding='utf-8') as f:
    negations = json.load(f)

## Loading Data

In [None]:
# Read the raw reviews from the Excel workbook (path is relative to the working directory).
df = pd.read_excel("hotel_reviews.xlsx")
df.head()  # preview the first rows

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

## Checking for missing/null values

In [None]:
# Number of missing values in each column.
df.isna().sum()

#### *There are no null/missing values*

## Preprocessing Reviews

In [None]:
def make_string(tokens):
    """Concatenate ``tokens`` into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

In [None]:
def strip_leading_trailing_space(text):
    """Return ``text`` with the whitespace at both ends removed."""
    trimmed = text.strip()
    return trimmed

In [None]:
def remove_alphanumeric(text):
    """Delete every word that contains a digit and is followed by a space.

    The trailing space is part of the match, so a digit-bearing word at the
    very end of ``text`` (or one followed by punctuation) is left in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the matching words (and their trailing space) removed.
    """
    # Raw string: in a plain literal "\w" and "\d" are invalid escape
    # sequences, which newer Python versions warn about.
    pattern = re.compile(r'\w*\d\w* ')
    return pattern.sub('', text)

In [None]:
def add_space_after_punctuation(text):
    """Insert a space after each '.' or ',' that is directly followed by a non-space character.

    Args:
        text: The string to adjust.

    Returns:
        ``text`` with a single space inserted at every such position; a '.' or
        ',' at the end of the string or already followed by whitespace is untouched.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # The pattern is zero-width (lookbehind + lookahead), so sub() only inserts.
    pattern = re.compile(r'(?<=[.,])(?=[^\s])')
    return pattern.sub(' ', text)

In [None]:
def remove_unknown(text):
    """Strip the two known mis-encoded character sequences out of ``text``."""
    # Applied in this order, each on the result of the previous removal.
    for artifact in ('__Çî__otal', '__Ç_é_'):
        text = re.sub(artifact, '', text)
    return text

In [None]:
def correct_spellings(text):
    """Normalise "can't" fragments and one-letter shorthand through a chain of regex substitutions.

    The substitutions run in the order written; each one operates on the
    output of the previous one. Returns the rewritten string.
    """
    # Surround every "can't" matched by the pattern with a space on each side.
    text = re.sub(r"\b(can't)\b|\B(can't)\B", r' \1\2 ', text)
    # Re-join the split form "ca n't" when it follows whitespace and precedes whitespace or . , ! ?
    text = re.sub(r"(?<=\s)ca n't(?=[\s.,!?])", "can't", text)
    # NOTE(review): a stand-alone "n't" is rewritten to "can't" as well, so " do n't " becomes
    # " do can't " - confirm this is the intended replacement.
    text = re.sub(r"(?<=\s)n't(?=[\s.,!?])", "can't", text)
    # Expand the stand-alone shorthand "n" to "and" ...
    text = re.sub(r'(?<=\s)n(?=[\s.,!?])', "and", text)
    # ... and the stand-alone shorthand "u" to "you".
    text = re.sub(r'(?<=\s)u(?=[\s.,!?])', "you", text)
    return text

In [None]:
def remove_tokens_with_special_characters(tokens):
    """Return the tokens that contain nothing but ASCII letters and digits."""
    non_alnum = re.compile('[^a-zA-Z0-9]')
    kept = []
    for tok in tokens:
        # A token is dropped as soon as any character falls outside a-z, A-Z, 0-9.
        if non_alnum.search(tok) is None:
            kept.append(tok)
    return kept

In [None]:
def remove_tokens_with_numbers(tokens):
    """Drop every token that contains at least one digit.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the digit-free tokens, in their original order.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'\d')
    return [token for token in tokens if not pattern.search(token)]

In [None]:
def lemmatize(text):
    """Run ``text`` through the spaCy pipeline ``nlp`` and return each token's lemma as a list."""
    return [tok.lemma_ for tok in nlp(text)]

In [None]:
def to_lowercase(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for tok in tokens:
        lowered.append(tok.lower())
    return lowered

In [None]:
def remove_stopwords(tokens):
    """Filter out stop words, except those that also appear in ``negations``.

    A token is kept when it is not in ``stop_words`` or when it is listed in
    ``negations``; token order is preserved.
    """
    return [tok for tok in tokens if tok not in stop_words or tok in negations]

In [None]:
def preprocess_reviews(text):
    """Turn one raw review string into a list of cleaned tokens.

    The string-level cleaners run first, then the text is lemmatized into
    tokens, and finally the token-level filters run - all in the order listed.
    """
    string_steps = (
        strip_leading_trailing_space,
        remove_unknown,
        add_space_after_punctuation,
        correct_spellings,
        remove_alphanumeric,
    )
    for step in string_steps:
        text = step(text)

    tokens = lemmatize(text)

    token_steps = (
        to_lowercase,
        remove_stopwords,
        remove_tokens_with_special_characters,
        remove_tokens_with_numbers,
    )
    for step in token_steps:
        tokens = step(tokens)
    return tokens

### Performing preprocessing techniques on reviews and saving them on disk

In [None]:
# Apply the preprocessing pipeline to every review through swifter,
# using one partition per CPU and showing a progress bar.
reviews = (
    df['Review']
    .swifter.set_npartitions(CPU_COUNT)
    .progress_bar(desc="Preprocessing Reviews")
    .apply(preprocess_reviews)
)

In [None]:
# Pickle the preprocessed reviews into the working directory so they can be reloaded later.
print("Saving preprocessed reviews to disk...")
with open('preprocessed_reviews.pkl', 'wb') as f:
    pickle.dump(reviews, f)
print("Done!")

### Loading preprocessed reviews from disk

In [None]:
# Reload the pickled reviews from the working directory - the same relative
# path the save cell writes to (the previous leading "/" pointed at the
# filesystem root, where the file is never written).
# Only unpickle files this notebook produced itself: pickle.load can execute
# arbitrary code from an untrusted file.
with open('preprocessed_reviews.pkl', 'rb') as f:
    reviews = pickle.load(f)
reviews

## Performing EDA

### 1. Rating distribution

In [None]:
# Number of reviews per star rating.
ratings = df["Rating"].value_counts()
# Pass the index and counts explicitly: the name of the value_counts() result
# ("count") depends on the pandas version, so selecting it by name is fragile.
fig = px.bar(x=ratings.index, y=ratings.to_numpy(), labels={"x": "Rating", "y": "Count"})
fig.update_layout(xaxis_title="Rating", yaxis_title="Count")
fig.show()

#### Inference-
- A large number of customers have given good ratings of 4-5 stars.
- Few customers have given low ratings of 1-2 stars.
- This implies that most of the customers are largely satisfied with their stay at the hotel.
- A large number of hotels are providing good and useful services to customers.
- A few hotels are lacking somewhat in their quality of service.

### 2. Finding out what words stand out in cleaned reviews using word cloud

In [None]:
# Merge all cleaned reviews into one long string: first one string per review,
# then all of those joined together.
cleaned_review_texts = reviews.apply(make_string).to_numpy()
all_reviews = make_string(cleaned_review_texts)

# Build the word cloud from the merged text.
wordcloud_options = dict(
    width=1280,
    height=720,
    stopwords=stop_words,
    max_words=100,
    background_color="white",
    margin=3,
)
wordcloud = WordCloud(**wordcloud_options).generate(all_reviews)

In [None]:
# Render the word cloud without axes.
plt.figure(dpi=150, figsize=(8,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Cleaned Reviews", pad=15, weight="bold")
plt.axis("off")
# End the cell with show() so the return value of plt.axis() is not echoed as cell output.
plt.show()

#### Inference-
- From the word cloud it can be concluded that most of the reviews contain positive words; this means most of the reviews about hotels are positive.
- So, the majority of hotels are of a good standard and are providing good and useful services to customers.

### 3. N-Grams

In [None]:
def create_bigrams(token):
    """Return every run of two consecutive items of ``token`` as a list of tuples."""
    pairs = ngrams(token, 2)
    return list(pairs)

def create_trigrams(token):
    """Return every run of three consecutive items of ``token`` as a list of tuples."""
    triples = ngrams(token, 3)
    return list(triples)

In [None]:
def get_ngrams_frequency(ngrams, top_n=None):
    """Count how often each n-gram occurs across all reviews.

    Args:
        ngrams: Iterable with one entry per review; each entry is an iterable
            of n-grams, and each n-gram is a sequence of string tokens.
        top_n: When truthy, only the ``top_n`` most frequent n-grams are
            returned, most frequent first. Otherwise every distinct n-gram is
            returned in first-seen order.

    Returns:
        A ``(labels, counts)`` pair of equally long tuples, where each label is
        the n-gram's tokens joined by a space. Both tuples are empty when the
        input contains no n-grams.
    """
    ngrams_freq = Counter(" ".join(gram) for grams in ngrams for gram in grams)
    if not ngrams_freq:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise ValueError.
        return (), ()
    ranked = ngrams_freq.most_common(top_n) if top_n else ngrams_freq.items()
    labels, counts = zip(*ranked)
    return labels, counts

### Creating n-grams from cleaned reviews

In [None]:
# One list of n-gram tuples per review, built from the cleaned tokens.
bigrams = reviews.apply(create_bigrams)
trigrams = reviews.apply(create_trigrams)

### Plotting Top Frequent N-grams

In [None]:
# How many of the most frequent n-grams each chart below shows.
top_n = 20

#### Bigrams

In [None]:
# The top_n most frequent bigrams across all reviews, most frequent first.
bigram_labels, bigram_counts = get_ngrams_frequency(bigrams, top_n)
fig = px.bar(x=bigram_labels, y=bigram_counts)
fig.update_layout(xaxis_title="Bigram", yaxis_title="Count", title=f"Top {top_n} Bigrams")
fig.show()

#### Trigrams

In [None]:
# The top_n most frequent trigrams across all reviews, most frequent first.
trigram_labels, trigram_counts = get_ngrams_frequency(trigrams, top_n)
fig = px.bar(x=trigram_labels, y=trigram_counts)
fig.update_layout(xaxis_title="Trigram", yaxis_title="Count", title=f"Top {top_n} Trigrams")
fig.show()

#### Inference-
- The most frequent bigrams and trigrams sound positive, like **"great location"**, **"clean room"**, **"staff friendly helpful"**, **"highly recommend hotel"**, **"flat screen tv"**, **"hotel staff friendly"**, etc.
- This means that most of the hotels are of a good standard and provide good and useful services to customers, resulting in a large number of happy customers.

### 4. Sentiment Analysis

In [None]:
# One space-joined string per review; this is what the sentiment analyzer scores below.
reviews_strings = reviews.apply(make_string)

In [None]:
# Score every cleaned review with VADER.
analyzer = SentimentIntensityAnalyzer()
scores = [analyzer.polarity_scores(review) for review in reviews_strings]

# Collect the compound score and a label derived from its sign:
# above zero is Positive, below zero is Negative, exactly zero is Neutral.
result_dict = defaultdict(list)
result_dict["Cleaned_Reviews"] = reviews_strings
for polarity in scores:
    compound = polarity["compound"]
    if compound > 0:
        label = "Positive"
    elif compound < 0:
        label = "Negative"
    else:
        label = "Neutral"
    result_dict["Score"].append(compound)
    result_dict["Sentiment"].append(label)

scores_df = pd.DataFrame(result_dict)

#### Save cleaned reviews with their sentiment label

In [None]:
# Pair each review's token list with its sentiment label and pickle the result.
# Neutral labels are stored as Positive, leaving two classes in the saved frame.
sentiment_labels = scores_df['Sentiment'].mask(scores_df['Sentiment'] == 'Neutral', 'Positive')
final_df = pd.DataFrame({'Reviews': reviews, 'Sentiment': sentiment_labels})
final_df.to_pickle('processed_reviews_with_sentiment.pkl')

### Overall Sentiment

In [None]:
# Share of reviews per sentiment label, drawn as a donut chart.
value_counts = scores_df["Sentiment"].value_counts()
fig = px.pie(
    names=value_counts.index,
    values=value_counts.values,
    # NOTE(review): the colours are given as a positional sequence, so which sentiment
    # gets which colour presumably follows the order of value_counts - confirm the mapping.
    color_discrete_sequence=["#00CC96", "#636EFA", "#EF553B"],
    hole=0.5
)
fig.update_layout(legend={"title": "Sentiment"})
fig.show()

### Review count for each sentiment category

In [None]:
# Absolute number of reviews per sentiment label.
fig = px.bar(x=value_counts.index, y=value_counts)
fig.update_layout(xaxis_title="Sentiment", yaxis_title="Count")
fig.show()

#### Inference-
- As speculated earlier from the word clouds and n-grams, the majority of the reviews are indeed positive according to the sentiment analysis.
- The number of neutral reviews is insignificant; hence they are ignored in further steps.

### Distribution of Sentiment by Rating

In [None]:
# scores_df is built with only Cleaned_Reviews, Score and Sentiment, so grouping it
# by "Rating" raises KeyError. Attach the ratings from df first; assign() aligns on
# the index (assumes scores_df still carries df's row index - verify if the reviews
# were reloaded from a pickle produced by a different run).
rating_counts = (
    scores_df.assign(Rating=df["Rating"])
    .groupby(["Rating", "Sentiment"])
    .size()
    .unstack()
    .fillna(0)
)
# One stacked bar segment per sentiment label.
fig = go.Figure()
for sentiment in rating_counts.columns:
    fig.add_trace(go.Bar(
        name=sentiment,
        x=rating_counts.index,
        y=rating_counts[sentiment]))
fig.update_layout(barmode="stack")
fig.update_layout(xaxis_title="Rating", yaxis_title="Count", legend={"title": "Sentiment"})
fig.show()

#### Inference-
- With increase in rating, number of positive reviews also increases.
- For the 1-star rating there are nearly equal numbers of negative and positive reviews.

#### *Since there are fewer neutral reviews, not taking them into account for subsequent actions.*

## Comparing Positive and Negative Reviews

### 1. Finding what words stand out for both sentiments by creating word clouds

In [None]:
# Split the scored reviews by label; rows labelled Neutral fall into neither subset.
positive_reviews = scores_df[scores_df["Sentiment"] == "Positive"]
negative_reviews = scores_df[scores_df["Sentiment"] == "Negative"]

In [None]:
# Join all reviews of each sentiment into one long string for the word clouds.
merged_positive_reviews = make_string(positive_reviews["Cleaned_Reviews"].to_numpy())
merged_negative_reviews = make_string(negative_reviews["Cleaned_Reviews"].to_numpy())

In [None]:
# Both clouds are built with the same settings so they can be compared side by side.
sentiment_cloud_options = dict(width=1280, height=720, max_words=100, background_color="white", margin=3)
positive_wordcloud = WordCloud(**sentiment_cloud_options).generate(merged_positive_reviews)
negative_wordcloud = WordCloud(**sentiment_cloud_options).generate(merged_negative_reviews)

In [None]:
# Positive cloud on the left, negative cloud on the right, both without axes.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12,7), dpi=250)
panels = (
    ("Positive Reviews", positive_wordcloud),
    ("Negative Reviews", negative_wordcloud),
)
for ax, (panel_title, cloud) in zip(axs, panels):
    ax.imshow(cloud, interpolation="bilinear")
    ax.set_title(panel_title, pad=15, weight="bold")
    ax.axis("off")

# Vertical divider at the horizontal centre of the figure (figure coordinates).
line = Line2D([0.5, 0.5], [0.25, 0.75], transform=fig.transFigure, figure=fig, color='black')
fig.lines.append(line)

plt.tight_layout(pad=2)

#### Inference-
- **"hotel"** is the main topic in both positive and negative reviews, as is to be expected.
- In negative reviews **"room"** is the other major topic. This indicates that people who wrote negative reviews faced most of their problems in their rooms. This makes sense: hotels are used for temporary stays, so rooms should be in good condition, and if anything in a room is not appropriate, customers will not be happy.
- In positive reviews there are many positive words like **"clean room"**, **"excellent"**, **"fun"**, **"great location"**, **"great hotel"**, etc.
- In negative reviews most of the words still sound positive, but there are negative words like **"bad"**, **"rude"**, **"small"**, **"noting"**, etc.; overall these reviews are still negative, which means some of the hotels are lacking a little in providing good and useful services to customers.