# **Description**

# Abstract

Explore Hotel aspects and Predict the rating of each review.

# About this dataset

Hotels play a crucial role in traveling, and with increased access to information, new ways of selecting the best ones have emerged.
With this dataset, consisting of 20k reviews crawled from Tripadvisor, you can explore what makes a great hotel and maybe even use this model in your travels!

# How to use

- Predict Review Rating
- Topic Modeling on Reviews
- Explore key aspects that make hotels good or bad

# Acknowledgements

If you use this dataset in your research, please credit the authors.

# Citation

Alam, M. H., Ryu, W.-J., Lee, S., 2016. Joint multi-grain topic sentiment: modeling semantic aspects for online reviews. Information Sciences 339, 206–223.
DOI

# License

CC BY NC 4.0

# Splash banner

Photo by Rhema Kallianpur on Unsplash.

# Splash icon

Logo by Tripadvisor.

# Importing Libraries

In [None]:
# Installs bert-for-tf2; the notebook imports `bert` below and uses its FullTokenizer.
!pip install bert-for-tf2

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert
import plotly.express as px
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
import spacy

# Analyzing the Data

In [None]:
# Load the hotel reviews CSV (relative path: the dataset is expected under ../input/).
data = pd.read_csv('../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')
# Show the frame as the cell output.
data

In [None]:
sns.heatmap(data.isnull());

In [None]:
temp = data.describe()
temp.style.background_gradient(cmap='Oranges')

In [None]:
# Bar chart of the number of reviews per rating, with each bar annotated by
# its share of all reviews.
total = len(data)
ax1 = plt.figure(figsize=(12,5))

g = sns.countplot(x='Rating', data=data)
g.set_title("Evaluation", fontsize=20)
g.set_xlabel("Evaluation", fontsize=17)
g.set_ylabel("Values", fontsize=17)

# Bar heights are the per-rating counts.
sizes = [bar.get_height() for bar in g.patches]
for bar, count in zip(g.patches, sizes):
    # Centre the percentage label horizontally, just above the bar.
    g.text(bar.get_x() + bar.get_width() / 2.,
           count + 3,
           '{:1.2f}%'.format(count / total * 100),
           ha="center", fontsize=10)
# Leave 10% headroom above the tallest bar so the labels fit.
g.set_ylim(0, max(sizes) * 1.1)

Next we need to decide, based on the rating scores, what counts as a negative review.

Let's treat a rating below 2 (i.e. a 1-star review) as negative and every other rating as positive; we can consider such a score to be below 50%.

In [None]:
# Binarise the star rating: 1 = positive (rating >= 2), 0 = negative (rating < 2).
# The original stars are preserved in 'RatingRaw' and the label is always
# derived from that copy, so re-running this cell is safe. Thresholding the
# already-binarised 0/1 values again would turn every label into 0.
if 'RatingRaw' not in data.columns:
    data['RatingRaw'] = data['Rating']
data['Rating'] = (data['RatingRaw'] >= 2).astype('int64')

In [None]:
# Split the reviews by their binary label into two frames.
is_positive = data['Rating'] == 1
is_negative = data['Rating'] == 0
positive = data[is_positive]
negative = data[is_negative]

In [None]:
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('fast')

# Build the cloud from the full text of every positive review. str(Series)
# would only yield the truncated pandas repr (a handful of clipped rows plus
# index numbers and the "Name/Length/dtype" footer), not the actual reviews.
positive_text = ' '.join(positive['Review'].astype(str))
wc = WordCloud(background_color = 'orange', width = 1500, height = 1500).generate(positive_text)
plt.title('Description Positive', fontsize = 15)

plt.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('fast')

# Build the cloud from the full text of every negative review. str(Series)
# would only yield the truncated pandas repr (a handful of clipped rows plus
# index numbers and the "Name/Length/dtype" footer), not the actual reviews.
negative_text = ' '.join(negative['Review'].astype(str))
wc = WordCloud(background_color = 'orange', width = 1500, height = 1500).generate(negative_text)
plt.title('Description Negative', fontsize = 15)

plt.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
data.head()

# Processing

In [None]:
def clean_t(t):
  """Strip markup and noise from a raw review string.

  The text is first run through BeautifulSoup to drop any HTML. Then
  @handles and http(s) URLs are blanked out, every character other than an
  ASCII letter or one of . ! ? is turned into a space, and runs of spaces are
  squeezed into a single space.

  Returns the cleaned string; leading/trailing spaces are not trimmed.
  """
  text = BeautifulSoup(t, 'lxml').get_text()
  # Applied in this order; each match is replaced by a single space.
  patterns = (
      r"@[A-Za-z0-9]+",            # @mentions
      r"https?://[A-Za-z0-9./]+",  # URLs
      r"[^a-zA-Z.!?]",             # anything but letters and . ! ?
      r" +",                       # repeated spaces
  )
  for pattern in patterns:
    text = re.sub(pattern, ' ', text)
  return text

In [None]:
test = '99 ' + data.Review[0]
test

In [None]:
result = clean_t(test)
result

In [None]:
# Clean every review text; the result is a plain list in the same order as the frame.
data_clean = list(map(clean_t, data.Review))

In [None]:
data_clean[0:4]

In [None]:
# Tokenizer class taken from the imported `bert` package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Load the BERT layer from TF Hub with frozen weights (trainable=False). In the
# code shown here it is only used as the source of the vocabulary file and the
# lower-casing flag needed to build the tokenizer.
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1', trainable=False)
# Vocabulary asset path and casing flag stored on the loaded hub object.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer configured to match the loaded model's vocabulary and casing.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
vocab_file

In [None]:
len(tokenizer.vocab)

In [None]:
tokenizer.tokenize('My dog likes strawberries.')

In [None]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('My dog likes strawberries.'))

In [None]:
def encode_sentence(sent):
  """Convert a sentence into its sequence of vocabulary ids.

  The text is split into tokens by the module-level `tokenizer` and every
  token is mapped to its id; nothing else is added to the sequence here.
  """
  tokens = tokenizer.tokenize(sent)
  return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
encode_sentence('my dog likes strawberries')

In [None]:
# Encode every cleaned review as a sequence of vocabulary ids.
data_inputs = list(map(encode_sentence, data_clean))

In [None]:
data_inputs[1]

# Database creation

In [None]:
# Labels as a NumPy array. Select the column by name instead of by position so
# the result does not depend on the column order of the frame.
data_labels = data['Rating'].values

In [None]:
data_labels

In [None]:
# One row per review: [token ids, label, number of tokens].
data_with_len = [[token_ids, label, len(token_ids)]
                 for token_ids, label in zip(data_inputs, data_labels)]

In [None]:
data_with_len[0:2]

In [None]:
# Shuffle first so that reviews with the same token count end up in random
# relative order (list.sort is stable), then order by token count; presumably
# so that each padded batch groups sequences of similar length.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda row: row[2])
# Keep (ids, label) pairs only for reviews longer than 7 tokens.
sorted_all = [(token_ids, label)
              for token_ids, label, n_tokens in data_with_len
              if n_tokens > 7]

In [None]:
sorted_all[3000:3005]

In [None]:
# Wrap the (ids, label) pairs in a tf.data pipeline, declaring both elements as
# int32. A generator is used presumably because the id sequences differ in
# length and therefore cannot be stacked into a single tensor.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types = (tf.int32, tf.int32))

In [None]:
next(iter(all_dataset))

In [None]:
# Number of reviews per batch.
BATCH_SIZE = 32
# Batch the examples, padding each id sequence to the longest one in its batch
# (shape (None, )); labels are scalars (shape ()) and need no padding.
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [None]:
next(iter(all_batched))

In [None]:
len(sorted_all)

In [None]:
# Number of full batches (floor division, so a final partial batch is not counted).
NB_BATCHES = len(sorted_all) // BATCH_SIZE
NB_BATCHES

In [None]:
# One tenth of the batches; used below as the size of the held-out test split.
NB_BATCHES_TEST = NB_BATCHES // 10
NB_BATCHES_TEST

In [None]:
# Dataset.shuffle returns a NEW dataset instead of shuffling in place, so its
# result has to be kept. Without it the split is taken from the length-sorted
# batches and the test set is simply the shortest reviews.
# A fixed seed with reshuffle_each_iteration=False keeps the batch order
# identical on every pass, so take()/skip() yield disjoint test/train sets that
# do not change (or leak into each other) between epochs.
shuffled_batches = all_batched.shuffle(NB_BATCHES + 1,  # +1 covers a final partial batch
                                       seed=42,
                                       reshuffle_each_iteration=False)
test_dataset = shuffled_batches.take(NB_BATCHES_TEST)
train_dataset = shuffled_batches.skip(NB_BATCHES_TEST)

In [None]:
next(iter(test_dataset))

In [None]:
next(iter(train_dataset))

# Model building

In [None]:
class DCNN(tf.keras.Model):
  """Text classifier built from parallel 1-D convolutions over token embeddings.

  Token ids are embedded and passed through three Conv1D branches with kernel
  sizes 2, 3 and 4. Each branch is reduced with global max pooling, the three
  results are concatenated, and the features go through a dense layer, dropout
  and the output layer.

  Args:
    vocab_size: number of entries in the embedding table.
    emb_dim: embedding dimension.
    nb_filters: number of filters in each convolution branch.
    FFN_units: units of the hidden dense layer.
    nb_classes: 2 builds a single sigmoid output unit; any other value builds
      a softmax over `nb_classes` units.
    dropout_rate: rate of the dropout applied after the hidden dense layer.
    training: unused; kept so existing constructor calls keep working.
    name: Keras model name.
  """

  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters = 50,
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name="dcnn"):
    super(DCNN, self).__init__(name=name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)

    # Three parallel branches that differ only in kernel size (2, 3, 4 tokens).
    self.bigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 2,
                                padding='valid',
                                activation='relu')
    self.trigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 3,
                                padding='valid',
                                activation='relu')
    self.fourgram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 4,
                                padding='valid',
                                activation='relu')

    # A single pooling layer instance is shared by all three branches.
    self.pool = layers.GlobalMaxPool1D()

    self.dense_1 = layers.Dense(units = FFN_units, activation='relu')
    self.dropout = layers.Dropout(rate=dropout_rate)
    if nb_classes == 2:
      # Binary case: one unit with a sigmoid activation.
      self.last_dense = layers.Dense(units=1, activation='sigmoid')
    else:
      self.last_dense = layers.Dense(units=nb_classes, activation='softmax')

  def call(self, inputs, training=None):
    """Run the forward pass.

    Args:
      inputs: integer token ids; assumed to be shaped (batch, sequence_length)
        (TODO confirm against callers). With 'valid' padding the kernel-size-4
        branch needs sequences of at least 4 tokens.
      training: forwarded to the dropout layer. Defaults to None so the model
        can also be called without the flag, leaving the decision to Keras.

    Returns:
      The output of the final dense layer (one sigmoid unit, or a softmax over
      the classes, depending on how the model was built).
    """
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    # Concatenate the pooled features of the three branches.
    merged = tf.concat([x_1, x_2, x_3], axis = -1)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training=training)
    output = self.last_dense(merged)

    return output

# Training

In [None]:
# Hyper-parameters passed to the DCNN constructor and to fit() below.
VOCAB_SIZE = len(tokenizer.vocab)  # size of the embedding table = tokenizer vocabulary size
EMB_DIM = 200                      # embedding dimension
NB_FILTERS = 100                   # filters per convolution branch
FFN_UNITS = 256                    # units of the hidden dense layer
NB_CLASSES = 2                     # 2 selects the single-unit sigmoid output
DROPOUT_RATE = 0.2                 # dropout rate after the hidden dense layer
NB_EPOCHS = 12                     # number of training epochs

In [None]:
# Build the model from the hyper-parameters defined above.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters = NB_FILTERS,
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE)

In [None]:
# Choose the loss/metric pair that matches the output layer: binary
# cross-entropy for the two-class model, the sparse categorical variants otherwise.
is_binary = NB_CLASSES == 2
loss_name = 'binary_crossentropy' if is_binary else 'sparse_categorical_crossentropy'
metric_name = 'accuracy' if is_binary else 'sparse_categorical_accuracy'
Dcnn.compile(loss=loss_name, optimizer='adam', metrics=[metric_name])

In [None]:
# Train on the training batches for NB_EPOCHS epochs. No validation data is
# passed here, so `history` only contains training metrics.
history = Dcnn.fit(train_dataset,
                   epochs=NB_EPOCHS)

# Evaluation

In [None]:
history.history.keys()

In [None]:
# Training loss per epoch; axis labels added so the figure can be read on its own.
plt.plot(history.history['loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss progress');

In [None]:
# The metric is logged as 'accuracy' by the binary compile branch and as
# 'sparse_categorical_accuracy' by the multi-class one, so use whichever key
# is present instead of assuming the binary case (which raised KeyError otherwise).
acc_key = 'accuracy' if 'accuracy' in history.history else 'sparse_categorical_accuracy'
plt.plot(history.history[acc_key])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy progress');

In [None]:
# Evaluate on the held-out test batches; presumably returns the loss followed by
# the metric configured in compile() — confirm against the Keras docs.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
def get_prediction(sentence):
  """Print 'positive' or 'negative' for a raw text `sentence`.

  The text is cleaned and encoded the same way as the training reviews, fed to
  the trained `Dcnn` in inference mode, and the sigmoid output is thresholded
  at 0.5.

  Only the binary model (single sigmoid output unit) is supported. Nothing is
  returned; the label is printed.
  """
  # Use the same cleaning as the training data so the model sees familiar input.
  tokens = encode_sentence(clean_t(sentence))
  # The widest convolution (kernel size 4, 'valid' padding) needs at least four
  # positions; pad short inputs with id 0, the value padded_batch pads with by
  # default.
  tokens = list(tokens) + [0] * max(0, 4 - len(tokens))
  inputs = tf.expand_dims(tokens, 0)  # add the batch dimension
  output = Dcnn(inputs, training=False)
  # Threshold the probability directly. floor(output * 2) maps an output of
  # exactly 1.0 to 2, which matched neither branch and printed nothing.
  probability = float(tf.squeeze(output))
  if probability >= 0.5:
    print('positive')
  else:
    print('negative')

In [None]:
get_prediction('This movie was pretty interesting')

In [None]:
get_prediction("I'd rather not do that again")

In [None]:
get_prediction("I don't like you")

# **If you find this notebook useful, support with an upvote** 👍