In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Packages Required

In [None]:
!pip3 install ktrain

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objects as go
#import plotly.express as px
import matplotlib.pyplot as plt
import spacy
import tensorflow as tf
from wordcloud import WordCloud, STOPWORDS 
import ktrain
from ktrain import text

from collections import Counter
%matplotlib inline

In [None]:
# Read the hotel reviews CSV into `df` and preview the first rows.
reviews_csv_path = '../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv_path)
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

In [None]:
# True if at least one cell anywhere in the frame is missing, False otherwise.
df.isna().any().any()

# Sentiment Visualisation

In [None]:
# Star ratings grouped by the sentiment class they belong to.
pos = [4, 5]
neg = [1, 2]
neu = [3]

def sentiment(rating):
  """Map a star rating to a sentiment class id.

  Returns 2 when the rating is listed in `pos`, 0 when it is listed in
  `neg`, and 1 for every other value (rating 3 as well as anything that
  appears in neither list).
  """
  # Checked in the same order as the class lists are prioritised: positive, then negative.
  for members, label in ((pos, 2), (neg, 0)):
    if rating in members:
      return label
  return 1

In [None]:
# Derive the sentiment class (0/1/2) of every review from its star rating.
df['Sentiment'] = df['Rating'].map(sentiment)
df.head()

In [None]:
# Bar chart of how many reviews fall into each sentiment class.
sentiment_counts = df.Sentiment.value_counts()
sentiment_bars = go.Bar(x=sentiment_counts.index, y=sentiment_counts.tolist())
fig = go.Figure([sentiment_bars])
fig.update_layout(title="Values in each Sentiment",
                  xaxis_title="Sentiment",
                  yaxis_title="Values")
fig.show()

Here,

2 - Positive (4, 5)<br>
1 - Neutral (3)<br>
0 - Negative (1, 2)

The review distribution leans much more towards positive than negative. I am not comparing against neutral because it covers just one rating, whereas positive and negative each cover two ratings.

## Wordclouds of each Sentiment

Let us now look at the word distribution overall and for each sentiment.

In [None]:
# Load the English model by its package name. The bare 'en' shortcut link was
# removed in spaCy 3, so spacy.load('en') fails there.
# NOTE(review): assumes the old 'en' link pointed at en_core_web_sm (the default
# for `spacy download en`) and that this model package is installed — confirm.
nlp = spacy.load('en_core_web_sm')

def normalize(msg):
    """Tokenize a review and return its cleaned, lemmatized tokens.

    Args:
        msg: Review text to run through the spaCy pipeline `nlp`.

    Returns:
        list: Lower-cased lemmas of every token that is not a stop word,
        punctuation, or whitespace, in their original order.
    """
    return [
        token.lemma_.lower()
        for token in nlp(msg)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

In [None]:
# Replace each review string with its list of normalized tokens.
# The column is overwritten in place, so this cell expects the raw text loaded from the CSV.
df['Review'] = df['Review'].map(normalize)
df.head()

In [None]:
# Count every token across all reviews and tabulate the 15 most frequent ones.
words_collection = Counter(word for tokens in df['Review'] for word in tokens)
top_words = words_collection.most_common(15)
freq_word_df = pd.DataFrame(top_words)
freq_word_df.columns = ['frequently_used_word', 'count']

# Display the table with a colour gradient applied column-wise.
freq_word_df.style.background_gradient(cmap='PuBuGn', low=0, high=0, axis=0, subset=None)

## Overall Wordcloud of Reviews

In [None]:
# Flatten the token lists of all reviews into one space-separated string.
word_list = [word for tokens in df['Review'] for word in tokens]
word_string = " ".join(word_list)

# Configure the cloud first, then build it from the joined text.
cloud_builder = WordCloud(stopwords=STOPWORDS, background_color='white',
                          max_words=60000, width=1000, height=650)
wordcloud = cloud_builder.generate(word_string)

In [None]:
# Draw the current `wordcloud` image on a 20x10 figure with the axes hidden.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

As expected, it is a mixture dominated by good words, with not much from the bad reviews. Let's break it down by sentiment.

## Positive Sentiment Wordcloud

In [None]:
# Keep only the reviews labelled positive (Sentiment == 2).
is_positive = df['Sentiment'] == 2
pos_df = df[is_positive]

# Tabulate the 15 most frequent tokens among the positive reviews.
words_collection = Counter(word for tokens in pos_df['Review'] for word in tokens)
top_words = words_collection.most_common(15)
freq_word_df = pd.DataFrame(top_words)
freq_word_df.columns = ['frequently_used_word', 'count']

freq_word_df.style.background_gradient(cmap='PuBuGn', low=0, high=0, axis=0, subset=None)

In [None]:
# Flatten the tokens of the positive reviews only.
word_list_pos = [item for sublist in pos_df['Review'] for item in sublist]
# Join the positive-only tokens (not `word_list`, which holds the tokens of ALL
# reviews and would make this cloud a copy of the overall one).
word_string_pos = " ".join(word_list_pos)

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      max_words=40000,
                      width=1000,
                      height=650
                      ).generate(word_string_pos)

In [None]:
# Draw the current `wordcloud` image on a 20x10 figure with the axes hidden.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Positive words like "good", "love", "great" can be seen from this wordcloud

## Neutral Sentiment Wordcloud

In [None]:
# Keep only the reviews labelled neutral (Sentiment == 1).
is_neutral = df['Sentiment'] == 1
neu_df = df[is_neutral]

# Tabulate the 15 most frequent tokens among the neutral reviews.
words_collection = Counter(word for tokens in neu_df['Review'] for word in tokens)
top_words = words_collection.most_common(15)
freq_word_df = pd.DataFrame(top_words)
freq_word_df.columns = ['frequently_used_word', 'count']

freq_word_df.style.background_gradient(cmap='PuBuGn', low=0, high=0, axis=0, subset=None)

In [None]:
# Flatten the tokens of the neutral reviews only.
word_list_neu = [item for sublist in neu_df['Review'] for item in sublist]
# Join the neutral-only tokens (not `word_list`, which holds the tokens of ALL
# reviews and would make this cloud a copy of the overall one).
word_string_neu = " ".join(word_list_neu)

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      max_words=6000,
                      width=1000,
                      height=650
                      ).generate(word_string_neu)

In [None]:
# Draw the current `wordcloud` image on a 20x10 figure with the axes hidden.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Ah, it seems "problem" sticks out a little here in the neutral reviews. Another word I noticed is "expensive". In future versions we could remove obvious words like "hotel" and "resort".

## Negative Sentiment Wordcloud

In [None]:
# Keep only the reviews labelled negative (Sentiment == 0).
is_negative = df['Sentiment'] == 0
neg_df = df[is_negative]

# Tabulate the 15 most frequent tokens among the negative reviews.
words_collection = Counter(word for tokens in neg_df['Review'] for word in tokens)
top_words = words_collection.most_common(15)
freq_word_df = pd.DataFrame(top_words)
freq_word_df.columns = ['frequently_used_word', 'count']

freq_word_df.style.background_gradient(cmap='PuBuGn', low=0, high=0, axis=0, subset=None)

In [None]:
# Flatten the tokens of the negative reviews only.
word_list_neg = [item for sublist in neg_df['Review'] for item in sublist]
# Join the negative-only tokens (not `word_list`, which holds the tokens of ALL
# reviews and would make this cloud a copy of the overall one).
word_string_neg = " ".join(word_list_neg)

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      max_words=10000,
                      width=1000,
                      height=650
                      ).generate(word_string_neg)

In [None]:
# Draw the current `wordcloud` image on a 20x10 figure with the axes hidden.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Some obvious words like "bad", "problem" can be seen in this wordcloud. Maybe when we expand the vocabulary we can clean the text a little better.

The model I am going to be using is the BERT model from the ktrain module. Before we get into the model, we need to turn the lists into strings.

In [None]:
# Collapse each review's token list back into a single space-separated string.
df['Review'] = df['Review'].apply(" ".join)
df.head()

# Training BERT model

In [None]:
# Build BERT-preprocessed data from the 'Review' text column, using 'Sentiment' as the label column.
# The second tuple is used as validation data when the learner is created.
# NOTE(review): no separate validation frame is passed, so the (x_test, y_test) split is
# presumably carved out of df by ktrain — confirm the split fraction and whether it is seeded.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(df, 
                                                                    'Review',
                                                                    label_columns=['Sentiment'],
                                                                    preprocess_mode='bert')

In [None]:
# Create a 'bert' text classifier from the training data and the `preproc` object
# produced by texts_from_df.
model = text.text_classifier(name='bert',
                             train_data=(x_train, y_train),
                             preproc=preproc)

In [None]:
# Wrap the model together with its training and validation data in a ktrain learner,
# using a batch size of 6.
learner = ktrain.get_learner(model=model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)

In [None]:
# Train for a single epoch via the learner's fit_onecycle method with lr=2e-5.
learner.fit_onecycle(lr=2e-5,
                     epochs=1)

# Final Notes

Accuracy is slightly lower when feeding BERT the more heavily cleaned data: 83.81% on training and 86.1% on validation. <br>

That's it for now in this notebook. <br><br> Upvotes will be greatly appreciated :)