# Sentiment Analysis of Hotel reviews


## Importing the packages for data analysis



In [None]:
! pip install nltk
! pip install wordcloud



In [None]:
# General packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# NLP packages
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from wordcloud import WordCloud

# Modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import re

from pylab import rcParams
import warnings
# Silence every warning for cleaner notebook output.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below - consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")
# Default size (width, height) applied to every figure in the notebook.
rcParams['figure.figsize'] = 14, 6
# Use matplotlib's 'ggplot' style sheet for all plots.
plt.style.use('ggplot')



## Reading the data



In [None]:
# Load the hotel reviews CSV and preview the first three rows.
reviews_csv_path = '../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
hotel_reviews = pd.read_csv(reviews_csv_path)
hotel_reviews.head(n=3)

In [None]:
# Word count per review. str.split() with no separator splits on any run of
# whitespace, so double spaces and leading/trailing blanks do not produce
# empty-string "words" that would inflate the count (split(" ") does).
words_per_review = hotel_reviews['Review'].apply(lambda text: len(text.split()))

# Histogram of review lengths.
words_per_review.hist(bins=100)
plt.xlabel('Review Length (words)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics of the review-length distribution.
average_words = words_per_review.mean()
length_skewness = words_per_review.skew()
print('Average words:', average_words)
print('Skewness:', length_skewness)

In [None]:
# Share of reviews, in percent, for each value of the Rating column.
rating_counts = hotel_reviews['Rating'].value_counts()
percent_val = 100 * rating_counts / len(hotel_reviews)
percent_val

In [None]:
# Bar chart of the per-rating percentages.
percent_val.plot(kind='bar')
plt.show()

In [None]:
# Binary target: 1 when the rating is above 3, otherwise 0.
hotel_reviews['Sentiment_rating'] = np.where(hotel_reviews.Rating > 3, 1, 0)

# Drop neutral (rating == 3) reviews. Take an explicit copy so that the
# columns added in later cells are written to an independent frame rather
# than to a slice of the unfiltered one.
hotel_reviews = hotel_reviews[hotel_reviews.Rating != 3].copy()

# Number of reviews in each sentiment class.
hotel_reviews['Sentiment_rating'].value_counts()

In [None]:
# Bar chart of the class balance of the binary sentiment target.
sentiment_counts = hotel_reviews['Sentiment_rating'].value_counts()
sentiment_counts.plot(kind='bar')
plt.show()

## Pre-processing



1. Converting words to lower/upper case
2. Removing special characters
3. Removing stopwords and high/low-frequency words
4. Stemming/lemmatization

### 1. Converting words to lower/upper case



In [None]:
# Lower-case every review so tokens differing only in case are merged.
lowercased_reviews = hotel_reviews.Review.str.lower()
hotel_reviews['reviews_text_new'] = lowercased_reviews

### Word tokenization

In [None]:
from nltk import word_tokenize

# Word tokenization example: the notebook echoes the resulting token list.
example_sentence = "DPhi Bootcamp rules. It is awesome :D"
word_tokenize(example_sentence)

In [None]:
# Vocabulary size of the original (mixed-case) reviews.
token_lists = list(map(word_tokenize, hotel_reviews['Review']))
tokens = []
for review_tokens in token_lists:
    tokens.extend(review_tokens)
print("Number of unique tokens then: ", len(set(tokens)))

# Vocabulary size after lower-casing.
token_lists_lower = list(map(word_tokenize, hotel_reviews['reviews_text_new']))
tokens_lower = []
for review_tokens in token_lists_lower:
    tokens_lower.extend(review_tokens)
print("Number of unique tokens now: ", len(set(tokens_lower)))

### 2. Removing special characters

In [None]:
# Per review, collect every character that is neither alphanumeric nor a space.
spl_chars = hotel_reviews['reviews_text_new'].apply(
    lambda review: [ch for ch in review if ch != ' ' and not ch.isalnum()]
)

# Flatten the per-review lists into one list.
flat_list = []
for review_chars in spl_chars:
    flat_list.extend(review_chars)

# Distinct special characters found in the corpus.
set(flat_list)

In [None]:
# Keep the untouched text so it can be compared with the cleaned version.
review_backup = hotel_reviews['reviews_text_new'].copy()

# Replace every run of non-alphanumeric characters with a single space.
# regex=True is required: recent pandas versions treat the pattern as a
# literal string by default, which would leave the text unchanged.
hotel_reviews['reviews_text_new'] = hotel_reviews['reviews_text_new'].str.replace(
    r'[^A-Za-z0-9]+', ' ', regex=True
)



In [None]:
# Show the same review before and after special-character removal.
# Both lookups are positional so they always refer to the same row, whatever
# index labels remain after the neutral reviews were filtered out.
sample_position = 7
print("- Old Review -")
print(review_backup.iloc[sample_position])
print("\n- New Review -")
print(hotel_reviews['reviews_text_new'].iloc[sample_position])

In [None]:
# Preview the first five rows (head() defaults to n=5).
hotel_reviews.head()

### 3. Removing stop words

In [None]:
from nltk.corpus import stopwords

# Report the NLTK version that is actually installed rather than a hard-coded
# one, then list the languages for which a stop-word corpus is available.
print('Available languages for NLTK v.%s: ' % nltk.__version__)
print(stopwords.fileids())

In [None]:
# No extra noise words are defined; this empty list is what the vectorizers
# further down receive as their stop-word list.
noise_words = list()

# English stop words shipped with NLTK (echoed by the notebook).
eng_stop_words = stopwords.words('english')
eng_stop_words

In [None]:
# Set membership tests are O(1), unlike lookups in the original list.
stop_words = set(eng_stop_words)

# Positional access: after the neutral reviews were filtered out, the index
# labels are no longer guaranteed to contain 0.
sentence = hotel_reviews['reviews_text_new'].iloc[0]
words = nltk.word_tokenize(sentence)

# Partition the tokens of the sample review into stop words and the rest.
stopword = [word for word in words if word in stop_words]
without_stop_words = [word for word in words if word not in stop_words]

print('-- Original Sentence --\n', sentence)
print('\n-- Stopwords in the sentence --\n', stopword)
print('\n-- Non-stopwords in the sentence --\n', without_stop_words)

In [None]:
def stopwords_removal(stop_words, sentence):
    """Tokenize *sentence* and return the tokens not contained in *stop_words*."""
    kept_tokens = []
    for token in nltk.word_tokenize(sentence):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Token list of each cleaned review with the stop words removed.
hotel_reviews['reviews_text_nonstop'] = hotel_reviews['reviews_text_new'].map(
    lambda text: stopwords_removal(stop_words, text)
)

# Show the cleaned text next to its stop-word-free token list.
comparison_columns = ['reviews_text_new', 'reviews_text_nonstop']
hotel_reviews[comparison_columns]

In [None]:
# Compare one review before and after stop-word removal.
# "Old" is the cleaned text that still contains stop words; "New" is the
# token list produced by stopwords_removal.
print("- Old Review -")
print(hotel_reviews['reviews_text_new'][6])
print("\n- New Review -")
print(hotel_reviews['reviews_text_nonstop'][6])

### 4. Stemming & lemmatization

In [None]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

word_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
# Tokenize first, then print each token next to its Porter stem.
nltk_tokens = word_tokenize(word_data)
for token in nltk_tokens:
    stem = porter_stemmer.stem(token)
    print("Actual: %s  Stem: %s" % (token, stem))

In [None]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

word_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
# Print each token next to its WordNet lemma.
nltk_tokens = nltk.word_tokenize(word_data)
for token in nltk_tokens:
    lemma = wordnet_lemmatizer.lemmatize(token)
    print("Actual: %s  Lemma: %s" % (token, lemma))

## Building a machine learning model

In [None]:
# Preview the raw text, the rating and the derived binary target.
model_columns = ['Review', 'Rating', 'Sentiment_rating']
hotel_reviews[model_columns].head()

## n-grams



In [None]:
from nltk import ngrams

sentence = 'A bird in the hand worths two in the bush'

# Print the 1-grams through 5-grams of the whitespace-split sentence.
sentence_words = sentence.split()
for n in range(1, 6):
    print('%d-grams:\n' % n, list(ngrams(sentence_words, n)))

### Bag-of-words

In [None]:
# The following code creates a word-document matrix.
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X = vec.fit_transform(hotel_reviews['reviews_text_new'])

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# NOTE: X.toarray() materialises a dense (documents x vocabulary) array,
# which can be very large for a full corpus.
df = pd.DataFrame(X.toarray(), columns=feature_names)
df.head()

In [None]:
# Unigram bag-of-words vectorizer using NLTK's tokenizer and the
# noise_words list as stop words.
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=noise_words,
    ngram_range=(1, 1),
)

# Learn the vocabulary and build the document-term count matrix.
bow_data = bow_counts.fit_transform(hotel_reviews.reviews_text_new)

In [None]:
# Echo the unigram document-term matrix returned by fit_transform.
bow_data

# Divide into training and test sets:

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split reproducible.
sentiment_labels = hotel_reviews['Sentiment_rating']
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_data,
    sentiment_labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fraction of each sentiment class in the test labels.
test_class_counts = y_test_bow.value_counts()
test_class_counts / len(y_test_bow)

# Applying logistic regression

In [None]:
# Fit a logistic regression with default settings on the unigram counts.
lr_model_all = LogisticRegression()
lr_model_all.fit(X=X_train_bow, y=y_train_bow)

# Predict the class of every held-out review.
test_pred_lr_all = lr_model_all.predict(X_test_bow)

# Report the F1 score on the test set.
bow_f1 = f1_score(y_test_bow, test_pred_lr_all)
print("F1 score: ", bow_f1)

In [None]:
# Differences from the unigram vectorizer:
#   1. n-grams of length 1 through 4 are extracted instead of only 1-grams.
#   2. No stop-word list is passed, so stop words stay in the features.
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4),
)

# Rebuild the document-term matrix with the larger n-gram vocabulary.
bow_data = bow_counts.fit_transform(hotel_reviews['reviews_text_new'])

In [None]:
# Echo the n-gram document-term matrix; compare its number of features with
# the unigram matrix to see the increase caused by including n-grams.
bow_data

In [None]:
# Same 80/20 split and seed as before, now on the n-gram features.
sentiment_labels = hotel_reviews['Sentiment_rating']
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_data,
    sentiment_labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Logistic regression on the n-gram features, allowing up to 200 solver
# iterations.
lr_model_all_new = LogisticRegression(max_iter=200)
lr_model_all_new.fit(X=X_train_bow, y=y_train_bow)

# Predict the held-out reviews and report the F1 score.
test_pred_lr_all = lr_model_all_new.predict(X_test_bow)
ngram_f1 = f1_score(y_test_bow, test_pred_lr_all)
print("F1 score: ", ngram_f1)

## TF-IDF model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF vectorizer using NLTK's tokenizer and the noise_words list
# as stop words.
tfidf_counts = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=noise_words,
    ngram_range=(1, 1),
)

# Learn the vocabulary and build the TF-IDF weighted matrix.
tfidf_data = tfidf_counts.fit_transform(hotel_reviews.reviews_text_new)

In [None]:
# Echo the TF-IDF document-term matrix returned by fit_transform.
tfidf_data

In [None]:
# 80/20 split of the TF-IDF features with the same seed as the bag-of-words
# splits.
sentiment_labels = hotel_reviews['Sentiment_rating']
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    tfidf_data,
    sentiment_labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Logistic regression with default settings on the TF-IDF features.
lr_model_tf_idf = LogisticRegression()

# Train the model.
lr_model_tf_idf.fit(X_train_tfidf, y_train_tfidf)

# Predict the held-out reviews.
test_pred_lr_all = lr_model_tf_idf.predict(X_test_tfidf)

# Evaluate against the labels of the TF-IDF split itself (not the
# bag-of-words split), so the score stays correct even if the two splits
# ever use different parameters.
print("F1 score: ", f1_score(y_test_tfidf, test_pred_lr_all))

In [None]:
from joblib import load 
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Load the persisted sentiment model from a path relative to the working
# directory.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
loaded_joblib_model = load(filename="utils/Sentiment_Analysis_unigram.joblib")
# Feature vocabulary read from the model's `feature_names` attribute; its
# order defines the layout of the vectors built in sentiment_analyzer.
# NOTE(review): presumably attached to the model when it was saved - confirm.
feats = loaded_joblib_model.feature_names
feats_len = len(feats)

def sentiment_analyzer(sent):
    """Predict the sentiment label of a single piece of raw text.

    The text is lower-cased, stripped of ASCII punctuation, tokenized,
    filtered of English stop words and lemmatized. The remaining tokens are
    counted and laid out in the order of the module-level ``feats`` so the
    vector can be passed to ``loaded_joblib_model.predict``.

    Args:
        sent: Raw input text (a ``str``).

    Returns:
        The first (and only) element of the model's prediction for ``sent``.
    """
    # Local import keeps the function usable even when this cell is run
    # without the notebook's top-level imports.
    from collections import Counter

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Normalise: lower-case, then delete every ASCII punctuation character.
    cleaned = sent.lower().translate(str.maketrans('', '', string.punctuation))

    # Tokenize, drop stop words, then lemmatize what is left.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]

    # Count vector aligned with the model vocabulary; Counter yields 0 for
    # features that do not occur in the sentence.
    counts = Counter(lemmas)
    sent_features = [counts[feature] for feature in feats]

    joblib_y_preds = loaded_joblib_model.predict([sent_features])
    return joblib_y_preds[0]
