In [None]:
#importing the libraries

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#Load the hotel reviews CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
)

In [None]:
#Preview the first rows of the loaded reviews
df.head()

In [None]:
#Count how many rows carry each distinct value of the "Rating" column
print("Star reviews and number of times they occur.")
df["Rating"].value_counts()

In [None]:
#Snapshot the DataFrame as a NumPy array of rows.
#It is taken here, before the sentiment columns are added to df further down.
df_arr = df.to_numpy()

In [None]:
#Set up the containers used by the rest of the notebook
l = len(df)  #number of rows in the DataFrame

#per-review TextBlob scores, filled in by the classifier section
polarity_arr, subjectivity_arr = [], []

#all reviews get concatenated into this single string
text_corpus = str()

# Analysis of Entire Text

In [None]:
#Build the corpus in one pass: every review is prefixed with "." as a separator.
#A single join avoids re-copying the whole (multi-megabyte) string on every
#iteration, and assigning (rather than appending) keeps the cell idempotent
#when it is re-run.
text_corpus = "".join("." + review for review in df["Review"])

In [None]:
#Show the first 500 characters of the concatenated corpus
text_corpus[0:500]

In [None]:
#Total number of characters in the corpus
len(text_corpus)

In [None]:
import string
#Display the punctuation characters that the next step filters out of the corpus
string.punctuation

In [None]:
#removing the punctuations

#Every punctuation character is replaced by a space instead of being deleted.
#Deleting it would glue together words that are separated only by punctuation
#(e.g. "arrival.checkin" -> "arrivalcheckin"), including the "." that was
#inserted between consecutive reviews.
#str.translate does this in a single pass over the corpus.
punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))

text_nopunct = text_corpus.translate(punct_to_space)

In [None]:
#Creating the tokenizer
#The pattern is a raw string: "\w" inside a plain string literal is an invalid
#escape sequence and triggers a warning on newer Python versions.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [None]:
#Tokenizing the text
#The tokenizer's pattern (\w+) yields each run of word characters as one token
text_tokens = tokenizer.tokenize(text_nopunct)
#Number of tokens found in the corpus
len(text_tokens)

In [None]:
#Normalise the tokens to lower case so that "Hotel" and "hotel" count as the same word.
#text_words holds the lower-cased version of every token, in the original order.
text_words = [token.lower() for token in text_tokens]


In [None]:
#Stop words are generally the most common words in a language.
#English stop words from nltk.
#NOTE(review): this needs the nltk "stopwords" corpus to be available locally - confirm for your environment

stopwords = nltk.corpus.stopwords.words('english')

In [None]:
#Now we need to remove the stop words from text_words.
#The stop words are put in a set first: membership tests on a set are
#constant-time, whereas the original list was scanned once for every token.
stopword_set = set(stopwords)

#final_words keeps, in order, every word of text_words that is not a stop word
final_words = [word for word in text_words if word not in stopword_set]

In [None]:
#using WordNetLemmatizer
#One lemmatizer instance is created here and reused for every word below

wn = WordNetLemmatizer()

In [None]:
#Lemmatize every remaining word; the order of final_words is preserved
lem_final_words = [wn.lemmatize(token) for token in final_words]

In [None]:
#The frequency distribution of the words
#Built from the lemmatized, stop-word-free tokens
freq_dist_text = nltk.FreqDist(lem_final_words)

In [None]:
#Frequency Distribution Plot
#A wide canvas is created first, then the distribution is plotted with an argument of 30
freq_fig, freq_ax = plt.subplots(figsize=(20, 12))
freq_dist_text.plot(30)

# We can see some interesting words. 

"Hotel", "room", "great", "time", etc. are among the most frequent words, and all of them are typical of a hotel review. These frequencies give us a first impression of what the reviews talk about.

In [None]:
#Join the lemmatized words back into one space-separated string,
#dropping tokens that consist only of digits
res_text = ' '.join(token for token in lem_final_words if not token.isdigit())

In [None]:
from wordcloud import WordCloud

In [None]:
#Word cloud of the cleaned corpus, limited to at most 100 words
cloud_settings = dict(background_color='black', max_words=100, width=1400, height=1200)

plt.subplots(figsize=(16, 10))
wordcloud = WordCloud(**cloud_settings).generate(res_text)

#draw the generated cloud without axis ticks
plt.imshow(wordcloud)
plt.title('Text Corpus WordCloud (100 words)')
plt.axis('off')
plt.show()

In [None]:
#Word cloud of the cleaned corpus, allowing up to 200 words
cloud_fig, cloud_ax = plt.subplots(figsize=(16, 10))
wordcloud = WordCloud(
    height=1200,
    width=1400,
    max_words=200,
    background_color='black',
).generate(res_text)

#render on the axes created above and hide the axis ticks
cloud_ax.imshow(wordcloud)
cloud_ax.set_title('Text Corpus WordCloud (200 words)')
cloud_ax.axis('off')
plt.show()

# Reviews Classifier

We will use the built-in methods of TextBlob to compute the polarity and subjectivity of each review.

In [None]:
#Score every review with TextBlob.
#The result lists are (re)created in this cell so that re-running it does not
#append a second copy of the scores, which would make the lists longer than
#the DataFrame and break the column assignment that follows.
polarity_arr = []
subjectivity_arr = []

#The review text is read by column name rather than by position in the row
for review in df["Review"]:
    sentiment = TextBlob(review).sentiment
    polarity_arr.append(sentiment.polarity)
    subjectivity_arr.append(sentiment.subjectivity)

In [None]:
#Attach the TextBlob scores to the DataFrame as two new columns
for column_name, scores in (
    ("Review_Polarity", polarity_arr),
    ("Review_Subjectivity", subjectivity_arr),
):
    df[column_name] = scores

In [None]:
#Check that the two sentiment columns are now present
df.head(3)

In [None]:
#df_arr was created before the sentiment columns were added, so it does not contain them
df_arr[0:3]

# Some Data Analysis

In [None]:
import seaborn as sns

In [None]:
#Distribution of the TextBlob polarity scores.
#sns.distplot is deprecated; histplot with a density-normalised histogram and
#a KDE overlay draws the equivalent figure.
plt.figure(figsize=(15,8))
sns.histplot(df["Review_Polarity"], kde=True, stat="density", kde_kws={"cut": 3})
plt.title("Distribution of Review Polarity")
plt.show()


In [None]:
#Distribution of the TextBlob subjectivity scores.
#sns.distplot is deprecated; histplot with a density-normalised histogram and
#a KDE overlay draws the equivalent figure.
plt.figure(figsize=(15,8))
sns.histplot(df["Review_Subjectivity"], kde=True, stat="density", kde_kws={"cut": 3})
plt.title("Distribution of Review Subjectivity")
plt.show()


With the polarity and subjectivity scores added, we have more valuable data about each review.

In [None]:
#Number of rows available for training
len(df_arr)

# Training the model on Data

In [None]:
from textblob.classifiers import NaiveBayesClassifier

Note:

I tried training with 20,000+ data points, but with that many data points TextBlob could not finish training in a Kaggle notebook.
Let us train with only the first 1000 data points instead.


In [None]:
#Keep only the first 1000 rows of the array as the training set
df_model = df_arr[:1000]

In [None]:
#Train TextBlob's Naive Bayes classifier on the training rows
#presumably each row is a (review text, rating) pair - verify against the column order of df_arr
cl = NaiveBayesClassifier(df_model)

# Classifying Text

In [None]:
#Classify a short hand-written review
cl.classify("The hotel is very good. Food was good, housekeeping could have been better. The staff was ok")
            

In [None]:
#A longer sample review used to try out the classifier
test_text=".before stay hotel arrange car service price 53 tip reasonable driver waiting arrival.checkin easy downside room picked 2 person jacuzi tub no bath accessories salts bubble bath did n't stay, night got 12/1a checked voucher bottle champagne nice gesture fish waiting room, impression room huge open space felt room big, tv far away bed chore change channel, ipod dock broken disappointing.in morning way asked desk check thermostat said 65f "

In [None]:
#Classify the longer sample review
cl.classify(test_text)

In [None]:
#The classifier returns a rating for this review, which serves as a quick sanity check of the model.
#With better computational resources (and therefore more training data), it can perform better.