In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the TripAdvisor hotel reviews CSV into a DataFrame
reviews_csv_path = '/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
trip = pd.read_csv(reviews_csv_path)

In [None]:
# Preview the first five rows of the raw data
trip.head(5)

In [None]:
# Count the missing (null) values in each column
null_mask = trip.isna()
null_mask.sum()

**No Null Values**

In [None]:
# Collect the index labels of reviews that are empty or contain only whitespace,
# so that they can be removed if any are found.
# str.isspace() returns False for the empty string "", so a truly empty review
# would slip through; `not review.strip()` catches both "" and whitespace-only text.
# Iterating over the Review column directly also avoids unpacking whole rows,
# which would fail if the CSV ever had more than two columns.
empty = [
    idx
    for idx, review in trip['Review'].items()
    if isinstance(review, str) and not review.strip()
]

print(f"There are total {len(empty)} empty strings as review.")

**Great! There are no empty strings present as reviews.**

In [None]:
# Count how many reviews fall under each star rating
rating_counts = trip['Rating'].value_counts()
rating_counts

In [None]:
# Pie chart of how the reviews are distributed over the star ratings
fig, ax = plt.subplots(figsize=(12, 8))
reviews_per_rating = trip.groupby('Rating').size()
reviews_per_rating.plot(kind='pie', y='Rating', legend=True, ax=ax)
ax.set_ylabel('Rating')

The number of reviews with rating 5 is 9054 and combined reviews with 1 and 2 rating is (1793+1421) = 3214
If we create and train our model with this data, it will be very weak at predicting the correct rating.
It will get enough reviews with 5 rating to train on but will not get sufficient reviews to train for 1st or 2nd rating.

One workaround is to modify the Rating column. We can take the reviews with rating 5 and call them Positive reviews, and combine the reviews with rating 1 and rating 2 and call them Negative reviews. We will ignore the reviews with rating 3 or 4.

In [None]:
# Let's create a new data frame that keeps only the 5-star reviews and the 1- and 2-star reviews.
# .copy() makes df an independent frame: without it df is the result of chained indexing
# on `trip`, and assigning a column to it below raises pandas' SettingWithCopyWarning.
df = trip.loc[trip['Rating'].isin([1, 2, 5]), ['Review', 'Rating']].copy()

# Let's modify the Rating column: 5 stars -> 'Pos', 1 or 2 stars -> 'Neg'
df['Rating'] = df['Rating'].apply(lambda rating: 'Pos' if rating==5 else 'Neg')

In [None]:
# Preview the first five rows of the relabelled data
df.head(5)

**We will use the new dataframe df to create and train our model.**

In [None]:
# The first rows of df were already displayed in the previous cell; show how many
# reviews ended up under each label instead, so the class balance of the
# training data is visible.
df['Rating'].value_counts()

In [None]:
# Let's assign an integer id to every unique word present in the Review column
from tensorflow.keras.preprocessing.text import Tokenizer

review_texts = df['Review']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)

In [None]:
# Let's see the total number of unique words present in all the reviews combined.
# tokenizer.index_word is a dictionary that can be used to view all the unique words and their ids.
word_by_id = tokenizer.index_word
vocab_size = len(word_by_id)

print(f'There are total {vocab_size} unique words present')

In [None]:
# Now let's replace each word in every review with its token id
texts_to_encode = df['Review']
sequences = tokenizer.texts_to_sequences(texts_to_encode)

In [None]:
# Show only the first 20 (id -> word) pairs: the full dictionary has one entry
# per unique word and would flood the notebook output.
dict(list(tokenizer.index_word.items())[:20])

In [None]:
# To feed an RNN-based NLP model, every input must have the same length.
# We can achieve this by padding.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# We keep the length of every input at 100. Inputs shorter than 100 tokens get 0s appended
# at the end (padding='post'); inputs longer than 100 tokens lose their extra tokens from
# the start (truncating='pre', which is the library default, written out here).
sequences_padded = pad_sequences(
    sequences,
    maxlen=100,
    padding='post',
    truncating='pre',
)


In [None]:
# Display the padded token-id array produced by pad_sequences (maxlen=100); this is the model input
sequences_padded # input data

In [None]:
# The input variable is now in the right format to create and train our model.
# Let's check which distinct values the output variable takes.
labels_present = df['Rating'].unique()
labels_present

In [None]:
# We can replace 'Pos' with 1 and 'Neg' with 0.
# The comparison must use == (value equality). `is` tests object identity, which for
# strings only happens to hold when both sides are the same interned object; it would
# silently label every review 0 otherwise, and Python 3.8+ emits a SyntaxWarning for it.
y = (df['Rating'] == 'Pos').astype('int64').to_numpy()

y

In [None]:
# Our input and output variables are ready. Let's split the data into a train set and a test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    sequences_padded,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the same split is produced on every run
    stratify=y,       # keep the Pos/Neg proportion identical in both sets (the classes are imbalanced)
)

In [None]:
# Libraries to create and train our model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding

In [None]:
# Model: embedding -> two stacked LSTM layers -> two dense layers -> sigmoid output
max_len = 100 # length of each input
embedding_size = 32

model = Sequential([
    # vocab_size+1 rows: one per word id plus one for the 0 used as padding
    Embedding(vocab_size+1, embedding_size, input_length=max_len),

    # first LSTM returns the full sequence so that it can feed the second LSTM
    LSTM(150, return_sequences=True),
    Dropout(0.2),

    LSTM(150),
    Dropout(0.2),

    Dense(256, activation='relu'),
    Dropout(0.2),

    Dense(128, activation='relu'),

    # single sigmoid unit: the output is compared against 0.5 when predicting
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train the model for 15 epochs; the test split is passed as validation data
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=64,
)

In [None]:
# Put the per-epoch training history recorded by the last fit() call into a DataFrame
training_log = model.history.history
metrics = pd.DataFrame(training_log)

metrics.head(5)

In [None]:
# Training loss versus validation loss per epoch
metrics.loc[:, ['loss', 'val_loss']].plot(kind='line')

In [None]:
# Training accuracy versus validation accuracy per epoch
metrics.loc[:, ['accuracy', 'val_accuracy']].plot(kind='line')

In [None]:
# Let's predict on a few hand-written reviews.
# The review text is wrapped in a one-element list; it is passed as-is to
# tokenizer.texts_to_sequences in the next cell.
neg_review = ['Rooms were old. Staff difficult to reach. Food bad. Loud room parties. Kid based place. No concierge room available. Pick another hotel. This is definitely on a low level with little interest in higher']

# This is a negative review. Let's check if the model is able to predict it.

In [None]:
# Let's tokenize the review and pad it so that it is in the format accepted by the model
neg_review_token = tokenizer.texts_to_sequences(neg_review)

# padding: reuse max_len (the input length the model was built with) instead of
# repeating the literal 100, so the two cannot drift apart
neg_review_padded = pad_sequences(neg_review_token, maxlen=max_len, padding='post')

In [None]:
# Threshold the sigmoid output at 0.5 to obtain a 0/1 class label
neg_review_score = model.predict(neg_review_padded)
review_predict = (neg_review_score > 0.5).astype('int32')

In [None]:
# 1 means a positive review and 0 means a negative review
predicted_negative = review_predict[0] == 0
print("It's a negative review" if predicted_negative else "It's a positive review")

In [None]:
# Let's try another one. This time we will take a positive review
pos_review = ["We were booked at a hotel across the street originally, and it was disgusting! So we went on the Hotel Tonight app and found this hotel! It was beautiful, and the staff was very friendly. The rooms are clean and modern. Very impressed with this hotel!"]

# Tokenization
# Each stage gets its own name so that pos_review keeps holding the raw text
# instead of being overwritten first by token ids and then by the padded array.
pos_review_token = tokenizer.texts_to_sequences(pos_review)

# padding: max_len is the input length the model was built with
pos_review_padded = pad_sequences(pos_review_token, maxlen=max_len, padding='post')

# prediction: threshold the sigmoid output at 0.5; cast to 'int32' like the other prediction cells
review_predict = (model.predict(pos_review_padded)>0.5).astype('int32')

# 1 means a positive review and 0 means a negative review
if review_predict[0] == 0:
    print("It's a negative review")
else:
    print("It's a positive review")

In [None]:
# Predict on the held-out test set and threshold the sigmoid output at 0.5
test_scores = model.predict(X_test)
prediction = (test_scores > 0.5).astype('int32')

In [None]:
# Evaluation
from sklearn.metrics import classification_report

In [None]:
# Precision, recall and F1 per class on the test set
report_text = classification_report(y_test, prediction)
print(report_text)

# Model Accuracy on test data - 94%

# Thank you