In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
# pad_sequences pads every encoded review so that all inputs have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Dropout
from tensorflow.keras.layers import Dense
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Dataset

In [None]:
# Read the TripAdvisor hotel reviews CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    "/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
)
df.head()

In [None]:
# Collapse the low ratings (1, 2 and 3) into the negative class, label 0.
# NOTE: this rewrites df["Rating"] in place. Re-running this cell after the
# next one has mapped 4/5 to 1 would turn those positives into 0 as well.
low_rating_mask = df["Rating"].isin([1, 2, 3])
df.loc[low_rating_mask, "Rating"] = 0

In [None]:
# Collapse the high ratings (4 and 5) into the positive class, label 1.
high_rating_mask = df["Rating"].isin([5, 4])
df.loc[high_rating_mask, "Rating"] = 1

In [None]:
# Preview the first rows again; Rating should now show the 0/1 labels assigned above.
df.head()

In [None]:
# Bar chart of how many reviews carry each Rating value.
sns.countplot(data=df, x="Rating")

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Separate the review text (independent variable) from the label (dependent variable).
X, y = df["Review"], df["Rating"]

In [None]:
# Display the shapes of the feature and target Series.
X.shape,y.shape

# Data Preprocessing

In [None]:
# Vocabulary size: passed to one_hot as the hashing space and to the
# Embedding layer as its input dimension.
voc_size = 10_000

In [None]:
# Download the NLTK stopword corpus; the cleaning loop below reads the
# English list from it via stopwords.words("english").
nltk.download("stopwords")

In [None]:
# Build the cleaned corpus: for every review keep letters only, lowercase it,
# drop English stopwords and Porter-stem the remaining words.
ps=PorterStemmer()
# Load the stopword list once and keep it as a set. Evaluating
# stopwords.words("english") for every single token is loop-invariant work
# followed by a linear scan of a list, which dominates the runtime of this cell.
english_stopwords=set(stopwords.words("english"))
corpus=[]
# Iterate over the values directly instead of X[i]: label-based lookup only
# works while the Series still has its default 0..n-1 index.
for text in X:
    #Removing everything other than alphabets from the text
    review=re.sub("[^a-zA-Z]"," ",text)
    #Converting the review into lowercase tokens
    review=review.lower().split()
    #Removing the stopwords and stemming what is left
    review=[ps.stem(word) for word in review if word not in english_stopwords]
    review=" ".join(review)
    corpus.append(review)

In [None]:
# Encode each cleaned review as a list of integer word indices using Keras'
# one_hot with voc_size as the hashing space.
# NOTE(review): per the Keras docs one_hot hashes with Python's built-in hash,
# which is not stable across interpreter sessions - confirm this is acceptable
# if the encoding ever needs to be reproduced outside this run.
onehot = [one_hot(document, voc_size) for document in corpus]

In [None]:
# Bring every encoded review to a common length by prepending zeros to the
# shorter ones. No maxlen is given, so the longest encoded review sets the length.
embedded_docs = pad_sequences(sequences=onehot, padding="pre")
embedded_docs

In [None]:
# Number of padded sequences (one per review).
len(embedded_docs)

In [None]:
# Length of the first padded sequence, i.e. the common padded length.
len(embedded_docs[0])

# Creating the Model

In [None]:
# Binary sentiment classifier:
# Embedding -> Dropout -> LSTM(100) -> Dropout -> single sigmoid unit.
embedding_vector_features=40
# The network is fed the padded index sequences, so the declared input length
# must be their width. len(X[0]) would be the character count of the first raw
# review string, which is unrelated to the padded sequence length.
padded_length=len(embedded_docs[0])
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=padded_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
# One sigmoid output paired with binary cross-entropy for the 0/1 Rating label.
model.add(Dense(1,activation="sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Sanity check: the number of padded sequences next to the shape of the label Series.
len(embedded_docs),y.shape

In [None]:
# Materialise the padded sequences and the labels as NumPy arrays for
# splitting and training (np comes from the import cell at the top).
X_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
# Hold out 30% of the samples as a test set; random_state=0 fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    random_state=0,
    test_size=0.3,
)

# Training the model

In [None]:
# Train for 5 epochs in mini-batches of 64, scoring the held-out split after
# every epoch; the returned History object feeds the plots below.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

# Visualising the result

In [None]:
# Plot the per-epoch curves recorded by model.fit: accuracy on the first
# figure, loss on a second one. Training values are drawn as dots ('bo'),
# validation values as a solid line ('b').
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

curves = [
    (acc, val_acc, 'Training accuracy', 'Validation accuracy',
     'Training and validation accuracy'),
    (loss, val_loss, 'Training Loss', 'Validation Loss',
     'Training and validation loss'),
]
for position, (train_values, val_values, train_label, val_label, heading) in enumerate(curves):
    # Every curve pair after the first gets its own new figure.
    if position:
        plt.figure()
    plt.plot(epochs, train_values, 'bo', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(heading)
    plt.legend()

plt.show()