In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as layer
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from wordcloud import WordCloud 
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns


import numpy as np 
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords

import random as rn

import re

In [None]:
# Load the hotel reviews CSV from the Kaggle input mount into a DataFrame.
data = pd.read_csv(
    "/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
)

In [None]:
# Preview the first rows, then report the (rows, columns) shape of the frame.
print(data.head())

print(f"Data shape {data.shape}")

Let's check the distribution of Rating

In [None]:
# Count the reviews per rating value and draw the counts as a bar chart.
check = data['Rating'].value_counts()
check.plot(kind='bar')

In [None]:
# Number of missing values in each column.
data.isna().sum()

### Most Used Words


In [None]:
def wordCloud_generator(data, title=None):
    """Render a word cloud built from a collection of text entries.

    All entries are joined into one space-separated string, handed to
    ``WordCloud.generate``, and the resulting image is shown in an 8x8 inch
    matplotlib figure with the axes hidden.

    Parameters
    ----------
    data : pandas.Series or other iterable of text
        Text entries to visualise. Missing values are skipped and any
        non-string entry is converted with ``str`` before joining.
    title : str, optional
        Heading drawn above the image; omitted when ``None``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Drop missing entries and coerce the rest to text so that str.join does
    # not raise TypeError on NaN or numeric values.
    texts = pd.Series(data).dropna().astype(str)
    wordcloud = WordCloud(width=800, height=800,
                          background_color='black',
                          min_font_size=10
                         ).generate(" ".join(texts))

    # plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    # Set the title before tight_layout so the layout pass can account for it;
    # with pad=0 a title added afterwards is not considered by the layout.
    if title is not None:
        plt.title(title, fontsize=30)
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Word cloud over the raw (not yet preprocessed) review text.
wordCloud_generator(
    data['Review'],
    title="Most used words in reviews",
)


## Data Preprocessing

In [None]:
# Separate the review text (X) from the rating (y); both are copies of the
# corresponding DataFrame columns.
X, y = data['Review'].copy(), data['Rating'].copy()

In [None]:
# Shared Porter stemmer instance; dataPreprocessor calls ps.stem on every token.
ps = PorterStemmer() 
# English stop-word list; dataPreprocessor drops any token found in it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed - confirm.
stop_words = stopwords.words('english')
def dataPreprocessor(review):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order: replace non-word characters with spaces, drop stray
    single ASCII letters, collapse whitespace, lowercase, remove the words
    listed in the module-level ``stop_words`` and stem what is left with the
    module-level stemmer ``ps``.

    Parameters
    ----------
    review : object
        The raw review; it is converted with ``str`` before processing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    processed_feature = re.sub(r'\W', ' ', str(review))

    # Remove single letters that stand alone between whitespace. The lookahead
    # leaves the trailing whitespace unconsumed so that consecutive single
    # letters ("a b c") are all removed instead of every other one.
    processed_feature = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', processed_feature)

    # Remove a single letter at the very start of the text. The caret must be
    # an unescaped anchor; an escaped one looks for a literal "^" character,
    # which the first substitution has already turned into a space.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)

    # Collapse runs of whitespace into a single space.
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Lowercase before the stop-word check so capitalised words match too.
    processed_feature = processed_feature.lower()

    # Remove stop words; a set gives constant-time membership tests instead of
    # scanning the whole list for every token.
    stop_word_set = set(stop_words)
    tokens = [word for word in processed_feature.split()
              if word not in stop_word_set]

    # Stem the remaining words and rebuild the text.
    return " ".join(ps.stem(word) for word in tokens)


# Clean every review with dataPreprocessor. This has to run while X is still
# the pandas Series of raw reviews: the tokenizing cell further down rebinds X
# to the encoded sequences, after which X.apply is no longer available.
X_cleaned = X.apply(dataPreprocessor)
    
    

In [None]:
# Show the review stored under label 1 before and after cleaning.
for caption, text in (
    ("Origional :", X[1]),
    ("After Preprocessing :", X_cleaned[1]),
):
    print(caption, text)

## Distribution of sentence length

In [None]:
# Histogram of review lengths, measured as the number of space-separated
# pieces in each cleaned review.
length_dist = list(map(lambda cleaned: len(cleaned.split(" ")), X_cleaned))
plt.hist(length_dist, bins=20)
plt.show()

In [None]:
# Fit the vocabulary on the cleaned reviews and encode each review as a list
# of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_cleaned)
sequences = tokenizer.texts_to_sequences(X_cleaned)

max_length = max(len(seq) for seq in sequences)
vocab_size = len(tokenizer.word_index) + 1
exp_sen = 1  # label of the example review printed below

print(f"Vocabulary size: {vocab_size}")
print(f"max length of sentence: {max_length}")
print("\nExample:\n")
print(f"Sentence:\n{X_cleaned[exp_sen]}")
print(f"\nAfter tokenizing :\n{sequences[exp_sen]}")

# Bring every sequence to a fixed length of 350, padding at the front.
X = pad_sequences(sequences, padding='pre', maxlen=350)
print(f"\nAfter padding :\n{X[exp_sen]}")

In [None]:
# One-hot encode the ratings as a dense array. scikit-learn renamed the
# ``sparse`` argument to ``sparse_output`` and later removed the old name, so
# try the current spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
y = np.asanyarray(y)
# The encoder expects a 2-D column, hence the reshape to (n_samples, 1).
y_encoded = encoder.fit_transform(y.reshape((-1, 1)))


In [None]:
# Show one encoded label next to the raw label it was derived from.
sample = 5
print(y_encoded[sample], y[sample], sep="\n")

## Train_Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing, stratified on the raw labels in y.
TEST_FRACTION = 0.33
SPLIT_SEED = 67
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)

## Model Building & Training

In [None]:
# Training and architecture hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 32
embedding_dim = 32
units = 32

# Embedding -> two stacked bidirectional LSTMs -> flatten -> dense head with
# dropout before each dense layer, ending in a 5-unit softmax.
model = tf.keras.Sequential()
model.add(layer.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]))
model.add(layer.Bidirectional(layer.LSTM(units, return_sequences=True)))
model.add(layer.Bidirectional(layer.LSTM(units, return_sequences=True)))
model.add(layer.Flatten())
model.add(layer.Dropout(0.3))
model.add(layer.Dense(2048, activation="relu"))
model.add(layer.Dropout(0.3))
model.add(layer.Dense(512, activation="relu"))
model.add(layer.Dropout(0.3))
model.add(layer.Dense(5, activation="softmax"))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train on the training partition; validation_split=0.1 sets aside a tenth of
# it for validation during fitting.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
)
