In [None]:
import pandas as pd
import numpy as np
import os
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Dropout, Bidirectional, Conv1D, MaxPooling1D
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Seed every random number generator this notebook controls directly so that
# Restart & Run All reproduces the same train/validation split.
# NOTE(review): the Keras backend keeps its own RNG, which is not seeded here, so
# model weight initialisation (and therefore training results) can still vary.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the TripAdvisor hotel reviews CSV from the Kaggle input directory.
# Later cells read its "Review" column (text fed to the tokenizer) and its
# "Rating" column (the prediction target).
df = pd.read_csv("/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv")

In [None]:
# Preview the first rows to confirm the CSV loaded as expected.
df.head()

In [None]:
# Randomly assign each review to "train" or "valid". A dedicated RNG seeded here
# (rather than the shared global `random` state) makes this cell idempotent:
# re-running it reproduces the same split instead of drawing a new one.
# randrange(0, 100) > 10 keeps draws 11..99 for training, i.e. roughly 89% train /
# 11% valid.  NOTE(review): use `>= 10` if an exact 90/10 split was intended.
split_rng = random.Random(42)
df["split"] = [
    "train" if split_rng.randrange(0, 100) > 10 else "valid"
    for _ in range(len(df))
]

In [None]:
# Sanity check: how many reviews landed in each split.
df["split"].value_counts()

In [None]:
# Distribution of the target values (class balance across ratings).
df["Rating"].value_counts()

In [None]:
# Materialise the two partitions using the labels stored in the "split" column.
train_mask = df["split"] == "train"
valid_mask = df["split"] == "valid"
df_train = df[train_mask]
df_val = df[valid_mask]

In [None]:
# Build the vocabulary from the training reviews only, so no validation text
# influences the word index. The literal token "'oov'" (quotes included) is
# passed as the tokenizer's out-of-vocabulary token.
tokenizer=Tokenizer(oov_token="'oov'")
tokenizer.fit_on_texts(df_train['Review'])

In [None]:
# Sequence length passed to pad_sequences for every review.
maxlen = 200

# Step 1: map each review to a list of word indices using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(df_train['Review'])
val_sequences = tokenizer.texts_to_sequences(df_val['Review'])

# Step 2: bring every sequence to the common length `maxlen`.
train_X = pad_sequences(train_sequences, maxlen=maxlen)
val_X = pad_sequences(val_sequences, maxlen=maxlen)

In [None]:
# Raw rating values: used as the regression target and for accuracy scoring.
train_Y = df_train["Rating"]
val_Y = df_val["Rating"]

# One-hot targets for the classifier. Ratings are shifted down by one so that
# they index the 5 classes as 0..4.
train_Y_cat = to_categorical(train_Y - 1, num_classes=5)
val_Y_cat = to_categorical(val_Y - 1, num_classes=5)

In [None]:
glove_dir="/kaggle/input/glove-global-vectors-for-word-representation/"

# Parse the GloVe text file into a {word: float32 vector} lookup. Each line holds
# a token followed by its whitespace-separated coefficients.
embedding_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        embedding_index[word] = coefs
print('Found %s word vectors ' % len(embedding_index))

In [None]:
# Width of each embedding vector; matches the 100-d GloVe file loaded earlier.
embedding_dim = 100
# One row per tokenizer index plus one extra row for index 0.
max_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((max_words, embedding_dim))

# Copy the pre-trained vector of every vocabulary word that GloVe knows;
# words without a vector keep their all-zero row.
for word, idx in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[idx] = embedding_vector

## Regression

In [None]:
# Regression network: frozen pre-trained embeddings -> bidirectional LSTM ->
# small dense layer -> a single linear output trained with mean squared error
# against the raw rating value.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(32)),
    Dense(16, activation="relu"),
    Dense(1, activation="linear"),
])
model.compile(optimizer="Adam", loss='mean_squared_error', metrics=['mse'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train the regression model on the raw ratings, evaluating on the validation
# split after every epoch.
model.fit(
    train_X,
    train_Y,
    validation_data=(val_X, val_Y),
    batch_size=256,
    epochs=30,
)

In [None]:
# Continuous rating predictions for the validation reviews. `model` is the
# regression network at this point; both `model` and `pred` are reassigned in
# the Classification section, so run the cells in order.
pred = model.predict(val_X)

In [None]:
# Round each continuous prediction to the nearest whole rating. Vectorised
# np.rint replaces the per-element Python round() loop (both round halves to the
# nearest even integer), and astype(int) makes the result an integer array
# explicitly so it compares cleanly with the integer ratings.
pred_hard = np.rint(pred[:, 0]).astype(int)

In [None]:
# Clamp rounded predictions into the valid rating range [1, 5].
pred_hard = np.clip(pred_hard, 1, 5)

In [None]:
# Show which distinct rating values the rounded predictions actually take.
np.unique(pred_hard)

In [None]:
# Exact-match accuracy of the rounded regression predictions against the true
# validation ratings.
accuracy_score(val_Y, pred_hard)

## Classification

In [None]:
# Classification network: frozen pre-trained embeddings -> bidirectional LSTM ->
# small dense layer -> 5-way softmax trained with categorical cross-entropy
# against the one-hot ratings. Rebinding `model` replaces the regression model.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(32)),
    Dense(16, activation="relu"),
    Dense(5, activation="softmax"),
])
model.compile(optimizer="Adam", loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train the classifier on the one-hot ratings, evaluating on the validation
# split after every epoch.
model.fit(
    train_X,
    train_Y_cat,
    validation_data=(val_X, val_Y_cat),
    batch_size=256,
    epochs=30,
)

In [None]:
# Per-class outputs of the 5-way softmax classifier for the validation reviews.
# This overwrites the `pred` array produced in the Regression section.
pred = model.predict(val_X)

In [None]:
# The highest-scoring class index (0..4) plus one gives the predicted rating
# (1..5), undoing the shift applied when the one-hot targets were built.
pred_labels = np.argmax(pred, axis=1) + 1
accuracy_score(val_Y, pred_labels)