# **Simple sentiment analysis model**

# Model accuracy:
# - with ratings converted to sentiment classes (bad / neutral / good): ~82-85%
# - without the conversion (original 1-5 ratings as labels): ~60-63%

In [None]:
# Imports
import numpy as np
import pandas as pd
from keras import layers
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import Sequential
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


In [None]:
# Seed NumPy's global random generator so the np.random.shuffle call that orders
# the rows before the train/test split gives the same result on every run.
# (np.random.RandomState(21) only builds a separate generator object that is
# thrown away immediately; it leaves the global generator unseeded.)
# NOTE(review): this covers NumPy only; Keras weight initialisation presumably
# draws from the backend's own RNG - confirm if fully repeatable training is needed.
np.random.seed(21)

# Set number of training samples, epochs and num words for tokenizer
training_samples = 18000
epochs = 5
numWords = 10000

In [None]:
# Load the hotel reviews from csv and lowercase the column names in one step,
# so the columns can be reached as df.review and df.rating
df = pd.read_csv(
    r'../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
).rename(columns=str.lower)

# **Data info**

***First 5 rows***

In [None]:
# Preview the first five rows to check that the columns loaded as expected
print(df.head(5))

***Total examples***

In [None]:
# Number of rows in the dataset
print(len(df))

***Ratings range***

In [None]:
# Distinct values present in the rating column, in ascending order
print(f'Ratings: {sorted(df.rating.unique())}')

***Rating counts***

In [None]:
# How many rows carry each rating value (shows how balanced the classes are)
print(df.rating.value_counts())

In [None]:
# Replace 1-5 rating to 0-2 sentiment where 0 is bad, 1 is neutral and 2 is good
def parseToSentiment(x):
    """Collapse a star rating into a three-class sentiment label.

    Args:
        x: Rating value to convert.

    Returns:
        2 (good) when x equals 4 or 5, 1 (neutral) when x equals 3,
        and 0 (bad) for every other value.
    """
    if x in (4, 5):
        return 2
    if x == 3:
        return 1
    return 0


# Lazily convert every rating to its sentiment label
toSentimentMap = map(parseToSentiment, df.rating)

# Overwrite the rating column with the converted labels
df['rating'] = list(toSentimentMap)

In [None]:
# The review column is the model input, the rating column is the target
data, labels = df.review, df.rating

***Ratings range after replacing with sentiment***

In [None]:
# Distinct values in the rating column after it was overwritten with sentiment labels
print(f'Ratings: {sorted(df.rating.unique())}')

# **Tokenizing**

In [None]:
# Vectorize the text data: fit a tokenizer on the reviews, then turn each review
# into a sequence of integer word indices
tokenizer = Tokenizer(num_words=numWords)
tokenizer.fit_on_texts(data)
# Word -> index vocabulary learned while fitting
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(data)
# Pad the sequences into a single array (pad_sequences defaults are used)
data = pad_sequences(sequences)

***Data after tokenizing***

In [None]:
# Show the padded integer array produced by the tokenizing step
print(data)

In [None]:
# Turn the rating column into an array, then into a binary class matrix
labels = np.asarray(labels)
# The number of distinct label values sets the number of classes
labelsCategories = np.unique(labels).size
labels = to_categorical(labels, num_classes=labelsCategories)

In [None]:
# Build the row positions 0..len(data)-1 and shuffle them in place; the shuffled
# positions are what gets applied to both data and labels
indices = np.arange(data.shape[0])
np.random.shuffle(indices)

In [None]:
# Reorder reviews and labels with the same shuffled positions so each review
# stays paired with its label
data, labels = data[indices], labels[indices]

In [None]:
# The first `training_samples` rows are used for training, the remainder for testing
X_train, X_test = data[:training_samples], data[training_samples:]
y_train, y_test = labels[:training_samples], labels[training_samples:]

In [None]:
# Input dimension handed to the Embedding layer: tokenizer vocabulary size plus one
inputDim = 1 + numWords
# Input length is the length of one row of the training data
inputLength = X_train[0].shape[0]

# **Model training**

In [None]:
# Sequential network: Embedding -> Conv1D -> MaxPool1D -> Conv1D -> GlobalMaxPooling1D,
# ended with one Dense layer of 3 softmax units for multiclass classification
model = Sequential([
    layers.Embedding(inputDim, 128, input_length=inputLength),
    layers.Conv1D(64, 7, activation='relu'),
    layers.MaxPool1D(5),
    layers.Conv1D(64, 7, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(3, activation="softmax"),
])

In [None]:
# Configure training: categorical_crossentropy is the loss for this multiclass
# problem (labels are a binary class matrix, the output layer is softmax),
# rmsprop is the optimizer, and accuracy is tracked as the reported metric
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
# Train the model. validation_split=0.2 sets aside a fifth of X_train / y_train
# for validation; the returned history holds the per-epoch metrics
history = model.fit(X_train,
                    y_train,
                    epochs=epochs,
                    batch_size=128,
                    validation_split=0.2)

# **Evaluation and plotting**

In [None]:
# Evaluate the model on the test split and print its loss and accuracy
print(f'Model loss(1) and accuracy(2): {model.evaluate(X_test, y_test)}')

In [None]:
# Pull the per-epoch training and validation curves out of the history for plotting
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

In [None]:
# Plotting with Plotly

# X-axis values: one tick per recorded epoch, starting at 1. This is kept under
# its own name so the integer `epochs` setting is not overwritten; rebinding
# `epochs` to a range made this cell raise a TypeError when run a second time.
# Sizing it from `loss` keeps x and y the same length.
epoch_axis = list(range(1, len(loss) + 1))

# Two stacked subplots: loss on top, accuracy below
fig = make_subplots(rows=2, cols=1, subplot_titles=('Loss', 'Accuracy'))

fig.add_trace(
    go.Scatter(x=epoch_axis, y=loss, mode='lines+markers', name='Training loss'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=epoch_axis, y=val_loss, mode='lines+markers', name='Validation loss'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=epoch_axis, y=acc, mode='lines+markers', name='Training accuracy'),
    row=2, col=1
)

# Left as the last expression of the cell, as in the original
fig.add_trace(
    go.Scatter(x=epoch_axis, y=val_acc, mode='lines+markers', name='Validation accuracy'),
    row=2, col=1
)
