In [42]:
import pickle

import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [43]:
# Fetch the NLTK resources this notebook relies on; packages that are
# already present are reported as up to date.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CPN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CPN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
# Read the labelled hotel reviews from the Excel workbook next to the notebook
dataset_path = './project_dataset.xlsx'
data = pd.read_excel(dataset_path)

In [45]:
# Preview the first five rows to check the columns that were loaded
data.head(n=5)

Unnamed: 0,Review,Sentiments
0,nice hotel expensive parking got good deal sta...,positive
1,ok nothing special charge diamond member hilto...,negative
2,"unique, great stay, wonderful time hotel monac...",positive
3,"great stay great stay, went seahawk game aweso...",positive
4,love monaco staff husband stayed hotel crazy w...,positive


In [2]:
# Map sentiment labels to numerical values.
#
# The original string labels are kept in 'Sentiment_label' and the encoder is
# always fitted on that column. This makes the cell safe to re-run: fitting on
# the already-encoded 'Sentiments' column would turn label_encoder.classes_
# into integers, and the pickled encoder could no longer translate predictions
# back to the original label names.
if 'Sentiment_label' not in data.columns:
    data['Sentiment_label'] = data['Sentiments']

label_encoder = LabelEncoder()
data['Sentiments'] = label_encoder.fit_transform(data['Sentiment_label'])


NameError: name 'LabelEncoder' is not defined

In [47]:
# Create a single WordNet lemmatizer; preprocess_text() reuses it for every token
lemmatizer = WordNetLemmatizer()

In [48]:

def preprocess_text(text):
    """Tokenize ``text`` with NLTK and return its lemmatized tokens as one string.

    Parameters
    ----------
    text : str or scalar
        The raw review. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as an empty review; any other non-string scalar is converted
        with ``str()`` before tokenizing.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; ``''`` for missing or
        blank input.
    """
    # A blank spreadsheet cell is loaded as NaN (a float). Passing it to
    # word_tokenize raises TypeError and aborts the whole .apply() call.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Blank text has no tokens, so the result is '' either way; skip NLTK.
    if not text.strip():
        return ''

    # Tokenize the text
    tokens = word_tokenize(text)

    # Lemmatize the tokens with the shared module-level lemmatizer
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

In [49]:
# Lemmatize every review and store the result back in the 'Review' column
lemmatized_reviews = data['Review'].apply(preprocess_text)
data['Review'] = lemmatized_reviews

In [17]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split identical between runs.
reviews = data['Review']
sentiments = data['Sentiments']
split_parts = train_test_split(reviews, sentiments, test_size=0.2, random_state=42)
train_data, test_data, train_labels, test_labels = split_parts

In [19]:
# Tokenize the training data.
# Tokenizer is imported from tensorflow.keras in the notebook's import cell,
# so it comes from the same Keras package as the model and pad_sequences.
# The vocabulary is built from the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)

# Replace each word with its integer ID from tokenizer.word_index,
# e.g. {'this': 1, 'is': 2, 'first': 4, ...}.
train_sequences = tokenizer.texts_to_sequences(train_data)

In [20]:
# Tokenize the testing data with the tokenizer that was fitted on the training
# reviews, so both splits share the same word-to-ID mapping
test_sequences = tokenizer.texts_to_sequences(test_data)

In [21]:
# Bring every tokenized review to the same length so the reviews can be
# stacked into one fixed-width array for the model.
max_sequence_length = 100  # number of token IDs kept per review
padding_options = {'maxlen': max_sequence_length}
train_sequences = pad_sequences(train_sequences, **padding_options)
test_sequences = pad_sequences(test_sequences, **padding_options)

In [22]:
# One-hot encode the integer labels: each label becomes a binary vector whose
# length equals the number of classes, e.g. with three classes
# cat -> [1, 0, 0] and dog -> [0, 1, 0].
num_classes = len(label_encoder.classes_)
train_labels_categorical, test_labels_categorical = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (train_labels, test_labels)
)

In [23]:
# Assemble the LSTM classifier as an ordered stack of three layers.
# One embedding row per word ID, plus one extra row for index 0.
vocabulary_size = len(tokenizer.word_index) + 1
model = Sequential([
    # Each word ID is represented as a 100-dimensional vector.
    Embedding(input_dim=vocabulary_size, output_dim=100, input_length=max_sequence_length),
    # 100 is the number of LSTM units in the layer.
    LSTM(100),
    # One output per class (2 for positive/negative). Softmax exponentiates
    # and normalizes the outputs so that they sum to 1.
    Dense(num_classes, activation='softmax'),
])

In [24]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output;
# accuracy is tracked as an additional metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Fit the model for 10 epochs.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the validation metrics should not be used to tune the model.
validation_pair = (test_sequences, test_labels_categorical)
model.fit(
    train_sequences,
    train_labels_categorical,
    validation_data=validation_pair,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x199a6ffbb20>

In [26]:
# Persist the fitted tokenizer so new text can be encoded with the same
# word-to-ID mapping. Only unpickle this file from a trusted source.
tokenizer_path = './tokenizer.pkl'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [27]:
# Persist the fitted label encoder alongside the tokenizer.
# Only unpickle this file from a trusted source.
label_encoder_path = './label_encoder.pkl'
with open(label_encoder_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [29]:
# Save the trained model to a single file in the current working directory
# (presumably Keras' HDF5 format, given the .h5 extension — confirm for the
# installed Keras version)
model_path = 'sentiment_model.h5'
model.save(model_path)

In [50]:
# Score the fitted model on both splits. The model was compiled with one loss
# and metrics=['accuracy'], so evaluate() yields (loss, accuracy); only the
# accuracy is kept.
_, train_accuracy = model.evaluate(
    train_sequences,
    train_labels_categorical,
    verbose=0,
)
_, test_accuracy = model.evaluate(
    test_sequences,
    test_labels_categorical,
    verbose=0,
)

# Report the accuracy for each split
print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)

Train Accuracy: 1.0
Test Accuracy: 0.9262697696685791
