In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Input, SimpleRNN, Dropout, Flatten
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from gensim.models import Word2Vec




In [2]:
# Load the dataset
data = pd.read_csv('https://archive.ics.uci.edu/static/public/911/data.csv')


In [None]:
# Preprocessing steps for the 'review_text' column
data['review_text'] = data['text'].str.lower()
data['review_text'] = data['review_text'].str.replace(r'[^\w\s]', '', regex=True)
data['review_text'] = data['review_text'].str.replace(r'\d+', '', regex=True)
data = data.dropna(subset=['review_text', 'best_score'])

# Categorize 'best_score' into 5 sentiment classes
def categorize_score(score):
    if score <= 189:  # Top 20% of 946
        return 0  # very negative
    elif score <= 378:  # Next 20%
        return 1  # negative
    elif score <= 567:  # Middle 20%
        return 2  # neutral
    elif score <= 756:  # Next 20%
        return 3  # positive
    else:
        return 4  # very positive

# Apply categorization
data['classified_sentiments'] = data['best_score'].apply(categorize_score)


In [6]:
# Word2Vec Embedding
sentences = data['review_text'].apply(lambda x: x.split()).tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Create an embedding matrix
vocab_size = len(word2vec_model.wv)+1
embedding_matrix = np.zeros((vocab_size, 100))
word_index = {word: idx + 1 for idx, word in enumerate(word2vec_model.wv.index_to_key)}
for word, idx in word_index.items():
    embedding_matrix[idx] = word2vec_model.wv[word]

# Split data
X_train, X_test, y_train, y_test = train_test_split(data['review_text'], data['classified_sentiments'], test_size=0.2, random_state=42)


In [7]:

# Tokenization for RNN and BiRNN
def texts_to_sequences(texts, word_index):
    return [[word_index.get(word, 0) for word in text.split()] for text in texts]

X_train_seq = texts_to_sequences(X_train, word_index)
X_test_seq = texts_to_sequences(X_test, word_index)
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=200)
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=200)

# Build Simple RNN model using Sequential API
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=200, weights=[embedding_matrix], trainable=False),
    SimpleRNN(64, return_sequences=False, activation='relu'),
    Dropout(0.5),
    Dense(5, activation='softmax')
])

rnn_optimizer = Adam(learning_rate=0.001)
rnn_model.compile(optimizer=rnn_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train Simple RNN model
rnn_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))


Epoch 1/5




[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.7509 - loss: 0.9143 - val_accuracy: 0.7965 - val_loss: 0.7149
Epoch 2/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7935 - loss: 0.7585 - val_accuracy: 0.7965 - val_loss: 0.7218
Epoch 3/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.7919 - loss: 0.7340 - val_accuracy: 0.7965 - val_loss: 0.7074
Epoch 4/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.7917 - loss: 0.7110 - val_accuracy: 0.7965 - val_loss: 0.7052
Epoch 5/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.7953 - loss: 0.7055 - val_accuracy: 0.7965 - val_loss: 0.7009


<keras.src.callbacks.history.History at 0x313b71d30>

In [8]:

# Build BiRNN model using Sequential API
birnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=200, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(64, return_sequences=False, activation='tanh')),
    Dropout(0.5),
    Dense(5, activation='softmax')
])

birnn_optimizer = RMSprop(learning_rate=0.001)
birnn_model.compile(optimizer=birnn_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train BiRNN model
birnn_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))


Epoch 1/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 61ms/step - accuracy: 0.7818 - loss: 0.7722 - val_accuracy: 0.7965 - val_loss: 0.7061
Epoch 2/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 69ms/step - accuracy: 0.7982 - loss: 0.6949 - val_accuracy: 0.7962 - val_loss: 0.6879
Epoch 3/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 69ms/step - accuracy: 0.7957 - loss: 0.6998 - val_accuracy: 0.7959 - val_loss: 0.6864
Epoch 4/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 69ms/step - accuracy: 0.7982 - loss: 0.6789 - val_accuracy: 0.7959 - val_loss: 0.6845
Epoch 5/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 69ms/step - accuracy: 0.7954 - loss: 0.6889 - val_accuracy: 0.7957 - val_loss: 0.6856


<keras.src.callbacks.history.History at 0x317f64670>

In [9]:

# Evaluation Function
def evaluate_model(model, X_test, y_test, model_type="RNN"):
    predictions = model.predict(X_test)
    y_pred = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['very negative', 'negative', 'neutral', 'positive', 'very positive'])
    print(f"Accuracy ({model_type}): {accuracy}")
    print(f"Classification Report ({model_type}):\n{report}")

# Evaluate Simple RNN
evaluate_model(rnn_model, X_test_pad, y_test, model_type="Simple RNN")

# Evaluate BiRNN
evaluate_model(birnn_model, X_test_pad, y_test, model_type="BiRNN")


[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Accuracy (Simple RNN): 0.7964796479647965
Classification Report (Simple RNN):
               precision    recall  f1-score   support

very negative       0.80      1.00      0.89      2896
     negative       0.00      0.00      0.00       524
      neutral       0.00      0.00      0.00        68
     positive       0.00      0.00      0.00        91
very positive       0.00      0.00      0.00        57

     accuracy                           0.80      3636
    macro avg       0.16      0.20      0.18      3636
 weighted avg       0.63      0.80      0.71      3636

[1m  1/114[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m16s[0m 143ms/step

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step
Accuracy (BiRNN): 0.7956545654565457
Classification Report (BiRNN):
               precision    recall  f1-score   support

very negative       0.80      1.00      0.89      2896
     negative       0.00      0.00      0.00       524
      neutral       0.00      0.00      0.00        68
     positive       0.00      0.00      0.00        91
very positive       0.00      0.00      0.00        57

     accuracy                           0.80      3636
    macro avg       0.16      0.20      0.18      3636
 weighted avg       0.63      0.80      0.71      3636



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
