In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN, Bidirectional
from tensorflow.keras.optimizers import Adam

# Load the dataset
data = pd.read_csv('https://archive.ics.uci.edu/static/public/911/data.csv')


# Inspect and preprocess the dataset
# Normalize 'best_score' to classify into 5 classes: "very negative", "negative", "neutral", "positive", "very positive"
score_bins = [0, 200, 400, 600, 800, 964]
score_labels = [0, 1, 2, 3, 4]  # Use numerical labels to avoid encoding issues
data['best_score_class'] = pd.cut(data['best_score'], bins=score_bins, labels=score_labels)

# Combine review text and target variable
texts = data['text'].astype(str)
labels = data['best_score_class']

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize and pad the text
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Embedding dimensions
embedding_dim = 128

# RNN Model
print("\nTraining RNN model...")
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
rnn_model.add(SimpleRNN(128))
rnn_model.add(Dense(5, activation='softmax'))
rnn_optimizer = Adam(learning_rate=0.001)
rnn_model.compile(optimizer=rnn_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test_pad, y_test)
print(f"RNN Model - Loss: {rnn_loss}, Accuracy: {rnn_accuracy}")

# BiRNN Model
print("\nTraining BiRNN model...")
birnn_model = Sequential()
birnn_model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
birnn_model.add(Bidirectional(SimpleRNN(128)))
birnn_model.add(Dense(5, activation='softmax'))
birnn_optimizer = Adam(learning_rate=0.001)
birnn_model.compile(optimizer=birnn_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
birnn_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)
birnn_loss, birnn_accuracy = birnn_model.evaluate(X_test_pad, y_test)
print(f"BiRNN Model - Loss: {birnn_loss}, Accuracy: {birnn_accuracy}")

# LSTM Model
print("\nTraining LSTM model...")
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(5, activation='softmax'))
lstm_optimizer = Adam(learning_rate=0.001)
lstm_model.compile(optimizer=lstm_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test)
print(f"LSTM Model - Loss: {lstm_loss}, Accuracy: {lstm_accuracy}")

# Save the label encoder for inference
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save the tokenizer for inference
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)



Training RNN model...
Epoch 1/5




[1m125/364[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m3s[0m 16ms/step - accuracy: 0.8591 - loss: 0.5951

2024-12-10 23:37:48.192443: W tensorflow/core/framework/op_kernel.cc:1841] OP_REQUIRES failed at sparse_xent_op.cc:103 : INVALID_ARGUMENT: Received a label value of 5 which is outside the valid range of [0, 5).  Label values: 0 0 0 0 0 1 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


InvalidArgumentError: Graph execution error:

Detected at node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main

  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 596, in run_forever

  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once

  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/asyncio/events.py", line 80, in _run

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/IPython/core/interactiveshell.py", line 3048, in run_cell

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/IPython/core/interactiveshell.py", line 3103, in _run_cell

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/IPython/core/interactiveshell.py", line 3308, in run_cell_async

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/IPython/core/interactiveshell.py", line 3490, in run_ast_nodes

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code

  File "/var/folders/hj/50ynckj97xq7d49_2_s66f100000gn/T/ipykernel_55135/1865235275.py", line 53, in <module>

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/backend/tensorflow/trainer.py", line 368, in fit

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/backend/tensorflow/trainer.py", line 216, in function

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/backend/tensorflow/trainer.py", line 129, in multi_step_on_iterator

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/backend/tensorflow/trainer.py", line 110, in one_step_on_data

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/backend/tensorflow/trainer.py", line 59, in train_step

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/trainers/trainer.py", line 399, in _compute_loss

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/trainers/trainer.py", line 367, in compute_loss

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/trainers/compile_utils.py", line 692, in __call__

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/trainers/compile_utils.py", line 701, in call

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/losses/loss.py", line 67, in __call__

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/losses/losses.py", line 33, in call

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/losses/losses.py", line 2241, in sparse_categorical_crossentropy

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/ops/nn.py", line 1841, in sparse_categorical_crossentropy

  File "/Users/huiyisang/Library/Python/3.9/lib/python/site-packages/keras/src/backend/tensorflow/nn.py", line 714, in sparse_categorical_crossentropy

Received a label value of 5 which is outside the valid range of [0, 5).  Label values: 0 0 0 0 0 1 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	 [[{{node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_multi_step_on_iterator_351963]

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN, Bidirectional
from tensorflow.keras.optimizers import Adam

# Load the dataset
data = pd.read_csv('https://archive.ics.uci.edu/static/public/911/data.csv')

# Inspect and preprocess the dataset
# Normalize 'best_score' to classify into 5 classes: "very negative", "negative", "neutral", "positive", "very positive"
score_bins = [0, 200, 400, 600, 800, 964]
score_labels = [0, 1, 2, 3, 4]  # Use numerical labels to avoid encoding issues
data['best_score_class'] = pd.cut(data['best_score'], bins=score_bins, labels=score_labels, include_lowest=True)

# Combine review text and target variable
texts = data['text'].astype(str)
labels = data['best_score_class'].astype(int)  # Ensure labels are integers

# Split the data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize and pad the text
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Embedding dimensions
embedding_dim = 128

# RNN Model
print("\nTraining RNN model...")
rnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    SimpleRNN(128, activation='tanh'),
    Dense(5, activation='softmax')
])
rnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_history = rnn_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)

# BiRNN Model
print("\nTraining BiRNN model...")
birnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    Bidirectional(SimpleRNN(128, activation='tanh')),
    Dense(5, activation='softmax')
])
birnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
birnn_history = birnn_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)

# LSTM Model
print("\nTraining LSTM model...")
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    LSTM(128, activation='tanh'),
    Dense(5, activation='softmax')
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_history = lstm_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)


Training RNN model...
Epoch 1/5




[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.8688 - loss: 0.5572 - val_accuracy: 0.8812 - val_loss: 0.4997
Epoch 2/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.8738 - loss: 0.5034 - val_accuracy: 0.8804 - val_loss: 0.5363
Epoch 3/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.8885 - loss: 0.3803 - val_accuracy: 0.8697 - val_loss: 0.5954
Epoch 4/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.9258 - loss: 0.2474 - val_accuracy: 0.8389 - val_loss: 0.7209
Epoch 5/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.9526 - loss: 0.1652 - val_accuracy: 0.8062 - val_loss: 0.8288

Training BiRNN model...
Epoch 1/5
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.8640 - loss: 0.5612 - val_accuracy: 0.8812 - val_loss: 0.5040
Epoch 2/5
[

In [15]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Function to create RNN model
def create_rnn_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        SimpleRNN(128, activation='tanh'),
        Dense(5, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to create BiRNN model
def create_birnn_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        Bidirectional(SimpleRNN(128, activation='tanh')),
        Dense(5, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to create LSTM model
def create_lstm_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        LSTM(128, activation='tanh'),
        Dense(5, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Cross-validation setup
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Function for cross-validation with F1 score
def cross_validate_model(create_model_func, X, y):
    X = np.array(X)  # Ensure X is a NumPy array
    y = np.array(y)  # Ensure y is a NumPy array
    f1_scores = []
    for train_idx, test_idx in kfold.split(X):
        model = create_model_func()
        model.fit(X[train_idx], y[train_idx], epochs=5, batch_size=32, verbose=0)
        y_pred = np.argmax(model.predict(X[test_idx]), axis=1)
        f1 = f1_score(y[test_idx], y_pred, average='macro')
        f1_scores.append(f1)
    return np.mean(f1_scores), np.std(f1_scores)

# Cross-validate RNN
print("\nCross-validating RNN model...")
rnn_f1_mean, rnn_f1_std = cross_validate_model(create_rnn_model, X_train_pad, y_train)
print(f"RNN Cross-Validation F1 Score: {rnn_f1_mean} (+/- {rnn_f1_std})")

# Cross-validate BiRNN
print("\nCross-validating BiRNN model...")
birnn_f1_mean, birnn_f1_std = cross_validate_model(create_birnn_model, X_train_pad, y_train)
print(f"BiRNN Cross-Validation F1 Score: {birnn_f1_mean} (+/- {birnn_f1_std})")

# Cross-validate LSTM
print("\nCross-validating LSTM model...")
lstm_f1_mean, lstm_f1_std = cross_validate_model(create_lstm_model, X_train_pad, y_train)
print(f"LSTM Cross-Validation F1 Score: {lstm_f1_mean} (+/- {lstm_f1_std})")


Cross-validating RNN model...




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
RNN Cross-Validation F1 Score: 0.20779403586532816 (+/- 0.006787435585328952)

Cross-validating BiRNN model...




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
BiRNN Cross-Validation F1 Score: 0.20759809846920874 (+/- 0.009690370643322023)

Cross-validating LSTM model...




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
LSTM Cross-Validation F1 Score: 0.20410120864244669 (+/- 0.0038513337513023517)


In [9]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = np.argmax(model.predict(X_test), axis=1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

evaluate_model(rnn_model, X_test_pad, y_test)

[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90      3205
           1       0.08      0.05      0.06       254
           2       0.04      0.04      0.04        52
           3       0.01      0.01      0.01        86
           4       0.04      0.03      0.03        40

    accuracy                           0.81      3637
   macro avg       0.21      0.21      0.21      3637
weighted avg       0.78      0.81      0.79      3637


Confusion Matrix:
[[2915  148   49   72   21]
 [ 234   13    1    3    3]
 [  44    3    2    2    1]
 [  79    4    2    1    0]
 [  36    1    0    2    1]]


In [10]:
evaluate_model(birnn_model, X_test_pad, y_test)

[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      3205
           1       0.12      0.04      0.06       254
           2       0.00      0.00      0.00        52
           3       0.00      0.00      0.00        86
           4       0.00      0.00      0.00        40

    accuracy                           0.86      3637
   macro avg       0.20      0.20      0.20      3637
weighted avg       0.79      0.86      0.82      3637


Confusion Matrix:
[[3132   66    1    6    0]
 [ 244   10    0    0    0]
 [  50    2    0    0    0]
 [  83    3    0    0    0]
 [  39    0    0    1    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
evaluate_model(lstm_model, X_test_pad, y_test)

[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      3205
           1       0.12      0.04      0.05       254
           2       0.00      0.00      0.00        52
           3       0.11      0.03      0.05        86
           4       0.00      0.00      0.00        40

    accuracy                           0.86      3637
   macro avg       0.22      0.21      0.21      3637
weighted avg       0.79      0.86      0.82      3637


Confusion Matrix:
[[3116   62    8   16    3]
 [ 235    9    4    6    0]
 [  48    2    0    2    0]
 [  82    1    0    3    0]
 [  39    1    0    0    0]]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Load the dataset
data_path = '/mnt/data/Recipe Reviews and User Feedback Dataset.csv'
data = pd.read_csv(data_path)

# Inspect and preprocess the dataset
# Normalize 'best_score' to classify into 5 classes: "very negative", "negative", "neutral", "positive", "very positive"
score_bins = [0, 200, 400, 600, 800, 964]
score_labels = [0, 1, 2, 3, 4]  # Use numerical labels to avoid encoding issues
data['best_score_class'] = pd.cut(data['best_score'], bins=score_bins, labels=score_labels, include_lowest=True)

# Combine review text and target variable
texts = data['text'].astype(str)
labels = data['best_score_class'].astype(int)  # Ensure labels are integers

# Tokenize and pad the text
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Embedding dimensions
embedding_dim = 128

# Function to create RNN model
def create_rnn_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        SimpleRNN(128, activation='tanh'),
        Dense(5, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to create BiRNN model
def create_birnn_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        Bidirectional(SimpleRNN(128, activation='tanh')),
        Dense(5, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to create LSTM model
def create_lstm_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        LSTM(128, activation='tanh'),
        Dense(5, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Cross-validation setup
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate RNN model
print("\nCross-validating RNN model...")
rnn_keras = KerasClassifier(build_fn=create_rnn_model, epochs=5, batch_size=32, verbose=0)
rnn_results = []
for train_idx, test_idx in kfold.split(padded_sequences, labels):
    rnn_keras.fit(padded_sequences[train_idx], np.array(labels)[train_idx])
    rnn_score = rnn_keras.score(padded_sequences[test_idx], np.array(labels)[test_idx])
    rnn_results.append(rnn_score)
print(f"RNN Cross-Validation Accuracy: {np.mean(rnn_results)} (+/- {np.std(rnn_results)})")

# Evaluate BiRNN model
print("\nCross-validating BiRNN model...")
birnn_keras = KerasClassifier(build_fn=create_birnn_model, epochs=5, batch_size=32, verbose=0)
birnn_results = []
for train_idx, test_idx in kfold.split(padded_sequences, labels):
    birnn_keras.fit(padded_sequences[train_idx], np.array(labels)[train_idx])
    birnn_score = birnn_keras.score(padded_sequences[test_idx], np.array(labels)[test_idx])
    birnn_results.append(birnn_score)
print(f"BiRNN Cross-Validation Accuracy: {np.mean(birnn_results)} (+/- {np.std(birnn_results)})")

# Evaluate LSTM model
print("\nCross-validating LSTM model...")
lstm_keras = KerasClassifier(build_fn=create_lstm_model, epochs=5, batch_size=32, verbose=0)
lstm_results = []
for train_idx, test_idx in kfold.split(padded_sequences, labels):
    lstm_keras.fit(padded_sequences[train_idx], np.array(labels)[train_idx])
    lstm_score = lstm_keras.score(padded_sequences[test_idx], np.array(labels)[test_idx])
    lstm_results.append(lstm_score)
print(f"LSTM Cross-Validation Accuracy: {np.mean(lstm_results)} (+/- {np.std(lstm_results)})")

# Save the label encoder for inference
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(score_labels, f)

# Save the tokenizer for inference
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
