In [1]:
# Import packages
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization

In [107]:
#Breaking down a sentiment classification dataset from UCI
import glob
text = []
label = []

for filename in glob.glob("Sentiment/*.txt"):  # Gathers all .txt files in the directory
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            #print(f"Raw line: {line}")
            line = line.rstrip('\n').split('\t')  # Split each line on tabs
            if len(line) > 1 and line[1].isdigit():  # Check if the second part is a digit
                text.append(line[0])  # Append the text part
                label.append(int(line[1]))
            else:
                print(f"Skipping invalid line: {line}")

#print("Text:", text)
#print("Label:", label)
#text = text.astype(str)
# Convert lists to numpy arrays
text = np.array(text)
label = np.array(label)

In [110]:
#Use text vectorization to complete some preprocessing tasks, including tokenizations (word splits) and remove punctuation/lowercase words
vectorize_layer = TextVectorization(
    max_tokens = None,
    standardize = 'lower_and_strip_punctuation',
    split = 'whitespace',
    ngrams = None,
    output_mode = 'int',
    output_sequence_length = None
)

In [112]:
# apply it to the text data with "adapt"
vectorize_layer.adapt(text)

In [114]:
# check preprocessing results, such as vocabulary, 
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'i',
 'a',
 'is',
 'to',
 'it',
 'this',
 'of',
 'was',
 'in',
 'for',
 'not',
 'that',
 'with',
 'my',
 'very',
 'good',
 'on',
 'great',
 'you',
 'but',
 'have',
 'are',
 'movie',
 'as',
 'so',
 'phone',
 'film',
 'its',
 'be',
 'all',
 'one',
 'had',
 'at',
 'food',
 'like',
 'just',
 'place',
 'time',
 'were',
 'service',
 'an',
 'really',
 'if',
 'from',
 'there',
 'they',
 'bad',
 'we',
 'well',
 'out',
 'has',
 'dont',
 'about',
 'would',
 'your',
 'or',
 'no',
 'only',
 'by',
 'best',
 'ever',
 'even',
 'here',
 'also',
 'will',
 'back',
 'up',
 'when',
 'me',
 'than',
 'more',
 'quality',
 'go',
 'what',
 'love',
 'ive',
 'which',
 'made',
 'he',
 'can',
 'because',
 'product',
 'im',
 'how',
 'too',
 'get',
 'work',
 'their',
 'some',
 'works',
 'nice',
 'could',
 'better',
 'any',
 'excellent',
 'after',
 'never',
 'do',
 'recommend',
 'much',
 'been',
 'who',
 'use',
 'our',
 'did',
 'again',
 'sound',
 'other',
 'think',
 'his',
 'headset',


In [116]:
#Recurrent Neural Network
model_rnn = keras.Sequential()

model_rnn.add(vectorize_layer)

model_rnn.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_rnn.add(keras.layers.SimpleRNN(128)) 

model_rnn.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [118]:
#Train/compile
model_rnn.compile(loss = keras.losses.BinaryCrossentropy(),
                  optimizer='adam',
                  metrics=['accuracy'])

In [124]:
text = tf.convert_to_tensor(text, dtype=tf.string)
label = tf.convert_to_tensor(label, dtype=tf.int32)

In [126]:
#fit the model
model_rnn.fit(x = text, y = label, validation_split = 0.2,
              epochs=10, batch_size = 32)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.5492 - loss: 0.6881 - val_accuracy: 0.7267 - val_loss: 0.6005
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8569 - loss: 0.4680 - val_accuracy: 0.7300 - val_loss: 0.5670
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9579 - loss: 0.1784 - val_accuracy: 0.7833 - val_loss: 0.5334
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9822 - loss: 0.0654 - val_accuracy: 0.7733 - val_loss: 0.7440
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9890 - loss: 0.0453 - val_accuracy: 0.7483 - val_loss: 0.6478
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9986 - loss: 0.0149 - val_accuracy: 0.7767 - val_loss: 0.6977
Epoch 7/10
[1m75/75[0m [32m━━━━

<keras.src.callbacks.history.History at 0x28f0ec034d0>

In [128]:
model_rnn.summary()

In [146]:
#prediction
test_text = [['I hate this meal!'], ['I love this restaurant']]
test_text_tensor = tf.convert_to_tensor(test_text, dtype=tf.string)
model_rnn.predict(test_text_tensor)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746ms/step


array([[0.04141991],
       [0.9998292 ]], dtype=float32)

In [148]:
#LTSM model
model_lstm = keras.Sequential()

model_lstm.add(vectorize_layer)

model_lstm.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_lstm.add(keras.layers.LSTM(128))

model_lstm.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [150]:
model_lstm.compile(loss = keras.losses.BinaryCrossentropy(),
                   optimizer='adam',
                   metrics=['accuracy'])

In [152]:
model_lstm.fit(x = text, y = label, validation_split = 0.2,
               epochs=10, batch_size = 32)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.5883 - loss: 0.6767 - val_accuracy: 0.7733 - val_loss: 0.5145
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.8824 - loss: 0.3528 - val_accuracy: 0.8250 - val_loss: 0.4224
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9628 - loss: 0.1354 - val_accuracy: 0.8233 - val_loss: 0.4455
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.9804 - loss: 0.0833 - val_accuracy: 0.8150 - val_loss: 0.5889
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.9910 - loss: 0.0404 - val_accuracy: 0.8083 - val_loss: 0.5690
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.9899 - loss: 0.0339 - val_accuracy: 0.8100 - val_loss: 0.7096
Epoch 7/10
[1m75/75[0m [32m━━━

<keras.src.callbacks.history.History at 0x28f0f0c8f20>

In [154]:
model_lstm.summary()

In [158]:
model_lstm.predict(test_text_tensor)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608ms/step


array([[0.00658366],
       [0.99999934]], dtype=float32)

In [160]:
#Gated Recurrent Unit Model (GRU)
model_gru = keras.Sequential()

model_gru.add(vectorize_layer)

model_gru.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_gru.add(keras.layers.GRU(128))

model_gru.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [162]:
model_gru.compile(loss = keras.losses.BinaryCrossentropy(),
                  optimizer='adam',
                  metrics=['accuracy'])

In [164]:
model_gru.fit(x = text, y = label, validation_split = 0.2,
              epochs=10, batch_size = 32)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 42ms/step - accuracy: 0.5652 - loss: 0.6795 - val_accuracy: 0.7467 - val_loss: 0.5500
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.8803 - loss: 0.3620 - val_accuracy: 0.8083 - val_loss: 0.4433
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.9621 - loss: 0.1351 - val_accuracy: 0.8117 - val_loss: 0.4783
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.9812 - loss: 0.0714 - val_accuracy: 0.8067 - val_loss: 0.5434
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.9915 - loss: 0.0380 - val_accuracy: 0.7933 - val_loss: 0.8507
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.9979 - loss: 0.0139 - val_accuracy: 0.7950 - val_loss: 0.9009
Epoch 7/10
[1m75/75[0m [32m━━━

<keras.src.callbacks.history.History at 0x28f119e7d40>

In [166]:
model_gru.summary()

In [168]:
model_gru.predict(test_text_tensor)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 694ms/step


array([[0.00509745],
       [0.99997145]], dtype=float32)

In [170]:
#Bidirectional RNN model
model_bilstm = keras.Sequential()

model_bilstm.add(vectorize_layer)

model_bilstm.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_bilstm.add(keras.layers.Bidirectional(keras.layers.LSTM(128)))

model_bilstm.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [172]:
model_bilstm.compile(loss = keras.losses.BinaryCrossentropy(),
                     optimizer='adam',
                     metrics=['accuracy'])

In [174]:
model_bilstm.fit(x = text, y = label, validation_split = 0.2,
                 epochs = 10, batch_size = 32)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 43ms/step - accuracy: 0.6031 - loss: 0.6676 - val_accuracy: 0.7517 - val_loss: 0.5347
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.9002 - loss: 0.3353 - val_accuracy: 0.7883 - val_loss: 0.4261
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9573 - loss: 0.1430 - val_accuracy: 0.8367 - val_loss: 0.4488
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9864 - loss: 0.0810 - val_accuracy: 0.8317 - val_loss: 0.6994
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9772 - loss: 0.0713 - val_accuracy: 0.7983 - val_loss: 0.6961
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9922 - loss: 0.0450 - val_accuracy: 0.8283 - val_loss: 0.6226
Epoch 7/10
[1m75/75[0m [32m━━━

<keras.src.callbacks.history.History at 0x28f1e1ee4e0>

In [176]:
model_bilstm.summary()

In [178]:
model_bilstm.predict(test_text_tensor)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 845ms/step


array([[0.03857815],
       [0.9999464 ]], dtype=float32)