In [1]:
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


# Load Dataset

In [2]:
# 1. Load the tweet_eval/sentiment dataset
dataset = load_dataset("tweet_eval", "sentiment")

# Extract train and test sets
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']

test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/901k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/167k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45615 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12284 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

# Preprocess (Tokenize & Pad)

In [3]:
vocab_size = 10000
max_len = 100
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_texts)

x_train = tokenizer.texts_to_sequences(train_texts)
x_test = tokenizer.texts_to_sequences(test_texts)

x_train = pad_sequences(x_train, maxlen=max_len, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post', truncating='post')

y_train = np.array(train_labels)
y_test = np.array(test_labels)



# Build LSTM Model

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64),  # input_length removed
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax')
])


# Compile and Train

In [14]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train, epochs=5, batch_size=64, validation_data=(x_test, y_test))


Epoch 1/5
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 39ms/step - accuracy: 0.5348 - loss: 0.9359 - val_accuracy: 0.6010 - val_loss: 0.8838
Epoch 2/5
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step - accuracy: 0.7097 - loss: 0.6635 - val_accuracy: 0.6024 - val_loss: 0.8705
Epoch 3/5
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step - accuracy: 0.7483 - loss: 0.5837 - val_accuracy: 0.6065 - val_loss: 0.9123
Epoch 4/5
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 38ms/step - accuracy: 0.7800 - loss: 0.5112 - val_accuracy: 0.5931 - val_loss: 1.0316
Epoch 5/5
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 39ms/step - accuracy: 0.8106 - loss: 0.4403 - val_accuracy: 0.5839 - val_loss: 1.1398


# Evaluate

In [15]:
loss, acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {acc * 100:.2f}%")


[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.5896 - loss: 1.1416
Test Accuracy: 58.39%


In [16]:
sample_text = input("Enter the text: ")
sample_seq = tokenizer.texts_to_sequences([sample_text])  # wrap in list
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding='post')

pred = model.predict(sample_pad)
class_names = ['negative', 'neutral', 'positive']

print(f"{sample_text} => {class_names[np.argmax(pred)]}")


Enter the text:  i love india


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
i love india => positive
