In [13]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [14]:
train_data = pd.read_pickle('train_data.pkl')
test_data = pd.read_pickle('test_data.pkl')

In [15]:
x_train = train_data['text'].fillna('').astype(str).tolist()
x_test = test_data['text'].fillna('').astype(str).tolist()

y_train = train_data['label'].tolist()
y_train = np.array(y_train)
y_test = test_data['label'].tolist()
y_test = np.array(y_test)

In [16]:
max_words = 10000
max_len = 100

In [17]:
tokenizer = Tokenizer(num_words = max_words, oov_token = '<OOV>')
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
padded = pad_sequences(sequences, maxlen = max_len, padding = 'post', truncating = 'post')

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim = max_words, output_dim = 64, input_length = max_len))
model.add(LSTM(64, return_sequences = False))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))



In [22]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

In [23]:
model.fit(padded, y_train, epochs = 5, batch_size = 64, validation_split=0.2)

Epoch 1/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 57ms/step - accuracy: 0.8965 - loss: 0.3447 - val_accuracy: 0.8990 - val_loss: 0.3259
Epoch 2/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 58ms/step - accuracy: 0.9015 - loss: 0.3212 - val_accuracy: 0.9494 - val_loss: 0.1537
Epoch 3/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 62ms/step - accuracy: 0.9544 - loss: 0.1363 - val_accuracy: 0.9614 - val_loss: 0.1142
Epoch 4/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 64ms/step - accuracy: 0.9652 - loss: 0.0948 - val_accuracy: 0.9600 - val_loss: 0.1213
Epoch 5/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 108ms/step - accuracy: 0.9715 - loss: 0.0763 - val_accuracy: 0.9595 - val_loss: 0.1248


<keras.src.callbacks.history.History at 0x24c7abe91d0>

In [None]:
x_test_seq = tokenizer.texts_to_sequences(x_test)
x_test_padded = pad_sequences(x_test_seq, maxlen = max_len, padding = 'post', truncating='post')

In [25]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f"Test Accuracy: {accuracy}")

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18ms/step - accuracy: 0.9625 - loss: 0.1203
Test Accuracy: 0.9610527753829956


In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test_padded)
report = classification_report(y_test, y_pred.round(), target_names=['Negative', 'Positive'])

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step


'              precision    recall  f1-score   support\n\n    Negative       0.97      0.99      0.98     28671\n    Positive       0.88      0.72      0.79      3244\n\n    accuracy                           0.96     31915\n   macro avg       0.92      0.85      0.88     31915\nweighted avg       0.96      0.96      0.96     31915\n'

In [29]:
print(report)

              precision    recall  f1-score   support

    Negative       0.97      0.99      0.98     28671
    Positive       0.88      0.72      0.79      3244

    accuracy                           0.96     31915
   macro avg       0.92      0.85      0.88     31915
weighted avg       0.96      0.96      0.96     31915

