In [1]:
import pandas as pd
# Load the training and test CSVs in one pass; the order of the paths
# determines which frame is bound to which name.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/cleaned_train.csv', 'data/cleaned_test.csv')
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. Stratifying on the label
# keeps the class balance the same in both parts, and the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train["cleaned_text"], df_train["target"],
    stratify=df_train["target"],
    test_size=0.2,
    random_state=42,
)


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap passed to the tokenizer and fixed length of every padded sequence.
MAX_WORDS = 50000
MAX_LEN = 50  # tweets are short

# Fit the vocabulary on the training split only, so that no held-out or
# competition text influences the word index. Words outside the vocabulary
# are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Integer-encode the train / held-out splits.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Integer-encode the full labelled set and the unlabelled competition set.
# Fix: the full labelled set was previously encoded from df_test, which made
# df_train_seq / df_train_pad silent copies of the test data.
df_train_seq = tokenizer.texts_to_sequences(df_train["cleaned_text"])
df_test_seq = tokenizer.texts_to_sequences(df_test["cleaned_text"])

# Bring every sequence to exactly MAX_LEN ids, padding at the end ('post').
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

df_train_pad = pad_sequences(df_train_seq, maxlen=MAX_LEN, padding='post')
df_test_pad = pad_sequences(df_test_seq, maxlen=MAX_LEN, padding='post')


In [74]:
import numpy as np

# Dimensionality of the pre-trained vectors; it matches the "200d" file below.
EMBED_DIM = 200
glove_path = "glove.6B/glove.6B.200d.txt"

# Maps each GloVe token to its float32 vector.
embeddings_index = {}

# Each line of the file is "<token> <v1> <v2> ...", separated by whitespace:
# the first field is the token and the remaining fields are the coefficients.
with open(glove_path, encoding="utf8") as glove_file:
    for record in glove_file:
        fields = record.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype="float32")

print("Loaded word vectors:", len(embeddings_index))


Loaded word vectors: 400000


In [75]:
# Number of rows in the embedding table: the tokenizer's vocabulary plus one
# extra row for index 0, capped at MAX_WORDS.
word_index = tokenizer.word_index
num_words = min(MAX_WORDS, len(word_index) + 1)

# Rows start as all zeros and stay that way for tokens GloVe does not know.
embedding_matrix = np.zeros((num_words, EMBED_DIM))

# Copy the GloVe vector of every in-range token into its row.
for token, row in word_index.items():
    if row < MAX_WORDS and token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]


In [76]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Assemble the classifier layer by layer.
model = Sequential()

# Lookup table initialised from the GloVe matrix and kept frozen during training.
model.add(
    Embedding(
        input_dim=num_words,
        output_dim=EMBED_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    )
)

# Bidirectional LSTM encoder; only the final output of each direction is
# kept (return_sequences=False).
model.add(Bidirectional(LSTM(256, return_sequences=False)))
model.add(Dropout(0.5))

# Two fully connected ReLU layers, each followed by dropout.
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.4))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))

# Single sigmoid unit producing the probability of the positive class.
model.add(Dense(1, activation="sigmoid"))

# Binary classification setup: cross-entropy loss, Adam optimiser, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

model.summary()




In [67]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop training once validation loss has failed to improve for 3 consecutive
# epochs, then roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True,
)

In [77]:
# Train for at most 15 epochs in batches of 64. Keras sets aside 20% of the
# supplied training data for validation, and the early-stopping callback
# watches the metrics computed on that part.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,
)


Epoch 1/15
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 93ms/step - accuracy: 0.6775 - loss: 0.5894 - val_accuracy: 0.8038 - val_loss: 0.4238
Epoch 2/15
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 141ms/step - accuracy: 0.8017 - loss: 0.4636 - val_accuracy: 0.7939 - val_loss: 0.4630
Epoch 3/15
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 150ms/step - accuracy: 0.8186 - loss: 0.4376 - val_accuracy: 0.8112 - val_loss: 0.4137
Epoch 4/15
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 153ms/step - accuracy: 0.8260 - loss: 0.4248 - val_accuracy: 0.7989 - val_loss: 0.4423
Epoch 5/15
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 151ms/step - accuracy: 0.8414 - loss: 0.3817 - val_accuracy: 0.8046 - val_loss: 0.4384
Epoch 6/15
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 159ms/step - accuracy: 0.8381 - loss: 0.3781 - val_accuracy: 0.8186 - val_loss: 0.4227
Epoch 6: early stopping


In [78]:
# Predicted probability of the positive class for every held-out row.
test_preds_prob = model.predict(x=X_test_pad)
# Hard 0/1 labels: a row is positive only when its probability exceeds 0.5.
test_preds = np.greater(test_preds_prob, 0.5).astype(int)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step


In [82]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


# Headline scores on the held-out split, printed one per line.
for label, metric in (("Accuracy:", accuracy_score), ("F1 Score:", f1_score)):
    print(label, metric(y_test, test_preds))

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:\n")
print(classification_report(y_test, test_preds))


Accuracy: 0.8076165462902167
F1 Score: 0.7737451737451736

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       869
           1       0.78      0.77      0.77       654

    accuracy                           0.81      1523
   macro avg       0.80      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523



In [81]:
# Score the competition test set, threshold the probabilities at 0.5 and
# flatten the result to a 1-D array of 0/1 labels.
bilstm_pred = (model.predict(df_test_pad) > 0.5).astype(int).flatten()

# Pair every prediction with the id of its test row and write the
# submission file without the DataFrame index.
bl_submission = pd.DataFrame({'id': df_test['id'], 'target': bilstm_pred})
bl_submission.to_csv('bilstm_submission.csv', index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step
