In [None]:
import json
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Load JSONL dataset.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped so json.loads is never handed an empty string.
with open("data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]


def _is_english(entry):
    """Return True when the record's "language" value is a string starting with "en".

    The match is case-insensitive. Records that are not dicts, that lack a
    "language" key, or whose value is not a string return False instead of
    raising KeyError/AttributeError.
    """
    if not isinstance(entry, dict):
        return False
    language = entry.get("language")
    return isinstance(language, str) and language.lower().startswith("en")


# Filter English data robustly: malformed records are dropped, not fatal.
english_data = [entry for entry in data if _is_english(entry)]
print(f"Filtered {len(english_data)} English entries.")

# Fail early with a clear message before any downstream step sees empty input.
if not english_data:
    raise ValueError("No English entries found. Check your language filtering step.")

texts = [entry["text"] for entry in english_data]
labels = [entry["labels"] for entry in english_data]

#Label Encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)
categorical_labels = to_categorical(encoded_labels, num_classes=num_classes)

# Tokenize and pad.
# The tokenizer is fitted on every English text; "<OOV>" is its
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=20000)
tokenizer.fit_on_texts(texts)

# Each text becomes a list of word ids, then is truncated or padded at the
# end ("post") so that every row has exactly `maxlen` entries.
maxlen = 100
sequences = tokenizer.texts_to_sequences(texts)
padding_options = {"maxlen": maxlen, "padding": "post", "truncating": "post"}
padded_sequences = pad_sequences(sequences, **padding_options)

# Train-test split: hold out 20% of the rows, with a fixed seed so the split
# is the same on every run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, categorical_labels, **split_options
)

# Load GloVe embeddings with auto-dimension detection.
# Each line of the file is expected to be "<word> <float> <float> ...".
embedding_index = {}
glove_path = "glove.6B.100d.txt"

with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        if len(values) < 2:
            # Blank line or a word with no vector attached.
            continue
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype="float32")
        except ValueError:
            # A non-numeric field (e.g. a token that itself contained
            # whitespace) means the row cannot be parsed; skip it.
            continue
        embedding_index[word] = vector

# Without this guard an empty or unparsable file would surface as a bare
# StopIteration/IndexError with no hint about the cause.
if not embedding_index:
    raise ValueError(f"No word vectors could be parsed from {glove_path!r}.")

# Detect the dimension by majority vote rather than trusting the first row:
# a single malformed first row would otherwise cause every well-formed
# vector to be rejected as "mismatched" when the embedding matrix is filled.
dimension_counts = Counter(len(vector) for vector in embedding_index.values())
embedding_dim = dimension_counts.most_common(1)[0][0]
print(f"Loaded {len(embedding_index)} word vectors of dimension {embedding_dim}")

# Build the embedding matrix: row i holds the pretrained vector for the word
# whose tokenizer id is i. Rows with no usable pretrained vector stay all-zero.
word_index = tokenizer.word_index

# Take the vocabulary cap from the tokenizer itself instead of repeating the
# literal, so the matrix size cannot drift from the ids the tokenizer emits.
# A tokenizer created without num_words has no cap, so fall back to the full
# vocabulary (+1 because id 0 is never assigned to a word).
vocab_limit = tokenizer.num_words or (len(word_index) + 1)
num_words = min(vocab_limit, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

skipped = 0
for word, i in word_index.items():
    if i >= num_words:
        # Ids at or above the cap have no row in the matrix.
        continue
    vector = embedding_index.get(word)
    if vector is not None and len(vector) == embedding_dim:
        embedding_matrix[i] = vector
    else:
        # Word missing from the pretrained file, or its vector has the wrong length.
        skipped += 1

print(f"Skipped {skipped} words due to missing/mismatched vectors.")

# Build the improved LSTM model, one layer at a time.
model = Sequential()

# Embedding layer initialised from embedding_matrix and left trainable.
model.add(
    Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=True,
    )
)
# Bidirectional LSTM that returns only its final output (no per-step sequence).
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
# One softmax unit per class.
model.add(Dense(num_classes, activation="softmax"))

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

# Add early stopping and train.
# Training stops once val_accuracy has not improved for 3 epochs, and the
# weights from the best epoch are restored.
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_accuracy',
)

# 10% of the training rows are set aside as the validation set.
training_options = {
    "validation_split": 0.1,
    "epochs": 25,
    "batch_size": 64,
    "callbacks": [early_stop],
}
history = model.fit(X_train, y_train, **training_options)

# Evaluate model on the held-out test set.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nFinal Test Accuracy: {accuracy:.4f}")

# Sample predictions: convert predicted scores and one-hot targets back to class ids.
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

print("\n--- Sample Predictions ---")
# Cap the sample count at the test-set size so a tiny dataset cannot raise IndexError.
for i in range(min(5, len(X_test))):
    # Drop the padding id 0 before decoding. Id 0 is not a word, and because
    # the tokenizer has an oov_token, sequences_to_texts would otherwise
    # render every padding position as a spurious "<OOV>".
    token_ids = [int(token_id) for token_id in X_test[i] if token_id != 0]
    # The decoded text is rebuilt from ids, so it reflects the tokenized,
    # truncated input rather than the original raw string.
    decoded_text = tokenizer.sequences_to_texts([token_ids])[0]
    actual = label_encoder.inverse_transform([y_true_labels[i]])[0]
    predicted = label_encoder.inverse_transform([y_pred_labels[i]])[0]
    print(f"\nText:\n{decoded_text[:300]}...")
    print(f"Actual: {actual}")
    print(f"Predicted: {predicted}")


Filtered 1237 English entries.
Loaded 400000 word vectors of dimension 100
Skipped 839 words due to missing/mismatched vectors.




Epoch 1/25
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 459ms/step - accuracy: 0.1677 - loss: 2.1444 - val_accuracy: 0.3333 - val_loss: 2.0332
Epoch 2/25
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 414ms/step - accuracy: 0.2669 - loss: 1.9691 - val_accuracy: 0.3636 - val_loss: 1.8325
Epoch 3/25
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 420ms/step - accuracy: 0.3753 - loss: 1.7308 - val_accuracy: 0.4242 - val_loss: 1.6992
Epoch 4/25
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 519ms/step - accuracy: 0.5031 - loss: 1.4593 - val_accuracy: 0.4040 - val_loss: 1.5111
Epoch 5/25
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 539ms/step - accuracy: 0.5407 - loss: 1.3128 - val_accuracy: 0.4545 - val_loss: 1.4636
Epoch 6/25
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 412ms/step - accuracy: 0.6373 - loss: 1.0872 - val_accuracy: 0.4646 - val_loss: 1.4319
Epoch 7/25
[1m14/14[0