# BiLSTM MODEL (Deep Learning Approach)
##### Using tokenized Reddit posts + optional GloVe embeddings


In [None]:
!pip install --quiet tensorflow

Setup & sanity checks

In [None]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, callbacks, optimizers
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
)
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Fail fast when the train/val/test splits are not present in this kernel.
assert all(name in globals() for name in ("X_train", "X_val", "X_test")), "Missing X_* splits"
assert all(name in globals() for name in ("y_train", "y_val", "y_test")), "Missing y_* splits"

# Keep runs reproducible: seed both NumPy and TensorFlow.
np.random.seed(42)
tf.random.set_seed(42)

print(f"Samples: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")


Samples: train=1980406, val=495102, test=618878


Tokenize & pad sequences

In [None]:
# Hyperparams
MAX_VOCAB   = 30000      # passed as num_words to the Tokenizer; also caps vocab_size
MAX_LEN     = 200        # fixed sequence length used by pad_sequences and the model input
OOV_TOKEN   = "<OOV>"    # passed as oov_token to the Tokenizer

In [None]:
# Fitting tokenizer on training text
# Only X_train is used for fitting, so the vocabulary is not derived from validation/test text.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(X_train)

In [None]:
# Converting to integer sequences
def to_seq(texts):
    """Encode `texts` as lists of integer token ids using the notebook-level `tokenizer`."""
    sequences = tokenizer.texts_to_sequences(texts)
    return sequences

# Encode the train, validation and test texts (in that order) with the same tokenizer.
Xtr_seq, Xva_seq, Xte_seq = [to_seq(split) for split in (X_train, X_val, X_test)]


In [None]:
# Pad (or cut) every sequence at its end so each row holds exactly MAX_LEN token ids.
Xtr_pad, Xva_pad, Xte_pad = [
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (Xtr_seq, Xva_seq, Xte_seq)
]

# Labels as integer NumPy arrays.
ytr, yva, yte = [np.asarray(labels).astype(int) for labels in (y_train, y_val, y_test)]

# Number of embedding rows: the vocabulary cap or (distinct words + 1), whichever is smaller.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_VOCAB)

In [None]:
# Sanity check: effective vocabulary size and the shape of the padded training matrix.
print("Vocab size used:", vocab_size, "| Sequence shape:", Xtr_pad.shape)

Vocab size used: 30000 | Sequence shape: (1980406, 200)


In [None]:
import os, zipfile, io, requests


In [None]:
# Embedding width; it also selects which glove.6B.<dim>d.txt file is looked up below.
EMB_DIM = 100
# NOTE(review): MAX_VOCAB and OOV_TOKEN repeat the values from the tokenization
# hyperparameter cell — confirm the duplication is intentional.
MAX_VOCAB   = 30000
OOV_TOKEN   = "<OOV>"

# Remote archive location. NOTE(review): plain http — consider whether https is available.
glove_zip_url = "http://nlp.stanford.edu/data/glove.6B.zip"
# Local directory, archive path and vectors-file path for GloVe.
glove_dir = "glove"
glove_zip = os.path.join(glove_dir, "glove.6B.zip")
glove_txt = os.path.join(glove_dir, f"glove.6B.{EMB_DIM}d.txt")

# Create the target directory up front; no error if it already exists.
os.makedirs(glove_dir, exist_ok=True)

def ensure_glove():
    """Make sure the GloVe vectors file `glove_txt` exists locally (best effort).

    If `glove_txt` is already present nothing happens. Otherwise the archive at
    `glove_zip_url` is streamed to `glove_zip` (unless a previously downloaded
    archive is already there) and extracted into `glove_dir`.

    The function never raises for download/extraction problems: it prints the
    error and returns, so callers must check `os.path.exists(glove_txt)`.
    """
    if os.path.exists(glove_txt):
        return

    part_path = glove_zip + ".part"
    try:
        if not os.path.exists(glove_zip):
            print("Downloading GloVe embeddings...")
            # Stream to disk instead of holding the whole archive in memory.
            with requests.get(glove_zip_url, stream=True, timeout=60) as r:
                # Without this an HTTP error page would be handed to ZipFile.
                r.raise_for_status()
                with open(part_path, "wb") as fh:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        fh.write(chunk)
            # Only a fully written download is promoted to the cached archive path.
            os.replace(part_path, glove_zip)

        with zipfile.ZipFile(glove_zip) as z:
            z.extractall(glove_dir)
        print("GloVe downloaded and extracted.")
    except zipfile.BadZipFile as e:
        # A corrupt cached archive would fail on every rerun; drop it so the
        # next call downloads a fresh copy.
        if os.path.exists(glove_zip):
            os.remove(glove_zip)
        print("Could not download GloVe:", e)
    except Exception as e:
        print("Could not download GloVe:", e)

# Best effort: on failure this only prints a message; later cells check os.path.exists(glove_txt).
ensure_glove()

Downloading GloVe embeddings...
GloVe downloaded and extracted.


In [None]:
# Fitting tokenizer on training text
# NOTE(review): this repeats the earlier tokenizer fit and vocab_size computation with the
# same arguments, re-fitting on all of X_train — confirm whether this cell can be dropped.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(X_train)
vocab_size = min(MAX_VOCAB, len(tokenizer.word_index) + 1)


In [None]:
# Build embedding matrix
# Start from random normal rows (mean 0, std 0.6); rows for words found in GloVe are
# overwritten below, every other row keeps its random initialisation.
# NOTE(review): row 0 (presumably the padding id produced by pad_sequences) also keeps a
# random vector — confirm this is intended.
embedding_matrix = np.random.normal(0, 0.6, size=(vocab_size, EMB_DIM)).astype(np.float32)
found = 0  # number of vocabulary words that received a GloVe vector

if os.path.exists(glove_txt):
    print("Loading embeddings from:", glove_txt)
    # word -> float32 vector, for every line of the GloVe file
    embeddings_index = {}
    with open(glove_txt, encoding="utf-8") as f:
        for line in f:
            # Each line is parsed as: the word, then space-separated coefficients.
            values = line.rstrip().split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    for word, idx in tokenizer.word_index.items():
        # The matrix only has vocab_size rows; skip indices beyond it.
        if idx >= vocab_size:
            continue
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec
            found += 1
    print(f"Loaded {found} / {vocab_size} embeddings.")
else:
    # No GloVe file on disk: the matrix stays fully random.
    print("GloVe not available; using random trainable embeddings.")

Loading embeddings from: glove/glove.6B.100d.txt
Loaded 26996 / 30000 embeddings.


### Building the BiLSTM model

In [None]:
def build_bilstm(vocab_size, emb_dim=EMB_DIM, max_len=MAX_LEN, use_pretrained=True):
    """Build and compile a bidirectional-LSTM binary classifier.

    Architecture: Embedding -> SpatialDropout1D(0.2) -> BiLSTM(128, return_sequences=True)
    -> GlobalMaxPooling1D -> Dropout(0.3) -> Dense(64, relu) -> Dropout(0.3)
    -> Dense(1, sigmoid).

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: width of each embedding vector.
        max_len: fixed length of the input token-id sequences.
        use_pretrained: when True and `glove_txt` exists on disk, the embedding
            layer is initialised from the notebook-level `embedding_matrix` and
            frozen; otherwise a trainable, default-initialised embedding is used.

    Returns:
        A Keras model compiled with binary cross-entropy, Adam (lr=2e-3) and
        the accuracy metric.

    Raises:
        ValueError: if pretrained weights are requested but `embedding_matrix`
            does not have shape (vocab_size, emb_dim).
    """
    inp = layers.Input(shape=(max_len,), dtype="int32")

    # `input_length` is intentionally not passed: the Input layer already fixes
    # the sequence length, and recent Keras releases deprecate that argument.
    emb_kwargs = dict(input_dim=vocab_size, output_dim=emb_dim, name="embeddings")
    if use_pretrained and os.path.exists(glove_txt):
        expected_shape = (vocab_size, emb_dim)
        if tuple(embedding_matrix.shape) != expected_shape:
            raise ValueError(
                f"embedding_matrix has shape {tuple(embedding_matrix.shape)}, "
                f"expected {expected_shape}"
            )
        # Pretrained vectors are kept frozen during training.
        emb_kwargs.update(weights=[embedding_matrix], trainable=False)
    emb = layers.Embedding(**emb_kwargs)(inp)

    x = layers.SpatialDropout1D(0.2)(emb)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    # Max over the time axis turns the per-step outputs into one vector per sample.
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation="sigmoid")(x)

    model = models.Model(inp, out)
    opt = optimizers.Adam(learning_rate=2e-3)
    model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])
    return model

In [None]:
# Instantiate the model with the pretrained-embedding option enabled and print its layer summary.
bilstm = build_bilstm(vocab_size, EMB_DIM, MAX_LEN, use_pretrained=True)
bilstm.summary()


In [None]:
# Balanced class weights from the training labels, keyed by integer class id.
classes = np.unique(ytr)
class_wts = dict(
    zip(
        (int(label) for label in classes),
        compute_class_weight(class_weight="balanced", classes=classes, y=ytr),
    )
)

In [None]:
# Show the per-class weights that are passed to fit() as class_weight.
print("Class weights:", class_wts)

Class weights: {0: np.float64(1.0), 1: np.float64(1.0)}


In [None]:
# Early stopping + best model checkpoint
# Keras logs validation metrics as "val_<metric name>". The F1 metric function used
# at compile time is named `f1_m`, so the logged key is "val_f1_m" (as the training
# log shows). Monitoring "val_f1" matches no logged key, so neither callback would
# ever stop training or save a checkpoint.
# NOTE(review): these callbacks are only as good as the values f1_m reports — check
# that the logged F1 looks plausible before relying on them.
ckpt_path = "bilstm_best.h5"
cbs = [
    callbacks.EarlyStopping(monitor="val_f1_m", mode="max", patience=3, restore_best_weights=True),
    callbacks.ModelCheckpoint(ckpt_path, monitor="val_f1_m", mode="max", save_best_only=True, verbose=1)
]

In [None]:
# Custom F1 metric
def f1_m(y_true, y_pred, thresh=0.5):
    """F1 score of the positive class for one batch of binary predictions.

    Args:
        y_true: ground-truth labels (0/1), any shape.
        y_pred: predicted probabilities with the same number of elements.
        thresh: predictions strictly above this value count as positive.

    Returns:
        Scalar float32 tensor with the F1 score (0 when there are no true positives).

    Both tensors are flattened before they are combined. Labels can arrive as
    (batch,) while a Dense(1) output is (batch, 1); multiplying those shapes
    broadcasts to (batch, batch), which makes the counts meaningless and pins
    the score near the positive-class rate instead of the real F1.

    NOTE(review): used as a Keras metric this is presumably averaged over
    batches, so it approximates rather than equals the dataset-level F1.
    """
    y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
    y_pred = tf.reshape(tf.cast(y_pred > thresh, tf.float32), [-1])

    tp = tf.reduce_sum(y_true * y_pred)
    fp = tf.reduce_sum((1.0 - y_true) * y_pred)
    fn = tf.reduce_sum(y_true * (1.0 - y_pred))

    # The small constants keep the divisions defined when a count is zero.
    precision = tp / (tp + fp + 1e-12)
    recall    = tp / (tp + fn + 1e-12)
    return 2*precision*recall/(precision+recall+1e-12)

# Re-compile so the custom F1 metric is tracked alongside accuracy; loss and
# learning rate are the same as in build_bilstm.
bilstm.compile(
    loss="binary_crossentropy",
    optimizer=optimizers.Adam(learning_rate=2e-3),
    metrics=["accuracy", f1_m]
)

# Train for up to 8 epochs with validation after each epoch, applying the class
# weights and the early-stopping / checkpoint callbacks defined above.
history = bilstm.fit(
    Xtr_pad, ytr,
    validation_data=(Xva_pad, yva),
    epochs=8,
    batch_size=128,
    class_weight=class_wts,
    callbacks=cbs,
    verbose=1
)

Epoch 1/8
[1m15471/15472[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.9062 - f1_m: 0.5027 - loss: 0.2393



[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 20ms/step - accuracy: 0.9062 - f1_m: 0.5027 - loss: 0.2393 - val_accuracy: 0.9329 - val_f1_m: 0.5099 - val_loss: 0.1747
Epoch 2/8
[1m15470/15472[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.9279 - f1_m: 0.5035 - loss: 0.1888



[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 20ms/step - accuracy: 0.9279 - f1_m: 0.5035 - loss: 0.1888 - val_accuracy: 0.9378 - val_f1_m: 0.5067 - val_loss: 0.1627
Epoch 3/8
[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9317 - f1_m: 0.5041 - loss: 0.1797



[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 20ms/step - accuracy: 0.9317 - f1_m: 0.5041 - loss: 0.1797 - val_accuracy: 0.9394 - val_f1_m: 0.5074 - val_loss: 0.1597
Epoch 4/8
[1m15470/15472[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.9333 - f1_m: 0.5045 - loss: 0.1752



[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m314s[0m 20ms/step - accuracy: 0.9333 - f1_m: 0.5045 - loss: 0.1752 - val_accuracy: 0.9407 - val_f1_m: 0.5079 - val_loss: 0.1552
Epoch 5/8
[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9347 - f1_m: 0.5047 - loss: 0.1718



[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 20ms/step - accuracy: 0.9347 - f1_m: 0.5047 - loss: 0.1718 - val_accuracy: 0.9412 - val_f1_m: 0.5077 - val_loss: 0.1532
Epoch 6/8
[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9356 - f1_m: 0.5047 - loss: 0.1695



[1m15472/15472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 20ms/step - accuracy: 0.9356 - f1_m: 0.5047 - loss: 0.1695 - val_accuracy: 0.9423 - val_f1_m: 0.5051 - val_loss: 0.1521
Epoch 7/8
[1m14639/15472[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m15s[0m 18ms/step - accuracy: 0.9361 - f1_m: 0.5049 - loss: 0.1679

In [None]:
def plot_history(h, metrics=("accuracy","f1_m")):
    """Plot per-epoch training curves, one figure per metric.

    Args:
        h: object with a `history` dict mapping metric names to per-epoch
            value lists (e.g. the value returned by `model.fit`).
        metrics: names of the training metrics to plot.

    The matching "val_<metric>" curve is drawn when it was recorded; a history
    produced without validation data is plotted with the training curve only.

    Raises:
        KeyError: if a requested metric is missing from `h.history`.
    """
    for m in metrics:
        plt.figure(figsize=(5,3.5))
        plt.plot(h.history[m], label=f"train_{m}")
        val_key = f"val_{m}"
        if val_key in h.history:
            plt.plot(h.history[val_key], label=val_key)
        plt.title(f"Training history — {m}")
        plt.xlabel("Epoch"); plt.ylabel(m)
        plt.legend(); plt.tight_layout(); plt.show()

# Plot the default metrics (accuracy and f1_m) from the fit history.
plot_history(history)


In [None]:
# Sigmoid outputs for the validation and test sets, flattened to 1-D arrays.
val_proba_bilstm, test_proba_bilstm = [
    bilstm.predict(split, batch_size=512).ravel() for split in (Xva_pad, Xte_pad)
]

In [None]:
# Hard 0/1 predictions at the default 0.5 cut-off (scores equal to 0.5 count as positive).
val_pred_bilstm, test_pred_bilstm = [
    (proba >= 0.5).astype(int) for proba in (val_proba_bilstm, test_proba_bilstm)
]

In [None]:
# Reports
# Validation metrics for the 0.5-threshold predictions; ROC-AUC uses the raw probabilities.
print("BiLSTM — Validation")
print(classification_report(yva, val_pred_bilstm, target_names=["Non-Depressed","Depressed"]))
print("ROC-AUC (val):", roc_auc_score(yva, val_proba_bilstm))

In [None]:
# Test metrics for the 0.5-threshold predictions; ROC-AUC uses the raw probabilities.
print("\nBiLSTM — Test")
print(classification_report(yte, test_pred_bilstm, target_names=["Non-Depressed","Depressed"]))
print("ROC-AUC (test):", roc_auc_score(yte, test_proba_bilstm))


In [None]:
# Confusion matrix
# Built from the test labels and the 0.5-threshold test predictions; cells show integer counts.
ConfusionMatrixDisplay.from_predictions(
    yte, test_pred_bilstm, display_labels=["Non-Depressed","Depressed"],
    cmap="magma", values_format="d"
)
plt.title("BiLSTM — Confusion Matrix")
plt.tight_layout(); plt.show()


In [None]:
# ROC
# Curve computed from the test labels and raw test probabilities; the legend shows the AUC.
fpr, tpr, _ = roc_curve(yte, test_proba_bilstm)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(yte, test_proba_bilstm):.3f}")
# Dashed black diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0,1],[0,1],'k--'); plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title("BiLSTM — ROC"); plt.legend(); plt.tight_layout(); plt.show()


In [None]:
# Precision–Recall
# Curve computed from the test labels and raw test probabilities; the legend shows average precision.
prec, rec, _ = precision_recall_curve(yte, test_proba_bilstm)
plt.figure(figsize=(5,4))
plt.plot(rec, prec, label=f"AP = {average_precision_score(yte, test_proba_bilstm):.3f}")
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title("BiLSTM — Precision–Recall"); plt.legend(); plt.tight_layout(); plt.show()


In [None]:
def best_threshold_by_f1(y_true, scores):
    """Find the score threshold that maximises F1 on the given labels.

    Args:
        y_true: ground-truth binary labels.
        scores: predicted scores/probabilities for the positive class.

    Returns:
        (threshold, stats) where `threshold` is a float and `stats` is a dict
        with float "precision", "recall" and "f1" at that threshold.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, scores)
    # Pad the thresholds with 1.0 so they can be indexed with the same
    # position as the precision/recall arrays.
    thresholds = np.append(thresholds, 1.0)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-12)
    best = np.argmax(f1_scores)
    stats = {
        "precision": float(precisions[best]),
        "recall": float(recalls[best]),
        "f1": float(f1_scores[best]),
    }
    return float(thresholds[best]), stats

# Tune the threshold on the validation set only, then apply it unchanged to the test set.
thr_bilstm, stats_bilstm = best_threshold_by_f1(yva, val_proba_bilstm)
print("Best BiLSTM threshold on validation:", round(thr_bilstm,3), stats_bilstm)

test_pred_bilstm_tuned = (test_proba_bilstm >= thr_bilstm).astype(int)
print("\nBiLSTM (Test) with tuned threshold:")
print(classification_report(yte, test_pred_bilstm_tuned, target_names=["Non-Depressed","Depressed"]))


In [None]:
# Persist the trained model to an HDF5 file in the working directory.
# NOTE(review): the model is compiled with the custom metric f1_m, so loading it back
# presumably needs custom_objects={"f1_m": f1_m} or compile=False — confirm.
bilstm.save("bilstm_model.h5")
print("Saved bilstm_model.h5")


## Model comparison table

In [None]:
# One row per model: test F1 at the model's default decision threshold plus test ROC-AUC.
# The LogReg and SVM rows are added only if their score arrays exist in this kernel.
# NOTE(review): f1_score is not imported in the visible cells — confirm an earlier cell imports it.
rows = []

# LR
if 'test_proba_lr' in globals():
    rows.append(dict(
        Model="LogReg",
        F1_test=f1_score(y_test, (test_proba_lr>=0.5)),
        ROC_AUC_test=roc_auc_score(y_test, test_proba_lr),
    ))

# SVM
if 'test_scores_svm' in globals():
    rows.append(dict(
        Model="Linear SVM",
        F1_test=f1_score(y_test, (test_scores_svm>=0)),
        ROC_AUC_test=roc_auc_score(y_test, test_scores_svm),
    ))

# BiLSTM
rows.append(dict(
    Model="BiLSTM",
    F1_test=f1_score(yte, test_pred_bilstm),
    ROC_AUC_test=roc_auc_score(yte, test_proba_bilstm),
))

# Best F1 first, rounded to four decimals for display.
pd.DataFrame(rows).sort_values(by="F1_test", ascending=False).round(4)


#### Small implementation of SBERT (Sentence-BERT) on which future work can be built