#### Small implementation of SBERT (Sentence-BERT) on which future work can be built

# SBERT (Sentence-BERT) + Logistic Regression

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.calibration import calibration_curve
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC


In [None]:
# Accept either naming scheme for the validation split (*_valid or *_val)
if all(name in globals() for name in ('X_valid', 'y_valid')):
    X_val, y_val = X_valid, y_valid

# Fail early if any of the text / label splits is missing from the session
assert all(name in globals() for name in ('X_train', 'X_val', 'X_test')), "Missing text splits"
assert all(name in globals() for name in ('y_train', 'y_val', 'y_test')), "Missing label splits"

# Coerce every document to str before encoding
X_train = list(map(str, X_train))
X_val   = list(map(str, X_val))
X_test  = list(map(str, X_test))


In [None]:
# Pretrained sentence encoder; device selection is left to the library default
model_name = "sentence-transformers/all-MiniLM-L6-v2"
sbert = SentenceTransformer(model_name)

# Encoding batch size (tune to the available memory)
BATCH = 128

# Shared encode options: NumPy output with normalized embeddings
encode_kwargs = dict(
    batch_size=BATCH,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Encode the three splits in train / val / test order
X_train_emb, X_val_emb, X_test_emb = (
    sbert.encode(texts, **encode_kwargs) for texts in (X_train, X_val, X_test)
)

print("Embeddings shapes:", *(emb.shape for emb in (X_train_emb, X_val_emb, X_test_emb)))


In [None]:
###  Standardize features for LR/SVM

In [None]:
# Toggle: set to False to pass the raw embeddings through unchanged
scale_features = True

if not scale_features:
    X_train_emb_sc, X_val_emb_sc, X_test_emb_sc = X_train_emb, X_val_emb, X_test_emb
else:
    # with_mean=False: features are scaled but not centered.
    # The scaler is fit on the training split only and reused for val/test.
    scaler = StandardScaler(with_mean=False)
    X_train_emb_sc = scaler.fit_transform(X_train_emb)
    X_val_emb_sc, X_test_emb_sc = (
        scaler.transform(split) for split in (X_val_emb, X_test_emb)
    )


In [None]:
# PCA for dimensionality reduction: keep 100 components.
# The projection is fit on the training embeddings only, then applied to val/test.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_emb)
X_val_pca, X_test_pca = (pca.transform(split) for split in (X_val_emb, X_test_emb))

print("Shapes after PCA:", *(arr.shape for arr in (X_train_pca, X_val_pca, X_test_pca)))


# SBERT → Classifier
#1: Logistic Regression

In [None]:
# SBERT-Logistic Regression on the PCA-reduced embeddings.
# random_state is fixed because the 'saga' solver shuffles the data; this keeps
# runs reproducible, in line with the PCA and LinearSVC cells (both use 42).
lr_sbert = LogisticRegression(
    penalty='l2', C=1.0, class_weight='balanced',
    solver='saga', max_iter=1000, n_jobs=-1, random_state=42
)
lr_sbert.fit(X_train_pca, y_train)


In [None]:
# Predictions: take column 1 of predict_proba as the positive-class probability
# (assumes labels are encoded 0/1 so that lr_sbert.classes_[1] == 1 — TODO confirm)
# and threshold at 0.5 for hard labels.
val_pred_proba = lr_sbert.predict_proba(X_val_pca)[:, 1]
val_pred = (val_pred_proba >= 0.5).astype(int)

test_pred_proba = lr_sbert.predict_proba(X_test_pca)[:, 1]
test_pred = (test_pred_proba >= 0.5).astype(int)

# The model-comparison, curve-overlay and threshold-tuning cells look these
# scores up via globals() under the *_proba_lr_s names; without the aliases
# SBERT+PCA+LR is silently left out of those sections.
val_proba_lr_s = val_pred_proba
test_proba_lr_s = test_pred_proba

In [None]:
# Evaluation metrics
def summarize_metrics(y_true, y_pred, y_proba):
    """Compute headline binary-classification metrics.

    Args:
        y_true: Ground-truth labels; label 1 is treated as the positive class.
        y_pred: Hard label predictions.
        y_proba: Scores for the positive class; used only for ROC-AUC.

    Returns:
        dict with keys "accuracy", "precision", "recall", "f1" and "roc_auc",
        each a built-in float.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', pos_label=1
    )
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc_score(y_true, y_proba),
    }
    # Cast NumPy scalars to built-in floats so the dict prints and serializes cleanly
    return {name: float(value) for name, value in metrics.items()}

In [None]:
# Summarize validation first, then test, with the same helper
val_metrics, test_metrics = (
    summarize_metrics(labels, preds, probas)
    for labels, preds, probas in (
        (y_val, val_pred, val_pred_proba),
        (y_test, test_pred, test_pred_proba),
    )
)

In [None]:
# Report both metric dictionaries, validation first
for heading, metrics in (
    ("\nValidation metrics:", val_metrics),
    ("Test metrics:", test_metrics),
):
    print(heading, metrics)

In [None]:
# Per-class precision / recall / F1 for SBERT + LR on the test split
print("\nClassification Report (Test Set):")
lr_label_names = ["Non-Depressed", "Depressed"]
lr_test_report = classification_report(y_test, test_pred, target_names=lr_label_names)
print(lr_test_report)


In [None]:
# ROC curve for SBERT + Logistic Regression on the test split.
# The area is computed with roc_auc_score, the same call the SVM ROC cell uses.
fpr, tpr, _ = roc_curve(y_test, test_pred_proba)
roc_auc = roc_auc_score(y_test, test_pred_proba)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:0.2f})")
# Diagonal = chance-level reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve – SBERT + Logistic Regression")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix heatmap for SBERT + LR (rows = true class, columns = predicted class)
cm_labels = ["Non-Depressed", "Depressed"]
cm = confusion_matrix(y_test, test_pred)
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=cm_labels,
    yticklabels=cm_labels,
)
cm_ax.set(
    xlabel="Predicted",
    ylabel="True",
    title="Confusion Matrix – SBERT + Logistic Regression",
)
plt.show()

# SBERT with PCA
#2: Linear SVM

In [None]:
# Reuse the SBERT+PCA features if an earlier cell already built them;
# otherwise rebuild them with the same settings as the main encoding cell.
try:
    # Referencing a missing array raises NameError and triggers the rebuild
    for _arr in (X_train_pca, X_val_pca, X_test_pca):
        _ = _arr.shape
    print("Using existing SBERT+PCA arrays.")
except NameError:
    print("SBERT+PCA arrays not found. Creating them now (MiniLM + PCA=100).")
    from sentence_transformers import SentenceTransformer
    from sklearn.decomposition import PCA

    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    # normalize_embeddings / convert_to_numpy mirror the main encoding cell so
    # both code paths feed identical features to the downstream classifiers.
    _fallback_encode = dict(
        batch_size=128,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    X_train_emb = sbert_model.encode(list(map(str, X_train)), **_fallback_encode)
    X_val_emb   = sbert_model.encode(list(map(str, X_val)),   **_fallback_encode)
    X_test_emb  = sbert_model.encode(list(map(str, X_test)),  **_fallback_encode)

    # Fit the projection on train only, then apply it to val/test
    pca = PCA(n_components=100, random_state=42)
    X_train_pca = pca.fit_transform(X_train_emb)
    X_val_pca   = pca.transform(X_val_emb)
    X_test_pca  = pca.transform(X_test_emb)
    print("Shapes after PCA:", X_train_pca.shape, X_val_pca.shape, X_test_pca.shape)


In [None]:
# Linear SVM on the PCA-reduced SBERT embeddings (class-balanced, fixed seed)
svm_params = dict(C=1.0, class_weight='balanced', random_state=42)
svm_sbert_pca = LinearSVC(**svm_params)
svm_sbert_pca.fit(X_train_pca, y_train)


In [None]:
# Validation: signed decision values; a non-negative value maps to class 1
val_scores_svm_s = svm_sbert_pca.decision_function(X_val_pca)
val_pred_svm_s   = np.where(val_scores_svm_s >= 0, 1, 0)


In [None]:
# Validation report for the SVM: per-class metrics, then ranking metrics on the raw scores
print("=== SBERT + PCA + Linear SVM — Validation ===")
svm_val_label_names = ["Non-Depressed", "Depressed"]
print(classification_report(y_val, val_pred_svm_s, target_names=svm_val_label_names))
for tag, metric_fn in (
    ("ROC-AUC (val):", roc_auc_score),
    ("PR-AUC  (val):", average_precision_score),
):
    print(tag, metric_fn(y_val, val_scores_svm_s))


In [None]:
# Test: signed decision values; a non-negative value maps to class 1
test_scores_svm_s = svm_sbert_pca.decision_function(X_test_pca)
test_pred_svm_s   = np.where(test_scores_svm_s >= 0, 1, 0)


In [None]:
# Test report for the SVM: per-class metrics, then ranking metrics on the raw scores
print("\n=== SBERT + PCA + Linear SVM — Test ===")
svm_test_label_names = ["Non-Depressed", "Depressed"]
print(classification_report(y_test, test_pred_svm_s, target_names=svm_test_label_names))
for tag, metric_fn in (
    ("ROC-AUC (test):", roc_auc_score),
    ("PR-AUC  (test):", average_precision_score),
):
    print(tag, metric_fn(y_test, test_scores_svm_s))


In [None]:
# Confusion matrix for the SVM on the test split, drawn with sklearn's display helper
svm_cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    test_pred_svm_s,
    display_labels=["Non-Depressed", "Depressed"],
    cmap="Greens",
    values_format="d",
)
svm_cm_display.ax_.set_title("SBERT + PCA + Linear SVM — Confusion Matrix (Test)")
plt.tight_layout()
plt.show()

In [None]:
# ROC curve for the SVM on the test split, built from the raw decision values
fpr, tpr, _ = roc_curve(y_test, test_scores_svm_s)
svm_test_auc = roc_auc_score(y_test, test_scores_svm_s)

plt.figure(figsize=(5, 4))
plt.plot(fpr, tpr, label=f"AUC = {svm_test_auc:.3f}")
# Dashed diagonal = chance-level reference
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("SBERT + PCA + Linear SVM — ROC (Test)")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Precision-recall curve for the SVM on the test split, built from the raw decision values
prec, rec, _ = precision_recall_curve(y_test, test_scores_svm_s)
svm_test_ap = average_precision_score(y_test, test_scores_svm_s)

plt.figure(figsize=(5, 4))
plt.plot(rec, prec, label=f"AP = {svm_test_ap:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("SBERT + PCA + Linear SVM — Precision–Recall (Test)")
plt.legend()
plt.tight_layout()
plt.show()

## Global Model Comparison (Test Set)

In [None]:
# Accumulator for the comparison table: one dict per model
rows = []

def add_row(name, y_true, y_pred, scores_for_auc):
    """Append one model's metrics to the module-level ``rows`` list.

    Args:
        name: Display name stored under "Model".
        y_true: Ground-truth labels.
        y_pred: Hard label predictions, used for F1.
        scores_for_auc: Continuous scores, used for ROC-AUC and average precision.
    """
    record = {"Model": name}
    record["F1"] = f1_score(y_true, y_pred)
    record["ROC_AUC"] = roc_auc_score(y_true, scores_for_auc)
    record["PR_AUC"] = average_precision_score(y_true, scores_for_auc)
    rows.append(record)

# Models that may have been evaluated earlier in the session:
# (display name, globals() key of the test scores, decision threshold, globals() key of the labels).
# Probability outputs are cut at 0.5, SVM decision values at 0.
_comparison_sources = [
    ("TFIDF + LR",            'test_proba_lr',     0.5, 'y_test'),
    ("TFIDF + LinearSVM",     'test_scores_svm',   0.0, 'y_test'),
    ("BiLSTM",                'test_proba_bilstm', 0.5, 'yte'),
    ("SBERT+PCA + LR",        'test_proba_lr_s',   0.5, 'y_test'),
    ("SBERT+PCA + LinearSVM", 'test_scores_svm_s', 0.0, 'y_test'),
]

for _name, _scores_key, _threshold, _labels_key in _comparison_sources:
    # Skip a model unless BOTH its scores and its labels exist; the BiLSTM labels
    # live under a separate name ('yte') that is not defined in this notebook.
    if _scores_key in globals() and _labels_key in globals():
        _scores = globals()[_scores_key]
        add_row(_name, globals()[_labels_key], (_scores >= _threshold).astype(int), _scores)

# Explicit columns keep sort_values from raising KeyError when no model was found
cmp = (
    pd.DataFrame(rows, columns=["Model", "F1", "ROC_AUC", "PR_AUC"])
    .sort_values("F1", ascending=False)
    .round(4)
)
display(cmp)


In [None]:
# Horizontal bar chart of the number of table rows per model
model_counts = cmp.groupby('Model').size()
model_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right frame lines
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

In [None]:
# Model vs PR_AUC: one horizontal violin per model, height scaled to the model count
n_models = len(cmp['Model'].unique())
figsize = (12, 1.2 * n_models)
fig, ax = plt.subplots(figsize=figsize)
sns.violinplot(data=cmp, x='PR_AUC', y='Model', inner='stick', palette='Dark2', ax=ax)
# Remove all four frame lines
sns.despine(top=True, right=True, bottom=True, left=True)

In [None]:
# Model vs ROC_AUC: one horizontal violin per model, height scaled to the model count
n_models = len(cmp['Model'].unique())
figsize = (12, 1.2 * n_models)
fig, ax = plt.subplots(figsize=figsize)
sns.violinplot(data=cmp, x='ROC_AUC', y='Model', inner='stick', palette='Dark2', ax=ax)
# Remove all four frame lines
sns.despine(top=True, right=True, bottom=True, left=True)

In [None]:
# Model vs F1: one horizontal violin per model, height scaled to the model count
n_models = len(cmp['Model'].unique())
figsize = (12, 1.2 * n_models)
fig, ax = plt.subplots(figsize=figsize)
sns.violinplot(data=cmp, x='F1', y='Model', inner='stick', palette='Dark2', ax=ax)
# Remove all four frame lines
sns.despine(top=True, right=True, bottom=True, left=True)

In [None]:
def maybe_curves(label, y_true, scores, roc_color=None, pr_color=None):
    """Compute ROC and precision-recall curve points for one model.

    Args:
        label: Name carried through unchanged for use in plot legends.
        y_true: Ground-truth labels.
        scores: Continuous scores for the positive class.
        roc_color: Accepted but not used by this function.
        pr_color: Accepted but not used by this function.

    Returns:
        Tuple ``(label, fpr, tpr, precision, recall)``.
    """
    roc_fpr, roc_tpr, _roc_thresholds = roc_curve(y_true, scores)
    pr_precision, pr_recall, _pr_thresholds = precision_recall_curve(y_true, scores)
    return label, roc_fpr, roc_tpr, pr_precision, pr_recall

# Models that may have been evaluated earlier in the session:
# (legend label, globals() key of the test scores, globals() key of the labels).
_curve_sources = [
    ("TFIDF+LR",            'test_proba_lr',     'y_test'),
    ("TFIDF+LinearSVM",     'test_scores_svm',   'y_test'),
    ("BiLSTM",              'test_proba_bilstm', 'yte'),
    ("SBERT+PCA+LR",        'test_proba_lr_s',   'y_test'),
    ("SBERT+PCA+LinearSVM", 'test_scores_svm_s', 'y_test'),
]

# Build curves only for models whose scores AND labels both exist; the BiLSTM
# labels live under a separate name ('yte') that is not defined in this notebook.
curves = [
    maybe_curves(_label, globals()[_labels_key], globals()[_scores_key])
    for _label, _scores_key, _labels_key in _curve_sources
    if _scores_key in globals() and _labels_key in globals()
]

In [None]:
# ROC overlay: one line per collected model plus the chance diagonal
plt.figure(figsize=(5, 4))
for entry in curves:
    model_label, model_fpr, model_tpr = entry[:3]
    plt.plot(model_fpr, model_tpr, label=model_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curves (Test)")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# PR overlay: one precision-vs-recall line per collected model
plt.figure(figsize=(5, 4))
for entry in curves:
    model_label, model_prec, model_rec = entry[0], entry[3], entry[4]
    plt.plot(model_rec, model_prec, label=model_label)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curves (Test)")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Choose which model's scores feed threshold tuning and the calibration plot.
# SBERT+PCA+LinearSVM is not listed because it has no probabilities.
# Sources are tried in order of preference:
# (display name, globals() key of validation scores, globals() key of test scores).
_score_sources = [
    ("SBERT+PCA + Logistic Regression", 'val_proba_lr_s',   'test_proba_lr_s'),
    ("BiLSTM",                          'val_proba_bilstm', 'test_proba_bilstm'),
    ("TFIDF + Logistic Regression",     'val_proba_lr',     'test_proba_lr'),
]

for chosen_name, _val_key, _test_key in _score_sources:
    if _val_key in globals() and _test_key in globals():
        scores_val  = globals()[_val_key]
        scores_test = globals()[_test_key]
        break
else:
    # None of the sources is available: fail with an explanation instead of an
    # opaque NameError on the last fallback variable.
    raise NameError(
        "No probability scores found for threshold tuning; expected one of: "
        + ", ".join(f"{v}/{t}" for _, v, t in _score_sources)
    )

def tune_threshold(y_true, scores, min_recall=None):
    """Pick the score threshold with the highest F1 on the given data.

    Args:
        y_true: Ground-truth labels.
        scores: Continuous scores for the positive class.
        min_recall: Optional recall floor; operating points with lower recall
            are excluded from the search.

    Returns:
        Tuple ``(threshold, stats)`` where ``stats`` holds the "precision",
        "recall" and "f1" at the selected operating point, all built-in floats.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # Pad thresholds with a trailing 1.0 so it can be indexed like precision/recall
    thresholds = np.append(thresholds, 1.0)
    # The small epsilon guards against 0/0 when precision and recall are both zero
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-12)
    if min_recall is not None:
        # Disqualify operating points below the recall floor
        f1_scores = np.where(recall < min_recall, -1.0, f1_scores)
    best = np.argmax(f1_scores)
    best_stats = {
        "precision": float(precision[best]),
        "recall": float(recall[best]),
        "f1": float(f1_scores[best]),
    }
    return float(thresholds[best]), best_stats

# Tune the threshold on validation only (use min_recall=0.85 to enforce a recall floor)
best_thr, stats = tune_threshold(y_val, scores_val, min_recall=None)
print(f"Best threshold on validation for {chosen_name}:", round(best_thr,3), stats)

# Apply the validation-tuned threshold to the test scores
pred_test_tuned = (scores_test >= best_thr).astype(int)
print(f"\n{chosen_name} — Test @ tuned threshold:")
tuned_label_names = ["Non-Depressed", "Depressed"]
print(classification_report(y_test, pred_test_tuned, target_names=tuned_label_names))

# Calibration plot: observed positive rate vs predicted probability in 10 quantile bins
prob_true, prob_pred = calibration_curve(y_test, scores_test, n_bins=10, strategy='quantile')
calib_fig, calib_ax = plt.subplots(figsize=(5, 4))
calib_ax.plot(prob_pred, prob_true, marker='o', label=chosen_name)
calib_ax.plot([0, 1], [0, 1], 'k--', label='Perfect')
calib_ax.set_xlabel('Predicted probability')
calib_ax.set_ylabel('Observed frequency')
calib_ax.set_title('Calibration (Test)')
calib_ax.legend()
plt.tight_layout()
plt.show()
