<a href="https://colab.research.google.com/github/sgh4000/NLPSafeAI/blob/sh%2Finitial/Lab1and2_newdataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Using this dataset - https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned?resource=download

# Following the Kaggle download instructions
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

# Name of the file inside the Kaggle dataset that should be loaded
file_path = "depression_dataset_reddit_cleaned.csv"

# Load the latest version of the dataset as a pandas DataFrame.
# Additional arguments such as sql_query or pandas_kwargs are described in the
# documentation:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "infamouscoder/depression-reddit-cleaned",
    file_path,
)

# Do not truncate cell text and do not limit the number of rows shown
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

# Preview the first five texts
clean_text = df['clean_text']
display(clean_text.head(5))

In [None]:
# Investigate the data

# Count the records per class: is_depression = 1 means depression,
# is_depression = 0 means no depression

depression_count = df['is_depression'].value_counts()
print(depression_count)

In [None]:
#####################################
### Import the relevant libraries ###
#####################################

from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import time
import pandas as pd
from sklearn.model_selection import train_test_split
import requests
from sentence_transformers import SentenceTransformer
from tensorflow.keras.utils import to_categorical
from sklearn.decomposition import PCA

In [None]:
# Stratified 80/20 train/test split on the label column; the fixed
# random_state keeps the partition identical between runs
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["is_depression"],
)

# Raw texts (kept as plain lists for later analysis) and labels of each partition
X_train_raw, X_test_raw = (part["clean_text"].tolist() for part in (df_train, df_test))
y_train, y_test = (part["is_depression"].values for part in (df_train, df_test))

In [None]:
########################################################
### I want to do some analysis on the PCA embeddings ###
########################################################

# Also, I want to investigate how different number of components has an effect

# pca.explained_variance_ratio_ is a vector holding the fraction of variance explained by each component
# So [0.11,0.095,0.085] means that the first component explains 11% of the variance, and so on
# explained_variance_ratio_.cumsum() gives the running totals, so its final value is the
# total variance explained by all retained components

import matplotlib.pyplot as plt

encoding_model = "all-MiniLM-L6-v2"
encoder = SentenceTransformer(encoding_model)

# Encode once. The sentence embeddings do not depend on the number of PCA
# components, so re-encoding the corpus for every setting only costs time.
X_train = encoder.encode(X_train_raw, show_progress_bar=False)
X_test = encoder.encode(X_test_raw, show_progress_bar=False)

# PCA is fitted on the training embeddings only
data = X_train

# Component counts to compare; append e.g. 70, 100, 200 to widen the sweep
component_counts = [30, 50]

# Total explained variance ratio, keyed by number of components
cum_sum_by_n = {}

for n_components in component_counts:
    data_pca = PCA(n_components=n_components).fit(data)

    print(f"---- n = {n_components} ----")

    # Last entry of the running total = variance explained by all components
    cum_sum_by_n[n_components] = data_pca.explained_variance_ratio_.cumsum()[-1]

    print(f"Cum sum {n_components} is {cum_sum_by_n[n_components]:.3f}")

# Individual names kept for anything that still reads them
# (requires 30 and 50 to stay in component_counts)
cum_sum_30 = cum_sum_by_n[30]
cum_sum_50 = cum_sum_by_n[50]

# Optional: uncomment to plot the sweep once more component counts are added
# plt.plot(list(cum_sum_by_n), list(cum_sum_by_n.values()), marker='o')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Explained Variance Ratio')
# plt.title('Cumulative Explained Variance Ratio by Principal Components')
# plt.ylim(bottom=0)
# plt.show()





In [None]:
##################################################
### Do the text embeddings needed for training ###
##################################################

encoding_model = "all-MiniLM-L6-v2"
encoder = SentenceTransformer(encoding_model)
n_components = 30
batch_size = 64

# Sentence embeddings for both partitions
X_train = encoder.encode(X_train_raw, show_progress_bar=False)
X_test = encoder.encode(X_test_raw, show_progress_bar=False)

# Fit PCA on the training embeddings only, so the test set does not influence
# the projection. random_state pins the result when scikit-learn selects its
# randomized solver, matching the seeds already used for the split and the
# weight initializer.
data = X_train
data_pca = PCA(n_components=n_components, random_state=42).fit(data)

# From here on X_train / X_test hold the PCA-reduced features
X_train = data_pca.transform(X_train)
X_test = data_pca.transform(X_test)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# Seeded shuffle so the batch order is repeatable between runs
train_dataset = train_dataset.shuffle(buffer_size=1024, seed=42).batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [None]:
###############################################################################################################
### Define a simple model - credit for this spec https://github.com/Tgl70/DAIR-course-NLP/blob/main/main.py ###
###############################################################################################################

# Sanity check: show the shape and the first ten labels of each partition
print(y_train.shape, y_train[:10])
print(y_test.shape, y_test[:10])

def get_model(input_size):
    """Build the (uncompiled) dense classifier and print its layer summary.

    Args:
        input_size: number of input features per sample.

    Returns:
        A tf.keras.Sequential model with one 128-unit ReLU hidden layer and a
        2-unit softmax output layer, i.e. it outputs class probabilities.
    """
    # Fixed seed so the initial weights are the same on every run
    initializer = tf.keras.initializers.GlorotUniform(seed=42)
    model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(input_size,), name='input_features'),
            tf.keras.layers.Dense(128, activation='relu', kernel_initializer=initializer, name='dense_1'),
            tf.keras.layers.Dense(2, activation='softmax', kernel_initializer=initializer, name='output_layer')
        ])
    # summary() prints the table itself and returns None; wrapping it in
    # print() added a stray "None" line to the output
    model.summary()
    return model

In [None]:
##########################
### Set some variables ###
##########################

input_size = X_train.shape[1]  # number of feature columns in X_train
batch_size = 64  # same value as set in the embedding cell
epochs = 6  # passed to model_base.fit
n_classes = 2
# NOTE(review): epsilon, alpha and num_iter are not read by any cell visible
# here — presumably settings for adversarial training; confirm or remove
epsilon = 0.3
alpha = 0.1
num_iter = 10

In [None]:
########################################################
### Train the base model (simple and no adversarial) ###
########################################################

# Build the network and show its layer table
model_base = get_model(input_size)
model_base.summary()

# Integer labels with a softmax output layer: sparse categorical
# cross-entropy computed on probabilities (from_logits=False)
model_base.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
    optimizer=tf.keras.optimizers.Adam(),
)

# The test dataset is supplied as validation data for per-epoch reporting
model_base.fit(train_dataset, epochs=epochs, validation_data=test_dataset, verbose=1)

In [None]:
###############################################################
### Define the precision, recall, F1 and ROC curve function ###
###############################################################

# Taken from lab https://github.com/KatyaKom/DAIR/blob/main/Lab2/CV.ipynb

def print_metrics(model, x, y, c):
    """Print macro precision/recall/F1 and plot one ROC curve per class.

    Args:
        model: object whose predict(x) returns one score column per class.
        x: inputs handed to model.predict.
        y: integer class labels belonging to x.
        c: number of classes.
    """
    # Per-class scores; the predicted label is the highest-scoring column
    scores = model.predict(x)
    hard_labels = np.argmax(scores, axis=1)

    # Macro-averaged classification metrics
    precision = precision_score(y, hard_labels, average='macro')
    recall = recall_score(y, hard_labels, average='macro')
    f1 = f1_score(y, hard_labels, average='macro')

    print(f'Precision (macro): {precision:.4f}')
    print(f'Recall (macro): {recall:.4f}')
    print(f'F1-score (macro): {f1:.4f}')

    # One-hot labels so that every class can be scored one-vs-rest
    y_onehot = to_categorical(y, num_classes=c)

    # (class index, false positive rates, true positive rates, AUC) per class
    curves = []
    for class_idx in range(c):
        truth, class_scores = y_onehot[:, class_idx], scores[:, class_idx]
        fpr, tpr, _ = roc_curve(truth, class_scores)
        curves.append((class_idx, fpr, tpr, roc_auc_score(truth, class_scores)))

    plt.figure(figsize=(6, 5))
    for class_idx, fpr, tpr, auc in curves:
        plt.plot(fpr, tpr, label=f'Class {class_idx} (AUC = {auc:.2f})')

    # Dashed diagonal as the chance-level reference
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) for Each Class')
    plt.legend(loc='lower right')
    plt.show()

# Report the metrics of the base model on the test features (2 = number of classes)
print_metrics(model_base, X_test, y_test, 2)

In [None]:
#######################################
### Calculate the loss and accuracy ###
#######################################

# evaluate() returns the compiled loss followed by the compiled metrics;
# model_base was compiled with a single metric named "accuracy"
loss, acc = model_base.evaluate(X_test, y_test, verbose=0)

print(f"Base model loss: {loss:.3f}")
print(f"Base model accuracy: {acc:.3f}")


In [None]:
##################
### LIME Setup ###
##################

%pip install Lime
from lime.lime_text import LimeTextExplainer

In [None]:
####################
### LIME Explain ###
####################

# Local Interpretable Model-Agnostic Explanations
# LIME produces local explanations: an explanation for one query is not a global explanation of the model

queries = X_test_raw[:2]  # the first two test texts are explained

encoder = SentenceTransformer("all-MiniLM-L6-v2")

class_names = ["non-depression", "depression"]  # index 0 / index 1, matching is_depression


def make_predict_fn(model, encoder, pca):
    """Build the text -> class-probability function that LIME calls.

    Args:
        model: classifier whose predict() returns class probabilities, one
            column per class (the models built by get_model end in softmax).
        encoder: object with encode(texts, show_progress_bar=...) returning
            one embedding row per text.
        pca: fitted transformer whose transform() maps embeddings to the
            features the model was trained on.

    Returns:
        predict_fn(texts) -> array of shape (len(texts), n_classes).
    """
    def predict_fn(texts):
        X_encoded = encoder.encode(texts, show_progress_bar=False)
        X_pca = pca.transform(X_encoded)
        # The model output is already a softmax distribution. Applying
        # softmax again squashed every probability towards 0.5 and distorted
        # both the reported probabilities and the LIME weights.
        return np.asarray(model.predict(X_pca))
    return predict_fn

def explain_text(query, model, encoder, pca, class_names, print_out=True):
    """Explain the model's prediction for one text with LIME.

    Args:
        query: the text to explain.
        model, encoder, pca: forwarded to make_predict_fn to build the
            text -> probability function.
        class_names: class labels, indexed by class id.
        print_out: when true, also print the per-class word weights and show
            the LIME bar chart for the predicted class.

    Returns:
        The LIME explanation object for `query`.
    """
    print(f"query: {query}")

    predict_fn = make_predict_fn(model, encoder, pca)
    explainer = LimeTextExplainer(class_names=class_names, random_state=42)

    # A single query yields a single row of class probabilities
    predicted_probs = predict_fn([query])
    pred_class = int(np.argmax(predicted_probs[0]))

    # Explain every class the model outputs ([0, 1] for the binary model)
    labels = list(range(np.shape(predicted_probs)[1]))

    exp = explainer.explain_instance(
        query,
        predict_fn,
        num_features=10,
        labels=labels
    )

    print("Model probabilities:", predicted_probs)

    print(f"Pred class: {pred_class}")
    if print_out:
      print(f"\nLIME Explanation for: '{query}'")
      for label in exp.available_labels():
          print(f"\nClass {label} ({class_names[label]}) explanation:")
          for word, weight in exp.as_list(label=label):
              print(f"  {word}: {weight:.4f}")

      print(exp.as_list(label=pred_class))

      exp.as_pyplot_figure(label=pred_class)
      plt.show()

      print("\n\n")

    return exp

# Maps each explained query to the first word returned by its LIME explanation
query_and_important_word = {}

for q in queries:
  exp = explain_text(q, model_base, encoder, data_pca, class_names)
  # NOTE(review): as_list() is called without a label, so LIME's default
  # label applies (label=1 per the LIME docs — confirm); this is not
  # necessarily the predicted class
  query_and_important_word[q] = str(exp.as_list()[0][0])


print(query_and_important_word)

In [None]:
######################################
### SHAP Analysis helper functions ###
######################################

import shap
import tensorflow as tf
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
import pickle as pk
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import pandas as pd
from collections import Counter

def feature_importance_graph(X_train, X_test, model, n_background=100, n_explain=100, seed=42):
  """Run SHAP KernelExplainer on the model and plot the feature importances.

  Args:
    X_train: 2-D feature array; a random subset is used as SHAP background.
    X_test: 2-D feature array; the first n_explain rows are explained.
    model: object whose predict() returns one column per class (at least two).
    n_background: number of background rows (capped at the rows available).
    n_explain: number of leading X_test rows to explain.
    seed: seed for choosing the background rows, so that the same background
      is used on every run.

  Returns:
    Array of per-sample, per-feature SHAP differences (class 1 minus class 0).
  """
  # Seeded draw without replacement; cap the size so small training sets do
  # not raise "cannot take a larger sample than population"
  rng = np.random.default_rng(seed)
  n_background = min(n_background, X_train.shape[0])
  background = X_train[rng.choice(X_train.shape[0], n_background, replace=False)]
  to_explain = X_test[:n_explain]

  explainer = shap.KernelExplainer(model.predict, background)
  shap_values = explainer.shap_values(to_explain)

  # Want to measure how features push towards class 1.
  # Older SHAP releases return a list with one array per output; newer ones
  # return a single array whose last axis indexes the outputs.
  if isinstance(shap_values, list):
    shap_diff = shap_values[1] - shap_values[0]
  else:
    shap_diff = shap_values[..., 1] - shap_values[..., 0]

  shap.summary_plot(
    shap_diff, to_explain,
    feature_names=[f"f{i}" for i in range(X_train.shape[1])]
  )

  return shap_diff

def get_associated_queries_for_top_features(no_top_PCA_features, no_top_embedding_dimensions, no_top_text_queries, pca, encoder, queries_raw, shap_diff):
  """Trace the most important PCA features back to example queries.

  SHAP ranks the PCA features; for each top feature the embedding dimensions
  with the largest absolute loading are listed, and for each such dimension
  the queries with the most extreme value in that dimension are printed.

  Args:
    no_top_PCA_features: how many PCA features to report.
    no_top_embedding_dimensions: how many embedding dimensions per feature.
    no_top_text_queries: how many example queries per embedding dimension.
    pca: fitted PCA; only its components_ attribute is read.
    encoder: a SentenceTransformer model name, or an already loaded encoder
      exposing encode(texts, show_progress_bar=...).
    queries_raw: list of texts to encode and search for examples.
    shap_diff: 2-D array of SHAP values, one column per PCA feature.

  Returns:
    (mean_abs_shap, top_features): the mean absolute SHAP value per PCA
    feature, and the indices of the top features, most important first.
  """
  # Load the model only when a name was given; a loaded encoder is used as is
  # (str also has an .encode method, hence the explicit type check)
  if isinstance(encoder, (str, bytes)) or not callable(getattr(encoder, "encode", None)):
    encoder = SentenceTransformer(encoder)
  embeddings = encoder.encode(queries_raw, show_progress_bar=False)

  # Mean absolute SHAP value per PCA feature
  mean_abs_shap = np.abs(shap_diff).mean(axis=0)

  # Feature indices ordered from most to least important
  top_features = np.argsort(mean_abs_shap)[::-1][:no_top_PCA_features]

  print(f"\nTop {no_top_PCA_features} PCA features ranked by SHAP importance:\n")

  for rank, feature in enumerate(top_features, start=1):
    print(f"{rank}. Feature {feature}")
    print(f"  Mean absolute SHAP value: {mean_abs_shap[feature]}")

    # Embedding dimensions with the largest absolute loading on this feature
    component = pca.components_[feature]
    top_dimensions = np.argsort(np.abs(component))[::-1][:no_top_embedding_dimensions]

    for dimension in top_dimensions:
      weight = component[dimension]
      print(f"\n  Embedding dimension {dimension} contributes to PCA feature f{feature} ({weight:+.4f})")

      # Value of this embedding dimension for every query
      dimension_values = embeddings[:, dimension]
      # The sign of the loading decides which end is "strong": for a positive
      # loading the largest values raise the feature, for a negative loading
      # the smallest values do
      order = np.argsort(dimension_values)
      if weight > 0:
        order = order[::-1]

      words = []
      for i in order[:no_top_text_queries]:
        query = queries_raw[i]
        print(f"    Query: {query} | dimension {dimension} value = {dimension_values[i]:.4f}")
        words.extend(query.split())

      counts = Counter(words)
      print(f"Common words among top queries: {counts.most_common(5)}")
    print()

  return mean_abs_shap, top_features





In [None]:
################################
### Now, doing SHAP analysis ###
################################

# A really good article on SHAP and SHAPley values and how they are calculated - https://medium.com/data-science/shap-explained-the-way-i-wish-someone-explained-it-to-me-ab81cc69ef30
# SHAP quantifies the impact that each feature has on the model's prediction

# However, unlike a model which is using specific features in input data, such as age, salary etc, we are using a text query
# So, I think it makes sense to
# 1. Identify important PCA features using SHAP
# 2. Trace them back to embedding dimensions
# 3. Then to actual text queries, to give some semantically understandable meaning

import shap
import tensorflow as tf
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
import pickle as pk
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import pandas as pd
from collections import Counter

# Analysis settings: how many PCA features, embedding dimensions per feature
# and example queries per dimension are reported
no_top_PCA_features = 3
no_top_embedding_dimensions = 3
no_top_text_queries = 10

queries_raw = X_test_raw
encoder_name = "all-MiniLM-L6-v2"

shap_diff = feature_importance_graph(X_train=X_train, X_test=X_test, model=model_base)

# Look at the graph produced to see which features have the biggest SHAP values and thus the biggest effect on the model
# Then investigate these features in a semantically meaningful way

mean_abs_shap, top_features = get_associated_queries_for_top_features(
    no_top_PCA_features=no_top_PCA_features,
    no_top_embedding_dimensions=no_top_embedding_dimensions,
    no_top_text_queries=no_top_text_queries,
    pca=data_pca,
    encoder=encoder_name,
    queries_raw=queries_raw,
    shap_diff=shap_diff,
)

# An idea that has occurred to me: use LLMs to suggest a semantic meaning for each top PCA feature, given that the queries are available in plain text
# Possibly something to explore?