# 7. Explanations

This notebook, part of the analysis phase of the project, is devoted to generating explanations using Integrated Gradients (IG), Integrated Directional Gradients (IDG) and Shapley Additive Explanations (SHAP).

In [None]:
# Necessary imports
import json
import numpy as np
import polars as pl
import sys

import torch
import shap

from pathlib import Path

from tqdm import tqdm
from tqdm.auto import tqdm as tqdma

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline,
)

from transformers_interpret import SequenceClassificationExplainer


# Get the absolute path of the project's root directory
ROOT_DIR = Path.resolve(Path.cwd() / "../")

# Add root directory to sys.path
sys.path.append(str(ROOT_DIR))

from src.utils.set_seed import set_seed

from src.integrated_directional_gradients.IDG.calculate_gradients import execute_IDG

# Set the seed for reproducibility
rng = set_seed()


2025-05-04 18:54:52.832820: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-04 18:54:52.900335: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-04 18:54:52.912977: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-04 18:54:52.920707: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04 18:54:52.990917: I tensorflow/core/platform/cpu_feature_guar

In [None]:
# Directory management
DATA_DIR = ROOT_DIR / "data"
SPLITTED_DATA_DIR = DATA_DIR / "splitted"

MODELS_DIR = ROOT_DIR / "models"

OUTPUT_DIR = ROOT_DIR / "output"
TREES_DIR = OUTPUT_DIR / "constituency_trees"

IG_DIR = OUTPUT_DIR / "integrated_gradients"
IDG_DIR = OUTPUT_DIR / "integrated_directional_gradients"
SHAP_DIR = OUTPUT_DIR / "shap"


### 1. Load and process the HateXplain Dataset

In [3]:
# Load the dataset
df_test_trees = pl.read_parquet(TREES_DIR / "test_2_classes_with_trees.parquet")


In [4]:
# Load the fine-tuned model and tokenizer
model_clf = AutoModelForSequenceClassification.from_pretrained(
    MODELS_DIR / "bert-base-uncased_2_classes",
)

tokenizer = AutoTokenizer.from_pretrained(
    MODELS_DIR / "bert-base-uncased_2_classes",
)


In [5]:
# Create a text column from the tokens
df_test_trees = df_test_trees.with_columns(
    pl.col("tokens").map_elements(
        lambda tokens: " ".join(tokens),
        return_dtype=pl.String,
    ).alias("text"),
)


### 2. Integrated Gradients

Here, we apply the integrated gradients method to the dataset.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_clf.to(device)

cls_explainer = SequenceClassificationExplainer(model_clf, tokenizer)


In [18]:
# Execute IG
with tqdm(total=len(df_test_trees), desc="Executing IG") as pbar:
    for post in df_test_trees.iter_rows(named=True):
        text = post["text"]
        gt_cls = post["label"]
        guid = post["post_id"]

        # Execute IG
        try:
            attributions = cls_explainer(
                text,
                class_name="hatespeech",
                n_steps=150,
            )
            data = {
                "guid": guid,
                "gt_cls": gt_cls,
                "attributions": attributions,
            }

            with Path.open(IG_DIR / f"{guid}_ig_expl_data.json", "w") as  f:
                json.dump(data, f, indent=4, ensure_ascii=False)

        except Exception as e:
            print(f"Error processing post {guid}: {e}")
            continue

        # Update progress bar
        pbar.update(1)



Executing IG:  96%|█████████▌| 1319/1376 [09:22<00:33,  1.68it/s]

Error processing post 24796291_gab: CUDA out of memory. Tried to allocate 146.00 MiB. GPU 0 has a total capacity of 7.65 GiB of which 111.19 MiB is free. Including non-PyTorch memory, this process has 7.53 GiB memory in use. Of the allocated memory 7.01 GiB is allocated by PyTorch, and 407.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Executing IG: 100%|█████████▉| 1375/1376 [09:52<00:00,  2.32it/s]


### 3. Integrated Directional Gradients

Here, we apply the integrated directional gradients method to the dataset.

In [6]:
# Execute IDG
model_clf.to(torch.device("cpu"))
with tqdm(total=len(df_test_trees), desc="Executing IDG") as pbar:
    for post in df_test_trees.iter_rows(named=True):
        tree = [post["tree"]]
        gt_cls = post["label"]
        guid = post["post_id"]

        # We will compute attributions wrt the positive (hatespeech) class
        target_cls = 1

        # Execute IDG
        try:
            coalitions, value_func, dividend_dir, p_tree = execute_IDG(
                tree,
                model_clf,
                tokenizer,
                target_cls,
                IDG_DIR,
                guid,
                bert=True,
            )
            data = {
                "guid": guid,
                "gt_cls": gt_cls,
                "coalitions": coalitions,
                "value_func": value_func,
                "dividend_dir": dividend_dir,
                "p_tree": p_tree,
            }

            with Path.open(IDG_DIR / f"{guid}_idg_expl_data.json", "w") as  f:
                json.dump(data, f, indent=4, ensure_ascii=False)

        except Exception as e:
            print(f"Error processing post {guid}: {e}")
            continue

        # Update progress bar
        pbar.update(1)


Executing IDG:  19%|█▉        | 267/1376 [21:28<1:36:36,  5.23s/it]

Error processing post 10040607_gab: '\ufeff'


Executing IDG:  23%|██▎       | 322/1376 [25:49<1:50:49,  6.31s/it]

Error processing post 14445063_gab: '✝️'


Executing IDG:  49%|████▉     | 677/1376 [1:03:01<50:56,  4.37s/it]  

Error processing post 1178921633031114752_twitter: unsupported format character ',' (0x2c) at index 375


Executing IDG:  52%|█████▏    | 718/1376 [1:06:54<45:05,  4.11s/it]  

Error processing post 1179048983420899328_twitter: '♂️'


Executing IDG:  53%|█████▎    | 725/1376 [1:07:24<37:47,  3.48s/it]  

Error processing post 1179098097160470529_twitter: '♂️'


Executing IDG:  72%|███████▏  | 985/1376 [1:30:15<25:39,  3.94s/it]  

Error processing post 1178785944238841857_twitter: '♂️'


Executing IDG:  74%|███████▍  | 1017/1376 [1:32:46<24:29,  4.09s/it]

Error processing post 1178750958718914561_twitter: '♂️'


Executing IDG:  78%|███████▊  | 1078/1376 [1:37:25<30:06,  6.06s/it]

Error processing post 10362793_gab: 'u.s1.'


Executing IDG:  80%|████████  | 1104/1376 [1:39:34<11:55,  2.63s/it]

Error processing post 1109087701960146945_twitter: '️'


Executing IDG:  82%|████████▏ | 1132/1376 [1:41:27<16:18,  4.01s/it]

Error processing post 1163082929557450753_twitter: '️'


Executing IDG:  83%|████████▎ | 1140/1376 [1:41:49<09:30,  2.42s/it]

Error processing post 1171521116570959872_twitter: '️'


Executing IDG:  88%|████████▊ | 1213/1376 [1:46:11<09:02,  3.33s/it]

Error processing post 1258666596333170689_twitter: '🇬'


Executing IDG:  99%|█████████▉| 1364/1376 [2:00:00<01:03,  5.28s/it]


### 4. Shapley Additive Explanations
Here, we apply the SHAP method to the dataset.

In [6]:
# Move the model back to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_clf.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30831, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
# Create a text classification pipeline
pipe = TextClassificationPipeline(
    model=model_clf,
    tokenizer=tokenizer,
    top_k=None,
    truncation=True,
    padding=True,
    device=0 if torch.cuda.is_available() else -1,
    batch_size=32,
)


test_data = df_test_trees.select("text").to_numpy().flatten().tolist()

# Compute SHAP values
explainer = shap.Explainer(pipe)
shap_values = explainer(test_data)


Device set to use cuda:0
PartitionExplainer explainer: 1377it [20:37,  1.10it/s]                          


In [23]:
for idx, explanation in tqdma(enumerate(shap_values), total=len(shap_values)):
    attributions = explanation.values[:, 1]
    guid = df_test_trees[idx]["post_id"].to_list()[0]
    gt_cls = df_test_trees[idx]["label"].to_list()[0]
    tokens = np.array(
        tokenizer.tokenize(df_test_trees[idx]["text"].to_list()[0], add_special_tokens=True),
    )

    try:
        assert len(attributions) == len(tokens)
    except AssertionError:
        print(f"Length mismatch for post {guid}: {len(attributions)} != {len(tokens)}")
        continue

    data = {
        "guid": guid,
        "gt_cls": gt_cls,
        "attributions": list(zip(tokens, attributions)),
    }

    with Path.open(SHAP_DIR / f"{guid}_shap_expl_data.json", "w") as  f:
        json.dump(data, f, indent=4, ensure_ascii=False)



  0%|          | 0/1376 [00:00<?, ?it/s]