In [2]:
# Notebook-wide imports. NOTE(review): pandas, evaluate, accelerate, load_dataset,
# TrainingArguments, Trainer and f1_score are not referenced in the cells visible
# here — confirm they are used elsewhere before pruning.
import torch
import warnings
# Suppress every warning for the rest of the session; this also hides
# deprecation notices raised by the libraries imported below.
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import evaluate
import accelerate
from datasets import load_dataset, Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, pipeline
from sklearn.metrics import f1_score
import shap
# Last expression of the cell: displays whether a CUDA device is available.
torch.cuda.is_available()

False

In [3]:
# Pick the compute device once and make it the process-wide default for new tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `torch.set_default_tensor_type` is deprecated in recent PyTorch releases.
# Setting the default dtype and the default device separately has the same
# effect as the old call: float32 tensors allocated on `device` unless a call
# explicitly says otherwise.
torch.set_default_dtype(torch.float32)
torch.set_default_device(device)

In [4]:
# Maps each label column name to an integer index (0..5, in the order listed).
# The cells below rely on the keys and on their iteration order.
label_map = {
    name: index
    for index, name in enumerate(
        ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
    )
}

In [5]:
# Local directory from which both the model and its tokenizer are loaded.
checkpoint_dir = "./models/BERT_Multi-Label_classification"

# One output per entry in label_map; the loaded model is then moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir,
    num_labels=len(label_map),
    hidden_dropout_prob=0.1,
)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

In [19]:
# Wrap the model in a text-classification pipeline that returns a score for
# every label. `top_k=None` is the supported replacement for the deprecated
# `return_all_scores=True` flag.
# NOTE(review): with `top_k=None` the pipeline may order the returned labels by
# score rather than by index; SHAP's pipeline wrapper is expected to match
# scores to outputs by label name — confirm against the installed versions.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=None,
)
# Build the SHAP explainer around the pipeline; output_names supplies the
# display name for each model output, taken from label_map's key order.
explainer = shap.Explainer(classifier, output_names=list(label_map.keys()))


Device set to use cpu


In [20]:
def create_multi_label(example):
    """Gather one row's per-label columns into a single ``labels`` list.

    Args:
        example: A row indexable by column name; it must contain one entry
            for every key of ``label_map``.

    Returns:
        A dict with a single ``"labels"`` key holding the row's label values
        converted to ``np.float32``, in ``label_map`` iteration order.
    """
    label_vector = []
    for label_name in label_map:
        label_vector.append(np.float32(example[label_name]))
    return {"labels": label_vector}


# Load the test split from its Arrow file, attach the combined `labels` column,
# then drop the individual label columns that it replaces.
test_dataset = (
    Dataset.from_file(r"processed_dataset/test/data-00000-of-00001.arrow")
    .map(create_multi_label)
    .remove_columns(list(label_map))
)
# Show the first row as a sanity check.
test_dataset[0]

{'id': '0001ea8717f6de06',
 'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.',
 'cyberbullying': 0,
 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [21]:
# Take the first ten comments and coerce each one to a plain Python string
# so the explainer receives a list of str.
comment_texts = list(map(str, test_dataset[:10]['comment_text']))

In [24]:
# Compute SHAP explanations for the sampled comments; batch_size=2 is forwarded
# to the explainer. This is slow on CPU: the recorded run took roughly 22 s per
# iteration (about 3 min 41 s in total).
shap_values = explainer(
    comment_texts,
    batch_size=2,
)

PartitionExplainer explainer: 11it [03:41, 22.18s/it]                        
PartitionExplainer explainer: 11it [03:41, 22.18s/it]                        


In [25]:
# Render the SHAP text plot for the first explained comment.
first_explanation = shap_values[0]
shap.plots.text(first_explanation)

# Understanding SHAP Values
SHAP (SHapley Additive exPlanations) values provide a unified measure of feature importance by attributing the output of a machine learning model to its input features. 
For text classification tasks, SHAP values help explain how individual words or tokens contribute to the model's predictions.

### Key Points:
- Positive SHAP values indicate that a feature (e.g., a word or token) pushes the model's output for a specific class above the baseline (expected) output.
- Negative SHAP values indicate that a feature pushes the model's output for that class below the baseline, reducing the likelihood of that class.
- SHAP visualizations, such as text plots, bar plots, and beeswarm plots, provide insights into model behavior and feature importance.