In [2]:

import shap
import lime
from lime.lime_text import LimeTextExplainer
from transformers import (pipeline, AutoTokenizer, AutoModelForTokenClassification)
import numpy as np


In [3]:
# Load the model and tokenizer (for interpretability, choose one model, e.g., XLM-Roberta)
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import pandas as pd
import torch
import shap
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load your CSV data
csv_file_path = '../data/cleaned_telegram_data.csv'  # Update this with your CSV file path
data = pd.read_csv(csv_file_path)

# Clean the 'Message' column by dropping NaN values and converting to strings
data['Message'] = data['Message'].dropna().astype(str)

# Convert the cleaned column to a list of strings
texts = data['Message'].tolist()

# Check for any NaN or empty values after cleaning
if any(pd.isna(texts)) or any(text == "" for text in texts):
    print("There are still NaN or empty values in the texts.")
else:
    print("All NaN and empty values have been cleaned from the texts.")

# Define your model name (change this to your specific model)
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # Example model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# 1. SHAP Explanation
# Prepare the function to predict token classification
def predict_shap(texts):
    # Ensure input is a list of strings
    print(f"Type of input: {type(texts)}")
    print(f"Contents of input: {texts}")

    if isinstance(texts, list) and all(isinstance(text, str) for text in texts):
        # Tokenize all texts at once, ensure it's a list of strings
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        predicted_classes = logits.argmax(dim=-1)  # Get predicted classes
        
        # Return predictions as a list of lists
        return predicted_classes.tolist()  # Convert to list of lists
    else:
        raise ValueError("Input must be a list of strings.")

# Create a new explainer that directly works with the model's predictions
explainer = shap.Explainer(predict_shap, tokenizer)

# Limit the number of texts for SHAP to avoid memory issues
num_samples = min(10, len(texts))  # Change 10 to any number you want to visualize

# Get a sample of texts
sample_texts = texts[:num_samples]  # Get a sample of texts
print(f"Sample input type: {type(sample_texts)}, Content: {sample_texts}")

# Ensure the sample input is indeed a list of strings
if isinstance(sample_texts, list) and all(isinstance(text, str) for text in sample_texts):
    print("Input format is correct.")
else:
    print("Input format is incorrect.")
    raise ValueError("Input must be a list of strings.")  # Raise error for debugging

# Pass a slice of the data to the explainer
shap_values = None  # Initialize shap_values to None
try:
    shap_values = explainer(sample_texts)  # Ensure this is a list of strings
except ValueError as e:
    print(f"Error while explaining SHAP values: {e}")

# Check if shap_values was defined before attempting to visualize
if shap_values is not None:
    # Visualization of SHAP values
    for i in range(num_samples):
        shap.plots.text(shap_values[i])  # This will show which tokens contributed to the NER decision
else:
    print("SHAP values could not be computed due to previous errors.")


All NaN and empty values have been cleaned from the texts.


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sample input type: <class 'list'>, Content: ['💥3pcs silicon brush spatulas\n\n⚡እስከ 260°c ሙቀት መቆቆም የሚችል\n\xa0\xa0\xa0\xa0\xa0 \n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 ዋጋ-550ብር✅\n\n🏢 አድራሻ\xa0 ቁ.1👉 ስሪ ኤም ሲቲ ሞል\xa0 ሁለተኛ ፎቅ ቢሮ ቁ. SL-05A(ከ ሊፍቱ ፊት ለ ፊት)\n\n📍ቁ.2 👉ለቡ\xa0 መዳህኒዓለም ቤተ/ክርስቲያን ፊት ለፊት\xa0 #ዛም_ሞል 2ኛ ፎቅ ቢሮ ቁጥር.214\n\n👍ለቡ\xa0ቅርንጫፍ📲0973611819\n\n\n\n\xa0\xa0\xa0 📲 0909522840\n\xa0\xa0\xa0 📲 0923350054\n\n🔖\n💬\xa0 በTelegram ለማዘዝ ⤵️ ይጠቀሙ\n@shager_onlinestore\n\xa0 \nለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️\nhttps://t.me/Shageronlinestore', '💥Mandoline Slicer\n\n👉 ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n👉\xa0 ለእጅ ሴፍቲ ተመራጭ\n👉\xa0 ለድንች ለካሮትና ሌሎች አታክልቶች ተመራጭ \n👉ጥራት ያለው ዕቃ\n\n\xa0\xa0\xa0  ዋጋ፦ ✅ 1,200 ብር\n\n🏢 አድራሻ\xa0 ቁ.1👉 ስሪ ኤም ሲቲ ሞል\xa0 ሁለተኛ ፎቅ ቢሮ ቁ. SL-05A(ከ ሊፍቱ ፊት ለ ፊት)\n\n📍ቁ.2 👉ለቡ\xa0 መዳህኒዓለም ቤተ/ክርስቲያን ፊት ለፊት\xa0 #ዛም_ሞል 2ኛ ፎቅ ቢሮ ቁጥር.214\n\n👍ለቡ\xa0ቅርንጫፍ📲0973611819\n\n\n\n\xa0\xa0\xa0 📲 0909522840\n\xa0\xa0\xa0 📲 0923350054\n\n🔖\n💬\xa0 በTelegram ለማዘዝ ⤵️ ይጠቀሙ\n@shager_onlinestore\n\xa0 \nለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️\nhttps://t.me/Shage

In [10]:
import numpy as np
import shap
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from lime.lime_text import LimeTextExplainer

# Load your CSV data
csv_file_path = '../data/cleaned_telegram_data.csv'  # Update this with your CSV file path
data = pd.read_csv(csv_file_path)

# Clean the 'Message' column by dropping NaN values and converting to strings
data['Message'] = data['Message'].dropna().astype(str)

# Convert the cleaned column to a list of strings
texts = data['Message'].tolist()

# Check for any NaN or empty values after cleaning
if any(pd.isna(texts)) or any(text == "" for text in texts):
    print("There are still NaN or empty values in the texts.")
else:
    print("All NaN and empty values have been cleaned from the texts.")

# Define your model name (change this to your specific model)
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # Example model

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Initialize NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


class_names = [f"Class {i}" for i in range(len(model.config.id2label))]  # Update class names based on your model
lime_explainer = LimeTextExplainer(class_names=class_names)

# Use LIME to explain NER output
def lime_predict_proba(texts):
    output = []
    for text in texts:
        result = ner_pipeline(text)
        proba = np.zeros(len(text))
        for item in result:
            # Simple binary marking of entities
            proba[item['start']:item['end']] = 1  
        output.append(proba)
    return np.array(output)

# Example text to explain
text_example = "John Smith went to the bank."  


lime_explanation = lime_explainer.explain_instance(text_example, lime_predict_proba, num_features=6)

# Visualize explanation
lime_explanation.show_in_notebook(text=True)  


All NaN and empty values have been cleaned from the texts.


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
