In [4]:
import pandas as pd
import numpy as np
import torch
import json
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import hf_hub_download
import shap



In [7]:
# Read the combined Reddit CSV, then expose the text and id columns as plain lists.
reddit_df = pd.read_csv(
    '../../data/combined_cleaned_500k.csv',
    encoding='utf8',
    lineterminator='\n',
)
# Missing texts become empty strings so every entry is a str.
reddit_df = reddit_df.assign(text=reddit_df['text'].fillna('').astype(str))
batch_text = reddit_df['text'].to_list()
batch_id = reddit_df['id'].to_list()

In [8]:
# Download the Lionguard config.json from the Hugging Face Hub and parse it into a dict.
repo_path = "govtech/lionguard-v1"
config_path = hf_hub_download(filename="config.json", repo_id=repo_path)
with open(config_path, mode='r') as config_file:
    config = json.loads(config_file.read())



In [10]:
# Batch-wise sentence embedding helper
def get_embeddings(device, data):
    """Embed `data` in batches using the tokenizer/model named in the global `config`.

    Args:
        device: Target passed to ``model.to(...)`` and to the tokenized batch.
        data: Sliceable sequence of sentences; it is cut into chunks of
            ``config['embedding']['batch_size']`` items.

    Returns:
        ``np.ndarray`` built from one L2-normalised vector per input sentence.
        Each vector is ``model_output[0][:, 0]``, i.e. position 0 of the
        model's first output (presumably the [CLS] token -- confirm for the
        configured model).
    """
    embed_cfg = config['embedding']
    tokenizer = AutoTokenizer.from_pretrained(embed_cfg['tokenizer'])
    model = AutoModel.from_pretrained(embed_cfg['model'])
    model.eval()
    model.to(device)

    batch_size = embed_cfg['batch_size']
    num_batches = int(np.ceil(len(data) / batch_size))
    collected = []

    for batch_idx in range(num_batches):
        start = batch_idx * batch_size
        sentences = data[start:start + batch_size]
        # BatchEncoding.to moves its tensors in place and returns the same object.
        encoded_input = tokenizer(
            sentences,
            max_length=embed_cfg['max_length'],
            padding=True,
            truncation=True,
            return_tensors='pt',
        ).to(device)

        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            first_output = model(**encoded_input)[0]
            sentence_embeddings = first_output[:, 0]
        unit_vectors = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        collected.extend(unit_vectors.cpu().numpy())

    return np.array(collected)



In [11]:
# Embed every text for the SHAP analysis, on the GPU when torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
embeddings = get_embeddings(device=device, data=batch_text)

KeyboardInterrupt: 

In [None]:
# Stand-in for the Lionguard prediction function
def predict2(batch_text):
    """Return random placeholder scores, one per item of `batch_text`.

    Swap the body for the real Lionguard prediction call. The result is a
    dict with four entries: the two ``'... Score'`` entries wrap their array
    in ``{'scores': ...}``, while the two ``'... HR'`` entries are bare arrays.
    Every array holds ``len(batch_text)`` values drawn with ``np.random.rand``.
    """
    n_items = len(batch_text)
    # Draw order matters for reproducibility under a fixed numpy seed.
    hateful_scores = np.random.rand(n_items)
    hateful_hr = np.random.rand(n_items)
    toxic_scores = np.random.rand(n_items)
    toxic_hr = np.random.rand(n_items)

    placeholder = {}
    placeholder['hateful Score'] = {'scores': hateful_scores}
    placeholder['hateful HR'] = hateful_hr
    placeholder['toxic Score'] = {'scores': toxic_scores}
    placeholder['toxic HR'] = toxic_hr
    return placeholder

# Score every text with the (placeholder) predictor; `results` is its dict output.
results = predict2(batch_text=batch_text)



In [None]:
# Build one record per text: its id, the text itself, and the four scores at the same position.
output_data = [
    {
        'id': batch_id[row],
        'Text': text,
        'hateful Score': results['hateful Score']['scores'][row],
        'hateful HR': results['hateful HR'][row],
        'toxic Score': results['toxic Score']['scores'][row],
        'toxic HR': results['toxic HR'][row],
    }
    for row, text in enumerate(batch_text)
]



In [None]:
mid_results_df = pd.DataFrame(output_data)

# SHAP Analysis
# KernelExplainer calls the model with a 2-D array of samples and needs an
# ndarray back, shaped (n_samples,) or (n_samples, n_outputs). predict2 returns
# a nested dict, so it is adapted here rather than changed, because the
# results-table cell indexes predict2's dict output directly.
def predict_matrix(embedding_rows):
    """Return predict2's four score vectors stacked as columns.

    Column order: 'hateful Score', 'hateful HR', 'toxic Score', 'toxic HR'.
    """
    scores = predict2(embedding_rows)
    return np.column_stack([
        scores['hateful Score']['scores'],
        scores['hateful HR'],
        scores['toxic Score']['scores'],
        scores['toxic HR'],
    ])

# Each explained row is evaluated against every background row, so use a small
# random sample of the embeddings as background instead of the full matrix.
SHAP_BACKGROUND_SIZE = 100
background = shap.sample(embeddings, SHAP_BACKGROUND_SIZE)
explainer = shap.KernelExplainer(predict_matrix, background)

# NOTE(review): this still explains every row of `embeddings`, which is slow
# with KernelExplainer. If you slice it for a quick run, slice the `embeddings`
# passed to the plotting calls the same way so the shapes keep matching.
shap_values = explainer.shap_values(embeddings)




In [None]:
# Visualizations
# 1. Summary Plot
# The columns of `embeddings` are embedding dimensions, so the four score
# labels name model outputs (classes), not features. Passing them as
# `feature_names` would mismatch the feature count.
# Assumes the explained model returns its outputs in this order.
output_names = ['hateful Score', 'hateful HR', 'toxic Score', 'toxic HR']

# Multi-output SHAP values arrive either as a list of (n_samples, n_features)
# arrays or, in newer shap releases, as a single
# (n_samples, n_features, n_outputs) array. summary_plot treats the list form
# as multi-class, so normalise to that.
if isinstance(shap_values, list):
    shap_by_output = shap_values
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        shap_by_output = list(np.moveaxis(shap_array, -1, 0))
    else:
        # Single-output case: already one (n_samples, n_features) matrix.
        shap_by_output = shap_array

shap.summary_plot(shap_by_output, embeddings, class_names=output_names)



In [None]:
# 2. Dependence Plot for 'toxic Score'
# 'toxic Score' is a model output, not a column of `embeddings`, so it cannot
# be used as the feature to plot. Select that output's SHAP matrix and plot the
# embedding dimension with the largest mean |SHAP| for it instead.
# Assumes the explained model returns its outputs in this order.
output_names = ['hateful Score', 'hateful HR', 'toxic Score', 'toxic HR']
toxic_output = output_names.index('toxic Score')

# Multi-output SHAP values are either a list of per-output matrices or one
# (n_samples, n_features, n_outputs) array, depending on the shap version.
if isinstance(shap_values, list):
    toxic_shap = shap_values[toxic_output]
else:
    toxic_shap = np.asarray(shap_values)[..., toxic_output]

# "rank(0)" selects the feature ranked first by mean absolute SHAP value.
shap.dependence_plot("rank(0)", toxic_shap, embeddings)


In [None]:

# 3. Force Plot for a specific instance
instance_index = 0  # You can adjust this index as needed

# force_plot explains one model output at a time: it needs a scalar base value
# and a single attribution vector, so pick the output to show.
# Assumes the explained model returns its outputs in this order.
output_names = ['hateful Score', 'hateful HR', 'toxic Score', 'toxic HR']
output_index = output_names.index('toxic Score')

base_value = np.atleast_1d(explainer.expected_value)[output_index]
# Multi-output SHAP values are either a list of per-output matrices or one
# (n_samples, n_features, n_outputs) array, depending on the shap version.
if isinstance(shap_values, list):
    instance_shap = shap_values[output_index][instance_index]
else:
    instance_shap = np.asarray(shap_values)[instance_index, :, output_index]

# Load the JS bundle the interactive force plot needs to render in a notebook.
shap.initjs()
shap.force_plot(base_value, instance_shap, embeddings[instance_index])