# Generating global explanations of an LLM-as-a-Judge using the GloVE algorithm

In [None]:
import datetime
import json
import logging
import os
import pickle

import pandas as pd
from dotenv import load_dotenv

import sys
import os

In [None]:
from risk_policy_distillation.pipeline.clusterer import Clusterer
from risk_policy_distillation.pipeline.concept_extractor import Extractor
from risk_policy_distillation.evaluation.evaluate import Evaluator
from risk_policy_distillation.pipeline.pipeline import Pipeline
from risk_policy_distillation.models.explainers.local_explainers.lime import LIME
from risk_policy_distillation.utils.data_util import load_ds
from risk_policy_distillation.models.guardians.rits_guardian import RITSGuardian
from risk_policy_distillation.models.guardians.granite_guardian_batch import GGRits
from risk_policy_distillation.llms.rits_component import RITSComponent
from risk_policy_distillation.datasets.prompt_dataset import PromptDataset
from risk_policy_distillation.datasets.prompt_response_dataset import PromptResponseDataset

In [None]:
# Load environment variables (e.g. API keys) from a local .env file, if one exists.
load_dotenv()

In [None]:
# Check that the RITS API key is available. Only a True/False flag is displayed:
# evaluating os.getenv(...) as the last expression of a cell would echo the secret
# itself into the saved notebook output.
bool(os.getenv("RITS_API_KEY"))

### Create an LLM-as-a-Judge

To create a wrapper for your LLM-as-a-Judge, either create a GraniteGuardian object or inherit from the [Judge](../src/models/guardians/judge.py) class to create a custom one.
You also need to define the specifics of the task, such as the criterion the LLM-as-a-Judge is using and the label names.

In [None]:
# Describe the judging task and the RITS-hosted guardian model that performs it.
rits_settings = dict(
    model_name="granite-guardian-3-2-5b-ris",
    model_served_name="ibm-granite/granite-guardian-3.2-5b",
)

guardian_config = dict(
    task="harm detection",
    criterion="harm",
    criterion_definition="The message is harmful by common-sense.",
    labels=[0, 1],
    label_names=["harmless", "harmful"],
    output_labels=["No", "Yes"],
    rits=rits_settings,
)

# Wrap the guardian model so it can be queried as an LLM-as-a-Judge.
guardian = RITSGuardian(
    rits_settings["model_name"],
    rits_settings["model_served_name"],
    guardian_config,
    'gg3.2:5b',
)

### Create a dataset

To explain the LLM-as-a-Judge, you need to provide a dataset. The [AbstractDataset](../src/models/datasets/abs_dataset.py) class provides a wrapper for the dataframe you want to explain. You can use [PromptDataset](../src/models/datasets/prompt_dataset.py) or [PromptResponseDataset](../src/models/datasets/prompt_response_dataset.py), depending on whether your dataframe consists of only prompts or of prompt-response pairs. You can also create a custom dataset by inheriting from the Dataset class.

You have to provide a config with information on the column name mapping. Additional parameters: *flip_labels* indicates whether the labels of the dataframe should be flipped in the preprocessing step (e.g. for BeaverTails, where labels indicate that the content is safe rather than harmful); *split* indicates whether a train-val-test split needs to be performed during preprocessing.

In [None]:
# Configuration of the BeaverTails test dataset: where it comes from,
# how its columns are mapped, and how it is split/sampled.
bt_config = dict(
    general=dict(
        location="PKU-Alignment/BeaverTails",
        dataset_name="BeaverTails",
    ),
    data=dict(
        type="prompt_response",
        index_col="",
        prompt_col="prompt",
        response_col="response",
        label_col="is_safe",
        flip_labels=True,
        category_label="category_simple",
    ),
    split=dict(
        split=False,
        sample_ratio=0.0001,
        subset="330k_train",
    ),
)

# Build the prompt-response dataset wrapper from the config.
dataset = PromptResponseDataset(bt_config)

In [None]:
# Preview the first rows of the dataframe held by the dataset wrapper.
dataset.dataframe.head()

### Define components

Next, we need to define how to access the LLM-based components. You can use the [RITSComponent](../src/models/components/llms/rits_component.py) or [OllamaComponent](../src/models/components/llms/ollama_component.py) wrappers for querying an LLM; you just need to pass the name of the LLM. Otherwise, you can create a custom LLM wrapper by inheriting from the [LLMComponent](../src/models/components/llms/llm_component.py) class.

You can also define a local word-based explainer component, which is used by the CLoVE algorithm. At the moment, you can use LIME or create a custom word-based explainer by inheriting from the [LocalExplainer](../src/models/local_explainers/local_explainer.py) class.

In [None]:
# LLM used to generate concepts and labels.
llm_component = RITSComponent(
    'llama-3-3-70b-instruct',
    'meta-llama/llama-3-3-70b-instruct',
)

# Word-based local explainer, configured with the dataset name and the task's label names.
dataset_name = bt_config['general']['dataset_name']
label_names = guardian_config['label_names']
local_explainer = LIME(dataset_name, label_names, n_samples=100)

### Create and run the explanation generation pipeline

The Pipeline streamlines the local and global explanation generation process. The Extractor executes the CLoVE algorithm and generates a set of local explanations, and the Clusterer executes the GloVE algorithm and merges the local explanations into a global one.

Pass ```lime=False``` at the pipeline creation step if no local word-based verification is done. Similarly, use ```fr=False``` if FactReasoner is not used to verify global explanations.

The resulting local and global explanations are saved in the path folder passed to the pipeline.run() call. 
The execution logs can be found in the logs folder.

In [None]:
# Component that produces the local explanations.
extractor = Extractor(
    guardian,
    llm_component,
    guardian_config['criterion'],
    guardian_config['criterion_definition'],
    local_explainer,
)

# Component that merges the local explanations into a global one.
clusterer = Clusterer(
    llm_component,
    guardian_config['criterion_definition'],
    guardian_config['label_names'],
    n_iter=10,
)

# Assemble the pipeline with LIME verification and FactReasoner both enabled.
pipeline = Pipeline(extractor=extractor, clusterer=clusterer, lime=True, fr=True)

In [None]:
# Run the pipeline on the dataset and keep the returned explanation object;
# 'path' is the folder passed to the pipeline for saving results.
expl = pipeline.run(dataset, 
                    path='../results/')

### Printing the global explanation

In [None]:
# Print each rule of the global explanation as
#   "<label> IF <argument>[ DESPITE <exception 1>
#                                   <exception 2> ...]"
# followed by a blank line.
for i, argument in enumerate(expl.rules):
    print(i)
    decision = guardian.label_names[expl.predictions[i]]
    rule = '{} IF {}'.format(decision, argument)

    despites = expl.despites[i]
    # The string 'none' marks a rule that has no exceptions.
    if despites != 'none':
        # A bare string would otherwise be iterated character by character.
        if isinstance(despites, str):
            despites = [despites]

        rule += ' DESPITE '
        # Continuation lines are aligned under the first exception. Joining
        # (instead of appending the indent after every item) avoids a
        # trailing whitespace-only line after the last exception.
        indent = ' ' * len(rule)
        rule += ('\n' + indent).join(str(d) for d in despites)

    print(rule + '\n')

In [None]:
# Print the raw rules collection of the explanation object.
print(expl.rules)

In [1]:
from risk_atlas_nexus.library import RiskAtlasNexus

  from tqdm.autonotebook import tqdm


INFO 10-06 17:59:48 [__init__.py:216] Automatically detected platform cpu.


In [2]:
# Create a RiskAtlasNexus instance with no arguments (library defaults).
risk_atlas_nexus = RiskAtlasNexus()

[2025-10-06 17:59:50:482] - INFO - RiskAtlasNexus - Created RiskAtlasNexus instance. Base_dir: None


In [3]:
# Generate the policy rules for the harm-detection task with the given label names.
# NOTE(review): the captured log output suggests this loads a previously saved
# explanation from ../results/ rather than recomputing it — confirm.
expl = risk_atlas_nexus.generate_policy_rules(task="harm detection", label_names=[ "harmless", "harmful"])

INFO:logger:Loaded dataset with 300567 instances.)
INFO:logger:Built pipeline.
INFO:logger:Using LIME = True
INFO:logger:Using FactReasoner = True
INFO:logger:Loaded explanation at ../results/BeaverTails/global/global_expl.pkl
INFO:logger:Loaded 3 rules
INFO:logger:Loaded graph explanation from ../results/BeaverTails/global/global_expl.pkl


Path:  ../results/
../results/BeaverTails/local/expl.csv ../results/BeaverTails/global/global_expl.pkl


In [8]:
from risk_policy_distillation.llms.rits_component import RITSComponent
from risk_policy_distillation.models.guardians.rits_guardian import RITSGuardian

# Describe the judging task and the RITS-hosted guardian model that performs it.
rits_settings = dict(
    model_name="granite-guardian-3-2-5b-ris",
    model_served_name="ibm-granite/granite-guardian-3.2-5b",
)

guardian_config = dict(
    task="harm detection",
    criterion="harm",
    criterion_definition="The message is harmful by common-sense.",
    labels=[0, 1],
    label_names=["harmless", "harmful"],
    output_labels=["No", "Yes"],
    rits=rits_settings,
)

# Wrap the guardian model; its label names are used when printing the rules.
guardian = RITSGuardian(
    rits_settings["model_name"],
    rits_settings["model_served_name"],
    guardian_config,
    'gg3.2:5b',
)

In [9]:
# Print each rule of the global explanation as
#   "<label> IF <argument>[ DESPITE <exception 1>
#                                   <exception 2> ...]"
# followed by a blank line.
for i, argument in enumerate(expl.rules):
    print(i)
    decision = guardian.label_names[expl.predictions[i]]
    rule = '{} IF {}'.format(decision, argument)

    despites = expl.despites[i]
    # The string 'none' marks a rule that has no exceptions.
    if despites != 'none':
        # A bare string would otherwise be iterated character by character.
        if isinstance(despites, str):
            despites = [despites]

        rule += ' DESPITE '
        # Continuation lines are aligned under the first exception. Joining
        # (instead of appending the indent after every item) avoids a
        # trailing whitespace-only line after the last exception.
        indent = ' ' * len(rule)
        rule += ('\n' + indent).join(str(d) for d in despites)

    print(rule + '\n')

0
harmless IF does not cause harm

1
harmless IF has innocent meaning

2
harmless IF provides accurate info DESPITE contains harmful content
                                           
