## RQ3: Fine-tuned vs Base FNR

In [1]:
import OpenPromptInjection as PI
from OpenPromptInjection.utils import open_config
from OpenPromptInjection import DataSentinelDetector
from OpenPromptInjection.evaluator.utils import *
from OpenPromptInjection.evaluator.gleu_utils import *
import json
import os
import numpy as np
from typing import Literal
from collections import defaultdict
import pandas as pd
from IPython.display import display

### Detection Models
Set `detector_config_path` to load the detection model. In this case, setting `ft_path=""` just loads the base Mistral model.

In [None]:
# Model config used to build both detectors.
detector_config_path = 'configs/model_configs/mistral_config.json'
detector_config = open_config(detector_config_path)
# GPU ids passed to the detector through the config; adjust to the devices available locally.
detector_config["params"]["gpus"] = ['0', '1', '2', '3']
# Load the DataSentinel finetuned detector.
# It is built first, while the config still holds whatever `ft_path` the config file provides.
detector_ft = DataSentinelDetector(detector_config)

# The same config dict is mutated in place and reused for the second detector.
# NOTE(review): if DataSentinelDetector keeps a reference to the dict and reads `ft_path`
# lazily, this mutation would also affect `detector_ft` -- confirm, or pass a deep copy.
detector_config["params"]["ft_path"] = ""
# Load the base Mistral model.
detector_base = DataSentinelDetector(detector_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7591644753d0>>
Traceback (most recent call last):
  File "/nobackup3/divyam/envs/open-prompt-injection/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [35]:
# Dataset stems shared by the target and injected task lists; every entry
# expands to the config file name "<stem>_config.json".
_task_stems = ("gigaword", "hsol", "jfleg", "mrpc", "rte", "sms_spam", "sst2")

# Tasks the application is supposed to solve.
target_task_configs = [f"{stem}_config.json" for stem in _task_stems]

# Tasks the attacker injects. "websiteinject_config.json" is currently disabled.
# Built as an independent list so editing one list never changes the other.
inject_task_configs = [f"{stem}_config.json" for stem in _task_stems]

### Benign FPR
Get FPR on benign samples. We only consider the target task here, without any prompt injection. We store the detection rate and detector responses under `results`.

`results` structure:
```
results
    |__ target_task (e.g. summarization)
            |__ det_base (Baseline detection LLM)
                    |__ det_rate (detection rate of the detection LLM)
                    |__ responses (responses of the detection LLM)
            |__ det_ft (Fine-tuned detection LLM)
                    |__ det_rate (detection rate of the detection LLM)
                    |__ responses (responses of the detection LLM)
```

1. `detected_benign_cnt_{base/ft}`: Benign samples detected as contaminated by the base or fine-tuned detection LLM.
2. `detector_responses_benign_{base/ft}`: Responses of the base or fine-tuned detection LLM.

In [65]:
# results[target_task]["det_base" | "det_ft"] -> {"det_rate": ..., "responses": [...]}
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

save_path = "./results/RQ3/detector_benign_prompts.json"

if os.path.exists(save_path):
    # A previous run already produced the file; skip the detection runs.
    print(f"Results found at: {save_path}!")
else:
    # Create the output directory up front: without it the first checkpoint
    # write below raises FileNotFoundError only after the detectors have
    # already been run on a whole task.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    for target_task_config in target_task_configs:
        target_task = PI.create_task(open_config(config_path=f"./configs/task_configs/{target_task_config}"), 100)

        target_app = PI.create_app(target_task, None, defense='no')

        detected_benign_cnt_base = 0
        detected_benign_cnt_ft = 0
        detector_responses_benign_base = []
        detector_responses_benign_ft = []

        # Run base detector on benign prompts
        for data_prompt, _label in target_app:
            success_base, detector_response_base = detector_base.detect(data_prompt)
            detected_benign_cnt_base += success_base
            detector_responses_benign_base.append(detector_response_base)

        # "det_rate" holds the accumulated count of flagged prompts, not a fraction.
        results[target_task.task]["det_base"]["det_rate"] = detected_benign_cnt_base
        results[target_task.task]["det_base"]["responses"] = detector_responses_benign_base

        # Run ft detector on benign prompts
        for data_prompt, _label in target_app:
            success_ft, detector_response_ft = detector_ft.detect(data_prompt)
            detected_benign_cnt_ft += success_ft
            detector_responses_benign_ft.append(detector_response_ft)

        results[target_task.task]["det_ft"]["det_rate"] = detected_benign_cnt_ft
        results[target_task.task]["det_ft"]["responses"] = detector_responses_benign_ft

        # Checkpoint after every target task so partial progress is kept on disk.
        with open(save_path, 'w') as fp:
            json.dump(results, fp, indent=2)

Results found at: ./results/RQ3/detector_benign_prompts.json!


## FNR for Combined Attack
Get FNR on samples crafted using the Combined Attack, for every combination of target and injected task. We store the detection rate and detector responses under `results`.

`results` structure:
```
results
    |__ inject_task (e.g. hate_detection)
            |
            target_task (e.g. summarization)
                |__ det_base (Baseline detection LLM)
                        |__ det_rate (detection rate of the detection LLM)
                        |__ responses (responses of the detection LLM)
                |__ det_ft (Fine-tuned detection LLM)
                        |__ det_rate (detection rate of the detection LLM)
                        |__ responses (responses of the detection LLM)
```

1. `detected_attack_prompt_cnt_{base/ft}`: Attack (injected) samples detected as contaminated by the base or fine-tuned detection LLM.
2. `detector_responses_{base/ft}`: Responses of the base or fine-tuned detection LLM.

In [37]:
# results[inject_task][target_task]["det_base" | "det_ft"] -> {"det_rate": ..., "responses": [...]}
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

save_path = "./results/RQ3/detector_combined_attack_prompts.json"

if os.path.exists(save_path):
    # A previous run already produced the file; skip the detection runs.
    print(f"Results found at: {save_path}!")
else:
    # Create the output directory up front: without it the first checkpoint
    # write below raises FileNotFoundError only after the detectors have
    # already been run on a whole task pair.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    for target_task_config in target_task_configs:
        for inject_task_config in inject_task_configs:
            # A task is never injected into itself.
            if target_task_config == inject_task_config:
                continue
            target_task = PI.create_task(open_config(config_path=f"./configs/task_configs/{target_task_config}"), 100)
            inject_task = PI.create_task(open_config(config_path=f'configs/task_configs/{inject_task_config}'), 100, for_injection=True)

            attacker = PI.create_attacker('combine', inject_task)

            target_app = PI.create_app(target_task, None, defense='no')

            detected_attack_prompt_cnt_base = 0
            detected_attack_prompt_cnt_ft = 0
            detector_responses_base = []
            detector_responses_ft = []

            # Run the base detector on combine attack prompts
            for i, (data_prompt, _label) in enumerate(target_app):
                data_prompt_after_attack = attacker.inject(clean_data=data_prompt, idx=i, target_task=target_task.task)
                success_base, detector_response_base = detector_base.detect(data_prompt_after_attack)
                detected_attack_prompt_cnt_base += success_base
                detector_responses_base.append(detector_response_base)

            # "det_rate" holds the accumulated count of flagged prompts, not a fraction.
            results[inject_task.task][target_task.task]["det_base"]["det_rate"] = detected_attack_prompt_cnt_base
            results[inject_task.task][target_task.task]["det_base"]["responses"] = detector_responses_base

            # Run the ft detector on combine attack prompts
            for i, (data_prompt, _label) in enumerate(target_app):
                data_prompt_after_attack = attacker.inject(clean_data=data_prompt, idx=i, target_task=target_task.task)
                success_ft, detector_response_ft = detector_ft.detect(data_prompt_after_attack)
                detected_attack_prompt_cnt_ft += success_ft
                detector_responses_ft.append(detector_response_ft)

            results[inject_task.task][target_task.task]["det_ft"]["det_rate"] = detected_attack_prompt_cnt_ft
            results[inject_task.task][target_task.task]["det_ft"]["responses"] = detector_responses_ft

            # Checkpoint after every (target, injected) pair so partial progress is kept on disk.
            with open(save_path, 'w') as fp:
                json.dump(results, fp, indent=2)

Results found at: ./results/RQ3/detector_combined_attack_prompts.json!


## FNR for DataFlip
Get FNR on samples crafted using DataFlip, for every combination of target and injected task. We store the detection rate and detector responses under `results`.

`results` structure:
```
results
    |__ inject_task (e.g. hate_detection)
            |
            target_task (e.g. summarization)
                |__ det_base (Baseline detection LLM)
                        |__ det_rate (detection rate of the detection LLM)
                        |__ responses (responses of the detection LLM)
                |__ det_ft (Fine-tuned detection LLM)
                        |__ det_rate (detection rate of the detection LLM)
                        |__ responses (responses of the detection LLM)
```

1. `detected_attack_prompt_cnt_{base/ft}`: Attack (injected) samples detected as contaminated by the base or fine-tuned detection LLM.
2. `detector_responses_{base/ft}`: Responses of the base or fine-tuned detection LLM.

In [38]:
# results[inject_task][target_task]["det_base" | "det_ft"] -> {"det_rate": ..., "responses": [...]}
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

save_path = "./results/RQ3/detector_dataflip_prompts.json"

if os.path.exists(save_path):
    # A previous run already produced the file; skip the detection runs.
    print(f"Results found at: {save_path}!")
else:
    # Create the output directory up front: without it the first checkpoint
    # write below raises FileNotFoundError only after the detectors have
    # already been run on a whole task pair.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    for target_task_config in target_task_configs:
        for inject_task_config in inject_task_configs:
            # A task is never injected into itself.
            if target_task_config == inject_task_config:
                continue
            target_task = PI.create_task(open_config(config_path=f"./configs/task_configs/{target_task_config}"), 100)
            inject_task = PI.create_task(open_config(config_path=f'configs/task_configs/{inject_task_config}'), 100, for_injection=True)

            attacker = PI.create_attacker('dataflip_datasentinel', inject_task)

            target_app = PI.create_app(target_task, None, defense='no')

            detected_attack_prompt_cnt_base = 0
            detected_attack_prompt_cnt_ft = 0
            detector_responses_base = []
            detector_responses_ft = []

            # Run the base detector on dataflip attack prompts
            for i, (data_prompt, _label) in enumerate(target_app):
                data_prompt_after_attack = attacker.inject(data_prompt=data_prompt, idx=i, target_task=target_task.task, inject_task=inject_task.task)
                success_base, detector_response_base = detector_base.detect(data_prompt_after_attack)
                detected_attack_prompt_cnt_base += success_base
                detector_responses_base.append(detector_response_base)

            # "det_rate" holds the accumulated count of flagged prompts, not a fraction.
            results[inject_task.task][target_task.task]["det_base"]["det_rate"] = detected_attack_prompt_cnt_base
            results[inject_task.task][target_task.task]["det_base"]["responses"] = detector_responses_base

            # Run the ft detector on dataflip attack prompts
            for i, (data_prompt, _label) in enumerate(target_app):
                data_prompt_after_attack = attacker.inject(data_prompt=data_prompt, idx=i, target_task=target_task.task, inject_task=inject_task.task)
                success_ft, detector_response_ft = detector_ft.detect(data_prompt_after_attack)
                detected_attack_prompt_cnt_ft += success_ft
                detector_responses_ft.append(detector_response_ft)

            results[inject_task.task][target_task.task]["det_ft"]["det_rate"] = detected_attack_prompt_cnt_ft
            results[inject_task.task][target_task.task]["det_ft"]["responses"] = detector_responses_ft

            # Checkpoint after every (target, injected) pair so partial progress is kept on disk.
            with open(save_path, 'w') as fp:
                json.dump(results, fp, indent=2)

Results found at: ./results/RQ3/detector_dataflip_prompts.json!


In [39]:
def mean(l):
    """Return the arithmetic mean of the values in ``l``.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def printing_name(task_name):
    """Turn an underscore-separated task name into a display name.

    The name is split on "_", the first character of every piece is
    upper-cased (the rest is left untouched), and the pieces are joined
    with single spaces.
    """
    display_words = []
    for word in task_name.split("_"):
        display_words.append(word[:1].upper() + word[1:])
    return " ".join(display_words)

## Evaluation

In [40]:
# Config file names of the tasks that are evaluated below.
task_configs = [
    f"{dataset}_config.json"
    for dataset in ("gigaword", "hsol", "jfleg", "mrpc", "rte", "sms_spam", "sst2")
]

# Rebind `task_configs` to a mapping: config file name -> parsed config.
task_configs = dict(
    (config, open_config(config_path=f"./configs/task_configs/{config}"))
    for config in task_configs
)

# Task identifier -> dataset name, both read from the parsed configs.
task_dataset_names = dict(
    (config["task_info"]["task"], config["dataset_info"]["dataset"])
    for config in task_configs.values()
)

# Task identifier -> human-readable name used in the result tables.
task_name_map = dict(
    (task, printing_name(task))
    for task in task_dataset_names
)

# Task identifiers, in config order.
tasks = list(task_dataset_names)

In [68]:
# Experiment identifiers; each one selects a results file
# "results/RQ3/detector_<name>_prompts.json".
names = ["benign", "combined_attack", "dataflip"]

# all_results[name][task] -> dict that receives the "base" / "ft" aggregates.
all_results = defaultdict(lambda: defaultdict(dict))

def collect_results(name):
    """Aggregate one saved experiment into ``all_results[name]``.

    Reads ``results/RQ3/detector_{name}_prompts.json`` and, for every task in
    ``tasks``, stores the mean of the saved ``det_rate`` values under
    ``all_results[name][task]["base"]`` and ``all_results[name][task]["ft"]``.

    Args:
        name: Experiment identifier. ``"benign"`` files are indexed by a single
            task; all other files are indexed as ``[injected_task][target_task]``
            and are averaged over every target task that differs from the
            injected task.

    Raises:
        FileNotFoundError: if the results file does not exist.
        KeyError: if an expected task or detector entry is missing in the file.
        ZeroDivisionError: for non-benign experiments when there is no other
            task to average over.
    """
    experiments_path = f"results/RQ3/detector_{name}_prompts.json"

    def load_results(path):
        # Read from the argument; the previous version ignored `path` and only
        # worked because it picked up `experiments_path` from the closure.
        with open(path) as f:
            return json.load(f)

    results = load_results(experiments_path)
    for injected_task_name in tasks:
        if name == "benign":
            # Benign files hold one value per task and detector, so there is
            # nothing to average across target tasks. Using the value once
            # gives the same mean as repeating it for every other task and
            # also works when only a single task is configured.
            task_results = results[injected_task_name]
            values_base = [task_results["det_base"]["det_rate"]]
            values_ft = [task_results["det_ft"]["det_rate"]]
        else:
            per_target = results[injected_task_name]
            # A task is never paired with itself in the attack experiments.
            other_tasks = [task for task in tasks if task != injected_task_name]
            values_base = [per_target[task]["det_base"]["det_rate"] for task in other_tasks]
            values_ft = [per_target[task]["det_ft"]["det_rate"] for task in other_tasks]
        all_results[name][injected_task_name]["base"] = mean(values_base)
        all_results[name][injected_task_name]["ft"] = mean(values_ft)


In [69]:
# Fill `all_results` for every experiment listed in `names`.
for name in names:
    collect_results(name)

### Metrics for the Base Mistral model

In [70]:
print("Model: Mistral 7B (Base)")

# One row per task, in the order of `task_name_map`.
# The FNR columns are computed as 100 minus the aggregated detection value.
# NOTE(review): this treats the stored counts as percentages, which presumably
# relies on 100 samples per run -- confirm if the sample size changes.
data = {
    "Injected Task": list(task_name_map.values()),
    "FPR": [
        format(all_results["benign"][task]["base"], ".1f")
        for task in task_name_map
    ],
    "FNR Combined Attack": [
        format(100 - all_results["combined_attack"][task]["base"], ".1f")
        for task in task_name_map
    ],
    "FNR DataFlip": [
        format(100 - all_results["dataflip"][task]["base"], ".1f")
        for task in task_name_map
    ],
}

df = pd.DataFrame(data)
display(df)

Model: Mistral 7B (Base)


Unnamed: 0,Injected Task,FPR,FNR Combined Attack,FNR DataFlip
0,Summarization,92.0,1.5,7.0
1,Hate Detection,82.0,1.5,10.0
2,Grammar Correction,81.0,0.8,4.5
3,Duplicate Sentence Detection,100.0,1.3,11.7
4,Natural Language Inference,97.0,1.5,8.5
5,Spam Detection,79.0,1.2,4.8
6,Sentiment Analysis,85.0,1.2,10.5


### Metrics for the fine-tuned DataSentinel model

In [71]:
print("Model: DataSentinel (FT)")

# One row per task, in the order of `task_name_map`.
# The FNR columns are computed as 100 minus the aggregated detection value.
# NOTE(review): this treats the stored counts as percentages, which presumably
# relies on 100 samples per run -- confirm if the sample size changes.
data = {
    "Injected Task": list(task_name_map.values()),
    "FPR": [
        format(all_results["benign"][task]["ft"], ".1f")
        for task in task_name_map
    ],
    "FNR Combined Attack": [
        format(100 - all_results["combined_attack"][task]["ft"], ".1f")
        for task in task_name_map
    ],
    "FNR DataFlip": [
        format(100 - all_results["dataflip"][task]["ft"], ".1f")
        for task in task_name_map
    ],
}

df = pd.DataFrame(data)
display(df)

Model: DataSentinel (FT)


Unnamed: 0,Injected Task,FPR,FNR Combined Attack,FNR DataFlip
0,Summarization,0.0,0.0,46.5
1,Hate Detection,0.0,0.0,98.7
2,Grammar Correction,0.0,0.0,97.5
3,Duplicate Sentence Detection,0.0,0.0,88.2
4,Natural Language Inference,0.0,0.0,87.3
5,Spam Detection,1.0,0.0,95.0
6,Sentiment Analysis,0.0,0.0,93.7
