# LLM Watermark Judger

## Setup

Load the necessary packages and initialize the OpenAI API client.

In [1]:
# ----- SYSTEM PACKAGES ----- ##
import json
import random
import re
import time

import numpy as np
from openai import OpenAI

In [None]:
# Shared OpenAI client used by the judging helper below.
# Replace the "[FILL IN]" placeholder with a real API key before running.
client = OpenAI(api_key="[FILL IN]")

## Helper Functions

In [2]:
# gpt-judger prompt and score extraction
def _parse_judge_output(judge_output, is_randomized):
    """Extract the final verdict and score list from the judge's raw reply.

    Args:
        judge_output: Text returned by the judge model.
        is_randomized: True when response A was the unwatermarked output,
            False when response A was the watermarked output.

    Returns:
        A ``(final_verdict, scores)`` tuple. ``final_verdict`` is one of
        "Tie", "Unwatermarked", "Watermarked" or "Model Failure"; ``scores``
        is a list of floats (empty for ties, failures, or when the verdict
        line carried no parsable numbers).
    """
    # search for a tie first
    if re.search(r'\[\[C\]\]', judge_output):
        return "Tie", []

    # pattern to match the verdict and the scores for A or B
    pattern = r'\[\[([AB])\]\]: (?:\[)?([5, 4, 3, 2, 1, ]+)(?:\])?'
    matches = re.findall(pattern, judge_output)
    if not matches:
        return "Model Failure", []

    # the last match is the final verdict line requested by the prompt
    judge_choice, scores_str = matches[-1]

    # The character class above also matches commas and spaces, so the
    # captured text may end in ", " or contain only whitespace. Split on
    # both separators and drop empty tokens instead of calling float('').
    scores = [float(token) for token in re.split(r'[,\s]+', scores_str) if token]

    # map the A/B choice back to the watermarked / unwatermarked sample
    if is_randomized:
        a_label, b_label = "Unwatermarked", "Watermarked"
    else:
        a_label, b_label = "Watermarked", "Unwatermarked"
    final_verdict = a_label if judge_choice == 'A' else b_label

    return final_verdict, scores


def gpt_judge(prompt, uw_output, w_output, is_randomized = None, model="gpt-3.5-turbo"):
    """Ask a chat model to compare a watermarked and an unwatermarked completion.

    Args:
        prompt: The prompt both completions were generated from.
        uw_output: Completion produced without the watermark.
        w_output: Completion produced with the watermark.
        is_randomized: Controls presentation order. True shows the
            unwatermarked output as response A, False shows the watermarked
            output as response A. None (default) picks one at random.
        model: Chat model used as the judge.

    Returns:
        ``(judge_output, final_verdict, scores, is_randomized)`` where
        ``judge_output`` is the raw reply text, ``final_verdict`` is "Tie",
        "Unwatermarked", "Watermarked" or "Model Failure", ``scores`` is a
        list of floats, and ``is_randomized`` is the ordering actually used.

    Raises:
        Whatever the OpenAI client raises when the request fails.
    """
    if is_randomized is None:
        is_randomized = random.choice([True, False])

    # randomizing the order
    a_output, b_output = (uw_output, w_output) if is_randomized else (w_output, uw_output)

    judge_prompt = """[System]
    Please act as an impartial judge and evaluate the quality of the text completions provided by two large language models to the prompt displayed below. 
    Assess each response according to the criteria outlined, using a 1-5 Likert scale where 1 indicates strong disagreement or the lowest quality, and 5 indicates strong agreement or the highest quality. 
    After scoring each criterion, provide a short summary for each response, including specific examples that influenced your scoring. 
    Additionally, don't let the length of the responses influence your evaluation. 
    Be as objective as possible and ensure that the order in which the responses are presented does not affect your decision.
    
    Criteria:
    1. Relevance to the prompt
    2. Depth of detail
    3. Clarity of writing
    4. Coherence and logical flow
    5. Originality and insight
    6. Use of specific examples
    7. Accuracy of information
    
    Start with a brief statement about which response you think is superior. Then, for each response and criterion, provide a score, followed by a brief justification for that score. At the very end of your response, declare your verdict by choosing one of the choices below, strictly following the given format:
    
    "[[A]]: [list of scores]" if assistant A is better, 
    "[[B]]: [list of scores]" if assistant B is better, or 
    "[[C]]" for a tie.

    [Prompt]
    {prompt}
    
    [The Start of LLM A’s Answer]
    {a_output}
    [The End of LLM A’s Answer]
    
    [The Start of LLM B’s Answer]
    {b_output}
    [The End of LLM B’s Answer]
    """.format(prompt=prompt, a_output=a_output, b_output=b_output)

    completion = client.chat.completions.create(
        messages=[{
                "role": "user",
                "content": judge_prompt,}],
        model=model,
    )

    # The message content is optional in the API response; fall back to an
    # empty string so parsing reports "Model Failure" instead of raising.
    judge_output = completion.choices[0].message.content or ""

    final_verdict, scores = _parse_judge_output(judge_output, is_randomized)

    return judge_output, final_verdict, scores, is_randomized

# helper function to save results to a JSON file
def save_to_json(data_list, filename="output.json"):
    """Write ``data_list`` to ``filename`` as UTF-8 JSON indented by 4 spaces.

    Non-ASCII characters are written as-is rather than escaped. An existing
    file at ``filename`` is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        serialized = json.dumps(data_list, ensure_ascii=False, indent=4)
        out_file.write(serialized)

# helper function to load results from a JSON file
def load_from_json(filename="output.json"):
    """Read ``filename`` as UTF-8 text and return the decoded JSON value."""
    with open(filename, 'r', encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)
    
# function to tally judger results
def extract_and_count_choices(data_list, category):
    """Tally the values stored under ``category`` across ``data_list``.

    Args:
        data_list: List of dictionaries, each containing the key ``category``.
        category: Key whose values are counted.

    Returns:
        ``(count, count_norm)``: ``count`` maps each distinct value to its
        number of occurrences (in first-seen order); ``count_norm`` maps the
        same values to their share of all entries as a percentage rounded to
        three decimals.
    """
    # pull the value of interest out of every entry
    values = [record[category] for record in data_list]
    total = len(values)

    # occurrences per distinct value
    count = {}
    for value in values:
        count[value] = count.get(value, 0) + 1

    # same tally expressed as rounded percentages
    count_norm = {
        value: np.round(occurrences / total * 100, 3)
        for value, occurrences in count.items()
    }

    return count, count_norm

## Judge New Samples

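Load sample pairs from a JSON file and use the GPT-judger to evaluate each pair. The judge's output, verdict, scores, and presentation order are stored back on each entry, and the annotated list is written to `results_judged.json`.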

In [None]:
data_list = load_from_json(filename="results.json")

# Cap the attempts per item so a persistent error (invalid API key, outage,
# reply that never matches the verdict format) cannot loop forever.
max_attempts = 5

for i, item in enumerate(data_list):

    # unpack needed fields
    prompt = item["prompt"]
    uw_output = item["uw_output"]
    w_output = item["w_output"]

    # values recorded for this item if every attempt fails
    judge_output, judge_choice, scores, is_randomized = "", "Model Failure", [], None
    elapsed = 0.0

    # evaluate responses using gpt, retrying on errors and unparsable replies
    for attempt in range(1, max_attempts + 1):
        t1 = time.time()
        try:
            result = gpt_judge(prompt, uw_output, w_output)
        except Exception as e:
            print(f"item {i + 1}: attempt {attempt}/{max_attempts} failed: {e}")
            if attempt < max_attempts:
                # brief exponential backoff before the next request
                time.sleep(min(2 ** attempt, 30))
            continue
        elapsed = time.time() - t1

        judge_output, judge_choice, scores, is_randomized = result
        if judge_choice != "Model Failure":
            break

    # save results
    item["judge_output"] = judge_output
    item["randomized"] = is_randomized
    item["judge_choice"] = judge_choice
    item["scores"] = scores

    print(i+1, "out of", len(data_list), "items processed!", "Execution Time:", round(elapsed, 3), end="\r")

save_to_json(data_list, filename="results_judged.json")

## Display Results

In [4]:
# index into the (counts, percentages) pair: 0 shows raw counts, 1 shows percentages
percent = 1

# tally the verdicts once, then print the selected view
verdict_tallies = extract_and_count_choices(data_list, 'judge_choice')
print("Judger Preference:", verdict_tallies[percent], "\n")

Judger Preference: {'Unwatermarked': 74.502, 'Watermarked': 18.327, 'Tie': 7.171} 

