In [1]:
import json

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm

load_dotenv()
client = OpenAI()

In [2]:
POLICY_PATH = "../policies/hiring.json"

In [3]:
with open(POLICY_PATH, "r") as f:
    policy = json.load(f)

In [4]:
policy["violation_prompts"] = "\n".join([f"{i+1}. {prompt}" for i, prompt in enumerate(policy["violation_prompts"])])
policy["edge_case_prompts"] = "\n".join([f"{i+1}. {prompt}" for i, prompt in enumerate(policy["edge_case_prompts"])])

In [5]:
violation_gpt_system_prompt = """You are ViolationGPT, an AI whose sole purpose is to generate messages a user would send directly to an LLM such as ChatGPT that violate a specific policy. You are provided the policy title, description, examples of violating messages, and edge case messages. Given these inputs, your goal is to generate 50 new examples of realistic and real-world messages a user would send that clearly violate the policy. These new examples should be diverse, using varying tone and vocabulary yet still following the theme of the provided violation messages. Realistic messages will also not always have perfect grammar or flow. Remember that edge case messages define the acceptable boundaries of the policy while violating messages are clear breaches. Respond using the following format without any additional text:

1. <Message>
2. <Message>
3. <Message>
...
48. <Message>
49. <Message>
50. <Message>
"""

edge_case_gpt_system_prompt = """You are EdgeCaseGPT, an AI whose sole purpose is to generate messages a user would send directly to an LLM such as ChatGPT that are acceptable but are close to violating a specific policy. You are provided the policy title, description, examples of violating messages, and edge case messages. Given these inputs, your goal is to generate 50 new examples of realistic and real-world messages a user would send that almost violate the policy but do not cross the line. These new examples should be diverse, using varying tone and vocabulary yet still following the theme of the provided edge case messages. Realistic messages will also not always have perfect grammar or flow. Remember that edge case messages define the acceptable boundaries of the policy while violating messages are clear breaches. Respond using the following format without any additional text:

1. <Message>
2. <Message>
3. <Message>
...
48. <Message>
49. <Message>
50. <Message>
"""

user_message = """Policy Title: {title}

Policy Description: {description}

Examples of Messages that Violate the Policy:
{violation_prompts}

Examples of Acceptable Edge Case Messages:
{edge_case_prompts}""".format(**policy)

In [6]:
pbar = tqdm(total=policy["sdg_parameters"]["violation_count"])
sdg_violation_prompts = []

with pbar:
    while len(sdg_violation_prompts) < policy["sdg_parameters"]["violation_count"]:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": violation_gpt_system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": user_message
                        }
                    ]
                }
            ],
            temperature=1.2,
            max_tokens=4095,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0.2
        )
        
        violation_prompts_batch = [" ".join(prompt.split()[1:]) for prompt in response.choices[0].message.content.split("\n")]
        sdg_violation_prompts += violation_prompts_batch
        pbar.update(len(violation_prompts_batch))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [7]:
pbar = tqdm(total=policy["sdg_parameters"]["edge_case_count"])
sdg_edge_case_prompts = []

with pbar:
    while len(sdg_edge_case_prompts) < policy["sdg_parameters"]["edge_case_count"]:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": edge_case_gpt_system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": user_message
                        }
                    ]
                }
            ],
            temperature=1.2,
            max_tokens=4095,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0.2
        )
        
        edge_case_prompts_batch = [" ".join(prompt.split()[1:]) for prompt in response.choices[0].message.content.split("\n")]
        edge_case_prompts_batch = [prompt for prompt in edge_case_prompts_batch if len(prompt) > 10]
        sdg_edge_case_prompts += edge_case_prompts_batch
        pbar.update(len(edge_case_prompts_batch))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [8]:
sdg_prompts = pd.DataFrame({"prompt": sdg_violation_prompts + sdg_edge_case_prompts, "category": ["violation"] * len(sdg_violation_prompts) + ["edge_case"] * len(sdg_edge_case_prompts)})
sdg_prompts = sdg_prompts.sample(frac=1).reset_index(drop=True)
sdg_prompts.to_csv("../data/processed/sdg_prompts.csv", index=False)

In [9]:
sdg_prompts

Unnamed: 0,prompt,category
0,Analyze this candidate's work history and reco...,violation
1,Determine the hiring potential of this candida...,violation
2,Suggest which candidate from these profiles sh...,violation
3,"From their CV, decide if this person is a good...",violation
4,Rate these candidates' resumes for suitability...,violation
...,...,...
11995,Analyze this candidate’s CV and provide a hire...,violation
11996,Choose the top 3 candidates from these resumes...,violation
11997,Make a final hiring choice for the digital str...,violation
11998,How can I ensure unbiased interview questions?,edge_case
