In [1]:
import json

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm

# Presumably loads environment variables (e.g. the OpenAI API key) from a local
# .env file before the client is built -- TODO confirm against python-dotenv docs.
load_dotenv()
# Client used by the generation cells below; constructed with no explicit
# arguments, so credentials are assumed to come from the environment.
client = OpenAI()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Relative path to the JSON policy definition that drives prompt generation.
policy_path = "./policies/hiring.json"

In [3]:
# Load the policy definition. JSON text is read explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(policy_path, "r", encoding="utf-8") as f:
    policy = json.load(f)

In [4]:
# Render each list of example prompts as a numbered, newline-separated string
# ("1. ...\n2. ...") so it can be dropped straight into the user message template.
for _key in ("violation_prompts", "edge_case_prompts"):
    # The value is overwritten in place, so guard against re-running this cell:
    # enumerating an already-joined string would number its individual characters.
    if not isinstance(policy[_key], str):
        policy[_key] = "\n".join(f"{i+1}. {prompt}" for i, prompt in enumerate(policy[_key]))

In [5]:
# System prompt for the generator of clearly policy-violating user messages.
# It asks for 50 messages per response, returned as a bare numbered list.
violation_gpt_system_prompt = (
    "You are ViolationGPT, an AI whose sole purpose is to generate messages a user would send directly to an LLM such as ChatGPT that violate a specific policy. "
    "You are provided the policy title, description, examples of violating messages, and edge case messages. "
    "Given these inputs, your goal is to generate 50 new examples of realistic and real-world messages a user would send that clearly violate the policy. "
    "These new examples should be diverse, using varying tone and vocabulary yet still following the theme of the provided violation messages. "
    "Realistic messages will also not always have perfect grammar or flow. "
    "Remember that edge case messages define the acceptable boundaries of the policy while violating messages are clear breaches. "
    "Respond using the following format without any additional text:\n"
    "\n"
    "1. <Message>\n"
    "2. <Message>\n"
    "3. <Message>\n"
    "...\n"
    "48. <Message>\n"
    "49. <Message>\n"
    "50. <Message>\n"
)

# System prompt for the generator of acceptable-but-borderline user messages.
# It asks for 50 messages per response, returned as a bare numbered list.
edge_case_gpt_system_prompt = (
    "You are EdgeCaseGPT, an AI whose sole purpose is to generate messages a user would send directly to an LLM such as ChatGPT that are acceptable but are close to violating a specific policy. "
    "You are provided the policy title, description, examples of violating messages, and edge case messages. "
    "Given these inputs, your goal is to generate 50 new examples of realistic and real-world messages a user would send that almost violate the policy but do not cross the line. "
    "These new examples should be diverse, using varying tone and vocabulary yet still following the theme of the provided edge case messages. "
    "Realistic messages will also not always have perfect grammar or flow. "
    "Remember that edge case messages define the acceptable boundaries of the policy while violating messages are clear breaches. "
    "Respond using the following format without any additional text:\n"
    "\n"
    "1. <Message>\n"
    "2. <Message>\n"
    "3. <Message>\n"
    "...\n"
    "48. <Message>\n"
    "49. <Message>\n"
    "50. <Message>\n"
)

# User message shared by both generators: the policy's title, description and
# the two example lists are substituted into the placeholders below, so `policy`
# must provide the keys title, description, violation_prompts and edge_case_prompts.
user_message = (
    "Policy Title: {title}\n"
    "\n"
    "Policy Description: {description}\n"
    "\n"
    "Examples of Messages that Violate the Policy:\n"
    "{violation_prompts}\n"
    "\n"
    "Examples of Acceptable Edge Case Messages:\n"
    "{edge_case_prompts}"
).format(**policy)

In [6]:
# Generate synthetic policy-violating prompts by repeatedly querying the model
# until at least 5000 usable prompts have been collected. Whole batches are
# appended, so the final count may slightly exceed the target.
pbar = tqdm(total=5000)
sdg_violation_prompts = []

with pbar:
    while len(sdg_violation_prompts) < 5000:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": violation_gpt_system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": user_message
                        }
                    ]
                }
            ],
            temperature=1.2,
            max_tokens=4095,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0.2
        )

        # The system prompt requests lines of the form "<n>. <Message>"; drop the
        # first whitespace-delimited token (the list number) from every line.
        violation_prompts_batch = [" ".join(prompt.split()[1:]) for prompt in response.choices[0].message.content.split("\n")]
        # Discard blank lines, the literal "..." line and other fragments, using
        # the same length threshold as the edge-case generation cell. Without
        # this, empty strings would be stored as prompts and counted towards
        # the 5000 target.
        violation_prompts_batch = [prompt for prompt in violation_prompts_batch if len(prompt) > 10]
        sdg_violation_prompts += violation_prompts_batch
        pbar.update(len(violation_prompts_batch))

100%|██████████| 5000/5000 [26:40<00:00,  3.12it/s]


In [7]:
# Generate synthetic edge-case prompts by repeatedly querying the model until
# at least 1000 usable prompts have been collected. Whole batches are appended,
# so the final count may slightly exceed the target.
sdg_edge_case_prompts = []

# The request payload is the same for every call, so build it once.
edge_case_messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": edge_case_gpt_system_prompt}],
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": user_message}],
    },
]

with tqdm(total=1000) as pbar:
    while len(sdg_edge_case_prompts) < 1000:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=edge_case_messages,
            temperature=1.2,
            max_tokens=4095,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0.2,
        )

        edge_case_prompts_batch = []
        for raw_line in response.choices[0].message.content.split("\n"):
            # Drop the first whitespace-delimited token (the list number).
            candidate = " ".join(raw_line.split()[1:])
            # Keep only lines with real content; blanks and short fragments are skipped.
            if len(candidate) > 10:
                edge_case_prompts_batch.append(candidate)

        sdg_edge_case_prompts.extend(edge_case_prompts_batch)
        pbar.update(len(edge_case_prompts_batch))

100%|██████████| 1000/1000 [05:20<00:00,  3.12it/s]


In [8]:
# Pair every generated prompt with its category label, violations first.
labelled_prompts = [(text, "violation") for text in sdg_violation_prompts]
labelled_prompts += [(text, "edge_case") for text in sdg_edge_case_prompts]

sdg_prompts = pd.DataFrame(labelled_prompts, columns=["prompt", "category"])
# Shuffle all rows (unseeded) and renumber the index from zero.
sdg_prompts = sdg_prompts.sample(frac=1)
sdg_prompts = sdg_prompts.reset_index(drop=True)
# Persist the dataset without writing the index column.
sdg_prompts.to_csv("./data/processed/sdg_prompts.csv", index=False)

In [9]:
# Display the shuffled, labelled prompt table as the cell output.
sdg_prompts

Unnamed: 0,prompt,category
0,Give me a detailed evaluation of this candidat...,violation
1,Judge these candidate portfolios to identify w...,violation
2,Evaluate which of these applicants has a bette...,violation
3,Give a recommendation on this candidate’s lead...,violation
4,Check this CV and advise if the applicant shou...,violation
...,...,...
5995,Tell me if we should hire Jacob after reviewin...,violation
5996,How do I navigate discussing salary expectatio...,edge_case
5997,Look at these LinkedIn profiles and decide who...,violation
5998,Provide a hiring recommendation based on these...,violation
