In [5]:
import json
import os

import pandas as pd
from openai import OpenAI


# --- Load the prompt datasets ------------------------------------------------
# Paths are relative to the notebook's working directory.
close_benign = pd.read_csv('assets/close_benign_prompts.csv')
prompts = pd.read_csv('assets/promptDataset.csv')

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('assets/text_davinci_003_outputs.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
benign = pd.DataFrame(json_data)

# Instruction template for the classifier; llm_check() substitutes each prompt
# for the literal '{QUERY}' placeholder in this text.
with open('assets/check_system.txt', 'r', encoding='utf-8') as f:
    prompt_check = f.read()


# --- API client --------------------------------------------------------------
# Read the key from the environment instead of embedding it in the notebook,
# and fail early with a clear message rather than on the first request.
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY')
if not TOGETHER_API_KEY:
    raise RuntimeError(
        "TOGETHER_API_KEY is not set; export it in the environment before running this notebook."
    )
TOGETHER_BASE_URL = "https://api.together.xyz"

# The OpenAI client is pointed at Together's endpoint through base_url.
client = OpenAI(
    base_url = TOGETHER_BASE_URL,
    api_key = TOGETHER_API_KEY,
)

In [2]:
def call_model(input_text, llm, max_tokens=1000):
    '''
    Calls an LLM with a given input text and returns the text of its reply

    Uses the module-level `client` to send a chat completion request whose
    only message is a single "user" message.

    input_text: a string of text to input in the LLM
    llm: a string outlining which llm from together AI to use
    max_tokens: the cap on generated tokens passed to the API
                (defaults to 1000, the value that used to be hard-coded)

    OUTPUT: the `message.content` of the first choice in the response,
            returned exactly as the API provided it
    '''
    response = client.chat.completions.create(
        model = llm,
        messages = [
            {"role": "user", "content": input_text},
        ],
        max_tokens = max_tokens,
    )
    # Only the first returned choice is read.
    return response.choices[0].message.content
    




def llm_check(llm, series, n, random_state=None):
    '''
    A preliminary LLM classifier identifying benign prompts

    Each sampled prompt is substituted for the '{QUERY}' placeholder in the
    module-level `prompt_check` template and sent to the model via call_model.

    llm: a string outlining which llm from together AI to use
    series: a series containing prompts from which to classify
    n: the number of prompts to sample from the series and classify
    random_state: optional seed forwarded to `series.sample` so the same
                  prompts are drawn on every run; None (the default) keeps
                  the sampling unseeded

    OUTPUT: a list with one whitespace-stripped model reply per sampled prompt.
            Each reply is expected to be 'YES' or 'NO', with 'NO' meaning that
            there is some element of the prompt that could make it harmful
            while 'YES' means it is confidently benign. Replies are not
            validated, so the model may return other text.
    '''
    sample = series.sample(n, random_state=random_state)
    results = []
    for query in sample:
        temp_prompt = prompt_check.replace('{QUERY}', query)
        out = call_model(temp_prompt, llm)
        # A reply with no text content comes back as None; record it as an
        # empty string instead of crashing on None.strip().
        results.append((out or '').strip())
    return results

In [3]:
# Model identifier passed to every llm_check call below.
llm = 'Gryphe/MythoMax-L2-13b'

# One prompt column from each loaded dataset.
benigns = benign['instruction']
closes = close_benign['Goal']

# Rows where 'Success' is true: keep only the text that precedes the first
# 'Response:' marker, with surrounding whitespace trimmed.
bad = prompts[prompts['Success']]['Successful Prompt'].apply(
    lambda text: text.partition('Response:')[0].strip()
)



In [6]:
# Classify 5 randomly sampled prompts from `bad` and show the replies.
print(llm_check(llm=llm, series=bad, n=5))

['NO', 'NO', 'NO', 'NO', 'NO']


In [7]:
# Classify 5 randomly sampled prompts from `closes` and show the replies.
print(llm_check(llm=llm, series=closes, n=5))

['NO', 'YES', 'NO', 'NO', 'YES']


In [8]:
# Classify 5 randomly sampled prompts from `benigns` and show the replies.
print(llm_check(llm=llm, series=benigns, n=5))

['YES', 'YES', 'NO', 'NO', 'YES']
