In [None]:
!pip install openai prettytable tqdm tenacity wandb -qq

In [None]:
import itertools
import os
import time

import openai
import wandb
from prettytable import PrettyTable
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be saved in the notebook;
# the inline placeholder is only used when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'your-api-key') # enter your OpenAI API key here

use_wandb = True # set to True if you want to use wandb to log your config and results

use_portkey = True #set to True if you want to use Portkey to log all the prompt chains and their responses Check https://portkey.ai/


In [None]:
# System prompt sent to CANDIDATE_MODEL by generate_candidate_prompts: it asks the model to write
# a system prompt for the described use-case without leaking specifics of the test cases.
system_gen_system_prompt = """Your job is to generate system prompts for GPT-4, given a description of the use-case and some test cases.

The prompts you will be generating will be for freeform tasks, such as generating a landing page headline, an intro paragraph, solving a math problem, etc.

In your generated prompt, you should describe how the AI should behave in plain English. Include what it will see, and what it's allowed to output. Be creative with prompts to get the best possible results. The AI knows it's an AI -- you don't need to tell it this.

You will be graded based on the performance of your prompt... but don't cheat! You cannot include specifics about the test cases in your prompt. Any prompts with examples will be disqualified.

Most importantly, output NOTHING but the prompt. Do not include anything else in your message."""


# System prompt sent to the ranking model by get_score: the model is shown two generations for the
# same test prompt and must answer with the single letter 'A' or 'B'.
ranking_system_prompt = """Your job is to rank the quality of two outputs generated by different prompts. The prompts are used to generate a response for a given task.

You will be provided with the task description, the test prompt, and two generations - one for each system prompt.

Rank the generations in order of quality. If Generation A is better, respond with 'A'. If Generation B is better, respond with 'B'.

Remember, to be considered 'better', a generation must not just be good, it must be noticeably superior to the other.

Also, keep in mind that you are a very harsh critic. Only rank a generation as better if it truly impresses you more than the other.

Respond with your ranking, and nothing else. Be fair and unbiased in your judgement."""

In [None]:
# K is a constant factor that determines how much ratings change
K = 32

# Model used by generate_candidate_prompts to write the candidate system prompts
CANDIDATE_MODEL = 'gpt-4'
CANDIDATE_MODEL_TEMPERATURE = 0.9

# Model used by get_generation to answer each test case under a candidate prompt
GENERATION_MODEL = 'gpt-3.5-turbo'
GENERATION_MODEL_TEMPERATURE = 0.8
GENERATION_MODEL_MAX_TOKENS = 60

N_RETRIES = 3  # passed to tenacity's stop_after_attempt for both get_score and get_generation
# Model used by get_score to pick the better of two generations
RANKING_MODEL = 'gpt-3.5-turbo'
RANKING_MODEL_TEMPERATURE = 0.5

NUMBER_OF_PROMPTS = 10 # this determines how many candidate prompts to generate... the higher, the more expensive, but the better the results will be

WANDB_PROJECT_NAME = "gpt-prompt-eng" # used if use_wandb is True, Weights & Biases project name
WANDB_RUN_NAME = None # used if use_wandb is True, optionally set the Weights & Biases run name to identify this run

PORTKEY_API = "your-portkey" # used if use_portkey is True. Get api key here: https://app.portkey.ai/ (click on profile photo on top left)
PORTKEY_TRACE = "prompt_engineer_test_run" # used if use_portkey is True. Trace each run with a separate ID to differentiate prompt chains
HEADERS = {} # don't change. headers will auto populate if use_portkey is true.


In [None]:
def start_wandb_run():
  """Start a new Weights & Biases run and log this notebook's configuration.

  Reads the module-level settings (Elo K factor, both system prompts, and the
  candidate / generation / ranking model settings) and passes them as the run
  config to wandb.init. Returns None.
  """
  wandb.init(
    project=WANDB_PROJECT_NAME,
    name=WANDB_RUN_NAME,
    config={
      "K": K,
      "system_gen_system_prompt": system_gen_system_prompt,
      "ranking_system_prompt": ranking_system_prompt,
      # key was previously misspelled "candiate_model"
      "candidate_model": CANDIDATE_MODEL,
      "candidate_model_temperature": CANDIDATE_MODEL_TEMPERATURE,
      "generation_model": GENERATION_MODEL,
      "generation_model_temperature": GENERATION_MODEL_TEMPERATURE,
      "generation_model_max_tokens": GENERATION_MODEL_MAX_TOKENS,
      "n_retries": N_RETRIES,
      "ranking_model": RANKING_MODEL,
      "ranking_model_temperature": RANKING_MODEL_TEMPERATURE,
      "number_of_prompts": NUMBER_OF_PROMPTS
      })

In [None]:
# Optional logging to Weights & Biases to record the configs, prompts and results
if use_wandb:
  start_wandb_run()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112992999995994, max=1.0…

In [None]:
def start_portkey_run():
  """Route OpenAI traffic through the Portkey proxy and build the logging headers.

  Side effect: sets openai.api_base to the Portkey proxy URL.

  Returns:
    dict: the Portkey headers built from PORTKEY_API and PORTKEY_TRACE.
  """
  openai.api_base = "https://api.portkey.ai/v1/proxy"
  return {
    "x-portkey-api-key": PORTKEY_API,
    "x-portkey-mode": "proxy openai",
    "x-portkey-trace-id": PORTKEY_TRACE,
    # Uncomment to perform automatic retries with exponential backoff if the OpenAI request fails:
    #"x-portkey-retry-count": 5
  }

In [None]:
# Optional prompt & response logging: when enabled, replace the module-level HEADERS
# (passed with the OpenAI requests below) with the Portkey headers.
if use_portkey:
    HEADERS=start_portkey_run()

In [None]:
def generate_candidate_prompts(description, test_cases, number_of_prompts):
  """Ask CANDIDATE_MODEL for several candidate system prompts in one request.

  Args:
    description: text describing the use-case; stripped before being sent.
    test_cases: the test cases, interpolated into the user message as-is.
    number_of_prompts: number of completions (`n`) requested from the API.

  Returns:
    list of the message contents of every returned choice.
  """
  user_message = f"Here are some test cases:`{test_cases}`\n\nHere is the description of the use-case: `{description.strip()}`\n\nRespond with your prompt, and nothing else. Be creative."
  chat_messages = [
      {"role": "system", "content": system_gen_system_prompt},
      {"role": "user", "content": user_message},
  ]

  response = openai.ChatCompletion.create(
      model=CANDIDATE_MODEL, # change this to gpt-3.5-turbo if you don't have GPT-4 access
      messages=chat_messages,
      temperature=CANDIDATE_MODEL_TEMPERATURE,
      n=number_of_prompts,
      headers=HEADERS)

  return [choice.message.content for choice in response.choices]

def expected_score(r1, r2):
    """Return the Elo expected score of a player rated r1 against one rated r2.

    The result lies in (0, 1) and equals 0.5 when both ratings are equal.
    """
    rating_gap = (r2 - r1) / 400
    return 1 / (1 + 10**rating_gap)

def update_elo(r1, r2, score1):
    """Apply one Elo update and return the new ratings as (new_r1, new_r2).

    Args:
        r1: current rating of the first player.
        r2: current rating of the second player.
        score1: result for the first player (1 win, 0 loss, 0.5 draw);
            the second player is credited with 1 - score1.

    Each rating moves by K times the gap between actual and expected score.
    """
    score2 = 1 - score1
    new_r1 = r1 + K * (score1 - expected_score(r1, r2))
    new_r2 = r2 + K * (score2 - expected_score(r2, r1))
    return new_r1, new_r2

# Get Score - stop after N_RETRIES attempts, waiting exponentially between attempts.
@retry(stop=stop_after_attempt(N_RETRIES), wait=wait_exponential(multiplier=1, min=4, max=70))
def get_score(description, test_case, pos1, pos2, ranking_model_name, ranking_model_temperature):
    """Ask the ranking model which of two generations is better.

    Args:
        description: task description; stripped before being sent.
        test_case: mapping whose 'prompt' entry is the test prompt.
        pos1: generation shown to the ranker as "Generation A".
        pos2: generation shown to the ranker as "Generation B".
        ranking_model_name: model name passed to the chat completion call.
        ranking_model_temperature: sampling temperature for the ranker.

    Returns:
        The content of the first choice of the one-token completion.
    """
    comparison = f"""Task: {description.strip()}
Prompt: {test_case['prompt']}
Generation A: {pos1}
Generation B: {pos2}"""

    # Bias the single output token towards the two allowed answers.
    answer_bias = {
          '32': 100,  # 'A' token
          '33': 100,  # 'B' token
    }

    response = openai.ChatCompletion.create(
        model=ranking_model_name,
        messages=[
            {"role": "system", "content": ranking_system_prompt},
            {"role": "user", "content": comparison},
        ],
        logit_bias=answer_bias,
        max_tokens=1,
        temperature=ranking_model_temperature,
        headers=HEADERS,
    )
    return response.choices[0].message.content

@retry(stop=stop_after_attempt(N_RETRIES), wait=wait_exponential(multiplier=1, min=4, max=70))
def get_generation(prompt, test_case):
    """Run one test case through GENERATION_MODEL under a candidate system prompt.

    Args:
        prompt: candidate system prompt, sent as the system message.
        test_case: mapping whose 'prompt' entry is sent as the user message.

    Returns:
        The content of the first choice, limited to GENERATION_MODEL_MAX_TOKENS tokens.
    """
    chat_messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"{test_case['prompt']}"},
    ]
    response = openai.ChatCompletion.create(
        model=GENERATION_MODEL,
        messages=chat_messages,
        max_tokens=GENERATION_MODEL_MAX_TOKENS,
        temperature=GENERATION_MODEL_TEMPERATURE,
        headers=HEADERS,
    )
    return response.choices[0].message.content

def test_candidate_prompts(test_cases, description, prompts):
  """Run an Elo tournament between every pair of candidate prompts.

  For each pair of prompts and each test case, both prompts generate an
  output, the ranking model judges the pair twice (once in each A/B order),
  and the averaged result updates both prompts' Elo ratings.

  Args:
    test_cases: iterable of mappings, each with a 'prompt' entry.
    description: task description forwarded to the ranking model.
    prompts: candidate system prompts (used as dictionary keys).

  Returns:
    dict mapping each prompt to its final rating; every prompt starts at 1200.
  """
  # Initialize each prompt with an ELO rating of 1200
  prompt_ratings = {prompt: 1200 for prompt in prompts}

  # One round per (prompt pair, test case)
  total_rounds = len(test_cases) * len(prompts) * (len(prompts) - 1) // 2

  # The context manager closes the bar even if an API call raises after its retries.
  with tqdm(total=total_rounds, ncols=70) as pbar:
      for prompt1, prompt2 in itertools.combinations(prompts, 2):
          for test_case in test_cases:
              # Generate outputs for each prompt
              generation1 = get_generation(prompt1, test_case)
              generation2 = get_generation(prompt2, test_case)

              # Rank the outputs in both orders so position preference cancels out
              score1 = get_score(description, test_case, generation1, generation2, RANKING_MODEL, RANKING_MODEL_TEMPERATURE)
              score2 = get_score(description, test_case, generation2, generation1, RANKING_MODEL, RANKING_MODEL_TEMPERATURE)

              # Convert to prompt1's score: in the first call prompt1 is 'A',
              # in the second it is 'B'; any other answer counts as a draw.
              score1 = 1 if score1 == 'A' else 0 if score1 == 'B' else 0.5
              score2 = 1 if score2 == 'B' else 0 if score2 == 'A' else 0.5

              # Average the scores
              score = (score1 + score2) / 2

              # Update ELO ratings
              r1, r2 = prompt_ratings[prompt1], prompt_ratings[prompt2]
              r1, r2 = update_elo(r1, r2, score)
              prompt_ratings[prompt1], prompt_ratings[prompt2] = r1, r2

              # Print the winner of this round
              if score > 0.5:
                  print(f"Winner: {prompt1}")
              elif score < 0.5:
                  print(f"Winner: {prompt2}")
              else:
                  print("Draw")

              # Advance only after the round has actually completed
              pbar.update()

  return prompt_ratings

def generate_optimal_prompt(description, test_cases, number_of_prompts=10, use_wandb=False):
  """Generate candidate prompts, rank them by Elo rating, and print the table.

  Args:
    description: task description used for generation and ranking.
    test_cases: test cases each candidate prompt is evaluated on.
    number_of_prompts: how many candidate prompts to generate.
    use_wandb: when true, also log the ratings table to Weights & Biases
      (starting a run if none is active) and finish the run afterwards.

  Returns None; the ratings are printed as a PrettyTable, best first.
  """
  if use_wandb:
    wandb_table = wandb.Table(columns=["Prompt", "Ranking"])
    if wandb.run is None:
      start_wandb_run()

  candidates = generate_candidate_prompts(description, test_cases, number_of_prompts)
  ratings = test_candidate_prompts(test_cases, description, candidates)

  # Highest-rated prompt first
  ranked = sorted(ratings.items(), key=lambda item: item[1], reverse=True)

  # Build the final ELO ratings table
  table = PrettyTable()
  table.field_names = ["Prompt", "Rating"]
  for prompt, rating in ranked:
    table.add_row([prompt, rating])
    if use_wandb:
      wandb_table.add_data(prompt, rating)

  if use_wandb: # log the results to a Weights & Biases table and finish the run
    wandb.log({"prompt_ratings": wandb_table})
    wandb.finish()
  print(table)


Inserting Specific Test Cases:

In [None]:
# NOTE(review): the description asks for a landing page headline, while the test prompts below
# ask for explanations, overviews and lists — confirm the description matches the intended task.
description = "Given a prompt, generate a landing page headline." # this style of description tends to work well

# Each test case is a dict with a single "prompt" entry.
test_cases = [
  {"prompt": text}
  for text in (
    "Explain the concept of short selling a stock in simple terms",
    "Provide a 200 word overview of quantitative easing monetary policy",
    "Compare the Bloomberg terminals to alternative financial data platforms",
    "Describe the typical workflow of a hedge fund analyst",
    "Explain the capital asset pricing model formula in simple terms",
    "Provide 3 creative examples comparing stocks to different food items",
    "Write a children's story explaining bonds and yields",
    "Analyze the impact of raising interest rates on economic growth",
    "Discuss the role creativity plays in designing fintech solutions",
    "Propose 5 potential startup ideas in decentralized finance",
  )
]

# Record the task definition alongside the rest of the run config.
if use_wandb:
    wandb.config.update(dict(description=description, test_cases=test_cases))

In [None]:
# NOTE(review): the captured output below shows the installed openai CLI has no `migrate`
# subcommand (it only offers api, tools, wandb), so this cell fails as written.
!openai migrate

usage: openai [-h] [-V] [-v] [-b API_BASE] [-k API_KEY] [-p PROXY [PROXY ...]] [-o ORGANIZATION]
              {api,tools,wandb} ...
openai: error: argument {api,tools,wandb}: invalid choice: 'migrate' (choose from 'api', 'tools', 'wandb')


In [None]:
# Run the pipeline: generate NUMBER_OF_PROMPTS candidate prompts, rank them, and print the rating table.
generate_optimal_prompt(description, test_cases, NUMBER_OF_PROMPTS, use_wandb)

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [None]:
import openai

def generate_optimal_prompt(description, test_cases, number_of_prompts, use_wandb):
    """Generate candidate prompts with the OpenAI Completion endpoint.

    NOTE(review): this rebinds generate_optimal_prompt, which an earlier cell
    defines as the Elo-ranking pipeline; once this cell runs, the earlier
    definition is no longer reachable under this name.

    Args:
        description: project/use-case description interpolated into the prompt.
        test_cases: test cases interpolated into the prompt as-is.
        number_of_prompts: number of completions to request, one API call each.
        use_wandb: when true, log the generated prompts to Weights & Biases.

    Returns:
        list of the stripped completion texts, in request order.
    """
    # The API key configured at the top of the notebook is used as-is; it is
    # deliberately not overwritten with a placeholder here.

    # Define a prompt based on your use case
    prompt = f"Description: {description}\nTest Cases:\n{test_cases}\nOptimal Prompt:"

    # Generate prompts using OpenAI's GPT-3
    prompts = []
    for _ in range(number_of_prompts):
        response = openai.Completion.create(
            engine="text-davinci-002",  # Choose the appropriate engine
            prompt=prompt,
            max_tokens=150,
            temperature=0.7,
            n=1,
            headers=HEADERS,  # same headers as the other OpenAI calls (Portkey when enabled)
        )
        prompts.append(response.choices[0].text.strip())

    # Additional processing or logging, if needed
    if use_wandb:
        # wandb.log needs an active run; start one if none is active
        if wandb.run is None:
            start_wandb_run()
        wandb.log({"prompts": prompts})

    return prompts

# Example usage
# NOTE(review): the four assignments below rebind description, test_cases, NUMBER_OF_PROMPTS
# and use_wandb, which earlier cells already set; the placeholder strings replace the real
# task description and test-case list for any cell run afterwards.
description = "Your project description here."
test_cases = "Your test cases here."
NUMBER_OF_PROMPTS = 5
use_wandb = True  # Set to True if you want to log prompts using WandB

# Generate the prompts and show the resulting list
generated_prompts = generate_optimal_prompt(description, test_cases, NUMBER_OF_PROMPTS, use_wandb)
print(generated_prompts)
