In [1]:
from openai import OpenAI
from math import exp
import numpy as np
from scipy.stats import entropy
import math
from IPython.display import display, HTML
import os
np.set_printoptions(legacy='1.25')


In [3]:
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [3]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
    top_logprobs=None,
) -> str:
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

In [4]:
PROMPT1 = """You will be given a math word problem. Provide a numeric answer to the problem. MAKE SURE to only provide a numeric answer.
Problem: {question}"""

PROMPT2 = """You will be given a math word problem. Provide your response in the following format.
ANSWER: <answer>
CONFIDENCE: <confidence>
Your answer is the numeric answer to the problem. MAKE SURE to only provide a numeric answer.
Your confidence is how much you trust the answer to be correct. MAKE SURE to provide a binary rating: high or low.
Problem: {question}"""

In [5]:

def get_answer(question, prompt):
    num_next_tokens = 5
    API_RESPONSE = get_completion(
        [{"role": "user", "content": prompt.format(question=question)}],
        model="gpt-4",
        logprobs=True,
        top_logprobs=num_next_tokens,
    )

    answer = API_RESPONSE.choices[0].message.content
    top_logprobs = API_RESPONSE.choices[0].logprobs.content[0].top_logprobs
    probs = []
    others = []
    for i, logprob in enumerate(top_logprobs, start=1):
        linear_prob = np.round(np.exp(logprob.logprob)*100, 3)
        others.append((logprob.token, logprob.logprob, linear_prob))
        probs.append(linear_prob)
    print("Answer: {}, entropy: {}".format(answer, entropy(probs)))
    print("Impasse structure: {}".format(others))


In [6]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the first fight?", PROMPT1)

Answer: 375, entropy: 0.0
Impasse structure: [('375', -3.1281633e-07, 100.0), ('225', -15.264437, 0.0), ('75', -18.12202, 0.0), ('325', -19.235233, 0.0), (' ', -21.041912, 0.0)]


In [7]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the second fight?", PROMPT1)

Answer: 500, entropy: 0.0003753873139448033
Impasse structure: [('500', -2.9517714e-05, 99.997), ('600', -11.243075, 0.001), ('300', -11.964536, 0.001), ('100', -12.129565, 0.001), ('200', -13.035619, 0.0)]


In [8]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in both fights?", PROMPT1)

Answer: 1050, entropy: 1.5189745752999697
Impasse structure: [('105', -1.6124269, 19.94), ('725', -1.9630243, 14.043), ('950', -2.3705635, 9.343), ('115', -2.5301895, 7.964), ('110', -2.805111, 6.05)]


In [9]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many total punches did he throw in both fights?", PROMPT1)

Answer: 725, entropy: 1.4366897803128278
Impasse structure: [('725', -1.3146758, 26.856), ('105', -1.9913707, 13.651), ('650', -2.4992461, 8.215), ('950', -2.5201507, 8.045), ('675', -2.9542613, 5.212)]


In [10]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in previous fight?", PROMPT1)

Answer: 375, entropy: 0.6405684940442882
Impasse structure: [('375', -0.2940496, 74.524), ('300', -2.275613, 10.273), ('225', -3.563575, 2.834), ('200', -4.2779193, 1.387), ('400', -4.3255897, 1.323)]


In [11]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in next fight?", PROMPT1)

Answer: 375, entropy: 1.356892953483105
Impasse structure: [('375', -1.0072517, 36.522), ('450', -1.4297619, 23.937), ('The', -2.564104, 7.699), ('225', -2.591484, 7.491), ('0', -2.7740326, 6.241)]


In [12]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in before fight?", PROMPT1)

Answer: 375, entropy: 0.02732034201484192
Impasse structure: [('375', -0.004411721, 99.56), ('75', -6.4138966, 0.164), ('225', -7.050623, 0.087), ('125', -7.3187084, 0.066), ('150', -8.159777, 0.029)]


In [13]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in later fight?", PROMPT1)

Answer: 500, entropy: 0.0001251303559521681
Impasse structure: [('500', -1.998142e-05, 99.998), ('100', -11.699649, 0.001), ('600', -12.292851, 0.0), ('300', -12.548093, 0.0), ('200', -13.226104, 0.0)]
