In [1]:
from openai import OpenAI
from math import exp
import numpy as np
from scipy.stats import entropy
import math
from IPython.display import display, HTML
import os
np.set_printoptions(legacy='1.25')


In [7]:
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [8]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
    top_logprobs=None,
) -> str:
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

In [9]:
PROMPT1 = """You will be given a math word problem. Provide a numeric answer to the problem. MAKE SURE to only provide a numeric answer.
Problem: {question}"""

PROMPT2 = """You will be given a math word problem. Provide your response in the following format.
ANSWER: <answer>
CONFIDENCE: <confidence>
Your answer is the numeric answer to the problem. MAKE SURE to only provide a numeric answer.
Your confidence is how much you trust the answer to be correct. MAKE SURE to provide a binary rating: high or low.
Problem: {question}"""

In [10]:

def get_answer(question, prompt):
    num_next_tokens = 5
    API_RESPONSE = get_completion(
        [{"role": "user", "content": prompt.format(question=question)}],
        model="gpt-4",
        logprobs=True,
        top_logprobs=num_next_tokens,
    )

    answer = API_RESPONSE.choices[0].message.content
    top_logprobs = API_RESPONSE.choices[0].logprobs.content[0].top_logprobs
    probs = []
    others = []
    for i, logprob in enumerate(top_logprobs, start=1):
        linear_prob = np.round(np.exp(logprob.logprob)*100, 3)
        others.append((logprob.token, logprob.logprob, linear_prob))
        probs.append(linear_prob)
    print("Answer: {}, entropy: {}".format(answer, entropy(probs)))
    print("Impasse structure: {}".format(others))


In [11]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the first fight?", PROMPT1)

Answer: 375, entropy: 0.0
Impasse structure: [('375', -1.9361265e-07, 100.0), ('225', -15.719437, 0.0), ('75', -18.38042, 0.0), ('325', -19.588875, 0.0), (' ', -21.828032, 0.0)]


In [12]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the second fight?", PROMPT1)

Answer: 500, entropy: 0.0003615243703333824
Impasse structure: [('500', -2.939851e-05, 99.997), ('600', -10.883441, 0.002), ('100', -12.064887, 0.001), ('200', -13.354079, 0.0), ('300', -13.402446, 0.0)]


In [13]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the fight?", PROMPT1)

Answer: 375, entropy: 1.5267128690998957
Impasse structure: [('375', -1.4969186, 22.382), ('525', -2.0062208, 13.45), ('475', -2.069562, 12.624), ('450', -2.1427298, 11.733), ('725', -2.8648405, 5.699)]


In [14]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 4 rounds of 5 minutes. Second fight lasts 5 rounds of 3 minutes. How many punches did he throw in the fight?", PROMPT1)

Answer: 650, entropy: 1.3808994075811991
Impasse structure: [('650', -1.0640868, 34.504), ('450', -2.266451, 10.368), ('700', -2.4353218, 8.757), ('550', -2.5169563, 8.07), ('600', -2.629074, 7.215)]


In [15]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in both fights?", PROMPT1)

Answer: 725, entropy: 1.5337466497556203
Impasse structure: [('725', -1.7120624, 18.049), ('105', -2.079515, 12.499), ('950', -2.4463725, 8.661), ('650', -2.5329833, 7.942), ('850', -2.8044868, 6.054)]


In [16]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many total punches did he throw in both fights?", PROMPT1)

Answer: 725, entropy: 1.5577174031605316
Impasse structure: [('725', -1.8074045, 16.408), ('105', -2.0154428, 13.326), ('950', -2.2399797, 10.646), ('650', -2.4545908, 8.59), ('850', -2.763505, 6.307)]


In [17]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in previous fight?", PROMPT1)

Answer: 375, entropy: 0.5606224822417526
Impasse structure: [('375', -0.2269503, 79.696), ('300', -2.4358585, 8.752), ('225', -3.5945404, 2.747), ('150', -4.434527, 1.186), ('200', -4.7090707, 0.901)]


In [18]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in next fight?", PROMPT1)

Answer: 225, entropy: 1.3944895641112276
Impasse structure: [('225', -1.1419826, 31.919), ('375', -1.4138522, 24.32), ('450', -1.7477412, 17.417), ('0', -2.4290252, 8.812), ('275', -3.6106334, 2.703)]


In [19]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in before fight?", PROMPT1)

Answer: 375, entropy: 0.029726848785261863
Impasse structure: [('375', -0.0048692804, 99.514), ('75', -6.400305, 0.166), ('225', -6.8846207, 0.102), ('125', -7.221674, 0.073), ('150', -7.877079, 0.038)]


In [20]:
get_answer("On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in later fight?", PROMPT1)

Answer: 500, entropy: 0.0
Impasse structure: [('500', -6.6306106e-06, 99.999), ('100', -12.542163, 0.0), ('600', -13.441964, 0.0), ('200', -14.2915945, 0.0), ('300', -14.621648, 0.0)]
