In [206]:
import scipy.stats
from openai import OpenAI
from math import exp
import numpy as np
import json
from scipy.stats import entropy
import math
from IPython.display import display, HTML
import os
import ast
np.set_printoptions(legacy='1.25')
import pprint


In [5]:
# Shared API client used by get_completion; the key is read from the
# OPENAI_API_KEY environment variable so it never appears in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


#### GPT methods

In [6]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens
    top_logprobs=None,  # how many alternative tokens to report per position
):
    """Send a chat-completion request through the module-level ``client``.

    Args:
        messages: Chat messages, each a dict such as
            ``{"role": "user", "content": "..."}``.
        model: Model name forwarded to the API.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature`` (defaults to 0).
        stop: Forwarded as ``stop``.
        seed: Forwarded as ``seed`` (defaults to a fixed value).
        tools: Optional tool definitions; only sent when truthy.
        logprobs: If true, the response carries the log probability of each
            output token in the message content.
        top_logprobs: Forwarded as ``top_logprobs``.

    Returns:
        The completion object returned by ``client.chat.completions.create``
        (not a string). Callers read ``.choices[0].message.content`` and
        ``.choices[0].logprobs.content`` from it.
    """
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    # Only include "tools" when the caller supplied some.
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

In [226]:
def get_ans_token_logprobs(response, anskey):
    """Extract the answer dictionary and the answer token's alternatives.

    The response's token stream is scanned for a token equal to ``anskey``
    (ignoring surrounding whitespace) that is preceded by an opening-brace
    token. The tokens from that brace to the next closing-brace token are
    joined and parsed with ``ast.literal_eval``.

    Args:
        response: Completion object exposing
            ``response.choices[0].logprobs.content``, a sequence of entries
            with ``.token`` and ``.top_logprobs`` attributes.
        anskey: Dictionary key under which the model reports its answer.

    Returns:
        ``(response_dict, ans_logprobs)``. ``response_dict`` is the parsed
        dictionary. ``ans_logprobs`` is the ``top_logprobs`` of the first
        token after the key whose text equals ``str(response_dict[anskey])``,
        or ``None`` when no single token spells the whole answer (e.g. the
        answer was split across several tokens).

    Raises:
        ValueError: If no dictionary containing ``anskey`` can be located in
            the token stream. ``ast.literal_eval`` may also raise
            ``ValueError``/``SyntaxError`` for malformed dictionary text.
    """
    tokenprobs = response.choices[0].logprobs.content
    tokens = [entry.token.strip() for entry in tokenprobs]

    openers = ('{', '{\'', '{\"')
    closers = ('}', '}\'', '}.', '\'}', '}\"')

    # Use the first occurrence of the key that actually sits inside a
    # dictionary; an earlier mention of the key in free text is skipped
    # instead of letting the backward scan wrap around to negative indices.
    key_idx = None
    json_begin = None
    for idx, token in enumerate(tokens):
        if token != anskey:
            continue
        json_begin = next(
            (k for k in range(idx - 1, -1, -1) if tokens[k] in openers), None
        )
        if json_begin is not None:
            key_idx = idx
            break
    if key_idx is None:
        raise ValueError(
            "could not find key {!r} inside a dictionary in the response".format(anskey)
        )

    json_end = next(
        (k for k in range(json_begin, len(tokens)) if tokens[k] in closers), None
    )
    if json_end is None:
        raise ValueError(
            "dictionary containing {!r} is never closed in the response".format(anskey)
        )

    response_list = tokenprobs[json_begin: json_end + 1]
    response_string = "".join(o.token for o in response_list)
    # Boundary tokens may carry whitespace, quotes or a period around the
    # braces; keep only the text from the first '{' to the last '}'.
    response_string = response_string[
        response_string.find('{'): response_string.rfind('}') + 1
    ]
    response_dict = ast.literal_eval(response_string)

    # Take the FIRST matching token after the key so that a later field with
    # the same text (e.g. 'confidence': 10 when the answer is 10) cannot
    # overwrite it. Tokens are compared with whitespace stripped.
    answer_text = str(response_dict[anskey])
    ans_logprobs = None
    for k in range(key_idx + 1, json_end + 1):
        if tokens[k] == answer_text:
            ans_logprobs = tokenprobs[k].top_logprobs
            break

    return response_dict, ans_logprobs



In [290]:
def get_answer(question, prompt, anskey, model="gpt-4", num_next_tokens=5):
    """Ask the model a question and summarise how certain it was of its answer.

    Args:
        question: Problem text substituted into ``prompt``.
        prompt: Template containing a ``{question}`` placeholder.
        anskey: Key of the answer in the dictionary the model is asked to return.
        model: Model name forwarded to ``get_completion``.
        num_next_tokens: Number of alternative tokens requested per position
            (``top_logprobs``); also the number of outcomes used to normalise
            the entropy.

    Returns:
        A dict with the formatted prompt, the raw response text, the parsed
        response dictionary, the normalised entropy and the alternatives for
        the answer token as ``(token, probability in %, logprob)`` tuples.

        The entropy is the base-2 Shannon entropy of the alternatives'
        probabilities divided by the entropy of a uniform distribution over
        ``num_next_tokens`` outcomes, so 0 means all mass on one token and 1
        means a uniform spread. ``scipy.stats.entropy`` rescales its input to
        sum to 1, so only the returned alternatives are considered. The entropy
        is ``None`` (and the alternatives empty) when the answer could not be
        matched to a single output token.
    """
    prompt_up = prompt.format(question=question)
    API_RESPONSE = get_completion(
        [{"role": "user", "content": prompt_up}],
        model=model,
        logprobs=True,
        top_logprobs=num_next_tokens,
    )

    response_dict, ans_logprobs = get_ans_token_logprobs(API_RESPONSE, anskey)

    alters = []
    probs = []
    # ans_logprobs is None when no single token spelled the whole answer.
    for item in ans_logprobs or []:
        linear_prob = np.round(np.exp(item.logprob) * 100, 3)
        alters.append((item.token, linear_prob, item.logprob))
        probs.append(linear_prob)

    if not probs:
        normalised_entropy = None
    elif num_next_tokens > 1:
        # Maximum attainable entropy: a uniform spread over the requested
        # number of alternatives.
        max_entropy = scipy.stats.entropy(
            np.full(num_next_tokens, 1.0 / num_next_tokens), base=2
        )
        normalised_entropy = np.round(
            scipy.stats.entropy(probs, base=2) / max_entropy, 3
        )
    else:
        # A single alternative carries no uncertainty; avoid dividing by 0.
        normalised_entropy = 0.0

    return_dict = {
        '1. prompt': prompt_up,
        '2. response': API_RESPONSE.choices[0].message.content,
        '3. response_dict': response_dict,
        '4. entropy': normalised_entropy,
        '5. alternatives': alters
    }

    return return_dict


#### Prompts

In [281]:
# Baseline prompt: asks for a numeric answer only, returned as a dictionary
# under the key 'ans'. {question} is filled in by get_answer.
basePROMPT = (
    "You will be given a math word problem. "
    "Provide a numeric answer to the problem. "
    "MAKE SURE to only provide a numeric answer. "
    "Your final response should be a dictionary with key 'ans'.\n"
    "Problem: {question}"
)

In [310]:
# Like the baseline prompt, but additionally asks the model to self-report a
# confidence score (1-10) under the key 'confidence'.
confidencePROMPT = (
    "You will be given a math word problem. "
    "Provide a numeric answer to the problem. "
    "MAKE SURE to only provide a numeric answer. "
    "Your confidence is how much you trust your answer to be correct "
    "and is on a scale of 1-10. "
    "Your final response should be a dictionary with key 'ans' and 'confidence'.\n"
    "Problem: {question}"
)

In [102]:
# Minimal prompt: only the output-format instruction, no task description.
emptyPROMPT = (
    "Your final response should be a dictionary with key 'ans'. "
    "Problem: {question}"
)

In [304]:
# Chain-of-thought prompt: asks the model to reason step by step before giving
# the final dictionary. (The original opened with four quote characters, which
# put a stray '"' at the start of the prompt text sent to the model.)
cotPROMPT = """You will be given a math word problem. Provide a numeric answer to the problem. Think step by step. Your final response should be a dictionary with key 'ans'.
Problem: {question}"""

#### Problem text

LLMs, particularly those with reasoning capabilities, are known to be good at solving algebra word problems. Here, I experiment with various levels of ambiguity in the question. The following problems are variations of the same type. The first two problems have an unambiguous answer that can be calculated in a straightforward fashion. The third problem doesn't explicitly specify which fight the question is about. The fourth problem requires additional layers of reasoning.

In [299]:
# Four variations of the same word problem. Each entry holds the question text,
# the expected numeric answer (None when the question has no single correct
# answer) and whether the question is ambiguous.
problems = dict(
    straight_opt1_01=dict(
        question="On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the first fight?",
        correct_ans=375,  # 5 rounds * 3 min * 25 punches
        category="unambiguous",
    ),
    straight_opt2_01=dict(
        question="On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the second fight?",
        correct_ans=500,  # 4 rounds * 5 min * 25 punches
        category="unambiguous",
    ),
    ambiguous_01=dict(
        question="On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the fight?",
        correct_ans=None,  # the question does not say which fight
        category="ambiguous",
    ),
    total_01=dict(
        question="On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in both fights?",
        correct_ans=875,  # 375 + 500
        category="unambiguous",
    ),
)

#### Observations

We begin with the easy, unambiguous questions.


In [303]:
# Unambiguous question about the first fight, asked with the baseline prompt.
ans = get_answer(
    question=problems["straight_opt1_01"]["question"],
    prompt=basePROMPT,
    anskey="ans",
)
pprint.pprint(ans)

{'1. prompt': 'You will be given a math word problem. Provide a numeric answer '
              'to the problem. MAKE SURE to only provide a numeric answer. '
              "Your final response should be a dictionary with key 'ans'.\n"
              'Problem: On average Joe throws 25 punches per minute. First '
              'fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds '
              'of 5 minutes. How many punches did he throw in the first fight?',
 '2. response': "{'ans': 375}",
 '3. response_dict': {'ans': 375},
 '4. entropy': 0.0,
 '5. alternatives': [('375', 100.0, 0.0),
                     ('325', 0.0, -17.845675),
                     ('25', 0.0, -17.890102),
                     ('75', 0.0, -18.251888),
                     (' ', 0.0, -19.030909)]}


In [277]:
# Unambiguous question about the second fight, asked with the baseline prompt.
ans = get_answer(
    question=problems["straight_opt2_01"]["question"],
    prompt=basePROMPT,
    anskey="ans",
)
pprint.pprint(ans)

You will be given a math word problem. Provide a numeric answer to the problem. Your final response should be a dictionary with key 'ans'.
Problem: On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the second fight?
{'alternatives': [('500', 99.997, -3.4166656e-05),
                  ('100', 0.001, -11.121738),
                  ('200', 0.001, -11.738201),
                  ('125', 0.0, -13.001865),
                  ('400', 0.0, -13.233498)],
 'entropy': 0.0,
 'response': "{'ans': 500}",
 'response_dict': {'ans': 500}}


For both unambiguous questions, regular prompting can generate correct answers. I report entropy of the answer which captures the degree to which the LLM is 'confused' about the answer. In both these cases, there is no confusion and hence, the entropy is 0. All the other alternatives considered by the LLM have very low probabilities. This is what we expect when reasoning works as expected.

Next, we consider the ambiguous question where the fight under consideration is not specified. We expect the entropy of the answer to be high, reflecting that the LLM is confused about the answer.

In [278]:
# Ambiguous question (the fight is not specified), asked with the baseline prompt.
ans = get_answer(
    question=problems["ambiguous_01"]["question"],
    prompt=basePROMPT,
    anskey="ans",
)
pprint.pprint(ans)

You will be given a math word problem. Provide a numeric answer to the problem. Your final response should be a dictionary with key 'ans'.
Problem: On average Joe throws 25 punches per minute. First fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds of 5 minutes. How many punches did he throw in the fight?
{'alternatives': [('675', 35.515, -1.0352086),
                  ('105', 15.977, -1.8340234),
                  ('975', 9.613, -2.3420115),
                  ('110', 4.68, -3.0617952),
                  ('135', 4.216, -3.1663141)],
 'entropy': 0.81,
 'response': "{'ans': 675}",
 'response_dict': {'ans': 675}}


As expected, the entropy of the answer is very high, reflecting the confusion within the LLM's inference. However, the LLM does produce a response (675, which is incorrect). This behavior is very different from how a human would respond. Because the question is ambiguous, a human would ask 'which fight are you talking about?'. The secondary question elicits further information from the speaker and constrains ambiguity. LLMs cannot modulate their response based on 'confusion' in their inference systems.

Next, I investigate if the LLM is aware of its own confusion. I asked it to report its own confidence on the answer on a scale of 1-10.

In [315]:
# Same ambiguous question, but the prompt also asks for a self-reported
# confidence score.
ans = get_answer(
    question=problems["ambiguous_01"]["question"],
    prompt=confidencePROMPT,
    anskey="ans",
)
pprint.pprint(ans)

{'1. prompt': 'You will be given a math word problem. Provide a numeric answer '
              'to the problem. MAKE SURE to only provide a numeric answer. '
              'Your confidence is how much you trust your answer to be correct '
              'and is on a scale of 1-10. Your final response should be a '
              "dictionary with key 'ans' and 'confidence'.\n"
              'Problem: On average Joe throws 25 punches per minute. First '
              'fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds '
              'of 5 minutes. How many punches did he throw in the fight?',
 '2. response': "{'ans': 675, 'confidence': 10}",
 '3. response_dict': {'ans': 675, 'confidence': 10},
 '4. entropy': 0.885,
 '5. alternatives': [('675', 23.18, -1.4618801),
                     ('105', 22.48, -1.4925445),
                     ('975', 17.125, -1.7646126),
                     ('110', 4.887, -3.0185513),
                     ('102', 4.453, -3.1116643)]}


In its response, the LLM reports that it is highly confident in its answer (a score of 10). However, not only is the answer wrong, but the high answer entropy suggests that the LLM is not actually confident in the answer.

In [313]:
# Question that requires summing both fights (expected answer: 875), asked
# with the baseline prompt. The stray bare expression `problems['total_01']`
# was removed: it was not the last line of the cell, so it displayed nothing.
ans = get_answer(
    question=problems["total_01"]["question"],
    prompt=basePROMPT,
    anskey="ans",
)
pprint.pprint(ans)

{'1. prompt': 'You will be given a math word problem. Provide a numeric answer '
              'to the problem. MAKE SURE to only provide a numeric answer. '
              "Your final response should be a dictionary with key 'ans'.\n"
              'Problem: On average Joe throws 25 punches per minute. First '
              'fight lasts 5 rounds of 3 minutes. Second fight lasts 4 rounds '
              'of 5 minutes. How many punches did he throw in both fights?',
 '2. response': "{'ans': 675}",
 '3. response_dict': {'ans': 675},
 '4. entropy': 0.496,
 '5. alternatives': [('675', 69.956, -0.3573107),
                     ('975', 9.086, -2.398437),
                     ('105', 5.728, -2.8598094),
                     ('725', 3.057, -3.4875789),
                     ('825', 1.902, -3.9623733)]}
