In [10]:
import os
from pathlib import Path
from pprint import pprint
import proxai as px
import proxai.types as px_types

In [11]:
# Keep the ProxAI cache and the per-experiment logs under the user's home
# directory, creating both directories on first run.
home_dir = Path.home()
cache_path = f'{home_dir}/proxai_cache/'
logging_path = f'{home_dir}/proxai_log/math_problems/'
for directory in (cache_path, logging_path):
    os.makedirs(directory, exist_ok=True)

# NOTE(review): retry_if_error_cached=True presumably re-issues queries whose
# cached result was an error -- confirm against the ProxAI documentation.
cache_options = px_types.CacheOptions(
    path=cache_path,
    retry_if_error_cached=True)
px.connect(cache_options=cache_options, logging_path=logging_path)

In [12]:
# Evaluation set of algebra word problems, presumably sampled from the Kaggle
# dataset linked below (TODO confirm provenance):
# https://www.kaggle.com/datasets/thedevastator/mathematical-problems-dataset-various-mathematic
#
# Each entry holds:
#   'question': the problem text that is sent to the model.
#   'answer':   the expected integer, stored as a string because
#               get_result_for_question compares it with `==` against the
#               string produced by extract_numerical_result.
TEST_DATA = [
  {'answer': '5',
   'question': 'Suppose 12*q + 42 = q + 42. Solve q = 22*t - 0*t - 110 for t.'},
  {'answer': '-4',
   'question': 'Suppose -3*p = -p + m + 60, 3*m + 151 = -5*p. Let u(d) = 2*d**2 '
               '+ 58*d + 9. Let s be u(p). Solve 3*j = -s - 3 for j.'},
  {'answer': '4',
   'question': 'Suppose 8*b - 6*b - 2*q = 16, 0 = q + 5. Let k be (12 - 7 - 8) '
               '+ 3. Suppose h - 32 = -h + 2*g, -5*g - 20 = 0. Solve b*s + k*s '
               '= h for s.'},
  {'answer': '0',
   'question': 'Let t = -10 - -10. Let v(j) = j**2 + 53*j + 285. Let g be '
               'v(-47). Solve 0 = -t*k - g*k for k.'},
  {'answer': '-3',
   'question': 'Let t(n) = -n + 2. Suppose 4*c = 5*b, -7*c + 5*b = -9*c. Let a '
               'be t(c). Solve -a*d - 2*d = 12 for d.'},
  {'answer': '-3',
   'question': 'Let f(k) be the first derivative of -k**3/3 + 2*k**2 - k + 18. '
               'Let y be f(3). Suppose -y = 2*r - 2*q, 3*r - 3*q = 2*r - 11. '
               'Solve -1 + 13 = -r*m for m.'},
  {'answer': '-2',
   'question': 'Let w = -4494 + 4515. Solve 3*o + w = 15 for o.'},
  {'answer': '3',
   'question': 'Let r(h) = h**3 + 22*h**2 - 26*h - 59. Let d be r(-23). Solve 6 '
               '= -8*t + d*t for t.'},
  {'answer': '3',
   'question': 'Let w(u) = -2763*u - 13815. Let v be w(-5). Let g(q) = q**3 - '
               '4*q**2 + 2*q + 3. Let m be g(3). Solve v = -m*y - y + 3 for y.'},
  {'answer': '2',
   'question': 'Let q = 123 - 119. Suppose -q*u - 5*i + 5 = 0, -u - 8 = -5*i - '
               '3. Solve u = 3*z - 5*z + 4 for z.'},
  {'answer': '-1',
   'question': 'Let j(k) = 3*k**2 + 7*k - 12. Let t be j(2). Let q = -12 + t. '
               'Solve q*g - 2 = -4 for g.'},
  {'answer': '3',
   'question': 'Suppose 2*r - 5*p + 47 = 0, r + 5*p = -7 - 24. Let u = -24 - r. '
               'Solve -z - u*z = -9 for z.'}
]

In [13]:
def get_answer(question):
    """Ask the model for a single-integer answer to `question`.

    No model is passed here, so the call relies on whatever model ProxAI
    currently has configured (presumably via `px.set_model` -- confirm).

    Args:
        question: Problem text inserted verbatim into the prompt.

    Returns:
        Whatever `px.generate_text` returns for the prompt.
    """
    instruction = (
        'Can you give me exactly one integer answer to the following question? '
        'Nothing else, just the answer.\n')
    # NOTE(review): unique_response_limit=3 presumably lets the cache hold up
    # to three distinct responses for this prompt -- confirm with ProxAI docs.
    return px.generate_text(
        prompt=f'{instruction}Question: {question}',
        unique_response_limit=3)

In [9]:
# Smoke-test the prompt on the first problem, then dump the usage summary.
first_answer = get_answer(TEST_DATA[0]['question'])
print(first_answer)
summary_data = px.get_summary(json=True)
pprint(summary_data)

5
{'cache_stats': {'saved_avr_response_time': 0.669426,
                 'saved_estimated_price': 0.00075,
                 'saved_response_token_count': 500,
                 'saved_token_count': 500,
                 'saved_total_response_time': 3.34713,
                 'total_cache_hit': 5,
                 'total_success_return': 5},
 'provider_stats': {},
 'providers': {'openai': {'cache_stats': {'saved_avr_response_time': 0.669426,
                                          'saved_estimated_price': 0.00075,
                                          'saved_response_token_count': 500,
                                          'saved_token_count': 500,
                                          'saved_total_response_time': 3.34713,
                                          'total_cache_hit': 5,
                                          'total_success_return': 5},
                          'models': [{'model_stats': {'cache_stats': {'saved_avr_response_time': 0.669426,
               

In [14]:
def extract_numerical_result(answer):
    """Reduce a free-form model reply to a canonical integer string.

    `answer` is sent to GPT-4-turbo-preview together with a fixed few-shot
    conversation that demonstrates returning only the number contained in a
    sentence. The model reply is then normalised with `str(int(...))`.

    Note that the few-shot examples include a decimal ('-2.37'), but `int()`
    rejects decimal strings, so a decimal reply yields None.

    Args:
        answer: Text to extract the number from; interpolated verbatim into
            the final user message.

    Returns:
        The integer as a string (e.g. '16'), or None when the model reply
        cannot be parsed as an integer.
    """
    # The prompt text below is part of the query itself; keep it unchanged so
    # previously cached responses for the same query still match.
    response = px.generate_text(
        model=(px_types.Provider.OPENAI, px_types.OpenAIModel.GPT_4_TURBO_PREVIEW),
        unique_response_limit=1,
        system=('You are returning single numerical result. '
                'Just one single numerical result. Nothing else.'),
        messages=[
            {'role': 'user',
             'content': ('Give me numerical value from following sentences.\n'
                         'Sentence: """The answer is 37."""')},
            {'role': 'assistant',
             'content': '37'},
            {'role': 'user',
             'content': ('Give me numerical value from following sentences.\n'
                         'Sentence: """-2.37 is the answer.\n'
                         'Are there anything that I can help you?"""')},
            {'role': 'assistant',
             'content': '-2.37'},
            {'role': 'user',
             'content': ('Give me numerical value from following sentences.\n'
                         'Sentence: """798"""')},
            {'role': 'assistant',
             'content': '798'},
            {'role': 'user',
             'content': ('Give me numerical value from following sentences.\n'
                         f'Sentence: """{answer}"""')}])
    try:
        return str(int(response))
    except (TypeError, ValueError):
        # Only parse failures mean "no usable number". A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return None

In [None]:
# Check the extractor on a sentence with a trailing number, then dump the
# usage summary.
sample_sentence = 'Yes, there is the result: 16'
print(extract_numerical_result(sample_sentence))
summary_data = px.get_summary(json=True)
pprint(summary_data)

In [15]:
def get_result_for_question(question, answer, try_count):
    """Try a question up to `try_count` times, stopping at the first correct answer.

    Each attempt asks the model via `get_answer`, normalises the reply with
    `extract_numerical_result`, and compares the result to `answer` with `==`.

    Args:
        question: Problem text to send to the model.
        answer: Expected result, in the same string form that
            `extract_numerical_result` returns.
        try_count: Maximum number of attempts.

    Returns:
        A dict of integer counters with keys 'correct', 'incorrect' and
        'error'. 'correct' is 1 if some attempt matched (the loop stops there)
        and 0 otherwise; 'incorrect' counts attempts whose extracted value did
        not match (including a None extraction); 'error' counts attempts that
        raised an exception.
    """
    question_result = {'correct': 0, 'incorrect': 0, 'error': 0}
    for _ in range(try_count):
        try:
            result = extract_numerical_result(get_answer(question))
        except Exception:
            # Best effort: a failed call uses up one attempt and is tallied,
            # rather than aborting the whole evaluation.
            question_result['error'] += 1
            continue
        if result == answer:
            # Stay an int counter like the other fields instead of being
            # overwritten with a bool.
            question_result['correct'] += 1
            return question_result
        question_result['incorrect'] += 1
    return question_result

In [None]:
# Run the first problem against GPT-4 with up to three attempts, then dump the
# usage summary.
gpt4_model = (px_types.Provider.OPENAI, px_types.OpenAIModel.GPT_4)
px.set_model(generate_text=gpt4_model)
first_case = TEST_DATA[0]
pprint(get_result_for_question(first_case['question'], first_case['answer'], 3))
summary_data = px.get_summary(json=True)
pprint(summary_data)

In [16]:
def eval_math_questions(try_count, verbose=False):
    """Evaluate every problem in TEST_DATA and tally the outcomes.

    Args:
        try_count: Attempts allowed per question; forwarded to
            `get_result_for_question`.
        verbose: When true, print an 'i/N' progress line before each question.

    Returns:
        A dict with integer counters 'correct', 'incorrect' and 'error', plus
        'all_results': one label per question in TEST_DATA order, each being
        'True', 'False' or 'Error'. A question is labelled 'Error' only when
        it was not answered correctly and its error count equals `try_count`.
    """
    eval_result = {'correct': 0, 'incorrect': 0, 'error': 0, 'all_results': []}
    for position, test in enumerate(TEST_DATA, start=1):
        if verbose:
            print(f'{position}/{len(TEST_DATA)}')
        outcome = get_result_for_question(
            question=test['question'],
            answer=test['answer'],
            try_count=try_count)
        # Pick the counter to bump and the label to record in one place.
        if outcome['correct']:
            counter, label = 'correct', 'True'
        elif outcome['error'] == try_count:
            counter, label = 'error', 'Error'
        else:
            counter, label = 'incorrect', 'False'
        eval_result[counter] += 1
        eval_result['all_results'].append(label)
    return eval_result

In [None]:
# Evaluate the whole test set with GPT-4 (three attempts per question, with
# progress output), then dump the usage summary.
gpt4_model = (px_types.Provider.OPENAI, px_types.OpenAIModel.GPT_4)
px.set_model(generate_text=gpt4_model)
gpt4_eval = eval_math_questions(3, verbose=True)
pprint(gpt4_eval)
summary_data = px.get_summary(json=True)
pprint(summary_data)

In [21]:
def _print_usage_diff(diff_summary, prefix=''):
    """Print the cache/provider usage counters of a summary difference.

    `prefix` is prepended to every label (e.g. 'TOTAL ').
    """
    cache_stats = diff_summary.cache_stats
    provider_stats = diff_summary.provider_stats
    print(f'{prefix}Cache Hit     : {cache_stats.total_cache_hit}')
    print(f'{prefix}Provider Call : {provider_stats.total_queries}')
    # ':.2' formats with two significant digits (not two decimals), matching
    # the output this notebook has always produced.
    print(f'{prefix}Saved dolar amout via cache : {cache_stats.saved_estimated_price:.2} $')
    print(f'{prefix}Charged dolar amount        : {provider_stats.estimated_price:.2} $')


def run_test(models, try_count, providers=('openai',)):
    """Evaluate the math test set on each model and report per-model usage.

    For every selected model this sets it as the active text-generation model,
    runs `eval_math_questions`, prints the correct/incorrect/error tally and
    the cache/provider usage accrued during that model's run, and finally
    prints the usage accrued over the whole call.

    Args:
        models: Iterable of (provider, provider_model) pairs.
        try_count: Attempts allowed per question.
        providers: Providers to evaluate; models of any other provider are
            skipped. Defaults to ('openai',), which keeps the original
            OpenAI-only behaviour. Pass None to evaluate every model.

    Returns:
        Dict mapping (provider, provider_model) to that model's list of
        per-question labels (the 'all_results' of `eval_math_questions`).
    """
    all_results = {}
    start_summary = px.get_summary()
    for provider, provider_model in models:
        if providers is not None and provider not in providers:
            continue
        model_start_summary = px.get_summary()
        print(f'{provider:>20} | {provider_model}')
        px.set_model(generate_text=(provider, provider_model))
        eval_result = eval_math_questions(try_count=try_count)
        all_results[(provider, provider_model)] = eval_result['all_results']
        print(
            f'Correct: {eval_result["correct"]:2}, '
            f'Incorrect: {eval_result["incorrect"]:2}, '
            f'Error: {eval_result["error"]:2}')
        # Usage attributable to this model only.
        _print_usage_diff(px.get_summary() - model_start_summary)
        print()
    # Usage accumulated across all evaluated models.
    _print_usage_diff(px.get_summary() - start_summary, prefix='TOTAL ')
    return all_results

In [22]:
# Evaluate every listed text-generation model with three attempts per question.
available_models = px.models.generate_text(verbose=True)
all_results = run_test(available_models, 3)

              openai | gpt-3.5-turbo
Correct:  4, Incorrect:  8, Error:  0
Cache Hit     : 64
Provider Call : 0
Saved dolar amout via cache : 0.1 $
Charged dolar amount        : 0.0 $

              openai | gpt-4
Correct:  4, Incorrect:  8, Error:  0
Cache Hit     : 66
Provider Call : 0
Saved dolar amout via cache : 0.3 $
Charged dolar amount        : 0.0 $

              openai | gpt-4-turbo-preview
Correct:  8, Incorrect:  4, Error:  0
Cache Hit     : 48
Provider Call : 0
Saved dolar amout via cache : 0.14 $
Charged dolar amount        : 0.0 $

TOTAL Cache Hit     : 178
TOTAL Provider Call : 0
TOTAL Saved dolar amout via cache : 0.54 $
TOTAL Charged dolar amount        : 0.0 $


In [None]:
# Repeat the full multi-model run and dump the complete usage summary.
available_models = px.models.generate_text(verbose=True)
all_results = run_test(available_models, 3)
summary_data = px.get_summary(json=True)
pprint(summary_data)