# How good is GPT-4 at math?

Even on small datasets (25 rows, 5 columns), GPT-4 gets roughly 3-4 out of 10 answers right.

This this notebook, we

1. Take a small sales dataset (city-wise product sales & growth data).
2. Ask arithmetic questions (e.g. "Which city had the highest sales of eggs")
3. See how often GPT-4 gets it right (to within 1% error).

In [1]:
import logging
import pandas as pd
import re
from openai import OpenAI
from io import StringIO

In [2]:
# Let's take this sample sales dataset.
data_text = '''
country	city	product	sales	growth
India	Hyderabad	Biscuit	866.1	-0.27
India	Hyderabad	Chips	26.4	-0.242
India	Hyderabad	Milk	38.3	-0.291
India	Hyderabad	Eggs	513.7	-0.113
India	Bangalore	Biscuit	41.9	-0.402
India	Bangalore	Chips	52.2	0.064
India	Bangalore	Milk	17.8	-0.052
India	Bangalore	Eggs	178.9	-0.261
India	Coimbatore	Biscuit	217.4	0.114
India	Coimbatore	Chips
India	Coimbatore	Milk	94.4	-0.288
India	Coimbatore	Eggs	72.8	-0.066
Singapore	Singapore	Biscuit	671	-0.014
Singapore	Singapore	Chips	560.2	-0.197
Singapore	Singapore	Milk	237.9	0.194
Singapore	Singapore	Eggs	719	0.118
USA	South Plainfield	Biscuit	18.3	-0.154
USA	South Plainfield	Chips	41.6	0.043
USA	South Plainfield	Milk	32.4	0.068
USA	South Plainfield	Eggs	12.5	0.084
USA	Newport Beach	Biscuit	1352.4	0.384
USA	Newport Beach	Chips	190.2	0.119
USA	Newport Beach	Milk	148.2	0.053
USA	Newport Beach	Eggs
'''
df = pd.read_csv(StringIO(data_text.strip('\n')), delimiter='\t')

In [3]:
# Use the latest model as of Jan 2024.
model = 'gpt-4-1106-preview'

# Use emotional prompting to improve the answer.
system_prompt = '''
You are an expert analyst.
Answer the question based on given data only.
JUST print the answer, NOTHING else.
$25 tip for the right answer.
My career depends on it.
'''

In [4]:
client = OpenAI()

def check(question, answer):
    '''
    If answer is a string, return if the OpenAI response to the question contains answer.
    If answer is a number, return % difference between OpenAI response and answer.
    '''
    completion = client.chat.completions.create(
      model=model,
      max_tokens=20,
      messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{question}\n{data_text}"}
      ]
    )
    response = completion.choices[0].message.content
    logging.info(response)
    if isinstance(answer, str):
        return response, answer, answer in response
    else:
        clean_response = re.sub(r'[^\d\.\+\-]', '', response)
        try:
            response = float(clean_response)
        except ValueError:
            return response, answer, False
        return response, answer, abs(float(response) / answer - 1)

In [5]:
# Let's evaluate commonly asked questions along with the correct answer.
qa = [
  ('What is the total sales of all products in India?', df[df['country'] == 'India']['sales'].sum()),
  ('What is the average growth rate of Biscuit sales across all countries?', df[df['product'] == 'Biscuit']['growth'].mean()),
  ('Which city has the highest total sales for Eggs?', df[df['product'] == 'Eggs'].groupby('city')['sales'].sum().idxmax()),
  ('What is the total number of cities included in the dataset?', df['city'].nunique()),
  ('What is the lowest growth rate observed for Milk sales?', df[df['product'] == 'Milk']['growth'].min()),
  ('Which city had the highest Milk growth rate?', df[df['product'] == 'Milk'][['city', 'growth']].sort_values(by='growth', ascending=False).iloc[0]['city']),
  ('How many products experienced a negative growth rate in Singapore?', df[(df['country'] == 'Singapore') & (df['growth'] < 0)]['product'].count()),
  ('What is the average sales value for Chips across all countries?', df[df['product'] == 'Chips']['sales'].mean()),
  ('Which product has the highest average sales in USA?', df[df['country'] == 'USA'].groupby('product')['sales'].mean().idxmax()),
  ('What is the total growth percentage for all products in Bangalore?', df[df['city'] == 'Bangalore']['growth'].sum()),
  ('How many different products are listed in the dataset?', df['product'].nunique()),
  ('Which city in the dataset has the overall lowest sales figures?', df.groupby('city')['sales'].sum().idxmin()),
  ('Which single product recorded the highest sales figure?', df.loc[df['sales'].idxmax(), 'product']),
  ('What is the average growth rate of all products in USA?', df[df['country'] == 'USA']['growth'].mean()),
  ('How many products in Coimbatore experienced a growth rate?', df[(df['city'] == 'Coimbatore') & (df['growth'].notnull())]['product'].count()),
  ('Which product has the most consistent growth rate across all cities?', df.groupby('product')['growth'].std().idxmin()),
  ('What is the total number of cities with negative growth rate in Milk sales?', df[(df['product'] == 'Milk') & (df['growth'] < 0)]['city'].nunique()),
  ('Which country has the highest average growth rate across all products?', df.groupby('country')['growth'].mean().idxmax()),
  ('What is the total sales figure for Chips in cities where its growth rate is positive?', df[(df['product'] == 'Chips') & (df['growth'] > 0)]['sales'].sum()),
  ('How many products in Hyderabad have a growth rate below -10%?', df[(df['city'] == 'Hyderabad') & (df['growth'] < -0.1)]['product'].count()),
  ('What is the average sales of all products in Singapore?', df[df['country'] == 'Singapore']['sales'].mean()),
]

In [6]:
# Compare the answer from OpenAI with the actual answer.
# Iterate multiple times to get a better sense of the model's performance.
result = []
for iteration in range(2):
    for question, answer in qa:
        result.append((iteration, question) + check(question, answer))

result = pd.DataFrame(result, columns=['iteration', 'question', 'response', 'answer', 'correct'])
result

Unnamed: 0,iteration,question,response,answer,correct
0,0,What is the total sales of all products in India?,2046.3,2119.9,0.034719
1,0,What is the average growth rate of Biscuit sal...,0.0102,-0.057,1.178947
2,0,Which city has the highest total sales for Eggs?,Singapore,Singapore,True
3,0,What is the total number of cities included in...,6.0,6,0.0
4,0,What is the lowest growth rate observed for Mi...,-0.291,-0.291,0.0
5,0,Which city had the highest Milk growth rate?,Singapore,Singapore,True
6,0,How many products experienced a negative growt...,2.0,2,0.0
7,0,What is the average sales value for Chips acro...,204.6,174.12,0.175052
8,0,Which product has the highest average sales in...,Biscuit,Biscuit,True
9,0,What is the total growth percentage for all pr...,-0.651,-0.651,0.0


In [7]:
# What % of questions did GPT-4 get right to within 1%?
f"{((result['correct'] < 0.01) | (result['correct'] == False)).mean():.1%}"

'33.3%'