In [2]:
import openai
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm
import pandas as pd 
from dotenv import load_dotenv
import os

load_dotenv()

import pathlib
import textwrap

import google.generativeai as genai


In [3]:
# Loading the dataset
dataset  = pd.read_csv('math_operations.csv')

In [7]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')



In [70]:
"""ChatGPT completion"""
import os
def chatgpt_completion(prompt_text):
    genai.configure(api_key=GOOGLE_API_KEY)
    prompt = "GIVE ME ONLY THE RESULT " + prompt_text
    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(prompt)
    return response.text


def evaluate(results, answers):
    accs = []
    for result, answer in zip(results, answers):
        try:
            _ = float(result)
            # print(float(result), float(answer))
            if float(result) == float(answer):
                accs.append(1)
            else:
                accs.append(0)
        except:
            accs.append(0)

    print("The scores on the test set: ", sum(accs)/len(accs)) # around 38% - 50%
    return 

In [44]:
chatgpt_completion("242345 + 51233")

'293578'

In [46]:
def evaluate_single(result, answer):
    try:
        _ = float(result)
        # print(float(result), float(answer))
        if float(result) == float(answer):
            return 1
        else:
            return 0
    except:
        return 0

In [47]:
import time

# time to restrict query speed
class SpeedLimitTimer:
    def __init__(self, second_per_step=3.1):
        self.record_time = time.time()
        self.second_per_step = second_per_step

    def step(self):
        time_div = time.time() - self.record_time
        if time_div <= self.second_per_step:
            time.sleep(self.second_per_step - time_div)
        self.record_time = time.time()

    def sleep(self, s):
        time.sleep(s)


timer = SpeedLimitTimer(second_per_step=3.1) 

In [51]:
def parse_zeroshot_chatgpt_output(chatgpt_output):
    # Follow Toolformer 
    # https://arxiv.org/abs/2302.04761
    # Zero-Shot Math Reasoning with ChatGPT     
    # Check for the first number predicted by the model. 
    # An exception to this is if the model’s prediction contains an equation (e.g., “The correct answer is 5+3=8”), in which case we consider the first number after the “=” sign to be its prediction.
    def is_number(s):
        try:
            float(s)
            return True
        except ValueError:
            return False
    if "=" in chatgpt_output:
        chatgpt_output = chatgpt_output.split("=")[1]

    words = chatgpt_output.split()
    for word in words:
        if is_number(word):
            return word
    return float("inf")  # Return inf if no number word is found

data_list = []
def zero_shot_prompt_chatgpt(test_dataset):
    # data_list = []
    for idx, data in tqdm(test_dataset.iterrows()):
        context = "GIVE ME ONLY THE RESULT IN ONE WORD"  
        question = data["Operation"] 
        answer = data["Result"]
        operator = data["Operator"] 
        num_Digits = data["Num_Digits"]

        example_prompt =  context + " " + question 
        try:
            chatgpt_output = chatgpt_completion(example_prompt)
            result = parse_zeroshot_chatgpt_output(chatgpt_output)
        except Exception as e:
            print(f"An error occurred: {e}")
            result = None

        correct = evaluate_single(result, answer)

        data_list.append({
            'Question': question,
            'Answer': answer,
            'Operator': operator,
            'Num_Digits': num_Digits,
            'Predicted': result,
            'Correct': correct
        })
        timer.step()

    df = pd.DataFrame(data_list)
    return df

In [61]:
data_list

[{'Question': 'give me the addition of 6772 and 2099',
  'Answer': 8871.0,
  'Operator': 'addition',
  'Num_Digits': 'Five digits',
  'Predicted': '8871',
  'Correct': 1},
 {'Question': 'what is 3368 - 8917',
  'Answer': -5549.0,
  'Operator': 'subtraction',
  'Num_Digits': 'Four digits',
  'Predicted': '-5549',
  'Correct': 1},
 {'Question': 'give me the quotient of 6509 and 4865',
  'Answer': 1.33792394655704,
  'Operator': 'division',
  'Num_Digits': 'Five digits',
  'Predicted': '1',
  'Correct': 0},
 {'Question': 'calculate 6322 - 7087',
  'Answer': -765.0,
  'Operator': 'subtraction',
  'Num_Digits': 'Three digits',
  'Predicted': '-765',
  'Correct': 1},
 {'Question': '2532 times 2501',
  'Answer': 6332532.0,
  'Operator': 'multiplication',
  'Num_Digits': 'Three digits',
  'Predicted': inf,
  'Correct': 0},
 {'Question': 'what is 8441 divided by 3744',
  'Answer': 2.2545405982905984,
  'Operator': 'division',
  'Num_Digits': 'Five digits',
  'Predicted': '2',
  'Correct': 0},
 

In [63]:
results3 = pd.DataFrame(data_list)
results3.to_csv("results3.csv", index=False)

In [53]:
zeroshot_results = zero_shot_prompt_chatgpt(dataset[0:500]) # 47 # gpt-3.5-turbo-0125
# evaluate(zeroshot_results, answers)  # This should give you scores 38-50%

184it [20:22,  6.95s/it]

An error occurred: The `response.text` quick accessor only works when the response contains a valid `Part`, but none was returned. Check the `candidate.safety_ratings` to see if the response was blocked.


442it [49:42,  5.96s/it]

An error occurred: 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting


500it [56:10,  6.74s/it]


In [60]:
zeroshot_results = zero_shot_prompt_chatgpt(dataset[1501:3500]) # 47 # gpt-3.5-turbo-0125


1193it [2:16:03,  6.12s/it]

An error occurred: The `response.text` quick accessor only works when the response contains a valid `Part`, but none was returned. Check the `candidate.safety_ratings` to see if the response was blocked.


1549it [2:57:06,  6.86s/it]


KeyboardInterrupt: 

In [None]:
zeroshot_results

Unnamed: 0,Question,Answer,Operator,Num_Digits,Predicted,Correct
0,give me the addition of 6772 and 2099,8.871000e+03,addition,Five digits,8871,1
1,what is 3368 - 8917,-5.549000e+03,subtraction,Four digits,-5549,1
2,give me the quotient of 6509 and 4865,1.337924e+00,division,Five digits,1,0
3,calculate 6322 - 7087,-7.650000e+02,subtraction,Three digits,-765,1
4,2532 times 2501,6.332532e+06,multiplication,Three digits,inf,0
...,...,...,...,...,...,...
1494,calculate 2065 * 8100,1.672650e+07,multiplication,Four digits,inf,0
1495,give me the product of 8015 and 6392,5.123188e+07,multiplication,Five digits,51168960,0
1496,calculate 9351 / 1231,7.596263e+00,division,Four digits,7.6,0
1497,9925 + 3141,1.306600e+04,addition,Three digits,13066,1


In [None]:
zeroshot_results.to_csv('zeroshot_results_gemini3.csv', index=False)

In [None]:
evaluate(zeroshot_results['Predicted'], 
         zeroshot_results['Answer'])

8871.0 8871.0
-5549.0 -5549.0
1.34 1.33792394655704
-764.0 -765.0
6335832.0 6332532.0
2.25 2.2545405982905984
13066.0 13066.0
14130.0 14130.0
-5073.0 -5073.0
15495.0 15495.0
The scores on the test set:  0.6


In [42]:
zeroshot_results['Answer'][1] == float(zeroshot_results['Predicted'][1])

True

In [31]:
zeroshot_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Question    10 non-null     object 
 1   Answer      10 non-null     float64
 2   Operator    10 non-null     object 
 3   Num_Digits  10 non-null     object 
 4   Predicted   10 non-null     object 
dtypes: float64(1), object(4)
memory usage: 532.0+ bytes


In [13]:
chatgpt_completion("GIVE ME ONLY THE RESULT IN ONE WORD" + dataset.loc[0]['Operation'])

'8871'

In [67]:
len(data_list)

3048

In [68]:
results3 = pd.DataFrame(data_list)

In [71]:
# getting accuracy metrics
evaluate(results3['Predicted'], 
         results3['Answer'])

The scores on the test set:  0.45898950131233596


In [72]:
zeroshot_results = results3.copy()

In [73]:
 # getting subset of results where number of digits is 3
zeroshot_results_3 = zeroshot_results[zeroshot_results['Num_Digits'] == "Three digits"]

# getting subset of results where number of digits is 4
zeroshot_results_4 = zeroshot_results[zeroshot_results['Num_Digits'] == "Four digits"]

# getting subset of results where number of digits is 5
zeroshot_results_5 = zeroshot_results[zeroshot_results['Num_Digits'] == "Five digits"]


In [74]:
# evaluation of the results for number of digits 3
print("Three digits result", zeroshot_results_3.Correct.sum()/len(zeroshot_results_3))

# evaluation of the results for number of digits 4
print("Four digits result", zeroshot_results_4.Correct.sum()/len(zeroshot_results_4))

# evaluation of the results for number of digits 5
print("Five digits result", zeroshot_results_5.Correct.sum()/len(zeroshot_results_5))

Three digits result 0.4662813102119461
Four digits result 0.465832531280077
Five digits result 0.44387229660144184


In [75]:
# getting subset of results where operator is addition
zeroshot_results_add = zeroshot_results[zeroshot_results['Operator'] == "addition"]

# getting subset of results where operator is subtraction
zeroshot_results_sub = zeroshot_results[zeroshot_results['Operator'] == "subtraction"]

# getting subset of results where operator is multiplication
zeroshot_results_mul = zeroshot_results[zeroshot_results['Operator'] == "multiplication"]

# getting subset of results where operator is division
zeroshot_results_div = zeroshot_results[zeroshot_results['Operator'] == "division"]

In [76]:
# evaluation of the results for addition
print("Addition result", zeroshot_results_add.Correct.sum()/len(zeroshot_results_add))

# evaluation of the results for subtraction
print("Subtraction result", zeroshot_results_sub.Correct.sum()/len(zeroshot_results_sub))

# evaluation of the results for multiplication
print("Multiplication result", zeroshot_results_mul.Correct.sum()/len(zeroshot_results_mul))

# evaluation of the results for division
print("Division result", zeroshot_results_div.Correct.sum()/len(zeroshot_results_div))

Addition result 0.974293059125964
Subtraction result 0.8403693931398417
Multiplication result 0.005369127516778523
Division result 0.0


In [77]:
# round off the answer to 2 decimal places
zeroshot_results['Answer'] = zeroshot_results['Answer'].apply(lambda x: round(float(x), 2))

In [78]:
zeroshot_results_div2 = zeroshot_results_div.copy()
zeroshot_results_div2['Correct'] = zeroshot_results_div2.apply(lambda x: 1 if round(float(x['Predicted']), 2) == round(float(x['Answer']), 2) else 0, axis=1)
zeroshot_results_div2['Correct'].sum()/len(zeroshot_results_div)

0.5045632333767927

In [79]:
zeroshot_results_div2['Correct'] = zeroshot_results_div2.apply(lambda x: 1 if round(float(x['Predicted']), 3) == round(float(x['Answer']), 3) else 0, axis=1)
zeroshot_results_div2['Correct'].sum()/len(zeroshot_results_div)

0.22816166883963493

In [80]:
zeroshot_results_div2['Correct'] = zeroshot_results_div2.apply(lambda x: 1 if round(float(x['Predicted']), 4) == round(float(x['Answer']), 4) else 0, axis=1)
zeroshot_results_div2['Correct'].sum()/len(zeroshot_results_div)

0.03911342894393742

In [81]:
zeroshot_results_div2 = zeroshot_results_div.copy()
zeroshot_results_div2['Correct'] = zeroshot_results_div2.apply(lambda x: 1 if round(float(x['Predicted']), 1) == round(float(x['Answer']), 1) else 0, axis=1)
zeroshot_results_div2['Correct'].sum()/len(zeroshot_results_div)

0.6049543676662321