[![open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sarbol/NumericalReasoningModels/blob/main/ConFinQA_OpenAI_Qwen_Models.ipynb)

In [None]:
%%capture
%pip install --upgrade tiktoken -q
%pip install --upgrade openai -q

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import userdata

In [None]:
from openai import OpenAI

In [None]:
import os

In [None]:
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_KEY')

In [None]:
client = OpenAI()

In [None]:
from transformers import (pipeline,
                          RobertaTokenizer,
                          RobertaConfig,
                          AutoModelForCausalLM,
                          AutoTokenizer)
import tiktoken
import tqdm
import torch

In [None]:
import numpy as np
from sympy import simplify
import operator
import collections
from jinja2 import Template
import json
import os
import re
import glob
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Union
from IPython.display import HTML
from datetime import datetime
import time
import seaborn as sns
sns.set()
%matplotlib inline

In [None]:
DATA_PATH = "/content/drive/MyDrive/ConvFinQA/data/"

In [None]:
OUTPUT_PATH = "/content/drive/MyDrive/tomoro_ai/train_json/"

In [None]:
files = glob.glob(os.path.join("/content/drive/MyDrive/tomoro_ai/llama31", "*.txt"))

In [None]:
valid_ids = [
    int(os.path.basename(f).split("_")[-1][:-4])
    for f in files
]

In [None]:
with open(os.path.join(DATA_PATH, "train.json"), "r") as f:
    train_data = json.load(f)

# with open(os.path.join(DATA_PATH, "dev_turn.json"), "r") as f:
#     valid_data = json.load(f)

# with open(os.path.join(DATA_PATH, "test_turn_private.json"), "r") as f:
#     test_data = json.load(f)

In [None]:
train_data[2]["qa"]

{'question': 'what was the percentage change in net sales from 2000 to 2001?',
 'answer': '-32%',
 'explanation': '',
 'ann_table_rows': [1],
 'ann_text_rows': [],
 'steps': [{'op': 'minus1-1', 'arg1': '5363', 'arg2': '7983', 'res': '-2620'},
  {'op': 'divide1-2', 'arg1': '#0', 'arg2': '7983', 'res': '-32%'}],
 'program': 'subtract(5363, 7983), divide(#0, 7983)',
 'gold_inds': {'table_1': 'the net sales of 2002 is $ 5742 ; the net sales of 2001 is $ 5363 ; the net sales of 2000 is $ 7983 ;'},
 'exe_ans': -0.3282,
 'program_re': 'divide(subtract(5363, 7983), 7983)'}

In [None]:
entry = {}
for i, question in enumerate(train_data):
  for key in ["qa", "qa_0", "qa_1"]:
    if w:=question.get(key):
      entry[str(i)] = {
          "question": w.get("question"),
          "answer": w.get("answer"),
          "program": w.get("program"),
          "exe_ans": w.get("exe_ans"),
          "pre_text": question.get("pre_text"),
          "post_text": question.get("post_text"),
          "table": question.get("table")
      }

In [None]:
len(entry), len(train_data)

(3037, 3037)

In [None]:
answer_formats = [entry.get(k).get("answer") for k in entry.keys()]

In [None]:
answer_formats[:10]

['14.1%',
 '1.3%',
 '-32%',
 '-26.16%',
 '-26.16%',
 '70.1%',
 '15.6%',
 '16%',
 '22.99%',
 '12']

In [None]:
percentage_formats_idx = [
    i for i, a in enumerate(answer_formats) if bool(re.match(r"^-?\d+(\.\d+)?%$", a))
]

In [None]:
[answer_formats[i] for i in percentage_formats_idx[:10]]

['14.1%',
 '1.3%',
 '-32%',
 '-26.16%',
 '-26.16%',
 '70.1%',
 '15.6%',
 '16%',
 '22.99%',
 '2.4%']

In [None]:
answer_formats[percentage_formats_idx[10]]

'56.6%'

In [None]:
divide_idx = []
count = 0
for idx in percentage_formats_idx:
  prog = entry.get(str(idx)).get("program")
  last_operation = prog.split("), ")[-1].strip()
  if "divide" in last_operation:
    count += 1
    divide_idx.append(idx)
print(count)

1874


In [None]:
for idx in divide_idx[:10]:
  prog = entry.get(f"{idx}").get("program")
  exe_ans = entry.get(f"{idx}").get("exe_ans")
  answer = entry.get(f"{idx}").get("answer")
  print(f"program: {prog}")
  print(f"exe_ans: {exe_ans}")
  print(f"answer: {answer}\n\n")

program: subtract(206588, 181001), divide(#0, 181001)
exe_ans: 0.14136
answer: 14.1%


program: subtract(9362.2, 9244.9), divide(#0, 9244.9)
exe_ans: 0.01269
answer: 1.3%


program: subtract(5363, 7983), divide(#0, 7983)
exe_ans: -0.3282
answer: -32%


program: add(2530454, 5923147), divide(5923147, #0)
exe_ans: 0.70067
answer: 70.1%


program: subtract(3.7, 3.2), divide(#0, 3.2)
exe_ans: 0.15625
answer: 15.6%


program: subtract(118, 102), divide(#0, 102)
exe_ans: 0.15686
answer: 16%


program: add(27729, 45161), divide(#0, 317105)
exe_ans: 0.22986
answer: 22.99%


program: subtract(498.8, 486.9), divide(#0, 486.9)
exe_ans: 0.02444
answer: 2.4%


program: multiply(1.25, const_1000), divide(707, #0)
exe_ans: 0.5656
answer: 56.6%


program: divide(1697, 16088)
exe_ans: 0.10548
answer: 11%




In [None]:
mul_idx = []
count = 0
for idx in percentage_formats_idx:
  prog = entry.get(str(idx)).get("program")
  last_operation = prog.split("), ")[-1].strip()
  if "multiply" in last_operation:
    count += 1
    mul_idx.append(idx)
print(count)

73


In [None]:
for idx in mul_idx[:10]:
  prog = entry.get(f"{idx}").get("program")
  exe_ans = entry.get(f"{idx}").get("exe_ans")
  answer = entry.get(f"{idx}").get("answer")
  print(f"program: {prog}")
  print(f"exe_ans: {exe_ans}")
  print(f"answer: {answer}\n\n")

program: divide(98750, 432000), multiply(#0, const_100)
exe_ans: 22.8588
answer: 22.86%


program: add(1610.3, 1612.9), add(1762.3, 1722.2), divide(#0, #1), multiply(#2, const_100)
exe_ans: 92.50108
answer: 92.5%


program: divide(1881992, 5148881), multiply(#0, const_100)
exe_ans: 36.55148
answer: 36.55%


program: multiply(11.4, const_1000000), multiply(3158226, #0)
exe_ans: 36003776400000.0
answer: 28%


program: add(701, 161), divide(#0, 689), multiply(#1, const_100)
exe_ans: 125.10885
answer: 125.1%


program: divide(687376, 600883), multiply(const_100, #0)
exe_ans: 114.39432
answer: 114%


program: subtract(317, 290), divide(#0, 317), multiply(#1, const_100)
exe_ans: 8.51735
answer: 8.52%


program: subtract(407, 258), divide(#0, 407), multiply(#1, const_100)
exe_ans: 36.60934
answer: 36.61%


program: subtract(22.0, 20.9), divide(#0, 20.9), multiply(#1, const_100)
exe_ans: 5.26316
answer: 5.26%


program: multiply(2, 7)
exe_ans: 14.0
answer: 28.6%




In [None]:
entry.get("50").get("exe_ans"), entry.get("50").get("answer")

(22.8588, '22.86%')

In [None]:
programs = [
    entry.get(k).get("program") for k in entry.keys()
]

In [None]:
count_1000 = []
for idx, prog in enumerate(programs):
  match = re.findall(r"const_\d+", prog)
  if "const_1000" in match:
    count_1000.append(idx)

In [None]:
count_1000[:10]

[12, 23, 49, 55, 77, 105, 122, 163, 164, 173]

In [None]:
import re

def generate_regex_pattern(element):
    """
    Dynamically generates a regex pattern by generalizing numbers and special formats.
    """
    pattern = re.sub(r'[-+]?\d*\.?\d+', r'\\d+(\\.\\d+)?', element)  # Replace numbers with \d+(\.\d+)?
    return pattern

def filter_unique_patterns(data):
    seen_patterns = set()
    unique_list = []

    for item in data:
        pattern = generate_regex_pattern(item)
        if pattern not in seen_patterns:
            seen_patterns.add(pattern)
            unique_list.append(item)

    return unique_list

In [None]:
uniques = filter_unique_patterns(answer_formats)

In [None]:
uniques

['14.1%',
 '12',
 '$ 110774.5 million',
 'no',
 'yes',
 '8-14',
 '',
 'increased 38.6%',
 '$ 12584',
 'the cost of sales decreased 29333 from 2010 to 2012',
 'the total reduction to cost of sales would be $ 4951',
 '265% increase',
 '$ 35411 or 9.8% increase',
 '17019.5 million',
 '$ 15100 thousand',
 '$ 8.3 < interest income < $ 41.1 . or the interest income would be between $ 8.3 million and $ 41.1 million',
 'yes , 38.2%',
 '1.33:1',
 '( 98.9 )',
 '\\\\n0.5%',
 '[22] : we did not issue debt in 2005 versus $ 745 million of debt issuances in 2004 , and we repaid $ 699 million of debt in 2005 compared to $ 588 million in 2004.\\\\n\\\\n',
 'global payments would have earned an 80.13% greater return than the overall information technology sector .',
 '$ 64708890.11 \\\\n',
 'advance auto parts had a 62.63% greater return than the overall market',
 '1.91% and 3.46% , respectively',
 '56.4%\\\\n',
 '( $ 5.83 )',
 '33.9% decrease',
 'increased $ 24641 thousand',
 '$ 20630 decrease in cost 

In [None]:
entry["45"].keys()

dict_keys(['question', 'answer', 'program', 'exe_ans', 'pre_text', 'post_text', 'table'])

In [None]:
def table_to_html(data: list) -> str:
  html = "<table class='wikitable'>"

  for i, row in enumerate(data, start=1):
    html += "<tr><td>{}</td>".format(i) + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>"
  html += "</table>"

  return html

In [None]:
def financial_report_html(entry: dict) -> str:
  pre_text = " ".join(entry.get("pre_text"))
  post_text = " ".join(entry.get("post_text"))
  table = entry.get("table")

  html = f"""
              <div>
                <p>{pre_text}</p>
              </div>
              <div>
                {table_to_html(table)}
              </div>
              <div>
                <p>{post_text}</p>
              </div>
  """

  return html

In [None]:
html = """

            <!DOCTYPE html>
          <html lang="en">
          <head>
              <meta charset="UTF-8">
              <meta name="viewport" content="width=device-width, initial-scale=1.0">
              <title>Financial Report</title>
          </head>
          <body>
              <h1>Report & Question</h1>

              <div id="report">
                  <h2>Report Section</h2>
                  <div>
                      {{ report_content | safe }}
                  </div>
              </div>

              <div id="chat">
                  <h2>Conversation</h2>
                  <hr>
                  <h3>Query</h3>
                  <p>{{ current_message }}</p>
              </div>
          </body>
          </html>

  """

In [None]:
def user_message_template(entry: Dict, html_template = html) -> str:
  template = Template(html_template)
  current_message = entry.get("question")
  report_content = financial_report_html(entry)
  html_output = template.render(current_message=current_message,
                                report_content = report_content)

  return html_output

In [None]:
HTML(user_message_template(entry["12"]))

0,1,2,3
1,,class a common stock,class b common stock
2,balance at december 31 2016,2014,2014
3,issue of shares on business combination at july 3 2017,427709,717111
4,issue of shares upon vesting of restricted stock units ( 1 ),290,2014
5,issue of shares on exercises of stock options ( 1 ),256,2014
6,stock repurchase program ( 2 ) ( 3 ),-6047 ( 6047 ),-10126 ( 10126 )
7,balance at december 31 2017,422208,706985


In [None]:
SYSTEM_PROMPT = """

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful Financial Analyst with strong quantitative analysis skill and keen eyes for details.
You provide accurate answers to users questions based on a financial report.

You would be provided with a financial report containing tables and textual information.
The report would serve as the context required to give accurate answers to users' queries.

The user would ask you a question based on the report.
Your ultimate goal is to breakdown complex numerical reasoning into simple programmatic steps.
You are to generate a list of action steps
Your response could be a single value from the report i.e `482` or muliple sequential and dependent steps `subtract(400, 210), divide(#0, const_100)`

An operation takes two values as arguments. These arguments are retrieved from the provided context (Financial Report Text and Table).


Reference Operations
- add: addition
- subtract: subtraction
- multiply: multiplication
- divide: division
- exp: power
- greater: maximum

You are Limited to these operations.

Reference Tags
#0 references the first operation
#1 references the second operation
#2 references the third operation
...

#n references the nth operation

Constants
- const_1,
- const_2,
- const_3,
- const_4,
- const_5,
- const_6,
- const_7,
- const_8,
- const_9,
- const_10,
- const_100,
- const_1000,
- const_10000,
- const_100000,
- const_1000000,
...

- const_1000000000
- const_m1

Constants are important when an operation needs to be formatted eg currency conversion, percentage, ratio, proportion etc.
The choice of constant is decided by the user query. Constants are always between 1 - 10 or multiple of 10 except constant_m1.
Note: const_m1 (-1) is for negative

Enclose information relevant to the user's question from the context using this tag
<relevant_context>
....
</relevant_context>

Enclose the list of operations to calculate the answer using this tag
<operation>
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)
</operation>

An operation could also be a single value.

<operation> 10 </operation>

Specify the format of the final answer using e.g (%) <format>%</format>


<|eot_id|>

"""

In [None]:
qwen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-14B",
    torch_dtype="auto",
    device_map="auto"
)

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.70G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B")

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
def qwen_response(id: str, entry: Dict, tokenizer = tokenizer, model = qwen_model) -> None:

  result_dir = os.path.join(OUTPUT_PATH, "qwen-14B")
  os.makedirs(result_dir, exist_ok=True)

  messages = [
      {
          "role": "system",
          "content": SYSTEM_PROMPT
          },
      {
          "role": "user",
          "content": user_message_template(entry)
          }
  ]

  if os.path.exists(os.path.join(result_dir, f"{id}.txt")):
    print(f"File {id}.txt Processed >> Skipping.\n")
    return

  with open(os.path.join(result_dir, f"{id}.txt"), "a") as f:
    f.write(f"<prompt>\n{SYSTEM_PROMPT}\n</prompt>\n")

  start = time.time()
  text = tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True
      )

  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
  generated_ids = model.generate(
      **model_inputs,
      max_new_tokens = 1024
      )

  generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
      ]
  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
  end = time.time()
  time_taken = end - start

  with open(os.path.join(result_dir, f"{id}.txt"), "a") as f:
    f.write(response)
    f.write(f"\n<time>{time_taken}</time>\n")

  print(f"Saved to {result_dir}/{id}.txt. Total Time <{time_taken}>\n")

In [None]:
for key, data in entry.items():
    qwen_response(key, data)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


File 0.txt Processed >> Skipping.

File 1.txt Processed >> Skipping.

File 2.txt Processed >> Skipping.

File 3.txt Processed >> Skipping.

File 4.txt Processed >> Skipping.

File 5.txt Processed >> Skipping.

File 6.txt Processed >> Skipping.

File 7.txt Processed >> Skipping.

File 8.txt Processed >> Skipping.

File 9.txt Processed >> Skipping.

File 10.txt Processed >> Skipping.

File 11.txt Processed >> Skipping.

File 12.txt Processed >> Skipping.

File 13.txt Processed >> Skipping.

File 14.txt Processed >> Skipping.

File 15.txt Processed >> Skipping.

File 16.txt Processed >> Skipping.

File 17.txt Processed >> Skipping.

File 18.txt Processed >> Skipping.

File 19.txt Processed >> Skipping.

File 20.txt Processed >> Skipping.

File 21.txt Processed >> Skipping.



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/22.txt. Total Time <5.259257793426514>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/23.txt. Total Time <64.27294063568115>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/24.txt. Total Time <4.467891693115234>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/25.txt. Total Time <51.185426473617554>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/26.txt. Total Time <3.4424850940704346>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/27.txt. Total Time <6.230879068374634>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/28.txt. Total Time <3.4945175647735596>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/29.txt. Total Time <6.05335259437561>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/30.txt. Total Time <3.505720853805542>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/31.txt. Total Time <6.234628200531006>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/32.txt. Total Time <5.259680986404419>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/33.txt. Total Time <3.5342657566070557>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/34.txt. Total Time <9.238641738891602>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/35.txt. Total Time <64.15658783912659>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/36.txt. Total Time <3.8740859031677246>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/37.txt. Total Time <3.735522508621216>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/38.txt. Total Time <3.397113561630249>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/39.txt. Total Time <63.91980338096619>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/40.txt. Total Time <5.144723415374756>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/41.txt. Total Time <3.5649807453155518>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/42.txt. Total Time <63.488747358322144>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/43.txt. Total Time <6.04520845413208>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/44.txt. Total Time <5.727129697799683>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/45.txt. Total Time <63.739636182785034>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/46.txt. Total Time <6.343661308288574>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/47.txt. Total Time <63.9474823474884>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/48.txt. Total Time <2.617553949356079>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/49.txt. Total Time <64.06568503379822>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/50.txt. Total Time <39.88019871711731>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/51.txt. Total Time <6.791382789611816>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/52.txt. Total Time <2.729259967803955>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/53.txt. Total Time <9.923104286193848>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/54.txt. Total Time <2.6646814346313477>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/55.txt. Total Time <4.962803840637207>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/56.txt. Total Time <6.89606785774231>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/57.txt. Total Time <44.929906129837036>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/58.txt. Total Time <7.396561145782471>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Saved to /content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/59.txt. Total Time <6.085942983627319>



In [None]:
messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT
    },
    {
        "role": "user",
        "content": user_message_template(entry["3"])
    }
]

In [None]:
completion = client.chat.completions.create(
    model="o3-mini",
    messages = messages
)

In [None]:
print(completion.choices[0].message.content)

<relevant_context>
For united parcel service inc. on 12/31/09, the cumulative return is $75.95. For the s&p 500 index on 12/31/09, the cumulative return is $102.11.
</relevant_context>

<operation>
subtract(102.11, 75.95)
</operation>

<format>%</format>


In [None]:
entry["3"].get("answer")

'-26.16%'

In [None]:
entry["3"].get("program")

'subtract(75.95, const_100), divide(#0, const_100), subtract(102.11, const_100), divide(#2, const_100), subtract(#1, #3)'

In [None]:
entry["3"].get("exe_ans")

-0.2616

In [None]:
past = time.time()

In [None]:
time.time() - past

11.048887968063354

In [None]:
def make_prediction(model: str, entry: Dict, id: int) -> str:
  result_dir = os.path.join(OUTPUT_PATH, f"{model}/")
  os.makedirs(result_dir, exist_ok=True)
  messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT
    },
    {
        "role": "user",
        "content": user_message_template(entry)
    }
  ]

  if os.path.exists(os.path.join(result_dir, f"{id}.txt")):
    print(f"File {id}.txt Processed >> Skipping.\n")
    return

  with open(os.path.join(result_dir, f"{id}.txt"), "a") as f:
    f.write(f"<prompt>\n{SYSTEM_PROMPT}\n</prompt>\n")

  start = time.time()
  completion = client.chat.completions.create(
      model = model,
      messages = messages
      )

  response = completion.choices[0].message.content
  end = time.time()
  time_taken = end - start

  with open(os.path.join(result_dir, f"{id}.txt"), "a") as f:
    f.write(response)
    f.write(f"\n<time>{time_taken}</time>\n")

  print(f"Saved to {result_dir}/{id}.txt. Total Time <{time_taken}>\n")

In [None]:
models = [
    "gpt-4o",
    "o3-mini"
]

In [None]:
for key, data in entry.items():
  for model in models:
    make_prediction(model, data, key)
  if key == "101":
    raise StopIteration

File 0.txt Processed >> Skipping.

File 0.txt Processed >> Skipping.

File 1.txt Processed >> Skipping.

File 1.txt Processed >> Skipping.

File 2.txt Processed >> Skipping.

File 2.txt Processed >> Skipping.

File 3.txt Processed >> Skipping.

File 3.txt Processed >> Skipping.

File 4.txt Processed >> Skipping.

File 4.txt Processed >> Skipping.

File 5.txt Processed >> Skipping.

File 5.txt Processed >> Skipping.

File 6.txt Processed >> Skipping.

File 6.txt Processed >> Skipping.

File 7.txt Processed >> Skipping.

File 7.txt Processed >> Skipping.

File 8.txt Processed >> Skipping.

File 8.txt Processed >> Skipping.

File 9.txt Processed >> Skipping.

File 9.txt Processed >> Skipping.

File 10.txt Processed >> Skipping.

File 10.txt Processed >> Skipping.

File 11.txt Processed >> Skipping.

File 11.txt Processed >> Skipping.

File 12.txt Processed >> Skipping.

File 12.txt Processed >> Skipping.

File 13.txt Processed >> Skipping.

File 13.txt Processed >> Skipping.

File 14.txt 

StopIteration: 

## PARSE RESULTS

In [None]:
qwen_results = "/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/"
gpt4_results = "/content/drive/MyDrive/tomoro_ai/train_json/gpt-4o/"
o3_mini_results = "/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/"

In [None]:
def read_file(file_path: str) -> str:
  with open(file_path, "r") as f:
    return f.read()

In [None]:
def parse_raw_output(output: str, mode = "accuracy"):
  pattern = r"\b[a-zA-Z_]+\([^()]*?(?:\([^()]*\)[^()]*?)*\)|\w+"
  matches = re.findall(pattern, output)
  return matches

In [None]:
def extract_operations(text):
    pattern = r"<operation>(.*?)</operation>"
    matches = re.findall(pattern, text, flags=re.DOTALL)
    return matches if matches else None

In [None]:
def extract_format(text):
    pattern = r"<format>(.*?)</format>"
    matches = re.findall(pattern, text, flags=re.DOTALL)
    return matches if matches else None

In [None]:
def extract_prompt(text):
    pattern = r"<prompt>(.*?)</prompt>"
    matches = re.findall(pattern, text, flags=re.DOTALL)
    return matches if matches else None

### Qwen14B

In [None]:
multiple_extract = []
empty_extract = []
single_extract = []
for file in glob.glob(os.path.join(qwen_results, "*.txt")):
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  if matches and len(matches) > 1:
    multiple_extract.append(file)
  elif not matches:
    empty_extract.append(file)
  elif len(matches) == 1:
    single_extract.append(file)
  else:
    print(file)

In [None]:
len(multiple_extract), len(empty_extract), len(single_extract)

(26, 5, 71)

In [None]:
multiple_extract[:5]

['/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/5.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/9.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/11.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/12.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/19.txt']

> In some examples, Qwen14B Regurgidated the System prompt. There should be a a single `<operation>(\.+)</operation>` after the System Prompt that was written to file. The last operation would be selected for these cases



In [None]:
empty_extract[:]

['/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/21.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/23.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/49.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/77.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/101.txt']

In [None]:
txt = read_file(os.path.join(qwen_results, "49.txt"))

In [None]:
print(txt)

<prompt>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful Financial Analyst with strong quantitative analysis skill and keen eyes for details.
You provide accurate answers to users questions based on a financial report.

You would be provided with a financial report containing tables and textual information.
The report would serve as the context required to give accurate answers to users' queries.

The user would ask you a question based on the report.
Your ultimate goal is to breakdown complex numerical reasoning into simple programmatic steps.
You are to generate a list of action steps
Your response could be a single value from the report i.e `482` or muliple sequential and dependent steps `subtract(400, 210), divide(#0, const_100)`

An operation takes two values as arguments. These arguments are retrieved from the provided context (Financial Report Text and Table).


Reference Operations
- add: addition
- subtract: subtraction
- multiply: multiplicati

> In some examples, Qwen14B failed to produce a useful response.



In [None]:
qwen_progs = {}
for file in single_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  qwen_progs[id] = matches[0].strip()

In [None]:
len(qwen_progs)

71

> Extract the last operation for completions with multiple operation tags

In [None]:
len(multiple_extract)

26

In [None]:
pattern = r"^(?:\s*\w+\(\s*(-?\d*(\.\d+)?|const_\d+|#\d+)\s*,\s*(-?\d*(\.\d+)?|const_\d+|#\d+)\s*\)\s*,?)+$"

In [None]:
for file in multiple_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  matches.reverse()
  for match in matches:
    if bool(re.match(pattern, match.strip())):
      print(f"{name}: {match}\n\n")
      qwen_progs[id] = match.strip()
      break
    else:
      print(f"failed to extract: {name}\n\n")

failed to extract: 5.txt


5.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



failed to extract: 9.txt


9.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



failed to extract: 11.txt


11.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



failed to extract: 12.txt


12.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



19.txt: 
add(268496, 131262), add(#0, 195082)



25.txt: 
subtract(18161, 9889), divide(#0, 9889), multiply(#0, const_100)



failed to extract: 35.txt


35.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



failed to extract: 39.txt


39.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



failed to extract: 42.txt


42.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



failed to extract: 45.txt


45.txt: 
subtract(200, 100), divide(#0, const_100), subtract(1250, const_10)



50.txt: 
divid

In [None]:
len(qwen_progs)

97

In [None]:
empty_extract

['/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/21.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/23.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/49.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/77.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/qwen-14B/101.txt']

In [None]:
for file in empty_extract:
  name = os.path.basename(file)
  id = name[:-4]
  qwen_progs[id] = ""

In [None]:
formatted_response_count = 0
qwen_formats = {}
for file in single_extract + multiple_extract + empty_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_format(txt)
  if matches and len(matches) == 1:
    qwen_formats[id] = matches[0].strip()
    formatted_response_count += 1
  elif matches and len(matches) > 1:
    for match in matches:
      qwen_formats[id] = match.strip()
    formatted_response_count += 1
  else:
    qwen_formats[id] = ""

In [None]:
formatted_response_count

94

In [None]:
len(qwen_formats), len(qwen_progs)

(102, 102)

### GPT-4o

In [None]:
multiple_extract = []
zero_extract = []
single_extract = []
for file in glob.glob(os.path.join(gpt4_results, "*.txt")):
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  if matches and len(matches) > 1:
    multiple_extract.append(file)
  elif not matches:
    zero_extract.append(file)
  elif len(matches) == 1:
    single_extract.append(file)
  else:
    print(file)

In [None]:
len(multiple_extract), len(zero_extract), len(single_extract)

(2, 0, 100)

In [None]:
multiple_extract

['/content/drive/MyDrive/tomoro_ai/train_json/gpt-4o/95.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/gpt-4o/96.txt']

In [None]:
print(read_file(multiple_extract[1]))

<prompt>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful Financial Analyst with strong quantitative analysis skill and keen eyes for details.
You provide accurate answers to users questions based on a financial report.

You would be provided with a financial report containing tables and textual information.
The report would serve as the context required to give accurate answers to users' queries.

The user would ask you a question based on the report.
Your ultimate goal is to breakdown complex numerical reasoning into simple programmatic steps.
You are to generate a list of action steps
Your response could be a single value from the report i.e `482` or muliple sequential and dependent steps `subtract(400, 210), divide(#0, const_100)`

An operation takes two values as arguments. These arguments are retrieved from the provided context (Financial Report Text and Table).


Reference Operations
- add: addition
- subtract: subtraction
- multiply: multiplicati

In [None]:
gpt4_progs = {}
for file in single_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  gpt4_progs[id] = matches[0].strip()

In [None]:
gpt4_progs["11"]

'subtract(498.8, 486.9), divide(#0, 486.9), multiply(#1, const_100)'

> Patch Response 95 and 96

In [None]:
gpt4_progs["95"] = "subtract(11.3, 6.5)"
gpt4_progs["96"] = "subtract(239.0, 208.3), divide(#0, 208.3), multiply(#1, const_100)"

In [None]:
len(gpt4_progs)

102

In [None]:
formatted_response_count = 0
gpt4_formats = {}
for file in single_extract + multiple_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_format(txt)
  if matches and len(matches) == 1:
    gpt4_formats[id] = matches[0].strip()
    formatted_response_count += 1
  elif matches and len(matches) > 1:
    for match in matches:
      gpt4_formats[id] = match.strip()
    formatted_response_count += 1
  else:
    gpt4_formats[id] = ""

In [None]:
formatted_response_count

80

In [None]:
gpt4_formats["0"]

'%'

### O3-Mini

In [None]:
multiple_extract = []
zero_extract = []
single_extract = []
for file in glob.glob(os.path.join(o3_mini_results, "*.txt")):
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  if matches and len(matches) > 1:
    multiple_extract.append(file)
  elif not matches:
    zero_extract.append(file)
  elif len(matches) == 1:
    single_extract.append(file)
  else:
    print(file)

In [None]:
len(multiple_extract), len(zero_extract), len(single_extract)

(0, 2, 100)

In [None]:
single_extract[:4]

['/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/0.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/2.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/3.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/5.txt']

In [None]:
zero_extract

['/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/1.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/59.txt']

In [None]:
o3_progs = {}
for file in single_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_operations(txt)
  o3_progs[id] = matches[0].strip()

In [None]:
o3_progs["1"] = None
o3_progs["59"] = None

In [None]:
len(o3_progs)

102

In [None]:
zero_extract[:4]

['/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/1.txt',
 '/content/drive/MyDrive/tomoro_ai/train_json/o3-mini/59.txt']

In [None]:
formatted_response_count = 0
o3_formats = {}
for file in single_extract:
  name = os.path.basename(file)
  id = name[:-4]
  txt = read_file(file)
  txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
  matches = extract_format(txt)
  if matches and len(matches) == 1:
    o3_formats[id] = matches[0].strip()
    formatted_response_count += 1
  elif matches and len(matches) > 1:
    for match in matches:
      o3_formats[id] = match.strip()
    formatted_response_count += 1
  else:
    o3_formats[id] = ""

## PROGRAM EXTRACTION

In [None]:
qwen_response = {}
for key, value in qwen_progs.items():
  qwen_response[key] = {
      "program": value,
      "format": w if (w:=qwen_formats.get(key)) else None
  }

In [None]:
qwen_response["4"]

{'program': 'subtract(75.95, 102.11), divide(#0, const_100)', 'format': '%'}

In [None]:
gpt4_response = {}
for key, value in gpt4_progs.items():
  gpt4_response[key] = {
      "program": value,
      "format": w if (w:=gpt4_formats.get(key)) else None
  }

In [None]:
gpt4_response["3"]

{'program': 'subtract(75.95, 102.11), divide(#0, 102.11), multiply(#1, const_100)',
 'format': '%'}

In [None]:
o3_response = {}
for key, value in o3_progs.items():
  o3_response[key] = {
      "program": value,
      "format": w if (w:=o3_formats.get(key)) else None
  }

In [None]:
o3_response["3"]

{'program': 'subtract(102.11, 75.95)', 'format': '%'}

## EVALUATION

In [None]:
def get_decimal_places(number_str):
  """
  Checks the number of decimal places a number string is rounded to.

  Args:
      number_str: The number string to check.

  Returns:
      The number of decimal places, or None if the input is not a valid number
      or if the number does not have any decimal places.
  """
  if not isinstance(number_str, str):
    try:
      number_str = str(number_str)
    except:
      return None
  match = re.search(r"\.(\d+)", number_str)
  if match:
      return len(match.group(1))
  else:
    return None

In [None]:
def parse_value(value: str) -> Union[int, float]:
  """
  Parses a value, converting percentages, constants, and references appropriately.
  """
  value = value.strip()
  pat = re.compile(r"^const_(\d+)$")

  if "const_m1" == value:
      return -1
  elif bool(constant:=re.match(pat, value)):
      return float(constant.groups()[0])
  elif value.startswith("#"):
      return value  # Placeholder for reference resolution
  elif "%" in value:
      return float(value.replace("%", "")) / 100  # Convert percentage to decimal
  elif isinstance(value, str) and bool(re.match(r"^-?\d*\.?\d+$", value)):
      return float(value)
  else:
    return None


def evaluate_program_safe(program: List[str], round_places: int = 5) -> Union[int, float]:

  """
  Evaluates a list of operations with proper sanitization and reference resolution.
  """
  # Define supported operations
  OPERATIONS = {
      "add": operator.add,
      "subtract": operator.sub,
      "multiply": operator.mul,
      "divide": operator.truediv,
      "exp": operator.pow,
      "greater": max
      }

  results: Dict[int, Union[int, float]] = {}

  if len(program) == 1 and bool(re.match(r"^-?\d+(\.\d+)?$", program[0])):
    value = parse_value(program[0])
    return round(value, 5)


  for i, operation in enumerate(program):
    operation = operation.strip()
    # Extract operation name and arguments
    match = re.match(r"^(\w+)\(\s*([#\w%.-]+)\s*,\s*([#\w%.-]+)\s*\)$", operation)
    if not match:
      print(f"Invalid operation format: {operation}")
      return None
    func_name, arg1, arg2 = match.groups()

    # Resolve arguments
    resolved_args = []
    for arg in [arg1, arg2]:
      parsed_value = parse_value(arg)
      if isinstance(parsed_value, str) and parsed_value.startswith("#"):
        ref_idx = int(parsed_value[1:])
        if ref_idx not in results:
          return None
        resolved_args.append(results[ref_idx])  # Resolve reference
      elif parsed_value:
        resolved_args.append(parsed_value)
      else:
        print(f"Invalid argument: {arg}")
        return None

    # Perform operation
    if func_name not in OPERATIONS:
      return None
    results[i] = OPERATIONS[func_name](*resolved_args)
  return round(results[len(program) - 1], round_places)

In [None]:
def program_tokenization(original_program):
  original_program = original_program.split(', ')
  program = []
  for tok in original_program:
      cur_tok = ''
      for c in tok:
          if c == ')':
              if cur_tok != '':
                  program.append(cur_tok)
                  cur_tok = ''
          cur_tok += c
          if c in ['(', ')']:
              program.append(cur_tok)
              cur_tok = ''
      if cur_tok != '':
          program.append(cur_tok)
  program.append('EOF')
  return program

In [None]:
def str_prog_list(program: str,
                  pat: re.Pattern = re.compile(r"\w+\(.+$")) -> List[str]:
  prog_list = []
  for p in program.split("), "):
    if bool(pat.match(p)):
      p = p.strip() if p.strip().endswith(")") else p.strip() + ")"
      prog_list.append(p)
    else:
      prog_list.append(p)
  return prog_list

In [None]:
def prog_str_sequence(program: List[str]) -> str:
  return ", ".join([str(p) for p in program])

In [None]:
def evaluate_responses(model_response, ground_truth):
  score = 0

  for key in range(100):
      str_key = str(key)

      if str_key not in model_response:
          print(f"Key not found: {key}\n")
          continue

      try:
          prog = model_response[str_key].get("program")
          if not prog:
              print("Program not found")
              continue

          program = str_prog_list(prog)
          ori_res = ground_truth.get(str_key, {}).get("exe_ans")

          if ori_res is None:
              print(f"Original answer not found for key: {key}\n")
              continue

          places = get_decimal_places(ori_res)
          response = evaluate_program_safe(program, round_places=places)

          format_type = model_response[str_key].get("format")
          if format_type == "%":
              response = round(response * 0.01, places)

          print(f"Original Answer: {ori_res}\no3_pred: {response}\n")

          score += 1 if ori_res == response else 0

      except Exception as e:
          print(f"Error processing key {key}: {e}")

  return score

In [None]:
def equal_program(program1, program2):
  '''
  symbolic program if equal
  program1: gold
  program2: pred
  '''

  sym_map = {}

  program1 = program1[:-1]  # remove EOF

  ### single number program
  if len(program1) == 1:
      if len(program2) == 2 and program1[0] == program2[0]:
          return True
      else:
          return False

  program1 = "|".join(program1)
  steps = program1.split(")")[:-1]

  invalid_flag = 0
  sym_ind = 0
  step_dict_1 = {}

  # symbolic map
  for ind, step in enumerate(steps):

      step = step.strip()

      assert len(step.split("(")) <= 2

      op = step.split("(")[0].strip("|").strip()
      args = step.split("(")[1].strip("|").strip()

      arg1 = args.split("|")[0].strip()
      arg2 = args.split("|")[1].strip()

      step_dict_1[ind] = step

      if "table" in op:
          if step not in sym_map:
              sym_map[step] = "a" + str(sym_ind)
              sym_ind += 1

      else:
          if "#" not in arg1:
              if arg1 not in sym_map:
                  sym_map[arg1] = "a" + str(sym_ind)
                  sym_ind += 1

          if "#" not in arg2:
              if arg2 not in sym_map:
                  sym_map[arg2] = "a" + str(sym_ind)
                  sym_ind += 1

  # check program 2
  step_dict_2 = {}
  try:
      program2 = program2[:-1]  # remove EOF
      # check structure
      for ind, token in enumerate(program2):
          if ind % 4 == 0:
              if token.strip("(") not in all_ops:
                  # print("structure error")
                  return False
          if (ind + 1) % 4 == 0:
              if token != ")":
                  # print("structure error")
                  return False

      program2 = "|".join(program2)
      steps = program2.split(")")[:-1]

      for ind, step in enumerate(steps):
          step = step.strip()

          if len(step.split("(")) > 2:
              return False
          op = step.split("(")[0].strip("|").strip()
          args = step.split("(")[1].strip("|").strip()

          arg1 = args.split("|")[0].strip()
          arg2 = args.split("|")[1].strip()

          step_dict_2[ind] = step

          if "table" in op:
              if step not in sym_map:
                  return False

          else:
              if "#" not in arg1:
                  if arg1 not in sym_map:
                      return False
              else:
                  if int(arg1.strip("#")) >= ind:
                      return False

              if "#" not in arg2:
                  if arg2 not in sym_map:
                      return False
              else:
                  if int(arg2.strip("#")) >= ind:
                      return False
  except:
      return False

  def symbol_recur(step, step_dict):

      step = step.strip()
      op = step.split("(")[0].strip("|").strip()
      args = step.split("(")[1].strip("|").strip()

      arg1 = args.split("|")[0].strip()
      arg2 = args.split("|")[1].strip()

      if "table" in op:
          # as var
          return sym_map[step]

      if "#" in arg1:
          arg1_ind = int(arg1.replace("#", ""))
          arg1_part = symbol_recur(step_dict[arg1_ind], step_dict)
      else:
          arg1_part = sym_map[arg1]

      if "#" in arg2:
          arg2_ind = int(arg2.replace("#", ""))
          arg2_part = symbol_recur(step_dict[arg2_ind], step_dict)
      else:
          arg2_part = sym_map[arg2]

      if op == "add":
          return "( " + arg1_part + " + " + arg2_part + " )"
      elif op == "subtract":
          return "( " + arg1_part + " - " + arg2_part + " )"
      elif op == "multiply":
          return "( " + arg1_part + " * " + arg2_part + " )"
      elif op == "divide":
          return "( " + arg1_part + " / " + arg2_part + " )"
      elif op == "exp":
          return "( " + arg1_part + " ** " + arg2_part + " )"
      elif op == "greater":
          return "( " + arg1_part + " > " + arg2_part + " )"

  # # derive symbolic program 1
  steps = program1.split(")")[:-1]
  sym_prog1 = symbol_recur(steps[-1], step_dict_1)
  sym_prog1 = simplify(sym_prog1, evaluate=False)

  try:
      # derive symbolic program 2
      steps = program2.split(")")[:-1]
      sym_prog2 = symbol_recur(steps[-1], step_dict_2)
      sym_prog2 = simplify(sym_prog2, evaluate=False)
  except:
      return False

  # print(sym_prog1)
  # print(sym_prog2)
  return sym_prog1 == sym_prog2

In [None]:
o3_score = evaluate_responses(o3_response, entry)

Original Answer: 0.14136
o3_pred: 0.14136

Program not found
Original Answer: -0.3282
o3_pred: -0.3282

Original Answer: -0.2616
o3_pred: 0.2616

Original Answer: -0.2616
o3_pred: 0.2616

Original Answer: 0.70067
o3_pred: 0.29933

Original Answer: 0.15625
o3_pred: 0.15625

Original Answer: 0.15686
o3_pred: 0.15686

Original Answer: 0.22986
o3_pred: 0.22986

Original Answer: 12.0
o3_pred: 12.0

Original Answer: -19.0
o3_pred: -19.0

Original Answer: 0.02444
o3_pred: 0.02444

Original Answer: 0.5656
o3_pred: 0.5656

Original Answer: 0.10548
o3_pred: 0.10548

Original Answer: 1.5882
o3_pred: 1.5882

Invalid argument: C
Original Answer: 16.66667
o3_pred: None

Original Answer: 0.42457
o3_pred: 0.42457

Original Answer: 0.11247
o3_pred: 0.11247

Original Answer: 40.33333
o3_pred: 40.33333

Original Answer: 594840.0
o3_pred: 594840.0

Original Answer: 7.0
o3_pred: 7.0

Original Answer: 0.0228
o3_pred: 0.1515

Original Answer: -0.01204
o3_pred: -0.01204

Original Answer: 60.3
o3_pred: 60.3

O

In [None]:
print(f"O3-mini Execution Accuracy: {o3_score}%")

O3-mini Execution Accuracy: 68%


In [None]:
qwen_score = evaluate_responses(qwen_response, entry)

Original Answer: 0.14136
o3_pred: 0.14136

Original Answer: 0.01269
o3_pred: 0.01269

Original Answer: -0.3282
o3_pred: -0.3282

Original Answer: -0.2616
o3_pred: -0.0026

Original Answer: -0.2616
o3_pred: -0.0026

Original Answer: 0.70067
o3_pred: 12.4

Original Answer: 0.15625
o3_pred: 12.4

Original Answer: 0.15686
o3_pred: 0.15686

Original Answer: 0.22986
o3_pred: 0.00142

Original Answer: 12.0
o3_pred: 12.4

Original Answer: -19.0
o3_pred: 12.4

Original Answer: 0.02444
o3_pred: 12.4

Original Answer: 0.5656
o3_pred: 12.4

Original Answer: 0.10548
o3_pred: 1.4391

Original Answer: 1.5882
o3_pred: 0.0159

Original Answer: 16.66667
o3_pred: 1240.0

Original Answer: 0.42457
o3_pred: 0.00425

Original Answer: 0.11247
o3_pred: -1.987

Original Answer: 40.33333
o3_pred: 43.0

Original Answer: 594840.0
o3_pred: 594840.0

Original Answer: 7.0
o3_pred: 7.0

Program not found
Original Answer: -0.01204
o3_pred: 32782.0

Program not found
Original Answer: 0.6688
o3_pred: 0.0067

Original Ans

In [None]:
print(f"Qwen14B Execution Accuracy: {qwen_score}%")

Qwen14B Execution Accuracy: 23%


In [None]:
gpt4_score = evaluate_responses(gpt4_response, entry)

Original Answer: 0.14136
o3_pred: 0.14136

Original Answer: 0.01269
o3_pred: 0.01269

Invalid operation format: divide(#0, 7983
Error processing key 2: unsupported operand type(s) for *: 'NoneType' and 'float'
Original Answer: -0.2616
o3_pred: -0.2562

Original Answer: -0.2616
o3_pred: 0.0026

Original Answer: 0.70067
o3_pred: 0.29933

Invalid operation format: subtract(3700, 3200),
divide(#0, 3200),
multiply(#1, const_100)
Error processing key 6: unsupported operand type(s) for *: 'NoneType' and 'float'
Original Answer: 0.15686
o3_pred: 0.15686

Original Answer: 0.22986
o3_pred: 0.0023

Original Answer: 12.0
o3_pred: 12.0

Original Answer: -19.0
o3_pred: -19.0

Original Answer: 0.02444
o3_pred: 0.02444

Original Answer: 0.5656
o3_pred: 0.5656

Original Answer: 0.10548
o3_pred: 0.10548

Original Answer: 1.5882
o3_pred: 1.5882

Original Answer: 16.66667
o3_pred: 0.16667

Original Answer: 0.42457
o3_pred: 0.42457

Original Answer: 0.11247
o3_pred: 0.11247

Original Answer: 40.33333
o3_pr

In [None]:
print(f"GPT4 Execution Accuracy: {gpt4_score}%")

GPT4 Execution Accuracy: 60%


In [None]:
o3_response.get("0").get("program")

'subtract(206588, 181001), divide(#0, 181001), multiply(#1, const_100)'

In [None]:
entry.get("0").get("program")

'subtract(206588, 181001), divide(#0, 181001)'

In [None]:
equal_program(
    program_tokenization(o3_response.get("0").get("program")),
    program_tokenization(entry.get("0").get("program"))
)

False

In [None]:
prog_A = "multiply(const_100, 45.68), divide(#0, 259)"
prog_B = "divide(45.68, 259), multiply(#0, const_100)"

In [None]:
equal_program(
    program_tokenization(prog_A),
    program_tokenization(prog_B)
)

False

In [None]:
result_dir = "/content/drive/MyDrive/ConvFinQA/output/generator-roberta-large-2e-5-new-test_20250324083557/results/loads/20/valid"

In [None]:
with open(os.path.join(result_dir, "nbest_predictions.json"), "r") as f:
  nbest_predictions = json.load(f)

In [None]:
nbest_predictions["4"][0]["pred_prog"]

['subtract(', '60.94', '25.14', ')', 'divide(', '#0', '25.14', ')', 'EOF']

In [None]:
nbest_predictions["1"][0]["ref_answer"]

25.14

In [None]:
def _detokenize(tokens):
  stringify = " ".join(tokens[:-1])
  split_string = stringify.split(") ")
  return split_string

In [None]:
_detokenize(nbest_predictions["4"][0]["pred_prog"])

['subtract( 60.94 25.14 ', 'divide( #0 25.14 )']

In [None]:
nbest_predictions["4"][0]["pred_prog"]

['subtract(', '60.94', '25.14', ')', 'divide(', '#0', '25.14', ')', 'EOF']

In [None]:
prog = "subtract(60.65, const_100), multiply(-.005, -.456), add(34, 45), multiply(#0, 100)"

In [None]:
detokenize(program_tokenization(prog))

'subtract(60.65, const_100), multiply(-.005, -.456), add(34, 45), multiply(#0, 100)'

In [None]:
prog_1 = "subtract(60.65, const_349), 900, add(34, 45), multiply(#0, 100)"
prog_2 = "subtract(60.65, const_349), add(34, 45), multiply(#0, 100)"

In [None]:
# Example usage:
tokens = ['subtract(', '60.94', '25.14', ')', 'divide(', '#0', '25.14', ')', 'EOF']
output = detokenize(tokens)
print(output)  # Expected: "subtract(60.94, 25.14), divide(subtract(60.94, 25.14), 25.14)"

subtract(60.94, 25.14), divide(#0, 25.14)


In [None]:
HTML(user_message_template(entry["5"]))

0,1,2,3
1,,shares available for awards,shares subject to outstanding awards
2,2009 global incentive plan,2322450,2530454
3,2004 stock incentive plan,-,5923147


In [None]:
data = {}
for file in glob.glob(os.path.join(GPT_4, "*.txt")):
  if file not in [
      "/content/drive/MyDrive/tomoro_ai/gpt-4o/1194.txt",
      "/content/drive/MyDrive/tomoro_ai/gpt-4o/768.txt"
  ]:
    name = os.path.basename(file)
    id = name[:-4]
    txt = read_file(file)
    txt = txt.split("<prompt>")[-1].split("</prompt>")[-1]
    txt = txt.split("<operation>")[-1].split("</operation>")[0]
    try:
      js = json.loads(txt)
      data[id] = js
    except:
      if w:=parse_raw_output(txt):
        w = [i for i in w if bool(re.match(pt, i))]
        data[id] = w
      else:
        print(file)