# GSM8K

In [28]:
import json 
import unicodedata

def normalize(s: str):
    """Return ``s`` converted to Unicode NFC (canonical composed) form."""
    composed = unicodedata.normalize("NFC", s)
    return composed

# Load the GSM8K train and test splits into a single list. Every record keeps
# its original fields; "question" is NFC-normalized and "answer" is replaced by
# the numeric value that follows the last "####" marker.
data = []
for split_path in ("./data/GSM8K/train.jsonl", "./data/GSM8K/test.jsonl"):
    # `with` closes the handle even when a line fails to parse; the encoding is
    # explicit so the read does not depend on the platform default.
    with open(split_path, "r", encoding="utf-8") as f:
        for line in f:
            line = json.loads(line)
            line["question"] = normalize(line["question"])
            answer = normalize(line["answer"]).split("####")[-1]
            # Remove thousands separators ("1,234") before converting.
            line["answer"] = float(answer.replace(",", "").strip())
            data.append(line)

In [29]:
len(data)

8792

In [30]:
# Load the generated-code samples from both shards, normalizing the question
# text so it can be compared with the reference questions in `data`.
code_data = []
for shard_path in ("./data/GSM8K/gsm6k_code.jsonl", "./data/GSM8K/gsm3k_code.jsonl"):
    with open(shard_path, "r", encoding="utf-8") as f:
        for line in f:
            line = json.loads(line)
            line["question"] = normalize(line["question"])
            code_data.append(line)

# Index the reference answers by question text. setdefault keeps the first
# occurrence, which is what a linear scan with `break` would have returned.
answer_by_question = {}
for s2 in data:
    answer_by_question.setdefault(s2["question"], s2["answer"])

final_data = []
for s1 in code_data:
    final_data.append({
        "question": s1["question"],
        "instruction": s1["instruction"],
        "code": s1["code"],
        # None marks a sample whose question has no reference answer.
        "answer": answer_by_question.get(s1["question"]),
    })

# Report the positions of samples that could not be matched.
for i, s in enumerate(final_data):
    if s["answer"] is None:
        print(i)


In [31]:
len(final_data)

8929

In [32]:
print((final_data[0]["code"])[9:-3])


# Define the number of clips sold in April
clips_april = 48

# Divide the number of clips sold in April by 2 to calculate the number of clips sold in May
clips_may = clips_april / 2

# Add the number of clips sold in April and the number of clips sold in May to calculate the total number of clips sold in April and May
clips_total = clips_april + clips_may

# Print the total number of clips sold in April and May
print(clips_total)



In [14]:
from src.utils.python_executor import PythonExecutor
import re

# Non-greedy body: with a greedy `[\s\S]*` the match ran up to the LAST closing
# fence in the text, so a reply containing several fenced blocks produced one
# "snippet" that included the prose between them.
CODE_PATTERN = re.compile(r"```python([\s\S]*?)```")

def extract_code(s: str):
    """Return the body of the first ```python fenced block found in ``s``.

    The fences are not included; the newlines just inside them are kept.

    Raises:
        IndexError: if ``s`` contains no ```python fenced block.
    """
    return CODE_PATTERN.findall(s)[0]

# Shared executor used to run the extracted snippets; the filtering cell below
# reads index 0 of what `executor.apply(code)` returns and parses it as a float.
# NOTE(review): presumably `get_answer_from_stdout=True` makes that element the
# snippet's printed output — confirm against src.utils.python_executor.
executor = PythonExecutor(get_answer_from_stdout=True)

In [34]:
from tqdm import  tqdm
# Keep only the samples whose generated code runs and prints a number within
# 1e-5 of the reference answer.
final_data_1 = []
for s in tqdm(final_data):
    # Unmatched samples carry answer=None; subtracting None below would raise
    # TypeError outside the try block and abort the whole run.
    if s["answer"] is None:
        continue
    try:
        code = extract_code(s["code"])
        predictions = float(executor.apply(code)[0])
    except Exception:
        # No code block, failed execution, or non-numeric output: drop the
        # sample. `Exception` (not a bare except) still lets KeyboardInterrupt
        # stop this long-running loop.
        continue
    if abs(predictions - s["answer"]) < 1e-5:
        final_data_1.append(s)

 91%|█████████ | 8126/8929 [15:28<01:31,  8.74it/s]

In [None]:
len(final_data_1)

6648

In [72]:
# Persist the verified samples, one JSON object per line.
# Mode "w" instead of "a": re-running this cell must not append a second copy
# of every record. UTF-8 is explicit because ensure_ascii=False writes
# non-ASCII characters verbatim.
with open("./data/GSM8K/gsm8k_code.jsonl", "w", encoding="utf-8") as f:
    for s in final_data_1:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

In [73]:
final_data_1[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'instruction': 'STEP\nStep 1: The number of clips sold in April is 48.\nStep 2: Divide the number of clips sold in April by 2 to calculate the number of clips sold in May.\nStep 3: Add the number of clips sold in April and the number of clips sold in May to calculate the total number of clips sold in April and May.',
 'code': '```python\n# Define the number of clips sold in April\nclips_april = 48\n\n# Divide the number of clips sold in April by 2 to calculate the number of clips sold in May\nclips_may = clips_april / 2\n\n# Add the number of clips sold in April and the number of clips sold in May to calculate the total number of clips sold in April and May\nclips_total = clips_april + clips_may\n\n# Print the total number of clips sold in April and May\nprint(clips_total)\n```',
 'answer': 72.0}

In [76]:
sample

{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
 'instruction': 'STEP\nStep 1: The amount Weng earns per hour babysitting is $12.\nStep 2: Calculate how much Weng earns per minute babysitting by dividing the amount Weng earns per hour by 60. In this case, $12 divided by 60 equals $0.20 per minute.\nStep 3: Multiply the result in step 2 by 50 to find the amount of money Weng earns after 50 minutes of babysitting. In this case, $0.20 multiplied by 50 equals $10.',
 'code': '```python\n# Define the amount Weng earns per hour babysitting\nhourly_rate = 12\n\n# Calculate how much Weng earns per minute babysitting by dividing the hourly rate by 60\nminute_rate = hourly_rate / 60\n\n# Multiply the minute rate by 50 to find the amount of money Weng earns after 50 minutes of babysitting\nearnings = minute_rate * 50\n\n# Print the earnings\nprint(earnings)\n```',
 'answer': 10.0}

In [22]:
from tqdm import  tqdm
# Export the verified GSM8K samples as chat records: one user turn holding the
# problem and one assistant turn holding the step plan followed by the code.
# Mode "w" keeps the export idempotent across re-runs (append mode duplicated
# every record); UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/gsm8k_zalo/gsm8k_v1.jsonl", "w", encoding="utf-8") as f:
    for sample in tqdm(final_data_1):
        # Re-wrap the extracted snippet so any text outside the fence is dropped.
        code = f"```python\n{extract_code(sample['code'])}\n```"
        user_content = f"Solve the following problem: {sample['question']}"
        assistant_content = (
            "Let's break it down step by step first:\n"
            f"{sample['instruction']}\n"
            "Here's the Python code based on the plan above:\n"
            f"{code}"
        )
        s = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        f.write(json.dumps(s, ensure_ascii=False) + "\n")


100%|██████████| 6648/6648 [00:00<00:00, 27024.75it/s]


# ZALO

In [27]:
import pandas as pd
import json
import unicodedata

def normalize(s: str):
    """Convert ``s`` to its Unicode NFC (canonical composed) form."""
    nfc_text = unicodedata.normalize("NFC", s)
    return nfc_text

    
# Load the generated-code samples (one JSON object per line).
with open("./data/zalo/train/math_code_v2.jsonl", encoding="utf-8") as f:
    code_data = [json.loads(line.strip()) for line in f]

with open("./data/zalo/train/math_train.json", "r", encoding="utf-8") as f:
    original_data = json.loads(f.read())

# Build each original question's lookup key once instead of once per
# (sample, original) pair. The key is the normalized question without its last
# character.
# NOTE(review): presumably the last character is dropped to tolerate a
# differing trailing character — confirm.
original_keys = [
    (normalize(sample_1["question"].strip())[:-1], sample_1)
    for sample_1 in original_data["data"]
]

for sample in code_data:
    sample["answer"] = None
    question = normalize(sample["question"].strip())
    for question_key, sample_1 in original_keys:
        # An empty key is a substring of every string and would match all
        # samples, so it is skipped. There is no `break`: when several
        # originals match, the last one wins.
        if question_key and question_key in question:
            sample["answer"] = normalize(sample_1["answer"])

In [28]:
len(code_data)

1208

In [29]:
from tqdm import  tqdm
import re
from src.utils.python_executor import PythonExecutor


# Non-greedy body: a greedy `[\s\S]*` extends to the LAST closing fence, so a
# reply with more than one fenced block yielded a single "snippet" containing
# the prose in between.
CODE_PATTERN = re.compile(r"```python([\s\S]*?)```")

def extract_code(s: str):
    """Return the body of the first ```python fenced block found in ``s``.

    The fences are not included; the newlines just inside them are kept.

    Raises:
        IndexError: if ``s`` contains no ```python fenced block.
    """
    return CODE_PATTERN.findall(s)[0]

# Shared executor for the helpers in this cell: they read index 0 of
# `executor.apply(code)` as the snippet's output and compare index 1 with 'Done'.
# NOTE(review): presumably `get_answer_from_stdout=True` makes index 0 the
# printed output — confirm against src.utils.python_executor.
executor = PythonExecutor(get_answer_from_stdout=True)

def is_float(string: str):
    """Tell whether ``string`` can be parsed by ``float()``."""
    try:
        float(string)
    except ValueError:
        return False
    else:
        return True


# Captures the condition of `if` / `elif` lines shaped like `<name> == <expr>:`
# or `<name>==<expr>:`. Raw string: "\w" in a plain literal is an invalid escape
# sequence that newer Python versions warn about.
COMPARISON_PATTERN = re.compile(r"if ([\w]+==.*|[\w]+ ==.*):")


def fix_rounding_error(code: str):
    """Rewrite exact `==` comparisons in ``code`` as tolerance checks.

    The left-hand side of the first captured comparison is taken as the answer
    variable. The snippet is executed once with an extra ``print`` of that
    variable; if the printed value parses as a float, every captured comparison
    text is replaced (via ``str.replace``) with
    ``abs(<answer> - (<choice>)) < 1e-8``.

    Args:
        code: Python source of a generated solution.

    Returns:
        The rewritten source, or ``code`` unchanged when it has no matching
        comparison or the answer variable is not numeric.
    """
    comparisons = COMPARISON_PATTERN.findall(code)
    if len(comparisons) == 0:
        return code
    answer_variable = comparisons[0].split("==")[0].strip()
    answer_value = executor.apply(
        code + f"\nprint({answer_variable})"
    )[0]
    # The snippet may print lines of its own before the appended print, so only
    # the last output line is checked.
    # NOTE(review): harmless if the executor already returns just that line.
    output_lines = str(answer_value).strip().splitlines()
    last_line = output_lines[-1] if output_lines else ""
    if is_float(last_line):
        for comparison in comparisons:
            choice_variable = comparison.split("==")[-1].strip()
            # Parenthesize the right-hand side: without it `x == a + b` became
            # `abs(x - a + b)`, which is not the intended difference.
            str_to_replace = f"abs({answer_variable} - ({choice_variable})) < 1e-8"
            code = code.replace(comparison, str_to_replace)

    return code

def execute_python_code(code: str):
    """Run ``code`` and return ``(code, output)``.

    When the run reports 'Done' but prints something other than a single option
    letter A-E, the code is passed through ``fix_rounding_error`` and executed
    again; the possibly rewritten code and the second output are returned. An
    empty output on a 'Done' run yields the "Missing print function!!!!" marker.
    """
    output = executor.apply(code)
    # Any status other than 'Done' is returned untouched.
    if output[1] != 'Done':
        return code, output[0]
    if output[0] == '':
        return code, "Missing print function!!!!"
    if output[0].strip() in ("A", "B", "C", "D", "E"):
        return code, output[0]

    # No option letter was printed: retry with tolerant comparisons.
    code = fix_rounding_error(code)
    output = executor.apply(code)
    return code, output[0]

In [30]:
# Keep the samples whose code prints the option letter that starts the
# reference answer; record the positions of everything else in `drop_ids`.
code_data_1 = []
drop_ids = []
for i, s in enumerate(tqdm(code_data)):
    try:
        code = extract_code(s["code"])
    except Exception:
        # No fenced code block (or no usable "code" field). `Exception` rather
        # than a bare except so KeyboardInterrupt can still stop the loop.
        drop_ids.append(i)
        continue
    _, output = execute_python_code(code)
    expected = s["answer"]
    # `expected` is None when no original question matched, and could be an
    # empty string; indexing either would raise, so such samples are dropped.
    if expected and output.strip().lower() == expected[0].lower():
        code_data_1.append(s)
    else:
        drop_ids.append(i)

100%|██████████| 1208/1208 [03:12<00:00,  6.27it/s]


In [31]:
len(code_data_1)

785

In [33]:
# Persist the filtered samples, one JSON object per line.
# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False writes the
# Vietnamese text verbatim.
with open("./data/zalo/train/zalo_code_filtered.jsonl", "w", encoding="utf-8") as f:
    for s in code_data_1:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

In [6]:
code_data_1[0]

{'question': 'Một người bán hàng bỏ ra 80,000 đồng tiền vốn và bị lỗ 6%. Để tính số tiền lỗ ta phải tính?\nA. 80,000 : 6\nB. 80,000 x 6\nC. 80,000 : (6 x 100)\nD. (80,000 x 6) : 100\n',
 'instruction': 'STEP\nStep 1: The capital invested is 80,000 VND, and the loss percentage is 6%.\nStep 2: To calculate the loss amount, multiply the initial capital amount (80,000) by the loss percentage (6%).\nStep 3: Compare the calculated results with each answer to choose the correct answer.',
 'code': 'I can try to write a Python code based on the instructions you gave me. Here is what I came up with:\n\n```python\ninitial_capital = 80000\nloss_percentage = 6 / 100\n\nloss = initial_capital * loss_percentage\n\nanswers = {\n    "A": 80000 / 6,\n    "B": 80000 * 6,\n    "C": 80000 / (6 * 100),\n    "D": (80000 * 6) / 100\n}\nif loss == answers["A"]:\n    answer = "A"\nelif loss == answers["B"]:\n    answer = "B"\nelif loss == answers["C"]:\n    answer = "C"\nelif loss == answers["D"]:\n    answer =

In [8]:
from tqdm import  tqdm
import json

from src.utils.utils import add_notes

# Export the filtered Zalo samples as chat records: one user turn holding the
# question (after `add_notes`) and one assistant turn holding the step plan
# followed by the code.
# Mode "w" keeps the export idempotent across re-runs (append mode duplicated
# every record); UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/gsm8k_zalo/zalo_v1.jsonl", "w", encoding="utf-8") as f:
    for sample in tqdm(code_data_1):
        # Re-wrap the extracted snippet so any text outside the fence is dropped.
        code = f"```python\n{extract_code(sample['code'])}\n```"
        question = add_notes(sample['question'])
        user_content = f"Solve the following multiple-choices problem: {question}"
        assistant_content = (
            "Let's break it down step by step first:\n"
            f"{sample['instruction']}\n"
            "Here's the Python code based on the plan above:\n"
            f"{code}"
        )
        s = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

100%|██████████| 785/785 [00:00<00:00, 16837.79it/s]


In [75]:
print(code_data[5]["instruction"])

STEP
Step 1: Convert 10% to decimal form by dividing it by 100.
Step 2: Multiply the decimal form of 10% by 5 dm to find the answer.
Step 3: Compare the calculated result with each answer to choose the correct option.


In [71]:
drop_ids

[5,
 7,
 18,
 20,
 24,
 31,
 36,
 39,
 40,
 43,
 45,
 51,
 53,
 57,
 66,
 69,
 73,
 85,
 86,
 89,
 90,
 91,
 93,
 95,
 97,
 99,
 101,
 104,
 110,
 119,
 131,
 134,
 137,
 149,
 150,
 154,
 158,
 165,
 166,
 171,
 174,
 181,
 182,
 190,
 197,
 199,
 202,
 203,
 204,
 205,
 206,
 207,
 216,
 229,
 233,
 234,
 242,
 248,
 249,
 255,
 256,
 257,
 260,
 263,
 264,
 267,
 268,
 269,
 271,
 275,
 277,
 279,
 280,
 281,
 285,
 289,
 290,
 291,
 293,
 296,
 297,
 299,
 302,
 307,
 310,
 311,
 316,
 317,
 318,
 321,
 322,
 324,
 325,
 326,
 327,
 329,
 330,
 332,
 334,
 335,
 336,
 337,
 342,
 346,
 348,
 350,
 353,
 354,
 355,
 360,
 361,
 363,
 364,
 374,
 383,
 384,
 387,
 389,
 395,
 396,
 399,
 407,
 413,
 414,
 415,
 416,
 421,
 429,
 431,
 436,
 437,
 438,
 440,
 442,
 447,
 452,
 453,
 459,
 460,
 465,
 469,
 471,
 476,
 477,
 478,
 480,
 482,
 483,
 484,
 485,
 486,
 491,
 492,
 494,
 497,
 498,
 499,
 500,
 503,
 504,
 505,
 507,
 509,
 510,
 511,
 514,
 515,
 517,
 518,
 519,
 520,
 5

In [27]:
# NOTE(review): neither list is referenced again in the visible notebook. Every
# value also appears in the `drop_ids` output shown earlier, so these look like
# hand-picked positions of dropped samples — confirm what they record, or
# delete this cell.
add_id = [5, 7, 36, 43, 53, 66, 69, 73, 85, 86, 89, 90, 93, 97, 99, 101, 104, 110, 119]
correct_id = [91]

In [78]:
ind = 1205
print(code_data[ind]["question"])
print(code_data[ind]["instruction"])
print(code_data[ind]["answer"])

Một ô tô đi từ Hà Nội lúc 11 giờ trưa và đến Vinh lúc 5 giờ 30 phút chiều. Dọc đường ô tô dừng ở Ninh Bình và Thanh Hóa mỗi nơi 15 phút. Hỏi không kể thời gian dừng dọc đường, ô tô đi hết quãng đường từ Hà Nội đến Vinh mất bao nhiêu thời gian ?
A. 4 giờ 30 phút
B. 6 giờ 30 phút
C. 6 giờ 15 phút
D. 6 giờ

STEP
Step 1: Calculate the total time spent on the road by subtracting the time of departure from the time of arrival. 
Step 2: Subtract the time spent at each stop (Ninh Binh and Thanh Hoa) from the total time calculated in step 1. 
Step 3: Compare the results with the answers to find the correct option.
D. 6 giờ


In [79]:
print(code_data[ind]["code"])

CODE
```python
departure_time = 11  # in hours
arrival_time = 5 + 30/60  # in hours

total_time_on_road = arrival_time - departure_time

# Time spent at each stop in hours
stop_time = 15 / 60

# Subtract stop time from the total time on the road
total_time_without_stops = total_time_on_road - (2 * stop_time)

answers = {
    "A": 4 + 30/60,
    "B": 6 + 30/60,
    "C": 6 + 15/60,
    "D": 6
}
if total_time_without_stops == answers["A"]:
    answer = "A"
elif total_time_without_stops == answers["B"]:
    answer = "B"
elif total_time_without_stops == answers["C"]:
    answer = "C"
elif total_time_without_stops == answers["D"]:
    answer = "D"
else:
    answer = "No answer"

print(answer)
```

This code calculates the total time spent on the road without considering the time spent at stops in Ninh Binh and Thanh Hoa. It then compares this result with the provided answer choices to determine the correct option.


In [74]:
# Scratch check of the option-selection pattern used by the generated code.
x = (27 * 8) / 3

answers = {"A": 56, "B": 64, "C": 72, "D": 80}

# First option (in A-D order) whose value equals x; otherwise the fallback label.
answer = next(
    (label for label, value in answers.items() if x == value),
    "No answer",
)

print(answer)

C


In [69]:
remainder

9

# VI_GSM8K

In [1]:
import json 
import unicodedata

def normalize(s: str):
    """Return the Unicode NFC (canonical composed) form of ``s``."""
    normalized = unicodedata.normalize("NFC", s)
    return normalized

# Vietnamese GSM8K: each line is parsed as JSON and iterated as a collection of
# sample dicts. "question" is the normalized "query"; "answer" is the number
# that follows the last "####" in "response".
vi_data = []
with open("./data/GSM8K/gsm8k_vi.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        for sample in json.loads(line):
            sample["question"] = normalize(sample["query"])
            answer = normalize(sample["response"]).split("####")[-1]
            try:
                sample["answer"] = float(answer.replace(",", "").strip())
            except ValueError:
                # The final answer is not numeric: skip this sample. Only the
                # float conversion is guarded, so other errors stay visible.
                continue
            vi_data.append(sample)

# English GSM8K (train + test) with question and full answer text normalized.
en_data = []
for split_path in ("./data/GSM8K/train.jsonl", "./data/GSM8K/test.jsonl"):
    # `with` closes the handle even if a line fails to parse.
    with open(split_path, "r", encoding="utf-8") as f:
        for line in f:
            line = json.loads(line)
            line["question"] = normalize(line["question"])
            line["answer"] = normalize(line["answer"])
            en_data.append(line)


In [2]:
# Map each English answer text to the first question that carries it
# (setdefault keeps the first occurrence, matching a scan that breaks on the
# first hit) so every lookup is O(1) instead of a pass over en_data.
question_by_answer = {}
for s2 in en_data:
    question_by_answer.setdefault(s2["answer"], s2["question"])

for s1 in vi_data:
    # en_data answers were NFC-normalized when loaded, so the response is
    # normalized the same way before the lookup; comparing the raw response
    # could miss a pair that differs only in composition form.
    s1["en_question"] = question_by_answer.get(normalize(s1["response"]))

# Report the positions of samples with no English counterpart.
for i, s1 in enumerate(vi_data):
    if not s1["en_question"]:
        print(i)

In [4]:
# Load the verified GSM8K code samples, normalizing the question text so it
# can be matched against `en_question`.
code_data = []
# `with` closes the handle even if a line fails to parse; the encoding is
# explicit so the read does not depend on the platform default.
with open("./data/GSM8K/gsm8k_code.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = json.loads(line)
        line["question"] = normalize(line["question"])
        code_data.append(line)

In [5]:
len(code_data)

6648

In [10]:
# Index the code samples by question; setdefault keeps the first occurrence,
# matching what a linear scan with `break` would have returned.
sample_by_question = {}
for s2 in code_data:
    sample_by_question.setdefault(s2["question"], s2)

for s1 in vi_data:
    s1["code"] = None
    # en_question is None for unmatched samples, which simply finds nothing.
    match = sample_by_question.get(s1["en_question"])
    if match is not None:
        s1["code"] = match["code"]
        s1["instruction"] = match["instruction"]

# Keep only the samples that received (non-empty) code.
vi_data_1 = [s1 for s1 in vi_data if s1["code"]]

In [13]:
# Persist the Vietnamese samples that have code, one JSON object per line.
# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False writes the
# Vietnamese text verbatim.
with open("./data/GSM8K/vi_gsm8k_code.jsonl", "w", encoding="utf-8") as f:
    for s in vi_data_1:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

In [15]:
import re

# Non-greedy body: a greedy `[\s\S]*` extends to the LAST closing fence, so a
# reply with more than one fenced block yielded a single "snippet" containing
# the prose in between.
CODE_PATTERN = re.compile(r"```python([\s\S]*?)```")

def extract_code(s: str):
    """Return the body of the first ```python fenced block found in ``s``.

    The fences are not included; the newlines just inside them are kept.

    Raises:
        IndexError: if ``s`` contains no ```python fenced block.
    """
    return CODE_PATTERN.findall(s)[0]

In [12]:
vi_data_1[0]

{'query': 'Natalia đã bán clip cho 48 người bạn của mình vào tháng 4 và sau đó cô đã bán được một nửa số clip đó vào tháng 5. Natalia đã bán được tổng cộng bao nhiêu clip trong tháng 4 và tháng 5?',
 'response': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
 'question': 'Natalia đã bán clip cho 48 người bạn của mình vào tháng 4 và sau đó cô đã bán được một nửa số clip đó vào tháng 5. Natalia đã bán được tổng cộng bao nhiêu clip trong tháng 4 và tháng 5?',
 'answer': 72.0,
 'en_question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'code': '```python\n# Define the number of clips sold in April\nclips_april = 48\n\n# Divide the number of clips sold in April by 2 to calculate the number of clips sold in May\nclips_may = clips_april / 2\n\n# Add the number of clips sold in April and the numb

In [16]:
from tqdm import  tqdm

# Export the Vietnamese GSM8K samples as chat records: one user turn holding
# the problem and one assistant turn holding the step plan followed by the code.
# Mode "w" keeps the export idempotent across re-runs (append mode duplicated
# every record); UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/gsm8k_zalo/vi_gsm8k_v1.jsonl", "w", encoding="utf-8") as f:
    for sample in tqdm(vi_data_1):
        # Re-wrap the extracted snippet so any text outside the fence is dropped.
        code = f"```python\n{extract_code(sample['code'])}\n```"
        user_content = f"Solve the following problem: {sample['question']}"
        assistant_content = (
            "Let's break it down step by step first:\n"
            f"{sample['instruction']}\n"
            "Here's the Python code based on the plan above:\n"
            f"{code}"
        )
        s = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        f.write(json.dumps(s, ensure_ascii=False) + "\n")


100%|██████████| 5756/5756 [00:00<00:00, 25200.88it/s]


# EN_ZALO

In [1]:
import json 
import unicodedata

def normalize(s: str):
    """Return ``s`` in Unicode NFC (canonical composed) form."""
    result = unicodedata.normalize("NFC", s)
    return result

# English translations: each line is parsed as JSON and iterated as a
# collection of sample dicts whose "question" is NFC-normalized.
en_data = []
# `with` closes the handle even if a line fails to parse; the encoding is
# explicit so the read does not depend on the platform default.
with open("./data/zalo/train/math_train_en.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        for sample in json.loads(line):
            sample["question"] = normalize(sample["question"])
            en_data.append(sample)

# Original Vietnamese training set.
with open("./data/zalo/train/math_train.json", "r", encoding="utf-8") as f:
    vi_data = json.load(f)["data"]

for s in vi_data:
    s["question"] = normalize(s["question"])

# Code samples that passed the execution filter.
with open("./data/zalo/train/zalo_code_filtered.jsonl", "r", encoding="utf-8") as f:
    code_data = [json.loads(line) for line in f]

In [2]:
# Join the English samples to the Vietnamese originals on "id". A dict built in
# iteration order lets a later duplicate id overwrite an earlier one, which is
# the same "last match wins" result as a full scan without `break`.
vi_question_by_id = {s2["id"]: normalize(s2["question"]) for s2 in vi_data}

for s1 in en_data:
    # None when no Vietnamese sample shares this id.
    s1["vi_question"] = vi_question_by_id.get(s1["id"])

In [11]:
# Vietnamese arithmetic terms: minuend, subtrahend, addend, factor, dividend,
# divisor, commutative, associativity, associative property. The matching loop
# in this cell skips questions that contain any of them.
ban_words = [
    "số bị trừ",
    "số trừ",
    "số hạng",
    "thừa số",
    "số bị chia",
    "số chia",
    "giao hoán",
    "tính kết hợp",
    "tính chất kết hợp",
]

def contain_ban_word(s: str):
    """Return True when ``s`` contains at least one phrase from ``ban_words``."""
    return any(w in s for w in ban_words)

# Attach instruction/code to each English sample by locating its Vietnamese
# question (minus the last four characters) inside a code sample's question.
for s1 in en_data:
    s1["instruction"] = None
    s1["code"] = None
    vi_question = s1["vi_question"]
    # vi_question is None when the id join found nothing; slicing None would
    # raise TypeError. The banned-term check does not depend on the code
    # sample, so it is done once per question instead of once per pair.
    if not vi_question or contain_ban_word(vi_question):
        continue
    question_key = vi_question[:-4]
    # An empty key is a substring of every question and would match them all.
    if not question_key:
        continue
    for s2 in code_data:
        # No `break`: when several code samples match, the last one wins.
        if question_key in s2["question"]:
            s1["instruction"] = s2["instruction"]
            s1["code"] = s2["code"]

In [12]:
# Keep only the English samples that received a (non-empty) instruction.
en_data_1 = [s for s in en_data if s["instruction"]]

In [15]:
# Persist the English samples that have code, one JSON object per line.
# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False writes non-ASCII
# text verbatim.
with open("./data/zalo/train/zalo_code_en.jsonl", "w", encoding="utf-8") as f:
    for s in en_data_1:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

In [16]:
import re

# Non-greedy body: a greedy `[\s\S]*` extends to the LAST closing fence, so a
# reply with more than one fenced block yielded a single "snippet" containing
# the prose in between.
CODE_PATTERN = re.compile(r"```python([\s\S]*?)```")

def extract_code(s: str):
    """Return the body of the first ```python fenced block found in ``s``.

    The fences are not included; the newlines just inside them are kept.

    Raises:
        IndexError: if ``s`` contains no ```python fenced block.
    """
    return CODE_PATTERN.findall(s)[0]

In [17]:
from tqdm import  tqdm
import json

# Export the English Zalo samples as chat records: one user turn holding the
# question and one assistant turn holding the step plan followed by the code.
# Mode "w" keeps the export idempotent across re-runs (append mode duplicated
# every record); UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/gsm8k_zalo/en_zalo_v1.jsonl", "w", encoding="utf-8") as f:
    for sample in tqdm(en_data_1):
        # Re-wrap the extracted snippet so any text outside the fence is dropped.
        code = f"```python\n{extract_code(sample['code'])}\n```"
        question = sample["question"]
        user_content = f"Solve the following multiple-choices problem: {question}"
        assistant_content = (
            "Let's break it down step by step first:\n"
            f"{sample['instruction']}\n"
            "Here's the Python code based on the plan above:\n"
            f"{code}"
        )
        s = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

100%|██████████| 731/731 [00:00<00:00, 32864.59it/s]


# METAMATHQA

In [24]:
import json

# MetaMathQA is a single JSON document; GSM8K train is one JSON object per line.
# The encoding is explicit so neither read depends on the platform default.
with open("./data/MetaMathQA/MetaMathQA-395K.json", "r", encoding="utf-8") as f:
    metamath_data = json.load(f)

with open("./data/GSM8K/train.jsonl", "r", encoding="utf-8") as f:
    gsm8k_data = [json.loads(line) for line in f]

In [29]:
metamath_data[0]

{'query': 'Which letter corresponds to the graph of $y = |f(x)|$?',
 'response': 'The graph of $y = |f(x)|$ is the graph of $y = f(x)$ reflected across the x-axis for all values of $x$ where $f(x) < 0$.\nTherefore, the correct letter that corresponds to the graph of $y = |f(x)|$ is the letter $\\boxed{\\text{D}}$.The answer is: \\text{D}',
 'type': 'MATH_Rephrased'}

In [33]:
# Distinct values of the "type" field across the MetaMathQA samples.
types = {s["type"] for s in metamath_data}

print(types)

{'GSM_Rephrased', 'MATH_FOBAR', 'GSM_FOBAR', 'MATH_AnsAug', 'MATH_SV', 'GSM_SV', 'MATH_Rephrased', 'GSM_AnsAug'}


In [36]:
from tqdm import tqdm

# Samples whose "type" contains "GSM".
gsm8k_new = [s for s in tqdm(metamath_data) if "GSM" in s["type"]]

100%|██████████| 395000/395000 [00:00<00:00, 2160396.94it/s]


In [37]:
len(gsm8k_new)

240000

In [41]:
gsm8k_new[3]

{'query': 'Jerry has to shingle 3 roofs. Each roof consists of two slanted rectangular sides that measure 20 feet by 40 feet. If he requires 8 shingles to cover one square foot of roof, what is the total number of shingles he needs?',
 'response': 'The area of one side of the roof is 20 feet x 40 feet = 800 square feet\nSince there are two sides to each roof, the total area of one roof is 800 square feet x 2 = 1600 square feet\nJerry needs 8 shingles to cover 1 square foot, so to cover 1600 square feet he will need 1600 square feet x 8 shingles/square foot = 12800 shingles\nSince he has to shingle 3 roofs, the total number of shingles he needs is 12800 shingles x 3 roofs = 38400 shingles\n#### 38400\nThe answer is: 38400',
 'type': 'GSM_Rephrased'}

In [34]:
from tqdm import tqdm

# Add MetaMathQA GSM samples whose response equals a GSM8K answer but whose
# query is worded differently from that entry's question.
#
# The known questions are snapshotted per answer before anything is added.
# Previously the loop appended to `gsm8k_data` while scanning it, so later
# samples were also compared against entries added earlier in the same pass
# (which could re-add an original question) and each scan grew longer.
questions_by_answer = {}
for s2 in gsm8k_data:
    questions_by_answer.setdefault(s2["answer"], set()).add(s2["question"])

augmented = []
for s1 in tqdm(metamath_data):
    if "GSM" not in s1["type"]:
        continue
    known_questions = questions_by_answer.get(s1["response"])
    # Keep the sample when some known question with this answer differs from
    # the query, i.e. the known set is not just {query}.
    if known_questions and known_questions != {s1["query"]}:
        augmented.append({
            "question": s1["query"],
            "answer": s1["response"]
        })

gsm8k_data.extend(augmented)

100%|██████████| 395000/395000 [03:29<00:00, 1888.70it/s]


In [42]:
def normalize(s: str):
    """Return ``s`` converted to Unicode NFC (canonical composed) form."""
    as_nfc = unicodedata.normalize("NFC", s)
    return as_nfc

# Load the GSM8K train split; "answer" becomes the numeric value that follows
# the last "####" marker.
data = []
# `with` closes the handle even if a line fails to parse; the encoding is
# explicit so the read does not depend on the platform default.
with open("./data/GSM8K/train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = json.loads(line)
        line["question"] = normalize(line["question"])
        answer = normalize(line["answer"]).split("####")[-1]
        # Remove thousands separators ("1,234") before converting.
        line["answer"] = float(answer.replace(",", "").strip())
        data.append(line)

In [43]:
len(data)

7473

In [45]:
# Distinct numeric answers in the train split.
answers = {s["answer"] for s in data}

In [46]:
len(answers)

866

# Few shot prompting

# Retrieval-based few shot prompting

## zalo code

In [26]:
import json
import re
from tqdm import tqdm
from rank_bm25 import BM25Okapi

from src.utils.preprocess import  preprocess
from copy import  deepcopy

# Load the filtered Zalo code samples (explicit UTF-8: the text is Vietnamese).
with open("data/zalo/train/zalo_code_filtered.jsonl", "r", encoding="utf-8") as f:
    zalo_code = [json.loads(line) for line in f]

# Retrieval text: the first line of each question, lower-cased.
question_corpus = [s["question"].split("\n")[0].lower() for s in zalo_code]

#-----------------------------------------------

WORD_PATTERN = re.compile(r"[\w]+")
NUMBER_PATTERN = re.compile(r"[\d]+")

def remove_numbers(s: str):
    """Delete every run of digits from ``s``."""
    return NUMBER_PATTERN.sub("", s)

# Tokenize on word characters after dropping digits.
question_corpus = list(map(preprocess, tqdm(question_corpus)))
tokenized_question_corpus = list(map(
    lambda s: WORD_PATTERN.findall(remove_numbers(s)),
    tqdm(question_corpus)
))

bm25 = BM25Okapi(tokenized_question_corpus)

#---------------------------------------------

# Attach the two highest-scoring corpus samples to a copy of every sample.
new_zalo_code = []
for s in zalo_code:
    question = s["question"].split("\n")[0]
    question = preprocess(question, lowercase=True)
    tokenized_ques = WORD_PATTERN.findall(remove_numbers(question))
    new_s = deepcopy(s)
    # NOTE(review): the query sample is itself part of the indexed corpus, so
    # it is likely to come back among its own examples — confirm this is
    # intended or handled downstream.
    new_s["examples"] = bm25.get_top_n(tokenized_ques, zalo_code, n=2)
    new_zalo_code.append(new_s)

# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/zalo/train/zalo_code_filtered_1.jsonl", "w", encoding="utf-8") as f:
    for s in new_zalo_code:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

In [38]:
question_corpus[0]

'natalia đã bán clip cho 48 người bạn của mình vào tháng 4 và sau đó cô đã bán được một nửa số clip đó vào tháng 5. natalia đã bán được tổng cộng bao nhiêu clip trong tháng 4 và tháng 5?'

In [2]:
import json
# Load the filtered Zalo code samples (explicit UTF-8: the text is Vietnamese).
with open("data/zalo/train/zalo_code_filtered.jsonl", "r", encoding="utf-8") as f:
    zalo_code = [json.loads(line) for line in f]

# First line of each question, lower-cased.
question_corpus = [s["question"].split("\n")[0].lower() for s in zalo_code]

## en zalo code

In [32]:
import json
import re
from tqdm import tqdm
from rank_bm25 import BM25Okapi

from src.utils.preprocess import  preprocess
from copy import  deepcopy

# Load the English Zalo code samples (explicit UTF-8 so the read does not
# depend on the platform default).
with open("data/zalo/train/zalo_code_en.jsonl", "r", encoding="utf-8") as f:
    zalo_code = [json.loads(line) for line in f]

# Retrieval text: the first line of each question, lower-cased.
question_corpus = [s["question"].split("\n")[0].lower() for s in zalo_code]

#-----------------------------------------------

WORD_PATTERN = re.compile(r"[\w]+")
NUMBER_PATTERN = re.compile(r"[\d]+")

def remove_numbers(s: str):
    """Delete every run of digits from ``s``."""
    return NUMBER_PATTERN.sub("", s)

# Tokenize on word characters after dropping digits.
question_corpus = list(map(preprocess, tqdm(question_corpus)))
tokenized_question_corpus = list(map(
    lambda s: WORD_PATTERN.findall(remove_numbers(s)),
    tqdm(question_corpus)
))

bm25 = BM25Okapi(tokenized_question_corpus)

#---------------------------------------------

# Attach the two highest-scoring corpus samples to a copy of every sample.
new_zalo_code = []
for s in zalo_code:
    question = s["question"].split("\n")[0]
    question = preprocess(question, lowercase=True)
    tokenized_ques = WORD_PATTERN.findall(remove_numbers(question))
    new_s = deepcopy(s)
    # NOTE(review): the query sample is itself part of the indexed corpus, so
    # it is likely to come back among its own examples — confirm this is
    # intended or handled downstream.
    new_s["examples"] = bm25.get_top_n(tokenized_ques, zalo_code, n=2)
    new_zalo_code.append(new_s)

# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/zalo/train/zalo_code_en_1.jsonl", "w", encoding="utf-8") as f:
    for s in new_zalo_code:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

100%|██████████| 731/731 [00:00<00:00, 143212.49it/s]
100%|██████████| 731/731 [00:00<00:00, 91959.94it/s]


## gsm8k

In [34]:
import json
import re
from tqdm import tqdm
from rank_bm25 import BM25Okapi

from src.utils.preprocess import  preprocess
from copy import  deepcopy

# Load the verified GSM8K code samples (explicit UTF-8 so the read does not
# depend on the platform default).
with open("data/GSM8K/gsm8k_code.jsonl", "r", encoding="utf-8") as f:
    gsm8k_code = [json.loads(line) for line in f]

# Retrieval text: the first line of each question, lower-cased.
question_corpus = [s["question"].split("\n")[0].lower() for s in gsm8k_code]

#-----------------------------------------------

WORD_PATTERN = re.compile(r"[\w]+")
NUMBER_PATTERN = re.compile(r"[\d]+")

def remove_numbers(s: str):
    """Delete every run of digits from ``s``."""
    return NUMBER_PATTERN.sub("", s)

# Tokenize on word characters after dropping digits.
question_corpus = list(map(preprocess, tqdm(question_corpus)))
tokenized_question_corpus = list(map(
    lambda s: WORD_PATTERN.findall(remove_numbers(s)),
    tqdm(question_corpus)
))

bm25 = BM25Okapi(tokenized_question_corpus)

#---------------------------------------------

# Attach the two highest-scoring corpus samples to a copy of every sample.
new_gsm8k_code = []
for s in gsm8k_code:
    question = s["question"].split("\n")[0]
    question = preprocess(question, lowercase=True)
    tokenized_ques = WORD_PATTERN.findall(remove_numbers(question))
    new_s = deepcopy(s)
    # NOTE(review): the query sample is itself part of the indexed corpus, so
    # it is likely to come back among its own examples — confirm this is
    # intended or handled downstream.
    new_s["examples"] = bm25.get_top_n(tokenized_ques, gsm8k_code, n=2)
    new_gsm8k_code.append(new_s)

# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/GSM8K/gsm8k_code_1.jsonl", "w", encoding="utf-8") as f:
    for s in new_gsm8k_code:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

100%|██████████| 6648/6648 [00:00<00:00, 197377.61it/s]
100%|██████████| 6648/6648 [00:00<00:00, 39603.69it/s]


## vi gsm8k code

In [35]:
import json
import re
from tqdm import tqdm
from rank_bm25 import BM25Okapi

from src.utils.preprocess import  preprocess
from copy import  deepcopy

# Load the Vietnamese GSM8K code samples (explicit UTF-8: the text is
# Vietnamese).
with open("data/GSM8K/vi_gsm8k_code.jsonl", "r", encoding="utf-8") as f:
    gsm8k_code = [json.loads(line) for line in f]

# Retrieval text: the first line of each question, lower-cased.
question_corpus = [s["question"].split("\n")[0].lower() for s in gsm8k_code]

#-----------------------------------------------

WORD_PATTERN = re.compile(r"[\w]+")
NUMBER_PATTERN = re.compile(r"[\d]+")

def remove_numbers(s: str):
    """Delete every run of digits from ``s``."""
    return NUMBER_PATTERN.sub("", s)

# Tokenize on word characters after dropping digits.
question_corpus = list(map(preprocess, tqdm(question_corpus)))
tokenized_question_corpus = list(map(
    lambda s: WORD_PATTERN.findall(remove_numbers(s)),
    tqdm(question_corpus)
))

bm25 = BM25Okapi(tokenized_question_corpus)

#---------------------------------------------

# Attach the two highest-scoring corpus samples to a copy of every sample.
new_gsm8k_code = []
for s in gsm8k_code:
    question = s["question"].split("\n")[0]
    question = preprocess(question, lowercase=True)
    tokenized_ques = WORD_PATTERN.findall(remove_numbers(question))
    new_s = deepcopy(s)
    # NOTE(review): the query sample is itself part of the indexed corpus, so
    # it is likely to come back among its own examples — confirm this is
    # intended or handled downstream.
    new_s["examples"] = bm25.get_top_n(tokenized_ques, gsm8k_code, n=2)
    new_gsm8k_code.append(new_s)

# Mode "w" instead of "a" so a re-run replaces the file rather than appending
# duplicates; UTF-8 is explicit because ensure_ascii=False is used.
with open("./data/GSM8K/vi_gsm8k_code_1.jsonl", "w", encoding="utf-8") as f:
    for s in new_gsm8k_code:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")

100%|██████████| 5756/5756 [00:00<00:00, 98169.03it/s]
100%|██████████| 5756/5756 [00:00<00:00, 46041.59it/s]


# Inference test

In [13]:
from tqdm import  tqdm
import re
from src.utils.python_executor import PythonExecutor


# Non-greedy body: a greedy `[\s\S]*` extends to the LAST closing fence, so a
# reply with more than one fenced block yielded a single "snippet" containing
# the prose in between.
CODE_PATTERN = re.compile(r"```python([\s\S]*?)```")

def extract_code(s: str):
    """Return the body of the first ```python fenced block found in ``s``.

    The fences are not included; the newlines just inside them are kept.

    Raises:
        IndexError: if ``s`` contains no ```python fenced block.
    """
    return CODE_PATTERN.findall(s)[0]

# Shared executor for the helpers in this cell: they read index 0 of
# `executor.apply(code)` as the snippet's output and compare index 1 with 'Done'.
# NOTE(review): presumably `get_answer_from_stdout=True` makes index 0 the
# printed output — confirm against src.utils.python_executor.
executor = PythonExecutor(get_answer_from_stdout=True)

def is_float(string: str):
    """Report whether ``float(string)`` succeeds."""
    try:
        float(string)
    except ValueError:
        return False
    else:
        return True


# Captures the condition of `if` / `elif` lines shaped like `<name> == <expr>:`
# or `<name>==<expr>:`. Raw string: "\w" in a plain literal is an invalid escape
# sequence that newer Python versions warn about.
COMPARISON_PATTERN = re.compile(r"if ([\w]+==.*|[\w]+ ==.*):")


def fix_rounding_error(code: str):
    """Rewrite exact `==` comparisons in ``code`` as tolerance checks.

    The left-hand side of the first captured comparison is taken as the answer
    variable. The snippet is executed once with an extra ``print`` of that
    variable; if the printed value parses as a float, every captured comparison
    text is replaced (via ``str.replace``) with
    ``abs(<answer> - (<choice>)) < 1e-8``.

    Args:
        code: Python source of a generated solution.

    Returns:
        The rewritten source, or ``code`` unchanged when it has no matching
        comparison or the answer variable is not numeric.
    """
    comparisons = COMPARISON_PATTERN.findall(code)
    if len(comparisons) == 0:
        return code
    answer_variable = comparisons[0].split("==")[0].strip()
    answer_value = executor.apply(
        code + f"\nprint({answer_variable})"
    )[0]
    # The snippet may print lines of its own before the appended print, so only
    # the last output line is checked.
    # NOTE(review): harmless if the executor already returns just that line.
    output_lines = str(answer_value).strip().splitlines()
    last_line = output_lines[-1] if output_lines else ""
    if is_float(last_line):
        for comparison in comparisons:
            choice_variable = comparison.split("==")[-1].strip()
            # Parenthesize the right-hand side: without it `x == a + b` became
            # `abs(x - a + b)`, which is not the intended difference.
            str_to_replace = f"abs({answer_variable} - ({choice_variable})) < 1e-8"
            code = code.replace(comparison, str_to_replace)

    return code

def execute_python_code(code: str):
    """Run ``code`` and return ``(code, output)``.

    When the run reports 'Done' but prints something other than a single option
    letter A-E, the code is passed through ``fix_rounding_error`` and executed
    again; the possibly rewritten code and the second output are returned. An
    empty output on a 'Done' run yields the "Missing print function!!!!" marker.
    """
    output = executor.apply(code)
    # Any status other than 'Done' is returned untouched.
    if output[1] != 'Done':
        return code, output[0]
    if output[0] == '':
        return code, "Missing print function!!!!"
    if output[0].strip() in ("A", "B", "C", "D", "E"):
        return code, output[0]

    # No option letter was printed: retry with tolerant comparisons.
    code = fix_rounding_error(code)
    output = executor.apply(code)
    return code, output[0]

In [39]:
code = """
initial_short_side = 8
initial_long_side = 12
increase_area = 25

# Step 2: Write the equation
# Step 2 substituted: (8 + x) * 12 = 25 + (8 * 12)
# Simplified equation: x * 12 = 25 + (8 * 12)
# x = (25 + (8 * 12)) / 12
x = (25 + (8 * 12)) / 12

# Step 3: Calculate the percentage increase
percentage_increase = ((x / 12) * 100)

# Step 4: Compare the results with the answers to find the correct option
answers = {
   "A": 125,
   "B": 20,
   "C": 25,
   "D": 50
}
if percentage_increase == answers["A"]:
   answer = "A"
elif percentage_increase == answers["B"]:
   answer = "B"
elif percentage_increase == answers["C"]:
   answer = "C"
elif percentage_increase == answers["D"]:
   answer = "D"
else:
   answer = "No answer"

print(answer)
"""

In [40]:
_, out = execute_python_code(code)

In [41]:
out

'No answer\n'