In [1]:
import gadgets

[2023-08-29 10:10:12,800] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

from gadgets.model import gadget_assisted_model
from gadgets.gadget import Calculator

# Checkpoint used for both the gadget-assisted model and its tokenizer.
model_name = "emnlp2023/calc-flan-xl"

# Build the gadget-assisted variant of the stock T5 class, then load weights
# and tokenizer from the same checkpoint.
GadgetAssistedT5 = gadget_assisted_model(T5ForConditionalGeneration)
model = GadgetAssistedT5.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Register the tokenizer and the calculator gadget with the model before
# calling generate().
model.prepare_for_generate(
    tokenizer,
    enabled_gadgets=[Calculator()],
    default_max_tokens=512,
)

# Smoke test: one word problem, decoded with the gadget markup left visible.
query = """
    The profit from a business transaction is shared among 2 business partners, 
    Mike and Johnson in the ratio 2:5 respectively. 
    If Johnson got $2500, how much will Mike have 
    after spending some of his share on a shirt that costs $200?
"""

inputs = tokenizer(query, return_tensors="pt")
output_ids = model.generate(**inputs)
tokenizer.decode(
    output_ids[0],
    spaces_between_special_tokens=False,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


'According to the ratio, for every 5 parts that Johnson gets, Mike gets 2 parts Since Johnson got $2500, each part is therefore $2500/5 = $<gadget id="calculator">2500/5</gadget><output>500</output> 500 Mike will get 2*$500 = $<gadget id="calculator">2*500</gadget><output>1_000</output> 1000 After buying the shirt he will have $1000-$200 = $<gadget id="calculator">1000-200</gadget><output>800</output> 800 left. Final result is<result>800</result></s>'

In [3]:
import datasets
import pandas as pd
import numpy as np

# Filtered test split (via `datasets`) and the original test file (local JSONL).
ape_filtered = datasets.load_dataset(
    "emnlp2023/calc-ape210k",
    split="test",
)
ape_orig = pd.read_json(
    "../data/ape210k/my_tests.ape.jsonl",
    lines=True,
)

# Example ids of each version as sets; the original ids are cast to int first
# so they compare equal to the filtered ids.
orig_id_list = ape_orig["id"].astype(int).tolist()
ids_orig = {*orig_id_list}
ids_filtered = {*ape_filtered["id"]}

In [4]:
# Every example kept by the filter must come from the original test set.
assert ids_filtered <= ids_orig

# Ids present in the original test set but missing from the filtered one.
ids_dropped = ids_orig - ids_filtered

print("Number of examples in original test set:", len(ids_orig))
print("Number of examples in filtered test set:", len(ids_filtered))
print("Number of examples dropped:", len(ids_dropped))

# Fixed seed so the same 20 dropped examples are drawn on every run.
# NOTE(review): the candidate list is taken in set-iteration order on purpose;
# sorting it would change which ids get sampled.
np.random.seed(0)
ids_for_manual_eval = np.random.choice(list(ids_dropped), size=20, replace=False)

Number of examples in original test set: 5000
Number of examples in filtered test set: 4867
Number of examples dropped: 133


In [5]:
# Show the 20 ids drawn (without replacement) from the dropped examples.
ids_for_manual_eval

array([ 198030,  649201, 1224805, 1112101,  536172,  133003,  294840,
        981913, 1054891,  422815, 1066241,  759191,  286888, 1069969,
        338514,  240587, 1099153,  309636,  114724,  324700])

In [6]:
# Keep only the original-test-set rows whose id was sampled for manual review.
selected_for_eval = ape_orig["id"].isin(ids_for_manual_eval)
ape_eval = ape_orig.loc[selected_for_eval]
ape_eval.shape

(20, 5)

In [7]:
# Display the original-test-set rows selected for manual evaluation.
ape_eval

Unnamed: 0,id,question_chinese,question_english_mt,equation,result
46,294840,(4/7)x-2(5/11)=0.25-(1/28)x．,(4/7)x-2(5/11)=0.25-(1/28)x.,x=(2(5/11)+0.25)/(4/7+1/28),4(5/11)
234,309636,小强以平均每小时4千米的速度登山，下山时以平均每小时8千米的速度原路返回．那么，来回一次平均...,Xiaoqiang climbed the mountain at an average ...,x=(1+1)/((1/4)+(1/8)),5(1/3)
324,1054891,列式计算；甲数是17(1/2)，乙数是20，甲数比乙数少百分之几？,Column formula calculation; the number A is 1...,x=(20-17(1/2))/20,12.5%
377,981913,面包车的速度是每小时60千米，在面包车开出30分钟后，一辆小轿车以每小时84千米的速度从同一...,The speed of the van is 60 kilometers per hou...,x=60*0.5/(84-60),1.25
591,198030,有一项工程，由甲单独做，12小时完成；乙单独做，9小时完成．如果按甲先乙后，每人每次1小时轮...,There is a project that A alone can complete i...,x=5*2+(1-((1/12)+(1/9))*5)/(1/12),10(1/3)
676,649201,在一次献爱心捐款活动中，五1班37名男生共捐款250元，26名女生每人捐10元．全班平均每人...,"In a love donation activity, 37 boys from Cla...",x=8(22/63),8(22/63)
838,1224805,解方程：x-(2/7)x=(3/4)(2-(3/7)x)．,Solve the equation: x-(2/7)x=(3/4)(2-(3/7)x).,x=(3/4*2)/(1+3/4*3/7-2/7),1(13/29)
887,324700,一件工作，甲队独做12天完成，乙队15天只能完成这件工作的75%，甲、乙合做完成这件工作需要...,A task can be completed by team A alone in 12...,x=1/((1/12)+(1/(15/75%))),7(1/2)
1196,1066241,1.25与3.4的和乘5.6与4(4/5)的差，积是多少？,What is the product of the sum of 1.25 and 3....,x=(1.25+3.4)*(5.6-4(4/5)),3.72
1346,338514,(9.3*(5/6)-7.3)/2(1/4)．,(9.3*(5/6)-7.3)/2(1/4).,x=(9.3*(5/6)-7.3)/2(1/4),0.2


In [8]:
# Look up the machine-translated question text of a single example by id.
ape_eval.set_index("id").loc[1112101, "question_english_mt"]

' 2(1/2)/((5/7)+2(1/2)).'

In [9]:
# Hand-written reference answers for the 20 sampled examples.
# `result` is None when no reference answer was given; `comment` says why
# or records a remark about the example.
manual_results = pd.DataFrame([
    {"id": 294840, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 309636, "result": "16/3", "comment": "this is a catch, it's not just the average of the two speeds"},
    {"id": 1054891, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 981913, "result": "1.25", "comment": "I'm not sure why this is dropped. But I checked the answer anyways and it's correct."},
    # NOTE(review): the dataset's answer for 198030 is 10(1/3), i.e. 31/3;
    # "10/3" may be a typo for that value — confirm before relying on it.
    {"id": 198030, "result": "10/3", "comment": ""},
    {"id": 649201, "result": "170/21", "comment": "But this is not the same as in answer column in any interpretation"},
    {"id": 1224805, "result": "42/29", "comment": ""},
    {"id": 324700, "result": "15/2", "comment": ""},
    {"id": 1066241, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 338514, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 114724, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 286888, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 1069969, "result": "49/4", "comment": ""},
    {"id": 422815, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
    {"id": 240587, "result": "18/11", "comment": ""},
    {"id": 1099153, "result": "11/3", "comment": ""},
    {"id": 536172, "result": "29/4", "comment": ""},
    {"id": 759191, "result": None, "comment": "english question (machine-translated) does not make sense"},
    {"id": 133003, "result": "40/11", "comment": ""},
    {"id": 1112101, "result": None, "comment": "ambiguous question (contains implicit multiplication vs mixed fraction)"},
])

# Sanity checks on the hand-entered table (a bare `.shape` expression here
# would be silently discarded): 20 rows, 3 columns, one row per id.
assert manual_results.shape == (20, 3), manual_results.shape
assert manual_results["id"].is_unique, "duplicate id in manual annotations"

# Number of examples left without a manual reference answer.
manual_results["result"].isna().sum()

9

In [10]:
# Attach the original dataset columns to each manual annotation, matching on
# id. Both frames have a `result` column, so they are told apart by suffix.
ape_eval_by_id = ape_eval.set_index("id")
df = manual_results.join(
    ape_eval_by_id,
    on="id",
    how="left",
    lsuffix="_manual",
    rsuffix="_orig",
)
df

Unnamed: 0,id,result_manual,comment,question_chinese,question_english_mt,equation,result_orig
0,294840,,ambiguous question (contains implicit multipli...,(4/7)x-2(5/11)=0.25-(1/28)x．,(4/7)x-2(5/11)=0.25-(1/28)x.,x=(2(5/11)+0.25)/(4/7+1/28),4(5/11)
1,309636,16/3,"this is a catch, it's not just the average of ...",小强以平均每小时4千米的速度登山，下山时以平均每小时8千米的速度原路返回．那么，来回一次平均...,Xiaoqiang climbed the mountain at an average ...,x=(1+1)/((1/4)+(1/8)),5(1/3)
2,1054891,,ambiguous question (contains implicit multipli...,列式计算；甲数是17(1/2)，乙数是20，甲数比乙数少百分之几？,Column formula calculation; the number A is 1...,x=(20-17(1/2))/20,12.5%
3,981913,1.25,I'm not sure why this is dropped. But I checke...,面包车的速度是每小时60千米，在面包车开出30分钟后，一辆小轿车以每小时84千米的速度从同一...,The speed of the van is 60 kilometers per hou...,x=60*0.5/(84-60),1.25
4,198030,10/3,,有一项工程，由甲单独做，12小时完成；乙单独做，9小时完成．如果按甲先乙后，每人每次1小时轮...,There is a project that A alone can complete i...,x=5*2+(1-((1/12)+(1/9))*5)/(1/12),10(1/3)
5,649201,170/21,But this is not the same as in answer column i...,在一次献爱心捐款活动中，五1班37名男生共捐款250元，26名女生每人捐10元．全班平均每人...,"In a love donation activity, 37 boys from Cla...",x=8(22/63),8(22/63)
6,1224805,42/29,,解方程：x-(2/7)x=(3/4)(2-(3/7)x)．,Solve the equation: x-(2/7)x=(3/4)(2-(3/7)x).,x=(3/4*2)/(1+3/4*3/7-2/7),1(13/29)
7,324700,15/2,,一件工作，甲队独做12天完成，乙队15天只能完成这件工作的75%，甲、乙合做完成这件工作需要...,A task can be completed by team A alone in 12...,x=1/((1/12)+(1/(15/75%))),7(1/2)
8,1066241,,ambiguous question (contains implicit multipli...,1.25与3.4的和乘5.6与4(4/5)的差，积是多少？,What is the product of the sum of 1.25 and 3....,x=(1.25+3.4)*(5.6-4(4/5)),3.72
9,338514,,ambiguous question (contains implicit multipli...,(9.3*(5/6)-7.3)/2(1/4)．,(9.3*(5/6)-7.3)/2(1/4).,x=(9.3*(5/6)-7.3)/2(1/4),0.2


In [11]:
def generate_prediction(question):
    """Run the gadget-assisted model on one question and return the decoded text.

    Relies on the notebook-level ``tokenizer`` and ``model``. No truncation
    argument is passed to the tokenizer, and ``skip_special_tokens`` is not
    set when decoding.

    Args:
        question: The question text to feed to the model.

    Returns:
        The decoded string for the first generated sequence.
    """
    encoded = tokenizer(question, return_tensors="pt")
    generated = model.generate(**encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, spaces_between_special_tokens=False)


# Generate one model output per machine-translated question.
df["model_prediction"] = df["question_english_mt"].map(generate_prediction)

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


In [12]:
# Parse each raw prediction and keep the last element of what the parser
# returns (presumably the final result — confirm against gadgets.markup).
parsed_predictions = df["model_prediction"].apply(gadgets.markup.from_model_markup)
final_answers = parsed_predictions.apply(lambda parsed: parsed[-1]).fillna("")

# Keep only the text before the first "=" and trim surrounding whitespace.
df["model_result"] = final_answers.str.split("=").apply(lambda parts: parts[0].strip())

In [25]:
# Score only rows that have a manual reference answer; the remaining rows
# end up as NaN in the new column.
has_reference = df["result_manual"].notna()
df["model_correct"] = df.loc[has_reference].apply(
    lambda row: gadgets.metrics.are_numeric_results_same(
        row["model_result"],
        row["result_manual"],
    ),
    axis=1,
)
df["model_correct"]

0       NaN
1      True
2       NaN
3      True
4     False
5      True
6     False
7      True
8       NaN
9       NaN
10      NaN
11      NaN
12     True
13      NaN
14    False
15     True
16     True
17      NaN
18     True
19      NaN
Name: model_correct, dtype: object

In [14]:
# Load the baseline checkpoint and its tokenizer. Unlike `model`, this is a
# plain T5ForConditionalGeneration, not the gadget-assisted class.
baseline_model_name = "emnlp2023/baseline-flan-xl"
baseline_model = T5ForConditionalGeneration.from_pretrained(baseline_model_name)
baseline_tokenizer = T5Tokenizer.from_pretrained(baseline_model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
def baseline_generate_prediction(question):
    """Run the baseline model on one question and return the decoded text.

    Encodes and decodes with ``baseline_tokenizer`` — the tokenizer loaded
    from the baseline checkpoint — so the baseline model is never paired with
    the gadget model's tokenizer.

    Args:
        question: The question text to feed to the baseline model.

    Returns:
        The decoded string for the first generated sequence, with special
        tokens skipped.
    """
    inputs = baseline_tokenizer(question, return_tensors="pt")
    # Generation length is capped explicitly at 512.
    output_ids = baseline_model.generate(**inputs, max_length=512)
    return baseline_tokenizer.decode(
        output_ids[0],
        spaces_between_special_tokens=False,
        skip_special_tokens=True,
    )


def baseline_extract_result(prediction):
    """Pull the final answer string out of a baseline model prediction.

    Takes the text after the last "The final result is" marker (or the whole
    prediction if the marker is absent), keeps what precedes the first "=",
    removes all spaces, and drops trailing periods.

    Args:
        prediction: Decoded prediction text.

    Returns:
        The cleaned answer string.
    """
    _, _, after_marker = prediction.rpartition("The final result is")
    answer, _, _ = after_marker.strip().partition("=")
    compact = answer.replace(" ", "")
    return compact.rstrip(".")

In [16]:
# Generate one baseline output per machine-translated question.
df["baseline_prediction"] = df["question_english_mt"].map(baseline_generate_prediction)

In [20]:
# Extract the baseline's answer from each prediction with the library helper.
extract_baseline_result = gadgets.baseline_metrics.get_result_from_output
df["baseline_result"] = df["baseline_prediction"].apply(extract_baseline_result)
df["baseline_result"]

0        3
1     32/5
2      1/5
3      5/2
4        3
5       12
6      3/7
7     20/3
8        6
9        1
10     210
11     1/5
12    21/4
13     3/2
14     140
15     5/3
16     7/4
17      12
18    40/9
19     7/4
Name: baseline_result, dtype: object

In [24]:
# Score the baseline only on rows that have a manual reference answer; the
# remaining rows end up as NaN in the new column.
has_reference = df["result_manual"].notna()
df["baseline_correct"] = df.loc[has_reference].apply(
    lambda row: gadgets.metrics.are_numeric_results_same(
        row["baseline_result"],
        row["result_manual"],
    ),
    axis=1,
)
df["baseline_correct"]


0       NaN
1     False
2       NaN
3     False
4     False
5     False
6     False
7     False
8       NaN
9       NaN
10      NaN
11      NaN
12    False
13      NaN
14    False
15    False
16    False
17      NaN
18    False
19      NaN
Name: baseline_correct, dtype: object

In [27]:
# Side-by-side view: reference answer, both systems' answers and verdicts.
summary_columns = [
    "id",
    "result_manual",
    "baseline_result",
    "baseline_correct",
    "model_result",
    "model_correct",
    "comment",
]
df[summary_columns]

Unnamed: 0,id,result_manual,baseline_result,baseline_correct,model_result,model_correct,comment
0,294840,,3,,15/2,,ambiguous question (contains implicit multipli...
1,309636,16/3,32/5,False,16/3,True,"this is a catch, it's not just the average of ..."
2,1054891,,1/5,,-7/17,,ambiguous question (contains implicit multipli...
3,981913,1.25,5/2,False,5/4,True,I'm not sure why this is dropped. But I checke...
4,198030,10/3,3,False,22/3,False,
5,649201,170/21,12,False,170/21,True,But this is not the same as in answer column i...
6,1224805,42/29,3/7,False,12/25,False,
7,324700,15/2,20/3,False,15/2,True,
8,1066241,,6,,22.32,,ambiguous question (contains implicit multipli...
9,338514,,1,,1.8,,ambiguous question (contains implicit multipli...
