In [1]:
import datasets
from datasets import load_dataset
import re
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.1")
model = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.1").set_train(False)
model.generation_config = GenerationConfig.from_pretrained("lmsys/vicuna-7b-v1.1")
model.generation_config.do_sample = False

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Downloading shards:   0%|          | 0/2 [00:18<?, ?it/s]


KeyboardInterrupt: 

## 算术数据部分

In [None]:
file_path = 'gsm8k_test.parquet'
# 使用 load_dataset 函数加载 Parquet 文件
dataset = load_dataset('parquet', data_files=file_path)
# 获取测试集
test = dataset['train']  # 由于我们只加载一个文件，数据集会自动命名为 'train'
# 打印一些样本数据
print(test)

In [3]:
q_lens = [len(d['question']) for d in dataset['train']]

In [45]:
np.percentile(q_lens, [50, 80, 90, 95, 99, 100])

array([222.  , 308.4 , 366.  , 431.  , 538.82, 848.  ])

In [46]:
np.argmax(q_lens)

1077

In [4]:
input_text = dataset['train'][1077]['question']

In [5]:
dataset['train'][1077]['answer']

'The first train stays in the station for 10 minutes * 2 = <<10*2=20>>20 minutes.\nSo Paul waits for the first train for 20 + 10 = <<20+10=30>>30 minutes.\nThe second train arrives after half an hour, which is the same as 60 minutes / 2 = <<60/2=30>>30 minutes.\nIt then stays in the station for a quarter of the time the first train did which is 20 minutes / 4 = <<20/4=5>>5 minutes.\nSo Paul waits for the second train for 30 + 5 = <<30+5=35>>35 minutes.\nThe third train arrives an hour later, which is the same as 60 minutes * 1 hour = <<60*1=60>>60 minutes.\nAfter the final wait, Paul has waited a total of 30 minutes for the first train + 35 minutes for the second train + 60 minutes for the third train + 20 minutes final wait = <<30+35+60+20=145>>145 minutes.\n#### 145'

In [40]:
test = dataset['train'][:50]

# 准确率计算函数
def calculate_accuracy(predictions, references):
    correct = 0
    total = len(references)
    for pred, ref in zip(predictions, references):
        if pred == ref:
            correct += 1
    return correct / total

# 提取数据集中的问题和答案
questions = test['question']
print(test['answer'][0])
answers = [int(re.findall(r'#### (\d+)', ans)[0]) for ans in test['answer']]
print(questions[0],questions[1])
print(answers[0],answers[1])
predictions = []


Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
18 3


In [8]:
def decode(outputs, tokenizer, raw_text_len):
    """
    Decodes the model outputs using the provided tokenizer and processes the decoded text.
    
    Args:
        outputs (torch.Tensor): The model outputs.
        tokenizer (Tokenizer): The tokenizer object used for decoding.
        raw_text_len (int): Length of the raw text.

    Returns:
        str: Processed decoded text.
    """
    # Decode the model outputs using the tokenizer
    decoded_text = tokenizer.decode(outputs[0][raw_text_len:])
    
    # Process the decoded text
    res = res.split("<|endoftext|>")[0]       # Get the first part before an empty string (likely a typo, consider fixing)
    res = res.split("\n\n\n")[0]              # Get the first part before multiple consecutive newlines
    res = res.split("Q:")[0]                  # Get the first part before "Q:" (likely indicating a new question)
    
    return res


def inference(model, tokenizer, question, max_new_tokens):
    """
    Generates text based on a given question using the provided model and tokenizer.
    
    Args:
        model (PreTrainedModel): The pre-trained model for inference.
        tokenizer (Tokenizer): The tokenizer object used for tokenization.
        question (str): The input question for inference.
        max_new_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: Generated text based on the input question.
    """
    # Tokenize the input question
    input_ids = tokenizer.encode(question, return_tensors="ms")
    
    # Generate text based on the input question
    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)
    
    # Decode and process the generated text
    generated_text = decode(outputs, tokenizer, len(input_ids))
    
    return generated_text

In [None]:
prompt_1 = "Let's think step by step."
prompt_2 = "Therefore, the answer (arabic numerals) is "
# Loop through each question and its corresponding index
for i, question in enumerate(questions):
    print(f"\nSample {i + 1}:")  # Print the sample number

    # Formulate the first prompt by appending the question to the initial prompt
    input_1 = f"Q: {question}.{prompt_1}"
    
    # Generate text based on the first prompt with a maximum of 150 new tokens
    output_1 = inference(model, tokenizer, input_1, 150)
    print(output_1)  # Print the generated output from the first prompt

    # Update the second prompt by combining the first prompt, its generated output, and the existing second prompt
    prompt_2 = f"{input_1}{output_1}{prompt_2}"
    
    # Generate text based on the updated second prompt with a maximum of 20 new tokens
    output_2 = inference(model, tokenizer, prompt_2, 20)
    print(output_2)  # Print the generated output from the second prompt
    
    # Extract the last number found in the generated output (if any) as the predicted answer
    prediction = re.findall(r'\d+', output_2)[-1]
    
    # Check if a prediction is found
    if prediction:
        predictions.append(int(prediction[0]))  # Append the predicted answer to the predictions list
        print(f"Predicted Answer: {int(prediction[0])}")  # Print the predicted answer
    else:
        predictions.append(None)  # If no number is found, append None to the predictions list
        print("Predicted Answer: None")  # Print that no prediction is found



In [69]:
accuracy = calculate_accuracy(predictions, answers)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Accuracy: 50.00%


In [49]:
# 打印一些样本数据
print("\nSample Outputs:")
for q, a, p in zip(questions[:5], answers[:5], predictions[:5]):
    print(f"Question: {q}")
    print(f"True Answer: {a}")
    print(f"Predicted Answer: {p}")
    print()


Sample Outputs:
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
True Answer: 18
Predicted Answer: 26

Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
True Answer: 3
Predicted Answer: 3

Question: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?
True Answer: 70000
Predicted Answer: 105000

Question: James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?
True Answer: 540
Predicted Answer: 540

Question: Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, conta

## 常识问答部分


In [None]:
file_path = 'train-00000-of-00001.parquet'
dataset2 = load_dataset('parquet', data_files=file_path)
# 获取测试集
dataset2 = dataset2['train'][:50]  # 由于我们只加载一个文件，数据集会自动命名为 'train'
# 打印一些样本数据
print(dataset2)

In [57]:
# 准确率计算函数
def calculate_accuracy(predictions, references):
    correct = 0
    total = len(references)
    for pred, ref in zip(predictions, references):
        if pred == ref:
            correct += 1
    return correct / total

# 提取数据集中的问题和答案
# 从数据集中提取问题和答案
questions = dataset2['question']
choices = dataset2['choices']

# 将问题和选择合并成一个字符串列表
combined_questions = []
for i in range(len(questions)):
    combined_question = questions[i]
    for j in range(len(choices[i]['label'])):
        combined_question += f" {choices[i]['label'][j]}: {choices[i]['text'][j]}"
    combined_questions.append(combined_question)
questions = combined_questions
# 提取答案
answers = dataset2['answerKey']
print(questions[0],answers[0])
predictions = []
prompt_1 = "Let's think step by step."
prompt_2 = "Therefore, among A through E, the answer is "

The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? A: ignore B: enforce C: authoritarian D: yell at E: avoid A


In [None]:
for i, question in enumerate(questions):
    print(f"\nSample {i + 1}:")
    input_1 = f"Q: {question}.{prompt_1}"
    output_1 = inference(model,tokenizer,input_1,150)
    print(f"output1:{output_1}")
    prompt_2 = f"{input_1}{output_1}{prompt_2}"
    output_2 = inference(model,tokenizer,prompt_2,10)
    print(f"output2:{output_2}")

    # 提取生成文本中的第一个大写字母作为预测答案
    prediction = re.findall(r'\b[A-E]\b', output_2)[-1]
    if prediction:
        predictions.append(prediction[0])
        print(f"Predicted Answer: {prediction[0]}")
    else:
        predictions.append(None)  # 如果没有找到大写字母，则预测为空
        print("Predicted Answer: None")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Sample 1:
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? A: ignore B: enforce C: authoritarian D: yell at E: avoid.Let's think step by step. B: enforce. The sanctions against the school were a punishing blow, and they seemed to enforce the efforts the school had made to change.
|

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? A: ignore B: enforce C: authoritarian D: yell at E: avoid.Let's think step by step.: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? A: ignore B: enforce C: authoritarian D: yell at E: avoid.Let's think step by step. B: enforce. The sanctions against the school were a punishing blow, and they seemed to enforce the efforts the school had made to change.Therefore, among A through E, the answer is  B: enforce.
Predicted Answer: B

Sample 2:
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Sammy wanted to go to where the people were.  Where might he go? A: race track B: populated areas C: the desert D: apartment E: roadblock.Let's think step by step. B: populated areas
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Sammy wanted to go to where the people were.  Where might he go? A: race track B: populated areas C: the desert D: apartment E: roadblock.Let's think step by step.: Sammy wanted to go to where the people were.  Where might he go? A: race track B: populated areas C: the desert D: apartment E: roadblock.Let's think step by step. B: populated areas
Predicted Answer: B

Sample 3:
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: To locate a choker not located in a jewelry box or boutique where would you go? A: jewelry store B: neck C: jewlery box D: jewelry box E: boutique.Let's think step by step. A choker can be found in a jewelry store as they sell a variety of necklaces and jewelry items.
-

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: To locate a choker not located in a jewelry box or boutique where would you go? A: jewelry store B: neck C: jewlery box D: jewelry box E: boutique.Let's think step by step.: To locate a choker not located in a jewelry box or boutique where would you go? A: jewelry store B: neck C: jewlery box D: jewelry box E: boutique.Let's think step by step. A choker can be found in a jewelry store as they sell a variety of necklaces and jewelry items.
Predicted Answer: A

Sample 4:
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Google Maps and other highway and street GPS services have replaced what? A: united states B: mexico C: countryside D: atlas E: oceans.Let's think step by step. Google Maps and other highway and street GPS services have replaced atlases. An atlas is a collection of maps, while GPS services provide real-time location and navigation using a smartphone or GPS device. So, the answer is A: atlas.
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Google Maps and other highway and street GPS services have replaced what? A: united states B: mexico C: countryside D: atlas E: oceans.Let's think step by step.: Google Maps and other highway and street GPS services have replaced what? A: united states B: mexico C: countryside D: atlas E: oceans.Let's think step by step. Google Maps and other highway and street GPS services have replaced atlases. An atlas is a collection of maps, while GPS services provide real-time location and navigation using a smartphone or GPS device. So, the answer is A: atlas.
Predicted Answer: A

Sample 5:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: The fox walked from the city into the forest, what was it looking for? A: pretty flowers. B: hen house C: natural habitat D: storybook E: dense forest.Let's think step by step. The fox is looking for pretty flowers in the forest. So, the answer is A.
|

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: The fox walked from the city into the forest, what was it looking for? A: pretty flowers. B: hen house C: natural habitat D: storybook E: dense forest.Let's think step by step.: The fox walked from the city into the forest, what was it looking for? A: pretty flowers. B: hen house C: natural habitat D: storybook E: dense forest.Let's think step by step. The fox is looking for pretty flowers in the forest. So, the answer is A.
Predicted Answer: A

Sample 6:
-

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: What home entertainment equipment requires cable? A: radio shack B: substation C: cabinet D: television E: desk.Let's think step by step. Television is a popular home entertainment which requires cable. The answer: D.
|

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: What home entertainment equipment requires cable? A: radio shack B: substation C: cabinet D: television E: desk.Let's think step by step.: What home entertainment equipment requires cable? A: radio shack B: substation C: cabinet D: television E: desk.Let's think step by step. Television is a popular home entertainment which requires cable. The answer: D.
Predicted Answer: D

Sample 7:
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: The only baggage the woman checked was a drawstring bag, where was she heading with it? A: garbage can B: military C: jewelry store D: safe E: airport.Let's think step by step. E: airport. The woman was heading to the airport with her drawstring bag.
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: The only baggage the woman checked was a drawstring bag, where was she heading with it? A: garbage can B: military C: jewelry store D: safe E: airport.Let's think step by step.: The only baggage the woman checked was a drawstring bag, where was she heading with it? A: garbage can B: military C: jewelry store D: safe E: airport.Let's think step by step. E: airport. The woman was heading to the airport with her drawstring bag.
Predicted Answer: E

Sample 8:
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what? A: carpet B: refrigerator C: breadbox D: fridge E: coach.Let's think step by step. B: refrigerator is the correct answer. The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his refrigerator. Mold is a common issue with food that has not been stored properly, and the refrigerator is the most common place for this to happen. Mold can grow on food and cause health problems if ingested, so it's important to store food properly and check it regularly for signs of spoilage.
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what? A: carpet B: refrigerator C: breadbox D: fridge E: coach.Let's think step by step.: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what? A: carpet B: refrigerator C: breadbox D: fridge E: coach.Let's think step by step. B: refrigerator is the correct answer. The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his refrigerator. Mold is a common issue with food that has not been stored properly, and the refrigerator is the most common place for this to happen. Mold can grow on food and cause health problems if ingested, so it's important to store food properly and check it regularly for signs of spoilage.
Predicted Answer: B

Sample 9:
|

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: What do people use to absorb extra ink from a fountain pen? A: shirt pocket B: calligrapher's hand C: inkwell D: desk drawer E: blotter.Let's think step by step. The correct answer is E: blotter. A blotter is a piece of paper or cloth used to absorb excess ink from a fountain pen or other writing instrument. It is typically placed on top of the paper or writing surface to prevent smudging or smearing of the ink. The other options, such as a shirt pocket, calligrapher's hand, inkwell, or desk drawer, are not typically used to absorb extra ink from a fountain pen.
-

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: What do people use to absorb extra ink from a fountain pen? A: shirt pocket B: calligrapher's hand C: inkwell D: desk drawer E: blotter.Let's think step by step.: What do people use to absorb extra ink from a fountain pen? A: shirt pocket B: calligrapher's hand C: inkwell D: desk drawer E: blotter.Let's think step by step. The correct answer is E: blotter. A blotter is a piece of paper or cloth used to absorb excess ink from a fountain pen or other writing instrument. It is typically placed on top of the paper or writing surface to prevent smudging or smearing of the ink. The other options, such as a shirt pocket, calligrapher's hand, inkwell, or desk drawer, are not typically used to absorb extra ink from a fountain pen.
Predicted Answer: A

Sample 10:
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Where is a business restaurant likely to be located? A: town B: at hotel C: mall D: business sector E: yellow pages.Let's think step by step. A business restaurant is likely to be located in a business sector. So, the answer is D.
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Where is a business restaurant likely to be located? A: town B: at hotel C: mall D: business sector E: yellow pages.Let's think step by step.: Where is a business restaurant likely to be located? A: town B: at hotel C: mall D: business sector E: yellow pages.Let's think step by step. A business restaurant is likely to be located in a business sector. So, the answer is D.
Predicted Answer: D

Sample 11:
-

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Where do you put your grapes just before checking out? A: mouth B: grocery cart C: super market D: fruit basket E: fruit market.Let's think step by step. A: Before checking out, you put your grapes in your grocery cart. So, the answer is B.
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Where do you put your grapes just before checking out? A: mouth B: grocery cart C: super market D: fruit basket E: fruit market.Let's think step by step.: Where do you put your grapes just before checking out? A: mouth B: grocery cart C: super market D: fruit basket E: fruit market.Let's think step by step. A: Before checking out, you put your grapes in your grocery cart. So, the answer is B.
Predicted Answer: B

Sample 12:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Before getting a divorce, what did the wife feel who was doing all the work? A: harder B: anguish C: bitterness D: tears E: sadness.Let's think step by step. The answer is: E: sadness.
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Before getting a divorce, what did the wife feel who was doing all the work? A: harder B: anguish C: bitterness D: tears E: sadness.Let's think step by step.: Before getting a divorce, what did the wife feel who was doing all the work? A: harder B: anguish C: bitterness D: tears E: sadness.Let's think step by step. The answer is: E: sadness.
Predicted Answer: E

Sample 13:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Johnny sat on a bench and relaxed after doing a lot of work on his hobby.  Where is he? A: state park B: bus depot C: garden D: gym E: rest area.Let's think step by step. A person does a hobby when he is free. The bench in the garden is used to relax in free time. So, the answer is C.
-

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Johnny sat on a bench and relaxed after doing a lot of work on his hobby.  Where is he? A: state park B: bus depot C: garden D: gym E: rest area.Let's think step by step.: Johnny sat on a bench and relaxed after doing a lot of work on his hobby.  Where is he? A: state park B: bus depot C: garden D: gym E: rest area.Let's think step by step. A person does a hobby when he is free. The bench in the garden is used to relax in free time. So, the answer is C.
Predicted Answer: C

Sample 14:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: James was cooling off two quickly.  He would die if he didn't find some way to stop what? A: loss of heat B: revenge C: expansion D: relaxation E: calm down.Let's think step by step. A: James was cooling off too quickly, and he would die if he didn't find some way to stop the loss of heat. The other options, such as revenge, expansion, relaxation, and calm down, are not relevant to the situation described.
/

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: James was cooling off two quickly.  He would die if he didn't find some way to stop what? A: loss of heat B: revenge C: expansion D: relaxation E: calm down.Let's think step by step.: James was cooling off two quickly.  He would die if he didn't find some way to stop what? A: loss of heat B: revenge C: expansion D: relaxation E: calm down.Let's think step by step. A: James was cooling off too quickly, and he would die if he didn't find some way to stop the loss of heat. The other options, such as revenge, expansion, relaxation, and calm down, are not relevant to the situation described.
Predicted Answer: A

Sample 15:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:: Of all the rooms in a house it was his favorite, the aromas always drew him to the what? A: yard B: basement C: kitchen D: living room E: garden.Let's think step by step. The answer: C.
\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:: Of all the rooms in a house it was his favorite, the aromas always drew him to the what? A: yard B: basement C: kitchen D: living room E: garden.Let's think step by step.: Of all the rooms in a house it was his favorite, the aromas always drew him to the what? A: yard B: basement C: kitchen D: living room E: garden.Let's think step by step. The answer: C.
Predicted Answer: C

Sample 16:


In [2]:
# 计算准确率
accuracy = calculate_accuracy(predictions, answers)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Accuracy: 68.00%


In [61]:
# 打印一些样本数据
print("\nSample Outputs:")
for q, a, p in zip(questions[:15], answers[:15], predictions[:15]):
    print(f"Question: {q}")
    print(f"True Answer: {a}")
    print(f"Predicted Answer: {p}")
    print()


Sample Outputs:
Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? A: ignore B: enforce C: authoritarian D: yell at E: avoid
True Answer: A
Predicted Answer: B

Question: Sammy wanted to go to where the people were.  Where might he go? A: race track B: populated areas C: the desert D: apartment E: roadblock
True Answer: B
Predicted Answer: B

Question: To locate a choker not located in a jewelry box or boutique where would you go? A: jewelry store B: neck C: jewlery box D: jewelry box E: boutique
True Answer: A
Predicted Answer: A

Question: Google Maps and other highway and street GPS services have replaced what? A: united states B: mexico C: countryside D: atlas E: oceans
True Answer: D
Predicted Answer: A

Question: The fox walked from the city into the forest, what was it looking for? A: pretty flowers. B: hen house C: natural habitat D: storybook E: dense forest
True Answer: C
Predicted Answer: A

Q