In [5]:
import random
import numpy as np
import torch


def set_seeds(seed=42):
    # Set the Python seed for the random module
    random.seed(seed)
    
    # Set the seed for numpy
    np.random.seed(seed)
    
    # Set the seed for PyTorch on CPU and GPU
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # If using multiple GPUs

    # Ensure deterministic algorithms for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seeds(4291)


In [6]:
data_file = "/lustre/fsw/portfolios/llmservice/users/sghotra/data/grade-school-math/grade_school_math/data/train.jsonl"

import json
import random

def load_data(data_file):
    with open(data_file, "r") as f:
        data = [json.loads(line) for line in f]
    return data


data = load_data(data_file)
random.shuffle(data)

In [7]:
data[1]

{'question': 'Asha has to buy some gifts but has no money. She decides to borrow $20 from her brother, $40 from her father, $30 from her mother and is gifted $70 by her granny. She adds this to her savings of $100 and goes out to Target to get what she wanted. If she spent 3/4 of the money, how much money did she remain with?',
 'answer': 'With the amount borrowed from her family, she has a total of 20+40+30= <<20+40+30=90>>90 dollars.\nWith the amount gifted from her granny, she has 70+90 = <<70+90=160>>160 dollars.\nShe adds this to her savings, ending up with a total of 160+100 = <<160+100=260>>260 dollars.\nShe spends 3/4 of this money, which is 3/4*260 = <<3/4*260=195>>195 dollars.\nThe total she remains with is 260-195 = <<260-195=65>>65 dollars\n#### 65'}

GSM examples (train):

1. Reese has been practicing piano for four hours every week. How many hours will he practice after five months?',
'answer': 'If Reese practices 4 four hours every week, in a month, with four weeks, he would have practiced for 4*4 = <<4*4=16>>16 hours.\nIn five months, Reese would have practiced for 5*16 = <<5*16=80>>80 hours\n#### 80'}

2. {'question': 'Asha has to buy some gifts but has no money. She decides to borrow $20 from her brother, $40 from her father, $30 from her mother and is gifted $70 by her granny. She adds this to her savings of $100 and goes out to Target to get what she wanted. If she spent 3/4 of the money, how much money did she remain with?',
 'answer': 'With the amount borrowed from her family, she has a total of 20+40+30= <<20+40+30=90>>90 dollars.\nWith the amount gifted from her granny, she has 70+90 = <<70+90=160>>160 dollars.\nShe adds this to her savings, ending up with a total of 160+100 = <<160+100=260>>260 dollars.\nShe spends 3/4 of this money, which is 3/4*260 = <<3/4*260=195>>195 dollars.\nThe total she remains with is 260-195 = <<260-195=65>>65 dollars\n#### 65'}

In [7]:
import transformers
import torch

base_model = "/lustre/fsw/portfolios/llmservice/users/sghotra/models/pre_train/Meta-Llama-3.1-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation", 
    model=base_model, 
    model_kwargs={"torch_dtype": torch.bfloat16}, 
    device_map="auto"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model, padding_side="left", truncation_side="left")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it]


In [8]:
inst = f"""Given the following problem, reason (subtle flawed) and give a final answer (wrong) to the problem. Specifically, you should introduce a subtle flaws in the steps that challenges the users to understand the problem better by learning to find the flaw in the solution. DO NOT POINT OUT THE FLAW IN THE SOLUTION. Make sure it's as logically coherent as possible while still being subtly flawed. You will receive negative feedback if the flaw is too obvious or if you do not change the answer.

Task format:

Question:
<A question here>

Solution:
Step 1: <Step 1 here>
Step 2: <Step 2 here>
.
.
.
Step n: <Step n here>
The final answer is 6

Note: Your response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.

Problem:
{{question}}

Solution:
Step 1:"""

In [9]:
question = "Reese has been practicing piano for four hours every week. How many hours will he practice after five months?"
full_q = inst.format(question=question)
messages = [
    {"role": "user", "content": full_q},
]
# template_question = tokenizer.apply_chat_template(messages, tokenize=False)


In [10]:
outputs = pipeline(
    messages,
    do_sample=False,
    # temperature=1.0,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1]['content'])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Since Reese practices piano for four hours every week, we need to calculate the total number of weeks in five months. There are approximately 4 weeks in a month, so there are 5 * 4 = 20 weeks in five months.

Step 2: Now that we know there are 20 weeks in five months, we can multiply the number of weeks by the number of hours Reese practices each week. This will give us the total number of hours Reese practices in five months. 20 weeks * 4 hours/week = 80 hours.

Step 3: However, we need to consider that Reese practices piano for four hours every week, but we don't know if he practices the same amount of hours every day. Let's assume he practices for 4 hours a day, which is a reasonable assumption. There are 7 days in a week, so Reese practices for 4 * 7 = 28 hours every week. But we already know that he practices for 4 hours every week, so we can just use that.

Step 4: Now that we have the correct number of hours Reese practices every week, we can multiply it by the number of weeks i

In [11]:
i = 1

In [12]:
i += 1
question = data[i]["question"]
full_q = inst.format(question=question)
messages = [
    {"role": "user", "content": full_q},
]
outputs = pipeline(
    messages,
    do_sample=False,
    # temperature=1.0,
    max_new_tokens=256,
)
print(data[i])
print()
print(outputs[0]["generated_text"][-1]['content'])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'question': 'Bill needs to soak his clothes for 4 minutes to get rid of each grass stain and 7 additional minutes to get rid of each marinara stain. If his clothes have 3 grass stains and 1 marinara stain, how long does he need to soak them?', 'answer': 'First find the total soaking time to get rid of the grass stains: 4 minutes/stain * 3 stains = <<4*3=12>>12 minutes\nThen add the soak time for the marinara to find the total soak time: 12 minutes + 7 minutes = <<12+7=19>>19 minutes\n#### 19'}

Step 1: First, let's calculate the total time needed to get rid of the grass stains. Since each grass stain requires 4 minutes, and there are 3 grass stains, we can multiply the number of grass stains by the time needed to get rid of each one: 3 * 4 = 12 minutes.

Step 2: Next, we need to calculate the additional time needed to get rid of the marinara stain. Since each marinara stain requires 7 additional minutes, and there is only 1 marinara stain, we can simply add 7 minutes to the total time

In [13]:
i += 1
question = data[i]["question"]
full_q = inst.format(question=question)
messages = [
    {"role": "user", "content": full_q},
]
outputs = pipeline(
    messages,
    do_sample=False,
    # temperature=1.0,
    max_new_tokens=256,
)
print(data[i])
print()
print(outputs[0]["generated_text"][-1]['content'])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'question': 'Toms car gets 50 miles to the gallon.  He drives 75 miles per day.  If gas is $3 per gallon how much does he spend on gas in 10 days?', 'answer': 'He drives 75*10=<<75*10=750>>750 miles\nSo he uses 750/50=<<750/50=15>>15 gallons\nThat means he spends 15*3=$<<15*3=45>>45 on gas\n#### 45'}

Step 1: First, let's calculate how many miles Tom drives in 10 days. Since he drives 75 miles per day, we multiply 75 miles by 10 days to get the total miles driven. 75 miles/day * 10 days = 750 miles.

Step 2: Next, we need to calculate how many gallons of gas Tom uses in 10 days. Since his car gets 50 miles to the gallon, we divide the total miles driven by 50 miles/gallon to get the total gallons used. 750 miles / 50 miles/gallon = 15 gallons.

Step 3: Now, we need to calculate the total cost of gas for 10 days. Since gas is $3 per gallon, we multiply the total gallons used by $3 per gallon to get the total cost. However, we need to consider that Tom's car gets 50 miles to the gallon,

In [14]:
i = 3
i += 1
question = data[i]["question"]
full_q = inst.format(question=question)
messages = [
    {"role": "user", "content": full_q},
]
outputs = pipeline(
    messages,
    do_sample=False,
    # temperature=1.0,
    max_new_tokens=256,
)
print(data[i])
print(data[i])
print()
print(outputs[0]["generated_text"][-1]['content'])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'question': "Jake is from a conservative household so during lunch at school one day, he gave one-quarter of his feeding allowance to a hungry friend. If candies cost 20 cents apiece and Jake's feeding allowance is $4, how many candies can his friend purchase with the money Jake gave to him?", 'answer': 'One quarter of $4 is (1/4)*$4 = $<<(1/4)*4=1>>1\n$1 is equivalent to 100 cents\n20 cents will buy 1 candy so 100 cents will buy 100/20 = <<100/20=5>>5 candies\n#### 5'}
{'question': "Jake is from a conservative household so during lunch at school one day, he gave one-quarter of his feeding allowance to a hungry friend. If candies cost 20 cents apiece and Jake's feeding allowance is $4, how many candies can his friend purchase with the money Jake gave to him?", 'answer': 'One quarter of $4 is (1/4)*$4 = $<<(1/4)*4=1>>1\n$1 is equivalent to 100 cents\n20 cents will buy 1 candy so 100 cents will buy 100/20 = <<100/20=5>>5 candies\n#### 5'}

First, let's calculate the amount of money Jake

In [15]:
i += 1
question = data[i]["question"]
full_q = inst.format(question=question)
messages = [
    {"role": "user", "content": full_q},
]
outputs = pipeline(
    messages,
    do_sample=False,
    # temperature=1.0,
    max_new_tokens=256,
)
print(data[i])
print(data[i])
print()
print(outputs[0]["generated_text"][-1]['content'])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'question': 'The number of math problems that Marvin practiced today is three times as many as the number of problems he solved yesterday. His friend, Arvin, has practiced twice as many math problems on each day. How many math problems have they practiced altogether if Marvin solved 40 math problems yesterday?', 'answer': 'If Marvin solved 40 math problems yesterday, he has solved 40*3 = <<40*3=120>>120 math problems today.\nThe total number of math problems that Marvin has practiced is 120+40 = <<120+40=160>>160\nArvin has practiced twice as many as Marvin, so has practiced 160 * 2 = <<160*2=320>>320 problems.\nTogether, Marvin and Arvin have solved 320+160 = <<320+160=480>>480 math problems.\n#### 480'}
{'question': 'The number of math problems that Marvin practiced today is three times as many as the number of problems he solved yesterday. His friend, Arvin, has practiced twice as many math problems on each day. How many math problems have they practiced altogether if Marvin solved

In [13]:
i += 1
question = data[i]["question"]
full_q = inst.format(question=question)
messages = [
    {"role": "user", "content": full_q},
]
outputs = pipeline(
    messages,
    do_sample=False,
    # temperature=1.0,
    max_new_tokens=256,
)
print(data[i])
print(data[i])
print()
print(outputs[0]["generated_text"][-1]['content'])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'question': 'John bakes 12 coconut macaroons, each weighing 5 ounces. He then packs an equal number of the macaroons in 4 different brown bags, ready for delivery. When he briefly leaves the kitchen to pick the phone, his little brother Steve eats the entire contents of one of the brown bags.  What is the total weight, in ounces, of the remaining coconut macaroons?', 'answer': 'Each brown bag has 12/4 = <<12/4=3>>3 macaroons\nAfter Steve eats a bag, there are 12-3 = <<12-3=9>>9 macaroons left\nThe total weight is therefore 9 * 5 = <<9*5=45>>45 ounces.\n#### 45'}
{'question': 'John bakes 12 coconut macaroons, each weighing 5 ounces. He then packs an equal number of the macaroons in 4 different brown bags, ready for delivery. When he briefly leaves the kitchen to pick the phone, his little brother Steve eats the entire contents of one of the brown bags.  What is the total weight, in ounces, of the remaining coconut macaroons?', 'answer': 'Each brown bag has 12/4 = <<12/4=3>>3 macaroons\