In [1]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Read the Hugging Face token from the environment and log in with it.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
from transformers import AutoTokenizer


# Hub identifier of the checkpoint whose tokenizer is loaded below.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# The token read earlier is forwarded to the download call.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
)

In [4]:
from datasets import load_dataset

# Fetch the "train" split of the Orca math word-problem dataset from the Hub.
dataset = load_dataset(
    "microsoft/orca-math-word-problems-200k",
    split="train",
)

# Bare expression so the notebook renders the dataset summary.
dataset

Generating train split: 100%|██████████| 200035/200035 [00:00<00:00, 325387.44 examples/s]


Dataset({
    features: ['question', 'answer'],
    num_rows: 200035
})

In [5]:
from pprint import pprint

# Peek at the first raw record to see its 'question' and 'answer' fields.
dataset[0]

{'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.',
 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.'}

In [6]:
# System prompt placed in the "system" turn of every conversation built by
# create_conversation. It asks for a step-by-step solution to a high school
# math problem, laid out as Markdown "# Steps" and "# Notes" sections.
# Note: the visually blank lines inside the literal each hold one space
# character, so they are part of the prompt text; leave them untouched.
system_message = """Solve the given high school math problem by providing a clear explanation of each step leading to the final solution.
 
Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner.
 
# Steps
 
1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values.
2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities).
3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution.
4. **Double Check**: If applicable, double check the work for accuracy and sense, and mention potential alternative approaches if any.
5. **Final Answer**: Provide the numerical or algebraic solution clearly, accompanied by appropriate units if relevant.
 
# Notes
 
- Always clearly define any variable or term used.
- Wherever applicable, include unit conversions or context to explain why each formula or step has been chosen.
- Assume the level of mathematics is suitable for high school, and avoid overly advanced math techniques unless they are common at that level.
"""
 
# convert to messages 
def create_conversation(sample, system_prompt=None):
  """Turn one question/answer record into a chat-style ``messages`` record.

  Args:
    sample: Mapping with a ``"question"`` and an ``"answer"`` entry.
    system_prompt: Text for the system turn. When left as ``None`` the
      module-level ``system_message`` is used, so existing single-argument
      callers (e.g. ``dataset.map(create_conversation)``) behave as before.

  Returns:
    A dict with a single ``"messages"`` key holding the system, user and
    assistant turns, in that order.

  Raises:
    KeyError: If ``sample`` lacks ``"question"`` or ``"answer"``.
  """
  # Resolve the default lazily so the global is only needed when no prompt is
  # passed; an explicit empty string is honoured rather than replaced.
  if system_prompt is None:
    system_prompt = system_message
  return {
    "messages": [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]},
    ]
  }
 

 
# Convert dataset to OAI messages.
# Guarded so the cell can be re-run: after the first pass the 'question' and
# 'answer' columns are gone, and mapping again would raise KeyError: 'question'.
# To rebuild with a changed prompt, reload the dataset first.
if "messages" not in dataset.column_names:
  dataset = dataset.map(
    create_conversation,
    remove_columns=dataset.column_names,  # drop the raw columns, keep only 'messages'
    batched=False,
  )

# Spot-check one converted record.
pprint(dataset[345]["messages"])

# Save the dataset to disk; the call's return value is shown as the cell output.
dataset.to_json("train_dataset.json", orient="records")

Map: 100%|██████████| 200035/200035 [00:12<00:00, 15422.65 examples/s]


[{'content': 'Solve the given high school math problem by providing a clear '
             'explanation of each step leading to the final solution.\n'
             ' \n'
             'Provide a detailed breakdown of your calculations, beginning '
             'with an explanation of the problem and describing how you derive '
             'each formula, value, or conclusion. Use logical steps that build '
             'upon one another, to arrive at the final answer in a systematic '
             'manner.\n'
             ' \n'
             '# Steps\n'
             ' \n'
             '1. **Understand the Problem**: Restate the given math problem '
             'and clearly identify the main question and any important given '
             'values.\n'
             '2. **Set Up**: Identify the key formulas or concepts that could '
             'help solve the problem (e.g., algebraic manipulation, geometry '
             'formulas, trigonometric identities).\n'
             '3. **Solve Ste

Creating json from Arrow format: 100%|██████████| 201/201 [00:04<00:00, 48.44ba/s]


541572034

In [6]:
# Display the converted dataset; its only remaining feature is 'messages'.
dataset

Dataset({
    features: ['messages'],
    num_rows: 200035
})