In [1]:
!pip install transformers datasets peft bitsandbytes trl auto-gptq openai

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting openai
  Downloading openai-1.45.0-py3-none-any.whl.metadata (22 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py31

In [2]:
import os
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from transformers import BitsAndBytesConfig
import logging

In [4]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
model_name = "AI-MO/NuminaMath-7B-TIR"

In [6]:
# Quantization handled within QLoRA automatically
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Ensures model is loaded in 4-bit precision
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [7]:
# Load the model with quantization settings
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [8]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [9]:
# Configure QLoRA settings
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["q_proj","k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none"
)

In [10]:
# Prepare the model for QLoRA training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)


In [11]:
# Display trainable parameters
model.print_trainable_parameters()

trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569


In [13]:
# Load the combined training and test datasets
combined_train_df = pd.read_csv('/content/combined_train_new.csv')
combined_test_df = pd.read_csv('/content/combined_test_new.csv')

# Convert the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(combined_train_df)
test_dataset = Dataset.from_pandas(combined_test_df)

# Create a DatasetDict to match the structure of raw_datasets
raw_datasets = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [14]:
def preprocess_data(examples):
    prompts = [f"[INST] {problem} [/INST]" for problem in examples['problem']]
    tokenized_inputs = tokenizer(prompts, truncation=True, max_length=1024, padding='max_length')
    tokenized_inputs['labels'] = tokenizer(
        examples['solution'], truncation=True, max_length=1024, padding='max_length')['input_ids']
    return tokenized_inputs

In [15]:
# Apply the preprocessing function
tokenized_datasets = raw_datasets.map(preprocess_data, batched=True)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [16]:
print(tokenized_datasets["train"].column_names)


['problem', 'solution', 'Unnamed: 2', 'input_ids', 'attention_mask', 'labels']


In [17]:
def formatting_func(examples):
    # Example: Combine 'problem' and 'solution' if required
    return {'formatted_text': [f"Problem: {p} | Solution: {s}" for p, s in zip(examples['problem'], examples['solution'])]}


In [18]:
# Trainer configuration
sft_config = SFTConfig(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-05,
    weight_decay=0.02,
    fp16=True,
    optim="paged_adamw_8bit",
    greater_is_better=False,
    output_dir="./model_output_1",  # Save the best model here
    metric_for_best_model="eval_loss",
    label_names=["labels"]  # Ensure labels are passed correctly for loss calculation
)



In [19]:
# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    formatting_func=formatting_func
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [20]:
# Ensure the trainer can return loss during evaluation
trainer.can_return_loss = True

In [None]:
# Train the model
print("*** Training the model ***")
trainer.train()

In [23]:
# Save the best model
trainer.save_model(sft_config.output_dir)
logger.info(f"Best model saved to {sft_config.output_dir}")


In [24]:
# Load the best saved model
best_model_path = "./model_output_1"  # Replace with your actual saved model path
model_test = AutoModelForCausalLM.from_pretrained(best_model_path)
tokenizer_test = AutoTokenizer.from_pretrained(best_model_path, use_fast=True)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [25]:
def generate_response(model, tokenizer, input_text, max_length=1024):
    # Encode the input text
    inputs_test = tokenizer_test.encode(input_text, return_tensors="pt")

    # Ensure the model and inputs are on the correct device (CPU or GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_test.to(device)
    inputs_test = inputs_test.to(device)

    # Generate response
    output_test = model_test.generate(
        inputs_test,
        max_length=1024,
        num_return_sequences=1,  # Number of responses to generate
        do_sample=False,          # Use sampling; set to False for deterministic output
        top_p=0.95,              # Nucleus sampling (top-p)
        temperature=0.7          # Control randomness; lower is more conservative
    )

    # Decode the output
    response = tokenizer_test.decode(output_test[0], skip_special_tokens=True)
    return response

In [37]:
# Provide a custom input prompt
custom_input = "A ball is thrown vertically upward with an initial velocity of 15 m/s from the ground. Question: How high does the ball go?"


In [38]:
# Generate and print the response
response = generate_response(model_test, tokenizer_test, custom_input)
print("Custom Input:", custom_input)
print("Model Response:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


Custom Input: A ball is thrown vertically upward with an initial velocity of 15 m/s from the ground. Question: How high does the ball go?
Model Response: A ball is thrown vertically upward with an initial velocity of 15 m/s from the ground. Question: How high does the ball go? How long does it take to reach its highest point? Ignore air resistance.
### Solution: To solve this problem, we need to use the equations of motion under constant acceleration due to gravity. Here are the steps to solve the problem:

1. **Determine the maximum height reached:**
   The height \( h \) reached by an object thrown upward with an initial velocity \( u \) under gravity \( g \) is given by the equation:
   \[
   h = ut - \frac{1}{2}gt^2
   \]
   At the highest point, the velocity becomes zero. So, we need to find the time \( t \) it takes to reach the highest point using:
   \[
   v = u - gt
   \]
   where \( v = 0 \) (at the highest point).

2. **Calculate the time to reach the highest point:**
   \[


## Evaluation & Comparison of models


In [70]:
solution = response

In [71]:
import re

In [72]:
def extract_python_code(solution):
    # Basic regex to extract code blocks
    code_blocks = re.findall(r'```python\n(.*?)\n```', solution, re.DOTALL)
    if code_blocks:
        return code_blocks[0]  # Return the first block if there are multiple
    return None


In [73]:
def execute_python_code(code):
    local_vars = {}
    try:
        exec(code, globals(), local_vars)
        return local_vars
    except Exception as e:
        return str(e)


In [74]:
# Extract and execute the Python code
python_code = extract_python_code(solution)
if python_code:
    print("Extracted Python Code:\n", python_code)
    execution_result = execute_python_code(python_code)
    print("Execution Result:\n", execution_result)
else:
    print("No Python code found in the solution.")


Extracted Python Code:
 import sympy as sp

# Define the variables
x, y = sp.symbols('x y')

# Define the temperature distribution function T(x, y)
T = 100 * sp.exp(-0.1 * x) * sp.cos(0.5 * y)

# Compute the partial derivative of T with respect to x
dT_dx = sp.diff(T, x)

# Evaluate the partial derivative at the point (2, pi)
rate_of_change_at_point = dT_dx.subs({x: 2, y: sp.pi})

# Print the result
print(rate_of_change_at_point)
0
Execution Result:
 {'sp': <module 'sympy' from '/usr/local/lib/python3.10/dist-packages/sympy/__init__.py'>, 'x': x, 'y': y, 'T': 100*exp(-0.1*x)*cos(0.5*y), 'dT_dx': -10.0*exp(-0.1*x)*cos(0.5*y), 'rate_of_change_at_point': 0}


In [28]:
!pip install openai




In [39]:
from openai import OpenAI

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key="insert-your-api-key-here",

)


def chat_gpt_with_client(prompt):
    """
    Function to interact with ChatGPT using the OpenAI client object.
    """
    try:
        # Generate the completion using OpenAI's chat completion endpoint
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # Extract and return the response content
        return response.choices[0].message.content

    except Exception as e:
        print(f"An error occurred: {e}")
        return None


prompt = "A ball is thrown vertically upward with an initial velocity of 15 m/s from the ground. Question: How high does the ball go?"

chatgpt_response = chat_gpt_with_client(prompt)


if chatgpt_response:
    print(f"ChatGPT's Response:\n{chatgpt_response}")


ChatGPT's Response:
The height a ball reaches can be determined by using the equation of motion: h = v² / (2g), where h represents the height, v represents the initial velocity, and g represents the gravitational acceleration (approximated as 9.8 m/s²).

Substituting the given values into the equation, we get h = 15² / (2 * 9.8) = 11.47 meters

Therefore, the ball goes approximately 11.47 meters high.


In [49]:
import spacy


nlp = spacy.load("en_core_web_sm")

def extract_steps_nlp(response):
    """
    Extract steps from a model's response using NLP techniques with SpaCy.

    Parameters:
    - response: The complete response string from the model.

    Returns:
    - steps: A list of identified steps extracted from the response.
    """
    # List to hold extracted steps
    steps = []

    # Expanded step indicator keywords and patterns
    step_keywords = [
        'step', 'compute', 'to compute', 'to evaluate', 'to find', 'next', 'thus', 'evaluate', 'implement', 'calculate', 'find',
        'apply', 'determine', 'perform', 'execute', 'derive', 'show', 'use',
        'examine', 'obtain', 'simplify', 'solve'
    ]
    step_patterns = ['\d+\.', 'step \d+', 'step', 'procedure', 'solution', 'process']

    # Parse the response with SpaCy
    doc = nlp(response)

    # Split the response into sentences
    sentences = list(doc.sents)

    print("Debug: Extracting steps...\n")  # Debugging line

    # Iterate through each sentence and look for step indicators
    for sent in sentences:
        sent_text = sent.text.strip().lower()

        # Debug print to check sentence contents
        print(f"Debug: Checking sentence - {sent.text.strip()}")

        # Check if the sentence contains step keywords or matches step patterns
        if any(keyword in sent_text for keyword in step_keywords) or any(nlp(sent_text).text.startswith(pattern) for pattern in step_patterns):
            steps.append(sent.text.strip())
            print(f"Debug: Step detected - {sent.text.strip()}")  # Debugging line

        # Check for sentences starting with numbers or action verbs commonly used in steps
        elif sent.text.strip().startswith(('1.', '2.', '3.', 'step', 'Step')):
            steps.append(sent.text.strip())
            print(f"Debug: Step detected by numbering - {sent.text.strip()}")  # Debugging line

        # Detect and separate code blocks or inline code
        elif '```' in sent.text or sent_text.startswith('```python') or sent_text.startswith('print'):
            steps.append(sent.text.strip())
            print(f"Debug: Code block detected - {sent.text.strip()}")  # Debugging line

        elif sent_text.startswith(('import', 'def', '#')) and len(steps) > 0:
            # Check if the code step is part of the previous extracted step; concatenate it
            steps[-1] += f"\n{sent.text.strip()}"
            print(f"Debug: Code appended to previous step - {sent.text.strip()}")  # Debugging line

    # Merge steps if they seem logically part of the same step, for better readability
    steps = merge_related_steps(steps)

    # Check if steps were successfully extracted
    if not steps:
        print("Debug: No steps detected. Returning the original response.")
        return [response]  # Return the response as it is if no steps are detected

    return steps

def merge_related_steps(steps):
    """
    Merge related steps that are broken across multiple lines or sentences.

    Parameters:
    - steps: List of steps extracted from the response.

    Returns:
    - merged_steps: List of steps with related content merged.
    """
    merged_steps = []
    current_step = ""

    for step in steps:
        # Check if a step appears to be continuation or related to previous step
        if step.startswith(('1.', '2.', '3.', 'Step', '```', '#', 'Next', 'To find', 'To compute', 'To evaluate', 'import')) or step.startswith("Let's"):
            # Start a new step if current is not empty
            if current_step:
                merged_steps.append(current_step.strip())
            current_step = step
        else:
            # Concatenate lines that appear to be continuations of previous step
            current_step += f" {step}"

    # Append the last accumulated step
    if current_step:
        merged_steps.append(current_step.strip())

    return merged_steps


In [50]:
#model_steps = extract_steps_nlp(response)
#print(model_steps)

model_extracted_steps = extract_steps_nlp(response)


for i, step in enumerate(model_extracted_steps, start=1):
    print(f"Extracted Step {i}: {step}")


Debug: Extracting steps...

Debug: Checking sentence - Find the rate of change of temperature with respect to x at the point (2, π) for the temperature distribution T(x, y) = 100e^(-0.1x) cos(0.5y).
Debug: Step detected - Find the rate of change of temperature with respect to x at the point (2, π) for the temperature distribution T(x, y) = 100e^(-0.1x) cos(0.5y).
Debug: Checking sentence - ### Solution: To find the rate of change of temperature with respect to \( x \) at the point \((2, \pi)\) for the temperature distribution \( T(x, y) = 100e^{-0.1x} \cos(0.5y) \), we need to compute the partial derivative of \( T(x, y) \) with respect to \( x \) and then evaluate it at the point \((2, \pi)\).
Debug: Step detected - ### Solution: To find the rate of change of temperature with respect to \( x \) at the point \((2, \pi)\) for the temperature distribution \( T(x, y) = 100e^{-0.1x} \cos(0.5y) \), we need to compute the partial derivative of \( T(x, y) \) with respect to \( x \) and then e

In [51]:
chatgpt_extracted_steps = extract_steps_nlp(chatgpt_response)

for i, step in enumerate(chatgpt_extracted_steps, start=1):
    print(f"Extracted Step {i}: {step}")


Debug: Extracting steps...

Debug: Checking sentence - The partial derivative of T(x, y) with respect to x is given by:

∂T/∂x =
Debug: Checking sentence - -10
Debug: Checking sentence - *
Debug: Checking sentence - exp(-0.1 * x) *
Debug: Checking sentence - cos(0.5 * y)
Debug: Checking sentence - At the point (2, π), we have:
Debug: Checking sentence - ∂T/∂x = -10 * exp(-0.2) *
Debug: Checking sentence - cos(0.5 * π)

= -10 * exp(-0.2) * 0
Debug: Checking sentence - (since cos(π/2) = 0)

= 0
Debug: No steps detected. Returning the original response.
Extracted Step 1: The partial derivative of T(x, y) with respect to x is given by:

∂T/∂x = -10 * exp(-0.1 * x) * cos(0.5 * y)

At the point (2, π), we have:

∂T/∂x = -10 * exp(-0.2) * cos(0.5 * π)

= -10 * exp(-0.2) * 0 (since cos(π/2) = 0)

= 0


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_cosine_similarity(step1, step2):
    """
    Compute the cosine similarity between two text steps.

    Parameters:
    - step1: First step as a string.
    - step2: Second step as a string.

    Returns:
    - similarity: Cosine similarity score.
    """
    vectorizer = TfidfVectorizer().fit_transform([step1, step2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1]

def find_best_matches(answer_key, model_steps, model_name='Model'):
    """
    Find the best matching model step for each answer key step based on cosine similarity.

    Parameters:
    - answer_key: List of steps from the answer key.
    - model_steps: List of steps from the model.

    Returns:
    - best_matches: List of best matching similarities for each answer key step.
    """
    best_matches = []
    for i, key_step in enumerate(answer_key):
        similarities = [compute_cosine_similarity(key_step, model_step) for model_step in model_steps]
        best_match_index = np.argmax(similarities)  # Find the index of the highest similarity
        best_match = model_steps[best_match_index]
        best_similarity = similarities[best_match_index]
        best_matches.append(best_similarity)  # Appe

        print(f"Answer Key Step {i + 1}: {key_step}")
        print(f"Best Match from {model_name}: Step {best_match_index + 1} - {best_match}")
        print(f"Cosine Similarity: {best_similarity:.4f}\n")


    return best_matches

def compute_weighted_similarity(similarities):
    """
    Compute the weighted average of cosine similarities.

    Parameters:
    - similarities: List of similarity scores.

    Returns:
    - weighted_similarity: Weighted average of the similarity scores.
    """
    weights = [1 / len(similarities)] * len(similarities)  # Equal weights for each similarity
    weighted_similarity = sum(w * sim for w, sim in zip(weights, similarities))
    return weighted_similarity



In [53]:
# Example input data
answer_key = ["Compute the partial derivative of T(x, y).", "Evaluate the partial derivative at (2, π)."]


In [54]:

# Step 1: Find the best matches for model steps against the answer key
model_best_matches = find_best_matches(answer_key, model_extracted_steps, model_name='Model')

# Step 2: Find the best matches for ChatGPT steps against the answer key
chatgpt_best_matches = find_best_matches(answer_key, chatgpt_extracted_steps, model_name='Chatgpt')

# Step 3: Compute weighted similarities
model_weighted_similarity = compute_weighted_similarity(model_best_matches)
chatgpt_weighted_similarity = compute_weighted_similarity(chatgpt_best_matches)

# Step 4: Compare the similarities
print(f"Model Weighted Similarity: {model_weighted_similarity:.4f}")
print(f"ChatGPT Weighted Similarity: {chatgpt_weighted_similarity:.4f}")

if model_weighted_similarity > chatgpt_weighted_similarity:
    print("Model's similarity to the answer key is greater than ChatGPT's.")
elif model_weighted_similarity < chatgpt_weighted_similarity:
    print("ChatGPT's similarity to the answer key is greater than the model's.")
else:
    print("Model and ChatGPT have equal similarity to the answer key.")


Answer Key Step 1: Compute the partial derivative of T(x, y).
Best Match from Model: Step 2 - ### Solution: To find the rate of change of temperature with respect to \( x \) at the point \((2, \pi)\) for the temperature distribution \( T(x, y) = 100e^{-0.1x} \cos(0.5y) \), we need to compute the partial derivative of \( T(x, y) \) with respect to \( x \) and then evaluate it at the point \((2, \pi)\). Here are the steps:

1. Compute the partial derivative of \( T(x, y) \) with respect to \( x \).
Cosine Similarity: 0.5001

Answer Key Step 2: Evaluate the partial derivative at (2, π).
Best Match from Model: Step 3 - 2. Evaluate the partial derivative at \( x = 2 \) and \( y = \pi \).
Cosine Similarity: 0.7474

Answer Key Step 1: Compute the partial derivative of T(x, y).
Best Match from Chatgpt: Step 1 - The partial derivative of T(x, y) with respect to x is given by:

∂T/∂x = -10 * exp(-0.1 * x) * cos(0.5 * y)

At the point (2, π), we have:

∂T/∂x = -10 * exp(-0.2) * cos(0.5 * π)

= -1