In [1]:
import numpy as np
import pandas as pd
import json
import champ_dataset
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
from torch import nn

In [2]:
# =================================Load the dataset==================================
# dataset is a champ_dataset.Dataset instance; 'v0' is the dataset used in the paper
dataset = champ_dataset.load('v0')

In [3]:
# =====================Get a random problem and read its content=====================
# get a random problem; dataset.problems is a dictionary
problem_id = random.choice(list(dataset.problems.keys()))
# shortcut for dataset.problems[problem_id]
problem = dataset[problem_id]
# problem identifier (which is equal to problem_id) and problem statement
print(f'{problem.identifier}: {problem.text}')

P_Inequality_19: For positive a, b, c, d, at most how many of the three inequalities a+b<c+d, (a+b)(c+d)<ab+cd, (a+b)cd<ab(c+d) can be true at the same time?


In [4]:
# iterate over the list of relevant concepts and hints
for ch_id in problem.ch_list:
    ch = dataset[ch_id]  # ch is either a champ_dataset.Concept instance or champ_dataset.Hint instance
    if isinstance(ch, champ_dataset.Concept):  # displaying a concept
        print(f'We have a concept: {ch.identifier}')
        print(f'Text: {ch.text}')  # content of the concept
        print(f'Category: {ch.category}')  # category of the concept
        if ch.name is not None:  # some concept has a name
            print(f'Name: {ch.name}')
        if ch.parent is not None:  # some concept has a parent concept (i.e., a more general version)
            print(f'Parent concept ({ch.parent.identifier}): {ch.parent.text}')
        print('-------------End of this concept-------------')
    else:  # displaying a hint
        print(f'We have a hint: {ch.identifier}')
        print(f'Text: {ch.text}')  # content of the hint
        print('--------------End of this hint---------------')

We have a hint: H_Inequality_19_1
Text: Find an assignment of a, b, c that makes two inequalities true.
--------------End of this hint---------------
We have a hint: H_Inequality_19_2
Text: Study whether all inequalities can be true by multiplying the first two together, and then the last two together.
--------------End of this hint---------------
We have a concept: C_(a+b)sq_4ab
Text: For real numbers a, b, (a+b)^2≥4ab.
Category: Inequality
Parent concept (C_sumdiff_sq): (x±y)^2=x^2±2xy+y^2.
-------------End of this concept-------------


In [5]:
print(f'Answer: {problem.answer}')  # final answer
print('Step-wise soluion:')
# problem.solution.steps is a list of champ_dataset.Step object
for idx, step in enumerate(problem.solution.steps):
    print(f'Step {idx}: {step.text}')  # content of the step
    # step.ch_idxs is the list of concepts and hints associated with this step
    # (by their index in problem.ch_list)
    if len(step.ch_idxs) == 0:
        print('  This step does not use any concepts or hints.')
    else:
        print('  This step uses the following concepts and hints: ' + \
            ', '.join([problem.ch_list[idx] for idx in step.ch_idxs]))

Answer: 2 inequalities
Step-wise soluion:
Step 0: We can see that the first two inequalities can be satisified with a=b=1, c=d=10, as 2<20 and 40<102.
  This step uses the following concepts and hints: H_Inequality_19_1
Step 1: Assume that all three inequalities are true.
  This step uses the following concepts and hints: H_Inequality_19_2
Step 2: Multiplying the first two inequalities, we get (a+b)^2<ab+cd.
  This step uses the following concepts and hints: H_Inequality_19_2
Step 3: Since (a+b)^2≥4ab, we have 4ab<ab+cd, or cd>3ab.
  This step uses the following concepts and hints: H_Inequality_19_2, C_(a+b)sq_4ab
Step 4: Multiplying the last two inequalities, we get (a+b)^2*cd<ab(ab+cd).
  This step uses the following concepts and hints: H_Inequality_19_2
Step 5: Since (a+b)^2≥4ab, we have 4abcd<abab+abcd, or ab>3cd.
  This step uses the following concepts and hints: H_Inequality_19_2, C_(a+b)sq_4ab
Step 6: The inequalities ab>3cd and cd>3ab cannot be true at the same time for positiv

In [6]:
# ======================Use PromptGenerator to generate prompts======================
generator = champ_dataset.PromptGenerator(dataset)  # the prompt generator is defined on a dataset
print(f'The prompt generator supports the following prompts: {generator.get_all_prompt_names()}')
# randomly select a prompt to generate
name = random.choice(generator.get_all_prompt_names())
print(f'Generating prompt: {name}')
sys_prompt, user_inputs, imputed_outputs = generator.construct_prompt(name, problem)
print('-----------------------')
print(f'System prompt: \n{sys_prompt}')
print('-----------------------')
for idx, msg in enumerate(user_inputs):
    print(f'User input {idx+1}: \n{msg}')
    print('-----------------------')
for idx, msg in enumerate(imputed_outputs):
    print(f'Imputed output {idx+1}: \n{msg}')
    print('-----------------------')

The prompt generator supports the following prompts: ['0-Shot', '5-Shot', '1/3 Soln', '2/3 Soln', 'No C w/o H', 'No C w/ H', 'Direct C w/o H', 'Direct C w/ H', 'Name C w/o H', 'Name C w/ H', 'Example C w/o H', 'Example C w/ H', 'Root C w/o H', 'Root C w/ H', 'Problem C w/o H', 'Problem C w/ H', 'Misleading C w/o H', 'Misleading C w/ H']
Generating prompt: Name C w/o H
-----------------------
System prompt: 
You are an expert on mathematics.
-----------------------
User input 1: 
Solve the following problem. Make sure to show your work before giving the final answer.

For positive a, b, c, d, at most how many of the three inequalities a+b<c+d, (a+b)(c+d)<ab+cd, (a+b)cd<ab(c+d) can be true at the same time?

You may find the following information useful:

1. For real numbers a, b, (a+b)^2≥4ab.
-----------------------


In [7]:
# the following code is (currently) only supported on the 'v0' version of the dataset

# =======================Read the first wrong step annotation========================
# problem.fws_annotations is a dictionary; note the key format as '{model}|{prompt}'
print(f'We have the following FWS annotations: {list(problem.fws_annotations.keys())}')
# get a key-value pair from the dictionary
key, annotation = next(iter(problem.fws_annotations.items()))
print(f'Displaying FWS annotation: {key}')
print(f'The model response is generated by {annotation.author}')
# annotation.text is the full model-generated solution
print(f'Beginning of model response: {annotation.text[:200]}')
if annotation.start_idx is None:
    print('The model-generated solution is fully correct')
else:
    # start_idx inclusive, end_idx exclusive
    print(f'The error happens at characters {annotation.start_idx}-{annotation.end_idx}')
    # shortcut for annotation.text[annotation.start_idx : annotation.end_idx]
    print(f'The text span is: {annotation.wrong_step()}')

We have the following FWS annotations: ['GPT-4 Turbo|No C w/o H', 'GPT-4 Turbo|Direct C w/ H']
Displaying FWS annotation: GPT-4 Turbo|No C w/o H
The model response is generated by GPT-4 Turbo
Beginning of model response: Let's analyze each inequality one by one and see if we can find a relationship between them.

1. \( a + b < c + d \)

This inequality simply states that the sum of \( a \) and \( b \) is less than the
The error happens at characters 500-562
The text span is: Grouping terms, we get:

\( c(a + b - d) + a(d + b - c) < 0 \)


In [8]:
# ========================Read all model-generated solutions=========================
# problem.conversations is a dictionary; note the key format as '{model}|{prompt}'
print(f'We have the following conversations: {list(problem.conversations.keys())}')
key, conversation = next(iter(problem.conversations.items()))  # get a key-value pair from the dictionary
print(f'Displaying conversation: {key}')
for message in conversation.messages:  # conversation.messages is a list of champ_dataset.Message instances
    # one of 'System', 'User', 'Imputation' and 'Model.*' (e.g., 'Model.GPT-3.5')
    print(f'Role: {message.role}')
    # verbatim content of the message
    print(f'Text: {message.text}')
    # any possible error with the model-generation
    print(f'Error: {message.error}')
    print('-----------------------')

We have the following conversations: ['GPT-3.5|0-Shot', 'GPT-3.5|5-Shot', 'GPT-3.5|1/3 Soln', 'GPT-3.5|2/3 Soln', 'GPT-3.5|No C w/o H', 'GPT-3.5|No C w/ H', 'GPT-3.5|Direct C w/o H', 'GPT-3.5|Direct C w/ H', 'GPT-3.5|Root C w/o H', 'GPT-3.5|Root C w/ H', 'GPT-3.5|Name C w/o H', 'GPT-3.5|Name C w/ H', 'GPT-3.5|Example C w/o H', 'GPT-3.5|Example C w/ H', 'GPT-3.5|Problem C w/o H', 'GPT-3.5|Problem C w/ H', 'GPT-3.5|Misleading C w/o H', 'GPT-3.5|Misleading C w/ H', 'GPT-4|0-Shot', 'GPT-4|5-Shot', 'GPT-4|1/3 Soln', 'GPT-4|2/3 Soln', 'GPT-4|No C w/o H', 'GPT-4|No C w/ H', 'GPT-4|Direct C w/o H', 'GPT-4|Direct C w/ H', 'GPT-4|Root C w/o H', 'GPT-4|Root C w/ H', 'GPT-4|Name C w/o H', 'GPT-4|Name C w/ H', 'GPT-4|Example C w/o H', 'GPT-4|Example C w/ H', 'GPT-4|Problem C w/o H', 'GPT-4|Problem C w/ H', 'GPT-4|Misleading C w/o H', 'GPT-4|Misleading C w/ H', 'GPT-4 Turbo|0-Shot', 'GPT-4 Turbo|5-Shot', 'GPT-4 Turbo|1/3 Soln', 'GPT-4 Turbo|2/3 Soln', 'GPT-4 Turbo|No C w/o H', 'GPT-4 Turbo|No C w/ H

# Prepare data

In [9]:
list_math_type = ['Combinatorics','Inequality','Number-Theory','Polynomial','Sequence']

In [10]:
PROMT = "\nRespond in a single word. In this five catagories: 'Combinatorics','Inequality','Number-Theory','Polynomial','Sequence'.What is the catagory for this math problem?\n"


In [11]:
problem_type_list = []
problem_text_list = []

for i in dataset.problems.keys():
    problem = dataset[i]
    current_promt = problem.text.replace('?', '') + PROMT
    current_promt = problem.text
    problem_text_list.append(current_promt)
    for current_type in list_math_type:
        if current_type in problem.identifier:
            problem_type_list.append(current_type)
            break


print(len(problem_type_list), '  ', len(problem_text_list))

270    270


In [12]:
current_promt

'Let a sequence be the defined as a_1=a_2=1, and a_n=(a_(n-1)^2+2)/a_(n-2). How many values in a_1, a_2, ..., a_100 are integers?'

In [13]:
encoder = OneHotEncoder(sparse_output=False)
problem_type_list = encoder.fit_transform(np.array(problem_type_list).reshape(-1,1))
#problem_text_list = np.array(problem_text_list)

In [14]:
from sklearn.model_selection import train_test_split


# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(problem_text_list, problem_type_list, test_size=0.2, random_state=42)

print("Train set size:", len(X_train))
print("Test set size:", len(X_test))

Train set size: 216
Test set size: 54


In [15]:
#load model
from transformers import AutoTokenizer, AutoModelForCausalLM, FlaxLlamaModel

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello, how are you? I hope this email finds you well. I am writing to inquire


In [17]:
configuration = model.config
configuration

LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 128256
}

In [18]:
# Freeze all layers of the model
for param in model.parameters():
    param.requires_grad = False

# Fine-tune only the last transformer block (layer)
# for param in model.transformer.h[-1].parameters():
#     param.requires_grad = True

In [19]:
# Define the classification head
class ClassificationHead(nn.Module):
    def __init__(self, input_size, num_classes):
        super(ClassificationHead, self).__init__()
        self.fc = nn.Linear(input_size, num_classes)

    def forward(self, hidden_states):
        # Use the hidden state of the first token (usually the [CLS] token)
        cls_output = hidden_states[:, 0, :]
        return self.fc(cls_output)


# Number of classes in the classification task
num_classes = 5

# Add classification head on top of the Llama model
classification_head = ClassificationHead(input_size=model.config.hidden_size, num_classes=num_classes)

In [20]:
# Combine Llama model and classification head
class LlamaForClassification(nn.Module):
    def __init__(self, llama_model, classification_head):
        super(LlamaForClassification, self).__init__()
        self.llama_model = llama_model
        self.classification_head = classification_head

    def forward(self, input_ids, attention_mask=None):
        # Get the hidden states from the Llama model
        outputs = self.llama_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]  # Last hidden state
        
        # Pass the hidden states to the classification head
        logits = self.classification_head(hidden_states)
        return logits

# Create the combined model
model_for_classification = LlamaForClassification(model, classification_head)

In [21]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Convert y_train and y_test to tensors
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Define the optimizer and loss function
optimizer = AdamW(model_for_classification.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

In [22]:
tokenizer.pad_token = tokenizer.eos_token 

In [23]:
# Tokenize the train and test data
train_encodings = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt")

# Get input ids and attention masks
train_input_ids = train_encodings['input_ids']
train_attention_mask = train_encodings['attention_mask']

test_input_ids = test_encodings['input_ids']
test_attention_mask = test_encodings['attention_mask']

In [24]:
from torch.utils.data import DataLoader, TensorDataset

# Create TensorDatasets for training and test sets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, y_train)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, y_test)

# Set batch size
batch_size = 32

# Create DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [25]:
# Training loop
model_for_classification.train()
epochs = 5

for epoch in range(epochs):
    total_loss = 0

    for batch in train_loader:
        print('batch')
        # Unpack batch
        batch_input_ids, batch_attention_mask, batch_labels = batch

        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass
        logits = model_for_classification(batch_input_ids, batch_attention_mask)
        
        # Compute loss
        loss = loss_fn(logits, batch_labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

batch
batch
batch
batch
batch
batch
batch
Epoch 1, Loss: 1.638376304081508
batch
batch
batch
batch
batch
batch
batch
Epoch 2, Loss: 1.6477112770080566
batch
batch
batch
batch
batch
batch
batch
Epoch 3, Loss: 1.6039048773901803
batch
batch
batch
batch
batch
batch
batch
Epoch 4, Loss: 1.6411420617784773
batch
batch
batch
batch
batch
batch
batch
Epoch 5, Loss: 1.5887463773999895


In [26]:
# Evaluate on the test set
model_for_classification.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_loader:
        print('batch')
        batch_input_ids, batch_attention_mask, batch_labels = batch
        logits = model_for_classification(batch_input_ids, batch_attention_mask)
        predictions = torch.argmax(logits, dim=1)
        all_predictions.append(predictions)

# Convert predictions to a single tensor
all_predictions = torch.cat(all_predictions)

print(f"Test Predictions: {all_predictions}")

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


batch
batch
Test Predictions: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])


In [27]:
torch.argmax(y_test, dim=1)

tensor([0, 2, 1, 2, 3, 2, 3, 1, 2, 4, 3, 4, 4, 1, 2, 3, 4, 2, 2, 1, 4, 4, 1, 1,
        3, 0, 0, 3, 2, 0, 2, 1, 2, 4, 4, 0, 2, 1, 2, 2, 0, 2, 4, 0, 1, 3, 2, 3,
        2, 3, 0, 2, 4, 4])