In [22]:
# Enable IPython's autoreload extension so that edits to imported local modules
# (e.g. prompts.py) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
from pprint import pprint

import pandas as pd
from datasets import load_dataset
import cohere
from dotenv import load_dotenv



import prompts


In [3]:
# Pull variables from a local .env file into the environment, then build the
# Cohere v2 client from the COHERE_API_KEY value found there.
load_dotenv()
key = os.environ.get("COHERE_API_KEY")
co = cohere.ClientV2(key)
model_name = "command-r-plus-08-2024"

# Smoke test: send a single user message and print the raw response object
print(
    co.chat(
        model=model_name,
        messages=[{"role": "user", "content": "Hello, world!"}],
    )
)



In [4]:
# Read only the first 5 rows of the cn k12 subset and keep just the
# problem/solution columns; the bare last expression displays the frame.
file_path = "datasets/cn_k12_math_problems.csv"
data = pd.read_csv(file_path, nrows=5).loc[:, ["problem", "solution"]]
data

Unnamed: 0,problem,solution
0,Given the functions $f(x) = \log_a(1+x)$ and $...,1. Since $f(x) = \log_a(1+x)$ and $g(x) = \log...
1,"In $\triangle ABC$, the lengths of the sides o...","Since $\cos \frac{C}{2} = \frac{\sqrt{5}}{3}$,..."
2,Given that $P$ is any point on the circle $C$:...,The distance $d$ between the center of the cir...
3,Factorize: $x^{3}-6x^{2}+9x=\_\_\_\_\_\_.$,To factorize the given expression $x^{3}-6x^{2...
4,Given that the sum of the first $n$ terms of a...,"Since $S\_n=2^{n}a\_n-1$,\nwe have $S_{n-1}=2^..."


In [5]:
# Send the first row's solution through the STEPIFY prompt as a single user message
step_response = co.chat(
    model=model_name,
    messages=[
        dict(
            role="user",
            content=prompts.STEPIFY_PROMPT.format(solution=data["solution"].iloc[0]),
        )
    ],
)


In [25]:
# Take the text of the first content item of the chat response and show it
steps = step_response.message.content[0].text
print(steps)

<step>
Given the functions $f(x) = \log_a(1+x)$ and $g(x) = \log_a(1-x)$, where $a>0$ and $a \neq 1$, we want to find the domain of the function $f(x) - g(x)$.
</step>

<step>
To ensure the function $f(x) - g(x)$ is defined, we need to consider the domain of the individual logarithmic functions. The domain of $\log_a(1+x)$ is $1+x > 0$, and the domain of $\log_a(1-x)$ is $1-x > 0$.
</step>

<step>
Solving the system of inequalities $1+x > 0$ and $1-x > 0$, we find that $-1 < x < 1$. Therefore, the domain of the function $f(x) - g(x)$ is $(-1, 1)$.
</step>

<step>
Now, let's determine if $f(x) - g(x)$ is an odd function. Since the domain $(-1, 1)$ is symmetric about the origin, we can define a new function $F(x) = f(x) - g(x)$. We then evaluate $F(-x) = f(-x) - g(-x) = \log_a(1-x) - \log_a(1+x)$.
</step>

<step>
Simplifying further, we get $F(-x) = -[\log_a(1+x) - \log_a(1-x)] = -F(x)$. This shows that $F(x)$ is an odd function, and consequently, $f(x) - g(x)$ is also an odd function.
<

In [46]:
# Force a re-import of the local prompts module so that edits to prompts.py take effect.
# NOTE(review): the first cell enables %autoreload 2, which may already cover this --
# confirm, and if so drop this cell (the import also belongs with the top-level imports).
import importlib
importlib.reload(prompts)

<module 'prompts' from '/home/sam/code/cohere-reasoning/prompts.py'>

In [47]:
# Send the stepified solution through the PERTURB prompt as a single user message
perturbed_truncated_response = co.chat(
    model=model_name,
    messages=[
        dict(role="user", content=prompts.PERTURB_PROMPT.format(steps=steps)),
    ],
)

In [49]:
# Show the unperturbed steps again (presumably for comparison with the perturbed output)
print(steps)

<step>
Given the functions $f(x) = \log_a(1+x)$ and $g(x) = \log_a(1-x)$, where $a>0$ and $a \neq 1$, we want to find the domain of the function $f(x) - g(x)$.
</step>

<step>
To ensure the function $f(x) - g(x)$ is defined, we need to consider the domain of the individual logarithmic functions. The domain of $\log_a(1+x)$ is $1+x > 0$, and the domain of $\log_a(1-x)$ is $1-x > 0$.
</step>

<step>
Solving the system of inequalities $1+x > 0$ and $1-x > 0$, we find that $-1 < x < 1$. Therefore, the domain of the function $f(x) - g(x)$ is $(-1, 1)$.
</step>

<step>
Now, let's determine if $f(x) - g(x)$ is an odd function. Since the domain $(-1, 1)$ is symmetric about the origin, we can define a new function $F(x) = f(x) - g(x)$. We then evaluate $F(-x) = f(-x) - g(-x) = \log_a(1-x) - \log_a(1+x)$.
</step>

<step>
Simplifying further, we get $F(-x) = -[\log_a(1+x) - \log_a(1-x)] = -F(x)$. This shows that $F(x)$ is an odd function, and consequently, $f(x) - g(x)$ is also an odd function.
<

In [48]:
# Take the text of the first content item of the perturbation response and show it
perturbed_truncated = perturbed_truncated_response.message.content[0].text
print(perturbed_truncated)

<perturbed_chain>
<step>
Given the functions $f(x) = \log_a(1+x)$ and $g(x) = \log_a(1-x)$, where $a>0$ and $a \neq 1$, we want to find the domain of the function $f(x) - g(x)$.
</step>

<step>
To ensure the function $f(x) - g(x)$ is defined, we need to consider the domain of the individual logarithmic functions. The domain of $\log_a(1+x)$ is $1+x > 0$, and the domain of $\log_a(1-x)$ is $1-x > 0$.
</step>

<step>
Solving the system of inequalities $1+x > 0$ and $1-x > 0$, we find that $x < 1$. Therefore, the domain of the function $f(x) - g(x)$ is $(-\infty, 1)$.
</step>
</perturbed_chain>

<perturbation_info>
Selected Step: 3
Perturbation Type: Arithmetic sign error
Description: I changed the inequality sign in the solution to $x < 1$ instead of $-1 < x < 1$.
</perturbation_info>


In [50]:
def stepify(solution: str) -> str:
    """Run a solution through the STEPIFY prompt and return the model's text.

    Args:
        solution: Solution text substituted into ``prompts.STEPIFY_PROMPT``.

    Returns:
        The ``text`` of the first content item in the chat response, obtained
        from the module-level ``co`` client using ``model_name``.
    """
    user_message = {
        "role": "user",
        "content": prompts.STEPIFY_PROMPT.format(solution=solution),
    }
    reply = co.chat(model=model_name, messages=[user_message])
    return reply.message.content[0].text


In [51]:
def perturb(steps: str) -> str:
    """Run stepified text through the first PERTURB shot and return the model's text.

    Args:
        steps: Step text substituted into ``prompts.PERTURB_SHOTS[0]``.

    Returns:
        The ``text`` of the first content item in the chat response, obtained
        from the module-level ``co`` client using ``model_name``.
    """
    # NOTE(review): the exploratory cell earlier in this notebook formats
    # prompts.PERTURB_PROMPT rather than PERTURB_SHOTS[0] -- confirm which
    # template is the intended one.
    user_message = {
        "role": "user",
        "content": prompts.PERTURB_SHOTS[0].format(steps=steps),
    }
    reply = co.chat(model=model_name, messages=[user_message])
    return reply.message.content[0].text

In [52]:
def _extract_info_field(info: str, label: str) -> str:
    """Return the stripped text that follows ``label:`` in ``info``, or "" if absent.

    The value runs until the next line that starts with one of the known field
    labels, or until the end of ``info``. This keeps multi-line values intact and
    stops an empty field from swallowing the field on the following line.
    """
    pattern = (
        re.escape(label)
        + r":(.*?)(?=^[ \t]*(?:Selected Step|Perturbation Type|Description):|\Z)"
    )
    found = re.search(pattern, info, re.DOTALL | re.MULTILINE)
    return found.group(1).strip() if found else ""


def postprocess(output: str) -> dict:
    """Parse the raw perturbation output into a flat record.

    Args:
        output: Model text expected to contain a ``<perturbed_chain>`` section
            (with ``<step>`` blocks) and a ``<perturbation_info>`` section with
            ``Selected Step:``, ``Perturbation Type:`` and ``Description:`` fields.

    Returns:
        A dict with keys:
            ``steps``: chain text with ``<step>`` tags removed and all whitespace
                runs collapsed to single spaces ("" if the chain is missing).
            ``perturbation_step``: the selected step number as ``int``, or ``None``
                if it is missing.
            ``perturbation_type``: the perturbation type text ("" if missing).
            ``perturbation_trace``: the description text ("" if missing).
    """
    # Extract the steps from the perturbed chain and flatten them to one line
    chain_match = re.search(r'<perturbed_chain>(.*?)</perturbed_chain>', output, re.DOTALL)
    steps = chain_match.group(1) if chain_match else ""
    steps = re.sub(r'</?step>', '', steps)
    steps = re.sub(r'\s+', ' ', steps).strip()  # newlines/tabs/multiple spaces -> one space

    # Extract the perturbation info section
    info_match = re.search(r'<perturbation_info>(.*?)</perturbation_info>', output, re.DOTALL)
    perturbation_info = info_match.group(1).strip() if info_match else ""

    # Extract the selected step number
    step_match = re.search(r'Selected Step:\s*(\d+)', perturbation_info)
    perturbation_step = int(step_match.group(1)) if step_match else None

    return {
        "steps": steps,
        "perturbation_step": perturbation_step,
        "perturbation_type": _extract_info_field(perturbation_info, "Perturbation Type"),
        "perturbation_trace": _extract_info_field(perturbation_info, "Description"),
    }


NameError: name 'Any' is not defined

In [None]:
# Run the stepify -> perturb -> postprocess pipeline over the first 100 rows
# and print each original solution next to its parsed perturbation record.
df_100 = pd.read_csv("datasets/cn_k12_math_problems.csv", nrows=100)
df_100 = df_100[["problem", "solution"]]

for index, row in df_100.iterrows():
    steps = stepify(row["solution"])
    perturbed = perturb(steps)
    print("~~~~~~~~~~~~~~")
    postprocessed = postprocess(perturbed)
    # pprint's second positional parameter is the output stream, not a second
    # object to print, so each object gets its own call.
    pprint(row["solution"])
    pprint(postprocessed)
    print("~~~~~~~~~~~~~~")
    print("~~~~~~~~~~~~~~")



