In [1]:
import os
os.environ['HF_HOME'] = '/run/cache/'

# Preparing model

In [9]:
!pip install --upgrade deepspeed
!pip install --upgrade pydantic
!pip install --upgrade transformers

Collecting deepspeed
  Downloading deepspeed-0.14.4.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting nvidia-ml-py (from deepspeed)
  Obtaining dependency information for nvidia-ml-py from https://files.pythonhosted.org/packages/50/13/fcc6b604dea9782d69e675b1481af16e9a02c44d1d43819af209b0a1e6bb/nvidia_ml_py-12.555.43-py3-none-any.whl.metadata
  Downloading nvidia_ml_py-12.555.43-py3-none-any.whl.metadata (8.6 kB)
Downloading nvidia_ml_py-12.555.43-py3-none-any.whl (39 kB)
Building wheels for collected packages: deepspeed
  Building wheel for deepspeed (setup.py) ... [?25ldone
[?25h  Created wheel for deepspeed: filename=deepspeed-0.14.4-py3-none-any.whl size=1445516 sha256=9262491ddc464ea52b31080c05c99df36b82377c55912e569a8a8a791f2fe41e
  Stored in directory: /home/jovyan/.cache/pip/wheels/68/2c/db/7164d811bdcb8a2288343

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login, snapshot_download
from pathlib import Path


In [4]:
mistral_models_path = Path.home().joinpath('mistral_models', 'Mathstral-7b-v0.1')
mistral_models_path.mkdir(parents=True, exist_ok=True)


In [5]:
login(token=os.environ['HF_TOKEN'])
snapshot_download(repo_id="mistralai/Mathstral-7b-v0.1", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /run/cache/token
Login successful


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

'/home/jovyan/mistral_models/Mathstral-7b-v0.1'

# Проверяем модель

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

checkpoint = "mistralai/Mathstral-7b-v0.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)


[2024-08-13 01:51:53,435] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/saturncloud/envs/saturn/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
prompt = [{"role": "user", "content": "What are the roots of unity?"}]
tokenized_prompt = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_dict=True, return_tensors="pt").to(model.device)

out = model.generate(**tokenized_prompt, max_new_tokens=512)
tokenizer.decode(out[0])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s>[INST] What are the roots of unity?[/INST] The roots of unity are the complex numbers that satisfy the equation $z^n = 1$, where $n$ is a positive integer. These roots are evenly spaced around the unit circle in the complex plane, and they have a variety of interesting properties and applications in mathematics and science.</s>'

# Оформляем в функцию

In [9]:
def solve_math_problem(tokenizer, model, problem):
    prompt = [
        {"role": "system", "content": "You are a helpful math assistant. Solve the given problem step by step, showing your reasoning."},
        {"role": "user", "content": f"Solve this math problem: {problem}"}
    ]

    tokenized_prompt = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_dict=True, return_tensors="pt").to(model.device)

    output = model.generate(**tokenized_prompt, max_new_tokens=512)
    full_response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Удаление промпта и исходного вопроса из ответа
    response_parts = full_response.split("Solve this math problem:")
    if len(response_parts) > 1:
        clean_response = response_parts[1].strip()
    else:
        clean_response = "Unable to extract the solution from the model's response."

    return clean_response

In [10]:
problem = "Find the value of the expression $\dfrac{2\strut^{-5} \cdot 2\strut^{9} }{2\strut^{-4} } $."

In [11]:
result = solve_math_problem(tokenizer, model, problem)
print("Solution:")
print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Solution:
Find the value of the expression $\dfrac{2\strut^{-5} \cdot 2\strut^{9} }{2\strut^{-4} } $. Sure, let's solve this step by step:

1. First, let's simplify the numerator of the fraction. The expression $2^{-5} \cdot 2^{9}$ can be simplified using the rule of exponents that states $a^m \cdot a^n = a^{m+n}$. So, $2^{-5} \cdot 2^{9} = 2^{-5+9} = 2^{4}$.

2. Now, let's simplify the denominator of the fraction. The expression $2^{-4}$ is already simplified.

3. So, the original expression $\dfrac{2\strut^{-5} \cdot 2\strut^{9} }{2\strut^{-4} }$ simplifies to $\dfrac{2^{4}}{2^{-4}}$.

4. To simplify this further, we can use the rule of exponents that states $\dfrac{a^m}{a^n} = a^{m-n}$. So, $\dfrac{2^{4}}{2^{-4}} = 2^{4-(-4)} = 2^{4+4} = 2^{8}$.

5. Finally, we calculate $2^{8}$ to get the final answer.

\boxed{256}


# Решения из трейна

In [12]:
from tqdm import tqdm
import pandas as pd

In [13]:
train = pd.read_csv('https://github.com/DataTalksClub/llm-zoomcamp/raw/main/cohorts/2024/competition/data/train.csv')

In [14]:
train10 = train.iloc[0:10,]

In [15]:
train10

Unnamed: 0,problem_id,problem_text,answer
0,2374,Find the value of the expression $\dfrac{17}{5...,1.6
1,4723,"In a company of 30 people, 25 use the social n...",24
2,7135,The number of road traffic accidents (RTAs) in...,32
3,5814,Find the value of the expression $\dfrac{2\str...,256
4,9237,A traveler from Moscow wants to visit four cit...,53
5,10703,"At the end of the term, Vovochka wrote down al...",3
6,9621,Find a four-digit number that is 11 times smal...,27000; 64000
7,4318,Find the value of the expression $\dfrac{\sqrt...,2
8,7549,"The harmonic mean of three numbers $a$, $b$, a...",0.1
9,4405,Find \(\tan x\) if \(\sin x = \dfrac{6}{\sqrt{...,1.2


In [18]:
def process_dataframe(df):
    # Создаем новую колонку для ответов модели
    df['model_answer'] = ''

    # Используем tqdm для создания прогресс-бара
    for index in tqdm(df.index, desc="Processing questions"):
        question = df.loc[index, 'problem_text']
        answer = solve_math_problem(tokenizer, model, question)
        df.loc[index, 'model_answer'] = answer

    return df

In [19]:
process_dataframe(train10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model_answer'] = ''
Processing questions:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:  10%|█         | 1/10 [00:19<02:51, 19.11s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:  20%|██        | 2/10 [00:54<03:49, 28.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:  30%|███       | 3/10 [01:11<02:44, 23.44s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:  40%|████      | 4/10 [01:30<02:10, 21.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:  50%|█████     | 

Unnamed: 0,problem_id,problem_text,answer,model_answer
0,2374,Find the value of the expression $\dfrac{17}{5...,1.6,Find the value of the expression $\dfrac{17}{5...
1,4723,"In a company of 30 people, 25 use the social n...",24,"In a company of 30 people, 25 use the social n..."
2,7135,The number of road traffic accidents (RTAs) in...,32,The number of road traffic accidents (RTAs) in...
3,5814,Find the value of the expression $\dfrac{2\str...,256,Find the value of the expression $\dfrac{2\str...
4,9237,A traveler from Moscow wants to visit four cit...,53,A traveler from Moscow wants to visit four cit...
5,10703,"At the end of the term, Vovochka wrote down al...",3,"At the end of the term, Vovochka wrote down al..."
6,9621,Find a four-digit number that is 11 times smal...,27000; 64000,Find a four-digit number that is 11 times smal...
7,4318,Find the value of the expression $\dfrac{\sqrt...,2,Find the value of the expression $\dfrac{\sqrt...
8,7549,"The harmonic mean of three numbers $a$, $b$, a...",0.1,"The harmonic mean of three numbers $a$, $b$, a..."
9,4405,Find \(\tan x\) if \(\sin x = \dfrac{6}{\sqrt{...,1.2,Find \(\tan x\) if \(\sin x = \dfrac{6}{\sqrt{...


In [None]:
train10

# Достаем ответ с помощью HAIKU

In [28]:
!pip install anthropic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting anthropic
  Obtaining dependency information for anthropic from https://files.pythonhosted.org/packages/8a/0c/2ba7c72b2f7176a5583f76b72df737cd7b2d842301afe63a63abcfb2f6f8/anthropic-0.33.1-py3-none-any.whl.metadata
  Downloading anthropic-0.33.1-py3-none-any.whl.metadata (18 kB)
Collecting distro<2,>=1.7.0 (from anthropic)
  Obtaining dependency information for distro<2,>=1.7.0 from https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl.metadata
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from anthropic)
  Obtaining dependency information for jiter<1,>=0.4.0 from https://files.pythonhosted.org/packages/c1/62/16ca89739acd12308a7d1f07af81edac2fecd2f813f65fc6d12c53afa65a/jiter-0.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading jiter-0.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downlo

In [40]:
import anthropic
import re

In [51]:
def extract_math_answer(text):
    client = anthropic.Anthropic()
    system = """
    You are an AI assistant specialized in extracting numerical answers from mathematical solutions.
    Your task is to analyze the given mathematical solution and return only the final numerical answer.
    If the answer is in a LaTeX \boxed{} environment, extract the content inside the box.
    Do not include any explanations, just return the number. Only number, not text like: The final answer is ...
    Use only number and symbols . (dot), decimal answer (no fractions)
    """

    user_text = f"Extract the final numerical answer from this mathematical solution:\n\n{text}"

    message = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=100,
        temperature=0,
        system=system,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_text
                    }
                ]
            }
        ]
    )

    answer = message.content[0].text.strip()
    
    # If the answer is still in LaTeX format, extract the number
    latex_match = re.search(r'\\boxed{(\d+)}', answer)
    if latex_match:
        answer = latex_match.group(1)

    return answer

In [52]:
def process_dataframe_2(df):
    # Создаем новую колонку для ответов модели
    df['parsed_answer'] = ''

    # Используем tqdm для создания прогресс-бара
    for index in tqdm(df.index, desc="Processing questions"):
        model_answer = df.loc[index, 'model_answer']
        df.loc[index, 'parsed_answer'] = extract_math_answer(model_answer)

    return df

In [53]:
process_dataframe_2(train10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['parsed_answer'] = ''
Processing questions: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


Unnamed: 0,problem_id,problem_text,answer,model_answer,parsed_answer
0,2374,Find the value of the expression $\dfrac{17}{5...,1.6,Find the value of the expression $\dfrac{17}{5...,1.6
1,4723,"In a company of 30 people, 25 use the social n...",24,"In a company of 30 people, 25 use the social n...",24.0
2,7135,The number of road traffic accidents (RTAs) in...,32,The number of road traffic accidents (RTAs) in...,32.0
3,5814,Find the value of the expression $\dfrac{2\str...,256,Find the value of the expression $\dfrac{2\str...,256.0
4,9237,A traveler from Moscow wants to visit four cit...,53,A traveler from Moscow wants to visit four cit...,8950.0
5,10703,"At the end of the term, Vovochka wrote down al...",3,"At the end of the term, Vovochka wrote down al...",4.0
6,9621,Find a four-digit number that is 11 times smal...,27000; 64000,Find a four-digit number that is 11 times smal...,33.0
7,4318,Find the value of the expression $\dfrac{\sqrt...,2,Find the value of the expression $\dfrac{\sqrt...,2.0
8,7549,"The harmonic mean of three numbers $a$, $b$, a...",0.1,"The harmonic mean of three numbers $a$, $b$, a...",0.1
9,4405,Find \(\tan x\) if \(\sin x = \dfrac{6}{\sqrt{...,1.2,Find \(\tan x\) if \(\sin x = \dfrac{6}{\sqrt{...,1.2


# Решаем TEST задачи

In [55]:
test = pd.read_csv('https://github.com/DataTalksClub/llm-zoomcamp/raw/main/cohorts/2024/competition/data/test.csv')

In [57]:
process_dataframe(test)
process_dataframe_2(test)

Processing questions:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   1%|          | 1/100 [00:06<10:12,  6.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   2%|▏         | 2/100 [00:22<19:28, 11.92s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   3%|▎         | 3/100 [00:31<17:11, 10.63s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   4%|▍         | 4/100 [00:51<23:05, 14.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   5%|▌         | 5/100 [01:02<21:02, 13.29s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   6%|▌         | 6/100 [01:16<20:59, 13.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing questions:   7%|▋         | 7/100 [01:

Unnamed: 0,problem_id,problem_text,model_answer,parsed_answer
0,11919,Find the value of the expression \(4.8 \cdot 2...,Find the value of the expression \(4.8 \cdot 2...,12
1,8513,The plane's navigation system informs the pass...,The plane's navigation system informs the pass...,11245
2,7887,The volume of a rectangular parallelepiped is ...,The volume of a rectangular parallelepiped is ...,4
3,5272,Find the root of the equation: $\left(\dfrac{1...,Find the root of the equation: $\left(\dfrac{1...,6
4,8295,"At the school, there are two-person camping te...","At the school, there are two-person camping te...",13
...,...,...,...,...
95,3519,A tourist is selecting tours. Information abou...,A tourist is selecting tours. Information abou...,650
96,7934,The area of a triangle can be calculated using...,The area of a triangle can be calculated using...,12
97,9390,A construction contractor plans to buy 20 tons...,A construction contractor plans to buy 20 tons...,256000 rubles
98,7137,The number of road traffic accidents (RTAs) in...,The number of road traffic accidents (RTAs) in...,22


In [61]:
test

Unnamed: 0_level_0,problem_text,model_answer,parsed_answer
problem_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11919,Find the value of the expression \(4.8 \cdot 2...,Find the value of the expression \(4.8 \cdot 2...,12
8513,The plane's navigation system informs the pass...,The plane's navigation system informs the pass...,11245
7887,The volume of a rectangular parallelepiped is ...,The volume of a rectangular parallelepiped is ...,4
5272,Find the root of the equation: $\left(\dfrac{1...,Find the root of the equation: $\left(\dfrac{1...,6
8295,"At the school, there are two-person camping te...","At the school, there are two-person camping te...",13
...,...,...,...
3519,A tourist is selecting tours. Information abou...,A tourist is selecting tours. Information abou...,650
7934,The area of a triangle can be calculated using...,The area of a triangle can be calculated using...,12
9390,A construction contractor plans to buy 20 tons...,A construction contractor plans to buy 20 tons...,256000 rubles
7137,The number of road traffic accidents (RTAs) in...,The number of road traffic accidents (RTAs) in...,22


# Make sumbit

In [58]:
def make_submit(answers: pd.Series, filename="data/my_submit.csv"):
    answer_df = answers.reset_index()
    answer_df.columns = ["problem_id", "answer"]
    answer_df.to_csv(filename, index=False)
    

In [59]:
test.set_index('problem_id', inplace=True)

In [62]:
make_submit(test.parsed_answer, filename="my_submit_7_mathstral.csv")

In [68]:
def clean_numeric_column(df, column_name):
    def clean_value(value):
        if pd.isna(value):
            return value
        
        # Convert to string if not already
        value = str(value)
        
        # Remove all non-digit characters except for comma and dot
        cleaned = re.sub(r'[^\d,.]', '', value)
        
        # Replace comma with dot for decimal separator
        cleaned = cleaned.replace(',', '.')
        
        # Try to convert to float
        try:
            return float(cleaned)
        except ValueError:
            return np.nan  # Return NaN if conversion fails

    # Apply the cleaning function to the specified column
    df[column_name] = df[column_name].apply(clean_value)
    
    return df

In [66]:
import numpy as np

In [70]:
clean_numeric_column(test, "parsed_answer")
test

Unnamed: 0_level_0,problem_text,model_answer,parsed_answer
problem_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11919,Find the value of the expression \(4.8 \cdot 2...,Find the value of the expression \(4.8 \cdot 2...,12.0
8513,The plane's navigation system informs the pass...,The plane's navigation system informs the pass...,11245.0
7887,The volume of a rectangular parallelepiped is ...,The volume of a rectangular parallelepiped is ...,4.0
5272,Find the root of the equation: $\left(\dfrac{1...,Find the root of the equation: $\left(\dfrac{1...,6.0
8295,"At the school, there are two-person camping te...","At the school, there are two-person camping te...",13.0
...,...,...,...
3519,A tourist is selecting tours. Information abou...,A tourist is selecting tours. Information abou...,650.0
7934,The area of a triangle can be calculated using...,The area of a triangle can be calculated using...,12.0
9390,A construction contractor plans to buy 20 tons...,A construction contractor plans to buy 20 tons...,256000.0
7137,The number of road traffic accidents (RTAs) in...,The number of road traffic accidents (RTAs) in...,22.0


In [74]:
test.parsed_answer.fillna(0, inplace=True)

In [75]:
make_submit(test.parsed_answer, filename="my_submit_7_1_mathstral.csv")