In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pwd

/Users/realmistic/Documents/llm-zoomcamp-competition-2024


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import io
import base64
from IPython.display import display, HTML


# 1) Test Local LLM (downloaded with Ollama, use OpenAI lib for calling)

In [4]:
# list of available local models
!ollama list

NAME                    	ID          	SIZE  	MODIFIED    
mixtral:8x22b           	e8479ee1cb51	79 GB 	10 days ago	
phi3:14b                	cf611a26b048	7.9 GB	10 days ago	
deepseek-coder-v2:latest	8577f96d693e	8.9 GB	10 days ago	
qwen2:72b               	14066dfa503f	41 GB 	10 days ago	
mistral-large:latest    	0ca7dfa0bf06	69 GB 	10 days ago	
gemma2:27b              	53261bc9c192	15 GB 	10 days ago	
llama3:70b              	786f3184aec0	39 GB 	10 days ago	
llama3.1:latest         	91ab477bec9d	4.7 GB	11 days ago	


In [5]:
# Ollama - OpenAI Compatibility: https://ollama.com/blog/openai-compatibility

from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [6]:
# small model by default for fast computation
def llm(prompt, model = 'llama3.1:latest'):
    response = client.chat.completions.create(
        model=model,
        temperature= 0.0,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response

In [7]:
prompt = 'tell me a joke'

r = llm(prompt=prompt)

In [8]:
r.choices[0].message.content

"Here's one:\n\nWhat do you call a fake noodle?\n\nAn impasta."

# 2) Wrapper for solving math problems

In [9]:
def get_answer(question, model = 'llama3.1:latest'):
    prompt = f"""Role:
    You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.

    Instruction:
    1. Carefully read and comprehend the problem statement provided in the "Problem" section.
    2. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
    3. At the end, create a "Answer" section where you will state only the final numerical (convert fractions to approx. numbers, try not to do rounding) or algebraic answer, without any additional text or narrative.

    Problem:
    ...

    Solution:
    ...

    Answer:
    ...

    {question}

    Step-by-step solution and final answer:"""

    response = llm(prompt=prompt, model=model)
    return response

In [10]:
# extract numerical
import re

def extract_numerical_answer(text):
    # Look for patterns like "Final answer: X" or "The answer is X" at the end of the text
    match = re.search(r'(?:final answer|the answer is)[:\s]*([+-]?\d*\.?\d+)', text, re.IGNORECASE)
    if match:
        return float(match.group(1))
    else:
        # If no clear final answer, look for the last number in the text
        numbers = re.findall(r'[+-]?\d*\.?\d+', text)
        if numbers:
            number = float(numbers[-1])
            # Check if the number is an integer
            if number.is_integer():
                return int(number)
            else:
                return number
        return None
        
        # old: returned only float
        # return float(numbers[-1]) if numbers else None

In [11]:
# check input and output
def show_latex_text(latex_text:str):
    # Create a figure and axis
    fig, ax = plt.subplots(figsize=(6, 8), dpi=100)  # Adjust figure size and DPI

    # Hide axes
    ax.axis('off')

    # Add the text with padding
    plt.text(0.5, 
             0.5, 
             latex_text, 
             horizontalalignment='center', 
             verticalalignment='center', 
             fontsize=10, 
             wrap=True)

    # Adjust layout
    plt.tight_layout(pad=2.0)  # Add padding

    # Save the figure to a BytesIO object
    buf = io.BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    buf.seek(0)

    # Convert to base64
    img_base64 = base64.b64encode(buf.read()).decode('utf-8')

    # Display the image in the notebook
    html = f'<img src="data:image/png;base64,{img_base64}" style="max-width:100%; height:auto;" />'
    display(HTML(html))

    # Clean up
    plt.close(fig)
    
    #     # Create a figure and axis with adjusted size
    # fig, ax = plt.subplots(figsize=(8, 2), dpi=100)  # Adjust figure size as needed

    # # Hide axes
    # ax.axis('off')

    # # Add LaTeX text with centering
    # ax.text(0.5, 0.5, f"${latex_text}$", horizontalalignment='center', verticalalignment='center', fontsize=14, usetex=True)

    # # Adjust layout to minimize white space
    # plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

    # # Save the figure to a BytesIO object
    # buf = io.BytesIO()
    # fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.1)
    # buf.seek(0)

    # # Convert to base64
    # img_base64 = base64.b64encode(buf.read()).decode('utf-8')

    # # Display the image in the notebook
    # html = f'<img src="data:image/png;base64,{img_base64}" style="max-width:100%; height:auto;" />'
    # display(HTML(html))

    # # Clean up
    # plt.close(fig)

    return

In [12]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Image
import io

# Configure Matplotlib to use LaTeX
mpl.rcParams['text.usetex'] = True
mpl.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'

def show_latex_as_image(latex_string):
    # Create a figure and an axis
    fig, ax = plt.subplots(figsize=(4, 1), dpi=200)
    
    # Hide the axes
    ax.axis('off')
    
    # Display the LaTeX string
    plt.text(0.5, 0.5, f"${latex_string}$", 
             horizontalalignment='center', 
             verticalalignment='center', 
             fontsize=12)
    
    # Adjust layout
    # plt.tight_layout(pad=1.0)
    
    # Save the figure to a BytesIO object
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.1)
    buf.seek(0)
    
    # Close the plot to free memory
    plt.close(fig)
    
    # Display the image
    display(Image(data=buf.getvalue()))

In [13]:
# downloaded data from kaggle https://www.kaggle.com/competitions/llm-zoomcamp-2024-competition/data
df_train = pd.read_csv('input_data/train.csv')
df_train.head()


Unnamed: 0,problem_id,problem_text,answer
0,2374,Find the value of the expression $\dfrac{17}{5...,1.6
1,4723,"In a company of 30 people, 25 use the social n...",24.0
2,7135,The number of road traffic accidents (RTAs) in...,32.0
3,5814,Find the value of the expression $\dfrac{2\str...,256.0
4,9237,A traveler from Moscow wants to visit four cit...,53.0


In [14]:
rows = df_train.to_dict(orient='records')

In [15]:
# one row
rows[3]

{'problem_id': 5814,
 'problem_text': 'Find the value of the expression $\\dfrac{2\\strut^{-5} \\cdot 2\\strut^{9} }{2\\strut^{-4} } $.',
 'answer': '256'}

In [16]:
# check downloaded models from the library: https://ollama.com/library
!ollama list

NAME                    	ID          	SIZE  	MODIFIED    
mixtral:8x22b           	e8479ee1cb51	79 GB 	10 days ago	
phi3:14b                	cf611a26b048	7.9 GB	10 days ago	
deepseek-coder-v2:latest	8577f96d693e	8.9 GB	10 days ago	
qwen2:72b               	14066dfa503f	41 GB 	10 days ago	
mistral-large:latest    	0ca7dfa0bf06	69 GB 	10 days ago	
gemma2:27b              	53261bc9c192	15 GB 	10 days ago	
llama3:70b              	786f3184aec0	39 GB 	10 days ago	
llama3.1:latest         	91ab477bec9d	4.7 GB	11 days ago	


In [17]:
def solve_one_problem(one_row, model = "phi3:14b", verbose = False):
    if verbose: #show rendered problem
        show_latex_text(one_row['problem_text'])
        # show_latex_as_image(one_row['problem_text'])
        
    llm_response = get_answer(question=one_row['problem_text'], model = model)
    ans = llm_response.choices[0].message.content
    ans2 = ans.replace('\\strut', '').replace('$$', '$') 
    res = extract_numerical_answer(ans)

    # DEBUG: answer by LLM + right/wrong
    
    # SHOW INPUTS:
    # for k in one_row.keys():
    #     print(f'[{k}]: {one_row[k]}')

    # print(f'answer by llm {model}: {res}')
    
    if verbose:  #show  logics
        show_latex_text(ans2)
        # show_latex_as_image(ans2)


    return res, llm_response.choices[0].message.content

In [18]:
import matplotlib as mpl

# if not installed Latex (shows not everything correctly:)
# mpl.rcParams['text.usetex'] = False

# Ensure LaTeX support in Matplotlib
# installed previously Latex on Mac in terminal: brew install --cask mactex

mpl.rcParams['text.usetex'] = True
mpl.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'

# mpl.rcParams['text.latex.preamble'] = [r'\usepackage{amsmath}']

In [19]:
!ollama list

NAME                    	ID          	SIZE  	MODIFIED    
mixtral:8x22b           	e8479ee1cb51	79 GB 	10 days ago	
phi3:14b                	cf611a26b048	7.9 GB	10 days ago	
deepseek-coder-v2:latest	8577f96d693e	8.9 GB	10 days ago	
qwen2:72b               	14066dfa503f	41 GB 	10 days ago	
mistral-large:latest    	0ca7dfa0bf06	69 GB 	10 days ago	
gemma2:27b              	53261bc9c192	15 GB 	10 days ago	
llama3:70b              	786f3184aec0	39 GB 	10 days ago	
llama3.1:latest         	91ab477bec9d	4.7 GB	11 days ago	


In [20]:
# phi3:14b
# gemma2:27b
# qwen2:72b

# +(2m24) mixtral:8x22b :: CORRECT          
# +(1m37) qwen2:72b     :: CORRECT        	
# +(4m21) mistral-large:latest ::	almost Correct
# +(23s) phi3:14b ::	almost Correct

res, msg = solve_one_problem(rows[4], model = 'llama3.1:latest', verbose=False)
# print(res, msg)

In [21]:
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=14)

def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
def process_row(row, model = 'llama3.1:latest'):
    problem_id = row['problem_id']
    problem_text = row['problem_text']
    problem_answer = None
    if 'answer' in row.keys():
        problem_answer = row['answer']
    # llm_reasoning = get_answer(problem_text)
    # numerical_answer = extract_numerical_answer(llm_reasoning)
    numerical_answer, llm_reasoning = solve_one_problem(row, model = model, verbose=False)
    
    correct = None
    if problem_answer is not None:
        if problem_answer == str(numerical_answer):
            correct = True
            print('CORRECT ANSWER')
        else:
            correct = False
            print('WRONG ANSWER')
            print(f' Let\'s compare answers: LLM_ANSWER: {numerical_answer}, TRUE_ANSWER: {problem_answer}')

    return {
        'problem_id': problem_id,
        'problem_text': problem_text,
        'problem_answer':problem_answer,
        'llm_reasoning': llm_reasoning,
        'llm_answer': str(numerical_answer),
        'is_correct': correct
    }

In [23]:
# check process one row
process_row(rows[0])

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 85, TRUE_ANSWER: 1.6


{'problem_id': 2374,
 'problem_text': 'Find the value of the expression $\\dfrac{17}{5} :\\dfrac{34}{3} +1.3$.',
 'problem_answer': '1.6',
 'llm_reasoning': "Problem:\n\n\nSolution:\n\n\nTo solve this problem, we first need to understand that the colon (:) is not a standard mathematical operator, but rather an indication of division. Therefore, we can rewrite the expression as follows:\n\n$\\dfrac{17}{5} \\div \\dfrac{34}{3} + 1.3$\n\nNow, let's simplify this expression step by step.\n\nFirst, we perform the division operation:\n\n$\\dfrac{17}{5} \\div \\dfrac{34}{3} = \\dfrac{17}{5} \\times \\dfrac{3}{34}$\n\nNext, we multiply the numerators and denominators:\n\n$\\dfrac{17 \\times 3}{5 \\times 34} = \\dfrac{51}{170}$\n\nNow, let's add 1.3 to this result:\n\n$\\dfrac{51}{170} + 1.3$\n\nTo add a fraction and a decimal, we need to convert the decimal to a fraction with the same denominator as the fraction. We can rewrite 1.3 as $\\dfrac{13}{10}$.\n\nNow, let's find a common denominator 

In [24]:
results = map_progress(pool, rows, process_row)
df_results = pd.DataFrame(results)
df_results

  0%|                                                                                                                                                                                                                                                                            | 0/100 [00:00<?, ?it/s]

  1%|██▌                                                                                                                                                                                                                                                                 | 1/100 [00:17<29:13, 17.72s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 16, TRUE_ANSWER: 256


  2%|█████▏                                                                                                                                                                                                                                                              | 2/100 [00:18<12:41,  7.77s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 124, TRUE_ANSWER: 24


  3%|███████▊                                                                                                                                                                                                                                                            | 3/100 [00:24<11:35,  7.17s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 85, TRUE_ANSWER: 1.6


  4%|██████████▍                                                                                                                                                                                                                                                         | 4/100 [00:34<13:09,  8.23s/it]

CORRECT ANSWER


  5%|█████████████                                                                                                                                                                                                                                                       | 5/100 [00:36<09:30,  6.01s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 1060, TRUE_ANSWER: 27000; 64000


  6%|███████████████▌                                                                                                                                                                                                                                                    | 6/100 [00:47<11:58,  7.64s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 425, TRUE_ANSWER: 53


  7%|██████████████████▏                                                                                                                                                                                                                                                 | 7/100 [00:48<08:27,  5.46s/it]

CORRECT ANSWER


  8%|████████████████████▊                                                                                                                                                                                                                                               | 8/100 [01:02<12:40,  8.27s/it]

CORRECT ANSWER


  9%|███████████████████████▍                                                                                                                                                                                                                                            | 9/100 [01:08<11:22,  7.50s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 5, TRUE_ANSWER: 1.2


 10%|█████████████████████████▉                                                                                                                                                                                                                                         | 10/100 [01:10<08:36,  5.74s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 10, TRUE_ANSWER: 0.1


 11%|████████████████████████████▍                                                                                                                                                                                                                                      | 11/100 [01:22<11:22,  7.67s/it]

CORRECT ANSWER


 12%|███████████████████████████████                                                                                                                                                                                                                                    | 12/100 [01:25<09:15,  6.32s/it]

CORRECT ANSWER


 13%|█████████████████████████████████▋                                                                                                                                                                                                                                 | 13/100 [01:40<12:50,  8.86s/it]

CORRECT ANSWER


 14%|████████████████████████████████████▎                                                                                                                                                                                                                              | 14/100 [01:43<10:00,  6.98s/it]

CORRECT ANSWER


 15%|██████████████████████████████████████▊                                                                                                                                                                                                                            | 15/100 [01:54<11:47,  8.33s/it]

CORRECT ANSWER


 16%|█████████████████████████████████████████▍                                                                                                                                                                                                                         | 16/100 [02:30<23:27, 16.76s/it]

CORRECT ANSWER


 17%|████████████████████████████████████████████                                                                                                                                                                                                                       | 17/100 [02:44<21:41, 15.68s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 1, TRUE_ANSWER: 3412


 18%|██████████████████████████████████████████████▌                                                                                                                                                                                                                    | 18/100 [03:05<23:34, 17.25s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 153, TRUE_ANSWER: 76


 19%|█████████████████████████████████████████████████▏                                                                                                                                                                                                                 | 19/100 [03:09<18:04, 13.39s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 4, TRUE_ANSWER: -4


 20%|███████████████████████████████████████████████████▊                                                                                                                                                                                                               | 20/100 [03:19<16:34, 12.43s/it]

CORRECT ANSWER


 21%|██████████████████████████████████████████████████████▍                                                                                                                                                                                                            | 21/100 [03:24<13:30, 10.26s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 3, TRUE_ANSWER: 4213


 22%|████████████████████████████████████████████████████████▉                                                                                                                                                                                                          | 22/100 [04:12<27:48, 21.39s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 12356, TRUE_ANSWER: 135, 346


 23%|███████████████████████████████████████████████████████████▌                                                                                                                                                                                                       | 23/100 [04:41<30:28, 23.75s/it]

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 4, TRUE_ANSWER: 1432


 24%|██████████████████████████████████████████████████████████████▏                                                                                                                                                                                                    | 24/100 [04:48<23:38, 18.66s/it]

CORRECT ANSWER


 25%|████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                  | 25/100 [05:21<28:47, 23.03s/it]

CORRECT ANSWER


 25%|████████████████████████████████████████████████████████████████▊                                                                                                                                                                                                  | 25/100 [10:13<30:41, 24.55s/it]


KeyboardInterrupt: 

WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 234, TRUE_ANSWER: 23
CORRECT ANSWER
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 2, TRUE_ANSWER: -2.5
CORRECT ANSWER
CORRECT ANSWER
CORRECT ANSWER
CORRECT ANSWER
CORRECT ANSWER
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 250, TRUE_ANSWER: 25
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 27, TRUE_ANSWER: 6
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 10, TRUE_ANSWER: 0.9
CORRECT ANSWER
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 12.5, TRUE_ANSWER: 2.6
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 5, TRUE_ANSWER: 235
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 2, TRUE_ANSWER: 3412
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 3, TRUE_ANSWER: 17490
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 1650, TRUE_ANSWER: 2.18
CORRECT ANSWER
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 19, TRUE_ANSWER: 12
WRONG ANSWER
 Let's compare answers: LLM_ANSWER: 515, TRUE_ANSWER: 12505
WRONG ANSWER
 Let's compare answers

In [None]:
df_results['is_correct_num']= df_results.is_correct.astype(int)

In [None]:
# Score ratio
sum(df_results['is_correct_num'])/len(df_results)

0.2

# 3) Test submission

In [None]:
df_test = pd.read_csv('input_data/test.csv')