In [12]:
from langchain_experimental.llm_symbolic_math.base import LLMSymbolicMathChain
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from tqdm import tqdm
import pandas as pd
import re
import datasets

def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", use_auth_token=True).to("cuda" if torch.cuda.is_available() else "cpu")
    return model, tokenizer

def create_llm_math_chain(model, tokenizer):
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
        temperature=0,
        top_p=1,
        repetition_penalty=1.0
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    llm_math = LLMSymbolicMathChain.from_llm(llm)
    return llm_math


In [13]:
def react_prompt(question, choices):
    prompt = f"""
You are a highly skilled mathematician tasked with solving complex math problems.

Question: {question}

Choices:
1. {choices[0]}
2. {choices[1]}
3. {choices[2]}
4. {choices[3]}

Let's think step by step:

1. First, identify the type of mathematical problem we are dealing with. 
   - [Here, explain what the problem is asking for in mathematical terms.]

2. Analyze the given choices.
   - [Go through each choice and consider if it could be a potential solution.]

3. Perform necessary calculations or manipulations to arrive at the correct solution.
   - [Show your mathematical work.]

4. Select the correct choice based on the calculations.
   - [Pick the final correct option.]

Final Answer: [State the correct answer clearly as either 1, 2, 3, or 4.]

Explanation: [Provide a brief explanation for why this choice is correct.]
"""
    return prompt


In [14]:
def extract_answer(generated_text):
    answer_pattern = r'Final Answer:\s*(\d)'
    match = re.search(answer_pattern, generated_text, re.IGNORECASE)
    if match:
        answer = int(match.group(1))
        if 1 <= answer <= 4:
            return answer - 1  
    
    last_number_pattern = r'(?:^|\D)([1234])(?:\D|$)'
    matches = list(re.finditer(last_number_pattern, generated_text))
    if matches:
        last_match = matches[-1]
        return int(last_match.group(1)) - 1 
    
    return None


In [15]:
import time

def run_gemma_and_llama_inference(question, choices):
    prompt = react_prompt(question, choices)

    results = {}
    
    for model_name, model_label in model_names.items():
        try:
            start_time = time.time()

            response = client.chat.completions.create(
                model=model_label,
                messages=[{"role": "system", "content": prompt}],
                max_tokens=512,
                temperature=0.7,
                top_p=0.7,
                top_k=50,
                repetition_penalty=1,
                stop=["<eos>", "<end_of_turn>"],
                stream=False
            )

           
            end_time = time.time()
            inference_time = end_time - start_time

            generated_text = response.choices[0].message.content
            extracted_answer = extract_answer(generated_text)

            results[model_name] = {
                "generated_text": generated_text,
                "extracted_answer": extracted_answer + 1 if extracted_answer is not None else None,
                "inference_time": inference_time
            }

        except Exception as e:
            print(f"Error in Together AI model inference for {model_name}: {str(e)}")
            continue

    return results


In [17]:
from datasets import load_dataset

dataset = load_dataset("cais/mmlu", "college_mathematics")
df = pd.DataFrame(dataset['test'])

# Define Together AI model names
model_names = {
    'gemma': 'google/gemma-2b-it',
    'llama': 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
}

import os
from together import Together
client = Together(api_key="5b9bcdd27be5e30c2f33fbc6b3acf0cbc805c74615418455f0a805f0df9b56a0")  # Replace with your Together API key

results_df = process_dataframe_react_together_gemma_llama(df, model_names)

results_df.to_csv("react_gemma_llama_results.csv", index=False)


  0%|          | 0/100 [00:00<?, ?it/s]

Model: gemma, Question 1:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 6.8778 seconds



  1%|          | 1/100 [00:09<15:46,  9.56s/it]

Model: llama, Question 1:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.6850 seconds

Model: gemma, Question 2:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 6.1000 seconds



  2%|▏         | 2/100 [00:18<15:15,  9.34s/it]

Model: llama, Question 2:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 3.0830 seconds

Model: gemma, Question 3:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 7.8876 seconds



  3%|▎         | 3/100 [00:29<15:59,  9.89s/it]

Model: llama, Question 3:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.6480 seconds

Model: gemma, Question 4:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 2.7162 seconds



  4%|▍         | 4/100 [00:35<13:34,  8.49s/it]

Model: llama, Question 4:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 3.6175 seconds

Model: gemma, Question 5:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 3.3250 seconds



  5%|▌         | 5/100 [00:41<11:57,  7.55s/it]

Model: llama, Question 5:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.5753 seconds

Model: gemma, Question 6:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 0.9768 seconds



  6%|▌         | 6/100 [00:44<09:18,  5.94s/it]

Model: llama, Question 6:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 1.8206 seconds

Model: gemma, Question 7:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 4.2145 seconds



  7%|▋         | 7/100 [00:50<09:34,  6.17s/it]

Model: llama, Question 7:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 2.4469 seconds

Model: gemma, Question 8:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 4.9329 seconds



  8%|▊         | 8/100 [00:58<10:11,  6.65s/it]

Model: llama, Question 8:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 2.7369 seconds

Model: gemma, Question 9:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 1.5091 seconds



  9%|▉         | 9/100 [01:01<08:28,  5.59s/it]

Model: llama, Question 9:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 1.7306 seconds

Model: gemma, Question 10:
Extracted answer: None
Correct answer: 1
Is correct: False
Inference time: 0.5579 seconds



 10%|█         | 10/100 [01:04<06:48,  4.54s/it]

Model: llama, Question 10:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 1.6285 seconds

Model: gemma, Question 11:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 3.6031 seconds



 11%|█         | 11/100 [01:09<06:55,  4.66s/it]

Model: llama, Question 11:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 1.3494 seconds

Model: gemma, Question 12:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 3.9755 seconds



 12%|█▏        | 12/100 [01:15<07:46,  5.30s/it]

Model: llama, Question 12:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.7787 seconds

Model: gemma, Question 13:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 1.5530 seconds



 13%|█▎        | 13/100 [01:19<07:01,  4.84s/it]

Model: llama, Question 13:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 2.2339 seconds

Model: gemma, Question 14:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 4.3192 seconds



 14%|█▍        | 14/100 [01:26<07:44,  5.40s/it]

Model: llama, Question 14:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.3568 seconds

Model: gemma, Question 15:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 4.6914 seconds



 15%|█▌        | 15/100 [01:33<08:29,  6.00s/it]

Model: llama, Question 15:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.7005 seconds

Model: gemma, Question 16:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.6926 seconds



 16%|█▌        | 16/100 [01:37<07:17,  5.20s/it]

Model: llama, Question 16:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 1.6605 seconds

Model: gemma, Question 17:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 0.8746 seconds



 17%|█▋        | 17/100 [01:39<06:16,  4.53s/it]

Model: llama, Question 17:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.1035 seconds

Model: gemma, Question 18:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 5.5949 seconds



 18%|█▊        | 18/100 [01:47<07:30,  5.49s/it]

Model: llama, Question 18:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.1102 seconds

Model: gemma, Question 19:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.3874 seconds



 19%|█▉        | 19/100 [01:51<06:50,  5.07s/it]

Model: llama, Question 19:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.7157 seconds

Model: gemma, Question 20:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 4.6760 seconds



 20%|██        | 20/100 [01:58<07:26,  5.58s/it]

Model: llama, Question 20:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 2.0930 seconds

Model: gemma, Question 21:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 2.1617 seconds



 21%|██        | 21/100 [02:03<06:54,  5.25s/it]

Model: llama, Question 21:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.3048 seconds

Model: gemma, Question 22:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 3.1240 seconds



 22%|██▏       | 22/100 [02:08<06:49,  5.25s/it]

Model: llama, Question 22:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 2.1154 seconds

Model: gemma, Question 23:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 0.8528 seconds



 23%|██▎       | 23/100 [02:11<05:55,  4.61s/it]

Model: llama, Question 23:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.2850 seconds

Model: gemma, Question 24:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 3.8574 seconds



 24%|██▍       | 24/100 [02:18<06:36,  5.21s/it]

Model: llama, Question 24:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.7471 seconds

Model: gemma, Question 25:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 1.6221 seconds



 25%|██▌       | 25/100 [02:21<05:53,  4.71s/it]

Model: llama, Question 25:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 1.9276 seconds

Model: gemma, Question 26:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.2010 seconds



 26%|██▌       | 26/100 [02:25<05:28,  4.45s/it]

Model: llama, Question 26:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.6184 seconds

Model: gemma, Question 27:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 3.8771 seconds



 27%|██▋       | 27/100 [02:31<06:03,  4.99s/it]

Model: llama, Question 27:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.3666 seconds

Model: gemma, Question 28:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 5.7262 seconds



 28%|██▊       | 28/100 [02:40<07:13,  6.02s/it]

Model: llama, Question 28:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 2.7062 seconds

Model: gemma, Question 29:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.2473 seconds



 29%|██▉       | 29/100 [02:44<06:28,  5.47s/it]

Model: llama, Question 29:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 1.9214 seconds

Model: gemma, Question 30:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 3.9912 seconds



 30%|███       | 30/100 [02:50<06:48,  5.83s/it]

Model: llama, Question 30:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 2.7009 seconds

Model: gemma, Question 31:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 1.6012 seconds



 31%|███       | 31/100 [02:54<05:55,  5.16s/it]

Model: llama, Question 31:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 1.9737 seconds

Model: gemma, Question 32:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 7.2868 seconds



 32%|███▏      | 32/100 [03:04<07:25,  6.55s/it]

Model: llama, Question 32:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 2.5239 seconds

Model: gemma, Question 33:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 4.4405 seconds



 33%|███▎      | 33/100 [03:11<07:22,  6.61s/it]

Model: llama, Question 33:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 2.2837 seconds

Model: gemma, Question 34:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 5.3125 seconds



 34%|███▍      | 34/100 [03:18<07:34,  6.88s/it]

Model: llama, Question 34:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 2.2036 seconds

Model: gemma, Question 35:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 1.7527 seconds



 35%|███▌      | 35/100 [03:23<06:39,  6.15s/it]

Model: llama, Question 35:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.7038 seconds

Model: gemma, Question 36:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 4.6200 seconds



 36%|███▌      | 36/100 [03:29<06:43,  6.31s/it]

Model: llama, Question 36:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.0552 seconds

Model: gemma, Question 37:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 5.9881 seconds



 37%|███▋      | 37/100 [03:38<07:20,  6.99s/it]

Model: llama, Question 37:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.5743 seconds

Model: gemma, Question 38:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 5.8265 seconds



 38%|███▊      | 38/100 [03:46<07:39,  7.42s/it]

Model: llama, Question 38:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.6002 seconds

Model: gemma, Question 39:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 1.5917 seconds



 39%|███▉      | 39/100 [03:50<06:21,  6.26s/it]

Model: llama, Question 39:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 1.9631 seconds

Model: gemma, Question 40:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 3.1359 seconds



 40%|████      | 40/100 [03:56<06:12,  6.21s/it]

Model: llama, Question 40:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 2.9573 seconds

Model: gemma, Question 41:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 3.6334 seconds



 41%|████      | 41/100 [04:01<05:53,  6.00s/it]

Model: llama, Question 41:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 1.8725 seconds

Model: gemma, Question 42:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 5.1668 seconds



 42%|████▏     | 42/100 [04:08<06:05,  6.30s/it]

Model: llama, Question 42:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 1.8462 seconds

Model: gemma, Question 43:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 4.4583 seconds



 43%|████▎     | 43/100 [04:15<06:12,  6.53s/it]

Model: llama, Question 43:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.5842 seconds

Model: gemma, Question 44:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 1.3817 seconds



 44%|████▍     | 44/100 [04:19<05:22,  5.76s/it]

Model: llama, Question 44:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 2.5892 seconds

Model: gemma, Question 45:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 3.3642 seconds



 45%|████▌     | 45/100 [04:25<05:08,  5.61s/it]

Model: llama, Question 45:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 1.8864 seconds

Model: gemma, Question 46:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 1.4260 seconds



 46%|████▌     | 46/100 [04:29<04:36,  5.11s/it]

Model: llama, Question 46:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.5361 seconds

Model: gemma, Question 47:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 3.9653 seconds



 47%|████▋     | 47/100 [04:35<04:53,  5.54s/it]

Model: llama, Question 47:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 2.5527 seconds

Model: gemma, Question 48:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 4.3108 seconds



 48%|████▊     | 48/100 [04:42<05:02,  5.82s/it]

Model: llama, Question 48:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.1760 seconds

Model: gemma, Question 49:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 2.3291 seconds



 49%|████▉     | 49/100 [04:47<04:46,  5.62s/it]

Model: llama, Question 49:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.8293 seconds

Model: gemma, Question 50:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.1556 seconds



 50%|█████     | 50/100 [04:50<04:05,  4.91s/it]

Model: llama, Question 50:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.0888 seconds

Model: gemma, Question 51:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 3.8789 seconds



 51%|█████     | 51/100 [04:56<04:19,  5.30s/it]

Model: llama, Question 51:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 2.3222 seconds

Model: gemma, Question 52:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.7587 seconds



 52%|█████▏    | 52/100 [05:02<04:16,  5.34s/it]

Model: llama, Question 52:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.6820 seconds

Model: gemma, Question 53:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.5444 seconds



 53%|█████▎    | 53/100 [05:06<03:54,  4.99s/it]

Model: llama, Question 53:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 1.6377 seconds

Model: gemma, Question 54:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 4.9300 seconds



 54%|█████▍    | 54/100 [05:13<04:20,  5.66s/it]

Model: llama, Question 54:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.2731 seconds

Model: gemma, Question 55:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 3.8701 seconds



 55%|█████▌    | 55/100 [05:19<04:24,  5.88s/it]

Model: llama, Question 55:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 2.5296 seconds

Model: gemma, Question 56:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 1.3093 seconds



 56%|█████▌    | 56/100 [05:23<03:43,  5.07s/it]

Model: llama, Question 56:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 1.8645 seconds

Model: gemma, Question 57:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 5.6594 seconds



 57%|█████▋    | 57/100 [05:31<04:15,  5.94s/it]

Model: llama, Question 57:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 2.3026 seconds

Model: gemma, Question 58:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 3.0842 seconds



 58%|█████▊    | 58/100 [05:36<03:59,  5.70s/it]

Model: llama, Question 58:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.0643 seconds

Model: gemma, Question 59:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 2.9223 seconds



 59%|█████▉    | 59/100 [05:41<03:43,  5.46s/it]

Model: llama, Question 59:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 1.9758 seconds

Model: gemma, Question 60:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 3.9486 seconds



 60%|██████    | 60/100 [05:47<03:54,  5.86s/it]

Model: llama, Question 60:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 2.8376 seconds

Model: gemma, Question 61:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 3.3363 seconds



 61%|██████    | 61/100 [05:53<03:43,  5.74s/it]

Model: llama, Question 61:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.1169 seconds

Model: gemma, Question 62:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 4.1072 seconds



 62%|██████▏   | 62/100 [06:00<03:49,  6.04s/it]

Model: llama, Question 62:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.6461 seconds

Model: gemma, Question 63:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 1.9183 seconds



 63%|██████▎   | 63/100 [06:04<03:22,  5.47s/it]

Model: llama, Question 63:
Extracted answer: 4
Correct answer: 1
Is correct: False
Inference time: 2.1996 seconds

Model: gemma, Question 64:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 3.4859 seconds



 64%|██████▍   | 64/100 [06:10<03:27,  5.75s/it]

Model: llama, Question 64:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.9250 seconds

Model: gemma, Question 65:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 2.3863 seconds



 65%|██████▌   | 65/100 [06:15<03:11,  5.46s/it]

Model: llama, Question 65:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 2.3978 seconds

Model: gemma, Question 66:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 1.5425 seconds



 66%|██████▌   | 66/100 [06:19<02:53,  5.12s/it]

Model: llama, Question 66:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.7713 seconds

Model: gemma, Question 67:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 5.5003 seconds



 67%|██████▋   | 67/100 [06:27<03:15,  5.92s/it]

Model: llama, Question 67:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 2.3004 seconds

Model: gemma, Question 68:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.1779 seconds



 68%|██████▊   | 68/100 [06:30<02:44,  5.13s/it]

Model: llama, Question 68:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.1003 seconds

Model: gemma, Question 69:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 3.8770 seconds



 69%|██████▉   | 69/100 [06:37<02:54,  5.63s/it]

Model: llama, Question 69:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 2.9276 seconds

Model: gemma, Question 70:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 5.3351 seconds



 70%|███████   | 70/100 [06:45<03:05,  6.17s/it]

Model: llama, Question 70:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 2.1005 seconds

Model: gemma, Question 71:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 5.4081 seconds



 71%|███████   | 71/100 [06:53<03:14,  6.71s/it]

Model: llama, Question 71:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 2.5536 seconds

Model: gemma, Question 72:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 1.1120 seconds



 72%|███████▏  | 72/100 [06:56<02:37,  5.64s/it]

Model: llama, Question 72:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.0106 seconds

Model: gemma, Question 73:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 5.3587 seconds



 73%|███████▎  | 73/100 [07:04<02:52,  6.40s/it]

Model: llama, Question 73:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.8214 seconds

Model: gemma, Question 74:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 1.3746 seconds



 74%|███████▍  | 74/100 [07:08<02:28,  5.71s/it]

Model: llama, Question 74:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.7323 seconds

Model: gemma, Question 75:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 5.3272 seconds



 75%|███████▌  | 75/100 [07:16<02:39,  6.40s/it]

Model: llama, Question 75:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 2.6625 seconds

Model: gemma, Question 76:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 3.6664 seconds



 76%|███████▌  | 76/100 [07:22<02:28,  6.20s/it]

Model: llama, Question 76:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.0865 seconds

Model: gemma, Question 77:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.0009 seconds



 77%|███████▋  | 77/100 [07:25<02:01,  5.29s/it]

Model: llama, Question 77:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 2.1599 seconds

Model: gemma, Question 78:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 4.3796 seconds



 78%|███████▊  | 78/100 [07:32<02:08,  5.82s/it]

Model: llama, Question 78:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 2.6870 seconds

Model: gemma, Question 79:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.9232 seconds



 79%|███████▉  | 79/100 [07:38<02:00,  5.74s/it]

Model: llama, Question 79:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.6143 seconds

Model: gemma, Question 80:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 1.1260 seconds



 80%|████████  | 80/100 [07:41<01:40,  5.01s/it]

Model: llama, Question 80:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.1799 seconds

Model: gemma, Question 81:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 0.6098 seconds



 81%|████████  | 81/100 [07:44<01:22,  4.35s/it]

Model: llama, Question 81:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 2.2109 seconds

Model: gemma, Question 82:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 4.0779 seconds



 82%|████████▏ | 82/100 [07:50<01:28,  4.93s/it]

Model: llama, Question 82:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 2.1947 seconds

Model: gemma, Question 83:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 3.8854 seconds



 83%|████████▎ | 83/100 [07:56<01:31,  5.40s/it]

Model: llama, Question 83:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.6122 seconds

Model: gemma, Question 84:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 4.2523 seconds



 84%|████████▍ | 84/100 [08:03<01:33,  5.86s/it]

Model: llama, Question 84:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.6684 seconds

Model: gemma, Question 85:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 4.7634 seconds



 85%|████████▌ | 85/100 [08:10<01:31,  6.10s/it]

Model: llama, Question 85:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 1.9134 seconds

Model: gemma, Question 86:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 1.2244 seconds



 86%|████████▌ | 86/100 [08:13<01:13,  5.27s/it]

Model: llama, Question 86:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.0905 seconds

Model: gemma, Question 87:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 1.9753 seconds



 87%|████████▋ | 87/100 [08:18<01:07,  5.19s/it]

Model: llama, Question 87:
Extracted answer: 1
Correct answer: 1
Is correct: True
Inference time: 3.0319 seconds

Model: gemma, Question 88:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 3.4856 seconds



 88%|████████▊ | 88/100 [08:24<01:02,  5.23s/it]

Model: llama, Question 88:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 1.8345 seconds

Model: gemma, Question 89:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 1.4680 seconds



 89%|████████▉ | 89/100 [08:27<00:52,  4.80s/it]

Model: llama, Question 89:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 2.3273 seconds

Model: gemma, Question 90:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 3.9926 seconds



 90%|█████████ | 90/100 [08:34<00:53,  5.36s/it]

Model: llama, Question 90:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 2.6728 seconds

Model: gemma, Question 91:
Extracted answer: 4
Correct answer: 2
Is correct: False
Inference time: 3.6069 seconds



 91%|█████████ | 91/100 [08:40<00:48,  5.44s/it]

Model: llama, Question 91:
Extracted answer: 2
Correct answer: 2
Is correct: True
Inference time: 2.0039 seconds

Model: gemma, Question 92:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 1.5124 seconds



 92%|█████████▏| 92/100 [08:43<00:38,  4.86s/it]

Model: llama, Question 92:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 2.0041 seconds

Model: gemma, Question 93:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 4.9328 seconds



 93%|█████████▎| 93/100 [08:51<00:40,  5.72s/it]

Model: llama, Question 93:
Extracted answer: 2
Correct answer: 3
Is correct: False
Inference time: 2.7875 seconds

Model: gemma, Question 94:
Extracted answer: 3
Correct answer: 2
Is correct: False
Inference time: 5.7233 seconds



 94%|█████████▍| 94/100 [08:59<00:39,  6.55s/it]

Model: llama, Question 94:
Extracted answer: 1
Correct answer: 2
Is correct: False
Inference time: 2.7649 seconds

Model: gemma, Question 95:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 1.5959 seconds



 95%|█████████▌| 95/100 [09:04<00:29,  5.81s/it]

Model: llama, Question 95:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 2.5014 seconds

Model: gemma, Question 96:
Extracted answer: 1
Correct answer: 4
Is correct: False
Inference time: 4.5617 seconds



 96%|█████████▌| 96/100 [09:10<00:24,  6.01s/it]

Model: llama, Question 96:
Extracted answer: 3
Correct answer: 4
Is correct: False
Inference time: 1.8867 seconds

Model: gemma, Question 97:
Extracted answer: 3
Correct answer: 3
Is correct: True
Inference time: 3.0156 seconds



 97%|█████████▋| 97/100 [09:15<00:17,  5.71s/it]

Model: llama, Question 97:
Extracted answer: 1
Correct answer: 3
Is correct: False
Inference time: 1.9887 seconds

Model: gemma, Question 98:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 3.5973 seconds



 98%|█████████▊| 98/100 [09:21<00:11,  5.83s/it]

Model: llama, Question 98:
Extracted answer: 4
Correct answer: 3
Is correct: False
Inference time: 2.5242 seconds

Model: gemma, Question 99:
Extracted answer: 4
Correct answer: 4
Is correct: True
Inference time: 2.8831 seconds



 99%|█████████▉| 99/100 [09:26<00:05,  5.51s/it]

Model: llama, Question 99:
Extracted answer: 2
Correct answer: 4
Is correct: False
Inference time: 1.8745 seconds

Model: gemma, Question 100:
Extracted answer: 3
Correct answer: 1
Is correct: False
Inference time: 3.0585 seconds



100%|██████████| 100/100 [09:32<00:00,  5.72s/it]

Model: llama, Question 100:
Extracted answer: 2
Correct answer: 1
Is correct: False
Inference time: 2.6278 seconds


Accuracy by model:
model
gemma    0.15
llama    0.36
Name: is_correct, dtype: float64





In [18]:
question = "What is the integral of x^2?"
choices = ["x^3/3", "x^2/2", "2x", "1/x"]


results = run_gemma_and_llama_inference(question, choices)

for model_name, result in results.items():
    print(f"Model: {model_name.upper()}")
    print(f"Generated text:\n{result['generated_text']}")
    print(f"Extracted answer: {result['extracted_answer']}")
    print(f"Inference time: {result['inference_time']:.4f} seconds")



Model: GEMMA
Generated text:
**Answer: 1. x^3/3**

**Explanation:**

The integral of x^2 is (x^3)/3. This is a straightforward integral that can be solved by using the power rule of integration.
Extracted answer: 3
Inference time: 0.8958 seconds
Model: LLAMA
Generated text:
Let's break down the problem step by step.

1. First, identify the type of mathematical problem we are dealing with.
   - The problem is asking for the integral of x^2. In mathematical terms, we need to find the antiderivative or the indefinite integral of the function f(x) = x^2.

2. Analyze the given choices.
   - Let's go through each choice and consider if it could be a potential solution:
     1. x^3/3
     2. x^2/2
     3. 2x
     4. 1/x

3. Perform necessary calculations or manipulations to arrive at the correct solution.
   - To find the integral of x^2, we can use the power rule of integration, which states that ∫x^n dx = (x^(n+1))/(n+1) + C, where n is a constant and C is the constant of integration.
   - 