In [1]:
!pip install transformers accelerate datasets pandas tqdm bitsandbytes langchain_experimental

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting langchain-community<0.4.0,>=0.3.0 (from langchain_experimental)
  Downloading langchain_community-0.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain_experi

In [2]:
import re
import time

import bitsandbytes as bnb
import pandas as pd
import torch
from accelerate import init_empty_weights, infer_auto_device_map
from datasets import load_dataset
from langchain.llms import HuggingFacePipeline
from langchain_experimental.llm_symbolic_math.base import LLMSymbolicMathChain
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline


model_name = "microsoft/Phi-3.5-mini-instruct"

# trust_remote_code=True lets transformers download and execute the Python
# modelling/configuration files that ship with the model repository.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Passing `load_in_8bit=True` directly to `from_pretrained` is deprecated;
# the quantization settings go in a BitsAndBytesConfig instead.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,  # Enable 8-bit quantization
    device_map="auto",                        # Automatically distribute across available GPUs
    torch_dtype="auto",                       # Set the appropriate dtype
    trust_remote_code=True,                   # Trust the remote code for this model
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [10]:
def create_llm_math_chain(model, tokenizer, max_new_tokens=150):
    """Build a LangChain symbolic-math chain on top of a local generation pipeline.

    Args:
        model: Causal language model handed to the transformers ``pipeline`` factory.
        tokenizer: Tokenizer that belongs to ``model``.
        max_new_tokens: Upper bound on the number of tokens generated per call.
            Defaults to 150, the budget used by the direct ``model.generate``
            calls elsewhere in this notebook.

    Returns:
        The ``LLMSymbolicMathChain`` created by ``LLMSymbolicMathChain.from_llm``
        around a ``HuggingFacePipeline`` wrapper of the pipeline.
    """
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # Greedy decoding is requested with do_sample=False. A temperature of 0
        # is not a valid sampling temperature and is at best ignored with a warning.
        do_sample=False,
        repetition_penalty=1.0,
        # Without an explicit budget the pipeline falls back to the model's
        # default generation length, which is too short for these prompts.
        max_new_tokens=max_new_tokens,
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    # NOTE(review): recent langchain_experimental releases may require
    # `allow_dangerous_requests=True` here — confirm against the installed version.
    return LLMSymbolicMathChain.from_llm(llm)


In [4]:
def zero_shot_prompt(question, choices):
    """Return the zero-shot prompt: an instruction, the question, then the options.

    ``choices`` is interpolated with its default string formatting.
    """
    template = (
        "Choose the correct answer from the following choices. "
        "Question: {question} Options: {choices}"
    )
    return template.format(question=question, choices=choices)

def cot_prompt(question, choices):
    """Return the chain-of-thought prompt for a question and its options.

    The prompt asks for a step-by-step explanation and ends with the
    "Let's think step-by-step." cue.
    """
    segments = [
        "Choose the correct answer and explain step-by-step.",
        f"Question: {question}",
        f"Options: {choices}",
        "Let's think step-by-step.",
    ]
    return " ".join(segments)

def react_prompt_with_math_chain(llm_math, question, choices):
    """Send a step-by-step, four-option prompt through ``llm_math`` and return its output.

    Args:
        llm_math: Object exposing ``run(prompt)``; its return value is passed through.
        question: Question text placed after "Question:".
        choices: Indexable collection; items 0-3 are listed as options 1-4.

    Returns:
        Whatever ``llm_math.run`` returns for the rendered prompt.
    """
    # The template is kept verbatim (including its leading indentation) so the
    # text sent to the chain does not change.
    template = """
    You are a highly skilled mathematician tasked with solving the problem step-by-step.

    Question: {question}

    Choices:
    1. {choices[0]}
    2. {choices[1]}
    3. {choices[2]}
    4. {choices[3]}

    Let's think step by step:

    1. Identify the type of mathematical problem.
    2. Analyze the given choices.
    3. Perform necessary calculations using the math chain.
    4. Based on the calculations, select the correct choice.

    Final Answer: [Choose from 1, 2, 3, or 4]
    """
    rendered = template.format(question=question, choices=choices)
    return llm_math.run(rendered)


In [5]:
def extract_answer(generated_text):
    """Extract the chosen option from model output as a zero-based index.

    Resolution order:
      1. An explicit ``Final Answer: <number>`` declaration (case-insensitive,
         first occurrence). It is accepted only if the number is 1-4; an
         out-of-range number yields ``None`` instead of an invalid index.
      2. Otherwise, the last standalone digit 1-4 in the text, i.e. one that is
         not part of a longer number.

    Args:
        generated_text: Text to inspect. ``None`` or an empty string yields ``None``.

    Returns:
        An int in ``0..3`` (option number minus one), or ``None`` if no valid
        option number was found.
    """
    if not generated_text:
        return None

    # Capture the whole number so "Final Answer: 12" is not misread as option 1.
    declared = re.search(r'Final Answer:\s*(\d+)', generated_text, re.IGNORECASE)
    if declared:
        option = int(declared.group(1))
        return option - 1 if 1 <= option <= 4 else None

    # Lookarounds do not consume the neighbouring characters, so digits that are
    # separated by a single character ("1 2", "1,2") are all found.
    standalone = re.findall(r'(?<!\d)([1-4])(?!\d)', generated_text)
    if standalone:
        return int(standalone[-1]) - 1

    return None


In [11]:
def _generate_completion(model, tokenizer, prompt, max_new_tokens):
    """Generate a continuation for ``prompt`` and return ``(text, elapsed_seconds)``.

    Only the newly generated tokens are decoded. For causal LMs ``generate``
    returns the prompt tokens followed by the continuation, and leaving the
    prompt in would let answer extraction pick up digits from the question
    or the options.
    """
    start_time = time.perf_counter()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
    return completion, time.perf_counter() - start_time


def _to_one_based(answer_index):
    """Convert a zero-based option index to its 1-based number, keeping ``None``."""
    return answer_index + 1 if answer_index is not None else None


def run_inference(model, tokenizer, df, max_new_tokens=150):
    """Evaluate zero-shot, chain-of-thought and math-chain prompting on every row of ``df``.

    Args:
        model: Causal language model used for direct generation and for the math chain.
        tokenizer: Tokenizer that belongs to ``model``.
        df: DataFrame with ``question``, ``choices`` and ``answer`` columns.
            ``answer`` is compared against the zero-based index returned by
            ``extract_answer``.
        max_new_tokens: Generation budget for the zero-shot and chain-of-thought
            calls. Defaults to 150.

    Returns:
        A DataFrame with one row per question containing, for each strategy,
        the 1-based extracted answer (or ``None``), a correctness flag and the
        wall-clock inference time in seconds, plus the question and the
        1-based correct answer.
    """
    llm_math = create_llm_math_chain(model, tokenizer)

    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        question = row['question']
        choices = row['choices']
        correct_answer = row['answer']

        zero_shot_response, zero_shot_inference_time = _generate_completion(
            model, tokenizer, zero_shot_prompt(question, choices), max_new_tokens
        )
        zero_shot_answer = extract_answer(zero_shot_response)

        cot_response, cot_inference_time = _generate_completion(
            model, tokenizer, cot_prompt(question, choices), max_new_tokens
        )
        cot_answer = extract_answer(cot_response)

        start_time = time.perf_counter()
        react_response = react_prompt_with_math_chain(llm_math, question, choices)
        react_inference_time = time.perf_counter() - start_time
        react_answer = extract_answer(react_response)

        results.append({
            'question': question,
            'correct_answer': correct_answer + 1,
            'zero_shot_answer': _to_one_based(zero_shot_answer),
            'zero_shot_correct': zero_shot_answer == correct_answer,
            'zero_shot_inference_time': zero_shot_inference_time,
            'cot_answer': _to_one_based(cot_answer),
            'cot_correct': cot_answer == correct_answer,
            'cot_inference_time': cot_inference_time,
            'react_answer': _to_one_based(react_answer),
            'react_correct': react_answer == correct_answer,
            'react_inference_time': react_inference_time
        })

    return pd.DataFrame(results)


In [14]:
# Fetch the "college_mathematics" configuration of cais/mmlu and materialise
# its test split as a DataFrame for the evaluation loop.
dataset = load_dataset(path="cais/mmlu", name="college_mathematics")
df = pd.DataFrame(data=dataset["test"])


In [13]:
# Run all three prompting strategies over the dataset and persist the raw results.
results_df = run_inference(model, tokenizer, df)

results_df.to_csv("phi_3.5_inference_results.csv", index=False)

# Accuracy is the mean of each strategy's boolean correctness column.
accuracy_zero_shot, accuracy_cot, accuracy_react = (
    results_df[column].mean()
    for column in ("zero_shot_correct", "cot_correct", "react_correct")
)

for label, accuracy in (
    ("Zero-shot", accuracy_zero_shot),
    ("Chain-of-Thought", accuracy_cot),
    ("ReAct", accuracy_react),
):
    print(f"{label} accuracy: {accuracy * 100:.2f}%")