In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install transformers datasets huggingface_hub
!pip install -U bitsandbytes

In [None]:
from huggingface_hub import login
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from datasets import load_dataset
import time
import torch
import warnings
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

warnings.filterwarnings('ignore')

gemma_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    load_in_4bit=True,
    device_map='auto'
)

gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
gemma_tokenizer.pad_token = gemma_tokenizer.eos_token

gemma_model.config

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaConfig {
  "_name_or_path": "google/gemma-2b-it",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bits

In [None]:
def generate_response(model, tokenizer, prompt, type):
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to('cuda')

    if type == "zero_shot":
        max_tokens = 20
    elif type == "chain_of_thought":
        max_tokens = 200
    elif type == "react_prompting":
        max_tokens = 500

    generation_config = GenerationConfig(
      do_sample=False,
      num_beams=1,
      temperature=0.5,
      repetition_penalty=1.5,
      max_new_tokens=max_tokens
    )

    start_time = time.time()

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config
        )

    end_time = time.time()
    inference_time = end_time - start_time

    response = tokenizer.decode(generation_output[0], skip_special_tokens=True).strip()

    return response, inference_time

In [None]:
def zero_shot_inference(question, options):
    prompt = f"""Please select the correct answer by responding **only** with the corresponding number (1, 2, 3, or 4) without any explanation.


    Question: {question}

    Options:
    1) {options[0]}
    2) {options[1]}
    3) {options[2]}
    4) {options[3]}

    Answer ::"""

    return prompt



def chain_of_thought_inference(question, options):
    prompt = f"""Please solve the following math problem step by step and then select the correct answer by responding **only** with the corresponding number (1, 2, 3, or 4).

    Question: {question}

    Options:
    1) {options[0]}
    2) {options[1]}
    3) {options[2]}
    4) {options[3]}

    Let's break it down and solve the problem step by step. Answer :: """
    return prompt

In [None]:
def extract_option(text):
    pattern = r"::[\s\S]*?(\d)"

    match = re.search(pattern, text)

    if match:
        return match.group(1)
    else:
        return None

In [None]:
dataset = load_dataset("cais/mmlu", 'college_mathematics')['test']

In [None]:
correct_count_zero_shot = 0
correct_count_chain_of_thought = 0
total_count = 0

total_time_zero_shot = 0.0
total_time_chain_of_thought = 0.0

for example in dataset:
    question = example['question']
    options = example['choices']
    correct_answer = example['answer']
    prompt1 = zero_shot_inference(question, options)
    prompt2 = chain_of_thought_inference(question, options)

    response1, time1 = generate_response(gemma_model, gemma_tokenizer, prompt1, "zero_shot")
    total_time_zero_shot += time1

    response2, time2 = generate_response(gemma_model, gemma_tokenizer, prompt2, "chain_of_thought")
    total_time_chain_of_thought += time2

    answer1 = extract_option(response1)
    answer2 = extract_option(response2)

    if answer1 and int(answer1) == (correct_answer + 1):
        correct_count_zero_shot += 1

    if answer2 and int(answer2) == (correct_answer + 1):
        correct_count_chain_of_thought += 1

    total_count += 1

accuracy_zero_shot = (correct_count_zero_shot / total_count) * 100
accuracy_chain_of_thought = (correct_count_chain_of_thought / total_count) * 100

avg_time_zero_shot = total_time_zero_shot / total_count
avg_time_chain_of_thought = total_time_chain_of_thought / total_count

print(f"Zero-Shot Accuracy: {accuracy_zero_shot:.2f}%")
print(f"Chain of Thought Accuracy: {accuracy_chain_of_thought:.2f}%")

print(f"Average Zero-Shot Inference Time: {avg_time_zero_shot:.4f} seconds")
print(f"Average Chain-of-Thought Inference Time: {avg_time_chain_of_thought:.4f} seconds")

Zero-Shot Accuracy: 25.81%
Chain of Thought Accuracy: 31.18%
Average Zero-Shot Inference Time: 1.7947 seconds
Average Chain-of-Thought Inference Time: 17.2345 seconds


In [None]:
def react_inference(question, options):
    prompt = f"""

    Example 1:
    Question: Suppose that f(1 + x) = f(x) for all real x. If f is a polynomial and f(5) = 11, what is f(15/2)?

    Thought: The equation f(1 + x) = f(x) suggests that f is periodic with a period of 1, meaning the value of f(x) repeats every 1 unit.
    Action: Since f(5) = 11, we know f(x) = 11 for all values of x due to periodicity.
    Observation: The value 15/2 = 7.5. By periodicity, f(7.5) = f(5) = 11.
    Final Answer: Option 3.

    Example 2:
    Question: Let V be the set of all real polynomials p(x). Transformations T, S are defined on V by T(p(x)) -> x p(x) and S(p(x)) -> p'(x). What is true about ST(p(x))?

    Thought: To evaluate ST(p(x)), first apply T and then apply S to the result.
    Action: Apply T to p(x): T(p(x)) = x p(x). Then apply S: S(x p(x)) = p(x) + x p'(x).
    Observation: Thus, ST(p(x)) = p(x) + x p'(x) and TS(p(x)) = x p'(x). Therefore, ST - TS = p(x), meaning ST - TS is the identity map.
    Final Answer: Option 4


    Now, solve this problem step by step:

    Question: {question}

    Options:
    1) {options[0]}
    2) {options[1]}
    3) {options[2]}
    4) {options[3]}

    Please output the number corresponding to the correct option, Final Answer ::

    Explanation :

    """
    return prompt


In [2]:
correct_count_react = 0
total_count = 0
total_time_react = 0

for example in dataset:
    question = example['question']
    options = example['choices']
    correct_answer = example['answer']

    prompt3 = react_inference(question, options)
    response3, time3 = generate_response(gemma_model, gemma_tokenizer, prompt3, "react_prompting")
    total_time_react += time3
    answer3 = extract_option(response3)

    if answer3 and int(answer3) == (correct_answer + 1):
        correct_count_react += 1

    total_count += 1

accuracy_react = (correct_count_react / total_count) * 100
avg_time_react = total_time_react / total_count

print(f"ReAct Prompting Accuracy: {accuracy_react:.2f}%")
print(f"Average ReAct Inference Time: {avg_time_react:.4f} seconds")

ReAct Prompting Accuracy: 36.86%
Average ReAct Inference Time: 41.2945 seconds
