In [1]:
!pip install --upgrade transformers>=4.28

In [2]:
!pip install sentencepiece



In [3]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3


In [4]:
import os
from getpass import getpass

# Never commit the Hugging Face token to the notebook. Take it from the
# HF_TOKEN environment variable when it is already set; otherwise ask for it
# with a non-echoing prompt and keep it only in this process's environment.
# SECURITY: the literal token that used to be stored in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [5]:
from huggingface_hub import login

# Authenticate this kernel against the Hugging Face Hub with whatever token is
# held in the HF_TOKEN environment variable (None if the variable is unset).
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM


In [7]:
# Baseline (non-quantized) checkpoint. It is loaded with from_pretrained's
# defaults: no dtype, device_map or quantization config is passed here.
# The name `model` is read as a global by the calculate_* benchmark helpers.
model = LlamaForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")


Downloading shards: 100%|██████████| 4/4 [02:32<00:00, 38.07s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


In [7]:
from transformers import AutoTokenizer

In [9]:
# Tokenizer from the same Hub repo as the baseline model loaded above.
# The name `tokenizer` is read as a global by the calculate_* benchmark helpers.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

In [10]:
# Give the tokenizer a padding token by reusing its existing EOS token.
# Registering a brand-new '[PAD]' token would enlarge the tokenizer vocabulary
# while the model's embedding matrix is never resized in this notebook, so the
# new token id could not be embedded by the model. Reusing EOS keeps tokenizer
# and model vocabularies aligned, and it is the same id `generate` reports
# falling back to for open-end generation.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


1

# Benchmark Functions

In [12]:
def _generate_for_prompt(prompt, num_return_sequences=None, max_length=120):
    """Tokenize ``prompt`` and run ``generate`` on the notebook-level model.

    Shared implementation behind every ``calculate_*`` helper below. It reads
    the notebook-level ``tokenizer`` and ``model`` names at call time, so
    rebinding those names changes which model is benchmarked.

    Args:
        prompt: Text handed to the tokenizer.
        num_return_sequences: When given, forwarded to ``model.generate``;
            when ``None`` the argument is omitted from the call entirely.
        max_length: Forwarded to ``model.generate`` as ``max_length``. The
            returned sequences include the prompt tokens, so this is a budget
            for prompt plus continuation, not for new tokens only.

    Returns:
        Whatever ``model.generate`` returns (the generated token ids).
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        return_attention_mask=True,
    )
    # Put the inputs on the device the model lives on. Without this they stay
    # on the CPU even when the model was loaded onto an accelerator.
    encoded = encoded.to(model.device)

    generate_kwargs = {
        "attention_mask": encoded["attention_mask"],
        "max_length": max_length,
    }
    if num_return_sequences is not None:
        generate_kwargs["num_return_sequences"] = num_return_sequences
    return model.generate(encoded["input_ids"], **generate_kwargs)


def calculate_mmlu_zero_shot(prompt):
    """Generate a single continuation for an MMLU prompt."""
    return _generate_for_prompt(prompt)


def calculate_mmlu_pro_five_shot(prompt):
    """Generate five returned sequences for an MMLU PRO prompt.

    NOTE(review): this asks for 5 generations of the same prompt; it does not
    put 5 worked examples into the prompt. Confirm that is the intended
    meaning of "5-shot" here.
    """
    return _generate_for_prompt(prompt, num_return_sequences=5)


def calculate_human_eval_zero_shot(prompt):
    """Generate a single continuation for a HumanEval prompt."""
    return _generate_for_prompt(prompt)


def calculate_mbpp_eval_zero_shot(prompt):
    """Generate a single continuation for an MBPP prompt."""
    return _generate_for_prompt(prompt)


def calculate_if_eval_zero_shot(prompt):
    """Generate a single continuation for an IFEval prompt."""
    return _generate_for_prompt(prompt)


def calculate_gsm8k_eight_shot(prompt):
    """Generate eight returned sequences for a GSM8K prompt.

    NOTE(review): as with the five-shot helper, this requests 8 generations
    rather than supplying 8 in-context examples. Confirm intent.
    """
    return _generate_for_prompt(prompt, num_return_sequences=8)


def calculate_math_zero_shot(prompt):
    """Generate a single continuation for a Math prompt."""
    return _generate_for_prompt(prompt)


def calculate_arc_challenge_zero_shot(prompt):
    """Generate a single continuation for an ARC Challenge prompt."""
    return _generate_for_prompt(prompt)


def calculate_gpqa_zero_shot(prompt):
    """Generate a single continuation for a GPQA prompt."""
    return _generate_for_prompt(prompt)

# `attention_mask` tells the model which tokens in the input sequence are real data and which are only padding.
# `padding=True` pads all sequences in a batch to the same length, which models need in order to process a batch efficiently.
# `truncation=True` ensures that input sequences do not exceed the maximum length the model can handle.

# Prompt for each Benchmark

In [13]:
# One sample prompt per benchmark, keyed by the benchmark's display name.
# The benchmark loops pick the generation helper by substring-matching these
# keys ("MMLU", "5-shot", "HumanEval", ...), so the key text is load-bearing.
prompts = {
    "MMLU (0-shot, CoT)": "Davis decided to kill Adams. He set out for Adams's house. Before he got there he saw Brooks, who resembled Adams. Thinking that Brooks was Adams, Davis shot at Brooks. The shot missed Brooks but wounded Case, who was some distance away. Davis had not seen Case. In a prosecution under a statute that proscribes any attempt to commit murder, the district attorney should indicate that the intended victim(s) was/were?",
    # The backslash is escaped on purpose: a bare "\f" inside a normal string
    # literal is a form-feed character, which would corrupt "\factorial".
    "MMLU PRO (5-shot, CoT)": "The symmetric group $S_n$ has $ \\factorial{n}$ elements, hence it is not true that $S_{10}$ has 10 elements. Find the characteristic of the ring 2Z.",
    "HumanEval (0-shot)": "Write a Python function to calculate the factorial of a given number.",
    "MBPP EvalPlus (Base)(0-shot)": "Write a function to find the longest chain which can be formed from the given set of pairs.",
    "IFEval Code (0-shot)": "Write a JavaScript function to validate an email address.",
    "GSM8K (8-shot)": "Generate eight different math word problems for 4th-grade level.",
    "Math (0-shot) (CoT)": "Solve the equation 2x + 5 = 11 and explain the steps.",
    "ARC Challenge (0-shot)": "Which interaction within an ecosystem is characterized by gradual change from one community of organisms to another?",  # NLP-type question
    "GPQA (0-shot, CoT)": "trans-cinnamaldehyde was treated with methylmagnesium bromide, forming product 1. 1 was treated with pyridinium chlorochromate, forming product 2. 3 was treated with (dimethyl(oxo)-l6-sulfaneylidene)methane in DMSO at elevated temperature, forming product 3. how many carbon atoms are there in product 3?."
}

# Calculate benchmarks for the non-quantized model

In [17]:


# Raw generated token ids and decoded text for the baseline model, keyed by
# benchmark name. Both dicts are read again by the comparison cell.
original_benchmarks = {}
original_outputs = {}

for name, prompt in prompts.items():
    # Route each prompt to its helper by substring-matching the benchmark name.
    if "MMLU" in name:
        if "5-shot" in name:
            outputs = calculate_mmlu_pro_five_shot(prompt)
        else:
            outputs = calculate_mmlu_zero_shot(prompt)
    elif "HumanEval" in name:
        outputs = calculate_human_eval_zero_shot(prompt)
    elif "MBPP" in name:
        outputs = calculate_mbpp_eval_zero_shot(prompt)
    elif "IFEval" in name:
        outputs = calculate_if_eval_zero_shot(prompt)
    elif "GSM8K" in name:
        outputs = calculate_gsm8k_eight_shot(prompt)
    elif "Math" in name:
        outputs = calculate_math_zero_shot(prompt)
    elif "ARC" in name:
        outputs = calculate_arc_challenge_zero_shot(prompt)
    elif "GPQA" in name:
        outputs = calculate_gpqa_zero_shot(prompt)
    else:
        # Fail loudly: without this branch an unrecognised name would silently
        # record the previous benchmark's `outputs` under the wrong key.
        raise ValueError(f"No benchmark helper matches prompt name {name!r}")

    original_benchmarks[name] = outputs
    # Only the first returned sequence is decoded, even for the helpers that
    # request several sequences (5 for MMLU PRO, 8 for GSM8K).
    original_outputs[name] = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Store original benchmarks and outputs for comparison later
print("Original Benchmarks:\n", original_benchmarks)
print("\nOriginal Outputs:\n", original_outputs)


#import torch
# # Ensure the tensor is defined before using it
# try:
#     # Define the tensor (example tensor provided)
#     tensor = torch.tensor(outputs, dtype=torch.long)

#     # Convert tensor to float before calculating the mean
#     tensor_float = tensor.float()
#     mean_value = tensor_float.mean()

#     print("Mean value:", mean_value)

# except UnboundLocalError as e:
#     print(f"Error: {e}. Make sure the tensor is properly initialized.")


# # Function to extract and summarize benchmark values
# def summarize_benchmarks(benchmarks):
#     summarized = {}
#     for key, value in benchmarks.items():
#     # Assuming 'tensor' is your input tensor
#         tensor = torch.tensor(outputs, dtype=torch.float)
#         tensor_float = tensor.float()
#         mean_value = tensor_float.mean()

#         summarized[key] = torch.mean(value).item()  # Example: using mean to summarize
#     return summarized

# # Assuming 'original_benchmarks' is the dictionary containing all the tensor outputs
# summarized_benchmarks = summarize_benchmarks(original_benchmarks)

# # Print the summarized benchmarks
# print("Summarized Benchmarks:")
# for key, value in summarized_benchmarks.items():
#     print(f"{key}: {value}")
#     print("\nOriginal Outputs:\n", original_outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Original Benchmarks:
 {'MMLU (0-shot, CoT)': tensor([[128000,  97284,   6773,    311,   5622,  27329,     13,   1283,    743,
            704,    369,  27329,    596,   3838,     13,  13538,    568,   2751,
           1070,    568,   5602,  39119,     11,    889,  96858,  27329,     13,
          53389,    430,  39119,    574,  27329,     11,  17200,   6689,    520,
          39119,     13,    578,   6689,  13942,  39119,    719,  28593,  11799,
             11,    889,    574,   1063,   6138,   3201,     13,  17200,   1047,
            539,   3970,  11799,     13,    763,    264,  32699,   1234,    264,
          35786,    430,   8882,  56236,    904,   4879,    311,   5379,  10102,
             11,    279,   9474,  14065,   1288,  13519,    430,    279,  10825,
          11996,   1161,      8,    574,   6458,    486,     30,    720,     32,
              8,  39119,    198,     33,      8,  39119,    323,  11799,    198,
             34,      8,    423,      8,  11799,    198,     35,

# Quantization

In [9]:
# from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

# # Quantization configuration (4-bit quantization)
# quant_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the quantized model
# quantized_model = LlamaForCausalLM.from_pretrained(
#     "meta-llama/Meta-Llama-3.1-8B",
#     quantization_config=quant_config,
#     device_map="auto"
#)

# Rebind the notebook-level `model` to the pre-quantized bnb 4-bit checkpoint.
# The calculate_* helpers read `model` and `tokenizer` as globals, so every
# benchmark call made after this cell runs against this checkpoint; the
# baseline model is no longer reachable under the name `model`.
model = LlamaForCausalLM.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
# Load the tokenizer from the same repo (this also replaces the tokenizer
# object the earlier pad-token cell modified).
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [14]:
def _select_benchmark_fn(name):
    """Return the calculate_* helper whose marker appears in ``name``.

    Markers are tested in a fixed order and the first hit wins. "MMLU" names
    are split further on whether they contain "5-shot".

    Raises:
        ValueError: if no marker occurs in ``name``.
    """
    if "MMLU" in name:
        if "5-shot" in name:
            return calculate_mmlu_pro_five_shot
        return calculate_mmlu_zero_shot

    routes = (
        ("HumanEval", calculate_human_eval_zero_shot),
        ("MBPP", calculate_mbpp_eval_zero_shot),
        ("IFEval", calculate_if_eval_zero_shot),
        ("GSM8K", calculate_gsm8k_eight_shot),
        ("Math", calculate_math_zero_shot),
        ("ARC", calculate_arc_challenge_zero_shot),
        ("GPQA", calculate_gpqa_zero_shot),
    )
    for marker, benchmark_fn in routes:
        if marker in name:
            return benchmark_fn
    raise ValueError(f"No benchmark helper matches prompt name {name!r}")


def calculate_benchmarks_quantized(model, tokenizer, prompts):
    """Run every prompt through its benchmark helper and decode the result.

    Args:
        model: Kept for interface compatibility. NOTE(review): the calculate_*
            helpers are called with the prompt only, so this argument does not
            choose the model they run; they use the notebook-level ``model``.
            Confirm callers rely on that.
        tokenizer: Used here only to decode the first generated sequence of
            each benchmark.
        prompts: Mapping of benchmark name to prompt text.

    Returns:
        A ``(quantized_benchmarks, outputs_dict)`` tuple: the raw helper
        results and the decoded first sequence, both keyed by benchmark name.

    Raises:
        ValueError: if a benchmark name matches no helper. Previously such a
            name silently reused the preceding benchmark's result, or raised
            ``UnboundLocalError`` when it came first.
    """
    quantized_benchmarks = {}
    outputs_dict = {}  # decoded text of the first returned sequence

    for name, prompt in prompts.items():
        outputs = _select_benchmark_fn(name)(prompt)
        quantized_benchmarks[name] = outputs
        # Only outputs[0] is decoded, even when a helper returns several sequences.
        outputs_dict[name] = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return quantized_benchmarks, outputs_dict

# Run benchmarks on the quantized model
# Evaluate every prompt; `model` and `tokenizer` are whatever the notebook
# currently binds to those names.
quantized_benchmarks, quantized_outputs = calculate_benchmarks_quantized(
    model=model,
    tokenizer=tokenizer,
    prompts=prompts,
)

# Dump the raw token ids first, then the decoded text.
print("Quantized Benchmarks:\n", quantized_benchmarks)
print("\nQuantized Outputs:\n", quantized_outputs)




Quantized Benchmarks:
 {'MMLU (0-shot, CoT)': tensor([[128000,  97284,   6773,    311,   5622,  27329,     13,   1283,    743,
            704,    369,  27329,    596,   3838,     13,  13538,    568,   2751,
           1070,    568,   5602,  39119,     11,    889,  96858,  27329,     13,
          53389,    430,  39119,    574,  27329,     11,  17200,   6689,    520,
          39119,     13,    578,   6689,  13942,  39119,    719,  28593,  11799,
             11,    889,    574,   1063,   6138,   3201,     13,  17200,   1047,
            539,   3970,  11799,     13,    763,    264,  32699,   1234,    264,
          35786,    430,   8882,  56236,    904,   4879,    311,   5379,  10102,
             11,    279,   9474,  14065,   1288,  13519,    430,    279,  10825,
          11996,   1161,      8,    574,   6458,    486,     30,    320,     32,
              8,   1193,  27329,     11,    320,     33,      8,   1193,  39119,
             11,    320,     34,      8,  27329,    323,  39119

# Comparing both quantized & non-quantized Benchmarks

In [None]:
# Compare benchmarks
# Print baseline vs. quantized results side by side: one pass over the raw
# generated token ids ("Benchmark"), then one pass over the decoded text
# ("Output"). `.get` yields None for any benchmark missing from a dict.
for label, baseline, quantized in (
    ("Benchmark", original_benchmarks, quantized_benchmarks),
    ("Output", original_outputs, quantized_outputs),
):
    for name in prompts:
        print(f"{label}: {name}")
        print(f"Original: {baseline.get(name)}")
        print(f"Quantized: {quantized.get(name)}\n")
