In [2]:
# Install required packages
!pip install -U -q accelerate transformers einops datasets peft bitsandbytes evaluate

# Core libraries
import os
import re
import gc
import shutil
import time
from google.colab import files

# Data handling and evaluation
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
import evaluate
import torch


# Transformers and PEFT
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    pipeline,
    logging
)
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model

# Optimization
import bitsandbytes as bnb

# Progress bars
from tqdm.auto import tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [None]:
'''# Load the Base model
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

model.push_to_hub("phi-1_5-4b-quantized")'''

'#\xa0Load the Base model\nmodel_name = "microsoft/phi-1_5"\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\ntokenizer.pad_token = tokenizer.eos_token\n\n# Quantization config\nbnb_config = BitsAndBytesConfig(\n    load_in_4bit=True,\n    bnb_4bit_use_double_quant=True,\n    bnb_4bit_quant_type="nf4",\n    bnb_4bit_compute_dtype=torch.float16\n)\n\nmodel = AutoModelForCausalLM.from_pretrained(\n    "microsoft/phi-1_5",\n    device_map={"":0},\n    trust_remote_code=True,\n    quantization_config=bnb_config\n)\n\nmodel.push_to_hub("phi-1_5-4b-quantized")'

In [3]:
#### Load finetuned model

def load_finetuned_model(model_name, base_model_name, max_length):
  cleanup_memory()
  if model_name == "rk68/phi-1_5-4b-quantized":
    model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True, torch_dtype=torch.float32)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    peft_model=None
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
  else:
    model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True, torch_dtype=torch.float32)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    peft_model = PeftModel.from_pretrained(model, model_name, from_transformers=True)
    model = peft_model.merge_and_unload()
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
  return model, tokenizer, pipe

# more efficient code
def format_options(options):
    formatted_options = ', '.join(options)
    return formatted_options

def build_prompt_aquarat(question, options):
    formatted_options = format_options(options)
    prompt = f"Instruction: {question} options: {formatted_options}\nOutput: "
    return prompt

def llm_response(question, options, pipe):
    prompt = build_prompt_aquarat(question, options)
    result = pipe(prompt)
    llm_answer = result[0]['generated_text']
    return llm_answer

def llm_response_few_shot(prompt, pipe):
    result = pipe(prompt)
    llm_answer = result[0]['generated_text']
    return llm_answer

def build_prompt_aquarat(question, options, previous_examples=""):
    formatted_options = format_options(options)
    prompt = f"{previous_examples}Instruction: {question} options: {formatted_options}\nOutput: "
    return prompt


#### Response Generation

# Zero-shot case

def generate_responses_aquarat(dataset, n_samples, csv_name, pipe, seed=42):
    np.random.seed(seed)
    random_indices = np.random.choice(len(dataset), size=n_samples, replace=False)
    data = []

    for index in tqdm(random_indices, desc="Generating LLM responses"):
        sample = dataset[int(index)]
        question, options, correct = sample['question'], sample['options'], sample['correct']
        llm_answer = llm_response(question, options, pipe)
        data.append({'question': question, 'options': options, 'correct': correct, 'llm_answer': llm_answer})

    df = pd.DataFrame(data)
    df.to_csv(csv_name, index=False)
    return df

# Few shot simplified (letter only)

def generate_responses_simplified_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe, seed=42):
    np.random.seed(seed)
    random_indices = np.random.choice(len(dataset), size=n_samples, replace=False)
    data = []

    for index in tqdm(random_indices, desc="Generating LLM responses (Simplified Few-Shot)"):
        sample = dataset[int(index)]
        question, options, correct = sample['question'], sample['options'], sample['correct']

        # Select few-shot examples
        few_shot_indices = np.random.choice([i for i in range(len(dataset)) if i != index], size=n_shot, replace=False)
        simplified_few_shot_examples = ""
        for fs_index in few_shot_indices:
            fs_sample = dataset[int(fs_index)]
            fs_formatted_prompt = build_prompt_aquarat(fs_sample['question'], fs_sample['options'])
            simplified_few_shot_examples += fs_formatted_prompt + "Answer: " + fs_sample['correct'] + "\n\n"

        full_prompt = build_prompt_aquarat(question, options, simplified_few_shot_examples)
        llm_answer = llm_response_few_shot(full_prompt, pipe)
        data.append({'question': question, 'options': options, 'correct': correct, 'llm_answer': llm_answer, 'few_shot_prompt': full_prompt})

    df = pd.DataFrame(data)
    df.to_csv(csv_name, index=False)
    return df

# Few shot CoT

def generate_responses_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe, seed=42):
    np.random.seed(seed)
    random_indices = np.random.choice(len(dataset), size=n_samples, replace=False)
    data = []

    for index in tqdm(random_indices, desc="Generating LLM responses (Few-Shot)"):
        sample = dataset[int(index)]
        question, options, correct = sample['question'], sample['options'], sample['correct']

        # Select few-shot examples
        few_shot_indices = np.random.choice([i for i in range(len(dataset)) if i != index], size=n_shot, replace=False)
        few_shot_examples = ""
        for fs_index in few_shot_indices:
            fs_sample = dataset[int(fs_index)]
            fs_formatted_prompt = build_prompt_aquarat(fs_sample['question'], fs_sample['options'])
            few_shot_examples += fs_formatted_prompt + fs_sample['rationale'] + " Answer: " + fs_sample['correct'] + "\n\n"

        full_prompt = build_prompt_aquarat(question, options, few_shot_examples)
        llm_answer = llm_response_few_shot(full_prompt, pipe)
        data.append({'question': question, 'options': options, 'correct': correct, 'llm_answer': llm_answer, 'few_shot_prompt': full_prompt})

    df = pd.DataFrame(data)
    df.to_csv(csv_name, index=False)
    return df


#### Evaluation code

def eval_letter(text):
    pattern = r"(?:answer\s*:?|answer\s+is\s*:?|correct\s+option\s+is\s*:?)\s*([A-E])"
    matches = re.findall(pattern, text, re.IGNORECASE)
    if matches:
        return matches[-1].upper()
    return None

def evaluate_aquarat(df):
    df['ex_llm_answer'] = df['llm_answer'].apply(eval_letter)
    df['em_score'] = df.apply(lambda row: 1 if row['ex_llm_answer'] == row['correct'] else 0, axis=1)
    em_score = df['em_score'].mean()
    return em_score


### Generate and evaluate for each

def generate_and_evaluate_aquarat(dataset, n_samples, csv_name, pipe):
    responses_df = generate_responses_aquarat(dataset, n_samples=n_samples, csv_name=csv_name, pipe=pipe)
    em_score = evaluate_aquarat(responses_df)
    del responses_df
    gc.collect()
    return em_score

def generate_and_evaluate_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe):
    responses_df = generate_responses_few_shot_aquarat(dataset=dataset, n_samples=n_samples, n_shot=n_shot, csv_name=csv_name, pipe=pipe)
    em_score = evaluate_aquarat(responses_df)
    del responses_df
    gc.collect()
    return em_score

def generate_and_evaluate_simplified_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe):
    responses_df = generate_responses_simplified_few_shot_aquarat(dataset=dataset, n_samples=n_samples,n_shot=n_shot, csv_name=csv_name, pipe=pipe)
    em_score = evaluate_aquarat(responses_df)
    del responses_df
    gc.collect()
    return em_score

### Clean up memory

def cleanup_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()



In [None]:
def evaluate_models_on_dataset(dataset, models_to_test, n_test=100, responses_folder='llm_responses', eval_mode="zero_shot", n_shot=1, max_length=225, base_model="microsoft/phi-1_5"):
  cleanup_memory()
  em_scores_dict = {}
  responses_folder = 'llm_responses'
  os.makedirs(responses_folder, exist_ok=True)

  # Main evaluation loop
  for model_name in models_to_test:
      simplified_model_name = model_name.split('/')[-1]
      print(f"Testing: {simplified_model_name}")

      model, tokenizer, pipe = load_finetuned_model(model_name, base_model_name=base_model, max_length=max_length)

      file_name = f"{model_name.split('/')[-1]}_test.csv"
      file_path = os.path.join(responses_folder, file_name)

      if eval_mode=="zero_shot":
        em_score = generate_and_evaluate_aquarat(aqua_rat_data, n_test, file_path, pipe=pipe)
      elif eval_mode=="few_shot":
        em_score = generate_and_evaluate_simplified_few_shot_aquarat(aqua_rat_data, n_test, n_shot, file_path, pipe=pipe)
      elif eval_mode=="few_shot_cot":
        em_score = generate_and_evaluate_few_shot_aquarat(aqua_rat_data, n_test, n_shot, file_path, pipe=pipe)

      print(f"Accuracy: {em_score}")

      em_scores_dict[simplified_model_name.replace('_test.csv', '')] = em_score

      del model, tokenizer, pipe
      cleanup_memory()

  em_scores_csv_path = os.path.join(responses_folder, "em_scores.csv")
  em_scores_df = pd.DataFrame(list(em_scores_dict.items()), columns=['Model Name', 'EM Score'])
  em_scores_df.to_csv(em_scores_csv_path, index=False)
  shutil.make_archive(responses_folder, 'zip', responses_folder)
  files.download(responses_folder + '.zip')

In [None]:
cleanup_memory()
aqua_rat_data = load_dataset("aqua_rat", split="test")

# Define your models to test
models_to_test = [#"rk68/phi-1_5-finetuned-aqua-rat-1k",
                  "rk68/phi-1_5-finetuned-aqua-rat-2k",
                  "rk68/phi-1_5-finetuned-aqua-rat-5k"]
                  #"rk68/phi-1_5-finetuned-aqua-rat-10k",
                  #"rk68/phi-1_5-finetuned-aqua-rat-teacher-1k",
                  #"rk68/phi-1_5-finetuned-aqua-rat-teacher-2k",
                  #"rk68/phi-1_5-4b-quantized"]

#evaluate_models_on_dataset(aqua_rat_data, models_to_test, n_test=100, responses_folder='llm_responses', eval_mode="zero_shot", n_shot=1, max_length=225, base_model="microsoft/phi-1_5")
evaluate_models_on_dataset(aqua_rat_data, models_to_test, n_test=100, responses_folder='llm_responses', eval_mode="few_shot", n_shot=1, max_length=300, base_model="microsoft/phi-1_5")
#evaluate_models_on_dataset(aqua_rat_data, models_to_test, n_test=3, responses_folder='llm_responses', eval_mode="few_shot_cot", n_shot=1, max_length=450, base_model="microsoft/phi-1_5")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/97467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/254 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/254 [00:00<?, ? examples/s]

Testing: phi-1_5-finetuned-aqua-rat-2k


config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/44.1M [00:00<?, ?B/s]

Generating LLM responses (Simplified Few-Shot):   0%|          | 0/100 [00:00<?, ?it/s]

Accuracy: 0.2
Testing: phi-1_5-finetuned-aqua-rat-5k


adapter_config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/44.1M [00:00<?, ?B/s]

Generating LLM responses (Simplified Few-Shot):   0%|          | 0/100 [00:00<?, ?it/s]

Accuracy: 0.18


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
def eval_letter(text):
    pattern = r"(?:answer\s*:?|answer\s+is\s*:?|correct\s+option\s+is\s*:?)\s*([A-E])"
    matches = re.findall(pattern, text, re.IGNORECASE)
    if matches:
        return matches[-1].upper()
    return None

def evaluate_aquarat(df):
    df['ex_llm_answer'] = df['llm_answer'].apply(eval_letter)
    df['em_score'] = df.apply(lambda row: 1 if row['ex_llm_answer'] == row['correct'] else 0, axis=1)
    em_score = df['em_score'].mean()
    return em_score

In [17]:
def process_csv_files(directory_path):
    for file in os.listdir(directory_path):
        if file.endswith('.csv'):
            file_path = os.path.join(directory_path, file)
            df = pd.read_csv(file_path)
            em_score = evaluate_aquarat(df)
            print(f"{os.path.splitext(file)[0]}: {em_score}")

In [19]:
process_csv_files('/content/zero-shot')

phi-1_5-finetuned-aqua-rat-2k_test: 0.21
phi-1_5-finetuned-aqua-rat-teacher-1k_test: 0.1
phi-1_5-finetuned-aqua-rat-teacher-2k_test: 0.1
phi-1_5-finetuned-aqua-rat-10k_test: 0.11
phi-1_5-finetuned-aqua-rat-5k_test: 0.12
phi-1_5-finetuned-aqua-rat-1k_test: 0.07
phi-1_5-4b-quantized_test: 0.01


In [28]:
def eval_letter_n_shot(text, n_shot):
    pattern = r"(?:answer\s*:?|answer\s+is\s*:?|correct\s+option\s+is\s*:?)\s*([A-E])"
    matches = re.findall(pattern, text, re.IGNORECASE)
    # ignore the answers passed as n-shot examples
    if len(matches) > n_shot:
        return matches[-1].upper()
    return None

def evaluate_aquarat_n_shot(df, n_shot):
    df['ex_llm_answer'] = df['llm_answer'].apply(lambda text: eval_letter_n_shot(text, n_shot))
    df['em_score'] = df.apply(lambda row: 1 if row['ex_llm_answer'] == row['correct'] else 0, axis=1)
    em_score = df['em_score'].mean()
    return em_score

def process_csv_files_n_shot(directory_path, n_shot):
    for file in os.listdir(directory_path):
        if file.endswith('.csv'):
            file_path = os.path.join(directory_path, file)
            df = pd.read_csv(file_path)
            em_score = evaluate_aquarat_n_shot(df, n_shot=n_shot)
            print(f"{os.path.splitext(file)[0]}: {em_score}")


In [29]:
process_csv_files_n_shot('/content/one-shot', n_shot=1)

phi-1_5-finetuned-aqua-rat-2k_test-one-shot: 0.16
phi-1_5-finetuned-aqua-rat-5k_test-one-shot: 0.08
phi-1_5-4b-quantized_test-one-shot: 0.0
phi-1_5-finetuned-aqua-rat-teacher-2k_test-one-shot: 0.12
phi-1_5-finetuned-aqua-rat-1k-one-shotest: 0.09
phi-1_5-finetuned-aqua-rat-teacher-1k_test-one-shot: 0.12
