In [None]:
# Install required packages
!pip install -U -q accelerate transformers einops datasets peft bitsandbytes evaluate

# Core libraries
import os
import re
import gc
import shutil
import time


# Data handling and evaluation
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
import evaluate
import torch


# Transformers and PEFT
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    pipeline,
    logging
)
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model

# Optimization
import bitsandbytes as bnb

# Progress bars
from tqdm.auto import tqdm


In [None]:
# Disabled one-off setup step, kept for reference only: the entire cell body is a
# single triple-quoted string literal, so none of the code inside it is executed.
# The quoted code loads "microsoft/phi-1_5" with a 4-bit NF4 BitsAndBytesConfig
# (double quantisation, float16 compute) and pushes the result to the Hub as
# "phi-1_5-4b-quantized".
'''# Load the Base model
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

model.push_to_hub("phi-1_5-4b-quantized")'''

In [None]:
#### Load finetuned model

def load_finetuned_model(model_name, base_model_name, max_length, max_new_tokens):
  """Load the base model, optionally merge a PEFT adapter, and build a text-generation pipeline.

  Args:
    model_name: Hub id of the PEFT adapter to load on top of the base model.
      The id "rk68/phi-1_5-4b-quantized" is special-cased: no adapter is
      applied and the base model is used as loaded.
    base_model_name: Hub id used for both the causal LM and its tokenizer.
    max_length: Passed to the pipeline as ``max_length``; only used when
      ``max_new_tokens`` is falsy.
    max_new_tokens: Passed to the pipeline as ``max_new_tokens``; takes
      precedence over ``max_length`` whenever it is truthy.

  Returns:
    Tuple ``(model, tokenizer, pipe)``.
  """
  cleanup_memory()

  # Base model and tokenizer are loaded identically for every model_name.
  model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True, torch_dtype=torch.float32)
  tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
  tokenizer.pad_token = tokenizer.eos_token

  # Every id except the special-cased one is treated as an adapter that is
  # merged into the base weights before generation.
  if model_name != "rk68/phi-1_5-4b-quantized":
    adapter_model = PeftModel.from_pretrained(model, model_name, from_transformers=True)
    model = adapter_model.merge_and_unload()

  # Exactly one length limit is forwarded to the pipeline.
  if max_new_tokens:
    length_kwargs = {"max_new_tokens": max_new_tokens}
  else:
    length_kwargs = {"max_length": max_length}
  pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, **length_kwargs)
  return model, tokenizer, pipe

# more efficient code
def format_options(options):
    """Join the answer options into a single comma-separated string."""
    return ', '.join(options)


def build_prompt_aquarat(question, options, previous_examples=""):
    """Build the "Instruction: ... options: ...\\nOutput: " prompt for one question.

    Args:
        question: Question text.
        options: Iterable of option strings; joined with ", ".
        previous_examples: Text placed verbatim in front of the prompt
            (used for few-shot demonstrations). Defaults to no prefix.

    Returns:
        The prompt string, ending in "Output: ".
    """
    formatted_options = format_options(options)
    return f"{previous_examples}Instruction: {question} options: {formatted_options}\nOutput: "


def llm_response_few_shot(prompt, pipe):
    """Run `pipe` on an already-built prompt and return result[0]['generated_text']."""
    result = pipe(prompt)
    return result[0]['generated_text']


def llm_response(question, options, pipe):
    """Build the zero-shot prompt for (question, options) and return the pipeline's generated text."""
    return llm_response_few_shot(build_prompt_aquarat(question, options), pipe)


#### Response Generation

# Zero-shot case

def generate_responses_aquarat(dataset, n_samples, csv_name, pipe, seed=42):
    """Generate zero-shot answers for a random subset of `dataset` and save them.

    Seeds NumPy's global RNG with `seed`, draws `n_samples` distinct indices,
    queries the pipeline once per drawn sample and writes the collected rows
    (question, options, correct, llm_answer) to `csv_name`.

    Returns:
        The DataFrame that was written to disk.
    """
    np.random.seed(seed)
    sampled_positions = np.random.choice(len(dataset), size=n_samples, replace=False)

    records = []
    for position in tqdm(sampled_positions, desc="Generating LLM responses"):
        example = dataset[int(position)]
        question = example['question']
        options = example['options']
        correct = example['correct']
        generated = llm_response(question, options, pipe)
        records.append({
            'question': question,
            'options': options,
            'correct': correct,
            'llm_answer': generated,
        })

    responses = pd.DataFrame(records)
    responses.to_csv(csv_name, index=False)
    return responses

# Few shot simplified (letter only)

def _generate_few_shot_responses(dataset, n_samples, n_shot, csv_name, pipe, seed, include_rationale, desc):
    """Shared few-shot generation loop used by both public few-shot entry points.

    Seeds NumPy's global RNG, draws `n_samples` distinct evaluation indices and,
    for each one, draws `n_shot` demonstration indices from the remaining rows
    of the same dataset. Each demonstration is the formatted prompt followed by
    either "Answer: <letter>" or, when `include_rationale` is true,
    "<rationale> Answer: <letter>". Rows are written to `csv_name`.

    Returns:
        DataFrame with columns question, options, correct, llm_answer,
        few_shot_prompt.
    """
    np.random.seed(seed)
    random_indices = np.random.choice(len(dataset), size=n_samples, replace=False)
    data = []

    for index in tqdm(random_indices, desc=desc):
        sample = dataset[int(index)]
        question, options, correct = sample['question'], sample['options'], sample['correct']

        # Demonstrations come from every row except the one being evaluated.
        candidate_indices = [i for i in range(len(dataset)) if i != index]
        few_shot_indices = np.random.choice(candidate_indices, size=n_shot, replace=False)

        few_shot_examples = ""
        for fs_index in few_shot_indices:
            fs_sample = dataset[int(fs_index)]
            fs_formatted_prompt = build_prompt_aquarat(fs_sample['question'], fs_sample['options'])
            if include_rationale:
                few_shot_examples += fs_formatted_prompt + fs_sample['rationale'] + " Answer: " + fs_sample['correct'] + "\n\n"
            else:
                few_shot_examples += fs_formatted_prompt + "Answer: " + fs_sample['correct'] + "\n\n"

        full_prompt = build_prompt_aquarat(question, options, few_shot_examples)
        llm_answer = llm_response_few_shot(full_prompt, pipe)
        data.append({'question': question, 'options': options, 'correct': correct, 'llm_answer': llm_answer, 'few_shot_prompt': full_prompt})

    df = pd.DataFrame(data)
    df.to_csv(csv_name, index=False)
    return df


def generate_responses_simplified_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe, seed=42):
    """Few-shot generation where each demonstration shows only the answer letter.

    Returns the DataFrame of responses, which is also written to `csv_name`.
    """
    return _generate_few_shot_responses(
        dataset, n_samples, n_shot, csv_name, pipe, seed,
        include_rationale=False,
        desc="Generating LLM responses (Simplified Few-Shot)",
    )

# Few shot CoT

def generate_responses_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe, seed=42):
    """Few-shot generation where each demonstration includes its rationale before the answer letter.

    Returns the DataFrame of responses, which is also written to `csv_name`.
    """
    return _generate_few_shot_responses(
        dataset, n_samples, n_shot, csv_name, pipe, seed,
        include_rationale=True,
        desc="Generating LLM responses (Few-Shot)",
    )


#### Evaluation code

def eval_letter(text):
    """Extract the last answer letter (A-E) announced in `text`.

    Recognises "answer", "answer is" and "correct option is" (optionally
    followed by a colon), case-insensitively, followed by a single letter A-E.
    The letter must be a whole word, so phrases such as "the answer can be"
    or "answer depends" are not mistaken for options C or D.

    Args:
        text: Generated text. Non-string values (e.g. NaN for an empty CSV
            cell) are treated as "no answer".

    Returns:
        The upper-cased letter of the last match, or None when there is none.
    """
    if not isinstance(text, str):
        return None
    pattern = r"(?:answer\s*:?|answer\s+is\s*:?|correct\s+option\s+is\s*:?)\s*([A-E])\b"
    matches = re.findall(pattern, text, re.IGNORECASE)
    if matches:
        return matches[-1].upper()
    return None

def evaluate_aquarat(df):
    """Score a responses frame by exact match on the extracted answer letter.

    Adds two columns to `df` in place: 'ex_llm_answer' (letter extracted from
    'llm_answer') and 'em_score' (1 when it equals 'correct', else 0).

    Returns:
        Mean of 'em_score' (NaN for an empty frame).
    """
    df['ex_llm_answer'] = df['llm_answer'].apply(eval_letter)
    # Vectorised comparison; unlike a row-wise apply it also works on an empty frame.
    df['em_score'] = (df['ex_llm_answer'] == df['correct']).astype(int)
    return df['em_score'].mean()


### Generate and evaluate for each

def generate_and_evaluate_aquarat(dataset, n_samples, csv_name, pipe):
    """Run zero-shot generation, score it, and return the exact-match score."""
    score = evaluate_aquarat(
        generate_responses_aquarat(dataset, n_samples=n_samples, csv_name=csv_name, pipe=pipe)
    )
    # The responses frame is no longer referenced here; collect garbage eagerly.
    gc.collect()
    return score

def generate_and_evaluate_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe):
    """Run few-shot generation with rationales, score it, and return the exact-match score."""
    score = evaluate_aquarat(
        generate_responses_few_shot_aquarat(
            dataset=dataset, n_samples=n_samples, n_shot=n_shot, csv_name=csv_name, pipe=pipe
        )
    )
    # The responses frame is no longer referenced here; collect garbage eagerly.
    gc.collect()
    return score

def generate_and_evaluate_simplified_few_shot_aquarat(dataset, n_samples, n_shot, csv_name, pipe):
    """Run letter-only few-shot generation, score it, and return the exact-match score."""
    score = evaluate_aquarat(
        generate_responses_simplified_few_shot_aquarat(
            dataset=dataset, n_samples=n_samples, n_shot=n_shot, csv_name=csv_name, pipe=pipe
        )
    )
    # The responses frame is no longer referenced here; collect garbage eagerly.
    gc.collect()
    return score

### Clean up memory

def cleanup_memory():
    """Run Python garbage collection, then release cached CUDA memory if a GPU is available.

    The collection runs first so that blocks belonging to objects it frees are
    already back in PyTorch's cache when `empty_cache()` is called.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



In [None]:
def evaluate_models_on_dataset(dataset, models_to_test, n_test=100, responses_folder='llm_responses', eval_mode="zero_shot", n_shot=1, max_length=225, max_new_tokens=250, base_model="microsoft/phi-1_5"):
    """Evaluate each model in `models_to_test` on `dataset` and save responses and scores.

    For every model a CSV "<model>_test.csv" with the raw responses is written
    into `responses_folder`; afterwards "em_scores.csv" with one score per
    model is added and the folder is zipped to "<responses_folder>.zip".

    Args:
        dataset: Dataset to sample questions (and few-shot examples) from.
        models_to_test: Iterable of model ids handed to `load_finetuned_model`.
        n_test: Number of questions to sample per model.
        responses_folder: Output directory; created if missing. It is used as
            given, so separate runs can write to separate folders.
        eval_mode: "zero_shot", "few_shot" (letter-only demonstrations) or
            "few_shot_cot" (demonstrations with rationale).
        n_shot: Number of demonstrations for the few-shot modes.
        max_length: Forwarded to `load_finetuned_model`, which only uses it
            when `max_new_tokens` is falsy (e.g. pass `max_new_tokens=None`).
        max_new_tokens: Forwarded to `load_finetuned_model`.
        base_model: Base model id forwarded to `load_finetuned_model`.

    Raises:
        ValueError: If `eval_mode` is not one of the supported modes.
    """
    # Fail before loading any model instead of hitting an undefined (or stale) score later.
    if eval_mode not in ("zero_shot", "few_shot", "few_shot_cot"):
        raise ValueError(
            f"Unknown eval_mode {eval_mode!r}; expected 'zero_shot', 'few_shot' or 'few_shot_cot'"
        )

    cleanup_memory()
    em_scores_dict = {}
    os.makedirs(responses_folder, exist_ok=True)

    # Main evaluation loop
    for model_name in models_to_test:
        simplified_model_name = model_name.split('/')[-1]
        print(f"Testing: {simplified_model_name}")

        model, tokenizer, pipe = load_finetuned_model(model_name, base_model_name=base_model, max_length=max_length, max_new_tokens=max_new_tokens)

        try:
            file_name = f"{simplified_model_name}_test.csv"
            file_path = os.path.join(responses_folder, file_name)

            if eval_mode == "zero_shot":
                em_score = generate_and_evaluate_aquarat(dataset, n_test, file_path, pipe=pipe)
            elif eval_mode == "few_shot":
                em_score = generate_and_evaluate_simplified_few_shot_aquarat(dataset, n_test, n_shot, file_path, pipe=pipe)
            else:  # "few_shot_cot"
                em_score = generate_and_evaluate_few_shot_aquarat(dataset, n_test, n_shot, file_path, pipe=pipe)

            print(f"Accuracy: {em_score}")

            em_scores_dict[simplified_model_name] = em_score
        finally:
            # Drop this model before the next one is loaded, even if generation failed.
            del model, tokenizer, pipe
            cleanup_memory()

    # Save the evaluation metrics to a CSV file
    em_scores_csv_path = os.path.join(responses_folder, "em_scores.csv")
    em_scores_df = pd.DataFrame(list(em_scores_dict.items()), columns=['Model Name', 'EM Score'])
    em_scores_df.to_csv(em_scores_csv_path, index=False)

    # Create a zip archive of the responses folder
    shutil.make_archive(responses_folder, 'zip', responses_folder)

    # The zip file is now saved locally, no need for a download command.
    print(f"Archive created at {responses_folder}.zip")

In [None]:
# Driver cell: load the AQuA-RAT test split and evaluate the selected adapters
# in the few-shot and few-shot-CoT modes.
cleanup_memory()
aqua_rat_data = load_dataset("aqua_rat", split="test")

# Define your models to test
models_to_test = ['rk68/phi-1_5-finetuned-aqua-rat-AM-2k',
                 'rk68/phi-1_5-finetuned-aqua-rat-AM-1k']

# Other model ids used in earlier runs (currently disabled):
#   'rk68/phi-1_5-finetuned-aqua-rat-AM-0.5k'
#   'rk68/phi-1_5-finetuned-aqua-rat-AM-1k'
#   'rk68/phi-1_5-finetuned-aqua-rat-AM-2k'
#   'rk68/phi-1_5-finetuned-aqua-rat-AM-3k'
#   'rk68/phi-1_5-finetuned-aqua-rat-AM-4k'
#   'rk68/phi-1_5-finetuned-aqua-rat-AM-5k'
#   'rk68/phi-1_5-finetuned-aqua-rat-teacher-AM-1k'
#   'rk68/phi-1_5-finetuned-aqua-rat-teacher-AM-2k'
#   'rk68/phi-1_5-finetuned-aqua-rat-2k'
#   'rk68/phi-1_5-finetuned-aqua-rat-3k'
#   'rk68/phi-1_5-finetuned-aqua-rat-4k'
#   'rk68/phi-1_5-finetuned-aqua-rat-5k'
#   'rk68/phi-1_5-finetuned-aqua-rat-10k'
#   'rk68/phi-1_5-finetuned-aqua-rat-teacher-1k'
#   'rk68/phi-1_5-finetuned-aqua-rat-teacher-2k'
#   'rk68/phi-1-finetuned-aqua-rat-2k'
#   'rk68/phi-1-finetuned-aqua-rat-teacher-2k'
#   'rk68/phi-1_5-4b-quantized'

# Zero-shot run (currently disabled):
#evaluate_models_on_dataset(aqua_rat_data, models_to_test, n_test=254, responses_folder='llm_responses1', eval_mode="zero_shot", n_shot=1, max_length=150, base_model="microsoft/phi-1_5")
# NOTE(review): evaluate_models_on_dataset defaults max_new_tokens to 250 and
# load_finetuned_model prefers max_new_tokens whenever it is truthy, so the
# max_length=250 passed in the next call appears to have no effect — confirm.
evaluate_models_on_dataset(aqua_rat_data, models_to_test, n_test=254, responses_folder='llm_responses2', eval_mode="few_shot", n_shot=1, max_length=250, base_model="microsoft/phi-1_5")
evaluate_models_on_dataset(aqua_rat_data, models_to_test, n_test=254, responses_folder='llm_responses3', eval_mode="few_shot_cot", n_shot=1, max_new_tokens=300, base_model="microsoft/phi-1_5")

In [None]:
# NOTE(review): eval_letter / evaluate_aquarat are also defined in the generation
# cell earlier in this notebook; this copy is presumably kept so the analysis
# cells below can run without executing the model cells — confirm, and keep the
# two copies in sync.
def eval_letter(text):
    """Extract the last answer letter (A-E) announced in `text`.

    Recognises "answer", "answer is" and "correct option is" (optionally
    followed by a colon), case-insensitively, followed by a single letter A-E.
    The letter must be a whole word, so phrases such as "the answer can be"
    or "answer depends" are not mistaken for options C or D.

    Args:
        text: Generated text. Non-string values (e.g. NaN for an empty CSV
            cell) are treated as "no answer".

    Returns:
        The upper-cased letter of the last match, or None when there is none.
    """
    if not isinstance(text, str):
        return None
    pattern = r"(?:answer\s*:?|answer\s+is\s*:?|correct\s+option\s+is\s*:?)\s*([A-E])\b"
    matches = re.findall(pattern, text, re.IGNORECASE)
    if matches:
        return matches[-1].upper()
    return None

def evaluate_aquarat(df):
    """Score a responses frame by exact match on the extracted answer letter.

    Adds two columns to `df` in place: 'ex_llm_answer' (letter extracted from
    'llm_answer') and 'em_score' (1 when it equals 'correct', else 0).

    Returns:
        Mean of 'em_score' (NaN for an empty frame).
    """
    df['ex_llm_answer'] = df['llm_answer'].apply(eval_letter)
    # Vectorised comparison; unlike a row-wise apply it also works on an empty frame.
    df['em_score'] = (df['ex_llm_answer'] == df['correct']).astype(int)
    return df['em_score'].mean()

In [None]:
import os
import pandas as pd
from pandas.errors import EmptyDataError

def process_csv_files(directory_path):
    """Print the zero-shot exact-match score of every CSV file in `directory_path`.

    Files are visited in sorted name order so the printed report is the same
    on every run and platform (os.listdir order is arbitrary). For each file
    one line "<file stem>: <score>" is printed; problems with a single file
    are printed and do not stop the remaining files from being processed.

    Args:
        directory_path: Directory containing the response CSVs.
    """
    for file in sorted(os.listdir(directory_path)):
        if file.endswith('.csv'):
            file_path = os.path.join(directory_path, file)
            try:
                df = pd.read_csv(file_path)
                if df.empty:
                    raise ValueError(f"DataFrame is empty for file: {file}")
                em_score = evaluate_aquarat(df)
                print(f"{os.path.splitext(file)[0]}: {em_score}")
            except EmptyDataError:
                # Raised by read_csv for a file with no header/columns at all.
                print(f"EmptyDataError: No columns to parse from file {file}")
            except ValueError as e:
                print(e)
            except Exception as e:
                print(f"Error processing file {file}: {e}")



In [None]:
def eval_letter_n_shot(text, n_shot):
    """Extract the model's answer letter from text that also contains few-shot answers.

    Uses the same phrases as the zero-shot extractor ("answer", "answer is",
    "correct option is", case-insensitive) followed by a whole-word letter A-E.
    A letter is only returned when there are more matches than `n_shot`, i.e.
    more than the answers assumed to come from the in-prompt examples.

    Args:
        text: Generated text. Non-string values (e.g. NaN for an empty CSV
            cell) are treated as "no answer".
        n_shot: Number of matches attributed to the few-shot examples.

    Returns:
        The upper-cased letter of the last match, or None.
    """
    if not isinstance(text, str):
        return None
    pattern = r"(?:answer\s*:?|answer\s+is\s*:?|correct\s+option\s+is\s*:?)\s*([A-E])\b"
    matches = re.findall(pattern, text, re.IGNORECASE)
    # ignore the answers passed as n-shot examples
    if len(matches) > n_shot:
        return matches[-1].upper()
    return None

def evaluate_aquarat_n_shot(df, n_shot):
    """Score a few-shot responses frame by exact match on the extracted answer letter.

    Adds 'ex_llm_answer' and 'em_score' columns to `df` in place.

    Returns:
        Mean of 'em_score' (NaN for an empty frame).
    """
    df['ex_llm_answer'] = df['llm_answer'].apply(lambda text: eval_letter_n_shot(text, n_shot))
    # Vectorised comparison; unlike a row-wise apply it also works on an empty frame.
    df['em_score'] = (df['ex_llm_answer'] == df['correct']).astype(int)
    return df['em_score'].mean()

def process_csv_files_n_shot(directory_path, n_shot):
    """Print the few-shot exact-match score of every CSV file in `directory_path`.

    Files are visited in sorted name order so the printed report is the same
    on every run and platform (os.listdir order is arbitrary). For each file
    one line "<file stem>: <score>" is printed; any error for a single file is
    printed and the remaining files are still processed.

    Args:
        directory_path: Directory containing the response CSVs.
        n_shot: Forwarded to `evaluate_aquarat_n_shot`.
    """
    for file in sorted(os.listdir(directory_path)):
        if file.endswith('.csv'):
            file_path = os.path.join(directory_path, file)
            try:
                df = pd.read_csv(file_path)
                if df.empty:
                    raise ValueError("DataFrame is empty")
                em_score = evaluate_aquarat_n_shot(df, n_shot=n_shot)
                print(f"{os.path.splitext(file)[0]}: {em_score}")
            except Exception as e:
                print(f"Error processing file {file}: {e}")


In [None]:
# Report exact-match scores from the saved response CSVs for the two result
# groups ("Gemma teacher" and "Aqua rationale"). Each process_* call prints one
# "<file stem>: <score>" line per CSV found in the given directory.
print('Gemma teacher')
print('zero-shot:')
process_csv_files('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/zero-shot')
print('one-shot:')
# n_shot=1: a letter is only scored when more than one answer match is present.
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/one-shot/', n_shot=1)
print('one-shot-cot')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/one-shot-cot/', n_shot=1)
print()
print('Aqua rationale')
print('zero-shot')
process_csv_files('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/zero-shot')
print('one-shot')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/one-shot/', n_shot=1)
print('one-shot-cot')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/one-shot-cot/', n_shot=1)

In [None]:
# Report exact-match scores for the "aqua-rationale/vary-r" result folders.
print('Aqua rationale, vary R:')
print('zero-shot:')
process_csv_files('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/vary-r/zero-shot')
print('one-shot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/vary-r/one-shot', n_shot=1)
print('one-shot-cot:')
# NOTE(review): n_shot=2 here, while every other one-shot-cot call in this
# notebook uses n_shot=1 — confirm whether this is intentional (it requires at
# least three answer matches before a letter is scored).
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/vary-r/one-shot-cot', n_shot=2)

In [None]:
# Report exact-match scores for the "aqua-rationale/vary-alpha" result folders.
print('AQUA-RAT, vary alpha:')
print('zero-shot:')
process_csv_files('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/vary-alpha/zero-shot')
print('one-shot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/vary-alpha/one-shot', n_shot=1)
print('one-shot-cot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/vary-alpha/one-shot-cot', n_shot=1)

In [None]:
# Report exact-match scores for the "gemma-teacher/vary-r" result folders.
print('Gemma, vary R:')
print('zero-shot:')
process_csv_files('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/vary-r/zero-shot')
print('one-shot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/vary-r/one-shot', n_shot=1)
print('one-shot-cot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/vary-r/one-shot-cot', n_shot=1)

In [None]:
# Report exact-match scores for the "gemma-teacher/vary-alpha" result folders.
print('Gemma, vary alpha:')
print('zero-shot:')
process_csv_files('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/vary-alpha/zero-shot')
print('one-shot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/vary-alpha/one-shot', n_shot=1)
print('one-shot-cot:')
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/gemma-teacher/vary-alpha/one-shot-cot', n_shot=1)

Main results on 254 samples:


# QLoRA + KD
 
All layers:
- teacher 2k 0 shot: 0.20472
- teacher 2k 1 shot: 0.21653
- teacher 2k 1 shot CoT: 0.240157

- teacher 1k 0 shot: 0.1929133
- teacher 1k 1 shot: 0.2204724
- teacher 1k 1 shot CoT: (value not recorded)

- teacher 0.5k 0 shot: 0.19291
- teacher 0.5k 1 shot: 0.228346
- teacher 0.5k 1 shot CoT: 0.2362204


Attention layers:
- teacher 2k 0 shot: 0.149606
- teacher 2k 1 shot: 0.196850
- teacher 2k 1 shot CoT: (value not recorded)


- teacher 1k 0 shot: 0.161417
- teacher 1k 1 shot: 0.200787
- teacher 1k 1 shot CoT: 0.2480314


# QLoRA + AQuA-RAT

All layers: 

In [None]:
# Zero-shot scoring (no n-shot match threshold) of the CSVs in the "254 samples/teacher" folder.
process_csv_files('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/254 samples/teacher')

In [None]:
# Few-shot scoring (n_shot=1) of the same "254 samples/teacher" folder.
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/254 samples/teacher', n_shot=1)

In [None]:
# NOTE(review): this call is identical to the one in the previous cell (same
# folder, n_shot=1) — confirm whether a different folder or n_shot was intended.
process_csv_files_n_shot('AQUA-FT-RESULT-QLORA-AM/aqua-rationale/254 samples/teacher', n_shot=1)