In [None]:
!pip install transformers
!pip install peft

In [None]:
!pip install -U bitsandbytes
!pip install torch

In [None]:
def generate_response(model, tokenizer, input_text, isTTM = False):
    """Generate text for ``input_text`` and return the decoded sequence.

    Args:
        model: Object exposing ``generate``; if it has a ``device`` attribute
            the encoded prompt is moved there before generation.
        tokenizer: Callable used to encode the prompt and, through
            ``decode``, to turn the generated ids back into text.
        input_text: Fully formatted prompt string.
        isTTM: When True, use 2-beam search with early stopping and at most
            512 new tokens. Otherwise sample (temperature 0.6, top-k 50,
            top-p 0.95) for at most 2048 new tokens.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        NOTE(review): for causal LMs this presumably still contains the
        prompt text; callers strip it with ``extract_final_answer``.
    """
    max_length = 512
    encoded = tokenizer(input_text, return_tensors="pt")

    # The tokenizer output is not on the model's device by default; generating
    # with inputs and weights on different devices fails or warns.
    device = getattr(model, "device", None)
    if device is not None and hasattr(encoded, "to"):
        encoded = encoded.to(device)

    generate_kwargs = {}
    # Forward the attention mask when the tokenizer produced one so that
    # generate() does not have to guess it from the pad token.
    attention_mask = encoded.get("attention_mask") if hasattr(encoded, "get") else None
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask

    if not isTTM:
        generate_kwargs.update(max_new_tokens=2048, temperature=0.6, do_sample=True, top_k=50, top_p=0.95)
    else:
        generate_kwargs.update(max_new_tokens=max_length, num_beams=2, early_stopping=True)

    outputs = model.generate(encoded['input_ids'], **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
def generate_output(text,max_cot = 1):
  """Answer ``text`` with a two-stage "internal thought" pipeline.

  For each of ``max_cot`` rounds:
    1. ``TTM_model`` is prompted with ``text`` to produce four step titles;
       the result is cleaned with ``extract_final_answer`` and appended to
       ``buffer`` as ``Internal_thought<k>``.
    2. ``model_finetuned`` is prompted with the newline-joined ``buffer``
       (question, thoughts and earlier outputs) to produce an answer, which
       is appended to ``buffer`` as ``<k>th Output``.

  Args:
    text: The formatted question (and choices) to answer.
    max_cot: Number of thought/answer rounds to run.

  Returns:
    The last round's output string, prefixed with ``"<k>th Output:\\n"``. It is
    returned exactly as decoded by ``generate_response`` (it is not passed
    through ``extract_final_answer``).

  Relies on the notebook globals ``TTM_model``, ``model_finetuned`` and
  ``tokenizer``.

  NOTE(review): with ``max_cot=0`` the loop never runs, ``final_output`` is
  never assigned, and the ``return`` raises UnboundLocalError.
  NOTE(review): the answer prompt closes its system section with
  ``<|end_header_id|>`` whereas the step prompt uses ``<|eot_id|>``; this
  looks unintended -- confirm against the format the adapter was trained on
  before changing it.
  """
  # buffer accumulates the question, every internal thought and every answer.
  buffer = []
  buffer.append(text)
  for i in range(max_cot):
    # Stage 1 prompt: built from `text` only, so it is the same in every round.
    input_text = f"""
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    Role: You are an Instructions Providing AI that generates tailored steps to solve the given question.

    Instructions:
    - Carefully analyze the provided question before generating steps.
    - Only generate **specific steps** that are relevant to solving this particular question.
    - Avoid using generic or repetitive steps such as "Identify key information" or "Verify the solution".
    - Focus on logical reasoning, calculations, or operations that are **directly necessary** to solve the question.
    - Give Exactly 4 steps.
    - Do not provide descriptions or explanations for the steps.
    - Only output the **step titles** relevant to the question at hand.
    - Follow the provided format:
      Step 1: [Tailored Step Title According to Question]
      Step 2: [Tailored Step Title According to Question]
      Step 3: [Tailored Step Title According to Question]
      Step 4: [Tailored Step Title According to Question]
    - Do not generate an answer to the question or hint at the solution.
    - Do not exceed 4 Steps
    Generate the steps based solely on the question below.
    <|eot_id|><|start_header_id|>user<|end_header_id|>"{text}"<|eot_id|>"""
    # isTTM=True selects the beam-search branch of generate_response.
    internal_thought = generate_response(TTM_model, tokenizer, input_text,isTTM = True)
    # Keep only the text after the last "assistant" marker.
    internal_thought = extract_final_answer(internal_thought)
    internal_thought = f"Internal_thought{i+1}:"+"\n"+internal_thought
    buffer.append(internal_thought)
    # Stage 2 prompt: everything gathered so far becomes the user message.
    prompt = '\n'.join(buffer)
    final_input = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    Role: You are a highly intelligent AI assistant specializing in mathematics and coding. Your expertise includes solving complex math problems, writing and debugging code, explaining mathematical concepts, and providing optimized solutions for coding challenges. When presented with a question or problem, you will: 1. Analyze the problem carefully. 2. Provide clear and concise explanations for your reasoning. 3. Offer step-by-step solutions for math and coding problems. 4. Generate clean, efficient, and well-commented code for programming tasks. You are expected to be accurate, logical, and detailed in your responses.
    Instruction:
    - Use the internal_thought to guide yourself to a correct answer and verify that it is correct before responding to the user.
    - Final output needs to be an answer for the question.
    - The last sentence needs to be the correct option for the question.
    - Provide the index of the correct option
    - Always provide the correct option number at the end
    - Follow the strictly the Structure of output:
        Explanation : Elaborate on steps in internal_thought provided
        Answer : Correct Answer
        Option : Correct Option number for the correct answer in the choices
    - Do not deviate from the format mentioned above
    - Option can only be any one value in 0,1,2,3 and should only be the option number
    - Do not hallucinate
    - Do not deviate from the instructions
    Example:
      Question : What is 1 + 2 ?
      Choices:
      0) 3
      1) 1
      2) 2
      3) 4
      Explanation : 1 + 2 adds to 3
      Answer : the answer is 3
      Option : 0
    <|end_header_id|>
    <|start_header_id|>user<|end_header_id|>"{prompt}"<|eot_id|>"""
    # Default (sampling) branch of generate_response.
    final_output = generate_response(model_finetuned, tokenizer, final_input)
    final_output = f"{i+1}th Output:\n" + final_output
    buffer.append(final_output)
  # Only the last round's answer is returned; earlier rounds live in `buffer`.
  return final_output

In [None]:
!pip install -U peft

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Checkpoint of the main answer model; the fine-tuned adapter is attached below.
model_id = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

# Use the GPU when one is visible, otherwise fall back to CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the checkpoint name ends in "bnb-4bit", so it is presumably
# already quantized and `load_in_4bit=True` may be redundant or deprecated in
# the installed transformers version -- confirm.
# NOTE(review): calling `.to()` on a bitsandbytes-quantized model is rejected by
# some transformers/bitsandbytes versions; `device_map` may be needed -- confirm.
model_finetuned = AutoModelForCausalLM.from_pretrained(model_id,load_in_4bit=True,torch_dtype = torch.bfloat16).to(torch_device)
# Attach the adapter weights stored under the Kaggle input directory.
model_finetuned.load_adapter('/kaggle/input/main-adapter')
# Second, smaller checkpoint used by generate_output to produce the step titles.
TTM_id = 'unsloth/Llama-3.2-1B-Instruct-bnb-4bit'
TTM_model = AutoModelForCausalLM.from_pretrained(TTM_id,load_in_4bit=True,torch_dtype = torch.bfloat16).to(torch_device)
TTM_model.load_adapter('/kaggle/input/adapter')
# A single tokenizer (loaded from model_id) is used for both models.
# Assumes the TTM checkpoint shares a compatible vocabulary -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
import pandas as pd

# Schema of the results table: three identifying fields followed by three metrics.
columns = ["Model Name", "Dataset", "Task"]            # model, benchmark, subset
columns += ["Accuracy", "BLEU Score", "ROUGE-L Score"]  # metrics recorded per run

# Start with an empty table; later cells append one row per evaluation.
evaluation_results = pd.DataFrame(columns=columns)

In [None]:
# Candidate baseline checkpoints. This list is not referenced anywhere else in
# the visible notebook.
models = [
    "unsloth/llama-3.1-8b-Instruct-bnb-4bit",
    "mistralai/Mistral-7B",
    "gemma2-9b",
    "qwen/Qwen-2-7B",
]
#temp_model = AutoModelForCausalLM.from_pretrained(models[0],load_in_4bit=True,torch_dtype = torch.bfloat16).to(torch_device)

In [None]:
!pip install datasets
!pip install rouge-score
!pip install nltk

In [None]:
# Switch both models to evaluation mode before running the benchmarks.
# The second call is the cell's last expression, so its return value is what
# the notebook displays.
TTM_model.eval()
model_finetuned.eval()

In [None]:
def extract_final_answer(output: str) -> str:
    """Return the text that follows the last "assistant" marker, trimmed.

    The match is case-sensitive (lowercase "assistant"). When the marker does
    not occur, the whole input is returned with surrounding whitespace removed.
    """
    # rpartition puts everything after the last marker in slot 2; when the
    # marker is absent that slot holds the entire input.
    return output.rpartition("assistant")[2].strip()

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import re
from datasets import load_dataset
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu


def evaluate_mmlu(name):
    """Evaluate the ``generate_output`` pipeline on one MMLU subset.

    For every test sample the question and its numbered choices are sent to
    ``generate_output``. The last number in the response is compared with the
    correct choice index (accuracy), and BLEU / ROUGE-L are computed between
    the correct choice text and the response.

    BLEU and ROUGE-L are the maximum over the whole response and every
    character window of the response that is as long as the reference. The
    original window loop iterated over ``range(len(output), len(ground_truth))``
    and therefore never ran, and it seeded the BLEU maximum with the ROUGE
    value; both are fixed here.

    Args:
        name: MMLU subset name passed to ``load_dataset("cais/mmlu", name)``.

    Returns:
        Tuple ``(accuracy, average_bleu, average_rouge)``.

    Raises:
        ValueError: If the selected split contains no samples.
    """
    print("Evaluating on MMLU...")

    # Load the requested MMLU subset (test split).
    dataset = load_dataset("cais/mmlu", name, split="test")

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []
    correct = 0
    total = 0

    for sample in tqdm(dataset, desc="Processing MMLU", total=len(dataset)):
        question = sample['question']
        choices = sample['choices']  # List of option texts
        answer = sample['answer']  # Correct choice index
        ground_truth = choices[answer]
        reference_tokens = [ground_truth.split()]

        # Number the choices 0..n-1 so the model can answer with an index.
        choice_lines = ''.join(f'{n}) {c}\n' for n, c in enumerate(choices))
        input_text = f"Question: {question}\nChoices: \n{choice_lines}"

        # Generate response
        output = generate_output(input_text).strip()
        number = extract_answer_number(output)

        # Score the full response first ...
        rouge_score = rouge.score(ground_truth, output)['rougeL'].fmeasure
        bleu_score = sentence_bleu(reference_tokens, output.split())

        # ... then every reference-length character window, keeping the best.
        window = len(ground_truth)
        if window:
            for start in range(len(output) - window + 1):
                snippet = output[start:start + window]
                rouge_score = max(rouge_score, rouge.score(ground_truth, snippet)['rougeL'].fmeasure)
                bleu_score = max(bleu_score, sentence_bleu(reference_tokens, snippet.split()))
        rouge_scores.append(rouge_score)
        bleu_scores.append(bleu_score)

        # Match output with options
        print(f'answer:{answer} generated:{number}')
        if answer == number:
            correct += 1

        total += 1

    if total == 0:
        raise ValueError(f"MMLU subset {name!r} returned no test samples")

    accuracy = correct / total
    average_bleu = sum(bleu_scores) / total
    average_rouge = sum(rouge_scores) / total
    print(f"MMLU Accuracy: {accuracy:.4f} ({correct}/{total})")
    print(f"MMLU Average BLEU Score: {average_bleu:.4f}")
    print(f"MMLU Average ROUGE-L Score: {average_rouge:.4f}")
    return accuracy, average_bleu, average_rouge

def evaluate_gsm8k():
    """Evaluate the ``generate_output`` pipeline on the GSM8K test split.

    Each question is sent to ``generate_output``. BLEU and ROUGE-L are computed
    between the reference answer text and the response, and accuracy compares
    the last number in the response with the reference's final number.

    The original never incremented ``total`` (division by zero) and returned an
    undefined ``accuracy``; both are fixed here.

    Returns:
        Tuple ``(accuracy, average_bleu, average_rouge)``.

    Raises:
        ValueError: If the split contains no samples.
    """
    print("Evaluating on GSM8K...")

    # Load GSM8K dataset
    dataset = load_dataset("gsm8k", "main", split="test")

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []
    correct = 0
    total = 0

    for sample in tqdm(dataset, desc="Processing GSM8K", total=len(dataset)):
        question = sample['question']
        ground_truth = sample['answer']  # Ground truth answer as string

        # Generate response
        input_text = f"Question: {question}\nAnswer:"
        output = generate_output(input_text).strip()

        # Calculate ROUGE and BLEU
        rouge_scores.append(rouge.score(ground_truth, output)['rougeL'].fmeasure)
        bleu_scores.append(sentence_bleu([ground_truth.split()], output.split()))

        # Assumes the reference ends with "#### <number>" (GSM8K convention --
        # confirm against the dataset card). Commas are dropped so that a value
        # such as "1,000" is read as one number.
        expected = extract_answer_number(ground_truth.split("####")[-1].replace(",", ""))
        predicted = extract_answer_number(output)
        if expected is not None and predicted == expected:
            correct += 1

        total += 1

    if total == 0:
        raise ValueError("GSM8K test split returned no samples")

    accuracy = correct / total
    average_bleu = sum(bleu_scores) / total
    average_rouge = sum(rouge_scores) / total
    print(f"GSM8K Accuracy: {accuracy:.4f} ({correct}/{total})")
    print(f"GSM8K Average BLEU Score: {average_bleu:.4f}")
    print(f"GSM8K Average ROUGE-L Score: {average_rouge:.4f}")
    return accuracy, average_bleu, average_rouge


def extract_answer_number(text):
    """Extract the last number from the model's text output.

    Integers and decimals (optionally signed) are recognised. The last one
    found is converted to float, rounded and returned as an int.

    Args:
        text: Text to search.

    Returns:
        The last number as an int, or None when the text contains no number.
    """
    matches = re.findall(r"[-+]?\d*\.?\d+", text)  # Find all integers or decimals
    if not matches:
        return None
    # Convert once and return silently; the caller already logs the value.
    return int(round(float(matches[-1])))

if __name__ == "__main__":
    # Run evaluations
    # Each call returns (accuracy, average BLEU, average ROUGE-L); the unpacked
    # names are read by the cell that appends rows to `evaluation_results`.
    mmlu_m_acc,mmlu_m_bleu,mmlu_m_rouge = evaluate_mmlu("college_mathematics")
    mmlu_cs_acc,mmlu_cs_bleu,mmlu_cs_rouge = evaluate_mmlu("college_computer_science")
    # GSM8K run is disabled. NOTE(review): evaluate_gsm8k returns three values,
    # so this two-name unpacking would fail if re-enabled as written.
    # gsm_bleu,gsm_rouge = evaluate_gsm8k()


In [None]:
# Append one row per MMLU subset for the chain-of-thought pipeline.
# Column order: Model Name, Dataset, Task, Accuracy, BLEU Score, ROUGE-L Score.
# Re-running this cell appends the same rows again.
evaluation_results.loc[len(evaluation_results)] = ["Llama-3.1-8B-Finetuned-COT","MMLU","College Mathematics",mmlu_m_acc,mmlu_m_bleu,mmlu_m_rouge]

evaluation_results.loc[len(evaluation_results)] = ["Llama-3.1-8B-Finetuned-COT","MMLU","College Computer Science", mmlu_cs_acc, mmlu_cs_bleu,mmlu_cs_rouge]

In [None]:
# Manual check of the pipeline on a single MMLU sample.
# NOTE: a notebook cell only displays its last expression, so the bare
# `evaluation_results.head()` and `dataset[4]` expressions below show nothing.
evaluation_results.head()

dataset = load_dataset("cais/mmlu", 'college_mathematics', split="test")
dataset[4]

# Build the same "Question / Choices" prompt that evaluate_mmlu uses.
question = dataset[4]['question']
choices = dataset[4]['choices']
temp = '\n'
for n,c in enumerate(choices):
    temp += f'{n}) {c}\n'
text = f"Question: {question}\nChoices: {temp}"

# Run the two-stage pipeline once and print its output.
res = generate_output(text)
print(res)

In [None]:
def base_model_output(model, text):
  """Answer ``text`` with a single call to ``model`` (no internal-thought stage).

  The prompt mirrors the answer prompt of ``generate_output`` without the
  internal_thought instruction. The decoded output is passed through
  ``extract_final_answer`` before being returned.

  Args:
    model: Model handed to ``generate_response``.
    text: The formatted question (and choices), inserted as the user message.

  Returns:
    The text after the last "assistant" marker in the decoded output.

  Relies on the notebook global ``tokenizer``.

  NOTE(review): the system section is closed with ``<|end_header_id|>``
  rather than ``<|eot_id|>``; this looks unintended -- confirm against the
  format the adapter was trained on before changing it.
  """
  final_input = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    Role: You are a highly intelligent AI assistant specializing in mathematics and coding. Your expertise includes solving complex math problems, writing and debugging code, explaining mathematical concepts, and providing optimized solutions for coding challenges. When presented with a question or problem, you will: 1. Analyze the problem carefully. 2. Provide clear and concise explanations for your reasoning. 3. Offer step-by-step solutions for math and coding problems. 4. Generate clean, efficient, and well-commented code for programming tasks. You are expected to be accurate, logical, and detailed in your responses.
    Instruction:
    - Final output needs to be an answer for the question.
    - The last sentence needs to be the correct option for the question.
    - Provide the index of the correct option
    - Always provide the correct option number at the end
    - Follow the strictly the Structure of output:
        Explanation : Elaborate on steps
        Answer : Correct Answer
        Option : Correct Option number for the correct answer in the choices
    - Do not deviate from the format mentioned above
    - Option can only be any one value in 0,1,2,3 and should only be the option number
    - Do not hallucinate
    - Do not deviate from the instructions
    Example:
      Question : What is 1 + 2 ?
      Choices:
      0) 3
      1) 1
      2) 2
      3) 4
      Explanation : 1 + 2 adds to 3
      Answer : the answer is 3
      Option : 0
    <|end_header_id|>
    <|start_header_id|>user<|end_header_id|>"{text}"<|eot_id|>"""

  # Default (sampling) branch of generate_response, then strip the prompt part.
  return extract_final_answer(generate_response(model, tokenizer, final_input))

In [None]:
import re
from datasets import load_dataset
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu


def evaluate_mmlu(name):
    """Evaluate ``base_model_output(model_finetuned, ...)`` on one MMLU subset.

    This definition replaces the earlier ``evaluate_mmlu`` in the notebook
    namespace; it differs only in calling ``base_model_output`` instead of
    ``generate_output``.

    For every test sample the last number in the response is compared with the
    correct choice index (accuracy), and BLEU / ROUGE-L are computed between
    the correct choice text and the response.

    BLEU and ROUGE-L are the maximum over the whole response and every
    character window of the response that is as long as the reference. The
    original window loop iterated over ``range(len(output), len(ground_truth))``
    and therefore never ran, and it seeded the BLEU maximum with the ROUGE
    value; both are fixed here.

    Args:
        name: MMLU subset name passed to ``load_dataset("cais/mmlu", name)``.

    Returns:
        Tuple ``(accuracy, average_bleu, average_rouge)``.

    Raises:
        ValueError: If the selected split contains no samples.
    """
    print("Evaluating on MMLU...")

    # Load the requested MMLU subset (test split).
    dataset = load_dataset("cais/mmlu", name, split="test")

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []
    correct = 0
    total = 0

    for sample in tqdm(dataset, desc="Processing MMLU", total=len(dataset)):
        question = sample['question']
        choices = sample['choices']  # List of option texts
        answer = sample['answer']  # Correct choice index
        ground_truth = choices[answer]
        reference_tokens = [ground_truth.split()]

        # Number the choices 0..n-1 so the model can answer with an index.
        choice_lines = ''.join(f'{n}) {c}\n' for n, c in enumerate(choices))
        input_text = f"Question: {question}\nChoices: \n{choice_lines}"

        # Generate response
        output = base_model_output(model_finetuned,input_text).strip()
        number = extract_answer_number(output)

        # Score the full response first ...
        rouge_score = rouge.score(ground_truth, output)['rougeL'].fmeasure
        bleu_score = sentence_bleu(reference_tokens, output.split())

        # ... then every reference-length character window, keeping the best.
        window = len(ground_truth)
        if window:
            for start in range(len(output) - window + 1):
                snippet = output[start:start + window]
                rouge_score = max(rouge_score, rouge.score(ground_truth, snippet)['rougeL'].fmeasure)
                bleu_score = max(bleu_score, sentence_bleu(reference_tokens, snippet.split()))
        rouge_scores.append(rouge_score)
        bleu_scores.append(bleu_score)

        # Match output with options
        print(f'answer:{answer} generated:{number}')
        if answer == number:
            correct += 1

        total += 1

    if total == 0:
        raise ValueError(f"MMLU subset {name!r} returned no test samples")

    accuracy = correct / total
    average_bleu = sum(bleu_scores) / total
    average_rouge = sum(rouge_scores) / total
    print(f"MMLU Accuracy: {accuracy:.4f} ({correct}/{total})")
    print(f"MMLU Average BLEU Score: {average_bleu:.4f}")
    print(f"MMLU Average ROUGE-L Score: {average_rouge:.4f}")
    return accuracy, average_bleu, average_rouge

def evaluate_gsm8k():
    """Evaluate ``base_model_output(model_finetuned, ...)`` on the GSM8K test split.

    This definition replaces the earlier ``evaluate_gsm8k`` in the notebook
    namespace; it differs only in calling ``base_model_output`` instead of
    ``generate_output``.

    BLEU and ROUGE-L are computed between the reference answer text and the
    response, and accuracy compares the last number in the response with the
    reference's final number.

    The original never incremented ``total`` (division by zero) and returned an
    undefined ``accuracy``; both are fixed here.

    Returns:
        Tuple ``(accuracy, average_bleu, average_rouge)``.

    Raises:
        ValueError: If the split contains no samples.
    """
    print("Evaluating on GSM8K...")

    # Load GSM8K dataset
    dataset = load_dataset("gsm8k", "main", split="test")

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []
    correct = 0
    total = 0

    for sample in tqdm(dataset, desc="Processing GSM8K", total=len(dataset)):
        question = sample['question']
        ground_truth = sample['answer']  # Ground truth answer as string

        # Generate response
        input_text = f"Question: {question}\nAnswer:"
        output = base_model_output(model_finetuned,input_text).strip()

        # Calculate ROUGE and BLEU
        rouge_scores.append(rouge.score(ground_truth, output)['rougeL'].fmeasure)
        bleu_scores.append(sentence_bleu([ground_truth.split()], output.split()))

        # Assumes the reference ends with "#### <number>" (GSM8K convention --
        # confirm against the dataset card). Commas are dropped so that a value
        # such as "1,000" is read as one number.
        expected = extract_answer_number(ground_truth.split("####")[-1].replace(",", ""))
        predicted = extract_answer_number(output)
        if expected is not None and predicted == expected:
            correct += 1

        total += 1

    if total == 0:
        raise ValueError("GSM8K test split returned no samples")

    accuracy = correct / total
    average_bleu = sum(bleu_scores) / total
    average_rouge = sum(rouge_scores) / total
    print(f"GSM8K Accuracy: {accuracy:.4f} ({correct}/{total})")
    print(f"GSM8K Average BLEU Score: {average_bleu:.4f}")
    print(f"GSM8K Average ROUGE-L Score: {average_rouge:.4f}")
    return accuracy, average_bleu, average_rouge

def extract_answer_number(text):
    """Return the last number found in ``text`` as a rounded int.

    Signed integers and decimals are recognised. Returns None when ``text``
    contains no number at all.
    """
    last_match = None
    # Walk every match and keep only the final one.
    for last_match in re.finditer(r"[-+]?\d*\.?\d+", text):
        pass
    if last_match is None:
        return None
    return int(round(float(last_match.group())))

if __name__ == "__main__":
    # Run evaluations
    # Uses the evaluate_mmlu defined in this cell (the base_model_output
    # variant). The "2"-suffixed names keep these results separate from the
    # earlier run and are read by the cell that appends rows to
    # `evaluation_results`.
    mmlu_m_acc2,mmlu_m_bleu2,mmlu_m_rouge2 = evaluate_mmlu("college_mathematics")
    mmlu_cs_acc2,mmlu_cs_bleu2,mmlu_cs_rouge2 = evaluate_mmlu("college_computer_science")
    # GSM8K run is disabled. NOTE(review): evaluate_gsm8k returns three values,
    # so this two-name unpacking would fail if re-enabled as written.
    # gsm_bleu2,gsm_rouge2 = evaluate_gsm8k()

In [None]:
# Append one row per MMLU subset for the direct (no internal-thought) run.
# Column order: Model Name, Dataset, Task, Accuracy, BLEU Score, ROUGE-L Score.
# Re-running this cell appends the same rows again.
evaluation_results.loc[len(evaluation_results)] = ["Llama-3.1-8B-Finetuned","MMLU","College Mathematics",mmlu_m_acc2,mmlu_m_bleu2,mmlu_m_rouge2]

evaluation_results.loc[len(evaluation_results)] = ["Llama-3.1-8B-Finetuned","MMLU","College Computer Science", mmlu_cs_acc2, mmlu_cs_bleu2,mmlu_cs_rouge2]

In [None]:
# Persist the results table to the working directory. `index=False` is not
# passed, so the row index is written as the first CSV column.
evaluation_results.to_csv('evaluations.csv')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded scores, ordered as in `metrics`: [Accuracy, BLEU, ROUGE].
# NOTE(review): these literals are not computed in the visible notebook --
# confirm which run they came from.
metrics = ['Accuracy', 'BLEU', 'ROUGE']
college_math_base = [0.26, 0.1298, 0.0313]
college_math_finetuned = [0.37, 0.115865, 0.013732]
# FIXME(review): `college_cs_base` and `college_cs_finetuned` are used below but
# are not defined anywhere in the visible notebook; on a fresh kernel the second
# panel raises NameError. Define them here, next to the math values.

# Create bar chart
x = np.arange(len(metrics))  # Positions for the metrics
width = 0.35  # Bar width

# One row, two panels: ax[0] for mathematics, ax[1] for computer science.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# College Mathematics
# The two series are offset by half a bar width so they sit side by side.
ax[0].bar(x - width/2, college_math_base, width, label='Base Model', color='skyblue')
ax[0].bar(x + width/2, college_math_finetuned, width, label='Fine-tuned Model', color='orange')
ax[0].set_title('College Mathematics')
ax[0].set_xticks(x)
ax[0].set_xticklabels(metrics)
ax[0].set_ylabel('Scores')
ax[0].legend()

# College Computer Science
ax[1].bar(x - width/2, college_cs_base, width, label='Base Model', color='skyblue')
ax[1].bar(x + width/2, college_cs_finetuned, width, label='Fine-tuned Model', color='orange')
ax[1].set_title('College Computer Science')
ax[1].set_xticks(x)
ax[1].set_xticklabels(metrics)
ax[1].set_ylabel('Scores')
ax[1].legend()

# Adjust layout and show plot
plt.tight_layout()
plt.show()

In [None]:
# Save the comparison figure created in the plotting cell. No file extension
# is given, so the output format is matplotlib's default.
# NOTE(review): 'comparision1' looks like a typo for 'comparison1'; left
# unchanged in case something else relies on this file name.
fig.savefig('comparision1')