In [None]:

!pip install transformers
from huggingface_hub import notebook_login
notebook_login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Create an evaluation dataset and save it as .jsonl

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import json

# Both checkpoints are loaded with the same weight options:
# float16 weights and device_map="auto".
_LOAD_KWARGS = {"torch_dtype": torch.float16, "device_map": "auto"}

# Model under evaluation ("Vortex"): tokenizer, weights, then generation pipeline.
vortex_model_name = "saishshinde15/TBH.AI_Valhala"
vortex_tokenizer = AutoTokenizer.from_pretrained(vortex_model_name)
vortex_model = AutoModelForCausalLM.from_pretrained(vortex_model_name, **_LOAD_KWARGS)
vortex_pipeline = pipeline(task="text-generation", tokenizer=vortex_tokenizer, model=vortex_model)

# Judge model (Llama 1B Instruct): same three steps.
judge_model_name = "meta-llama/Llama-3.2-1B-Instruct"
judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
judge_model = AutoModelForCausalLM.from_pretrained(judge_model_name, **_LOAD_KWARGS)
judge_pipeline = pipeline(task="text-generation", tokenizer=judge_tokenizer, model=judge_model)

# Load the evaluation set: a JSON Lines file, one JSON object per line.
# NOTE(review): absolute path under /content (presumably the Colab working
# directory) — confirm before running elsewhere.
dataset_path = "/content/evaluation_dataset.jsonl"
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(dataset_path, "r", encoding="utf-8") as f:
    # Skip whitespace-only lines (e.g. a blank line at the end of the file);
    # json.loads would raise JSONDecodeError on them.
    eval_data = [json.loads(line) for line in f if line.strip()]

# Evaluation loop: generate an answer for each question, then have the judge grade it.
results = []
for entry in eval_data:
    question = entry["question"]

    # Get answer from Vortex model.
    # return_full_text=False keeps only the newly generated text; by default the
    # pipeline returns prompt + continuation, so the "answer" would begin with the
    # question itself and the question would appear twice in the judge prompt.
    vortex_answer = vortex_pipeline(
        question, max_new_tokens=200, truncation=True, return_full_text=False
    )[0]["generated_text"].strip()

    # Format prompt for LLM Judge
    judge_prompt = (f"Question: {question}\nAnswer: {vortex_answer}\nJudge: Evaluate the correctness, coherence, and completeness of the answer."
                    " Provide a score between 0 and 10 along with a short justification.")

    # Get judgment from Llama 1B.
    # Same reasoning: store only the judge's verdict, not an echo of the prompt.
    judge_response = judge_pipeline(
        judge_prompt, max_new_tokens=200, truncation=True, return_full_text=False
    )[0]["generated_text"].strip()

    # Store result
    results.append({"question": question, "answer": vortex_answer, "judgment": judge_response})

# Persist the collected records as JSON Lines: one serialized object per line.
results_path = "evaluation_results.jsonl"
with open(results_path, "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in results)

print(f"Evaluation completed! Results saved to {results_path}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


## With ROUGE and BLEU scores

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_metric
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Hugging Face Hub ids of the two checkpoints used in this cell.
# NOTE(review): the first evaluation cell loads "saishshinde15/TBH.AI_Valhala" as the
# Vortex model — confirm which repo id is the intended one.
VORTEX_MODEL = "saishshinde15/vortex"  # Model being evaluated
# Use the same judge checkpoint id as the first evaluation cell; the previous value
# ("meta-llama/Llama-1b-instruct") did not match it and does not look like a
# published repo id.
JUDGE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"  # Judge model

# Load models & tokenizers
def load_llm(model_name):
    """Build a text-generation pipeline for the checkpoint ``model_name``.

    The tokenizer and the causal-LM weights are both obtained with
    ``from_pretrained``; the weights are requested as ``torch.float16`` with
    ``device_map="auto"``.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        The object returned by ``pipeline("text-generation", ...)``.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    weight_options = dict(torch_dtype=torch.float16, device_map="auto")
    lm = AutoModelForCausalLM.from_pretrained(model_name, **weight_options)
    return pipeline(task="text-generation", tokenizer=tok, model=lm)

# One generation pipeline per role: the evaluated model first, then the judge.
vortex_pipeline = load_llm(model_name=VORTEX_MODEL)
judge_pipeline = load_llm(model_name=JUDGE_MODEL)

# Load evaluation dataset (JSON Lines: one JSON object per line)
DATASET_PATH = "evaluation_dataset.jsonl"
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    # Whitespace-only lines are skipped; json.loads would raise JSONDecodeError on them.
    eval_data = [json.loads(line) for line in f if line.strip()]

# Initialize metrics
# nltk.word_tokenize (used in the evaluation loop) needs the Punkt tokenizer data,
# which has to be downloaded separately. Fetch it here so this cell works on a
# fresh kernel instead of relying on a later cell having run first. Newer nltk
# releases look up "punkt_tab" rather than "punkt", so request both.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
# Per-example metric values, averaged after the loop.
bleu_scores, rouge1_scores, rouge2_scores, rougeL_scores = [], [], [], []

# Evaluation loop
import re

# Patterns tried in order when pulling the judge's 0-10 score out of free text:
# an explicit "N/10" first, then the first standalone integer between 0 and 10.
_SCORE_PATTERNS = (
    re.compile(r"(?<!\d)(10|\d)\s*/\s*10(?!\d)"),
    re.compile(r"(?<!\d)(10|\d)(?!\d)"),
)


def _extract_score(text, default=0):
    """Return the first 0-10 integer score found in ``text``, or ``default`` if none.

    This is a heuristic: it cannot tell a score from any other small number the
    judge happens to mention first.
    """
    for pattern in _SCORE_PATTERNS:
        match = pattern.search(text)
        if match:
            return int(match.group(1))
    return default


results = []
for entry in eval_data:
    question = entry["question"]

    # Get model answer.
    # max_new_tokens bounds only the generated continuation; max_length also counts
    # the prompt tokens, which could leave no room to generate for long prompts.
    # return_full_text=False drops the echoed prompt from the returned text.
    vortex_answer = vortex_pipeline(
        question, max_new_tokens=200, truncation=True, return_full_text=False
    )[0]["generated_text"].strip()

    # Format judge prompt
    judge_prompt = f"""Question: {question}\nAnswer: {vortex_answer}\nJudge: Provide a correctness score (0-10) and a brief explanation."""

    # Get judgment (verdict text only, without the prompt echoed back)
    judge_response = judge_pipeline(
        judge_prompt, max_new_tokens=200, truncation=True, return_full_text=False
    )[0]["generated_text"].strip()

    # Extract score (regex-based); falls back to 0 when no score is found.
    score = _extract_score(judge_response)

    # Compute BLEU & ROUGE
    # NOTE(review): the judge's verdict is used as the "reference" text here, so
    # these numbers measure overlap between the answer and the critique, not answer
    # quality. If the dataset carries a gold answer, score against that instead.
    ref_tokens = nltk.word_tokenize(judge_response)
    hyp_tokens = nltk.word_tokenize(vortex_answer)

    bleu_scores.append(sentence_bleu([ref_tokens], hyp_tokens))
    rouge_scores = scorer.score(judge_response, vortex_answer)
    rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
    rouge2_scores.append(rouge_scores["rouge2"].fmeasure)
    rougeL_scores.append(rouge_scores["rougeL"].fmeasure)

    # Store result
    results.append({"question": question, "answer": vortex_answer, "judgment": judge_response, "score": score})

# Dump each result record as one line of JSON (JSON Lines format).
RESULTS_PATH = "industry_evaluation_results.jsonl"
with open(RESULTS_PATH, "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in results)

# Print final scores
def _mean(values):
    """Arithmetic mean of ``values``; NaN for an empty list.

    Returning NaN avoids a ZeroDivisionError when the evaluation set is empty and
    makes "no data" visible in the printed report.
    """
    return sum(values) / len(values) if values else float("nan")


for _label, _values in (
    ("BLEU", bleu_scores),
    ("ROUGE-1", rouge1_scores),
    ("ROUGE-2", rouge2_scores),
    ("ROUGE-L", rougeL_scores),
):
    print(f"Average {_label} Score: {_mean(_values):.4f}")


## With a Hugging Face dataset as the evaluation dataset

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Fetch the "train" split of the instruction-following evaluation set.
# NOTE(review): the evaluation loop below reads "prompt" and "reference_output" from
# every row — confirm this dataset actually exposes a "reference_output" column.
dataset = load_dataset(
    path="harpreetsahota/Instruction-Following-Evaluation-for-Large-Language-Models",
    split="train",
)

# Model under evaluation (swap in your own repo id), wrapped in a
# text-generation pipeline. Weights are requested as float16 with device_map="auto".
model_name = "saishshinde15/vortex"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
text_generator = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Initialize BLEU and ROUGE scorers
# nltk.word_tokenize needs the Punkt tokenizer data; newer nltk releases look up
# "punkt_tab" rather than "punkt", so request both.
nltk.download("punkt")
nltk.download("punkt_tab")
# Bind the scorer instance to its own name. Assigning it to `rouge_scorer` would
# shadow the imported `rouge_scorer` module for the rest of the session.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluation loop
results = []
for example in dataset:
    instruction = example["prompt"]
    reference_output = example["reference_output"]

    # Generate model output.
    # max_new_tokens bounds only the continuation (max_length also counts the prompt
    # tokens), and return_full_text=False drops the echoed prompt so that BLEU/ROUGE
    # compare the reference against the model's answer alone.
    generated_output = text_generator(
        instruction, max_new_tokens=200, truncation=True, return_full_text=False
    )[0]["generated_text"].strip()

    # Compute BLEU score
    reference_tokens = nltk.word_tokenize(reference_output)
    generated_tokens = nltk.word_tokenize(generated_output)
    bleu_score = sentence_bleu([reference_tokens], generated_tokens)

    # Compute ROUGE scores (reference first, candidate second)
    rouge_scores = scorer.score(reference_output, generated_output)

    # Store results
    results.append({
        "instruction": instruction,
        "reference_output": reference_output,
        "generated_output": generated_output,
        "bleu_score": bleu_score,
        "rouge1": rouge_scores["rouge1"].fmeasure,
        "rouge2": rouge_scores["rouge2"].fmeasure,
        "rougeL": rouge_scores["rougeL"].fmeasure
    })

# Write one JSON object per line.
# NOTE(review): the first evaluation cell writes to this same relative file name, so
# running both from one directory overwrites the earlier results — confirm intended.
output_path = "evaluation_results.jsonl"
with open(output_path, "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in results)

print("Evaluation completed. Results saved to 'evaluation_results.jsonl'.")
