# Evaluating the fine-tuned model

### Needed packages and imports

In [None]:
# !pip install -r requirements.txt
!pip install einops~=0.8.0 
!pip install langchain~=0.3.5 
!pip install langchain-community~=0.3.3 
!pip install langchain-openai~=0.2.4
!pip install langchain-milvus~=0.1.6
!pip install pypdf~=5.1.0
!pip install pymilvus~=2.4.9
!pip install sentence-transformers~=3.2.1

### Model inference parameters

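Connection and generation parameters for the fine-tuned model. The endpoint URL, API key and model name are read from environment variables.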

In [None]:
import os

# Endpoint, credentials and model id come from the environment;
# each one is None when the variable is not set.
INFERENCE_SERVER_URL = os.environ.get("INFERENCE_SERVER_URL")
LLM_API_KEY = os.environ.get("LLM_API_KEY")
MODEL_NAME = os.environ.get("MODEL_NAME")

# Generation settings passed to the sanity-check model.
MAX_TOKENS = 2048
TEMPERATURE = 0.00

### Milvus connection info

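Defaults to a local database file (`./milvus_local.db`) with no credentials; override with the `MILVUS_*` environment variables.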

In [None]:
# Every setting can be overridden through the environment; the fallbacks
# point at a local database file and use empty credentials.
MILVUS_URI = os.environ.get("MILVUS_URI", "./milvus_local.db")
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME", "")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "")
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION", "my_org_documents")

In [None]:
import requests
import os

from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import LLMChain, RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import VLLMOpenAI
from langchain.prompts import PromptTemplate

from langchain_milvus import Milvus

## Sanity check model

In [None]:
# Prompt used for the direct (no retrieval) sanity check; {question} is the
# only placeholder.
template_str = """<|system|>
You are a Red Hat Instruct Model based on Granite 7B,
an AI language model developed by Red Hat and IBM Research,
based on the Granite-7b-base language model.
Your primary function is to be a chat assistant.
<|user|>
Answer the following question.
Question: {question}
Answer:
<|assistant|>
"""
prompt = PromptTemplate.from_template(template_str)

# Client for the fine-tuned model; tokens are streamed to stdout through the
# callback handler as they are generated.
llm = VLLMOpenAI(
    model_name=MODEL_NAME,
    openai_api_base=INFERENCE_SERVER_URL,
    openai_api_key=LLM_API_KEY,
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=True,
    verbose=False,
)


In [None]:
# Bind the sanity-check prompt to the model so it can be queried with
# conversation.predict(question=...).
conversation = LLMChain(prompt=prompt, llm=llm, verbose=False)


In [None]:
# Alternative smoke-test question, left here for quick manual checks.
# question = "Hello.  Who are you?"
question = "Which hardware accelerators are supported by RHEL AI?"

# Query the model directly through the prompt chain (no retrieved context).
answer = conversation.predict(question=question)
# Bare expression so the notebook displays the answer as the cell output.
answer

## Creating a Milvus DB with documents

In [None]:
import requests
import os

from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import VLLMOpenAI
from langchain.prompts import PromptTemplate

from langchain_milvus import Milvus

## Initial index creation and document ingestion

#### Load pdfs

In [None]:
# Folder holding the source PDFs, relative to this notebook.
pdf_folder_path = "../data_preparation/document_collection"

# recursive=True is passed so sub-folders are presumably scanned as well
# (confirm against the PyPDFDirectoryLoader documentation).
pdf_loader = PyPDFDirectoryLoader(pdf_folder_path, recursive=True)
pdf_docs = pdf_loader.load()

#### Split documents into chunks with some overlap

In [None]:
# Split the loaded PDF documents into chunks of size 1024 with an overlap of
# 40 between neighbouring chunks (units are whatever the splitter measures,
# presumably characters).
docs = pdf_docs
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024,
                                               chunk_overlap=40)
all_splits = text_splitter.split_documents(docs)
# Display the first chunk as a spot check; raises IndexError when no
# documents were loaded.
all_splits[0]

#### Create the index and ingest the documents

In [None]:
import torch

# Choose the torch backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Embedding model used both for ingestion and for query-time retrieval.
model_kwargs = {"trust_remote_code": True, "device": device}
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    show_progress=True,
    model_kwargs=model_kwargs,
)

# Vector store handle. NOTE(review): drop_old=True presumably discards any
# existing collection with the same name, so re-running this cell starts from
# an empty collection -- confirm before pointing at a shared Milvus instance.
db = Milvus(
    embedding_function=embeddings,
    collection_name=MILVUS_COLLECTION,
    connection_args={
        "uri": MILVUS_URI,
        "user": MILVUS_USERNAME,
        "password": MILVUS_PASSWORD,
    },
    text_field="page_content",
    metadata_field="metadata",
    auto_id=True,
    drop_old=True,
)


In [None]:
# Insert every chunk into the vector store; the length of the returned
# collection is reported as the number of documents loaded.
loaded = db.add_documents(all_splits)
print(f"{len(loaded)} documents loaded.")

#### Test vector DB search

In [None]:
# Retrieval-only check: fetch the stored chunks closest to the query, each
# paired with its score (unpacked as (doc, score) in the next cell).
query = "Which hardware accelerators are supported by RHEL AI?"
docs_with_score = db.similarity_search_with_score(query)

In [None]:
# Print each retrieved chunk with its score, framed by 80-character rules.
separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(separator)

#### Test out RAG request

In [None]:
# Prompt for retrieval-augmented answers; it has two placeholders,
# {context} and {question}.
rag_template = """<|system|>
You are a Red Hat Instruct Model based on Granite 7B,
an AI language model developed by Red Hat and IBM Research,
based on the Granite-7b-base language model.
Your primary function is to be a chat assistant.
<|user|>
Context:
{context}
Answer the following question from context and internal memory.
Question: {question}
Answer:
<|assistant|>
"""

RAG_CHAIN_PROMPT = PromptTemplate.from_template(rag_template)

# Retrieval chain: similarity search with k=4 against the Milvus store, with
# the source documents returned alongside the generated answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs={"prompt": RAG_CHAIN_PROMPT},
    retriever=db.as_retriever(
        search_kwargs={"k": 4},
        search_type="similarity",
    ),
    return_source_documents=True,
)

# Switch off tokenizer parallelism through its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### RAG query example

In [None]:
# Run one question through the retrieval chain; the chain takes its input
# under the "query" key.
question = "Which hardware accelerators are supported by RHEL AI?"
result = qa_chain.invoke({"query": question})

## Process answers from ground truth QnA

### Load test config and qna.yaml

In [None]:
import yaml

# Load the evaluation configuration. Later cells read its "testing_config"
# list (one entry per model under test) and its "judge" section.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)
# llm_config

In [None]:
import yaml
from pathlib import Path
import pandas as pd

directory = "../data_preparation/document_collection"

# Flat list of {"question", "ground_truth"} records gathered from every
# qna.yaml found under `directory`.
qna_list = []

# rglob('qna.yaml') only yields paths whose final component is exactly
# "qna.yaml", so no additional name check is needed.
for file_path in Path(directory).rglob('qna.yaml'):
    print(file_path)
    if not file_path.is_file():
        # A directory that happens to be called qna.yaml cannot be opened.
        continue
    with open(file_path, encoding="utf-8") as file:
        # safe_load matches how llm_config.yaml is read elsewhere in this
        # notebook and never constructs arbitrary Python objects.
        qna = yaml.safe_load(file)
    for seed_example in qna["seed_examples"]:
        for qa_pair in seed_example["questions_and_answers"]:
            qna_list.append(
                {
                    "question": qa_pair["question"].strip(),
                    "ground_truth": qa_pair["answer"].strip(),
                }
            )

# Naming the columns explicitly keeps the schema stable even when no
# qna.yaml file was found (an empty list would otherwise give no columns).
qna_df = pd.DataFrame(qna_list, columns=["question", "ground_truth"])
qna_df.to_json('qna.jsonl', orient='records', lines=True)


In [None]:
import re
import time
from langchain.chains import LLMChain


def replace_special_char(original_str):
    """Return *original_str* with every non-word character replaced by "_".

    Used to turn model and judge names into strings that are safe to embed
    in output file names.
    """
    non_word = re.compile(r"\W")
    return non_word.sub("_", original_str)


def qna_request(template_str, question, num_retries=1, retry_delay=5):
    """Ask the current global ``llm`` a question without retrieval.

    Args:
        template_str: Prompt template text containing a ``{question}``
            placeholder.
        question: The question to put to the model.
        num_retries: Total number of attempts before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The stripped model answer, or ``""`` when every attempt failed.
    """
    for attempt in range(num_retries):
        try:
            qna_template = PromptTemplate.from_template(template_str)
            conversation = LLMChain(llm=llm,
                                    prompt=qna_template,
                                    verbose=False)
            # Use the argument that was passed in; reading the notebook-global
            # `row` here made the function depend on the caller's loop state.
            answer = conversation.predict(question=question)
            return answer.strip()
        except Exception as e:
            # Deliberate best effort: one failed request must not abort the
            # whole evaluation run, so report it and fall through.
            print(f"Request failed: {e}")
            if attempt + 1 < num_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
    return ""


def rag_request(template_str, question, num_retries=1, retry_delay=5):
    """Ask the current global ``llm`` a question with Milvus retrieval.

    Args:
        template_str: Prompt template text containing ``{context}`` and
            ``{question}`` placeholders.
        question: The question to put to the retrieval chain.
        num_retries: Total number of attempts before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The stripped generated answer, or ``""`` when every attempt failed.
    """
    for attempt in range(num_retries):
        try:
            rag_template = PromptTemplate.from_template(template_str)
            rag_chain = RetrievalQA.from_chain_type(
                llm,
                retriever=db.as_retriever(
                    search_type="similarity",
                    search_kwargs={"k": 4}
                    ),
                chain_type_kwargs={"prompt": rag_template},
                return_source_documents=True
                )
            # Use the argument that was passed in rather than the
            # notebook-global `row`.
            response = rag_chain.invoke({"query": question})
            return response["result"].strip()
        except Exception as e:
            # Deliberate best effort: report the failure and keep going.
            print(f"Request failed: {e}")
            # Only wait when another attempt follows. The previous check
            # (`attempt < num_retries`) was always true, so the last failure
            # slept and the function returned None instead of "".
            if attempt + 1 < num_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
    return ""



In [None]:
import yaml

# Re-read the configuration so this cell can be run on its own.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

qna_df = pd.read_json("qna.jsonl", orient="records", lines=True)

# For every model under test, answer each ground-truth question with and/or
# without retrieval and write the answers to "<name>_answers.jsonl".
for testing_config in llm_config["testing_config"]:
    answers = qna_df.copy()
    answers["answer"] = ""
    answers["rag_answer"] = ""
    # qna_request / rag_request read this module-level `llm`, so it has to be
    # rebound before they are called for the current model.
    llm = VLLMOpenAI(
        # Strip all whitespace (e.g. line breaks from a folded YAML value).
        openai_api_key=re.sub(r"\s+", "", testing_config["api_key"]),
        openai_api_base=testing_config["endpoint_url"],  # https://model...com/v1
        model_name=testing_config["model_name"],
        temperature=0.00,
        max_tokens=2048,
        streaming=False
    )
    # A template that is missing or empty disables that kind of request.
    qna_template = testing_config.get("qna_template")
    rag_template = testing_config.get("rag_template")
    for index, row in answers.iterrows():
        question = row["question"]
        print(index, question)
        if qna_template:
            # `or ""` keeps the column a plain string even if the helper
            # returns None on failure.
            answers.at[index, "answer"] = qna_request(qna_template, question) or ""
        if rag_template:
            answers.at[index, "rag_answer"] = rag_request(rag_template, question) or ""
    # Prefer the configured display name and fall back to the model name.
    # `testing_config["name" or "model_name"]` always looked up "name".
    base_filename = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers.to_json(f"{base_filename}_answers.jsonl", orient="records", lines=True)

## Grade responses using Judge Model

### Load the test config (the \*_answers.jsonl files are read when grading)

In [None]:
import yaml

# Re-read the configuration so the grading cells can be run on their own;
# they use its "judge" section and its "testing_config" list.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

In [None]:
from langchain.prompts import PromptTemplate

# Judge prompt: asks for a 1-5 answer-quality score for a model answer
# against the reference answer, returned as a JSON object with the keys
# 'reasoning' and 'answer_quality'. The template text is sent to the judge
# verbatim, so any wording change alters grading behaviour.
SCORING_PROMPT = PromptTemplate(
    template="""You are an evaluation system tasked with assessing the answer quality of a AI generated response in relation to the posed question and reference answer. Assess if the response is correct, accurate, and factual based on the reference answer. Evaluate the answer_quality as:
    - Score 1: The response is completely incorrect, inaccurate, and/or not factual.
    - Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
    - Score 3: The response is somewhat correct, accurate, and/or factual.
    - Score 4: The response is mostly correct, accurate, and factual.
    - Score 5: The response is completely correct, accurate, and factual.
    Here is the question: \n ------- \n {question} \n -------
    Here is model answer: \n ------- \n {answer} \n -------
    Here is the reference answer(may be very short and lack details or indirect, long and extractive):  \n ------- \n {reference_answer} \n ------- \n
    Assess the quality of model answer with respect to the Reference Answer, but do not penalize the model answer for adding details or give a direct answer to user question. Provide the quality level as a JSON object with two keys: 'reasoning' and 'answer_quality'.
    """,
    input_variables=["question", "answer", "reference_answer"],
)


In [None]:
import json
from openai import OpenAI

# Client and model name for the judge, both taken from the "judge" section
# of llm_config.yaml.
judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]


def score_request(question, answer, reference_answer):
    """Have the judge model grade one answer against its reference.

    Args:
        question: The question that was asked.
        answer: The model answer to grade.
        reference_answer: The ground-truth answer to compare against.

    Returns:
        A ``(score, reasoning)`` tuple taken from the ``answer_quality`` and
        ``reasoning`` keys of the judge's JSON reply.

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
        KeyError: If the parsed object lacks one of the expected keys.
    """
    messages = [
        {
            "role": "user",
            "content": SCORING_PROMPT.format(
                question=question,
                answer=answer,
                reference_answer=reference_answer
            )
        }
    ]

    completion = judge_client.chat.completions.create(
        model=judge_model_name,
        messages=messages,
        n=1,
        temperature=0.0,
        max_tokens=1024,
    )

    response_content = completion.choices[0].message.content
    try:
        result = json.loads(response_content)
    except json.JSONDecodeError:
        # Judges often wrap the object in a Markdown code fence or add a
        # sentence around it; retry on the outermost {...} span before
        # giving up.
        start = response_content.find("{")
        end = response_content.rfind("}")
        if start == -1 or end <= start:
            raise
        result = json.loads(response_content[start:end + 1])
    return result["answer_quality"], result["reasoning"]


In [None]:
# The judge-derived file name part does not change between models.
judge_filename = replace_special_char(judge_model_name)

# Grade every "<name>_answers.jsonl" file with the judge model and write the
# result as "<name>_<judge>_scores.jsonl" and ".csv".
for testing_config in llm_config["testing_config"]:
    # Prefer the configured display name and fall back to the model name.
    # `testing_config["name" or "model_name"]` always looked up "name".
    base_filename = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers_filename = f"{base_filename}_answers.jsonl"
    scores = pd.read_json(answers_filename, orient="records", lines=True)

    # Place the score and reasoning columns directly after the answer column
    # they belong to.
    for column in ("answer", "rag_answer"):
        position = scores.columns.get_loc(column)
        scores.insert(position + 1, f"{column}_score", "")
        scores.insert(position + 2, f"{column}_score_reasoning", "")

    for index, row in scores.iterrows():
        question = row["question"]
        reference_answer = row["ground_truth"]
        print(index, question)
        for column in ("answer", "rag_answer"):
            candidate = row[column]
            # Skip empty answers. A null in the JSONL file is read back as
            # NaN, which is truthy, so a plain `if candidate:` would send it
            # to the judge.
            if not isinstance(candidate, str) or not candidate:
                continue
            score, reasoning = score_request(question, candidate, reference_answer)
            scores.at[index, f"{column}_score"] = score
            scores.at[index, f"{column}_score_reasoning"] = reasoning
            # str() guards the preview in case the judge returned a
            # non-string reasoning value.
            print(candidate[:40], score, str(reasoning)[:40])

    scores_filename = f"{base_filename}_{judge_filename}_scores.jsonl"
    scores.to_json(scores_filename, orient="records", lines=True)
    scores.to_csv(f"{base_filename}_{judge_filename}_scores.csv", index=False)


## Create resulting score report CSV

In [None]:
import yaml

# Re-read the configuration so this cell can be run on its own.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]
judge_filename = replace_special_char(judge_model_name)

# One column per (model, request kind) holding the judge scores; rows follow
# the row order of the per-model score files.
summary_output_df = pd.DataFrame()

for testing_config in llm_config["testing_config"]:
    # Prefer the configured display name and fall back to the model name.
    # `testing_config["name" or "model_name"]` always looked up "name".
    base_filename = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    scores_filename = f"{base_filename}_{judge_filename}_scores.jsonl"
    scores = pd.read_json(scores_filename, orient="records", lines=True)
    # A template that is missing or empty means that kind of request was
    # not run for this model, so there is no score column to report.
    if testing_config.get("qna_template"):
        summary_output_df[f"{base_filename}_answer_score"] = scores["answer_score"]
    if testing_config.get("rag_template"):
        summary_output_df[f"{base_filename}_rag_answer_score"] = scores["rag_answer_score"]


summary_output_df.to_json(f"summary_{judge_filename}_scores.jsonl", orient="records", lines=True)
summary_output_df.to_csv(f"summary_{judge_filename}_scores.csv", index=False)