# Evaluating the fine-tuned model

### Needed packages and imports

In [None]:
!pip install -r requirements.txt

### Model inference parameters

Parameters for the fine-tuned model.

In [None]:
import requests
import os
import yaml
import json
import re
import time
import pandas as pd
import torch

from typing import Iterator
from pathlib import Path
from openai import OpenAI

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from langchain_openai import ChatOpenAI
from langchain_community.llms import VLLMOpenAI
from langchain_milvus import Milvus
from langchain_text_splitters import RecursiveCharacterTextSplitter

from docling.document_converter import DocumentConverter

def replace_special_char(original_str):
    """Return original_str with every non-word character replaced by "_".

    Word characters (letters, digits and the underscore) are kept as they
    are, which makes the result usable as part of a file or directory name.
    """
    non_word_char = re.compile(r"[^\w]")
    return non_word_char.sub("_", original_str)

def get_config():
    """Parse llm_config.yaml from the working directory and return its content."""
    with open("llm_config.yaml", "r") as config_file:
        return yaml.safe_load(config_file)

def get_output_dir():
    """Create (if needed) and return the output directory for this run.

    The directory is named "ragas_" followed by the sanitized top-level
    "name" entry of the configuration, or "output" when no name is set.
    """
    run_name = get_config().get("name", "output")
    output_directory = "ragas_" + replace_special_char(run_name)
    # exist_ok lets the notebook be re-run without failing on an existing dir.
    os.makedirs(output_directory, exist_ok=True)
    return output_directory


In [None]:
import os

from dotenv import load_dotenv

# Read key=value pairs from a .env file (when one is found) into the process
# environment, so that os.getenv() lookups in later cells can pick them up.
load_dotenv()

In [None]:
# Notebook-wide generation settings: an upper bound on generated tokens and the
# sampling temperature (0.0, presumably to keep model output deterministic).
MAX_TOKENS=2048
TEMPERATURE=0.00

### Milvus connection info

Defaults to a local database file.

In [None]:
# Milvus connection settings, each overridable through an environment variable.
# The URI default is a relative file path (presumably a local Milvus Lite
# database file - confirm); username and password default to empty strings.
MILVUS_URI = os.getenv("MILVUS_URI", "./milvus_ragas_eval.db")
MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", "")
MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", "")
MILVUS_COLLECTION = os.getenv("MILVUS_COLLECTION", "my_org_documents")

## Sanity check model

In [None]:
def create_llm(testing_config):
    """Build the LangChain LLM client described by one testing_config entry.

    Args:
        testing_config: Mapping with at least "api_key" and "model_name".
            "model_type" selects the backend; "endpoint_url" is required for
            every backend other than "openai".

    Returns:
        A ChatOpenAI instance when model_type is "openai", otherwise a
        VLLMOpenAI instance pointed at testing_config["endpoint_url"].

    Raises:
        KeyError: If a required key is missing from testing_config.
    """
    # Remove all whitespace from the key (presumably to tolerate wrapped or
    # multi-line values in the YAML file). Done once for both backends.
    api_key = re.sub(r"\s+", "", testing_config["api_key"])

    if testing_config.get("model_type") == "openai":
        print("Creating OpenAI model")
        return ChatOpenAI(
            openai_api_key=api_key,
            model=testing_config["model_name"],
            streaming=False,
        )

    print("Creating VLLM model")
    return VLLMOpenAI(
        openai_api_key=api_key,
        openai_api_base=testing_config["endpoint_url"],  # https://model...com/v1
        model_name=testing_config["model_name"],
        # Use the notebook-wide settings instead of repeating their literals.
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        streaming=False,
    )

def qna_request(llm, template_str, question):
    """Ask the model a question through a prompt template, without retrieval.

    Args:
        llm: LangChain runnable that produces the completion.
        template_str: Prompt template text; it is filled with "question".
        question: The question to ask.

    Returns:
        The answer with surrounding whitespace stripped, or "" once every
        attempt has failed. The raw answer is also printed.
    """
    num_retries = 1
    attempt = 0
    while attempt < num_retries:
        attempt += 1
        try:
            # Template parsing stays inside the try so that a malformed
            # template is reported like any other request failure.
            chain = PromptTemplate.from_template(template_str) | llm | StrOutputParser()
            answer = chain.invoke({"question": question})
            print(answer)
            return answer.strip()
        except Exception as e:
            print(f"Request failed: {e}")
            if attempt >= num_retries:
                return ""
            print("Retrying in 5 seconds...")
            time.sleep(5)


In [None]:
# Sanity check: build the first configured model and send it one raw question
# (no prompt template). As the cell's last expression, the reply is displayed.
llm_config = get_config()
llm = create_llm(llm_config["testing_config"][0])

question = "When will the ITS Telecommuting program end?"
llm.invoke(question)

In [None]:
# Same question, this time through the first config's "qna_template" prompt.
template_str = llm_config["testing_config"][0]["qna_template"]
qna_request(llm, template_str, question)

## Creating a Milvus DB with documents

## Initial index creation and document ingestion

#### Load PDFs

In [None]:
class DoclingPDFLoader(BaseLoader):
    """LangChain loader that converts files to Markdown text with Docling.

    Each input path produces exactly one LangChain document whose
    page_content is the Markdown export of the converted file. No metadata
    is attached to the documents.
    """

    def __init__(self, file_path: str | list[str]) -> None:
        """Store the path(s) to convert; a single path is wrapped in a list."""
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the stored paths one at a time, yielding a document each."""
        for source in self._file_paths:
            conversion = self._converter.convert(source)
            markdown_text = conversion.document.export_to_markdown()
            yield LCDocument(page_content=markdown_text)

In [None]:
# Collect, as plain strings, the paths of all PDFs found anywhere below the
# document collection folder.
pdf_folder_path = "../data_preparation/document_collection"
file_paths = list(map(str, Path(pdf_folder_path).rglob('*.pdf')))
file_paths

In [None]:
# One loader over every collected PDF path; no file is converted until the
# loader is asked for its documents.
loader = DoclingPDFLoader(file_path=file_paths)

#### Split documents into chunks with some overlap

In [None]:
# Convert every PDF into one Markdown document.
docs = loader.load()

# Cut the documents into chunks of at most 1024 characters, with 100
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
all_splits = text_splitter.split_documents(docs)

# Show the first chunk as a quick visual check.
all_splits[0]

#### Create the index and ingest the documents

In [None]:
# Pick the torch device in order of preference: CUDA GPU, Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# trust_remote_code is enabled because the embedding model is loaded with
# code shipped in its own model repository.
model_kwargs = {"trust_remote_code": True, "device": device}
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs=model_kwargs,
    show_progress=True,
)

# Open the collection without dropping existing data (drop_old=False) and let
# Milvus assign primary keys (auto_id=True).
db = Milvus(
    collection_name=MILVUS_COLLECTION,
    embedding_function=embeddings,
    connection_args={
        "uri": MILVUS_URI,
        "user": MILVUS_USERNAME,
        "password": MILVUS_PASSWORD,
    },
    auto_id=True,
    drop_old=False,
)


In [None]:
# Embed and insert every chunk into the Milvus collection; the returned list is
# used to report how many were stored. NOTE(review): the collection is opened
# with drop_old=False, so re-running this cell presumably inserts the same
# chunks again - confirm before re-executing.
loaded = db.add_documents(all_splits)
print(f"{len(loaded)} documents loaded.")

#### Test vector DB search

In [None]:
# Retrieve the chunks nearest to a sample question together with their scores
# (no result count is passed, so the vector store's default applies).
query = "What percentage of existing State-related debt is projected to be retired in 15 years?"
docs_with_score = db.similarity_search_with_score(query)

In [None]:
# Print each hit as: separator line, score, chunk text, separator line.
for doc, score in docs_with_score:
    print(
        "-" * 80,
        "Score:  " + str(score),
        doc.page_content,
        "-" * 80,
        sep="\n",
    )

#### Test out RAG request

In [None]:
def format_docs(docs):
    """Concatenate the page_content of every document, separated by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

def rag_request(llm, template_str, question):
    """Answer a question with retrieval-augmented generation.

    The question is used to search the module-level vector store ``db``; the
    retrieved chunk texts are joined with blank lines and passed to the prompt
    as "context" together with the "question".

    Args:
        llm: LangChain runnable that produces the completion.
        template_str: Prompt template text using "question" and "context".
        question: The question to answer.

    Returns:
        A tuple (answer, contexts): the stripped model answer and the list of
        retrieved chunk texts. When every attempt fails, ("", "") is returned.
    """
    num_retries = 1
    for attempt in range(num_retries):
        try:
            search_results = db.similarity_search(question)
            contexts = [result.page_content for result in search_results]
            context_str = "\n\n".join(contexts)
            prompt = PromptTemplate.from_template(template_str)
            chain = prompt | llm | StrOutputParser()
            response = chain.invoke({"question": question, "context": context_str})
            return response.strip(), contexts
        except Exception as e:
            print(f"Request failed: {e}")
            # Retry only while attempts remain. The previous check
            # (attempt < num_retries) was always true, so the last failure
            # slept and then fell out of the loop, returning None and breaking
            # callers that unpack the result into (answer, contexts).
            if attempt + 1 < num_retries:
                print("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                return "", ""


In [None]:
# Smoke test of the RAG path: rebuild the first configured model, answer one
# sample question with its "rag_template", then print the answer followed by
# the first 100 characters of every retrieved context.
llm_config = get_config()
llm = create_llm(llm_config["testing_config"][0])
template_str = llm_config["testing_config"][0]["rag_template"]

question = "What percentage of existing State-related debt is projected to be retired in 15 years?"
answer, contexts = rag_request(llm, template_str, question)
print(answer)
for context in contexts: print("-------\n" + context[:100])

## Generate Answers

### Use qna.yaml to create some questions and ground truth answers

In [None]:
# Collect every question / ground-truth pair from the qna.yaml files found
# below the document collection folder and store them as JSON lines.
pdf_folder_path = "../data_preparation/document_collection"
output_directory = get_output_dir()

qna_list = []

# Search below pdf_folder_path. The loop previously used an undefined name
# (`directory`), which raised NameError before any file was read.
for file_path in Path(pdf_folder_path).rglob('qna.yaml'):
    print(file_path)
    if file_path.name != 'qna.yaml':
        continue
    with open(file_path) as file:
        # NOTE(review): yaml.load with FullLoader is kept as-is; consider
        # yaml.safe_load if these files can come from an untrusted source.
        qna = yaml.load(file, Loader=yaml.FullLoader)
    for seed_example in qna["seed_examples"]:
        for questions_and_answers in seed_example["questions_and_answers"]:
            qna_list.append(
                {
                    "question": questions_and_answers["question"].strip(),
                    "ground_truth": questions_and_answers["answer"].strip(),
                }
            )

qna_df = pd.DataFrame(qna_list)
qna_df.to_json(f"{output_directory}/qna.jsonl", orient="records", lines=True)


## Get responses from each of the available models with RAG

In [None]:
llm_config = get_config()
output_directory = get_output_dir()
qna_df = pd.read_json(f"{output_directory}/qna.jsonl", orient="records", lines=True)

# For every configured model: answer each question through the RAG pipeline
# and save question / ground truth / contexts / answer as JSON lines.
for testing_config in llm_config["testing_config"]:
    answers = qna_df.copy()
    answers["contexts"] = None
    answers["answer"] = None
    llm = create_llm(testing_config)
    rag_template = testing_config.get("rag_template")
    for index, row in answers.iterrows():
        question = row["question"]
        print(index, question)
        # Configs without a RAG template keep their answer/contexts as None.
        if rag_template:
            answer, contexts = rag_request(llm, rag_template, question)
            print("RAG Answer: " + answer[:40])
            answers.at[index, "answer"] = answer
            answers.at[index, "contexts"] = contexts
    # Prefer the config's "name" and fall back to "model_name". The previous
    # subscript `["name" or "model_name"]` always evaluated to `["name"]`, so
    # the fallback never applied.
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers.to_json(f"{output_directory}/{testing_config_name}_answers.jsonl", orient="records", lines=True)

## Grade responses using Judge Model

## Evaluation with Ragas


In [None]:
import re
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

llm_config = get_config()
output_directory = get_output_dir()

# Judge credentials and model name come from the "judge" section of the
# config. NOTE(review): if that section is missing, .get("judge") presumably
# returns None and the chained .get raises AttributeError - confirm.
JUDGE_API_KEY = llm_config.get("judge").get("api_key")
JUDGE_MODEL_NAME = llm_config.get("judge").get("model_name")

# TODO: set the api key in the llm and embeddings directly
os.environ["OPENAI_API_KEY"] = JUDGE_API_KEY

# Neither client below is given a key explicitly; presumably both read the
# OPENAI_API_KEY environment variable set above - confirm.
judge_llm = ChatOpenAI(model=JUDGE_MODEL_NAME)

# Wrap the LangChain objects so that Ragas metrics can use them.
evaluator_llm = LangchainLLMWrapper(judge_llm)

evaluator_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings()
)

In [None]:
from datasets import Dataset
from ragas.metrics import (
    FactualCorrectness,
    SemanticSimilarity,
    RougeScore
)
from ragas import evaluate

# Metrics computed for every answer: factual correctness judged by the
# evaluator LLM, similarity based on the evaluator embeddings, and ROUGE.
metrics = [
    FactualCorrectness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings),
    RougeScore(),
]

# Score each model's saved answers and write the results as JSONL and CSV.
for testing_config in llm_config["testing_config"]:
    # Prefer the config's "name" and fall back to "model_name". The previous
    # subscript `["name" or "model_name"]` always evaluated to `["name"]`, so
    # the fallback never applied.
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers_filename = f"{output_directory}/{testing_config_name}_answers.jsonl"
    answers_df = pd.read_json(answers_filename, orient="records", lines=True)
    answers_dataset = Dataset.from_pandas(answers_df)
    scores_dataset = evaluate(dataset=answers_dataset, metrics=metrics)
    scores = scores_dataset.to_pandas()
    scores_filename = f"{output_directory}/{testing_config_name}_scores"
    scores.to_json(f"{scores_filename}.jsonl", orient="records", lines=True)
    scores.to_csv(f"{scores_filename}.csv", index=False)


## Create resulting score report CSV

In [None]:
llm_config = get_config()
output_directory = get_output_dir()

# NOTE(review): judge_client is not used in the code visible here; it is kept
# in case other cells rely on it.
judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]
# The sanitized judge model name is part of the summary file names.
judge_name = replace_special_char(judge_model_name)

summary_output_df = pd.DataFrame()

# One summary column per RAG-enabled model, holding its factual correctness
# score for every question.
for testing_config in llm_config["testing_config"]:
    if not testing_config.get("rag_template"):
        continue
    # Prefer the config's "name" and fall back to "model_name". The previous
    # subscript `["name" or "model_name"]` always evaluated to `["name"]`, so
    # the fallback never applied.
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    scores_filename = f"{output_directory}/{testing_config_name}_scores.jsonl"
    scores = pd.read_json(scores_filename, orient="records", lines=True)
    summary_output_df[f"{testing_config_name}_factual_correctness"] = scores.get("factual_correctness")
    # summary_output_df[f"{testing_config_name}_semantic_similarity"] = scores.get("semantic_similarity")
    # summary_output_df[f"{testing_config_name}_rouge_score"] = scores.get("rouge_score")

# Append the per-column mean as a final row, then label the rows Q1..Qn plus
# "Average" in a leading column.
average_row = summary_output_df.mean(axis=0, numeric_only=True)
print(average_row)
summary_output_df.loc[len(summary_output_df)] = average_row
question_indices = [f"Q{i + 1}" for i in range(len(summary_output_df) - 1)]
question_indices.append("Average")
summary_output_df.insert(0, 'question index', question_indices)

summary_filepath = f"{output_directory}/summary_{judge_name}_scores"
summary_output_df.to_json(f"{summary_filepath}.jsonl", orient="records", lines=True)
summary_output_df.to_csv(f"{summary_filepath}.csv", index=False)

In [None]:
# Write one workbook: a "Summary" sheet read back from the summary CSV, plus
# one sheet with the detailed scores of every configured model.
with pd.ExcelWriter(f"{output_directory}/{judge_name}_scores.xlsx") as writer:
    summary_output_df = pd.read_csv(f"{summary_filepath}.csv")
    summary_output_df.to_excel(writer, sheet_name="Summary", index=False)

    for testing_config in llm_config["testing_config"]:
        # Prefer the config's "name" and fall back to "model_name". The
        # previous subscript `["name" or "model_name"]` always evaluated to
        # `["name"]`, so the fallback never applied.
        testing_config_name = replace_special_char(
            testing_config.get("name") or testing_config["model_name"]
        )
        scores_filename = f"{output_directory}/{testing_config_name}_scores.jsonl"
        scores = pd.read_json(scores_filename, orient="records", lines=True)
        scores.to_excel(writer, sheet_name=f"{testing_config_name}_scores")