In [None]:
import os
import re
import json
import datetime
from pathlib import Path


import pandas as pd
from tqdm.auto import tqdm

from dotenv import load_dotenv
from datasets import load_dataset
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.docstore.document import Document

from extractor import Store, Answerer

# Register tqdm's pandas integration so `progress_apply` is available on frames.
tqdm.pandas()
# Show full cell contents when a DataFrame is displayed (no column-width truncation).
pd.options.display.max_colwidth = None

# Load variables from a .env file into the process environment
# (the Hugging Face token is read from os.environ further down).
load_dotenv()

In [None]:
# --- Judge: the model that grades the defendant's answers -------------------
JUDGE_MODEL = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
# Generation settings forwarded as keyword arguments to the judge endpoint.
JUDGE_MODEL_CONFIG = {
    "max_new_tokens": 128,
    "top_k": 10,
    "top_p": 0.95,
    "typical_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
}

# --- Defendant: the answering system under evaluation -----------------------
DEFENDANT_MODEL_PROVIDER = "hfsi" # hfsi, groq
DEFENDANT_MODEL = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
# Generation settings forwarded as keyword arguments to the defendant.
DEFENDANT_MODEL_CONFIG = {
    "temperature": 0.05,
    "top_p": 0.7,
    "max_tokens": 2048,
}
# Directory that receives the intermediate and final result files.
BASE_DIR = Path("./results")

# `datetime` is imported as a module (`import datetime`), so the class has to be
# qualified; a bare `datetime.now()` raises AttributeError on the module.
now = datetime.datetime.now()
# Timestamp suffix used in output file names, e.g. "16.01.2025_12.18".
date_time_str = now.strftime("%d.%m.%Y_%H.%M")

In [None]:
# Create the project-local `Store` named "main" and run its setup step.
vec_store = Store("main")
vec_store.setup()

# NOTE(review): this branch assigns DEFENDANT_MODEL to itself, so the provider
# setting currently has no effect on which model is used. Presumably a
# provider-specific model name was meant to be selected here — TODO confirm.
if DEFENDANT_MODEL_PROVIDER == "hfsi":
    DEFENDANT_MODEL = DEFENDANT_MODEL

# The defendant is the answering system being evaluated; it receives the store,
# the model name and the generation settings from DEFENDANT_MODEL_CONFIG.
defendant = Answerer(vec_store, model=DEFENDANT_MODEL, **DEFENDANT_MODEL_CONFIG)

### Answer generation and Source Attribution Isolation

In [4]:
# Load the pre-generated evaluation set. The encoding is pinned to UTF-8 so the
# file decodes the same way regardless of the platform's default locale.
with open("./eval_data_generated_15.01.25_12.00.json", encoding="utf-8") as f:
    data = json.load(f)
# Keep only the two fields used downstream: each row's context and its
# questions (the questions are iterated one by one when answers are generated).
context_question_pairs = pd.DataFrame(data, columns=["context","questions"])

In [5]:
def process_context_questions_pair(x):
    """Answer every question of one evaluation row against that row's context.

    Args:
        x: Row-like mapping providing a 'questions' iterable and a 'context'
            value; the context is wrapped in a single `Document` as its
            page content and handed to the defendant for every question.

    Returns:
        dict: {"answers": [...], "citations": [...]} with one entry per
        question, in question order.
    """
    row_questions = x['questions']
    ctx_docs = [Document(page_content=x['context'])]

    answered = []
    for question in row_questions:
        chat_history, _unused, cited = defendant.answer_with_search(
            question, ctx_docs=ctx_docs, show_cits=False
        )
        # The content of the last history entry is taken as the answer text.
        answered.append((chat_history[-1]['content'], cited))

    return {
        "answers": [text for text, _ in answered],
        "citations": [cits for _, cits in answered],
    }

In [None]:
# Run the defendant over every row, with a tqdm progress bar; each row yields
# one dict holding that row's answers and citations.
answer_records = context_question_pairs.progress_apply(process_context_questions_pair, axis=1)
# Expand the per-row dicts into a frame with "answers" and "citations" columns.
llm_answers = pd.DataFrame(answer_records.to_list())

In [16]:
# Put the generated answers/citations next to their source rows; concatenating
# along columns aligns the two frames on their row index.
context_question_pairs_llm_answers = pd.concat([context_question_pairs, llm_answers], axis=1, ignore_index=False)

# `to_json` does not create missing directories, so make sure the results
# folder exists before the (expensive to regenerate) answers are written.
BASE_DIR.mkdir(parents=True, exist_ok=True)
context_question_pairs_llm_answers.to_json(BASE_DIR / f"eval_data_questions_context_answers_{date_time_str}.json")

In [51]:
# Optional resume point: set this to the timestamp suffix of a previously saved
# answers file and uncomment the line below to reload it instead of
# regenerating the answers.
backup_file_datestr = ""
# context_question_pairs_llm_answers = pd.read_json(BASE_DIR / f"eval_data_questions_context_answers_{backup_file_datestr}.json")

In [23]:
# Prompt for the judge model. It has three placeholders ({context}, {question},
# {answer}) and asks for a free-text "Evaluation:" followed by a
# "Total rating:" between 1 and 4. The text deliberately ends with
# "Evaluation: " so the model's completion starts with its rationale.
# Every character below is sent to the model; edit with care.
IMPROVED_JUDGE_PROMPT = """
You will be given a user_question, system_answer and system_context set.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question depending on the system_context.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.

Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, very partial, or is contradictory to the system_context
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 4)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the context, question, and answer.

Context: {context}
Question: {question}
Answer: {answer}

Feedback:::
Evaluation: """
# Build the template from the string above.
prompt = PromptTemplate.from_template(IMPROVED_JUDGE_PROMPT)

In [24]:
# Judge LLM, configured with JUDGE_MODEL and the generation settings from
# JUDGE_MODEL_CONFIG. The API token is read from the environment; a missing
# HUGGINGFACEHUB_API_TOKEN variable raises KeyError on this line.
llm = HuggingFaceEndpoint(
    repo_id=JUDGE_MODEL,
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    **JUDGE_MODEL_CONFIG
)
# Chain the judge prompt into the model; invoked with a dict of the prompt's
# placeholder values.
llm_chain = prompt | llm

In [55]:
def extract_judge_score(judge_responses, split_str: str = "Total rating:") -> dict:
    """Parse the numeric rating out of each judge response.

    For every response, the first number found after the first occurrence of
    `split_str` is used. If the marker is missing, the first number anywhere
    in the response is used instead.

    Args:
        judge_responses: Iterable of judge outputs (strings) for one row.
        split_str: Marker that precedes the rating in the judge's output.

    Returns:
        dict: {"llm_judge_scores": [...]} with exactly one entry per response,
        in the same order. Entries are floats, or None when no number could be
        extracted, so scores stay aligned with their questions and answers.
    """
    number_pattern = re.compile(r"\d+(?:\.\d+)?")

    judge_responses_scores = []
    for res in judge_responses:
        score = None
        if isinstance(res, str):
            # Look after the marker when it is present; otherwise fall back to
            # scanning the whole response.
            if split_str and split_str in res:
                rating_text = res.split(split_str, 1)[1]
            else:
                rating_text = res
            found = number_pattern.search(rating_text)
            if found:
                score = float(found.group())

        if score is None:
            # Keep a placeholder instead of dropping the entry: dropping it
            # would shift every later score onto the wrong question.
            print(f"Could not extract a judge score from response: {res!r}")
        judge_responses_scores.append(score)

    return {"llm_judge_scores": judge_responses_scores}

In [56]:
def judge_context_questions_answers(x):
    """Ask the judge chain to rate every question/answer pair of one row.

    Args:
        x: Row-like mapping with "context", "questions" and "answers";
            questions and answers are paired positionally.

    Returns:
        dict: "llm_judge" holds the raw judge outputs and "llm_judge_fixed"
        the same outputs cut off before the first "Question" substring,
        one entry per pair.
    """
    shared_context = x["context"]
    raw_verdicts = []
    trimmed_verdicts = []

    for question, answer in zip(x["questions"], x["answers"]):
        verdict = llm_chain.invoke({
            "context": shared_context,
            "question": question,
            "answer": answer,
        })
        raw_verdicts.append(verdict)
        # Keep only the text that precedes the first "Question" in the output.
        trimmed_verdicts.append(verdict.split("Question")[0])

    return {"llm_judge": raw_verdicts, "llm_judge_fixed": trimmed_verdicts}

In [None]:
# Judge every row, with a tqdm progress bar; each row yields one dict of raw
# and trimmed judge outputs, which is then expanded into columns.
judged_rows = context_question_pairs_llm_answers.progress_apply(judge_context_questions_answers, axis=1)
cqa_pairs_judged = pd.DataFrame(judged_rows.to_list())

In [None]:
# Parse the numeric ratings per row. Scores are read from the raw "llm_judge"
# text, not from the trimmed "llm_judge_fixed" column.
score_records = cqa_pairs_judged["llm_judge"].apply(extract_judge_score).to_list()
cqa_pairs_judged_scores = pd.DataFrame(score_records)

# Final table: inputs and answers, judge outputs, and parsed scores side by side.
context_question_pairs_llm_answers_judged = pd.concat(
    [context_question_pairs_llm_answers, cqa_pairs_judged, cqa_pairs_judged_scores],
    axis=1,
    ignore_index=False,
)

In [67]:
# `to_json` does not create missing directories, so make sure the results
# folder exists before the judged data is written.
BASE_DIR.mkdir(parents=True, exist_ok=True)
# "/" in the model id is replaced with "=" so the id can be used in a file name.
context_question_pairs_llm_answers_judged.to_json(BASE_DIR / f"eval_result_data_{DEFENDANT_MODEL.replace('/','=')}_{date_time_str}.json")

In [3]:
import pandas as pd

# Convert one saved result file from JSON to CSV, next to the original.
eval_result_json = "eval_result_data_NousResearch=Nous-Hermes-2-Mixtral-8x7B-DPO.16.01.25_12.18.json"
eval_result_csv = "eval_result_data_NousResearch=Nous-Hermes-2-Mixtral-8x7B-DPO.16.01.25_12.18.csv"
eval_result_frame = pd.read_json(eval_result_json)
eval_result_frame.to_csv(eval_result_csv)

In [1]:
# Preview the judged results. The frame only exists after the judging cells
# have been run in the current kernel session; on a fresh kernel this cell
# raises NameError. Only the first rows are shown, because column width is
# unlimited and the full frame would flood the output.
context_question_pairs_llm_answers_judged.head()

NameError: name 'context_question_pairs_llm_answers_judged' is not defined