# RAG 평가 질문


In [None]:
!pip install openai PyMuPDF tqdm pandas datasets

Collecting openai
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [None]:
import fitz  # PyMuPDF
from openai import OpenAI
import random
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive so files under /content/drive are readable
drive.mount('/content/drive')

# Configure the OpenAI client. The key is taken from the OPENAI_API_KEY
# environment variable, or prompted for without echo, so that it is never
# stored in the notebook source or in its saved outputs.
import os
from getpass import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))




# Extract the text of a PDF document, split into paragraphs
def extract_paragraphs_from_pdf(pdf_path):
    """Return the non-blank paragraphs of the PDF at ``pdf_path``.

    The plain text of every page is split on blank lines ("\\n\\n"). Chunks
    that contain only whitespace are dropped; all other chunks are returned
    unmodified.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        list[str]: Paragraph texts in page order.
    """
    paragraphs = []
    # The context manager closes the document even if text extraction fails
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            paragraphs.extend(chunk for chunk in text.split('\n\n') if chunk.strip())
    return paragraphs

# Send a single user prompt to an OpenAI chat-completion model
def call_openai_gpt(prompt: str, model: str = "gpt-3.5-turbo", temperature: float = 0.7, max_tokens: int = 1000):
    """Return the text of the first completion for ``prompt``.

    Uses the module-level ``client``. The defaults reproduce the previously
    hard-coded settings, so existing ``call_openai_gpt(prompt)`` calls behave
    as before; callers may now override them per call.

    Args:
        prompt: Content of the user message.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
    )
    return response.choices[0].message.content

# Prompt template for QA generation; `{context}` is filled in with str.format.
# It asks the model to answer using the literal markers "Factoid question: "
# and "Answer: ".
QA_generation_prompt = """
Your task is to write a factual question and answer based on the given context. Your factual question should be specific in context and answerable with concise factual information. Your factual question should be phrased in a way that a user would ask a search engine. This means that your factual question should not include phrases like "according to the article" or "according to the context".

Provide answers like this:

Output:::
Factoid question: (Your factual question)
Answer: (Answering factual questions)

Now here's some context.

Context: {context}\n
Output:::"""

# Path of the source PDF on the mounted Google Drive
pdf_path = "/content/drive/MyDrive/R078r3e_revisions.pdf"
paragraphs = extract_paragraphs_from_pdf(pdf_path)

# Wrap every paragraph in a record that carries its text and source path
docs_processed = [{"page_content": paragraph, "metadata": {"source": pdf_path}} for paragraph in paragraphs]

N_GENERATIONS = 10  # only 10 QA pairs are generated, to limit cost and time
MAX_ANSWER_CHARS = 300  # answers of this length or longer are rejected

# random.sample raises ValueError when asked for more items than exist
n_to_generate = min(N_GENERATIONS, len(docs_processed))

print(f"Generating {n_to_generate} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, n_to_generate)):  # random contexts
    # Generate one QA pair; a missing completion is treated as empty text
    output_QA_couple = call_openai_gpt(QA_generation_prompt.format(context=sampled_context["page_content"])) or ""
    try:
        # Without both markers the split below would silently return the
        # whole completion as the "question" or "answer"
        if "Factoid question: " not in output_QA_couple or "Answer: " not in output_QA_couple:
            raise ValueError("Output is missing the 'Factoid question: ' or 'Answer: ' marker")
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0].strip()
        answer = output_QA_couple.split("Answer: ")[-1].strip()
        if not question or not answer:
            raise ValueError("Question or answer is empty")
        # Explicit check instead of `assert`, which is stripped under `python -O`
        if len(answer) >= MAX_ANSWER_CHARS:
            raise ValueError("Answer is too long")
        # No scores are stored here: they must come from the critique step.
        # Pre-filling placeholder scores would let every pair pass the later
        # quality filter even when a critique could not be parsed.
        outputs.append(
            {
                "context": sampled_context["page_content"],  # sampled context
                "question": question,  # generated question
                "answer": answer,  # generated answer
                "source_doc": sampled_context["metadata"]["source"],  # source document path
            }
        )
    except ValueError as e:
        print(f"An error occurred: {e}")
        continue
if not outputs:
    print("Warning: outputs 리스트가 비어 있습니다.")
else:
    print(f"outputs 리스트에는 {len(outputs)}개의 항목이 있습니다.")
    # Print the first record to check the data structure
    print("첫 번째 항목의 내용:")
    print(outputs[0])
# Print the generated QA pairs
for output in outputs:
    print("Context:", output["context"])
    print("Question:", output["question"])
    print("Answer:", output["answer"])
    print("Source:", output["source_doc"])
    print("\n" + "-"*80 + "\n")

Mounted at /content/drive
Generating 10 QA couples...


 40%|████      | 4/10 [00:05<00:07,  1.31s/it]

An error occurred: Answer is too long


 90%|█████████ | 9/10 [00:12<00:01,  1.29s/it]

An error occurred: Answer is too long


100%|██████████| 10/10 [00:12<00:00,  1.29s/it]

outputs 리스트에는 8개의 항목이 있습니다.
첫 번째 항목의 내용:
{'context': 'E/ECE/324/Rev.1/Add.77/Rev.3 \nE/ECE/TRANS/505/Rev.1/Add.77/Rev.3 \n \n \nApproval Authority which granted the approval. Upon receiving the relevant \ncommunication, that Type Approval Authority shall inform thereof the other \nParties to the Agreement applying this Regulation by means of a copy of the \napproval form bearing at the end, in large letters, the signed and dated \nannotation "PRODUCTION DISCONTINUED". \n \n 12.  \nNames and addresses of Technical Services  \n \n \nresponsible for conducting approval tests and of  \n \n \nType Approval Authorities \n \nThe Contracting Parties to the Agreement applying this Regulation shall \ncommunicate to the United Nations Secretariat the names and addresses of the \nTechnical Services responsible for conducting approval tests and of Type \nApproval Authorities which grant approval and to which forms certifying \napproval or extension or refusal or withdrawal of approval, issued in ot




In [None]:
# Critique prompts. All three request the same reply layout: an
# "Evaluation: <text>" line followed by a "Total rating: <1-5>" line, so that
# one parser can read every critique. The instructions previously mixed
# "Evaluation"/"Rating" and "Total Rating"/"Total Evaluation" labels.
question_groundedness_critique_prompt = """
You will be given a context and a question. Your task is to provide a 'total assessment' score of whether you can clearly answer the question with the context provided. You will rate it on a scale of 1 to 5, where 1 means you cannot answer at all with the context, and 5 means you can answer the question clearly and unambiguously with the context.

Please respond as follows

Answer:::
Evaluation: (Explain why you gave the score you did)
Total rating: (Give your score as a single whole number between 1 and 5)

Make sure to include a value for 'Evaluation:' and 'Total rating:'.

Now you have a question and context.

question: {question}\n
context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You are presented with a question. Your task is to provide an 'overall assessment' score that represents how useful this question is for developers determining legal risk in the vehicle development ecosystem. Please rate it on a scale of 1 to 5, where 1 means not at all useful and 5 means extremely useful.

Please respond as follows

Answer:::
Evaluation: (a statement about why you gave the score)
Total rating: (a single whole number between 1 and 5)

Be sure to include values for 'Evaluation:' and 'Total rating:'.

Now we have a question.

question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
Your task is to provide a 'total assessment' score that indicates how context-independent the question is. A score of 1 means that more information is needed, and a score of 5 means that the question is self-explanatory. For example, if the question refers to a specific setting, such as 'in context' or 'in the documentation', the score should be 1. It is okay if the question contains technical terms or abbreviations such as brake or system, but it must be clear to an operator who can understand the question from the documentation.

For example, "How to disconnect the engine is in the appendix" should receive a score of 1 because it implies a specific context.

Please respond as follows

Answer:::
Evaluation: (a statement about why you gave the score)
Total rating: (a single whole number between 1 and 5)

Be sure to include values for 'Evaluation:' and 'Total rating:'.

Now we have a question.

question: {question}\n
Answer::: """

In [None]:
import re

# Final score label. The model answers with "Total rating", "Total Rating" or
# "Total Evaluation", sometimes followed by "/5", so the label is matched
# case-insensitively and only a single digit 1-5 is accepted as the score.
_CRITIQUE_SCORE_RE = re.compile(r"total\s+(?:rating|evaluation)\s*:\s*([1-5])(?!\d)", re.IGNORECASE)
# Free-text rationale: whatever follows an "Evaluation:" / "Rating:" label that
# is not itself part of a "Total ..." label, up to the next score label.
_CRITIQUE_RATIONALE_RE = re.compile(
    r"(?<!total )\b(?:evaluation|rating)\s*:\s*(.*?)\s*(?=total\s+(?:rating|evaluation)\s*:|\Z)",
    re.IGNORECASE | re.DOTALL,
)


def _parse_critique(evaluation):
    """Return ``(score, rationale)`` parsed from a critique reply.

    Returns ``None`` when the reply contains no recognizable total score.
    When several total scores are present the last one is used; the rationale
    is an empty string when no rationale label is found.
    """
    if not evaluation:
        return None
    scores = _CRITIQUE_SCORE_RE.findall(evaluation)
    if not scores:
        return None
    rationale_match = _CRITIQUE_RATIONALE_RE.search(evaluation)
    rationale = rationale_match.group(1).strip() if rationale_match else ""
    return int(scores[-1]), rationale


print("Generating critique for each QA couple...")
for output in tqdm(outputs):
    # One critique per criterion, each produced by its own prompt
    evaluations = {
        "groundedness": call_openai_gpt(
            question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
        ),
        "relevance": call_openai_gpt(
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_openai_gpt(
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }
    for criterion, evaluation in evaluations.items():
        parsed = _parse_critique(evaluation)
        if parsed is None:
            print(f"Warning: Could not parse evaluation for {criterion}: {evaluation}")
            # Remove any score already stored for this criterion so that an
            # unparsed critique can never pass the downstream score filter
            output.pop(f"{criterion}_score", None)
            output.pop(f"{criterion}_eval", None)
            continue
        score, rationale = parsed
        output.update(
            {
                f"{criterion}_score": score,
                f"{criterion}_eval": rationale,
            }
        )

Generating critique for each QA couple...


 12%|█▎        | 1/8 [00:03<00:23,  3.35s/it]


Total Rating: 5
Total Rating: 5


 25%|██▌       | 2/8 [00:07<00:22,  3.76s/it]


Total Rating: 3

Rating: The context gives detailed information about the approval process and the specific details required for the approval of a vehicle type. However, it does not directly state the purpose of Annex 1 in the context of vehicle approval processes. Some inference can be made based on the information provided, but it is not explicitly clear. 

Total Rating: 3

Total Evaluation: 1

Rating: The question is context-dependent and requires additional information related to Annex 1 and vehicle approval processes to provide a meaningful answer.

Total Rating: 1


 38%|███▊      | 3/8 [00:09<00:15,  3.14s/it]


Rating: 5
Total Rating: 5


 50%|█████     | 4/8 [00:13<00:12,  3.23s/it]


Rating: 5


Evaluation: The question is concise and specific, providing clear details about the calculation of the Peak Braking Coefficient for vehicles with a limitation in achieving a test speed of 50 km/h. It does not rely on additional context for understanding. Total Evaluation: 5


 62%|██████▎   | 5/8 [00:15<00:09,  3.07s/it]


Rating: 5/5
Total Rating: 5/5


 75%|███████▌  | 6/8 [00:19<00:06,  3.16s/it]

Total Rating: 5

Total Evaluation: 5

Rating: 5
Total Rating: 5


 88%|████████▊ | 7/8 [00:22<00:03,  3.02s/it]


Total Rating: 5

Total Evaluation: 5

Rating: The question is clear and specific. 
Total Rating: 5


100%|██████████| 8/8 [00:25<00:00,  3.18s/it]


Evaluation: The context clearly provides the specific information about UN Regulation No. 78, indicating its purpose related to vehicle braking systems for certain categories.

Total Rating: 5

Evaluation: The question is clear and does not require additional context to provide an answer. Total Evaluation: 5

Rating: 5 Total Rating: 5





In [None]:
import pandas as pd
from datasets import Dataset

pd.set_option("display.max_colwidth", None)

# Score columns that the critique step is expected to have added to `outputs`
required_columns = ["groundedness_score", "relevance_score", "standalone_score"]
display_columns = ["question", "answer"] + required_columns
MIN_SCORE = 4  # a QA pair is kept only if every score reaches this value

# Reset first: a dataset left over from an earlier run of this cell must not
# be reported as the result of the current run.
eval_dataset = None

# Log every QA pair that lacks one of the scores
for entry in outputs:
    if not all(score in entry for score in required_columns):
        print(f"Missing scores in entry: {entry}")

generated_questions = pd.DataFrame.from_dict(outputs)

# Only display and filter when all score columns exist
if all(col in generated_questions.columns for col in required_columns):
    print("Evaluation dataset before filtering:")
    display(generated_questions[display_columns])

    # Keep rows whose scores all reach MIN_SCORE; a missing (NaN) score
    # compares as False, so such rows are dropped
    filtered_questions = generated_questions.loc[
        (generated_questions[required_columns] >= MIN_SCORE).all(axis=1)
    ]

    print(f"Number of rows after filtering: {len(filtered_questions)}")
    if len(filtered_questions) == 0:
        print("Filtered dataset is empty. Adjust the filtering criteria.")
    else:
        print("============================================")
        print("Final evaluation dataset:")
        display(filtered_questions[display_columns])

        # Create the eval_dataset
        eval_dataset = Dataset.from_pandas(filtered_questions, preserve_index=False)
        print("Eval dataset created.")
else:
    missing_cols = [col for col in required_columns if col not in generated_questions.columns]
    print(f"Warning: Missing columns {missing_cols} in DataFrame. Check previous code cell for errors.")
    display(generated_questions)  # Display the DataFrame to inspect its contents

# Report whether this run produced a dataset
if eval_dataset is not None:
    print(f"eval_dataset contains {len(eval_dataset)} examples.")
    print(eval_dataset[0])
else:
    print("eval_dataset is empty or not defined.")


Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,What is the process for Contracting Parties to communicate information about Type Approval Authorities and Technical Services to the United Nations Secretariat?,The Contracting Parties to the Agreement applying the Regulation must communicate the names and addresses of Technical Services responsible for conducting approval tests and of Type Approval Authorities which grant approval to the United Nations Secretariat.,5,5,5
1,What is the purpose of the information provided in Annex 1 in the context of vehicle approval processes?,"The purpose of the information in Annex 1 is to document the approval, extension, refusal, withdrawal, or production discontinuation of a specific type of vehicle in relation to braking, following UN Regulation No. 78.",5,5,5
2,What actions are required when a modification is made to a vehicle type or its braking system according to the given context?,The modification must be communicated to the Type Approval Authority. The Authority can either consider the modifications to be compliant or request a further test report. Confirmation or refusal of approval must be communicated to the relevant parties.,5,5,5
3,How is the Peak Braking Coefficient (PBC) calculated for vehicles unable to achieve a test speed of 50 km/h?,"The Peak Braking Coefficient (PBC) for vehicles unable to achieve a test speed of 50 km/h is calculated using the formula tPBC = tPBC = where t represents the time taken, in seconds, for the speed of the vehicle to reduce from 0.8 Vmax to (0.8 Vmax - 20), where Vmax is measured in km/h.",5,5,5
4,What are the test conditions for conducting stops on a high friction surface in ABS tests?,"Initial brake temperature between 55°C and 100°C, test speed of 60 km/h or 0.9 Vmax, simultaneous actuation of both brake controls, and brake actuation force to ensure full cycling down to 10 km/h.",5,5,5
5,What are the performance requirements for the stopping distance in the partial failure test for split service brake systems?,The stopping distance (S) in the partial failure test for split service brake systems must be ≤ 0.1 V + 0.0117 V^2 (where V is the specified test speed in km/h) or the MFDD (Mean Fully Developed Deceleration) must be ≥ 3.3 m/s^2.,5,5,5
6,What is the position of the spray nozzles on the brake testing equipment according to the given context?,The spray nozzles are positioned two thirds of the distance from the outer circumference of the rotating drum to the wheel hub centre.,5,5,5
7,What is the purpose of UN Regulation No. 78?,"UN Regulation No. 78 establishes uniform provisions concerning the approval of vehicles of categories L1, L2, L3, L4, and L5 with regard to braking.",5,5,5


Number of rows after filtering: 8
Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,What is the process for Contracting Parties to communicate information about Type Approval Authorities and Technical Services to the United Nations Secretariat?,The Contracting Parties to the Agreement applying the Regulation must communicate the names and addresses of Technical Services responsible for conducting approval tests and of Type Approval Authorities which grant approval to the United Nations Secretariat.,5,5,5
1,What is the purpose of the information provided in Annex 1 in the context of vehicle approval processes?,"The purpose of the information in Annex 1 is to document the approval, extension, refusal, withdrawal, or production discontinuation of a specific type of vehicle in relation to braking, following UN Regulation No. 78.",5,5,5
2,What actions are required when a modification is made to a vehicle type or its braking system according to the given context?,The modification must be communicated to the Type Approval Authority. The Authority can either consider the modifications to be compliant or request a further test report. Confirmation or refusal of approval must be communicated to the relevant parties.,5,5,5
3,How is the Peak Braking Coefficient (PBC) calculated for vehicles unable to achieve a test speed of 50 km/h?,"The Peak Braking Coefficient (PBC) for vehicles unable to achieve a test speed of 50 km/h is calculated using the formula tPBC = tPBC = where t represents the time taken, in seconds, for the speed of the vehicle to reduce from 0.8 Vmax to (0.8 Vmax - 20), where Vmax is measured in km/h.",5,5,5
4,What are the test conditions for conducting stops on a high friction surface in ABS tests?,"Initial brake temperature between 55°C and 100°C, test speed of 60 km/h or 0.9 Vmax, simultaneous actuation of both brake controls, and brake actuation force to ensure full cycling down to 10 km/h.",5,5,5
5,What are the performance requirements for the stopping distance in the partial failure test for split service brake systems?,The stopping distance (S) in the partial failure test for split service brake systems must be ≤ 0.1 V + 0.0117 V^2 (where V is the specified test speed in km/h) or the MFDD (Mean Fully Developed Deceleration) must be ≥ 3.3 m/s^2.,5,5,5
6,What is the position of the spray nozzles on the brake testing equipment according to the given context?,The spray nozzles are positioned two thirds of the distance from the outer circumference of the rotating drum to the wheel hub centre.,5,5,5
7,What is the purpose of UN Regulation No. 78?,"UN Regulation No. 78 establishes uniform provisions concerning the approval of vehicles of categories L1, L2, L3, L4, and L5 with regard to braking.",5,5,5


Eval dataset created.
eval_dataset contains 8 examples.
{'context': 'E/ECE/324/Rev.1/Add.77/Rev.3 \nE/ECE/TRANS/505/Rev.1/Add.77/Rev.3 \n \n \nApproval Authority which granted the approval. Upon receiving the relevant \ncommunication, that Type Approval Authority shall inform thereof the other \nParties to the Agreement applying this Regulation by means of a copy of the \napproval form bearing at the end, in large letters, the signed and dated \nannotation "PRODUCTION DISCONTINUED". \n \n 12.  \nNames and addresses of Technical Services  \n \n \nresponsible for conducting approval tests and of  \n \n \nType Approval Authorities \n \nThe Contracting Parties to the Agreement applying this Regulation shall \ncommunicate to the United Nations Secretariat the names and addresses of the \nTechnical Services responsible for conducting approval tests and of Type \nApproval Authorities which grant approval and to which forms certifying \napproval or extension or refusal or withdrawal of approva

In [None]:
# Report, for every record in `outputs`, whether its three scores are present
for idx, entry in enumerate(outputs):
    # Each entry is expected to be a dict; collect the score keys it lacks
    missing_keys = [
        key
        for key in ('groundedness_score', 'relevance_score', 'standalone_score')
        if key not in entry
    ]

    if not missing_keys:
        print(f"Entry {idx} scores - Groundedness: {entry['groundedness_score']}, Relevance: {entry['relevance_score']}, Standalone: {entry['standalone_score']}")
        continue
    # At least one score key is absent from this entry
    print(f"Entry {idx} is missing scores: {missing_keys}")
# Print the first record
print(outputs[0])


Entry 0 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 1 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 2 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 3 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 4 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 5 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 6 scores - Groundedness: 5, Relevance: 5, Standalone: 5
Entry 7 scores - Groundedness: 5, Relevance: 5, Standalone: 5
{'context': 'E/ECE/324/Rev.1/Add.77/Rev.3 \nE/ECE/TRANS/505/Rev.1/Add.77/Rev.3 \n \n \nApproval Authority which granted the approval. Upon receiving the relevant \ncommunication, that Type Approval Authority shall inform thereof the other \nParties to the Agreement applying this Regulation by means of a copy of the \napproval form bearing at the end, in large letters, the signed and dated \nannotation "PRODUCTION DISCONTINUED". \n \n 12.  \nNames and addresses of Technical Services  \n \n \nresponsi

# RAG 프로세스 (JSONL)

In [None]:
!pip install chromadb tiktoken transformers sentence_transformers jq
!pip install openai langchain
!pip install -U langchain-openai langchain-community

Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting jq
  Downloading jq-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.5-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Down

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text):
    """Return how many tokens the module-level `tokenizer` encodes `text` into."""
    return len(tokenizer.encode(text))

In [None]:
import json
import pprint as ppr

from langchain_community.document_loaders import JSONLoader

In [None]:
import os
import openai
from getpass import getpass

# Never store the key in the notebook: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise prompt for it without echo.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
## Path of the JSONL file to load (relative to the working directory)
file_path = './R078r3e_merged.jsonl'

In [None]:
## Inspect the file: parse every non-blank line as one JSON record
with open(file_path, 'r', encoding='utf-8') as infile:
    # Blank lines (for example a trailing newline at the end of the file) are
    # skipped, because json.loads raises on empty input
    lines_inJSONL = [json.loads(line) for line in infile if line.strip()]

ppr.pprint(lines_inJSONL)

[{'1.1.': {'1.1.1.': {'Description': ['High friction surface:'],
                      'Item': ['(a) Applicable to all dynamic brake tests '
                               'excluding the ABS tests where an alow-friction '
                               'surface is specified;',
                               '(b) The test area is a clean and level '
                               'surface, with a gradient ≤ 1 percent;',
                               '(c) The surface has a nominal peak braking '
                               'coefficient (PBC) of 0.9 unless otherwise '
                               'specified.']},
           '1.1.2.': {'Description': ['Low friction surface:'],
                      'Item': ['(a) Applicable to all dynamic brake tests '
                               'where a low-friction surface is specified;',
                               '(b) The test area is a clean and level '
                               'surface, with a gradient ≤ 1 percent;',
               

In [None]:
## Load each JSON value of the JSONL file as one document
# jq_schema='.' selects the whole JSON value; json_lines=True makes the loader
# read the file line by line
loader = JSONLoader(file_path=file_path, jq_schema='.', text_content=False, json_lines=True)

data_toEmbedded = loader.load()

In [None]:
## Inspect the loaded documents (pretty-prints the full list)
ppr.pprint(data_toEmbedded)

[Document(metadata={'source': '/content/R078r3e_merged.jsonl', 'seq_num': 1}, page_content='{"Chapter": "1", "Title": "General", "1.1.": {"Description": ["Test surfaces"], "1.1.1.": {"Description": ["High friction surface:"], "Item": ["(a) Applicable to all dynamic brake tests excluding the ABS tests where an alow-friction surface is specified;", "(b) The test area is a clean and level surface, with a gradient \\u2264 1 percent;", "(c) The surface has a nominal peak braking coefficient (PBC) of 0.9 unless otherwise specified."]}, "1.1.2.": {"Description": ["Low friction surface:"], "Item": ["(a) Applicable to all dynamic brake tests where a low-friction surface is specified;", "(b) The test area is a clean and level surface, with a gradient \\u2264 1 percent;", "(c) The surface has a PBC of \\u2264 0.45."]}, "1.1.3.": {"Description": ["Measurement of PBC: The PBC is measured as determined by the approval Type Approval A using either:"], "Item": ["(a) An ASTM International (ASTM) E1136 

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [None]:
# Embedding model served through Hugging Face: run on CPU and return
# normalized embedding vectors
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

# Index the loaded documents in a Chroma vector store using those embeddings
docsearch = Chroma.from_documents(data_toEmbedded, hf)

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [None]:


# Chat model used by the RetrievalQA chain; tokens are streamed to stdout.
# NOTE(review): this rebinds the name `openai`, shadowing the `openai` module
# imported in an earlier cell. The name is kept because a later cell passes
# `openai` as the LLM argument.
openai = ChatOpenAI(model_name="gpt-3.5-turbo-0125",
                    streaming=True, callbacks=[StreamingStdOutCallbackHandler()],
                    temperature = 0)

# "stuff" chain over an MMR retriever: 10 candidates are fetched and 3 kept
qa = RetrievalQA.from_chain_type(llm = openai,
                                 chain_type = "stuff",
                                 retriever = docsearch.as_retriever(
                                    search_type="mmr",
                                    search_kwargs={'k':3, 'fetch_k': 10}),
                                 return_source_documents = True)

query = "What are the specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval, including the initial brake temperature range, test speed criteria for different vehicle categories, brake actuation force limits, and the necessary steps to be taken before and during the test to ensure compliance with the approval regulations?"
# Calling the chain object directly is deprecated; invoke() is the supported
# entry point and takes the chain's "query" input key
result = qa.invoke({"query": query})

  warn_deprecated(


The specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval include the following:

1. Initial brake temperature range: The initial brake temperature should be greater than or equal to 55°C and less than or equal to 100°C.
2. Test speed criteria: The test speed should result in 50 km/h or 0.5 Vmax, whichever is lower, at the point where the vehicle passes from one friction surface to another.
3. Brake actuation force limits: The force applied should be enough to ensure that the ABS will be fully cycling throughout each stop, down to 10 km/h. The specific limits vary based on the type of vehicle category.
4. Necessary steps before and during the test: Accelerate the vehicle to the test speed, actuate the brake control before reaching the transition from one friction surface to another, and ensure that the ABS is fully cycling throughout each stop. For vehicles with ABS fitted to both brake systems, simultaneous actuation of both brake 

In [None]:
# Prompt template for the RAG answer step; `{context}` and `{question}` are
# filled in with str.format.
RAG_PROMPT_TEMPLATE = """
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.

Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
"""
from langchain.docstore.document import Document as LangchainDocument
from langchain_core.language_models.llms import LLM
from langchain_core.vectorstores import VectorStore
from typing import List, Tuple

def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer ``question`` with retrieval-augmented generation.

    Args:
        question: The user question.
        llm: Language model used to generate the answer; it is called through
            ``invoke`` with a single human message.
        knowledge_index: Vector store queried with ``similarity_search``.
        num_retrieved_docs: Number of documents requested from the store.
        num_docs_final: Number of retrieved documents kept for the prompt.

    Returns:
        A ``(generated_answer, relevant_docs)`` tuple: the answer text and the
        texts (``page_content``) of the documents placed in the prompt.
    """
    # Retrieve candidates, keep only their text, then truncate to the final count
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved][:num_docs_final]

    # Build the context; a newline between documents keeps each
    # "Document N:::" header from running into the previous document's text
    context = "\nExtracted documents:\n"
    context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer. invoke() replaces the deprecated direct call.
    from langchain.schema import HumanMessage
    response = llm.invoke([HumanMessage(content=final_prompt)])
    # A message object carries the text in `.content`; a plain string result
    # is returned unchanged. Either way the caller receives a str.
    generated_answer = getattr(response, "content", response)

    return generated_answer, relevant_docs

# Answer the query with RAG, using the chat model and the Chroma index
generated_answer, docs = answer_with_rag(query, openai, docsearch)

print("generated_answer:", generated_answer)
print("Retrieved Docs:", docs)


  warn_deprecated(


The specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval include:
- Initial brake temperature range: ≥ 55°C and ≤ 100°C
- Test speed criteria: 0.8 Vmax for vehicles with Vmax > 125 km/h and 200 km/h; 160 km/h for vehicles with Vmax ≥ 200 km/h
- Brake actuation force limits: Hand control ≤ 200 N; Foot control ≤ 350 N for vehicle categories L 3 and L 4, ≤ 500 N for vehicle categories L 5 and L 7
- Necessary steps before and during the test: Accelerate the vehicle to the test speed, actuate the brake control(s) under specified conditions, and repeat the process for each stop until the vehicle meets the performance requirements (Document 1).generated_answer: content='The specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval include:\n- Initial brake temperature range: ≥ 55°C and ≤ 100°C\n- Test speed criteria: 0.8 Vmax for vehicles with Vmax > 125 km/h and 200 km/h; 160 km/h for veh

# rag 프로세스


In [None]:
!pip install chromadb tiktoken transformers sentence_transformers openai langchain pypdf

Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.5-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Co

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so that files
# stored on Drive can be opened through regular file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import openai
from getpass import getpass

# Do not hardcode the key in the notebook. Keep a key that is already present
# in the environment, and only prompt (without echoing) when it is missing.
# The previous unconditional assignment of '' wiped any key that was already set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import tiktoken

# Shared cl100k_base encoding, used to measure text length in tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """Return the number of cl100k_base tokens that `text` encodes to."""
    return len(tokenizer.encode(text))

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.11-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

# 새 섹션

In [None]:
# Read the whole Markdown export into memory. The encoding is given explicitly
# so the result does not depend on the platform's default locale.
with open("out-markdown.md", "r", encoding="utf-8") as file:
    markdown_document = file.read()

In [None]:
# Split the Markdown document into sections with a MarkdownTextSplitter that
# is built with its default settings (no arguments are passed here).
from langchain.text_splitter import MarkdownTextSplitter
splitter = MarkdownTextSplitter()
sections = splitter.split_text(markdown_document)

In [None]:
# Re-split every Markdown section with chunk_size=1500 / chunk_overlap=150
# (sizes are measured by the splitter's default length function) and collect
# the pieces of all sections in one flat list.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
all_chunks = [
    piece
    for section in sections
    for piece in text_splitter.split_text(section)
]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Embedding model settings: BGE-large (English) on CPU with normalized vectors.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Wrap every text chunk in a Document object.
documents = [Document(page_content=chunk) for chunk in all_chunks]

# Index the Markdown chunks with Chroma under an explicit collection name.
# Without a name, this cell and the PDF indexing cell further down both write
# to Chroma's default collection, so chunks from the two corpora (and from
# re-runs of either cell) can end up in one index within the same session.
# NOTE(review): confirm the shared in-memory behavior for the installed
# chromadb version.
docsearch = Chroma.from_documents(documents, hf, collection_name="markdown_chunks")

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
pip install -U langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.1.20-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_openai-0.1.20-py3-none-any.whl (48 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.1.20


# 새 섹션

In [None]:
# Load the regulation PDF from Google Drive and pre-split it into documents.
loader = PyPDFLoader("/content/drive/MyDrive/R078r3e_revisions.pdf")
pages = loader.load_and_split()

# Split again into chunks of at most 500 tokens (50 overlap); length is
# measured with tiktoken_len instead of a character count.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function = tiktoken_len)
texts = text_splitter.split_documents(pages)

from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model settings: BGE-large (English) on CPU with normalized vectors.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Index the PDF chunks under their own collection name. Without a name, this
# cell shares Chroma's default collection with the Markdown indexing cell
# above, so the retriever could return chunks from both corpora (and
# duplicates after re-running the cell).
# NOTE(review): confirm the shared in-memory behavior for the installed
# chromadb version.
# `docsearch` is rebound here; the rest of the notebook queries this PDF index.
docsearch = Chroma.from_documents(texts, hf, collection_name="pdf_chunks")

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Chat model that streams tokens to stdout while they are generated.
# NOTE(review): the name `openai` rebinds the `openai` module imported in an
# earlier cell; later cells pass this chat model around under that name.
openai = ChatOpenAI(model_name="gpt-4o-mini-2024-07-18",
                    streaming=True, callbacks=[StreamingStdOutCallbackHandler()],
                    temperature = 0)

# Retrieval QA chain: MMR search picks 3 chunks out of 10 fetched candidates,
# and the chunks are combined with the "map_reduce" chain type.
qa = RetrievalQA.from_chain_type(llm = openai,
                                 chain_type = "map_reduce",
                                 retriever = docsearch.as_retriever(
                                    search_type="mmr",
                                    search_kwargs={'k':3, 'fetch_k': 10}),
                                 return_source_documents = True)

query = "What are the specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval, including the initial brake temperature range, test speed criteria for different vehicle categories, brake actuation force limits, and the necessary steps to be taken before and during the test to ensure compliance with the approval regulations?"
# Use invoke() with the chain's "query" input key; calling the chain object
# directly (qa(query)) is the deprecated form of the same call.
result = qa.invoke({"query": query})

The specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval are as follows:

1. **Initial Brake Temperature**: The initial brake temperature must be between ≥ 55 °C and ≤ 100 °C.

2. **Test Speed**:
   - For vehicles with Vmax > 125 km/h and < 200 km/h: Test speed is 0.8 Vmax.
   - For vehicles with Vmax ≥ 200 km/h: Test speed is 160 km/h.

3. **Brake Application**:
   - For vehicles with two service brake systems: Simultaneous actuation of both brake controls.
   - For vehicles with one service brake system: Actuation of the single brake control.

4. **Brake Actuation Force Limits**:
   - Hand control: ≤ 200 N.
   - Foot control: 
     - ≤ 350 N for vehicle categories L3 and L4.
     - ≤ 500 N for vehicle categories L5 and L7.

5. **Number of Stops**: The vehicle must be stopped until it meets the performance requirements, with a maximum of 6 stops.

6. **Procedure for Each Stop**:
   - Accelerate the vehicle to the test speed.
   - 

In [None]:
# Prompt used by answer_with_rag: `{context}` receives the retrieved documents
# and `{question}` the user question (both filled in with str.format).
RAG_PROMPT_TEMPLATE = """
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.

Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
"""
from langchain.docstore.document import Document as LangchainDocument
from langchain_core.language_models.llms import LLM
from langchain_core.vectorstores import VectorStore
from typing import List, Tuple

def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
    reranker=None,
) -> Tuple[object, List[str]]:
    """Answer a question with retrieval-augmented generation.

    Args:
        question: The user question.
        llm: Model used to generate the answer; it is called through
            ``invoke`` with a single human message.
        knowledge_index: Vector store queried with ``similarity_search``.
        num_retrieved_docs: Number of candidate documents to retrieve.
        num_docs_final: Number of documents kept for the prompt.
        reranker: Optional reranker. When given, it must expose
            ``rerank(query, documents, k=...)`` and return items that carry
            the text under the ``"content"`` key (assumed RAGatouille-style
            interface; confirm for other rerankers).

    Returns:
        A tuple ``(generated_answer, relevant_docs)``. ``generated_answer`` is
        whatever ``llm.invoke`` returns (a message object for chat models);
        ``relevant_docs`` is the list of document texts put into the prompt.
    """
    # Retrieve candidates and keep only their text.
    candidates = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in candidates]

    # Optionally reorder the candidates; skipped when nothing was retrieved.
    if reranker is not None and relevant_docs:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    # Keep only as many documents as the prompt should contain.
    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Documents are separated by a newline so that one
    # document's text does not run into the next "Document N:::" header.
    context = "\nExtracted documents:\n"
    context += "\n".join(
        f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs)
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer. invoke() replaces the deprecated direct call llm(...).
    from langchain.schema import HumanMessage
    messages = [HumanMessage(content=final_prompt)]
    generated_answer = llm.invoke(messages)

    return generated_answer, relevant_docs

# Answer the demo question with RAG, using the chat model bound to `openai`
# and the `docsearch` index.
generated_answer, docs = answer_with_rag(query, openai, docsearch)

# Show the raw return values: the generated answer and the retrieved texts.
print("generated_answer:", generated_answer)
print("Retrieved Docs:", docs)


The specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval are as follows:

1. **Initial Brake Temperature**: The initial brake temperature must be between ≥ 55 °C and ≤ 100 °C.

2. **Test Speed Criteria**:
   - For vehicles with Vmax > 125 km/h and < 200 km/h: Test speed is 0.8 Vmax.
   - For vehicles with Vmax ≥ 200 km/h: Test speed is 160 km/h.
   - For vehicles with Vmax ≤ 125 km/h: The test is not required (Document 1).

3. **Brake Actuation Force Limits**:
   - Hand control: ≤ 200 N.
   - Foot control: 
     - ≤ 350 N for vehicle categories L3 and L4.
     - ≤ 500 N for vehicle categories L5 and L7 (Document 0).

4. **Test Procedure Steps**:
   - Accelerate the vehicle to the specified test speed.
   - Simultaneously actuate both brake controls for vehicles with two service brake systems, or actuate the single brake control for vehicles with one service brake system.
   - Conduct a maximum of 6 stops until the vehicle meets the

# RAG 평가

In [None]:
# Debug pass over the evaluation set: answer every question that is not yet in
# `outputs`, then save the results and reload them as a sanity check.
# NOTE(review): this cell relies on eval_dataset, outputs, llm, knowledge_index,
# verbose, test_settings and output_file already existing in the kernel; none
# of them is defined in this cell.
seen_questions = {output["question"] for output in outputs}
for example in tqdm(eval_dataset):
    question = example["question"]
    if question in seen_questions:
        print(f"Skipping already processed question: {question}")
        continue

    try:
        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index)
        print(f"Generated Answer: {answer}")  # show the generated answer
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')

        # Chat models return a message object; store only its text so that
        # json.dump below can serialize the record.
        answer_text = getattr(answer, "content", answer)

        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer_text,
            "retrieved_docs": list(relevant_docs),
        }
        print(f"Result: {result}")  # show the record that will be stored

        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        seen_questions.add(question)
    except Exception as e:
        print(f"Error processing question {question}: {e}")

print(f"Number of items in outputs: {len(outputs)}")
if len(outputs) > 0:
    print(f"First item in outputs: {outputs[0]}")

# Save the results; create the target directory first so the write cannot
# fail after all questions have been processed.
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w") as f:
    json.dump(outputs, f)
print(f"Data saved to {output_file}")

# Reload the file to confirm what was written.
with open(output_file, "r") as f:
    data = json.load(f)
    print(f"Data loaded from {output_file}: {len(data)} items")
    if len(data) > 0:
        print(f"First item in data: {data[0]}")


100%|██████████| 8/8 [00:00<00:00, 2238.60it/s]

Skipping already processed question: What is the process for Contracting Parties to communicate information about Type Approval Authorities and Technical Services to the United Nations Secretariat?
Skipping already processed question: What is the purpose of the information provided in Annex 1 in the context of vehicle approval processes?
Skipping already processed question: What actions are required when a modification is made to a vehicle type or its braking system according to the given context?
Skipping already processed question: How is the Peak Braking Coefficient (PBC) calculated for vehicles unable to achieve a test speed of 50 km/h?
Skipping already processed question: What are the test conditions for conducting stops on a high friction surface in ABS tests?
Skipping already processed question: What are the performance requirements for the stopping distance in the partial failure test for split service brake systems?
Skipping already processed question: What is the position of 




In [None]:
from langchain_core.language_models import BaseChatModel
from datasets import Dataset
from typing import Optional
import os
import json
from tqdm import tqdm

# Location of the saved RAG evaluation results.
output_dir = "output"
output_file = os.path.join(output_dir, "rag_evaluation_results.json")

if os.path.isfile(output_file):
    print(f"File {output_file} exists.")

    # Read the file content.
    with open(output_file, "r") as f:
        data = json.load(f)

    # Report how many records it holds.
    print(f"Number of records in {output_file}: {len(data)}")
    if len(data) > 0:
        # Show the first record.
        print(f"First record in {output_file}: {data[0]}")
else:
    print(f"File {output_file} does not exist.")

# Create the output directory when it is missing; exist_ok avoids the separate
# existence check and the gap between that check and the creation.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): eval_dataset is not defined in this cell; it must already
# exist in the kernel.
if eval_dataset:
    print(f"eval_dataset contains {len(eval_dataset)} examples.")
    # Show the first example.
    print(eval_dataset[0])
else:
    print("eval_dataset is empty.")

def run_rag_tests(
    eval_dataset: Dataset,
    llm,
    knowledge_index: Chroma,
    output_file: str,
    reranker = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Args:
        eval_dataset: Examples providing "question", "answer" and "source_doc".
        llm: Model handed to answer_with_rag.
        knowledge_index: Vector store handed to answer_with_rag.
        output_file: JSON file holding the results. Existing records are
            loaded first, and questions already present are skipped.
        reranker: Optional reranker forwarded to answer_with_rag.
        verbose: When true, prints question, generated answer and true answer.
        test_settings: Optional label stored with every new record.

    A failure on a single question is printed and the run continues.
    """
    if eval_dataset:
        print(f"eval_dataset contains {len(eval_dataset)} examples.")
        # Show the first example (more can be printed here if needed).
        print(eval_dataset[0])
    else:
        print("eval_dataset is empty.")

    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file or invalid JSON: start from scratch. Unlike a
        # bare `except`, this does not swallow KeyboardInterrupt and the like.
        outputs = []
    print(f"Loaded {len(outputs)} existing outputs.")

    # Make sure the target directory exists before any result is written.
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

    # Set of handled questions: constant-time lookups instead of rebuilding a
    # list of all questions for every example.
    seen_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in seen_questions:
            print(f"Skipping already processed question: {question}")
            continue

        try:
            generated_answer, relevant_docs = answer_with_rag(
                question, llm, knowledge_index, reranker=reranker
            )
            print(f"Generated Answer: {generated_answer}")  # show the generated answer

            if verbose:
                print("=======================================================")
                print(f"Question: {question}")
                print(f"Generated Answer: {generated_answer}")
                print(f'True Answer: {example["answer"]}')

            # Chat models return a message object whose text is in `.content`;
            # plain-string answers are stored unchanged.
            answer_text = getattr(generated_answer, "content", generated_answer)

            result = {
                "question": question,
                "true_answer": example["answer"],
                "source_doc": example["source_doc"],
                "generated_answer": answer_text,
                "retrieved_docs": list(relevant_docs),
            }
            print(f"Appending result: {result}")  # show the record that will be stored

            if test_settings:
                result["test_settings"] = test_settings
            outputs.append(result)
            seen_questions.add(question)

            # Persist after every new answer so an interrupted run can resume.
            with open(output_file, "w") as f:
                json.dump(outputs, f)
        except Exception as e:
            print(f"Error processing question {question}: {e}")

    print(f"Number of items in outputs: {len(outputs)}")
    if len(outputs) > 0:
        print(f"First item in outputs: {outputs[0]}")

    with open(output_file, "w") as f:
        json.dump(outputs, f)
    print(f"Data saved to {output_file}")

    # Reload the file to confirm what was written.
    with open(output_file, "r") as f:
        data = json.load(f)
        print(f"Data loaded from {output_file}: {len(data)} items")
        if len(data) > 0:
            print(f"First item in data: {data[0]}")

# Intended check: confirm that answer_with_rag produced an answer.
# NOTE(review): run_rag_tests is only defined above, not called, so
# `generated_answer` here is the module-level variable left over from an
# earlier cell, not a value produced inside run_rag_tests.
print("Generated answer:", generated_answer)

# Intended check: show the record right before it is appended to outputs.
# NOTE(review): `result` is likewise a module-level variable from an earlier
# cell, not the record built inside run_rag_tests.
print("Appending result:", result)

# Reload the saved results file and report its content.
with open(output_file, "r") as f:
    data = json.load(f)
    print(f"Data loaded from {output_file}: {len(data)} items")
    if len(data) > 0:
        print(f"First item in data: {data[0]}")


File output/rag_evaluation_results.json exists.
Number of records in output/rag_evaluation_results.json: 8
First record in output/rag_evaluation_results.json: {'context': 'E/ECE/324/Rev.1/Add.77/Rev.3 \nE/ECE/TRANS/505/Rev.1/Add.77/Rev.3 \n \n \nApproval Authority which granted the approval. Upon receiving the relevant \ncommunication, that Type Approval Authority shall inform thereof the other \nParties to the Agreement applying this Regulation by means of a copy of the \napproval form bearing at the end, in large letters, the signed and dated \nannotation "PRODUCTION DISCONTINUED". \n \n 12.  \nNames and addresses of Technical Services  \n \n \nresponsible for conducting approval tests and of  \n \n \nType Approval Authorities \n \nThe Contracting Parties to the Agreement applying this Regulation shall \ncommunicate to the United Nations Secretariat the names and addresses of the \nTechnical Services responsible for conducting approval tests and of Type \nApproval Authorities which g

In [None]:
# Show the first record stored in `outputs`, or report that it is empty.
if not outputs:
    print("No data in outputs")
else:
    print("First item in outputs:", outputs[0])




First item in outputs: {'context': 'E/ECE/324/Rev.1/Add.77/Rev.3 \nE/ECE/TRANS/505/Rev.1/Add.77/Rev.3 \n \n \nApproval Authority which granted the approval. Upon receiving the relevant \ncommunication, that Type Approval Authority shall inform thereof the other \nParties to the Agreement applying this Regulation by means of a copy of the \napproval form bearing at the end, in large letters, the signed and dated \nannotation "PRODUCTION DISCONTINUED". \n \n 12.  \nNames and addresses of Technical Services  \n \n \nresponsible for conducting approval tests and of  \n \n \nType Approval Authorities \n \nThe Contracting Parties to the Agreement applying this Regulation shall \ncommunicate to the United Nations Secretariat the names and addresses of the \nTechnical Services responsible for conducting approval tests and of Type \nApproval Authorities which grant approval and to which forms certifying \napproval or extension or refusal or withdrawal of approval, issued in other \ncountries, a

In [None]:
# Judge prompt for scoring a generated answer against a reference answer.
# Placeholders: {instruction}, {response}, {reference_answer}; the doubled
# braces are literal braces in the rendered prompt.
# NOTE(review): the instructions ask for an integer score between 1 and 5,
# but the rubric below also defines the levels "4-" and "4+". A judge that
# answers "4+" or "4-" yields a score string that a plain int() conversion
# cannot parse - decide on one scale and align the rubric with it.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]

Score 1:

The response is completely incorrect, inaccurate, and/or not factual.
The answer contains no correct information or relevant details from the reference.
It may include fabricated or misleading information.
Score 2:

The response is mostly incorrect, inaccurate, and/or not factual.
Some elements may vaguely relate to the reference, but the overall answer is misleading or incorrect.
It fails to address the core of the question appropriately.
Score 3:

The response is somewhat correct, accurate, and/or factual.
Key information may be missing or slightly incorrect, but the general idea is present.
The answer demonstrates a basic understanding but lacks detail or precision.
It may include minor errors or lack full clarity.
Score 4-:

The response is mostly correct, accurate, and factual, but with minor errors or ambiguities.
The answer covers most aspects correctly but may slightly misinterpret the context or miss finer details.
There may be a small degree of speculation or vague wording that detracts from complete accuracy.
Score 4+:

The response is almost entirely correct, accurate, and factual with minimal errors.
The answer is clear and aligns well with the reference, though it may lack some specificity or completeness.
There is very little room for improvement, but it might slightly benefit from a more precise or detailed expression.
Score 5:

The response is completely correct, accurate, and factual.
The answer is thorough, clear, and precisely reflects the reference.
It addresses the question fully with no errors or ambiguities.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


# Chat prompt for the judge model: a fixed system message followed by the
# EVALUATION_PROMPT text as the human message.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [None]:
from langchain.chat_models import ChatOpenAI
import json
import os
from tqdm import tqdm

# NOTE(review): the key is an empty string here and is passed explicitly to
# ChatOpenAI below; confirm how the model resolves an empty key, and never
# commit a real key to the notebook.
OPENAI_API_KEY = ""

# OpenAI chat model used as the judge (temperature 0).
eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0, openai_api_key=OPENAI_API_KEY)

# Evaluator name; it becomes part of the result keys eval_score_<name> and
# eval_feedback_<name>.
evaluator_name = "GPT4"

# File that is read for answers and rewritten with the evaluation results.
# NOTE(review): this path ends in "rag_evaluation_results1.json", while the
# results file used earlier in the notebook is "rag_evaluation_results.json";
# the recorded run reported that no file exists at this path - confirm which
# file is intended.
answer_path = "./output/rag_evaluation_results1.json"

def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers and saves the results to the specified file.

    Args:
        answer_path: JSON file with the answer records; it is rewritten in
            place with the scores and feedback added.
        eval_chat_model: Judge model, called through ``invoke``.
        evaluator_name: Suffix of the keys ``eval_score_<name>`` and
            ``eval_feedback_<name>`` written into every record.
        evaluation_prompt_template: Prompt template formatted with
            ``instruction``, ``response`` and ``reference_answer``.

    Records that already carry a score, or that lack "true_answer", are
    skipped. When ``answer_path`` does not exist nothing is written, so no
    empty results file is left behind for later aggregation to pick up.
    """
    if not os.path.isfile(answer_path):
        print(f"No existing file at {answer_path}, nothing to evaluate.")
        return

    with open(answer_path, "r") as f:
        answers = json.load(f)
    print(f"Loaded {len(answers)} existing answers from {answer_path}")

    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    try:
        for experiment in tqdm(answers):
            if score_key in experiment:
                print(f"Already evaluated: {experiment['question']}")
                continue
            if "true_answer" not in experiment:
                print(f"Skipping evaluation for question: {experiment['question']} due to missing 'true_answer'")
                continue

            eval_prompt = evaluation_prompt_template.format_messages(
                instruction=experiment["question"],
                # Older result files may hold the answer under the literal
                # key "generated_answer.content"; try both keys.
                response=experiment.get("generated_answer", experiment.get("generated_answer.content")),
                reference_answer=experiment["true_answer"],
            )
            eval_result = eval_chat_model.invoke(eval_prompt)

            # Split at the last "[RESULT]" marker. Unlike unpacking the result
            # of str.split, this does not raise when the marker is missing or
            # appears more than once.
            feedback, marker, score = eval_result.content.rpartition("[RESULT]")
            if not marker:
                # Leave the record unscored so a later run can retry it.
                print(f"Could not parse a score for: {experiment['question']} (no [RESULT] marker)")
                continue

            experiment[score_key] = score.strip()
            experiment[feedback_key] = feedback.strip()
            print(f"Evaluated: {experiment['question']} with score {experiment[score_key]}")
    finally:
        # Save in every case so that finished evaluations survive an error in
        # a later judge call.
        print(f"Saving {len(answers)} evaluations to {answer_path}")
        with open(answer_path, "w") as f:
            json.dump(answers, f)



# Build the evaluation prompt template from EVALUATION_PROMPT.
# NOTE(review): this repeats the evaluation_prompt_template definition from
# the cell that defines EVALUATION_PROMPT; one of the two can be dropped.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

# Run the evaluation on the answers stored at answer_path.
evaluate_answers(
    answer_path=answer_path,
    eval_chat_model=eval_chat_model,
    evaluator_name=evaluator_name,
    evaluation_prompt_template=evaluation_prompt_template
)


  warn_deprecated(


No existing file at ./output/rag_evaluation_results1.json, starting fresh.


0it [00:00, ?it/s]

Saving 0 evaluations to ./output/rag_evaluation_results1.json





In [None]:
# Experiment grid: for every combination of chunk size, embedding model and
# rerank flag, build a knowledge index, run the RAG tests, and evaluate the
# generated answers. Each combination writes its own JSON file under ./output.
# NOTE(review): READER_MODEL_NAME, load_embeddings, RAW_KNOWLEDGE_BASE,
# RAGPretrainedModel and READER_LLM are not defined in the visible part of
# this notebook; the recorded output of this cell is a NameError for
# READER_MODEL_NAME. Define or import them before running this cell.
if not os.path.exists("./output"):
    os.mkdir("./output")

for chunk_size in [200]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            # Settings label; '/' in the model name is replaced with '~'
            # because the label is used inside the output file name.
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings,
            )

            print("Running RAG...")
            # A reranker model is loaded only for the rerank=True runs.
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )


NameError: name 'READER_MODEL_NAME' is not defined

In [None]:
import glob

# Load every JSON result file under ./output into a DataFrame, tag each row
# with the file it came from, and concatenate everything into `result`.
# NOTE(review): `outputs` is rebound here to a list of DataFrames; earlier
# cells use the same name for a list of result dicts.
outputs = []
for file in glob.glob("./output/*.json"):
    # Open through a context manager so the file handle is closed right away.
    with open(file, "r") as f:
        output = pd.DataFrame(json.load(f))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)


In [None]:
import re


def _parse_eval_score(raw):
    """Return the judge score as an int from 1 to 5.

    Takes the first standalone digit 1-5 in the judge output, so values such
    as "4", " 4 ", "4+" or "4-" are all read as 4. Non-string values (for
    example NaN for unevaluated rows) and strings without such a digit fall
    back to 1, matching the previous default for non-string values.
    """
    if not isinstance(raw, str):
        return 1
    match = re.search(r"(?<!\d)[1-5](?!\d)", raw)
    return int(match.group()) if match else 1


if "eval_score_GPT4" not in result.columns:
    raise KeyError(
        "eval_score_GPT4 is missing from the loaded results; run "
        "evaluate_answers on the answer files before aggregating scores."
    )

# Keep the judge's raw output in its own column and always derive the
# normalized score from it, so that re-running this cell gives the same
# result instead of normalizing already-normalized values.
if "eval_score_GPT4_raw" not in result.columns:
    result["eval_score_GPT4_raw"] = result["eval_score_GPT4"]

# Map the 1-5 score to the range 0-1.
result["eval_score_GPT4"] = (
    result["eval_score_GPT4_raw"].apply(_parse_eval_score) - 1
) / 4

KeyError: 'eval_score_GPT4'

In [None]:
# Mean eval_score_GPT4 per settings value (the source file path of each row),
# displayed in ascending order.
average_scores = result.groupby("settings")["eval_score_GPT4"].mean()
average_scores.sort_values()