# Word to Json Parsing

In [46]:
import json
from docx import Document
import re
import uuid
import os

# --- 설정 변수 ---
WORD_FILE_PATH = "C:/RAG_DATA/evaluation/RAG_Quiz_Eng_v2.docx"
OUTPUT_JSON_PATH = "./evaluation_data_final_v3.json" # 최종 파일을 이 경로에 저장
# -----------------

def parse_docx_to_json_final_v3(word_file_path):
    """
    Word 파일에서 질문-답변 쌍을 추출하여 JSON 구조로 변환합니다.
    - 객관식(1-20)과 서술형(21-30)의 type을 정확하게 분류하고, 서술형 정답을 완전하게 추출
    """
    
    try:
        doc = Document(word_file_path)
    except FileNotFoundError:
        print(f"오류: 파일을 찾을 수 없습니다. 경로를 확인해주세요: {word_file_path}")
        return None

    all_data = []
    current_question_data = {}
    is_mcq_section = True
    question_number = 1
    
    # 정규 표현식 패턴
    option_pattern = re.compile(r"^\((A|B|C|D|E)\)")
    # 질문 시작: '숫자.'로 시작하는 텍스트만 인식
    question_start_pattern = re.compile(r"^\d+\.\s+", re.IGNORECASE)
    # 서술형 질문 제목/주제 부분 제거를 위한 패턴: '1. Transaction Concept: ...' -> 'Transaction Concept: ...'
    descriptive_topic_pattern = re.compile(r"^\d+\.\s+", re.IGNORECASE)
    answer_tag_pattern = re.compile(r"^Answer:\s*(\(?[A-E]\)?|\S.*)", re.IGNORECASE)

    # --- 파싱 로직 시작 ---
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        
        if not text:
            continue
            
        # 1. Part II 섹션 진입 감지 (서술형 시작)
        if text.startswith("Part II: Descriptive Questions"):
            # 객관식 20번 저장 (20번이 마지막에 처리되지 않았을 경우)
            if current_question_data and current_question_data.get('type') == 'mcq':
                 if not current_question_data.get('ground_truth_answer'):
                    current_question_data['ground_truth_answer'] = "Answer not found in parse"
                 all_data.append(current_question_data)
                 
            current_question_data = {} # 데이터 초기화
            is_mcq_section = False
            question_number = 21 # 서술형은 21번부터 시작
            continue
            
        # 2. 새로운 질문 시작 감지
        
        if question_start_pattern.match(text):
            # 이전 질문 저장 (새로운 질문 번호가 발견되면 저장)
            if current_question_data:
                if not current_question_data.get('ground_truth_answer'):
                    current_question_data['ground_truth_answer'] = "Answer not found in parse"
                # 객관식 질문이 서술형 섹션에 잘못 포함된 경우 answer_options 제거
                if current_question_data.get('type') == 'descriptive' and 'answer_options' in current_question_data:
                    del current_question_data['answer_options']
                all_data.append(current_question_data)

            # 새 질문 초기화
            q_type = "mcq" if is_mcq_section else "descriptive"
            
            # 서술형 질문의 경우, "1. "과 같은 번호를 제거하여 질문만 남깁니다.
            q_text = descriptive_topic_pattern.sub("", text, 1) if not is_mcq_section else text

            current_question_data = {
                "number": question_number,
                "type": q_type,
                "question": q_text,
                "correct_node_id": str(uuid.uuid4()),
                "ground_truth_context": "",
                "ground_truth_answer": ""
            }
            # 객관식일 경우에만 answer_options 리스트를 추가합니다.
            if q_type == "mcq":
                 current_question_data["answer_options"] = []
                 
            question_number += 1
            continue

        # 3. 답변 옵션 파싱 (객관식에서만)
        if current_question_data.get('type') == 'mcq' and option_pattern.match(text):
            current_question_data['answer_options'].append(text)
            continue
            
        # 4. 정답 파싱 및 연속 정답 처리
        
        answer_match = answer_tag_pattern.match(text)
        
        if answer_match and current_question_data and not current_question_data.get('ground_truth_answer'):
            answer_content = answer_match.group(1).strip()
            
            if current_question_data['type'] == 'mcq':
                # 객관식 정답 처리 (예: Answer: (C))
                current_question_data['ground_truth_answer'] = f"({answer_content.upper()})" if len(answer_content) == 1 else answer_content
                continue
            
            if current_question_data['type'] == 'descriptive':
                # 서술형 정답 처리 (시작)
                current_question_data['ground_truth_answer'] = answer_content
                continue
                
        # 서술형 정답 연속 파싱
        if current_question_data and current_question_data.get('type') == 'descriptive' and current_question_data.get('ground_truth_answer'):
            # 다음 질문이나 새로운 'Answer:' 태그가 아닌 경우에만 텍스트를 정답에 추가
            if not question_start_pattern.match(text) and not answer_tag_pattern.match(text):
                current_question_data['ground_truth_answer'] += " " + text
                continue
                
    # 5. 마지막 질문 저장
    if current_question_data:
        if not current_question_data.get('ground_truth_answer'):
            current_question_data['ground_truth_answer'] = "Answer not found in parse"
            
        if current_question_data['type'] == 'descriptive' and 'answer_options' in current_question_data:
            del current_question_data['answer_options']
            
        all_data.append(current_question_data)

    # 최종 JSON 객체 생성 및 파일 저장
    final_json = {"mcq": [data for data in all_data if data['type'] == 'mcq'], 
                  "descriptive": [data for data in all_data if data['type'] == 'descriptive']}
    
    with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f:
        json.dump(final_json, f, ensure_ascii=False, indent=4)
        
    print(f"\n✅ JSON 변환 완료. 총 {len(final_json['mcq'])}개의 객관식(1-20)과 {len(final_json['descriptive'])}개의 서술형(21-30) 질문이 {OUTPUT_JSON_PATH}에 저장되었습니다.")
    
    return final_json

evaluation_data_final = parse_docx_to_json_final_v2(WORD_FILE_PATH)


✅ JSON 변환 완료. 총 30개의 객관식(1-20)과 0개의 서술형(21-30) 질문이 ./evaluation_data_final_v3.json에 저장되었습니다.


# Evaluation

In [2]:
import os
import json
import glob
import re
import difflib
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

# 자연어 처리 평가 지표 라이브러리
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score_func
from collections import Counter

# Word 문서 생성을 위한 라이브러리
try:
    from docx import Document
    from docx.shared import Pt, RGBColor
except ImportError:
    print("Error: 'python-docx' library is missing. Please install it using 'pip install python-docx'")
    Document = None

# LangChain 관련 임포트
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage

# 경고 무시
warnings.filterwarnings("ignore")

# ==========================================
# 1. 설정 및 API 키
# ==========================================

DATA_DIR = "C:/RAG_DATA/data/processed_v2"
EVAL_FILE_PATH = "C:/RAG_DATA/evaluation/RAG_Quiz_Eng_with_IDs.json"
RESULT_CSV_PATH = "C:/RAG_DATA/evaluation/raptor_final_metrics.csv"
RESULT_DOCX_PATH = "C:/RAG_DATA/evaluation/raptor_qa_logs.docx" # Word 파일 저장 경로

# OpenRouter / OpenAI 설정
OPENROUTER_API_KEY = "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"
OPENROUTER_API_BASE = "https://openrouter.ai/api/v1"

os.environ["OPENROUTER_API_KEY"] = OPENROUTER_API_KEY

LLM_MODEL_NAME = "google/gemini-2.5-flash-preview-09-2025"
EMBEDDING_MODEL_NAME = "openai/text-embedding-3-small"

TOP_K = 5
TEXT_SIMILARITY_THRESHOLD = 0.3

# ==========================================
# 2. NLTK 리소스 다운로드 및 설정
# ==========================================
def download_nltk_resources():
    resources = ['punkt', 'wordnet', 'omw-1.4', 'punkt_tab']
    for res in resources:
        try:
            nltk.data.find(f'tokenizers/{res}')
        except LookupError:
            print(f"Downloading NLTK resource: {res}")
            nltk.download(res, quiet=True)
        except ValueError:
             pass 

download_nltk_resources()
rouge_evaluator = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# ==========================================
# 3. [해결책 B] 검색 평가 함수
# ==========================================

def check_text_similarity(doc_text, gt_context, threshold=TEXT_SIMILARITY_THRESHOLD):
    if not doc_text or not gt_context:
        return False
    
    doc_text = doc_text.lower().strip()
    gt_context = gt_context.lower().strip()
    
    if doc_text in gt_context or gt_context in doc_text:
        return True

    try:
        doc_tokens = set(doc_text.split())
        gt_tokens = set(gt_context.split())
        if not gt_tokens or not doc_tokens:
            jaccard = 0.0
        else:
            intersection = doc_tokens.intersection(gt_tokens)
            union = doc_tokens.union(gt_tokens)
            jaccard = len(intersection) / len(union)
        
        if jaccard >= threshold:
            return True
    except:
        pass
        
    try:
        ratio = difflib.SequenceMatcher(None, doc_text[:2000], gt_context[:2000]).ratio()
        if ratio >= threshold:
            return True
    except:
        pass

    return False

def calculate_retrieval_metrics(retrieved_docs, correct_node_id, ground_truth_context, k=TOP_K):
    if not retrieved_docs:
        return 0.0, 0.0, 0.0

    retrieved_ids = [doc.metadata.get("node_id") for doc in retrieved_docs]
    
    try:
        rank_idx = retrieved_ids.index(correct_node_id)
        rank = rank_idx + 1
        return 1.0, 1.0/rank, 1.0/np.log2(rank + 1)
    except ValueError:
        pass

    if ground_truth_context:
        for i, doc in enumerate(retrieved_docs):
            if check_text_similarity(doc.page_content, ground_truth_context, TEXT_SIMILARITY_THRESHOLD):
                rank = i + 1
                return 1.0, 1.0/rank, 1.0/np.log2(rank + 1)

    return 0.0, 0.0, 0.0

# ==========================================
# 4. 답변 품질 평가 함수들
# ==========================================

def clean_text(text):
    return text.strip().lower()

def calculate_mcq_accuracy(prediction, ground_truth):
    if not prediction or not ground_truth:
        return 0.0
    
    pred_match = re.search(r'\(([A-E])\)', prediction)
    gt_match = re.search(r'\(([A-E])\)', ground_truth)
    
    pred_char = pred_match.group(1) if pred_match else ""
    gt_char = gt_match.group(1) if gt_match else ""
    
    if not pred_char:
        for char in ['A', 'B', 'C', 'D', 'E']:
            if f"({char})" in prediction or f"{char}." in prediction:
                pred_char = char
                break
                
    if not gt_char:
        gt_char = ground_truth.strip().replace("(", "").replace(")", "")

    return 1.0 if pred_char == gt_char else 0.0

def calculate_token_f1(prediction, ground_truth):
    pred_tokens = clean_text(prediction).split()
    gt_tokens = clean_text(ground_truth).split()
    
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return 0.0
    
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(gt_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def calculate_descriptive_metrics(prediction, ground_truth):
    metrics = {}
    
    try:
        P, R, F1 = bert_score_func([prediction], [ground_truth], lang="en", verbose=False)
        metrics['BERTScore'] = F1.mean().item()
    except Exception:
        metrics['BERTScore'] = 0.0
        
    try:
        scores = rouge_evaluator.score(ground_truth, prediction)
        metrics['Rouge-L'] = scores['rougeL'].fmeasure
    except Exception:
        metrics['Rouge-L'] = 0.0
        
    pred_tokens = nltk.word_tokenize(prediction)
    gt_tokens = nltk.word_tokenize(ground_truth)
    
    try:
        chencherry = SmoothingFunction()
        metrics['BLEU-1'] = sentence_bleu([gt_tokens], pred_tokens, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1)
    except Exception:
        metrics['BLEU-1'] = 0.0
        
    try:
        metrics['METEOR'] = meteor_score([gt_tokens], pred_tokens)
    except Exception:
        metrics['METEOR'] = 0.0
        
    metrics['Token_F1'] = calculate_token_f1(prediction, ground_truth)
    return metrics

# ==========================================
# 5. 모델 로드 및 생성 함수
# ==========================================

def load_raptor_model(folder_path, index_name):
    embeddings = OpenAIEmbeddings(
        model=EMBEDDING_MODEL_NAME,
        openai_api_key=OPENROUTER_API_KEY,
        openai_api_base=OPENROUTER_API_BASE,
        check_embedding_ctx_length=False
    )
    try:
        vectorstore = FAISS.load_local(
            folder_path=folder_path, 
            embeddings=embeddings, 
            index_name=index_name,
            allow_dangerous_deserialization=True 
        )
        return vectorstore
    except Exception as e:
        print(f"Error loading model {index_name}: {e}")
        return None

def generate_mcq_answer(llm, context, question, options):
    options_str = "\n".join(options)
    prompt = f"""You are a helpful assistant. 
Read the following context and answer the multiple-choice question.
Output ONLY the option letter (e.g., (A), (B), (C), (D), or (E)) that corresponds to the correct answer. Do not write any other text.

Context:
{context}

Question: {question}
Options:
{options_str}

Correct Option:"""
    messages = [HumanMessage(content=prompt)]
    try:
        response = llm.invoke(messages)
        return response.content.strip()
    except:
        return ""

def generate_descriptive_answer(llm, context, question):
    prompt = f"""You are a helpful assistant. Answer the question based ONLY on the following context.
Answer concisely and accurately.

Context:
{context}

Question: {question}

Answer:"""
    messages = [HumanMessage(content=prompt)]
    try:
        response = llm.invoke(messages)
        return response.content.strip()
    except:
        return ""

# ==========================================
# 6. Word 저장 함수 (추가됨)
# ==========================================
def save_logs_to_docx(qa_logs, filepath):
    """
    수집된 Q&A 로그를 Word 파일로 저장합니다.
    """
    if Document is None:
        print("Skipping Word export: python-docx not installed.")
        return

    doc = Document()
    doc.add_heading('RAPTOR Model Evaluation Logs', 0)

    # 모델별로 그룹화
    current_model = None
    
    for log in qa_logs:
        # 모델 헤더 추가
        if log['Model'] != current_model:
            current_model = log['Model']
            doc.add_page_break()
            heading = doc.add_heading(f"Model: {current_model}", level=1)
            heading.style.font.color.rgb = RGBColor(0, 50, 150) # 파란색 헤더

        # 질문 섹션
        p = doc.add_paragraph()
        p.add_run(f"[{log['Type']}] ").bold = True
        p.add_run(log['Question'])

        # 정답 및 예측
        doc.add_paragraph(f"Ground Truth: {log['Ground_Truth']}", style='List Bullet')
        
        # 모델 답변 (강조)
        p_ans = doc.add_paragraph(style='List Bullet')
        run_ans = p_ans.add_run(f"Generated: {log['Generated']}")
        run_ans.bold = True
        run_ans.font.color.rgb = RGBColor(0, 100, 0) # 초록색
        
        # 점수 정보
        if log['Score']:
            doc.add_paragraph(f"Score Info: {log['Score']}", style='List Bullet 2')
        
        doc.add_paragraph("-" * 50) # 구분선

    try:
        doc.save(filepath)
        print(f"\nSuccessfully saved QA Logs to: {filepath}")
    except Exception as e:
        print(f"Error saving Word file: {e}")

# ==========================================
# 7. 메인 실행 루프
# ==========================================

def main():
    if not os.path.exists(EVAL_FILE_PATH):
        print(f"File not found: {EVAL_FILE_PATH}")
        return
    
    print(f"Loading quiz data from {EVAL_FILE_PATH}...")
    with open(EVAL_FILE_PATH, 'r', encoding='utf-8') as f:
        quiz_data = json.load(f)

    keys = list(quiz_data.keys())
    print(f"DEBUG: Keys found in JSON file: {keys}")

    search_pattern = os.path.join(DATA_DIR, "**/*.faiss")
    faiss_files = glob.glob(search_pattern, recursive=True)
    faiss_files = [os.path.abspath(p) for p in faiss_files]
    
    if not faiss_files:
        print("No models found.")
        return

    print(f"Found {len(faiss_files)} models. Starting comprehensive evaluation...")
    
    llm = ChatOpenAI(
        model=LLM_MODEL_NAME,
        openai_api_key=OPENROUTER_API_KEY,
        openai_api_base=OPENROUTER_API_BASE,
        temperature=0
    )
    
    all_results = []
    qa_logs = []  # Q&A 로그 수집용 리스트

    for faiss_file in faiss_files:
        folder_path = os.path.dirname(faiss_file)
        index_name = os.path.splitext(os.path.basename(faiss_file))[0]
        display_name = index_name
        
        print(f"\nEvaluating: {display_name}...")
        vectorstore = load_raptor_model(folder_path, index_name)
        if not vectorstore:
            continue
            
        retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
        
        mcq_scores = []
        desc_metrics_sum = {
            'BERTScore': 0, 'Rouge-L': 0, 'BLEU-1': 0, 'METEOR': 0, 'Token_F1': 0
        }
        retrieval_sum = {'Recall': 0, 'MRR': 0, 'NDCG': 0}
        
        desc_count = 0
        total_retrieval_count = 0
        
        # --- 1. MCQ 평가 ---
        mcq_key = None
        for k in keys:
            if "mcq" in k.lower():
                mcq_key = k
                break
        
        if mcq_key and isinstance(quiz_data[mcq_key], list):
            for item in tqdm(quiz_data[mcq_key], desc=f"  MCQ ({display_name})"):
                question = item.get("question")
                options = item.get("answer_options", [])
                ground_truth = item.get("ground_truth_answer")
                correct_node_id = item.get("correct_node_id")
                gt_context = item.get("ground_truth_context", "")
                
                # 검색
                try:
                    docs = retriever.invoke(question)
                    context = "\n\n".join([d.page_content for d in docs])
                except:
                    docs = []
                    context = ""
                
                # Retrieval 평가
                rec, mrr, ndcg = calculate_retrieval_metrics(docs, correct_node_id, gt_context, TOP_K)
                retrieval_sum['Recall'] += rec
                retrieval_sum['MRR'] += mrr
                retrieval_sum['NDCG'] += ndcg
                total_retrieval_count += 1
                
                # Generation 평가
                pred = generate_mcq_answer(llm, context, question, options)
                acc = calculate_mcq_accuracy(pred, ground_truth)
                mcq_scores.append(acc)

                # 로그 저장 (Word용)
                qa_logs.append({
                    "Model": display_name,
                    "Type": "MCQ",
                    "Question": question,
                    "Ground_Truth": ground_truth,
                    "Generated": pred,
                    "Score": f"Correct: {acc == 1.0}"
                })

        else:
            print("  [Warning] No MCQ questions found.")

        # --- 2. Descriptive 평가 ---
        desc_key = None
        possible_keys = ["essay", "descriptive", "desc", "short_answer", "open_ended", "subjective"]
        for pk in possible_keys:
            if pk in quiz_data:
                desc_key = pk
                break
        if not desc_key:
            for k in keys:
                if any(pk in k.lower() for pk in possible_keys) and isinstance(quiz_data[k], list):
                    desc_key = k
                    break
        
        if desc_key:
            for item in tqdm(quiz_data[desc_key], desc=f"  Desc ({display_name})"):
                question = item.get("question")
                ground_truth = item.get("ground_truth_answer") or item.get("answer", "")
                correct_node_id = item.get("correct_node_id")
                gt_context = item.get("ground_truth_context", "")
                
                try:
                    docs = retriever.invoke(question)
                    context = "\n\n".join([d.page_content for d in docs])
                except:
                    docs = []
                    context = ""

                # Retrieval 평가
                rec, mrr, ndcg = calculate_retrieval_metrics(docs, correct_node_id, gt_context, TOP_K)
                retrieval_sum['Recall'] += rec
                retrieval_sum['MRR'] += mrr
                retrieval_sum['NDCG'] += ndcg
                total_retrieval_count += 1
                
                # Generation 평가
                pred = generate_descriptive_answer(llm, context, question)
                metrics = calculate_descriptive_metrics(pred, ground_truth)
                
                for k, v in metrics.items():
                    desc_metrics_sum[k] += v
                desc_count += 1

                # 로그 저장 (Word용)
                qa_logs.append({
                    "Model": display_name,
                    "Type": "Descriptive",
                    "Question": question,
                    "Ground_Truth": ground_truth,
                    "Generated": pred,
                    "Score": f"BERT: {metrics['BERTScore']:.2f} | Rouge: {metrics['Rouge-L']:.2f}"
                })

        else:
            print(f"  [Warning] No descriptive questions found.")
        
        # --- 결과 집계 ---
        avg_mcq_acc = sum(mcq_scores)/len(mcq_scores) if mcq_scores else 0.0
        
        res_entry = {
            "Model": display_name,
            "Path": os.path.basename(folder_path),
            "MCQ_Accuracy": avg_mcq_acc,
            "Desc_Count": desc_count
        }

        # Retrieval Metrics 추가
        if total_retrieval_count > 0:
            res_entry["Retrieval_Recall@5"] = retrieval_sum['Recall'] / total_retrieval_count
            res_entry["Retrieval_MRR"] = retrieval_sum['MRR'] / total_retrieval_count
            res_entry["Retrieval_NDCG"] = retrieval_sum['NDCG'] / total_retrieval_count
        else:
             res_entry["Retrieval_Recall@5"] = 0.0
             res_entry["Retrieval_MRR"] = 0.0
             res_entry["Retrieval_NDCG"] = 0.0

        # Descriptive Metrics 추가
        if desc_count > 0:
            for k, v in desc_metrics_sum.items():
                res_entry[f"Desc_{k}"] = v / desc_count
        else:
            for k in desc_metrics_sum.keys():
                res_entry[f"Desc_{k}"] = 0.0
                
        all_results.append(res_entry)
        
        # 중간 결과 출력
        print(f"  > [Retrieval] Recall@5: {res_entry['Retrieval_Recall@5']:.4f}")
        print(f"  > [Generation] MCQ Acc: {avg_mcq_acc:.4f}")
        
        if desc_count > 0:
            print(f"  > [Generation] Desc BERTScore: {res_entry['Desc_BERTScore']:.4f}")

    # --- 최종 저장 (CSV) ---
    if all_results:
        df = pd.DataFrame(all_results)
        cols = ["Model", "Path", "Retrieval_Recall@5", "Retrieval_MRR", "Retrieval_NDCG", 
                "MCQ_Accuracy", "Desc_BERTScore", "Desc_Rouge-L", "Desc_BLEU-1", "Desc_METEOR", "Desc_Token_F1"]
        df = df[[c for c in cols if c in df.columns]]
        
        print("\n=== Final Evaluation Metrics ===")
        print(df)
        df.to_csv(RESULT_CSV_PATH, index=False)
        print(f"\nMetrics saved to: {RESULT_CSV_PATH}")

    # --- 최종 저장 (Word) ---
    if qa_logs:
        save_logs_to_docx(qa_logs, RESULT_DOCX_PATH)

if __name__ == "__main__":
    main()

Downloading NLTK resource: wordnet
Downloading NLTK resource: omw-1.4
Loading quiz data from C:/RAG_DATA/evaluation/RAG_Quiz_Eng_with_IDs.json...
DEBUG: Keys found in JSON file: ['mcq', 'essay']
Found 12 models. Starting comprehensive evaluation...

Evaluating: baseline...


  MCQ (baseline): 100%|██████████| 20/20 [00:40<00:00,  2.03s/it]
  Desc (baseline):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (baseline):  10%|█         | 1/10 [00:09<01:21,  9.06s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (baseline):  20%|██        | 2/10 [00:13<00:49,  6.24s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this

  > [Retrieval] Recall@5: 0.0000
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8771

Evaluating: chunkoverlap_400...


  MCQ (chunkoverlap_400): 100%|██████████| 20/20 [00:41<00:00,  2.08s/it]
  Desc (chunkoverlap_400):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunkoverlap_400):  10%|█         | 1/10 [00:04<00:39,  4.40s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunkoverlap_400):  20%|██        | 2/10 [00:12<00:52,  6.60s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight'

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8956

Evaluating: chunkoverlap_50...


  MCQ (chunkoverlap_50): 100%|██████████| 20/20 [00:38<00:00,  1.91s/it]
  Desc (chunkoverlap_50):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunkoverlap_50):  10%|█         | 1/10 [00:03<00:33,  3.69s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunkoverlap_50):  20%|██        | 2/10 [00:07<00:29,  3.70s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
Yo

  > [Retrieval] Recall@5: 0.2333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.9030

Evaluating: chunksize_2000...


  MCQ (chunksize_2000): 100%|██████████| 20/20 [00:29<00:00,  1.49s/it]
  Desc (chunksize_2000):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunksize_2000):  10%|█         | 1/10 [00:02<00:26,  2.95s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunksize_2000):  20%|██        | 2/10 [00:06<00:26,  3.32s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sh

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8999

Evaluating: chunksize_500...


  MCQ (chunksize_500): 100%|██████████| 20/20 [00:30<00:00,  1.53s/it]
  Desc (chunksize_500):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunksize_500):  10%|█         | 1/10 [00:03<00:29,  3.33s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (chunksize_500):  20%|██        | 2/10 [00:06<00:26,  3.33s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should

  > [Retrieval] Recall@5: 0.1667
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8996

Evaluating: default...


  MCQ (default): 100%|██████████| 20/20 [00:30<00:00,  1.54s/it]
  Desc (default):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (default):  10%|█         | 1/10 [00:05<00:46,  5.12s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (default):  20%|██        | 2/10 [00:09<00:36,  4.54s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this mod

  > [Retrieval] Recall@5: 0.1667
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8856

Evaluating: gmmthreshold_0.05...


  MCQ (gmmthreshold_0.05): 100%|██████████| 20/20 [00:32<00:00,  1.60s/it]
  Desc (gmmthreshold_0.05):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (gmmthreshold_0.05):  10%|█         | 1/10 [00:03<00:27,  3.06s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (gmmthreshold_0.05):  20%|██        | 2/10 [00:06<00:27,  3.40s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.wei

  > [Retrieval] Recall@5: 0.1667
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8981

Evaluating: gmmthreshold_0.3...


  MCQ (gmmthreshold_0.3): 100%|██████████| 20/20 [00:31<00:00,  1.56s/it]
  Desc (gmmthreshold_0.3):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (gmmthreshold_0.3):  10%|█         | 1/10 [00:03<00:30,  3.42s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (gmmthreshold_0.3):  20%|██        | 2/10 [00:06<00:25,  3.13s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight'

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8952

Evaluating: reductiondimension_20...


  MCQ (reductiondimension_20): 100%|██████████| 20/20 [00:32<00:00,  1.63s/it]
  Desc (reductiondimension_20):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (reductiondimension_20):  10%|█         | 1/10 [00:03<00:35,  3.96s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (reductiondimension_20):  20%|██        | 2/10 [00:06<00:27,  3.39s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', '

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.9037

Evaluating: reductiondimension_5...


  MCQ (reductiondimension_5): 100%|██████████| 20/20 [00:30<00:00,  1.53s/it]
  Desc (reductiondimension_5):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (reductiondimension_5):  10%|█         | 1/10 [00:04<00:36,  4.04s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (reductiondimension_5):  20%|██        | 2/10 [00:07<00:29,  3.67s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pool

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8945

Evaluating: treeheight_10...


  MCQ (treeheight_10): 100%|██████████| 20/20 [00:31<00:00,  1.58s/it]
  Desc (treeheight_10):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (treeheight_10):  10%|█         | 1/10 [00:03<00:30,  3.36s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (treeheight_10):  20%|██        | 2/10 [00:07<00:31,  3.88s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.8928

Evaluating: treeheight_2...


  MCQ (treeheight_2): 100%|██████████| 20/20 [00:37<00:00,  1.86s/it]
  Desc (treeheight_2):   0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (treeheight_2):  10%|█         | 1/10 [00:04<00:39,  4.35s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Desc (treeheight_2):  20%|██        | 2/10 [00:07<00:30,  3.77s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should pro

  > [Retrieval] Recall@5: 0.1333
  > [Generation] MCQ Acc: 0.8500
  > [Generation] Desc BERTScore: 0.9015

=== Final Evaluation Metrics ===
                    Model                   Path  Retrieval_Recall@5  \
0                baseline               baseline            0.000000   
1        chunkoverlap_400       chunkoverlap_400            0.133333   
2         chunkoverlap_50        chunkoverlap_50            0.233333   
3          chunksize_2000         chunksize_2000            0.133333   
4           chunksize_500          chunksize_500            0.166667   
5                 default                default            0.166667   
6       gmmthreshold_0.05      gmmthreshold_0.05            0.166667   
7        gmmthreshold_0.3       gmmthreshold_0.3            0.133333   
8   reductiondimension_20  reductiondimension_20            0.133333   
9    reductiondimension_5   reductiondimension_5            0.133333   
10          treeheight_10          treeheight_10            0.133333