In [None]:
import os
import re
import sys
import datasets
import multiprocessing
import logging
import time
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from functools import partial
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import Accelerator, DataLoaderConfiguration
from concurrent.futures import ProcessPoolExecutor
from vllm import LLM, SamplingParams

In [None]:
import re
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

In [None]:
# Directory (relative to this notebook) holding the input response tables.
RESPONSE_DIR = '../00.data/01.model_response/00.gemini_response/'

# Input parquet files, one per benchmark variant. The position in this list
# is the index used for TEMPLATES / TEMPLATES_COPY / BENCHMARKS.
RESPONSE_NAME = [
    '00.original_response_500.parquet',
    '01.subject_shuffled_response_500.parquet',
    '02.object_shuffled_response_500.parquet',
    '03.property_scoped_subject_shuffled_response_500.parquet',
    '04.property_scoped_object_shuffled_response_500.parquet',
]

# Language codes; each one is used as the suffix of the per-language columns
# (e.g. response_en, wikibench_TF_en).
LANGUAGE_LIST = [
    'en', 'fr', 'de', 'es',
    'it', 'pt', 'ko', 'ja',
]

In [None]:
# Read every response table listed in RESPONSE_NAME into a pandas DataFrame.
TEMPLATES = [
    pq.read_table(RESPONSE_DIR + file_name).to_pandas()
    for file_name in RESPONSE_NAME
]

# Working copies, so later cells can modify frames without touching TEMPLATES.
TEMPLATES_COPY = [frame.copy() for frame in TEMPLATES]

# Filled by a later cell with one benchmark frame per template.
BENCHMARKS = list()

In [None]:
# Matches the first <answer>...</answer> pair. DOTALL lets the payload span
# several lines; IGNORECASE accepts <ANSWER>, <Answer>, etc.
EXTRACT_ANSWER = re.compile(r"<answer>(.*?)</answer>", re.DOTALL | re.IGNORECASE)


def extract_answer(response):
    """Return the text inside the first ``<answer>...</answer>`` pair.

    Args:
        response: Raw model output. Expected to be a string, but any value is
            accepted so the function can be applied to a whole DataFrame
            column that may contain missing cells (``None`` / ``NaN``).

    Returns:
        The tag payload with surrounding whitespace stripped, or an empty
        string when ``response`` is not a string or contains no complete
        ``<answer>...</answer>`` pair.
    """
    # Non-string cells (None, NaN, ...) would make Pattern.search raise
    # TypeError and abort the whole .apply(); treat them as "no answer".
    if not isinstance(response, str):
        return ""

    match = EXTRACT_ANSWER.search(response)
    # Pattern.search returns None when nothing matches; calling .group on it
    # would raise AttributeError.
    if match is None:
        return ""

    return match.group(1).strip()

def make_benchmark_prompt(sentence):
    """Build the True/False/Unsure question prompt for one statement.

    Args:
        sentence: The statement to be judged; it is interpolated into the
            prompt as-is.

    Returns:
        A single-line prompt string that ends with an opening ``<answer>``
        tag.
    """
    segments = [
        "Is the following statement True or False? ",
        f"statement: {sentence} ",
        "Answer must be one of the following options: ",
        "True, False, Unsure. ",
        "Answer must be encapsulated with <answer></answer> ",
        "Output: <answer>",
    ]
    return "".join(segments)

In [None]:
# Inspect the working copy at index 1 (the table loaded from RESPONSE_NAME[1]).
TEMPLATES_COPY[1]

In [None]:
# Start from an empty list so re-running this cell rebuilds the benchmarks
# instead of appending a second set of frames.
BENCHMARKS = []

for template in TEMPLATES_COPY:
    # Record the original index as an explicit first column. The guard keeps
    # the cell re-runnable: DataFrame.insert raises ValueError when the
    # column already exists.
    if 'row_id' not in template.columns:
        template.insert(0, 'row_id', template.index)

    # Remove any columns whose name starts with "TF_". drop() returns a new
    # frame, so the columns added below do not modify TEMPLATES_COPY.
    stale_cols = [col for col in template.columns if col.startswith("TF_")]
    benchmark = template.drop(columns=stale_cols, errors='ignore')

    for lang in LANGUAGE_LIST:
        # Sentence taken from inside the <answer> tag of the generated response.
        sentence = benchmark[f"response_{lang}"].apply(extract_answer)

        # True/False benchmark question built from that sentence with
        # make_benchmark_prompt.
        benchmark[f"wikibench_TF_{lang}"] = sentence.apply(make_benchmark_prompt)

        # Placeholder column for each model's raw reply to the TF question.
        benchmark[f"response_TF_{lang}"] = ""

        # Placeholder column for the TRUE / FALSE value extracted from that reply.
        benchmark[f"extract_TF_{lang}"] = ""

    BENCHMARKS.append(benchmark)

In [None]:
# Per-language column name prefixes, in the order they should appear for
# each language.
per_language_fields = (
    'prompt', 'response', 'wikibench_TF', 'response_TF', 'extract_TF', 'correct',
)

# Final column layout: the shared leading columns, then one group of
# per-language columns for every entry of LANGUAGE_LIST. Deriving the names
# from LANGUAGE_LIST keeps this list in step with the columns created per
# language when the benchmarks are built.
col_order = ['row_id', 'subject', 'property', 'object', 'kind'] + [
    f"{field}_{lang}"
    for lang in LANGUAGE_LIST
    for field in per_language_fields
]

# Reorder each benchmark frame. Selecting with a list keeps only the listed
# columns, and pandas raises KeyError if any of them is missing.
for i, benchmark in enumerate(BENCHMARKS):
    BENCHMARKS[i] = benchmark[col_order]

In [None]:
# Output directory and file names; BENCHMARK_NAME[i] is the destination for
# BENCHMARKS[i].
BENCHMARK_DIR = '../00.data/02.WikiBench/'
BENCHMARK_NAME = ['00.original_benchmark_TF_500.parquet',
                  '01.subject_shuffled_benchmark_TF_500.parquet',
                  '02.object_shuffled_benchmark_TF_500.parquet',
                  '03.property_scoped_subject_shuffled_benchmark_TF_500.parquet',
                  '04.property_scoped_object_shuffled_benchmark_TF_500.parquet']

# Check the pairing before writing anything: with mismatched lengths the
# frames and file names no longer correspond one-to-one.
if len(BENCHMARKS) != len(BENCHMARK_NAME):
    raise ValueError(
        f"Expected {len(BENCHMARK_NAME)} benchmark frames "
        f"but found {len(BENCHMARKS)}."
    )

# to_parquet does not create missing parent directories.
os.makedirs(BENCHMARK_DIR, exist_ok=True)

for benchmark, file_name in zip(BENCHMARKS, BENCHMARK_NAME):
    benchmark.to_parquet(f"{BENCHMARK_DIR}{file_name}")

In [None]:
# Spot-check: show the 'ko' benchmark prompt for the first row of the first benchmark frame.
BENCHMARKS[0].iloc[0]['wikibench_TF_ko']