In [None]:
import os
import glob
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import numpy as np
from tqdm import tqdm 

In [None]:
# Data directories, relative to the notebook location. Each path keeps its
# trailing slash because file names are appended to it directly.
MERGED_DIR = '../00.data/00.wikidata/01.wikidata_merged/'
SHUFFLED_DIR  = '../00.data/00.wikidata/02.wikidata_shuffled/'
TEMPLATE_DIR  = '../00.data/00.wikidata/03.wikidata_template/'

# Label lookup tables: one for entities (used for both subjects and objects)
# and one for properties. They are read in this order, subjects first.
subjects, properties = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        f"{MERGED_DIR}subject_filtered.parquet",
        f"{MERGED_DIR}property_filtered.parquet",
    )
)

In [None]:
# Load the original triple set and its four shuffled variants. The tuple of
# paths below is positionally aligned with the names on the left-hand side.
original, shuffled_0, shuffled_1, shuffled_2, shuffled_3 = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        f"{SHUFFLED_DIR}00.original_500.parquet",
        f"{SHUFFLED_DIR}01.subject_shuffled_500.parquet",
        f"{SHUFFLED_DIR}02.object_shuffled_500.parquet",
        f"{SHUFFLED_DIR}03.property_scoped_subject_shuffled_500.parquet",
        f"{SHUFFLED_DIR}04.property_scoped_object_shuffled_500.parquet",
    )
)

In [None]:
# Language code -> English display name used inside the prompts.
# Insertion order matters: it defines the order of LANGUAGE_LIST below.
LANGUAGE_NAME = {
    "en": "English",
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
    "ko": "Korean",
    "ja": "Japanese",
}
# Ordered list of language codes, derived from the mapping so the two cannot drift apart.
LANGUAGE_LIST = list(LANGUAGE_NAME)

In [None]:
def attach_labels(triples_df, subjects, properties, languages=None):
    """Attach per-language labels to (subject, property, object) triples.

    Parameters
    ----------
    triples_df : pandas.DataFrame
        Must contain the columns ``subject``, ``property`` and ``object``.
    subjects : pandas.DataFrame
        Entity label table with a ``subject`` key column and one column per
        language code. It is used to label both subjects and objects.
    properties : pandas.DataFrame
        Property label table with a ``property`` key column and one column
        per language code.
    languages : iterable of str, optional
        Language codes to attach. Defaults to the notebook-level
        ``LANGUAGE_LIST``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``triples_df`` followed by ``sbj_<lang>``,
        ``obj_<lang>`` and ``prop_<lang>`` columns. Row count and row order
        match ``triples_df``; keys with no match in a label table get
        missing values (left join).
    """
    langs = list(LANGUAGE_LIST if languages is None else languages)

    def _label_table(source, key, prefix):
        """Return `source` indexed by `key`, keeping only `<prefix>_<lang>` label columns."""
        # Language columns absent from `source` are skipped rather than raising here.
        present = [lang for lang in langs if lang in source.columns]
        return (
            # Keep only the key and label columns: `subjects` is merged twice,
            # so any extra column would collide and come back as `_x` / `_y`.
            source[[key, *present]]
            # A repeated key would multiply the matching triple rows in the
            # left merge below; keep the first label row per key.
            .drop_duplicates(subset=key, keep="first")
            .rename(columns={lang: f"{prefix}_{lang}" for lang in present})
            .set_index(key)
        )

    sbj_labels = _label_table(subjects, "subject", "sbj")
    obj_labels = _label_table(subjects, "subject", "obj")
    prop_labels = _label_table(properties, "property", "prop")

    # merge() returns a new frame, so the caller's `triples_df` is never mutated.
    # Subject labels
    df = triples_df.merge(sbj_labels, left_on="subject", right_index=True, how="left")
    # Object labels (objects are looked up in the same entity table as subjects)
    df = df.merge(obj_labels, left_on="object", right_index=True, how="left")
    # Property labels
    df = df.merge(prop_labels, left_on="property", right_index=True, how="left")

    return df


def _render_prompt(lang_name, s_label, p_label, o_label):
    """Return the instruction prompt asking for one `lang_name` sentence about the triple."""
    # Prompt wording revised (original note was tagged "1209").
    return (
        f"You are a {lang_name} language writing expert. "
        f"Write one concise and grammatically correct sentence in {lang_name} that expresses the given relation without adding or modifying any facts. "
        f"Respond only in {lang_name} and enclose your answer within <answer> and </answer> tags. "
        f"Do not include explanations, examples, or any text outside the tags. "
        f"For example: <answer>Washington, D.C. is the capital of the United States.</answer> "
        f"Now generate one sentence for this input triple: ['{s_label}', '{p_label}', '{o_label}']"
    )


def build_templates(triples_df, kind, subjects_df=None, properties_df=None):
    """Build one prompt-template row per triple, with one prompt per language.

    Parameters
    ----------
    triples_df : pandas.DataFrame
        Triples with ``subject``, ``property`` and ``object`` columns
        (any number of rows).
    kind : str
        Tag copied verbatim into the ``kind`` column, e.g. "original",
        "subject_shuffled", "object_shuffled",
        "prop_scoped_subject_shuffled", "prop_scoped_object_shuffled".
    subjects_df, properties_df : pandas.DataFrame, optional
        Label tables passed to ``attach_labels``. Default to the
        notebook-level ``subjects`` / ``properties`` frames.

    Returns
    -------
    pandas.DataFrame
        One row per input triple with ``subject``, ``property``, ``object``,
        ``kind`` and, for every language in ``LANGUAGE_LIST``,
        ``prompt_<lang>`` plus the empty-string placeholders
        ``response_<lang>``, ``TF_<lang>`` and ``correct_<lang>``
        (presumably filled in by a later stage; not visible here).

    Warns
    -----
    UserWarning
        If any label is missing; such values are still formatted into the
        prompt literally (e.g. as ``nan``), exactly as before.
    """
    import warnings  # local import keeps this block self-contained

    labelled = attach_labels(
        triples_df,
        subjects if subjects_df is None else subjects_df,
        properties if properties_df is None else properties_df,
    ).reset_index(drop=True)

    # Surface missing labels instead of letting them slip silently into prompts.
    label_cols = [
        f"{prefix}_{lang}"
        for lang in LANGUAGE_LIST
        for prefix in ("sbj", "prop", "obj")
    ]
    present_cols = [col for col in label_cols if col in labelled.columns]
    n_missing = int(labelled[present_cols].isna().sum().sum())
    if n_missing:
        warnings.warn(
            f"build_templates(kind={kind!r}): {n_missing} label value(s) are missing "
            "and will appear literally (e.g. 'nan') in the generated prompts",
            stacklevel=2,
        )

    rows = []
    # Plain dict records avoid building a Series per row as iterrows() does.
    for record in labelled.to_dict("records"):
        base = {
            "subject": record["subject"],
            "property": record["property"],
            "object": record["object"],
            "kind": kind,
        }

        for lang in LANGUAGE_LIST:
            base[f"prompt_{lang}"] = _render_prompt(
                LANGUAGE_NAME[lang],
                record[f"sbj_{lang}"],
                record[f"prop_{lang}"],
                record[f"obj_{lang}"],
            )
            # Empty placeholders so every output file shares the same schema.
            base[f"response_{lang}"] = ""
            base[f"TF_{lang}"] = ""
            base[f"correct_{lang}"] = ""

        rows.append(base)

    return pd.DataFrame(rows)


In [None]:
# Build one template frame per triple set loaded earlier
# (original, shuffled_0, shuffled_1, shuffled_2, shuffled_3).
# Each (frame, kind) pair below lines up positionally with a target name.
(
    template_original,
    template_sbj_shuffle,
    template_obj_shuffle,
    template_ps_subj,
    template_ps_obj,
) = (
    build_templates(triples, kind = kind_tag)
    for triples, kind_tag in (
        (original, "original"),
        (shuffled_0, "subject_shuffled"),
        (shuffled_1, "object_shuffled"),
        (shuffled_2, "prop_scoped_subject_shuffled"),
        (shuffled_3, "prop_scoped_object_shuffled"),
    )
)

In [None]:
# Ensure the output directory exists so a fresh checkout does not fail at the
# very last step, after all templates have already been built.
os.makedirs(TEMPLATE_DIR, exist_ok=True)

# Persist every template frame as snappy-compressed parquet.
# index = True writes the row index into the file alongside the data columns.
template_original.to_parquet(f"{TEMPLATE_DIR}00.original_template_500.parquet", compression = 'snappy', index = True)
template_sbj_shuffle.to_parquet(f"{TEMPLATE_DIR}01.subject_shuffled_template_500.parquet", compression = 'snappy', index = True)
template_obj_shuffle.to_parquet(f"{TEMPLATE_DIR}02.object_shuffled_template_500.parquet", compression = 'snappy', index = True)
template_ps_subj.to_parquet(f"{TEMPLATE_DIR}03.property_scoped_subject_shuffled_template_500.parquet", compression = 'snappy', index = True)
template_ps_obj.to_parquet(f"{TEMPLATE_DIR}04.property_scoped_object_shuffled_template_500.parquet", compression = 'snappy', index = True)

In [None]:
# Sanity check: display the Korean prompt generated for the first original triple.
template_original.iloc[0].loc['prompt_ko']