In [None]:
import os
import glob
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from tqdm import tqdm
import pandas as pd


In [None]:
import pandas as pd
pd.set_option('display.max_rows', 10)

In [None]:
def merge_and_sort(input_dir, prefix, output_path, sort_by):
    """Merge every ``{prefix}*.parquet`` shard in ``input_dir`` into one sorted file.

    Args:
        input_dir: Directory that is globbed for shard files.
        prefix: File-name prefix of the shards; also used in the log messages.
        output_path: Destination of the merged, snappy-compressed Parquet file.
        sort_by: Column name or list of column names passed to
            ``DataFrame.sort_values``.

    Returns:
        None. When no shard matches, a warning is printed and nothing is written.
    """
    shard_paths = glob.glob(os.path.join(input_dir, f"{prefix}*.parquet"))
    shard_paths.sort()
    if len(shard_paths) == 0:
        print(f"[WARN] No files found for '{prefix}'")
        return

    # Read all shards as one Arrow table, then hand it to pandas for sorting.
    merged = ds.dataset(shard_paths, format="parquet").to_table().to_pandas()
    merged = merged.sort_values(sort_by).reset_index(drop=True)
    merged.to_parquet(output_path, compression='snappy')
    print(f"{prefix} merged & sorted → {output_path}")

In [None]:
# EXTRACT_DIR holds the extracted Parquet shards
# (original note: "location where the final Parquet files are stored");
# MERGED_DIR receives one merged, sorted file per table.
EXTRACT_DIR = '../00.data/00.wikidata/00.wikidata_extract/'
MERGED_DIR = '../00.data/00.wikidata/01.wikidata_merged/'
os.makedirs(MERGED_DIR, exist_ok=True)

# (shard prefix, sort key) for each table; the prefix doubles as the output file stem.
_merge_jobs = (
    ("subject", "subject"),
    ("property", "property"),
    ("triples", ["subject", "property", "object"]),
)
for _prefix, _sort_key in _merge_jobs:
    merge_and_sort(EXTRACT_DIR, _prefix, f"{MERGED_DIR}{_prefix}.parquet", _sort_key)

In [None]:
# Load the merged tables back into pandas.
# os.path.join is used because MERGED_DIR already ends with '/': concatenating
# '/subject.parquet' onto it produced a doubled separator ('...merged//subject.parquet').
subject_df = pq.read_table(os.path.join(MERGED_DIR, 'subject.parquet')).to_pandas()
property_df = pq.read_table(os.path.join(MERGED_DIR, 'property.parquet')).to_pandas()
triples_df = pq.read_table(os.path.join(MERGED_DIR, 'triples.parquet')).to_pandas()

In [None]:
# Names of the per-language label columns; the filtering cells select them
# together via subject_df[LANGUAGE_LIST] / property_df[LANGUAGE_LIST].
LANGUAGE_LIST = ['en', 'fr', 'de', 'es', 'it', 'pt', 'ko', 'ja']


In [None]:
# Keep only subjects whose label is not the placeholder "I Don't Know!!"
# in any of the LANGUAGE_LIST columns.
_subject_has_placeholder = subject_df[LANGUAGE_LIST].eq("I Don't Know!!").any(axis=1)
mask_valid = ~_subject_has_placeholder
subject_df = subject_df.loc[mask_valid].copy()

In [None]:
# Keep only properties whose label is not the placeholder "I Don't Know!!"
# in any of the LANGUAGE_LIST columns.
_property_has_placeholder = property_df[LANGUAGE_LIST].eq("I Don't Know!!").any(axis=1)
mask_valid = ~_property_has_placeholder
property_df = property_df.loc[mask_valid].copy()

In [None]:
# A triple is kept only when its subject and object are both present in
# subject_df and its property is present in property_df.
valid_subjects = set(subject_df['subject'])
valid_properties = set(property_df['property'])

_subject_ok = triples_df['subject'].isin(valid_subjects)
_property_ok = triples_df['property'].isin(valid_properties)
_object_ok = triples_df['object'].isin(valid_subjects)
mask = _subject_ok & _property_ok & _object_ok

triples_df = triples_df.loc[mask].reset_index(drop=True)

In [None]:
# Order triples by (subject, property, object), then keep the first row of each
# identical (subject, property, object) combination.
_triple_key = ["subject", "property", "object"]
triples_df = (
    triples_df
    .sort_values(_triple_key)
    .reset_index(drop=True)
    .drop_duplicates(subset=_triple_key)
    .reset_index(drop=True)
)

In [None]:
# Ids that actually occur in the remaining triples: entities may appear on
# either side of a triple, properties only in the 'property' column.
valid_subject = set(triples_df["subject"]) | set(triples_df["object"])
valid_property = set(triples_df["property"])

In [None]:
# Restrict the label tables to ids that occur in the triples ...
_subject_in_use = subject_df["subject"].isin(valid_subject)
subject_df = subject_df.loc[_subject_in_use].reset_index(drop=True)

_property_in_use = property_df["property"].isin(valid_property)
property_df = property_df.loc[_property_in_use].reset_index(drop=True)

# ... and then re-check the triples against the reduced label tables.
_endpoints_known = (
    triples_df["subject"].isin(subject_df["subject"])
    & triples_df["object"].isin(subject_df["subject"])
)
_property_known = triples_df["property"].isin(property_df["property"])
triples_df = triples_df.loc[_endpoints_known & _property_known].reset_index(drop=True)


In [None]:
# Persist the intermediate tables as <name>_tmp.parquet inside MERGED_DIR,
# keeping the DataFrame index in the file.
for _name, _frame in (
    ("subject", subject_df),
    ("property", property_df),
    ("triples", triples_df),
):
    _frame.to_parquet(f"{MERGED_DIR}{_name}_tmp.parquet", compression="snappy", index=True)

# Additional filtering

In [None]:
# NOTE: this cell contained the unfinished statement `subject_df = pd.rea`, which
# raises AttributeError (pandas has no attribute `rea`) and aborts "Run All".
# It is disabled here, so this cell leaves subject_df unchanged.
# TODO(review): restore the intended read if one was meant, or delete this cell.

In [None]:
import os
import glob
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from tqdm import tqdm
import pandas as pd

# Directory (absolute path) that holds the backed-up merged tables.
BACKUP_MERGED_DIR = '/home/ps2575/proj/wikibench_v.1.1_data_backup/00.data/00.wikidata/01.wikidata_merged/01.wikidata_merged'

subject_df = pq.read_table(os.path.join(BACKUP_MERGED_DIR, 'subject.parquet')).to_pandas()
property_df = pq.read_table(os.path.join(BACKUP_MERGED_DIR, 'property.parquet')).to_pandas()
# Fixed: the original path literal started with a stray '{', so this read could
# not find the file.
# NOTE(review): the result is bound to `triple_df`, while the later cleaning cell
# passes `triples_df` to filter_triples - confirm which table is intended there.
triple_df = pq.read_table(os.path.join(BACKUP_MERGED_DIR, 'triples.parquet')).to_pandas()


In [None]:
# Row counts of the reloaded tables, printed in the order subject / property / triple.
for _frame in (subject_df, property_df, triple_df):
    print(len(_frame))

In [None]:
import regex
import unicodedata
import numpy as np

In [None]:
# Lower-case keywords used by remove_qualifier_properties: a property is dropped
# when its lower-cased English label contains any of these as a substring.
QUALIFIER_KEYWORDS = [
    "time", "start", "end", "point", "rank", "ordinal",
    "criterion", "used", "location", "coordinates",
    "applies", "statement", "series"
]

In [None]:
def clean_labels(df):
    """Split ``df`` into rows with usable multilingual labels and rows to discard.

    Every ``LANGUAGE_LIST`` column is cast to ``str`` and checked. A row is
    rejected when any single language label trips one of rules 1-3, or when
    the row as a whole trips rule 4 or 5:

    1. empty after stripping, shorter than 2 or longer than 200 characters;
    2. contains a URL, an HTML entity, wiki/HTML markup, starts with a
       ``Category:``/``Template:``/``File:``/``Help:`` prefix, consists of
       digits only, has two or more consecutive characters that are neither
       word characters nor whitespace, or contains ``I Don't Know!!``;
    3. contains an emoji character or a Unicode ``Cc``/``Cf`` character;
    4. the (string-cast) labels of all languages are identical;
    5. the labels of all languages are missing.

    Args:
        df: DataFrame containing at least the ``LANGUAGE_LIST`` columns.

    Returns:
        Tuple ``(clean, bad)``: the rows that passed every rule and the rows
        that failed at least one, both selected from a copy of ``df``.
    """
    df = df.copy()
    s = df[LANGUAGE_LIST].astype(str)

    def any_language(column_check):
        """Row mask that is True when ``column_check`` flags any language column."""
        return s.apply(column_check, axis=0).any(axis=1)

    # 1) Empty values / values that are too short or too long
    mask_empty = any_language(lambda c: c.str.strip().eq(""))
    mask_len   = any_language(lambda c: (c.str.len() < 2) | (c.str.len() > 200))

    # 2) Harmful patterns
    PAT = {
        "URL":  r"https?://\S+|ftp://\S+",
        "HTML": r"&\w+?;",
        "WIKI": r"\[\[.*?\]\]|''.+?''|<[^>]+>",
        "TPL":  r"^(?:Category|Template|File|Help):",
        "NUM":  r"^\d+$",
        "PUNC": r"[^\w\s]{2,}",
        "UNK":  r"I Don't Know!!"
    }

    mask_url  = any_language(lambda c: c.str.contains(PAT["URL"],  regex=True))
    mask_html = any_language(lambda c: c.str.contains(PAT["HTML"], regex=True))
    mask_wiki = any_language(lambda c: c.str.contains(PAT["WIKI"], regex=True))
    mask_tpl  = any_language(lambda c: c.str.contains(PAT["TPL"],  regex=True))
    mask_num  = any_language(lambda c: c.str.contains(PAT["NUM"],  regex=True))
    mask_punc = any_language(lambda c: c.str.contains(PAT["PUNC"], regex=True))
    mask_unk  = any_language(lambda c: c.str.contains(PAT["UNK"],  regex=False))

    # 3) Emoji / control characters
    # The Unicode Emoji property also covers the ASCII digits, '#' and '*'
    # (keycap bases), so a bare \p{Emoji} rejected every label containing a
    # digit. The lookahead excludes exactly those ASCII characters; purely
    # numeric labels are still handled by the NUM rule above.
    EMOJI_RE = regex.compile(r"(?![0-9#*])\p{Emoji}")
    mask_emoji = any_language(
        lambda col: col.apply(lambda x: bool(EMOJI_RE.search(x)))
    )

    def has_ctrl(x):
        """True when ``x`` contains a character of Unicode category Cf or Cc."""
        return any(unicodedata.category(c) in {"Cf", "Cc"} for c in x)

    mask_ctrl = any_language(lambda col: col.apply(has_ctrl))

    # 4) Remove rows whose labels are identical in every language
    mask_all_equal = s.nunique(axis=1) == 1

    # 5) Remove rows where every language label is NaN
    # (checked on the original columns, before the cast to str)
    mask_all_nan = df[LANGUAGE_LIST].isna().all(axis=1)

    # Final "bad" mask: a row is discarded when any rule flagged it
    bad = (
        mask_empty | mask_len | mask_url | mask_html | mask_wiki |
        mask_tpl | mask_num | mask_punc | mask_emoji | mask_ctrl |
        mask_unk | mask_all_equal | mask_all_nan
    )

    return df[~bad], df[bad]


In [None]:
def remove_qualifier_properties(property_df, keywords=None):
    """Split properties into regular ones and qualifier-like ones.

    A property counts as qualifier-like when its lower-cased ``en`` label
    contains at least one keyword as a substring.

    Args:
        property_df: DataFrame with an ``en`` label column.
        keywords: Optional iterable of lower-case keywords. Defaults to the
            module-level ``QUALIFIER_KEYWORDS``.

    Returns:
        Tuple ``(kept, removed)`` of DataFrames selected from a copy of
        ``property_df``. Rows with a missing ``en`` label are kept.
    """
    if keywords is None:
        keywords = QUALIFIER_KEYWORDS

    df = property_df.copy()

    def is_qualifier(label):
        # A missing label is not a string; treat it as "not a qualifier"
        # instead of letting `k in label` raise TypeError.
        return isinstance(label, str) and any(k in label for k in keywords)

    # astype(bool) keeps `~mask_qual` a logical negation even for an empty column.
    mask_qual = df['en'].str.lower().apply(is_qualifier).astype(bool)

    return df[~mask_qual], df[mask_qual]

In [None]:
def filter_triples(triples_df, subjects_clean, props_clean):
    """Return the triples that only reference cleaned subjects and properties.

    Args:
        triples_df: DataFrame with ``subject``, ``property`` and ``object`` columns.
        subjects_clean: DataFrame whose ``subject`` column lists the allowed
            entity ids (used for both the subject and the object side).
        props_clean: DataFrame whose ``property`` column lists the allowed
            property ids.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, without self-loops and
        without duplicate (subject, property, object) rows.
    """
    known_entities = set(subjects_clean['subject'])
    known_properties = set(props_clean['property'])

    candidates = triples_df.copy()

    # 1) Both endpoints and the property must be known.
    endpoints_known = (
        candidates['subject'].isin(known_entities)
        & candidates['object'].isin(known_entities)
    )
    property_known = candidates['property'].isin(known_properties)
    candidates = candidates.loc[endpoints_known & property_known].copy()

    # 2) Drop self-loops (subject equal to object).
    is_loop = candidates['subject'] == candidates['object']
    candidates = candidates.loc[~is_loop]

    # 3) Drop exact duplicates and renumber the rows.
    deduplicated = candidates.drop_duplicates(subset=['subject', 'property', 'object'])
    return deduplicated.reset_index(drop=True)

In [None]:
# Clean the subject labels, clean the property labels, drop qualifier-like
# properties, then keep only triples that reference what is left.
subject_clean, subject_bad = clean_labels(subject_df)
# Fixed: this line previously called remove_qualifier_properties(property_df),
# which made the next line a no-op on already-filtered data and meant the
# property labels never went through clean_labels.
property_clean, property_bad = clean_labels(property_df)
property_clean, props_qual_removed = remove_qualifier_properties(property_clean)
triples_clean = filter_triples(triples_df, subject_clean, property_clean)


In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
# subject_clean[subject_clean['en'].str.contains("Portal")]
# property_clean[property_clean['en'].str.contains('Wiki')][['property','ko','en']]
property_clean[property_clean['property'] == 'P607']

In [None]:
triples_clean[triples_clean['property'] == 'P607']

In [None]:

pd.set_option('display.max_rows', None)
property_clean[['property', 'ko', 'en']]

In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
triples_clean[triples_clean['property'] == 'P735']

In [None]:
property_clean[property_clean['en'] == 'given name']

In [None]:
set(triples_clean[triples_clean['object'] == 'Q4165246']['property'])

In [None]:
property_clean

In [None]:
# Persist the cleaned tables as <name>_final.parquet inside MERGED_DIR,
# keeping the DataFrame index in the file.
for _name, _frame in (
    ("subject", subject_clean),
    ("property", property_clean),
    ("triples", triples_clean),
):
    _frame.to_parquet(f"{MERGED_DIR}{_name}_final.parquet", compression="snappy", index=True)

In [None]:
property_clean[property_clean['property'] == 'P13']

In [None]:
subject_clean[subject_clean['subject'] == 'Q13']

In [None]:
# Property ids singled out for manual inspection.
# "P2614" appeared twice in the original list; the duplicate is removed
# (membership tests via .isin() give the same result).
pid_problem = ["P607", "P2614", "P1753", "P1423", "P1424",
               "P17", "P1754", "P2959", "P2737", "P3150",
               "P360", "P3876", "P407", "P4195", "P4224",
               "P4329", "P5008", "P6104", "P629", "P6365",
               "P642", "P69", "P7763", "P8225", "P852",
               "P853", "P8423", "P8646", "P8744", "P910",
               "P914", "P9241", "P46", "P55", "P521",
               "P590"]

In [None]:
property_clean[property_clean["property"].isin(pid_problem)][['property', 'ko', 'en']]

In [None]:
pd.set_option('display.max_rows', 10)
triples_clean[triples_clean['property'] == "P607"]

In [None]:
# Property ids whose triples are removed from triples_clean when building
# triples_final_final.
delete_pid = ["P1423", "P1424", "P1753", "P1754", "P2614", 
              "P2737", "P2959", "P360", "P3876", "P407", 
              "P4195", "P4224", "P4329", "P5008", "P6104", 
              "P629", "P642",  "P8225", "P8423", "P852", 
              "P853", "P8646", "P8744", "P910", "P914", 
              "P9241", "P607" ] 
# P607 is deleted rather than relabelled from "conflict" to "participated in conflict".
# Why: we cannot be certain that every existing "conflict" entry was changed that way.

In [None]:
# Drop every triple whose property id is listed in delete_pid.
_is_deleted_property = triples_clean['property'].isin(delete_pid)
triples_final_final = triples_clean[~_is_deleted_property]

In [None]:
# Write next to the other merged outputs. MERGED_DIR is the same directory the
# original hard-coded literal pointed to, so the target file is unchanged.
triples_final_final.to_parquet(f"{MERGED_DIR}triples_final_final.parquet")

In [None]:
triples_final_final[triples_final_final['property'] == 'P735']

In [None]:
triples_final_final

In [None]:
len(set(triples_final_final['property']))

In [None]:
final = pq.read_table('/home/ps2575/proj/wikibench_v.1.1_data_backup/00.data/00.wikidata/01.wikidata_merged/01.wikidata_merged/triples_final_final.parquet').to_pandas()

In [None]:
final.head()

In [None]:
p_list = final['property'].unique()

In [None]:
len(p_list)

In [None]:
triples = pq.read_table('/home/ps2575/proj/wikibench_v.1.1_data_backup/00.data/00.wikidata/01.wikidata_merged/01.wikidata_merged/triples.parquet').to_pandas()

In [None]:
p_list=triples['property'].unique()

In [None]:
len(p_list)

In [None]:
template0 = pq.read_table('/home/ps2575/proj/wikibench_v.1.1/00.data/00.wikidata/03.wikidata_template/00.original_template_500.parquet').to_pandas()
template0.head()

In [None]:
template1 = pq.read_table('/home/ps2575/proj/wikibench_v.1.1/00.data/00.wikidata/03.wikidata_template/01.subject_shuffled_template_500.parquet').to_pandas()
template1.head()

In [None]:
template3 = pq.read_table('/home/ps2575/proj/wikibench_v.1.1/00.data/00.wikidata/03.wikidata_template/03.property_scoped_subject_shuffled_template_500.parquet').to_pandas()
template3.head()