실험 1. 데이터 유효성(EDA)
- 목적: 스키마/결측/중복/분포/간단 품질지표 산출(“answer 엔티티 없음” 명시).
- 핵심 산출물:
  - 01_schema_summary.csv (컬럼별 타입/결측/중복 예)
  - 02_column_role_guess.csv (역할 추정: exam/question/time/source 등)
  - 20_fk_integrity_report.csv (참조무결성 결과: 비정규화이므로 빈/약함일 수 있음)
  - 50_quality_report.json (요약 지표)
  - 60_processing_flow.md (Mermaid)

In [None]:
# SPO 재구축(정정판): Answer 제거, Question 텍스트 3종을 속성으로 매핑
from pathlib import Path
import pandas as pd, json, re, hashlib
# All paths are anchored at the directory the notebook is executed from.
NB_DIR = Path(".").resolve()
# Input CSV, expected in a sibling "data" directory. The untranslated original
# works as-is; for a translated copy only the file name needs to be changed.
DATA = NB_DIR.parent.joinpath("data", "gwashi_utf8.csv")
# Output directory for this experiment and the JSONL file written into it.
OUT = NB_DIR.joinpath("eda_outputs", "1번실험", "links_fix")
TRIPLES = OUT.joinpath("triples_no_answer.jsonl")
# Create the output directory tree up front; a no-op when it already exists.
OUT.mkdir(parents=True, exist_ok=True)

def clean(x):
    """Return *x* as a single-line, whitespace-normalised string.

    Values that ``pd.isna`` reports as missing (e.g. ``None`` or ``NaN``)
    become ``""``. Leading/trailing whitespace is stripped and every internal
    run of whitespace is collapsed to one space.

    Floats that hold a whole number are rendered without the trailing ``.0``.
    pandas stores an integer column that contains blank cells as float, so a
    CSV cell ``1392`` would otherwise be emitted as ``"1392.0"`` and end up in
    both the literal values and the hashed identifiers.
    """
    if pd.isna(x):
        return ""
    # Undo the int -> float promotion before stringifying.
    if isinstance(x, float) and x.is_integer():
        x = int(x)
    return re.sub(r"\s+", " ", str(x).strip())
def hid(pfx, *vals):
    """Build a short deterministic identifier from a prefix and key values.

    Each value is normalised with ``clean``, the results are joined with
    ``"||"``, and the first 12 hex characters of the SHA-1 digest of the
    UTF-8 encoded string are appended to *pfx*.
    """
    key = "||".join(map(clean, vals))
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
    return pfx + digest[:12]

# Load the whole source CSV into a DataFrame; no dtype is passed, so pandas
# infers the column types.
# NOTE(review): an integer column that contains blank cells is presumably
# inferred as float -- confirm how `year` / `month` / `day` arrive here.
df = pd.read_csv(DATA)

def time_id(r):
    """Return the "T"-prefixed identifier of the date described by row *r*.

    *r* is a mapping of column name to value; absent keys count as empty.
    When the cleaned ``year`` is empty the identifier is derived from
    ``month`` and ``day`` only. Otherwise it is derived from year, month, day
    and the three ``ganji_kr_*`` columns.
    """
    year, month, day = (clean(r.get(col, "")) for col in ("year", "month", "day"))
    if not year:
        return hid("T", month, day)
    ganji_kr = [
        r.get("ganji_kr_year", ""),
        r.get("ganji_kr_month", ""),
        r.get("ganji_kr_day", ""),
    ]
    return hid("T", year, month, day, *ganji_kr)
def exam_id(r):
    """Return the "E"-prefixed identifier of the exam described by row *r*.

    The key is built from the ``year``, ``sortC``, ``sortD``, ``sortE`` and
    ``name_exam`` columns, in that order; absent keys count as empty.
    """
    key_columns = ("year", "sortC", "sortD", "sortE", "name_exam")
    key_values = [r.get(col, "") for col in key_columns]
    return hid("E", *key_values)
def question_id(r):
    """Return the "Q"-prefixed identifier of the question described by row *r*.

    The key is built from the ``year``, ``name_exam``, ``name_question``,
    ``category`` and ``category2`` columns, in that order; absent keys count
    as empty.
    """
    key_columns = ("year", "name_exam", "name_question", "category", "category2")
    key_values = [r.get(col, "") for col in key_columns]
    return hid("Q", *key_values)

# (predicate, source column) pairs for the literal triples of each node, listed
# in the exact order in which they are emitted.
_QUESTION_LITERALS = (
    ("hasAbstract", "abstract"),        # the three question text fields
    ("hasContent", "contents"),
    ("hasDescription", "description"),
    ("hasCategory", "category"),        # category / sub-category
    ("hasSubcategory", "category2"),
)
_EXAM_LITERALS = (
    ("hasTypeA", "sortA"),
    ("hasTypeB", "sortB"),
    ("hasCategory", "sortC"),
    ("hasStage", "sortD"),
    ("hasRound", "sortE"),
)
_TIME_LITERALS = (
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
)
# (predicate, year column, month column, day column) for the sexagenary dates.
_SEXAGENARY = (
    ("sexagenaryKR", "ganji_kr_year", "ganji_kr_month", "ganji_kr_day"),
    ("sexagenaryCN", "ganji_cn_year", "ganji_cn_month", "ganji_cn_day"),
)


def _literal(subject, predicate, value):
    """Build one literal-valued triple."""
    return {"s": subject, "p": predicate, "o": value, "o_type": "lit"}


def _column_literals(subject, row, pairs):
    """Yield a literal triple for each pair whose cleaned column value is non-empty."""
    for predicate, column in pairs:
        value = clean(row.get(column, ""))
        if value:
            yield _literal(subject, predicate, value)


# Write one JSON line per CSV row: the row's exam / time / question nodes plus
# every triple derived from that row (no Answer entity is produced).
with open(TRIPLES, "w", encoding="utf-8") as out:
    for row_index, row in enumerate(df.to_dict(orient="records")):
        exam, when, question = exam_id(row), time_id(row), question_id(row)

        # Structural links between the three nodes come first.
        triples = [
            {"s": exam, "p": "isHeldOn", "o": when, "o_type": "id"},
            {"s": question, "p": "isPartOf", "o": exam, "o_type": "id"},
        ]
        triples.extend(_column_literals(question, row, _QUESTION_LITERALS))
        triples.extend(_column_literals(exam, row, _EXAM_LITERALS))
        triples.extend(_column_literals(when, row, _TIME_LITERALS))

        # Sexagenary dates: "year-month-day" with outer dashes trimmed, so a
        # date whose parts are all empty yields nothing.
        for predicate, y_col, m_col, d_col in _SEXAGENARY:
            parts = [clean(row.get(col, "")) for col in (y_col, m_col, d_col)]
            ganji = "-".join(parts).strip("-")
            if ganji:
                triples.append(_literal(when, predicate, ganji))

        # The source and its URL are attached to the exam first, then to the question.
        source = clean(row.get("source", ""))
        link = clean(row.get("URL", ""))
        for subject, source_pred, url_pred in (
            (exam, "isRecordedIn", "hasRecordURL"),
            (question, "hasSource", "hasSourceURL"),
        ):
            if source:
                triples.append(_literal(subject, source_pred, source))
            if link:
                triples.append(_literal(subject, url_pred, link))

        record = {
            "row_index": row_index,
            "exam": {"id": exam, "name": clean(row.get("name_exam", ""))},
            "time": {"id": when},
            "question": {"id": question, "name": clean(row.get("name_question", ""))},
            "triples": triples,
        }
        out.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"[완료] Answer 제거 버전 triples 저장: {TRIPLES}")