실험 1. 데이터 유효성(EDA)
- 목적: 스키마/결측/중복/분포 및 간단한 품질지표 산출(“answer 엔티티 없음” 명시).
- 핵심 산출물:
  - 01_schema_summary.csv (컬럼별 타입/결측/중복 예)
  - 02_column_role_guess.csv (역할 추정: exam/question/time/source 등)
  - 20_fk_integrity_report.csv (참조무결성 결과: 비정규화이므로 빈/약함일 수 있음)
  - 50_quality_report.json (요약 지표)
  - 60_processing_flow.md (Mermaid)

In [7]:
ls ../../

README.md
[1m[36manswer_parsing[m[m/
[1m[36mdata[m[m/
[1m[36mexamples[m[m/
[1m[36mfoundationmodel[m[m/
[1m[36mnetwork[m[m/
[1m[36mnotebook[m[m/
requirements.txt
[1m[36mtongu[m[m/
[1m[36mutil[m[m/
과거시험_데이터_정제_기록.md


In [8]:
import pandas as pd

# One-off re-encoding step: load the cp949-encoded source CSV and write a
# UTF-8 copy of it.

# 1. Source file path (the file is encoded as cp949)
input_file = "../../data/gwashi.csv"

# 2. Output file path (will be saved as UTF-8)
# NOTE(review): this path is relative to the working directory, whereas the
# triple-building cell reads `gwashi_utf8.csv` from the repository's `data/`
# directory - confirm the converted file is copied/moved there.
output_file = "gwashi_utf8.csv"

# 3. Read the file, decoding it from cp949
df = pd.read_csv(input_file, encoding='cp949')

# 4. Write it back out as UTF-8 (no BOM); the row index is not written
df.to_csv(output_file, encoding='utf-8', index=False)

print("✅ 인코딩 변환 완료:", output_file)

✅ 인코딩 변환 완료: gwashi_utf8.csv


In [9]:
# SPO 재구축(정정판): Answer 제거, Question 텍스트 3종을 속성으로 매핑
from pathlib import Path
import pandas as pd, json, re, hashlib
# Paths are resolved relative to the working directory: NB_DIR is its parent,
# the input CSV lives in `data/` next to NB_DIR, and outputs go below NB_DIR.
NB_DIR = Path("..").resolve()
DATA   = NB_DIR.parent / "data" / "gwashi_utf8.csv"            # works as-is for the untranslated original; for a translated file just change the file name
OUT    = NB_DIR / "eda_outputs" / "1번실험" / "links_fix"
# Create the output directory (and any missing parents); no error if it exists.
OUT.mkdir(parents=True, exist_ok=True)
# One JSON record per source row is written to this file.
TRIPLES = OUT / "triples_no_answer.jsonl"

def clean(x):
    """Return *x* as a trimmed, single-spaced string.

    Missing values (anything `pd.isna` reports as missing) become the empty
    string; everything else is stringified, stripped, and has each run of
    whitespace collapsed to one space.
    """
    if pd.isna(x):
        return ""
    trimmed = str(x).strip()
    return re.sub(r"\s+", " ", trimmed)
def hid(pfx, *vals):
    """Build a short deterministic identifier from a prefix and field values.

    Each value is normalised with `clean`, the results are joined with "||",
    and the first 12 hex characters of the SHA-1 digest of the UTF-8 encoded
    string are appended to *pfx*.
    """
    normalized = [clean(v) for v in vals]
    payload = "||".join(normalized).encode("utf-8")
    digest = hashlib.sha1(payload).hexdigest()
    return pfx + digest[:12]

# Load the UTF-8 CSV whose rows are turned into triples below.
# NOTE(review): column dtypes are inferred by pandas here; a numeric column
# that contains missing values would presumably be read as float and then
# stringify with a trailing ".0" in IDs/literals - confirm whether
# `dtype=str` is wanted.
df = pd.read_csv(DATA)

def time_id(r):
    """Return the "T"-prefixed identifier of the Time node for row dict *r*.

    When the row has a year, the ID hashes year/month/day plus the three
    `ganji_kr_*` fields; without a year only month and day are hashed.
    """
    year = clean(r.get("year", ""))
    month = clean(r.get("month", ""))
    day = clean(r.get("day", ""))
    if not year:
        return hid("T", month, day)
    ganji_cols = ("ganji_kr_year", "ganji_kr_month", "ganji_kr_day")
    return hid("T", year, month, day, *[r.get(col, "") for col in ganji_cols])
def exam_id(r):
    """Return the "E"-prefixed identifier of the Exam node for row dict *r*.

    The ID hashes, in order: year, sortC, sortD, sortE and name_exam
    (missing keys count as empty strings).
    """
    key_columns = ("year", "sortC", "sortD", "sortE", "name_exam")
    parts = [r.get(col, "") for col in key_columns]
    return hid("E", *parts)
def question_id(r):
    """Return the "Q"-prefixed identifier of the Question node for row dict *r*.

    The ID hashes, in order: year, name_exam, name_question, category and
    category2 (missing keys count as empty strings).
    """
    key_columns = ("year", "name_exam", "name_question", "category", "category2")
    parts = [r.get(col, "") for col in key_columns]
    return hid("Q", *parts)

# Write one JSON line per source row: the row's Exam/Time/Question IDs plus
# every triple derived from that row (two ID-valued relations first, then the
# non-empty literal attributes in a fixed order).
with open(TRIPLES, "w", encoding="utf-8") as f:
    for i, r in enumerate(df.to_dict(orient="records")):
        e, t, q = exam_id(r), time_id(r), question_id(r)

        # Sexagenary (ganji) dates: join year-month-day and drop dangling
        # hyphens left by empty leading/trailing parts.
        kr = "-".join(
            clean(r.get(col, ""))
            for col in ("ganji_kr_year", "ganji_kr_month", "ganji_kr_day")
        ).strip("-")
        cn = "-".join(
            clean(r.get(col, ""))
            for col in ("ganji_cn_year", "ganji_cn_month", "ganji_cn_day")
        ).strip("-")
        src = clean(r.get("source", ""))
        url = clean(r.get("URL", ""))

        # (subject, predicate, literal) candidates in output order; entries
        # whose literal is empty are skipped below.
        literal_candidates = [
            # Question: the three text fields
            (q, "hasAbstract", clean(r.get("abstract", ""))),
            (q, "hasContent", clean(r.get("contents", ""))),
            (q, "hasDescription", clean(r.get("description", ""))),
            # Question: category / sub-category
            (q, "hasCategory", clean(r.get("category", ""))),
            (q, "hasSubcategory", clean(r.get("category2", ""))),
            # Exam: classification columns
            (e, "hasTypeA", clean(r.get("sortA", ""))),
            (e, "hasTypeB", clean(r.get("sortB", ""))),
            (e, "hasCategory", clean(r.get("sortC", ""))),
            (e, "hasStage", clean(r.get("sortD", ""))),
            (e, "hasRound", clean(r.get("sortE", ""))),
            # Time: calendar literals
            (t, "year", clean(r.get("year", ""))),
            (t, "month", clean(r.get("month", ""))),
            (t, "day", clean(r.get("day", ""))),
            (t, "sexagenaryKR", kr),
            (t, "sexagenaryCN", cn),
            # Source, attached to both the exam and the question
            (e, "isRecordedIn", src),
            (e, "hasRecordURL", url),
            (q, "hasSource", src),
            (q, "hasSourceURL", url),
        ]

        triples = [
            {"s": e, "p": "isHeldOn", "o": t, "o_type": "id"},
            {"s": q, "p": "isPartOf", "o": e, "o_type": "id"},
        ]
        triples.extend(
            {"s": subj, "p": pred, "o": lit, "o_type": "lit"}
            for subj, pred, lit in literal_candidates
            if lit
        )

        rec = {
            "row_index": i,
            "exam": {"id": e, "name": clean(r.get("name_exam", ""))},
            "time": {"id": t},
            "question": {"id": q, "name": clean(r.get("name_question", ""))},
            "triples": triples,
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"[완료] Answer 제거 버전 triples 저장: {TRIPLES}")

[완료] Answer 제거 버전 triples 저장: /Users/songhune/Library/Mobile Documents/com~apple~CloudDocs/Workspace/korean_eda/notebook/eda_outputs/1번실험/links_fix/triples_no_answer.jsonl


In [None]:
# === 관계 시각화 러너 (한글/한자 폰트 강력 수정판) ==========================================
from pathlib import Path
import json, re, os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import platform

# ----- 강력한 한글/한자 폰트 설정 -----
def setup_korean_fonts_strong():
    """Configure matplotlib with a font able to render Hangul/Hanja labels.

    Steps:
      1. Best-effort reset of matplotlib's on-disk font cache.
      2. Register the first existing font file from an OS-specific list.
      3. If no file was registered, look for a known family name among the
         fonts matplotlib already knows.
      4. Apply the result to ``plt.rcParams`` ('DejaVu Sans' when nothing was
         found) and disable the Unicode minus sign.

    Returns:
        The selected font family name, or ``None`` when no candidate was
        found (in which case 'DejaVu Sans' is applied).
    """
    # Local import so the documented top-level helper is used.
    # NOTE(review): `font_manager.get_cachedir` is not a documented
    # font_manager API; with the previous bare `except:` a missing attribute
    # would have silently skipped the whole cache reset.
    import matplotlib

    system = platform.system()

    # 1. Try to clear the font cache. This is optional, so failures are
    #    reported and skipped rather than raised; only Exception is caught so
    #    KeyboardInterrupt/SystemExit still propagate.
    try:
        cache_dir = matplotlib.get_cachedir()
        if os.path.exists(cache_dir):
            for entry in os.listdir(cache_dir):
                if entry.startswith('font'):
                    try:
                        os.remove(os.path.join(cache_dir, entry))
                    except OSError:
                        # File vanished or is not removable: ignore it.
                        pass
        # Private matplotlib helper (kept from the original code).
        fm._load_fontmanager(try_read_cache=False)
    except Exception as exc:
        print(f"[FONT] 캐시 초기화 건너뜀: {exc}")

    # 2. Load a font file directly, per operating system.
    font_paths = []
    if system == 'Darwin':  # macOS
        font_paths = [
            '/System/Library/Fonts/Supplemental/AppleGothic.ttf',
            '/System/Library/Fonts/AppleSDGothicNeo.ttc',
            '/Library/Fonts/AppleGothic.ttf',
        ]
    elif system == 'Windows':
        font_paths = ['C:/Windows/Fonts/malgun.ttf', 'C:/Windows/Fonts/gulim.ttc']
    else:  # Linux
        font_paths = ['/usr/share/fonts/truetype/nanum/NanumGothic.ttf']

    found_font = None
    for path in font_paths:
        if not os.path.exists(path):
            continue
        try:
            fm.fontManager.addfont(path)
            found_font = fm.FontProperties(fname=path).get_name()
        except Exception as exc:
            # Unreadable/unsupported font file: report it, try the next one.
            print(f"[FONT] 등록 실패: {path} ({exc})")
            continue
        print(f"[FONT] 등록 성공: {path} -> {found_font}")
        break

    # 3. Fallback: search the registered fonts by family name.
    if not found_font:
        available = {f.name for f in fm.fontManager.ttflist}
        candidates = {
            'Darwin': ['AppleGothic', 'Apple SD Gothic Neo'],
            'Windows': ['Malgun Gothic', 'Gulim'],
            'Linux': ['NanumGothic', 'Noto Sans CJK KR']
        }.get(system, ['DejaVu Sans'])

        for font in candidates:
            if font in available:
                found_font = font
                print(f"[FONT] 찾음: {font}")
                break

    # 4. Apply the configuration.
    if found_font:
        plt.rcParams['font.family'] = found_font
    else:
        plt.rcParams['font.family'] = 'DejaVu Sans'

    plt.rcParams['axes.unicode_minus'] = False
    print(f"✅ 폰트 설정: {plt.rcParams['font.family']}")
    return found_font

# Apply the font configuration (runs before any figure is created below).
setup_korean_fonts_strong()

# ----- Path setup -----
# ROOT is the repository root: two levels above this file when run as a
# script, otherwise two levels above the working directory.
ROOT = Path(__file__).resolve().parents[2] if "__file__" in globals() else Path(".").resolve().parents[1]

# The triple-building step writes its output under ".../1번실험/links_fix/",
# so prefer that file; fall back to the older location directly under
# "1번실험" only when the corrected file is absent but a legacy one exists.
_TRIPLES_DIR = ROOT / "notebook" / "eda_outputs" / "1번실험"
JSONL = _TRIPLES_DIR / "links_fix" / "triples_no_answer.jsonl"
if not JSONL.exists():
    _legacy_jsonl = _TRIPLES_DIR / "triples_no_answer.jsonl"
    if _legacy_jsonl.exists():
        JSONL = _legacy_jsonl

# Directory that receives the rendered graphs; created if missing.
OUTDIR = ROOT / "notebook" / "experiments" / "graphs"
OUTDIR.mkdir(parents=True, exist_ok=True)

print("[INPUT]", JSONL)
print("[OUTDIR]", OUTDIR)

# ----- 유틸 -----
def clean(s):
    """Return *s* as a trimmed string with whitespace runs collapsed.

    ``None`` maps to the empty string; any other value is stringified first.
    """
    if s is None:
        return ""
    trimmed = str(s).strip()
    return re.sub(r"\s+", " ", trimmed)

def guess_type_from_id(node_id):
    """Map a node ID to its entity type using the ID's first character.

    "E" -> "Exam", "Q" -> "Question", "T" -> "Time" (case-insensitive);
    empty/None IDs and any other prefix give "Unknown".
    """
    if not node_id:
        return "Unknown"
    prefix_to_type = {"E": "Exam", "Q": "Question", "T": "Time"}
    prefix = node_id[0].upper()
    return prefix_to_type.get(prefix, "Unknown")

# ----- Load the JSONL file -----
# Each non-blank line is one JSON record; blank lines are skipped.
rows = []
with open(JSONL, "r", encoding="utf-8") as f:
    rows.extend(json.loads(line) for line in f if line.strip())

print(f"[LOAD] records: {len(rows)}")

# ===== 1) Schema graph (type level) =====
# Count every ID-valued triple by (subject type, predicate, object type);
# literal-valued triples are ignored.
schema_counts = {}
for rec in rows:
    id_triples = [tr for tr in rec["triples"] if tr.get("o_type") == "id"]
    for tr in id_triples:
        subj_type = guess_type_from_id(tr["s"])
        obj_type = guess_type_from_id(tr["o"])
        key = (subj_type, tr["p"], obj_type)
        schema_counts[key] = schema_counts.get(key, 0) + 1

# One edge per (subject type, object type) pair; predicates between the same
# pair are merged into a label set and their counts are summed.
SG = nx.DiGraph()
for (st, p, ot), cnt in schema_counts.items():
    SG.add_node(st)
    SG.add_node(ot)
    if not SG.has_edge(st, ot):
        SG.add_edge(st, ot, labels={p}, count=cnt)
        continue
    edge_attrs = SG[st][ot]
    edge_attrs["labels"].add(p)
    edge_attrs["count"] += cnt

# Draw the type-level schema graph and save it as a PNG.
plt.figure(figsize=(6, 4))
# Fixed seed so the layout is reproducible between runs.
pos = nx.spring_layout(SG, seed=42)
nx.draw_networkx_nodes(SG, pos, node_size=1800)
nx.draw_networkx_labels(SG, pos, font_size=11)
nx.draw_networkx_edges(SG, pos, arrows=True)
# Edge label: the sorted predicate names plus the number of triples (n=...).
edge_labels = {
    (u, v): ", ".join(sorted(d["labels"])) + f"\n(n={d['count']})"
    for u, v, d in SG.edges(data=True)
}
nx.draw_networkx_edge_labels(SG, pos, edge_labels=edge_labels, font_size=9)
schema_png = OUTDIR / "schema_graph.png"
plt.tight_layout()
plt.savefig(schema_png, dpi=200, bbox_inches='tight')
# Close the figure so the next plot starts from a fresh one.
plt.close()
print("[SAVE]", schema_png)

# ===== 2) 인스턴스 그래프(샘플) =====
def build_instance_graph(rows_subset):
    """Build a directed graph of concrete Exam/Question/Time nodes.

    Args:
        rows_subset: iterable of loaded records; each has a "triples" list
            and optional "exam"/"question"/"time" dicts carrying an "id".

    Returns:
        An ``nx.DiGraph`` whose nodes carry a ``type`` attribute and whose
        edges carry ``labels`` (set of predicates) and ``count`` (number of
        ID-valued triples merged into that edge).
    """
    graph = nx.DiGraph()
    for rec in rows_subset:
        # Register the record's own nodes first, in exam/question/time order.
        node_ids = [rec.get(section, {}).get("id", "")
                    for section in ("exam", "question", "time")]
        for node_id in node_ids:
            if node_id:
                graph.add_node(node_id, type=guess_type_from_id(node_id))

        for tr in rec["triples"]:
            # Only relations between nodes become edges; literals are skipped.
            if tr.get("o_type") != "id":
                continue
            src, pred, dst = tr["s"], tr["p"], tr["o"]
            if not (src and dst):
                continue
            if not graph.has_edge(src, dst):
                graph.add_edge(src, dst, labels={pred}, count=1)
            else:
                edge_attrs = graph[src][dst]
                edge_attrs["labels"].add(pred)
                edge_attrs["count"] += 1
    return graph

# Draw the instance-level graph for a sample of the records (at most the
# first 300) and save it as a PNG.
start, end = 0, min(300, len(rows))
G = build_instance_graph(rows[start:end])

plt.figure(figsize=(10, 8))
# Fixed seed so the layout is reproducible between runs.
pos = nx.spring_layout(G, k=0.7, seed=7)
nx.draw_networkx_nodes(G, pos, node_size=300)
# Node label: entity type plus the first 10 characters of the ID.
labels = {n: f"{d.get('type','')}:{n[:10]}" for n,d in G.nodes(data=True)}
nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)
nx.draw_networkx_edges(G, pos, arrows=True, width=0.8)
# "labels" is a set, so taking an arbitrary element would drop predicates and
# could differ between runs; join the sorted set instead (same rule as the
# schema graph). For a single predicate the label is unchanged.
edge_labels = {(u, v): ", ".join(sorted(d["labels"])) for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7)
inst_png = OUTDIR / f"instance_graph_rows{start}_{end}.png"
plt.tight_layout()
plt.savefig(inst_png, dpi=200, bbox_inches='tight')
plt.close()
print("[SAVE]", inst_png)

# (Optional) interactive pyvis export of the same instance graph.
# Best effort: if pyvis is missing or the export fails, report and continue.
try:
    from pyvis.network import Network
    net = Network(height="700px", width="100%", directed=True, notebook=False)
    for n, d in G.nodes(data=True):
        net.add_node(n, label=n, title=f"type={d.get('type','')}")
    for u, v, d in G.edges(data=True):
        # "labels" is a set; sort it so the label text is stable across runs.
        title = ", ".join(sorted(d.get("labels", [])))
        net.add_edge(u, v, title=title, label=title)
    html = OUTDIR / f"instance_graph_rows{start}_{end}.html"
    # Only write the HTML file. `show()` additionally tries to display the
    # page and defaults to notebook mode, which does not match the
    # `notebook=False` network created above.
    net.save_graph(str(html))
    print("[SAVE]", html)
except Exception as e:
    print("[INFO] pyvis 미설치 또는 HTML 생성 실패:", e)

print("\n✅ 그래프 시각화 완료!")