<a href="https://colab.research.google.com/github/seirah-yang/F_roject/blob/main/%EC%83%9D%EC%84%B1%EB%AC%B8%EC%84%9C_%EA%B7%BC%EA%B1%B0%EB%B0%98%EC%98%81_%ED%8F%89%EA%B0%80%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
# -*- coding: utf-8 -*-
import os
import docx
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ─────────────────────────────────────────────
# 1️⃣ 임베더 캐시
# ─────────────────────────────────────────────
_EMBEDDER_CACHE = {"name": None, "model": None}

def _get_embedder(model_name="intfloat/e5-large"):
    global _EMBEDDER_CACHE
    if _EMBEDDER_CACHE["model"] and _EMBEDDER_CACHE["name"] == model_name:
        return _EMBEDDER_CACHE["model"]
    model = SentenceTransformer(model_name)
    _EMBEDDER_CACHE["name"] = model_name
    _EMBEDDER_CACHE["model"] = model
    return model

# ─────────────────────────────────────────────
# 2️⃣ 파일 로드 함수 (DOCX/PDF 자동 판별)
# ─────────────────────────────────────────────
def load_text_from_file(path):
    ext = os.path.splitext(path)[1].lower()
    if ext == ".docx":
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    elif ext == ".pdf":
        reader = PdfReader(path)
        return "\n".join(page.extract_text() for page in reader.pages if page.extract_text())
    else:
        return ""

# ─────────────────────────────────────────────
# 3️⃣ 폴더 내 모든 근거문헌 로드
# ─────────────────────────────────────────────
def load_law_corpus_from_dir(dir_path):
    corpus = []
    for file in os.listdir(dir_path):
        path = os.path.join(dir_path, file)
        if not os.path.isfile(path):
            continue
        if path.endswith((".docx", ".pdf")):
            try:
                text = load_text_from_file(path)
                if text.strip():
                    corpus.append(text)
            except Exception as e:
                print(f"[WARN] {file} 불러오기 실패: {e}")
    return corpus

# ─────────────────────────────────────────────
# 4️⃣ 근거문헌 준수도 평가 함수
# ─────────────────────────────────────────────
def reference_compliance_with_sources(
    section_text,
    law_corpus,
    model_name="intfloat/e5-large",
    threshold=0.8,
    top_k=5
):
    if not section_text or not law_corpus:
        return {"compliance_score": 0.0, "top_references": []}

    model = _get_embedder(model_name)

    emb_section = model.encode([section_text], normalize_embeddings=True)
    emb_law = model.encode(law_corpus, normalize_embeddings=True)

    sims = cosine_similarity(emb_section, emb_law)[0]
    matched_ratio = float((sims >= threshold).sum()) / len(law_corpus)
    top_idx = sims.argsort()[::-1][:top_k]
    top_refs = [(law_corpus[i][:200], float(sims[i])) for i in top_idx]

    return {
        "compliance_score": round(float(matched_ratio), 3),
        "top_references": top_refs
    }

# ─────────────────────────────────────────────
# 5️⃣ 여러 파일 일괄 평가 실행
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # (1) 평가 대상 문서 폴더 경로
    target_dir = "/content/drive/MyDrive/1027"  # ← 평가할 문서 폴더 입력 (.docx / .pdf)
    law_dir = "/content/drive/MyDrive/reference_file"        # ← 근거문헌 폴더 입력

    # (2) 근거문헌 로드
    law_corpus = load_law_corpus_from_dir(law_dir)

    # (3) 평가 대상 폴더 순회
    results = []
    for file in os.listdir(target_dir):
        file_path = os.path.join(target_dir, file)
        if not os.path.isfile(file_path):
            continue
        if file_path.endswith((".docx", ".pdf")):
            print(f"\n📄 평가 중: {file}")
            section_text = load_text_from_file(file_path)
            result = reference_compliance_with_sources(section_text, law_corpus)
            results.append({
                "file": file,
                "compliance_score": result["compliance_score"]
            })
            print(f" → 준수도 점수: {result['compliance_score']:.3f}")

    # (4) 전체 요약 출력
    print("\n✅ 평가 완료 결과 요약:")
    for r in results:
        print(f"{r['file']:50s} | Score: {r['compliance_score']:.3f}")


📄 평가 중: RND_test1027.docx


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

 → 준수도 점수: 1.000

📄 평가 중: section_2_연구개발과제의 배경.docx
 → 준수도 점수: 1.000

📄 평가 중: section_9_추진방법 및 전략.docx
 → 준수도 점수: 1.000

📄 평가 중: section_14_경제적 성과창출의 기대효과.docx
 → 준수도 점수: 1.000

📄 평가 중: section_4_기술개발 핵심어(키워드).docx
 → 준수도 점수: 1.000

📄 평가 중: section_1_연구기획과제의 개요.docx
 → 준수도 점수: 1.000

📄 평가 중: section_8_연차별 개발내용 및 범위.docx
 → 준수도 점수: 1.000

📄 평가 중: section_11_신규사업 신설의 기대효과.docx
 → 준수도 점수: 1.000

📄 평가 중: section_5_연구개발 목표.docx
 → 준수도 점수: 1.000

📄 평가 중: section_10_과제 성과의 활용방안.docx
 → 준수도 점수: 1.000

📄 평가 중: section_6_연구개발 내용.docx
 → 준수도 점수: 1.000

📄 평가 중: section_13_사회적 가치창출의 기대효과.docx
 → 준수도 점수: 1.000

📄 평가 중: section_7_연차별 개발목표.docx
 → 준수도 점수: 1.000

📄 평가 중: section_12_사회적 가치 창출 계획.docx
 → 준수도 점수: 1.000

📄 평가 중: section_3_연구개발과제의 필요성.docx
 → 준수도 점수: 1.000

📄 평가 중: section_17_근거 법령 및 참고 문서 목록

In [9]:
# -*- coding: utf-8 -*-
import os
import docx
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

if __name__ == "__main__":
    # (1) 폴더 경로 설정
    target_dir = "/content/drive/MyDrive/1027"  # ← 평가할 문서 폴더 입력 (.docx / .pdf)
    law_dir = "/content/drive/MyDrive/reference_file"        # ← 근거문헌 폴더 입력
    output_csv = "/content/drive/MyDrive/results_summary.csv"

    # (2) 근거문헌 로드
    print(f"\n📚 근거문헌 로드 중: {law_dir}")
    law_corpus = load_law_corpus_from_dir(law_dir)
    print(f"   → 총 {len(law_corpus)}개 문헌 로드 완료\n")

    # (3) 평가 실행
    results = []
    for file in os.listdir(target_dir):
        file_path = os.path.join(target_dir, file)
        if not os.path.isfile(file_path):
            continue
        if file_path.endswith((".docx", ".pdf")):
            print(f"📄 평가 중: {file}")
            section_text = load_text_from_file(file_path)
            result = reference_compliance_with_sources(section_text, law_corpus)
            row = {
                "file": file,
                "compliance_score": result["compliance_score"]
            }
            # 상위 근거문헌 5개 추가
            for idx, (ref, sim) in enumerate(result["top_references"], start=1):
                row[f"top_reference_{idx}"] = ref[:120]  # 120자 미리보기
                row[f"sim_{idx}"] = round(sim, 3)
            results.append(row)
            print(f"   → 점수: {result['compliance_score']:.3f}")

    # (4) CSV로 저장
    if results:
        df = pd.DataFrame(results)
        df.to_csv(output_csv, index=False, encoding="utf-8-sig")
        print(f"\n✅ 평가 완료! 결과가 저장되었습니다: {output_csv}\n")
    else:
        print("⚠️ 평가 가능한 문서를 찾지 못했습니다.")


📚 근거문헌 로드 중: /content/drive/MyDrive/reference_file
   → 총 3개 문헌 로드 완료

📄 평가 중: RND_test1027.docx
   → 점수: 1.000
📄 평가 중: section_2_연구개발과제의 배경.docx
   → 점수: 1.000
📄 평가 중: section_9_추진방법 및 전략.docx
   → 점수: 1.000
📄 평가 중: section_14_경제적 성과창출의 기대효과.docx
   → 점수: 1.000
📄 평가 중: section_4_기술개발 핵심어(키워드).docx
   → 점수: 1.000
📄 평가 중: section_1_연구기획과제의 개요.docx
   → 점수: 1.000
📄 평가 중: section_8_연차별 개발내용 및 범위.docx
   → 점수: 1.000
📄 평가 중: section_11_신규사업 신설의 기대효과.docx
   → 점수: 1.000
📄 평가 중: section_5_연구개발 목표.docx
   → 점수: 1.000
📄 평가 중: section_10_과제 성과의 활용방안.docx
   → 점수: 1.000
📄 평가 중: section_6_연구개발 내용.docx
   → 점수: 1.000
📄 평가 중: section_13_사회적 가치창출의 기대효과.docx
   → 점수: 1.000
📄 평가 중: section_7_연차별 개발목표.docx
   → 점수: 1.000
📄 평가 중: section_12_사회적 가치 창출 계획.docx
   → 점수: 1.000
📄 평가 중: section_3_연구개발과제의 필요성.docx
   → 점수: 1.000
📄