<a href="https://colab.research.google.com/github/seoyen1122/solar_rag/blob/main/mmlu_pro/law.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip3 install -qU python-dotenv PyPDF2 langchain langchain-community langchain-core langchain-text-splitters langchain_upstage oracledb python-dotenv

In [24]:
! pip install -q openai langchain tiktoken faiss-cpu

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
from langchain_upstage import UpstageEmbeddings, ChatUpstage

In [27]:
import os

# SECURITY: the API key must not be committed with the notebook. It is read
# from the UPSTAGE_API_KEY environment variable instead (in Colab, set it
# before running this cell, e.g. from the "Secrets" panel). The key that was
# previously hard-coded on this line has been published and should be revoked.
UPSTAGE_API_KEY = os.environ.get("UPSTAGE_API_KEY", "")
if not UPSTAGE_API_KEY:
    print("WARNING: UPSTAGE_API_KEY is not set; Upstage API calls will fail.")

In [28]:
upstage_embeddings = UpstageEmbeddings(api_key=UPSTAGE_API_KEY, model="embedding-passage")

In [29]:
cd /content/drive/MyDrive/Colab Notebooks/2025_2/nlp

/content/drive/MyDrive/Colab Notebooks/2025_2/nlp


# Cornell.edu
**알파벳 전체를 돌면서 URL 리스트 만들기**

In [30]:
import time
import string
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

In [31]:
# Site root; relative "/wex/..." links are resolved against it with urljoin().
base = "https://www.law.cornell.edu"
# Per-letter Wex index page; "{letter}" is filled in by collect_links().
index_template = base + "/wex/all/{letter}"

# Headers sent with every request so the crawler identifies itself and its
# purpose to the site.
header = {"User-Agent": "law/0.1 (for project; contact:seoyen1122@gmail.com)"}

In [32]:
def collect_links(letters=None, delay=3, timeout=30):
    """Collect the URLs of all Wex entries listed on the per-letter index pages.

    Args:
        letters: Iterable of index letters to visit. Defaults to "a".."z".
        delay: Seconds to sleep after each index page (politeness delay).
        timeout: Seconds to wait for the server before giving up on a request.
            Without a timeout a stalled connection would block the crawl
            forever.

    Returns:
        A list of ``{"term": <link text>, "url": <absolute URL>}`` dicts, one
        per distinct URL, in the order the links were first seen.

    Raises:
        requests.HTTPError: If an index page answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if letters is None:
        letters = string.ascii_lowercase

    seen = set()
    entries = []

    for letter in letters:
        index_url = index_template.format(letter=letter)
        resp = requests.get(index_url, headers=header, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        for a in soup.find_all("a", href=True):
            href = a["href"]

            # Keep only links to Wex entries; skip the index pages themselves.
            if not href.startswith("/wex/") or "/wex/all" in href:
                continue

            full_url = urljoin(base, href)
            if full_url in seen:
                continue

            seen.add(full_url)
            entries.append({"term": a.get_text(strip=True), "url": full_url})

        time.sleep(delay)

    print("total entries:", len(entries))
    return entries

**각 항목 페이지에서 본문 텍스트 추출**

상단 네비게이션

h1 제목(용어)

그 아래 여러 단락/리스트가 정의/설명

맨 아래에 wex toolbox, terms of use같은 공통 푸터

main 안에 있는 p, li 태그 텍스트를 h2 섹션 단위로 합치는 식으로 본문만 자르기

In [33]:
def fetch_article(url, timeout=30):
    """Download one Wex article and split its body text into sections.

    The text of every <p> and <li> inside <main> (or the whole document when
    there is no <main>) is collected. Each <h2> starts a new section; text
    that appears before the first <h2> is filed under the page title, or under
    "Definition" when the page has no title.

    Args:
        url: Absolute URL of the article.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``{"source_url": url, "title": str, "chunk_list": [...]}`` where each
        item of ``chunk_list`` is ``{"section_title": str, "text": str}`` and
        paragraphs inside a section are joined with blank lines. Sections
        without text are omitted, so ``chunk_list`` may be empty.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    resp = requests.get(url, headers=header, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    main = soup.find("main") or soup

    # Page title: first h1/h2 inside the main area.
    title_tag = main.find(["h1", "h2"])
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Rough footer removal: drop everything that follows the element holding
    # the "Wex Toolbox" text. extract() works for tags and text nodes alike,
    # so no exception has to be swallowed. list() snapshots the siblings
    # because extract() unlinks them while we iterate.
    toolbox = main.find(string=lambda s: isinstance(s, str) and "Wex Toolbox" in s)
    if toolbox:
        for sib in list(toolbox.parent.next_siblings):
            sib.extract()

    # (section title, paragraphs) pairs in document order.
    sections = [(title or "Definition", [])]
    for element in main.find_all(["h2", "p", "li"]):
        if element.name == "h2":
            sections.append((element.get_text(" ", strip=True) or title, []))
            continue

        # A <p>/<li> nested in another <p>/<li> is already part of the outer
        # element's text; collecting it again would duplicate the content.
        if element.find_parent(["p", "li"]) is not None:
            continue

        txt = element.get_text(" ", strip=True)
        if txt:
            sections[-1][1].append(txt)

    chunks = []
    for section_title, texts in sections:
        text = "\n\n".join(texts).strip()
        if text:
            chunks.append({"section_title": section_title, "text": text})

    return {
        "source_url": url,
        "title": title,
        "chunk_list": chunks,
    }

**전체 크롤링 + jsonl로 저장**



In [34]:
import json

In [35]:
def crawl(output_path="wex_raw.jsonl", letters=None):
    """Crawl all Wex entries and write them to a JSON Lines file.

    Every article that yields at least one text section is written as one
    JSON object per line. The file is flushed after each record so that an
    interrupted run keeps what was fetched so far.

    A page that cannot be downloaded is reported and skipped instead of
    aborting the whole (multi-hour) crawl.

    Args:
        output_path: Destination file; overwritten if it exists.
        letters: Index letters to crawl, passed to collect_links().
    """
    entries = collect_links(letters)
    failed = 0

    with open(output_path, "w", encoding="utf-8") as f:
        for entry in entries:
            url = entry["url"]
            try:
                meta = fetch_article(url)
            except requests.RequestException as exc:
                failed += 1
                print(f"skipped {url}: {exc}")
            else:
                if meta.get("chunk_list"):
                    f.write(json.dumps(meta, ensure_ascii=False) + "\n")
                    f.flush()

            # Politeness delay between article requests.
            time.sleep(3)

    if failed:
        print("failed pages:", failed)


**청킹 + 임베딩 + FAISS**

In [36]:
import os
os.environ["UPSTAGE_API_KEY"] = UPSTAGE_API_KEY

In [37]:
import json
import faiss
import numpy as np
from langchain_upstage import UpstageEmbeddings

In [38]:
def chunk_text(text, max_chars=800, overlap_chars=200):
    """Split *text* into overlapping windows of at most *max_chars* characters.

    Consecutive chunks share *overlap_chars* characters. The last chunk ends
    exactly at the end of the text and may be shorter than *max_chars*.

    Args:
        text: String to split.
        max_chars: Maximum length of each chunk; must be positive.
        overlap_chars: Characters repeated between neighbouring chunks; must
            satisfy ``0 <= overlap_chars < max_chars``.

    Returns:
        List of chunk strings; empty when *text* is empty.

    Raises:
        ValueError: If the sizes are invalid. Such values would otherwise
            make the window stop advancing and loop forever.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if not 0 <= overlap_chars < max_chars:
        raise ValueError("overlap_chars must satisfy 0 <= overlap_chars < max_chars")

    n = len(text)
    step = max_chars - overlap_chars
    chunks = []
    for start in range(0, n, step):
        chunks.append(text[start:start + max_chars])
        # Stop once a window reaches the end; later windows would only
        # repeat the tail of the text.
        if start + max_chars >= n:
            break
    return chunks

def load_chunks(jsonl_path, max_chars=None, overlap_chars=200):
    """Load section texts and their metadata from a crawl JSONL file.

    Each line of the file is one article record with the keys "source_url",
    "title" and "chunk_list". Sections whose text is blank are skipped, and
    so are blank lines in the file.

    Args:
        jsonl_path: Path of the JSON Lines file to read.
        max_chars: When None, every section is returned as one chunk.
            Otherwise each section is split with chunk_text() into pieces of
            at most this many characters.
        overlap_chars: Overlap passed to chunk_text(); unused when
            *max_chars* is None.

    Returns:
        ``(chunks, metadata)``: two lists of equal length. ``metadata[i]``
        describes ``chunks[i]`` with the keys "source_url", "title",
        "section_title", "section_index" and "subchunk_index".
    """
    chunks = []
    metadata = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            rec = json.loads(line)
            source_url = rec["source_url"]
            title = rec["title"]

            for sec_idx, section in enumerate(rec.get("chunk_list", [])):
                text = section.get("text", "")
                if not text.strip():
                    continue

                if max_chars is None:
                    pieces = [text]
                else:
                    pieces = chunk_text(text, max_chars, overlap_chars)

                section_title = section.get("section_title", title)
                for sub_idx, piece in enumerate(pieces):
                    chunks.append(piece)
                    metadata.append({
                        "source_url": source_url,
                        "title": title,
                        "section_title": section_title,
                        "section_index": sec_idx,
                        "subchunk_index": sub_idx,
                    })

    return chunks, metadata

In [40]:
def build_faiss_index(
    jsonl_path="wex_structured.jsonl",
    index_path="wex_faiss.index",
    meta_path="wex_metadata.jsonl",
    max_chars=800,
):
    """Embed the crawled sections and store them in a FAISS index.

    Vectors are L2-normalised and added to an inner-product index, so search
    scores are cosine similarities. Metadata is written as JSON Lines in the
    same order as the vectors: line i describes index row i.

    NOTE: a later cell of this notebook redefines this function.

    Args:
        jsonl_path: Crawl output to read (see load_chunks()).
        index_path: Where the FAISS index is written.
        meta_path: Where the per-vector metadata is written.
        max_chars: Maximum characters per embedded chunk. Sections used to be
            embedded whole (max_chars=None) and the embedding API rejected
            inputs longer than its 4000-token limit; pass None to restore
            that behaviour.

    Raises:
        ValueError: If the input file yields no chunks.
    """
    chunks, metadata = load_chunks(jsonl_path, max_chars=max_chars)
    print("num_chunks:", len(chunks))
    if not chunks:
        raise ValueError(f"no chunks found in {jsonl_path!r}")

    embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

    vecs = embeddings.embed_documents(chunks)
    emb = np.array(vecs, dtype=np.float32)
    print("emb shape", emb.shape)

    dim = emb.shape[1]

    # Normalise in place so that inner product equals cosine similarity.
    faiss.normalize_L2(emb)
    index = faiss.IndexFlatIP(dim)
    index.add(emb)

    faiss.write_index(index, index_path)

    with open(meta_path, "w", encoding="utf-8") as f:
        for m in metadata:
            f.write(json.dumps(m, ensure_ascii=False) + "\n")

    print("index saved to", index_path)
    print("metadata saved to", meta_path)

In [None]:
crawl("wex_structured_all.jsonl", letters=None)

total entries: 5456
num_chunks: 5544


BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4000 tokens, but your request contains 4403 tokens. Please reduce the length of your input text or select only the most relevant portions to include in your request. For information on token counting methods and model-specific limits, please refer to our API reference documentation (https://console.upstage.ai/api/embeddings)", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_body'}}

In [21]:
from langchain_upstage import UpstageEmbeddings
import numpy as np
import faiss
import json

def build_faiss_index(
    jsonl_path="wex_structured_all.jsonl",
    index_path="wex_faiss_all.index",
    meta_path="wex_metadata_all.jsonl",
    max_chars=800,
    batch_size=256,
):
    """Embed the crawled sections in batches and store them in a FAISS index.

    Vectors are L2-normalised and added to an inner-product index, so search
    scores are cosine similarities. Metadata is written as JSON Lines in the
    same order as the vectors: line i describes index row i.

    Args:
        jsonl_path: Crawl output to read (see load_chunks()).
        index_path: Where the FAISS index is written.
        meta_path: Where the per-vector metadata is written.
        max_chars: Maximum characters per embedded chunk.
        batch_size: Number of chunks sent per embed_documents() call.

    Raises:
        ValueError: If there is nothing to embed.
    """
    # 1) Load chunks and metadata.
    chunks, metadata = load_chunks(jsonl_path, max_chars=max_chars)

    # Drop blank chunks TOGETHER with their metadata. Filtering only the text
    # inside each batch would leave more metadata lines than vectors and
    # shift every later row against its metadata.
    kept = [(c, m) for c, m in zip(chunks, metadata) if c.strip()]
    if not kept:
        raise ValueError(f"no chunks found in {jsonl_path!r}")
    chunks = [c for c, _ in kept]
    metadata = [m for _, m in kept]
    print("num_chunks:", len(chunks))

    # 2) Upstage embedding model (base model name, without a suffix).
    embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

    # 3) Embed in batches.
    all_vecs = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i : i + batch_size]
        all_vecs.extend(embeddings.embed_documents(batch))
        print(f"embedded {i + len(batch)} / {len(chunks)}")

    emb = np.array(all_vecs, dtype=np.float32)
    print("emb shape", emb.shape)

    dim = emb.shape[1]
    # Normalise in place so that inner product equals cosine similarity.
    faiss.normalize_L2(emb)
    index = faiss.IndexFlatIP(dim)
    index.add(emb)

    faiss.write_index(index, index_path)

    with open(meta_path, "w", encoding="utf-8") as f:
        for m in metadata:
            f.write(json.dumps(m, ensure_ascii=False) + "\n")

    print("index saved to", index_path)
    print("metadata saved to", meta_path)


In [22]:
build_faiss_index(
    jsonl_path="wex_structured_all.jsonl",
    index_path="wex_faiss_all.index",
    meta_path="wex_metadata_all.jsonl",
)

num_chunks: 14385
embedded 256 / 14385
embedded 512 / 14385
embedded 768 / 14385
embedded 1024 / 14385
embedded 1280 / 14385
embedded 1536 / 14385
embedded 1792 / 14385
embedded 2048 / 14385
embedded 2304 / 14385
embedded 2560 / 14385
embedded 2816 / 14385
embedded 3072 / 14385
embedded 3328 / 14385
embedded 3584 / 14385
embedded 3840 / 14385
embedded 4096 / 14385
embedded 4352 / 14385
embedded 4608 / 14385
embedded 4864 / 14385
embedded 5120 / 14385
embedded 5376 / 14385
embedded 5632 / 14385
embedded 5888 / 14385
embedded 6144 / 14385
embedded 6400 / 14385
embedded 6656 / 14385
embedded 6912 / 14385
embedded 7168 / 14385
embedded 7424 / 14385
embedded 7680 / 14385
embedded 7936 / 14385
embedded 8192 / 14385
embedded 8448 / 14385
embedded 8704 / 14385
embedded 8960 / 14385
embedded 9216 / 14385
embedded 9472 / 14385
embedded 9728 / 14385
embedded 9984 / 14385
embedded 10240 / 14385
embedded 10496 / 14385
embedded 10752 / 14385
embedded 11008 / 14385
embedded 11264 / 14385
embedded 115

In [41]:
pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [42]:
#!/usr/bin/env python3
"""
create_faiss_index.py

- Reads 'wex_structured_all.jsonl' (the section-chunked scraping output).
- Loads every chunk (section) of every entry.
- (Safety net) A section longer than 1000 characters is split further into
  pieces of at most 1000 characters.
- Keeps the 'title', 'source' (URL) and 'section' metadata on every piece.
- Embeds all pieces with the `upstage_embeddings` model created earlier in
  this notebook.
- Saves the result as a local FAISS index named 'faiss_index_law'.
"""

import jsonlines
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
import sys
import time

# ----------------------------------------------------
# 1. Settings
# ----------------------------------------------------
JSONL_FILE = "wex_structured_all.jsonl"             # input file (scraping output)
INDEX_NAME = "faiss_index_law"    # name of the FAISS index to save

# "Safety net" chunking settings (in case an H2 section is too large)
CHUNK_SIZE = 1000   # maximum number of characters per chunk
CHUNK_OVERLAP = 100 # characters shared by consecutive chunks

# ----------------------------------------------------
# 2. Embedding model
# ----------------------------------------------------
# This cell reuses the `upstage_embeddings` object created near the top of the
# notebook. Fail immediately with an accurate message when it is missing,
# instead of reporting success and crashing later during index creation.
try:
    upstage_embeddings
except NameError as exc:
    raise RuntimeError(
        "`upstage_embeddings` is not defined. Run the cell that creates the "
        "UpstageEmbeddings instance before this one."
    ) from exc
print("Embedding model loaded.")

# ----------------------------------------------------
# 3. Load the JSONL file and apply the "safety net" chunking
# ----------------------------------------------------
print(f"Loading '{JSONL_FILE}' and applying safety net chunking...")

# "Safety net" text splitter, used in case an H2 section is too large
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

all_final_chunks = [] # Document objects that will finally go into FAISS

try:
    with jsonlines.open(JSONL_FILE, 'r') as reader:
        for entry in reader:
            # (1) Base metadata (page level)
            base_metadata = {
                "source": entry.get("source_url", "N/A"),
                "title": entry.get("title", "N/A"),
            }

            # (2) Walk the section-level 'chunk_list' of this entry
            for chunk in entry.get("chunk_list", []):
                section_text = chunk.get("text")
                section_title = chunk.get("section_title", "N/A")

                # Skip sections without text
                if not section_text:
                    continue

                # (3) When the section text exceeds CHUNK_SIZE (1000 chars),
                #     text_splitter cuts it into smaller "mini chunks"
                split_texts = text_splitter.split_text(section_text)

                # (4) Turn each mini chunk into a Document object
                for text_piece in split_texts:
                    # Add the 'section' information to a copy of the page metadata
                    final_metadata = base_metadata.copy()
                    final_metadata["section"] = section_title

                    new_doc = Document(page_content=text_piece, metadata=final_metadata)
                    all_final_chunks.append(new_doc)

except FileNotFoundError:
    print(f"Error: '{JSONL_FILE}' not found. Please run the scraping script first.")
    sys.exit()

print(f"Total 'mini-chunks' to be indexed: {len(all_final_chunks)}")

# ----------------------------------------------------
# 4. Embed with FAISS and save
# ----------------------------------------------------
if all_final_chunks:
    print("Starting FAISS index creation... (This may take a long time)")
    start_time = time.time()

    # FAISS.from_documents() embeds the text of every Document and keeps its
    # metadata in the vector store.
    # NOTE(review): despite its name, `db_psychology` holds the law (Wex)
    # index; the name looks carried over from another notebook.
    db_psychology = FAISS.from_documents(all_final_chunks, upstage_embeddings)

    end_time = time.time()
    print(f"FAISS index created successfully in {end_time - start_time:.2f} seconds.")

    # Save the created index to a local folder
    db_psychology.save_local(INDEX_NAME)

    print(f"FAISS index saved to folder: '{INDEX_NAME}'")
else:
    print("No chunks were created. FAISS index not built.")

Embedding model loaded.
Loading 'wex_structured_all.jsonl' and applying safety net chunking...
Total 'mini-chunks' to be indexed: 12089
Starting FAISS index creation... (This may take a long time)
FAISS index created successfully in 1463.01 seconds.
FAISS index saved to folder: 'faiss_index_law'
