<a href="https://colab.research.google.com/github/seoyen1122/solar_rag/blob/main/history.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### history

### preprocessing: 1. US histoy


In [1]:
#!/usr/bin/env python3
"""
scrape_history_content.py (★ No Selenium Ver. ★)

- (★수정★) TARGET_HISTORY_CHAPTER_URLS 목록에 U.S. History 176개 URL을 모두 채워넣었습니다.
- (자동) 각 URL을 requests로 방문하여, 페이지 제목(<h1>)을 'title'로 가져옵니다.
- (자동) 페이지 내부의 <h2> 태그를 기준으로 "Semantic Chunking"을 수행합니다.
- (자동) Philosophy와 동일한 메타데이터 구조로 'history.jsonl' 파일에 저장합니다.
"""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import urllib.robotparser
import time
import json
import sys
from typing import List, Dict

# 설정
BASE_DOMAIN = "https://openstax.org"
OUTPUT_FILE = "history.jsonl"
DELAY_SECONDS = 0.5 # (requests는 selenium보다 빠르므로 0.5초도 충분)
MAX_RETRIES = 3

# robots.txt 검사 (OpenStax용)
ROBOTS_TXT = urljoin(BASE_DOMAIN, "/robots.txt")
rp = urllib.robotparser.RobotFileParser()
rp.set_url(ROBOTS_TXT)
try:
    rp.read()
except Exception as e:
    print(f"Warning: cannot read robots.txt at {ROBOTS_TXT}: {e}", file=sys.stderr)

HEADERS = {
    "User-Agent": "SolarPro-RAG-Scraper/1.0 (+https://your.org/contact) Python requests"
}


def can_fetch(url: str) -> bool:
    try:
        parsed = urlparse(url)
        return rp.can_fetch(HEADERS["User-Agent"], parsed.path)
    except Exception:
        return True


def get_soup(url: str, retries: int = MAX_RETRIES) -> BeautifulSoup:
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                return BeautifulSoup(resp.text, "html.parser")
            else:
                print(f"Warning: status {resp.status_code} for {url}", file=sys.stderr)
        except Exception as e:
            print(f"Request error ({attempt}/{retries}) for {url}: {e}", file=sys.stderr)
        time.sleep(2 * attempt)
    return None


# ----------------------------------------------------
# (★수정됨★) U.S. History 176개 URL 전체 목록
# ----------------------------------------------------
TARGET_HISTORY_CHAPTER_URLS = [
    # Chapter 1
    'https://openstax.org/books/us-history/pages/1-1-the-americas',
    'https://openstax.org/books/us-history/pages/1-2-europe-on-the-brink-of-change',
    'https://openstax.org/books/us-history/pages/1-3-west-africa-and-the-role-of-slavery',
    # Chapter 2
    'https://openstax.org/books/us-history/pages/2-1-portuguese-exploration-and-spanish-conquest',
    'https://openstax.org/books/us-history/pages/2-2-religious-upheavals-in-the-developing-atlantic-world',
    'https://openstax.org/books/us-history/pages/2-3-challenges-to-spains-supremacy',
    'https://openstax.org/books/us-history/pages/2-4-new-worlds-in-the-americas-labor-commerce-and-the-columbian-exchange',
    # Chapter 3
    'https://openstax.org/books/us-history/pages/3-1-spanish-exploration-and-colonial-society',
    'https://openstax.org/books/us-history/pages/3-2-colonial-rivalries-dutch-and-french-colonial-ambitions',
    'https://openstax.org/books/us-history/pages/3-3-english-settlements-in-america',
    'https://openstax.org/books/us-history/pages/3-4-the-impact-of-colonization',
    # Chapter 4
    'https://openstax.org/books/us-history/pages/4-1-charles-ii-and-the-restoration-colonies',
    'https://openstax.org/books/us-history/pages/4-2-the-glorious-revolution-and-the-english-empire',
    'https://openstax.org/books/us-history/pages/4-3-an-empire-of-slavery-and-the-consumer-revolution',
    'https://openstax.org/books/us-history/pages/4-4-great-awakening-and-enlightenment',
    'https://openstax.org/books/us-history/pages/4-5-wars-for-empire',
    # Chapter 5
    'https://openstax.org/books/us-history/pages/5-1-confronting-the-national-debt-the-aftermath-of-the-french-and-indian-war',
    'https://openstax.org/books/us-history/pages/5-2-the-stamp-act-and-the-sons-and-daughters-of-liberty',
    'https://openstax.org/books/us-history/pages/5-3-the-townshend-acts-and-colonial-protest',
    'https://openstax.org/books/us-history/pages/5-4-the-destruction-of-the-tea-and-the-coercive-acts',
    'https://openstax.org/books/us-history/pages/5-5-disaffection-the-first-continental-congress-and-american-identity',
    # Chapter 6
    'https://openstax.org/books/us-history/pages/6-1-britains-law-and-order-strategy-and-its-consequences',
    'https://openstax.org/books/us-history/pages/6-2-the-early-years-of-the-revolution',
    'https://openstax.org/books/us-history/pages/6-3-war-in-the-south',
    'https://openstax.org/books/us-history/pages/6-4-identity-during-the-american-revolution',
    # Chapter 7
    'https://openstax.org/books/us-history/pages/7-1-common-sense-from-monarchy-to-an-american-republic',
    'https://openstax.org/books/us-history/pages/7-2-how-much-revolutionary-change',
    'https://openstax.org/books/us-history/pages/7-3-debating-democracy',
    'https://openstax.org/books/us-history/pages/7-4-the-constitutional-convention-and-federal-constitution',
    # Chapter 8
    'https://openstax.org/books/us-history/pages/8-1-competing-visions-federalists-and-democratic-republicans',
    'https://openstax.org/books/us-history/pages/8-2-the-new-american-republic',
    'https://openstax.org/books/us-history/pages/8-3-partisan-politics',
    'https://openstax.org/books/us-history/pages/8-4-the-united-states-goes-back-to-war',
    # Chapter 9
    'https://openstax.org/books/us-history/pages/9-1-early-industrialization-in-the-northeast',
    'https://openstax.org/books/us-history/pages/9-2-a-vibrant-capitalist-republic',
    'https://openstax.org/books/us-history/pages/9-3-on-the-move-the-transportation-revolution',
    'https://openstax.org/books/us-history/pages/9-4-a-new-social-order-class-divisions',
    # Chapter 10
    'https://openstax.org/books/us-history/pages/10-1-a-new-political-style-from-john-quincy-adams-to-andrew-jackson',
    'https://openstax.org/books/us-history/pages/10-2-the-rise-of-american-democracy',
    'https://openstax.org/books/us-history/pages/10-3-the-nullification-crisis-and-the-bank-war',
    'https://openstax.org/books/us-history/pages/10-4-indian-removal',
    'https://openstax.org/books/us-history/pages/10-5-the-tyranny-and-triumph-of-the-majority',
    # Chapter 11
    'https://openstax.org/books/us-history/pages/11-1-lewis-and-clark',
    'https://openstax.org/books/us-history/pages/11-2-the-missouri-crisis',
    'https://openstax.org/books/us-history/pages/11-3-independence-for-texas',
    'https://openstax.org/books/us-history/pages/11-4-the-mexican-american-war-1846-1848',
    'https://openstax.org/books/us-history/pages/11-5-free-or-slave-soil-the-dilemma-of-the-west',
    # Chapter 12
    'https://openstax.org/books/us-history/pages/12-1-the-economics-of-cotton',
    'https://openstax.org/books/us-history/pages/12-2-african-americans-in-the-antebellum-united-states',
    'https://openstax.org/books/us-history/pages/12-3-wealth-and-culture-in-the-south',
    'https://openstax.org/books/us-history/pages/12-4-the-filibuster-and-the-quest-for-new-slave-states',
    # Chapter 13
    'https://openstax.org/books/us-history/pages/13-1-an-awakening-of-religion-and-individualism',
    'https://openstax.org/books/us-history/pages/13-2-antebellum-communal-experiments',
    'https://openstax.org/books/us-history/pages/13-3-reforms-to-human-health',
    'https://openstax.org/books/us-history/pages/13-4-addressing-slavery',
    'https://openstax.org/books/us-history/pages/13-5-womens-rights',
    # Chapter 14
    'https://openstax.org/books/us-history/pages/14-1-the-compromise-of-1850',
    'https://openstax.org/books/us-history/pages/14-2-the-kansas-nebraska-act-and-the-republican-party',
    'https://openstax.org/books/us-history/pages/14-3-the-dred-scott-decision-and-sectional-strife',
    'https://openstax.org/books/us-history/pages/14-4-john-brown-and-the-election-of-1860',
    # Chapter 15
    'https://openstax.org/books/us-history/pages/15-1-the-origins-and-outbreak-of-the-civil-war',
    'https://openstax.org/books/us-history/pages/15-2-early-mobilization-and-war',
    'https://openstax.org/books/us-history/pages/15-3-1863-the-changing-nature-of-the-war',
    'https://openstax.org/books/us-history/pages/15-4-the-union-triumphant',
    # Chapter 16
    'https://openstax.org/books/us-history/pages/16-1-restoring-the-union',
    'https://openstax.org/books/us-history/pages/16-2-congress-and-the-remaking-of-the-south-1865-1866',
    'https://openstax.org/books/us-history/pages/16-3-radical-reconstruction-1867-1872',
    'https://openstax.org/books/us-history/pages/16-4-the-collapse-of-reconstruction',
    # Chapter 17
    'https://openstax.org/books/us-history/pages/17-1-the-westward-spirit',
    'https://openstax.org/books/us-history/pages/17-2-homesteading-dreams-and-realities',
    'https://openstax.org/books/us-history/pages/17-3-making-a-living-in-gold-and-cattle',
    'https://openstax.org/books/us-history/pages/17-4-the-assault-on-american-indian-life-and-culture',
    'https://openstax.org/books/us-history/pages/17-5-the-impact-of-expansion-on-chinese-immigrants-and-hispanic-citizens',
    # Chapter 18
    'https://openstax.org/books/us-history/pages/18-1-inventors-of-the-age',
    'https://openstax.org/books/us-history/pages/18-2-from-invention-to-industrial-growth',
    'https://openstax.org/books/us-history/pages/18-3-building-industrial-america-on-the-backs-of-labor',
    'https://openstax.org/books/us-history/pages/18-4-a-new-american-consumer-culture',
    # Chapter 19
    'https://openstax.org/books/us-history/pages/19-1-urbanization-and-its-challenges',
    'https://openstax.org/books/us-history/pages/19-2-the-african-american-great-migration-and-new-european-immigration',
    'https://openstax.org/books/us-history/pages/19-3-relief-from-the-chaos-of-urban-life',
    'https://openstax.org/books/us-history/pages/19-4-change-reflected-in-thought-and-writing',
    # Chapter 20
    'https://openstax.org/books/us-history/pages/20-1-political-corruption-in-postbellum-america',
    'https://openstax.org/books/us-history/pages/20-2-the-key-political-issues-patronage-tariffs-and-gold',
    'https://openstax.org/books/us-history/pages/20-3-farmers-revolt-in-the-populist-era',
    'https://openstax.org/books/us-history/pages/20-4-social-and-labor-unrest-in-the-1890s',
    # Chapter 21
    'https://openstax.org/books/us-history/pages/21-1-the-origins-of-the-progressive-spirit-in-america',
    'https://openstax.org/books/us-history/pages/21-2-progressivism-at-the-grassroots-level',
    'https://openstax.org/books/us-history/pages/21-3-new-voices-for-women-and-african-americans',
    'https://openstax.org/books/us-history/pages/21-4-progressivism-in-the-white-house',
    # Chapter 22
    'https://openstax.org/books/us-history/pages/22-1-turner-mahan-and-the-roots-of-empire',
    'https://openstax.org/books/us-history/pages/22-2-the-spanish-american-war-and-overseas-empire',
    'https://openstax.org/books/us-history/pages/22-3-economic-imperialism-in-east-asia',
    'https://openstax.org/books/us-history/pages/22-4-roosevelts-big-stick-foreign-policy',
    'https://openstax.org/books/us-history/pages/22-5-tafts-dollar-diplomacy',
    # Chapter 23
    'https://openstax.org/books/us-history/pages/23-1-american-isolationism-and-the-european-origins-of-war',
    'https://openstax.org/books/us-history/pages/23-2-the-united-states-prepares-for-war',
    'https://openstax.org/books/us-history/pages/23-3-a-new-home-front',
    'https://openstax.org/books/us-history/pages/23-4-from-war-to-peace',
    'https://openstax.org/books/us-history/pages/23-5-demobilization-and-its-difficult-aftermath',
    # Chapter 24
    'https://openstax.org/books/us-history/pages/24-1-prosperity-and-the-production-of-popular-entertainment',
    'https://openstax.org/books/us-history/pages/24-2-transformation-and-backlash',
    'https://openstax.org/books/us-history/pages/24-3-a-new-generation',
    'https://openstax.org/books/us-history/pages/24-4-republican-ascendancy-politics-in-the-1920s',
    # Chapter 25
    'https://openstax.org/books/us-history/pages/25-1-the-stock-market-crash-of-1929',
    'https://openstax.org/books/us-history/pages/25-2-president-hoovers-response',
    'https://openstax.org/books/us-history/pages/25-3-the-depths-of-the-great-depression',
    'https://openstax.org/books/us-history/pages/25-4-assessing-the-hoover-years-on-the-eve-of-the-new-deal',
    # Chapter 26
    'https://openstax.org/books/us-history/pages/26-1-the-rise-of-franklin-roosevelt',
    'https://openstax.org/books/us-history/pages/26-2-the-first-new-deal',
    'https://openstax.org/books/us-history/pages/26-3-the-second-new-deal',
    # Chapter 27
    'https://openstax.org/books/us-history/pages/27-1-the-origins-of-war-europe-asia-and-the-united-states',
    'https://openstax.org/books/us-history/pages/27-2-the-home-front',
    'https://openstax.org/books/us-history/pages/27-3-victory-in-the-european-theater',
    'https://openstax.org/books/us-history/pages/27-4-the-pacific-theater-and-the-atomic-bomb',
    # Chapter 28
    'https://openstax.org/books/us-history/pages/28-1-the-challenges-of-peacetime',
    'https://openstax.org/books/us-history/pages/28-2-the-cold-war',
    'https://openstax.org/books/us-history/pages/28-3-the-american-dream',
    'https://openstax.org/books/us-history/pages/28-4-popular-culture-and-mass-media',
    'https://openstax.org/books/us-history/pages/28-5-the-african-american-struggle-for-civil-rights',
    # Chapter 29
    'https://openstax.org/books/us-history/pages/29-1-the-kennedy-promise',
    'https://openstax.org/books/us-history/pages/29-2-lyndon-johnson-and-the-great-society',
    'https://openstax.org/books/us-history/pages/29-3-the-civil-rights-movement-marches-on',
    'https://openstax.org/books/us-history/pages/29-4-challenging-the-status-quo',
    # Chapter 30
    'https://openstax.org/books/us-history/pages/30-1-identity-politics-in-a-fractured-society',
    'https://openstax.org/books/us-history/pages/30-2-coming-apart-coming-together',
    'https://openstax.org/books/us-history/pages/30-3-vietnam-the-downward-spiral',
    'https://openstax.org/books/us-history/pages/30-4-watergate-nixons-domestic-nightmare',
    'https://openstax.org/books/us-history/pages/30-5-jimmy-carter-in-the-aftermath-of-the-storm',
    # Chapter 31
    'https://openstax.org/books/us-history/pages/31-1-the-reagan-revolution',
    'https://openstax.org/books/us-history/pages/31-2-political-and-cultural-fusions',
    'https://openstax.org/books/us-history/pages/31-3-a-new-world-order',
    'https://openstax.org/books/us-history/pages/31-4-bill-clinton-and-the-new-economy',
    # Chapter 32
    'https://openstax.org/books/us-history/pages/32-1-the-war-on-terror',
    'https://openstax.org/books/us-history/pages/32-2-the-domestic-mission',
    'https://openstax.org/books/us-history/pages/32-3-new-century-old-disputes',
    'https://openstax.org/books/us-history/pages/32-4-hope-and-change',
    'https://openstax.org/books/us-history/pages/32-5-political-divides-and-social-movements',
]

# ----------------------------------------------------
# extract_metadata_openstax 함수: OpenStax용 Semantic Chunking (H2 기준)
# ----------------------------------------------------
def extract_metadata_openstax(page_url: str) -> Dict:
    """
    OpenStax 콘텐츠 페이지에서 title(<h1>), source_url 및
    페이지 내부의 <h2> 섹션별 텍스트 청크(chunk_list)를 추출합니다.
    """
    result = {
        "source_url": page_url,
        "title": None,
        "chunk_list": []  # chunk_list가 핵심
    }

    if not can_fetch(page_url):
        print(f"robots disallow: {page_url}", file=sys.stderr)
        return result

    soup = get_soup(page_url)
    if soup is None:
        return result

    # Title (페이지의 <h1>이 챕터/섹션 제목. 예: "1.1 The Americas")
    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        result["title"] = h1.get_text(strip=True)
    else:
        title_tag = soup.find("title") # Fallback
        if title_tag:
            result["title"] = title_tag.get_text(strip=True)

    # Main content: OpenStax는 <div data-type="page"> 사용
    content = soup.find("div", attrs={"data-type": "page"})
    if not content:
        print(f"Warning: Main content 'div[data-type=\"page\"]' not found for {page_url}", file=sys.stderr)
        return result

    # --- H2-Based Semantic Chunking 로직 (Philosophy와 동일) ---
    chunks = []
    # (★수정★) H2가 나오기 전의 첫 텍스트는 'Introduction' 또는 페이지 제목(h1)을 섹션명으로 사용
    current_section_title = result["title"] if result["title"] else "Introduction"
    current_text_list = []

    # 'content' 내부의 모든 관련 태그를 순서대로 순회
    for tag in content.find_all(['h2', 'h3', 'p', 'blockquote', 'ul', 'ol'], recursive=True):

        # (예외 처리) 불필요한 영역(예: 'Key Terms', 'Review Questions' 등) 건너뛰기
        parent_section = tag.find_parent("section", attrs={"data-type": True})
        if parent_section:
            data_type = parent_section.get('data-type', '')
            if any(skip_term in data_type for skip_term in ['key-terms', 'summary', 'review-questions', 'critical-thinking']):
                continue

        # (1) 새 H2 섹션(대주제)을 만났을 때
        if tag.name == 'h2':
            # 그 전까지 수집한 텍스트가 있다면, 이전 섹션 청크로 저장
            if current_text_list:
                chunk_text = "\n".join(current_text_list).strip()
                if len(chunk_text) > 50: # 최소 50자 이상
                    chunks.append({
                        # (★중요★) Philosophy와 메타데이터 키 통일
                        "section_title": current_section_title,
                        "text": chunk_text
                    })

            # 새 섹션 정보로 업데이트
            current_section_title = tag.get_text(" ", strip=True)
            current_text_list = [] # 텍스트 리스트 초기화

        # (2) H2가 아닌 내용물(h3, p, blockquote 등)
        elif tag.name in ['p', 'h3', 'h4', 'blockquote', 'ul', 'ol']:
            tag_text = tag.get_text(" ", strip=True)
            if tag_text:
                # h3/h4의 경우, 제목이라는 것을 명확히 하기 위해 마크업 추가
                if tag.name in ['h3', 'h4']:
                    current_text_list.append(f"\n--- {tag_text} ---\n")
                else:
                    current_text_list.append(tag_text)

    # (3) 루프가 끝난 후, 마지막 H2 섹션의 청크를 저장
    if current_text_list:
        chunk_text = "\n".join(current_text_list).strip()
        if len(chunk_text) > 50:
            chunks.append({
                "section_title": current_section_title,
                "text": chunk_text
            })

    result["chunk_list"] = chunks
    return result

# ----------------------------------------------------
# main 함수: 수동 URL 리스트 순회
# ----------------------------------------------------
def main():
    if not TARGET_HISTORY_CHAPTER_URLS:
        print("Error: 'TARGET_HISTORY_CHAPTER_URLS' 리스트가 비어있습니다.", file=sys.stderr)
        print("스크립트를 열어 목차 페이지에서 챕터 URL을 수동으로 추가해주세요.", file=sys.stderr)
        return

    out_f = open(OUTPUT_FILE, "w", encoding="utf-8")
    count = 0
    urls = TARGET_HISTORY_CHAPTER_URLS

    print(f"Total {len(urls)} pages to scrape...")

    for url in urls:
        print(f"[{count+1}/{len(urls)}] Processing {url}")

        meta = extract_metadata_openstax(url)

        if meta["chunk_list"]: # 청크가 하나라도 있을 때만 저장
            json_line = json.dumps(meta, ensure_ascii=False)
            out_f.write(json_line + "\n")
            out_f.flush()
            count += 1
        else:
            print(f"Warning: No chunks extracted for {url}. Skipping.", file=sys.stderr)

        time.sleep(DELAY_SECONDS)

    out_f.close()
    print(f"\nSuccessfully saved {count} pages (as entries) to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Total 137 pages to scrape...
[1/137] Processing https://openstax.org/books/us-history/pages/1-1-the-americas
[2/137] Processing https://openstax.org/books/us-history/pages/1-2-europe-on-the-brink-of-change
[3/137] Processing https://openstax.org/books/us-history/pages/1-3-west-africa-and-the-role-of-slavery
[4/137] Processing https://openstax.org/books/us-history/pages/2-1-portuguese-exploration-and-spanish-conquest
[5/137] Processing https://openstax.org/books/us-history/pages/2-2-religious-upheavals-in-the-developing-atlantic-world
[6/137] Processing https://openstax.org/books/us-history/pages/2-3-challenges-to-spains-supremacy
[7/137] Processing https://openstax.org/books/us-history/pages/2-4-new-worlds-in-the-americas-labor-commerce-and-the-columbian-exchange
[8/137] Processing https://openstax.org/books/us-history/pages/3-1-spanish-exploration-and-colonial-society
[9/137] Processing https://openstax.org/books/us-history/pages/3-2-colonial-rivalries-dutch-and-french-colonial-ambiti

### preprocessing: 2. World histoy (1)

In [2]:
#!/usr/bin/env python3
"""
scrape_world_history_vol1.py (★ No Selenium Ver. ★)

- (★수정★) TARGET_HISTORY_CHAPTER_URLS 목록에 World History Vol 1 63개 URL을 모두 채워넣었습니다.
- (자동) 각 URL을 requests로 방문하여, 페이지 제목(<h1>)을 'title'로 가져옵니다.
- (자동) 페이지 내부의 <h2> 태그를 기준으로 "Semantic Chunking"을 수행합니다.
- (자동) Philosophy와 동일한 메타데이터 구조로 'history_world_vol1.jsonl' 파일에 저장합니다.
"""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import urllib.robotparser
import time
import json
import sys
from typing import List, Dict

# 설정
BASE_DOMAIN = "https://openstax.org"
OUTPUT_FILE = "history_world_vol1.jsonl" # (★수정★) 덮어쓰기 방지
DELAY_SECONDS = 0.5
MAX_RETRIES = 3

# robots.txt 검사 (OpenStax용)
ROBOTS_TXT = urljoin(BASE_DOMAIN, "/robots.txt")
rp = urllib.robotparser.RobotFileParser()
rp.set_url(ROBOTS_TXT)
try:
    rp.read()
except Exception as e:
    print(f"Warning: cannot read robots.txt at {ROBOTS_TXT}: {e}", file=sys.stderr)

HEADERS = {
    "User-Agent": "SolarPro-RAG-Scraper/1.0 (+https://your.org/contact) Python requests"
}


def can_fetch(url: str) -> bool:
    try:
        parsed = urlparse(url)
        return rp.can_fetch(HEADERS["User-Agent"], parsed.path)
    except Exception:
        return True


def get_soup(url: str, retries: int = MAX_RETRIES) -> BeautifulSoup:
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                return BeautifulSoup(resp.text, "html.parser")
            else:
                print(f"Warning: status {resp.status_code} for {url}", file=sys.stderr)
        except Exception as e:
            print(f"Request error ({attempt}/{retries}) for {url}: {e}", file=sys.stderr)
        time.sleep(2 * attempt)
    return None


# ----------------------------------------------------
# (★수정됨★) World History Vol 1 63개 URL 전체 목록 (Introduction 등 제외)
# ----------------------------------------------------
TARGET_HISTORY_CHAPTER_URLS = [
    # Chapter 1
    'https://openstax.org/books/world-history-volume-1/pages/1-1-developing-a-global-perspective',
    'https://openstax.org/books/world-history-volume-1/pages/1-2-primary-sources',
    'https://openstax.org/books/world-history-volume-1/pages/1-3-causation-and-interpretation-in-history',
    # Chapter 2
    'https://openstax.org/books/world-history-volume-1/pages/2-1-early-human-evolution-and-migration',
    'https://openstax.org/books/world-history-volume-1/pages/2-2-people-in-the-paleolithic-age',
    'https://openstax.org/books/world-history-volume-1/pages/2-3-the-neolithic-revolution',
    # Chapter 3
    'https://openstax.org/books/world-history-volume-1/pages/3-1-early-civilizations',
    'https://openstax.org/books/world-history-volume-1/pages/3-2-ancient-mesopotamia',
    'https://openstax.org/books/world-history-volume-1/pages/3-3-ancient-egypt',
    'https://openstax.org/books/world-history-volume-1/pages/3-4-the-indus-valley-civilization',
    # Chapter 4
    'https://openstax.org/books/world-history-volume-1/pages/4-1-from-old-babylon-to-the-medes',
    'https://openstax.org/books/world-history-volume-1/pages/4-2-egypts-new-kingdom',
    'https://openstax.org/books/world-history-volume-1/pages/4-3-the-persian-empire',
    'https://openstax.org/books/world-history-volume-1/pages/4-4-the-hebrews',
    # Chapter 5
    'https://openstax.org/books/world-history-volume-1/pages/5-1-ancient-china',
    'https://openstax.org/books/world-history-volume-1/pages/5-2-the-steppes',
    'https://openstax.org/books/world-history-volume-1/pages/5-3-korea-japan-and-southeast-asia',
    'https://openstax.org/books/world-history-volume-1/pages/5-4-vedic-india-to-the-fall-of-the-maurya-empire',
    # Chapter 6
    'https://openstax.org/books/world-history-volume-1/pages/6-1-early-mediterranean-peoples',
    'https://openstax.org/books/world-history-volume-1/pages/6-2-ancient-greece',
    'https://openstax.org/books/world-history-volume-1/pages/6-3-the-hellenistic-era',
    'https://openstax.org/books/world-history-volume-1/pages/6-4-the-roman-republic',
    'https://openstax.org/books/world-history-volume-1/pages/6-5-the-age-of-augustus',
    # Chapter 7
    'https://openstax.org/books/world-history-volume-1/pages/7-1-the-daily-life-of-a-roman-family',
    'https://openstax.org/books/world-history-volume-1/pages/7-2-slavery-in-the-roman-empire',
    'https://openstax.org/books/world-history-volume-1/pages/7-3-the-roman-economy-trade-taxes-and-conquest',
    'https://openstax.org/books/world-history-volume-1/pages/7-4-religion-in-the-roman-empire',
    'https://openstax.org/books/world-history-volume-1/pages/7-5-the-regions-of-rome',
    # Chapter 8
    'https://openstax.org/books/world-history-volume-1/pages/8-1-populating-and-settling-the-americas',
    'https://openstax.org/books/world-history-volume-1/pages/8-2-early-cultures-and-civilizations-in-the-americas',
    'https://openstax.org/books/world-history-volume-1/pages/8-3-the-age-of-empires-in-the-americas',
    # Chapter 9
    'https://openstax.org/books/world-history-volume-1/pages/9-1-africas-geography-and-climate',
    'https://openstax.org/books/world-history-volume-1/pages/9-2-the-emergence-of-farming-and-the-bantu-migrations',
    'https://openstax.org/books/world-history-volume-1/pages/9-3-the-kingdom-of-kush',
    'https://openstax.org/books/world-history-volume-1/pages/9-4-north-africas-mediterranean-and-trans-saharan-connections',
    # Chapter 10
    'https://openstax.org/books/world-history-volume-1/pages/10-1-the-eastward-shift',
    'https://openstax.org/books/world-history-volume-1/pages/10-2-the-byzantine-empire-and-persia',
    'https://openstax.org/books/world-history-volume-1/pages/10-3-the-kingdoms-of-aksum-and-himyar',
    'https://openstax.org/books/world-history-volume-1/pages/10-4-the-margins-of-empire',
    # Chapter 11
    'https://openstax.org/books/world-history-volume-1/pages/11-1-the-rise-and-message-of-islam',
    'https://openstax.org/books/world-history-volume-1/pages/11-2-the-arab-islamic-conquests-and-the-first-islamic-states',
    'https://openstax.org/books/world-history-volume-1/pages/11-3-islamization-and-religious-rule-under-islam',
    # Chapter 12
    'https://openstax.org/books/world-history-volume-1/pages/12-1-the-indian-ocean-world-in-the-early-middle-ages',
    'https://openstax.org/books/world-history-volume-1/pages/12-2-east-west-interactions-in-the-early-middle-ages',
    'https://openstax.org/books/world-history-volume-1/pages/12-3-border-states-sogdiana-korea-and-japan',
    # Chapter 13
    'https://openstax.org/books/world-history-volume-1/pages/13-1-the-post-roman-west-in-the-early-middle-ages',
    'https://openstax.org/books/world-history-volume-1/pages/13-2-the-seljuk-migration-and-the-call-from-the-east',
    'https://openstax.org/books/world-history-volume-1/pages/13-3-patriarch-and-papacy-the-church-and-the-call-to-crusade',
    'https://openstax.org/books/world-history-volume-1/pages/13-4-the-crusading-movement',
    # Chapter 14
    'https://openstax.org/books/world-history-volume-1/pages/14-1-song-china-and-the-steppe-peoples',
    'https://openstax.org/books/world-history-volume-1/pages/14-2-chinggis-khan-and-the-early-mongol-empire',
    'https://openstax.org/books/world-history-volume-1/pages/14-3-the-mongol-empire-fragments',
    'https://openstax.org/books/world-history-volume-1/pages/14-4-christianity-and-islam-outside-central-asia',
    # Chapter 15
    'https://openstax.org/books/world-history-volume-1/pages/15-1-culture-and-society-in-medieval-africa',
    'https://openstax.org/books/world-history-volume-1/pages/15-2-medieval-sub-saharan-africa',
    'https://openstax.org/books/world-history-volume-1/pages/15-3-the-people-of-the-sahel',
    # Chapter 16
    'https://openstax.org/books/world-history-volume-1/pages/16-1-asia-north-africa-and-europe-in-the-early-fourteenth-century',
    'https://openstax.org/books/world-history-volume-1/pages/16-2-famine-climate-change-and-migration',
    'https://openstax.org/books/world-history-volume-1/pages/16-3-the-black-death-from-east-to-west',
    'https://openstax.org/books/world-history-volume-1/pages/16-4-the-long-term-effects-of-global-transformation',
    # Chapter 17
    'https://openstax.org/books/world-history-volume-1/pages/17-1-the-ottomans-and-the-mongols',
    'https://openstax.org/books/world-history-volume-1/pages/17-2-from-the-mamluks-to-ming-china',
    'https://openstax.org/books/world-history-volume-1/pages/17-3-gunpowder-and-nomads-in-a-transitional-age',
]

# ----------------------------------------------------
# extract_metadata_openstax 함수: OpenStax용 Semantic Chunking (H2 기준)
# ----------------------------------------------------
def extract_metadata_openstax(page_url: str) -> Dict:
    """
    OpenStax 콘텐츠 페이지에서 title(<h1>), source_url 및
    페이지 내부의 <h2> 섹션별 텍스트 청크(chunk_list)를 추출합니다.
    """
    result = {
        "source_url": page_url,
        "title": None,
        "chunk_list": []  # chunk_list가 핵심
    }

    if not can_fetch(page_url):
        print(f"robots disallow: {page_url}", file=sys.stderr)
        return result

    soup = get_soup(page_url)
    if soup is None:
        return result

    # Title (페이지의 <h1>이 챕터/섹션 제목. 예: "1.1 The Americas")
    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        result["title"] = h1.get_text(strip=True)
    else:
        title_tag = soup.find("title") # Fallback
        if title_tag:
            result["title"] = title_tag.get_text(strip=True)

    # Main content: OpenStax는 <div data-type="page"> 사용
    content = soup.find("div", attrs={"data-type": "page"})
    if not content:
        print(f"Warning: Main content 'div[data-type=\"page\"]' not found for {page_url}", file=sys.stderr)
        return result

    # --- H2-Based Semantic Chunking 로직 (Philosophy와 동일) ---
    chunks = []
    # (★수정★) H2가 나오기 전의 첫 텍스트는 'Introduction' 또는 페이지 제목(h1)을 섹션명으로 사용
    current_section_title = result["title"] if result["title"] else "Introduction"
    current_text_list = []

    # 'content' 내부의 모든 관련 태그를 순서대로 순회
    for tag in content.find_all(['h2', 'h3', 'p', 'blockquote', 'ul', 'ol'], recursive=True):

        # (예외 처리) 불필요한 영역(예: 'Key Terms', 'Review Questions' 등) 건너뛰기
        parent_section = tag.find_parent("section", attrs={"data-type": True})
        if parent_section:
            data_type = parent_section.get('data-type', '')
            if any(skip_term in data_type for skip_term in ['key-terms', 'summary', 'review-questions', 'critical-thinking']):
                continue

        # (1) 새 H2 섹션(대주제)을 만났을 때
        if tag.name == 'h2':
            # 그 전까지 수집한 텍스트가 있다면, 이전 섹션 청크로 저장
            if current_text_list:
                chunk_text = "\n".join(current_text_list).strip()
                if len(chunk_text) > 50: # 최소 50자 이상
                    chunks.append({
                        # (★중요★) Philosophy와 메타데이터 키 통일
                        "section_title": current_section_title,
                        "text": chunk_text
                    })

            # 새 섹션 정보로 업데이트
            current_section_title = tag.get_text(" ", strip=True)
            current_text_list = [] # 텍스트 리스트 초기화

        # (2) H2가 아닌 내용물(h3, p, blockquote 등)
        elif tag.name in ['p', 'h3', 'h4', 'blockquote', 'ul', 'ol']:
            tag_text = tag.get_text(" ", strip=True)
            if tag_text:
                # h3/h4의 경우, 제목이라는 것을 명확히 하기 위해 마크업 추가
                if tag.name in ['h3', 'h4']:
                    current_text_list.append(f"\n--- {tag_text} ---\n")
                else:
                    current_text_list.append(tag_text)

    # (3) 루프가 끝난 후, 마지막 H2 섹션의 청크를 저장
    if current_text_list:
        chunk_text = "\n".join(current_text_list).strip()
        if len(chunk_text) > 50:
            chunks.append({
                "section_title": current_section_title,
                "text": chunk_text
            })

    result["chunk_list"] = chunks
    return result

# ----------------------------------------------------
# main 함수: 수동 URL 리스트 순회
# ----------------------------------------------------
def main():
    if not TARGET_HISTORY_CHAPTER_URLS:
        print("Error: 'TARGET_HISTORY_CHAPTER_URLS' 리스트가 비어있습니다.", file=sys.stderr)
        print("스크립트를 열어 목차 페이지에서 챕터 URL을 수동으로 추가해주세요.", file=sys.stderr)
        return

    out_f = open(OUTPUT_FILE, "w", encoding="utf-8")
    count = 0
    urls = TARGET_HISTORY_CHAPTER_URLS

    print(f"Total {len(urls)} pages to scrape...")

    for url in urls:
        print(f"[{count+1}/{len(urls)}] Processing {url}")

        meta = extract_metadata_openstax(url)

        if meta["chunk_list"]: # 청크가 하나라도 있을 때만 저장
            json_line = json.dumps(meta, ensure_ascii=False)
            out_f.write(json_line + "\n")
            out_f.flush()
            count += 1
        else:
            print(f"Warning: No chunks extracted for {url}. Skipping.", file=sys.stderr)

        time.sleep(DELAY_SECONDS)

    out_f.close()
    print(f"\nSuccessfully saved {count} pages (as entries) to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Total 63 pages to scrape...
[1/63] Processing https://openstax.org/books/world-history-volume-1/pages/1-1-developing-a-global-perspective
[2/63] Processing https://openstax.org/books/world-history-volume-1/pages/1-2-primary-sources
[3/63] Processing https://openstax.org/books/world-history-volume-1/pages/1-3-causation-and-interpretation-in-history
[4/63] Processing https://openstax.org/books/world-history-volume-1/pages/2-1-early-human-evolution-and-migration
[5/63] Processing https://openstax.org/books/world-history-volume-1/pages/2-2-people-in-the-paleolithic-age
[6/63] Processing https://openstax.org/books/world-history-volume-1/pages/2-3-the-neolithic-revolution
[7/63] Processing https://openstax.org/books/world-history-volume-1/pages/3-1-early-civilizations
[8/63] Processing https://openstax.org/books/world-history-volume-1/pages/3-2-ancient-mesopotamia
[9/63] Processing https://openstax.org/books/world-history-volume-1/pages/3-3-ancient-egypt
[10/63] Processing https://openstax.o

### preprocessing: 3. World histoy (2)

In [3]:
#!/usr/bin/env python3
"""
scrape_world_history_vol2.py (★ No Selenium Ver. ★)

- (★수정★) TARGET_HISTORY_CHAPTER_URLS 목록에 World History Vol 2 59개 URL을 (수동으로) 모두 채워넣었습니다.
- (자동) 각 URL을 requests로 방문하여, 페이지 제목(<h1>)을 'title'로 가져옵니다.
- (자동) 페이지 내부의 <h2> 태그를 기준으로 "Semantic Chunking"을 수행합니다.
- (자동) Philosophy와 동일한 메타데이터 구조로 'history_world_vol2.jsonl' 파일에 저장합니다.
"""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import urllib.robotparser
import time
import json
import sys
from typing import List, Dict
import re # (URL 자동 생성을 삭제했으므로 re는 사실상 필요 없지만 유지)

# 설정
BASE_DOMAIN = "https://openstax.org"
OUTPUT_FILE = "history_world_vol2.jsonl" # 덮어쓰기 방지
DELAY_SECONDS = 0.5
MAX_RETRIES = 3

# robots.txt 검사 (OpenStax용)
ROBOTS_TXT = urljoin(BASE_DOMAIN, "/robots.txt")
rp = urllib.robotparser.RobotFileParser()
rp.set_url(ROBOTS_TXT)
try:
    rp.read()
except Exception as e:
    print(f"Warning: cannot read robots.txt at {ROBOTS_TXT}: {e}", file=sys.stderr)

HEADERS = {
    "User-Agent": "SolarPro-RAG-Scraper/1.0 (+https://your.org/contact) Python requests"
}


def can_fetch(url: str) -> bool:
    try:
        parsed = urlparse(url)
        return rp.can_fetch(HEADERS["User-Agent"], parsed.path)
    except Exception:
        return True


def get_soup(url: str, retries: int = MAX_RETRIES) -> BeautifulSoup:
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                return BeautifulSoup(resp.text, "html.parser")
            else:
                print(f"Warning: status {resp.status_code} for {url}", file=sys.stderr)
        except Exception as e:
            print(f"Request error ({attempt}/{retries}) for {url}: {e}", file=sys.stderr)
        time.sleep(2 * attempt)
    return None


# ----------------------------------------------------
# (★수정됨★) World History Vol 2 59개 URL 전체 목록 (수동 생성)
# ----------------------------------------------------
BASE_URL = "https://openstax.org/books/world-history-volume-2/pages/"
TARGET_HISTORY_CHAPTER_URLS = [
    # Chapter 1
    f"{BASE_URL}1-1-developing-a-global-perspective",
    f"{BASE_URL}1-2-primary-sources",
    f"{BASE_URL}1-3-causation-and-interpretation-in-history",
    # Chapter 2
    f"{BASE_URL}2-1-india-and-international-connections",
    f"{BASE_URL}2-2-the-malacca-sultanate",
    f"{BASE_URL}2-3-exchange-in-east-asia",
    # Chapter 3
    f"{BASE_URL}3-1-the-roots-of-african-trade",
    f"{BASE_URL}3-2-the-songhai-empire",
    f"{BASE_URL}3-3-the-swahili-coast",
    f"{BASE_URL}3-4-the-trans-saharan-slave-trade",
    # Chapter 4
    f"{BASE_URL}4-1-a-connected-islamic-world",
    f"{BASE_URL}4-2-the-ottoman-empire",
    f"{BASE_URL}4-3-the-safavid-empire",
    # Chapter 5
    f"{BASE_URL}5-1-the-protestant-reformation",
    f"{BASE_URL}5-2-crossing-the-atlantic",
    f"{BASE_URL}5-3-the-mercantilist-economy",
    f"{BASE_URL}5-4-the-atlantic-slave-trade",
    # Chapter 6
    f"{BASE_URL}6-1-european-colonization-in-the-americas",
    f"{BASE_URL}6-2-the-rise-of-a-global-economy",
    f"{BASE_URL}6-3-capitalism-and-the-first-industrial-revolution",
    # Chapter 7
    f"{BASE_URL}7-1-the-enlightenment",
    f"{BASE_URL}7-2-the-exchange-of-ideas-in-the-public-sphere",
    f"{BASE_URL}7-3-revolutions-america-france-and-haiti",
    f"{BASE_URL}7-4-nationalism-liberalism-conservatism-and-the-political-order",
    # Chapter 8
    f"{BASE_URL}8-1-revolution-for-whom",
    f"{BASE_URL}8-2-spanish-north-america",
    f"{BASE_URL}8-3-spanish-south-america",
    f"{BASE_URL}8-4-portuguese-south-america",
    # Chapter 9
    f"{BASE_URL}9-1-the-second-industrial-revolution",
    f"{BASE_URL}9-2-motives-and-means-of-imperialism",
    f"{BASE_URL}9-3-colonial-empires",
    f"{BASE_URL}9-4-exploitation-and-resistance",
    # Chapter 10
    f"{BASE_URL}10-1-inventions-innovations-and-mechanization",
    f"{BASE_URL}10-2-life-in-the-industrial-city",
    f"{BASE_URL}10-3-coerced-and-semicoerced-labor",
    f"{BASE_URL}10-4-communities-in-diaspora",
    f"{BASE_URL}10-5-regulation-reform-and-revolutionary-ideologies",
    # Chapter 11
    f"{BASE_URL}11-1-alliances-expansion-and-conflict",
    f"{BASE_URL}11-2-the-collapse-of-the-ottomans-and-the-coming-of-war",
    f"{BASE_URL}11-3-total-war",
    f"{BASE_URL}11-4-war-on-the-homefront",
    f"{BASE_URL}11-5-the-war-ends",
    # Chapter 12
    f"{BASE_URL}12-1-recovering-from-world-war-i",
    f"{BASE_URL}12-2-the-formation-of-the-soviet-union",
    f"{BASE_URL}12-3-the-great-depression",
    f"{BASE_URL}12-4-old-empires-and-new-colonies",
    f"{BASE_URL}12-5-resistance-civil-rights-and-democracy",
    # Chapter 13
    f"{BASE_URL}13-1-an-unstable-peace",
    f"{BASE_URL}13-2-theaters-of-war",
    f"{BASE_URL}13-3-keeping-the-home-fires-burning",
    f"{BASE_URL}13-4-out-of-the-ashes",
    # Chapter 14
    f"{BASE_URL}14-1-the-cold-war-begins",
    f"{BASE_URL}14-2-the-spread-of-communism",
    f"{BASE_URL}14-3-the-non-aligned-movement",
    f"{BASE_URL}14-4-global-tensions-and-decolonization",
    f"{BASE_URL}14-5-a-new-world-order",
    # Chapter 15
    f"{BASE_URL}15-1-a-global-economy",
    f"{BASE_URL}15-2-debates-about-the-environment",
    f"{BASE_URL}15-3-science-and-technology-for-todays-world",
    f"{BASE_URL}15-4-ongoing-problems-and-solutions",
]
# ----------------------------------------------------


# ----------------------------------------------------
# extract_metadata_openstax 함수: OpenStax용 Semantic Chunking (H2 기준)
# ----------------------------------------------------
def extract_metadata_openstax(page_url: str) -> Dict:
    """
    OpenStax 콘텐츠 페이지에서 title(<h1>), source_url 및
    페이지 내부의 <h2> 섹션별 텍스트 청크(chunk_list)를 추출합니다.
    """
    result = {
        "source_url": page_url,
        "title": None,
        "chunk_list": []  # chunk_list가 핵심
    }

    if not can_fetch(page_url):
        print(f"robots disallow: {page_url}", file=sys.stderr)
        return result

    soup = get_soup(page_url)
    if soup is None:
        return result

    # Title (페이지의 <h1>이 챕터/섹션 제목. 예: "1.1 The Americas")
    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        result["title"] = h1.get_text(strip=True)
    else:
        title_tag = soup.find("title") # Fallback
        if title_tag:
            result["title"] = title_tag.get_text(strip=True)

    # Main content: OpenStax는 <div data-type="page"> 사용
    content = soup.find("div", attrs={"data-type": "page"})
    if not content:
        print(f"Warning: Main content 'div[data-type=\"page\"]' not found for {page_url}", file=sys.stderr)
        return result

    # --- H2-Based Semantic Chunking 로직 (Philosophy와 동일) ---
    chunks = []
    # (★수정★) H2가 나오기 전의 첫 텍스트는 'Introduction' 또는 페이지 제목(h1)을 섹션명으로 사용
    current_section_title = result["title"] if result["title"] else "Introduction"
    current_text_list = []

    # 'content' 내부의 모든 관련 태그를 순서대로 순회
    for tag in content.find_all(['h2', 'h3', 'p', 'blockquote', 'ul', 'ol'], recursive=True):

        # (예외 처리) 불필요한 영역(예: 'Key Terms', 'Review Questions' 등) 건너뛰기
        parent_section = tag.find_parent("section", attrs={"data-type": True})
        if parent_section:
            data_type = parent_section.get('data-type', '')
            if any(skip_term in data_type for skip_term in ['key-terms', 'summary', 'review-questions', 'critical-thinking']):
                continue

        # (1) 새 H2 섹션(대주제)을 만났을 때
        if tag.name == 'h2':
            # 그 전까지 수집한 텍스트가 있다면, 이전 섹션 청크로 저장
            if current_text_list:
                chunk_text = "\n".join(current_text_list).strip()
                if len(chunk_text) > 50: # 최소 50자 이상
                    chunks.append({
                        # (★중요★) Philosophy와 메타데이터 키 통일
                        "section_title": current_section_title,
                        "text": chunk_text
                    })

            # 새 섹션 정보로 업데이트
            current_section_title = tag.get_text(" ", strip=True)
            current_text_list = [] # 텍스트 리스트 초기화

        # (2) H2가 아닌 내용물(h3, p, blockquote 등)
        elif tag.name in ['p', 'h3', 'h4', 'blockquote', 'ul', 'ol']:
            tag_text = tag.get_text(" ", strip=True)
            if tag_text:
                # h3/h4의 경우, 제목이라는 것을 명확히 하기 위해 마크업 추가
                if tag.name in ['h3', 'h4']:
                    current_text_list.append(f"\n--- {tag_text} ---\n")
                else:
                    current_text_list.append(tag_text)

    # (3) 루프가 끝난 후, 마지막 H2 섹션의 청크를 저장
    if current_text_list:
        chunk_text = "\n".join(current_text_list).strip()
        if len(chunk_text) > 50:
            chunks.append({
                "section_title": current_section_title,
                "text": chunk_text
            })

    result["chunk_list"] = chunks
    return result

# ----------------------------------------------------
# main 함수: 수동 URL 리스트 순회
# ----------------------------------------------------
def main():
    if not TARGET_HISTORY_CHAPTER_URLS:
        print("Error: 'TARGET_HISTORY_CHAPTER_URLS' 리스트가 비어있습니다.", file=sys.stderr)
        print("스크립트를 열어 목차 페이지에서 챕터 URL을 수동으로 추가해주세요.", file=sys.stderr)
        return

    out_f = open(OUTPUT_FILE, "w", encoding="utf-8")
    count = 0
    urls = TARGET_HISTORY_CHAPTER_URLS

    print(f"Total {len(urls)} pages to scrape...")

    for url in urls:
        print(f"[{count+1}/{len(urls)}] Processing {url}")

        # (★수정★) 'page_url' -> 'url'
        meta = extract_metadata_openstax(url)

        if meta["chunk_list"]: # 청크가 하나라도 있을 때만 저장
            json_line = json.dumps(meta, ensure_ascii=False)
            out_f.write(json_line + "\n")
            out_f.flush()
            count += 1
        else:
            print(f"Warning: No chunks extracted for {url}. Skipping.", file=sys.stderr)

        time.sleep(DELAY_SECONDS)

    out_f.close()
    print(f"\nSuccessfully saved {count} pages (as entries) to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Total 60 pages to scrape...
[1/60] Processing https://openstax.org/books/world-history-volume-2/pages/1-1-developing-a-global-perspective
[2/60] Processing https://openstax.org/books/world-history-volume-2/pages/1-2-primary-sources
[3/60] Processing https://openstax.org/books/world-history-volume-2/pages/1-3-causation-and-interpretation-in-history
[4/60] Processing https://openstax.org/books/world-history-volume-2/pages/2-1-india-and-international-connections
[5/60] Processing https://openstax.org/books/world-history-volume-2/pages/2-2-the-malacca-sultanate
[6/60] Processing https://openstax.org/books/world-history-volume-2/pages/2-3-exchange-in-east-asia
[7/60] Processing https://openstax.org/books/world-history-volume-2/pages/3-1-the-roots-of-african-trade
[8/60] Processing https://openstax.org/books/world-history-volume-2/pages/3-2-the-songhai-empire
[9/60] Processing https://openstax.org/books/world-history-volume-2/pages/3-3-the-swahili-coast
[10/60] Processing https://openstax.or

##  solar embedding 사용하여 청크를 임베딩. FAISS 인덱스 생성

세 jsonl 파일을 하나로 합치기

In [4]:
import json
import os

# (★수정★) 스크래핑으로 생성된 3개 History 파일 목록
history_files = [
    'history.jsonl',          # U.S. History (scrape_history_content.py)
    'history_world_vol1.jsonl', # (scrape_world_history_vol1.py)
    'history_world_vol2.jsonl'  # (scrape_world_history_vol2.py)
]

output_file = 'history_all.jsonl'
total_entries = 0
total_chunks = 0

print(f"Starting merge into '{output_file}'...")

with open(output_file, 'w', encoding='utf-8') as outfile:
    for fname in history_files:
        if not os.path.exists(fname):
            print(f"Warning: File '{fname}' not found. Skipping.")
            continue

        print(f"Processing '{fname}'...")
        with open(fname, 'r', encoding='utf-8') as infile:
            for line in infile:
                # JSONL 파일을 한 줄씩 읽어서 그대로 출력 파일에 씀
                outfile.write(line)

                # (통계용)
                try:
                    entry = json.loads(line)
                    total_entries += 1
                    total_chunks += len(entry.get('chunk_list', []))
                except json.JSONDecodeError:
                    print(f"Warning: Could not decode JSON line in {fname}: {line[:50]}...")

print("\n--- Merge Complete ---")
print(f"Total entries (pages) merged: {total_entries}")
print(f"Total chunks (sections) merged: {total_chunks}")
print(f"Saved to: {output_file}")

Starting merge into 'history_all.jsonl'...
Processing 'history.jsonl'...
Processing 'history_world_vol1.jsonl'...
Processing 'history_world_vol2.jsonl'...

--- Merge Complete ---
Total entries (pages) merged: 260
Total chunks (sections) merged: 1685
Saved to: history_all.jsonl


In [5]:
from google.colab import userdata
userdata.get('UPSTAGE_API_KEY')

'up_VYzFNHEoEJPfAwYUNp5v9n1CPnMOm'

In [3]:
!pip install jsonlines
!pip install langchain_community
!pip install langchain_upstage

Collecting langchain_upstage
  Downloading langchain_upstage-0.7.4-py3-none-any.whl.metadata (3.3 kB)
Collecting langchain-core<0.4.0,>=0.3.78 (from langchain_upstage)
  Downloading langchain_core-0.3.79-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-openai<0.4.0,>=0.3.34 (from langchain_upstage)
  Downloading langchain_openai-0.3.35-py3-none-any.whl.metadata (2.4 kB)
Collecting pypdf<5.0.0,>=4.2.0 (from langchain_upstage)
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting tokenizers<0.21.0,>=0.20.0 (from langchain_upstage)
  Downloading tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading langchain_upstage-0.7.4-py3-none-any.whl (25 kB)
Downloading langchain_core-0.3.79-py3-none-any.whl (449 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m449.8/449.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_openai-0.3.35-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━

In [3]:
#!/usr/bin/env python3
"""
create_faiss_index_history.py (Solar Embedding)

- (★수정★) 'history_all.jsonl' (병합된 파일)을 읽어옵니다.
- 각 청크(섹션)를 로드합니다.
- (안전 장치) 만약 섹션 텍스트가 1000자를 넘으면, 1000자 단위로 더 잘게 자릅니다.
- 'title', 'source_url', 'section_title' 메타데이터를 모두 보존합니다.
- (★수정★) Upstage Solar Embedding 모델을 사용하여 모든 청크를 임베딩합니다.
- (★수정★) 'faiss_index_history_solar'라는 이름으로 로컬 FAISS 인덱스를 저장합니다.
"""

# ----------------------------------------------------
# 0. Colab에 필수 라이브러리 설치
# ----------------------------------------------------
import os
# Colab 환경에서 라이브러리 설치
try:
    import google.colab
    print("Installing libraries for Colab environment...")
    # Ensure faiss-cpu is installed and available directly in the Colab environment
    !pip install -q faiss-cpu
    # Re-install other necessary packages to ensure all dependencies are met and aligned
    # langchain and langchain_community are already specified in the previous cell's output
    # but re-installing here helps resolve any potential path/version issues after faiss installation.
    !pip install -q langchain langchain_community jsonlines langchain_upstage
    print("Installation complete.")
except ImportError:
    print("Not running in Colab. Skipping auto-installation.")


import jsonlines
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_upstage import UpstageEmbeddings
import sys
import time
from google.colab import userdata # Added for API key retrieval

# ----------------------------------------------------
# 1. 설정값 (★History용으로 수정됨★)
# ----------------------------------------------------
JSONL_FILE = "history_all.jsonl"             # 입력 파일 (방금 병합한 파일)
INDEX_NAME = "faiss_index_history_solar"    # 저장할 FAISS 인덱스 이름 (Modified)

# "Safety Net" 청킹 설정 (H2 섹션이 너무 클 경우 대비)
CHUNK_SIZE = 1000   # 청크 최대 글자 수
CHUNK_OVERLAP = 100 # 청크 겹침

# ----------------------------------------------------
# 2. 임베딩 모델 로드 (Upstage Solar Embedding)
# ----------------------------------------------------
print("Loading embedding model (solar-embedding-1-large)...")
UPSTAGE_API_KEY = userdata.get('UPSTAGE_API_KEY') # Get API key from Colab secrets
if not UPSTAGE_API_KEY:
    raise ValueError("UPSTAGE_API_KEY not found in Colab secrets. Please set it.")

embedding_model = UpstageEmbeddings(
    model="solar-embedding-1-large",
    upstage_api_key=UPSTAGE_API_KEY
)
print("Embedding model (solar-embedding-1-large) loaded.")

# ----------------------------------------------------
# 3. JSONL 로드 및 '안전 장치' 청킹
# ----------------------------------------------------
print(f"Loading '{JSONL_FILE}' and applying safety net chunking...")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

all_final_chunks = [] # 최종적으로 FAISS에 들어갈 Document 객체 리스트

try:
    with jsonlines.open(JSONL_FILE, 'r') as reader:
        for entry in reader:
            # (1) 기본 메타데이터 (페이지 레벨)
            base_metadata = {
                "source": entry.get("source_url", "N/A"),
                "title": entry.get("title", "N/A"),
            }

            # (2) Semantic Chunking된 'chunk_list' 순회
            for chunk in entry.get("chunk_list", []):
                section_text = chunk.get("text")
                section_title = chunk.get("section_title", "N/A")

                if not section_text:
                    continue

                # (3) H2 섹션 텍스트가 CHUNK_SIZE(1000자)를 넘을 경우,
                #     text_splitter가 이 텍스트를 더 작은 '미니 청크'로 자름
                split_texts = text_splitter.split_text(section_text)

                # (4) 이 '미니 청크'들을 Document 객체로 변환
                for text_piece in split_texts:
                    # 메타데이터에 'section' 정보를 추가
                    final_metadata = base_metadata.copy()
                    final_metadata["section"] = section_title

                    new_doc = Document(page_content=text_piece, metadata=final_metadata)
                    all_final_chunks.append(new_doc)

except FileNotFoundError:
    print(f"Error: '{JSONL_FILE}' not found. Please run 'merge_jsonl.py' first.")
    sys.exit()

print(f"Total 'mini-chunks' to be indexed: {len(all_final_chunks)}")

# ----------------------------------------------------
# 4. FAISS 임베딩 및 저장
# ----------------------------------------------------
if all_final_chunks:
    print("Starting FAISS index creation (using solar-embedding-1-large)... (This may take a long time)")
    start_time = time.time()

    # FAISS.from_documents()를 사용하면
    # 텍스트 청크는 임베딩되고, 메타데이터는 그대로 벡터 스토어에 저장됩니다.
    db_history = FAISS.from_documents(all_final_chunks, embedding_model)

    end_time = time.time()
    print(f"FAISS index created successfully in {end_time - start_time:.2f} seconds.")

    # 생성된 인덱스를 파일로 저장
    db_history.save_local(INDEX_NAME)

    print(f"FAISS index saved to folder: '{INDEX_NAME}'")
else:
    print("No chunks were created. FAISS index not built.")


Installing libraries for Colab environment...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m122.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement upstage-langchain (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for upstage-langchain[0m[31m
[0mInstallation complete.
Loading embedding model (solar-embedding-1-large)...
Embedding model (solar-embedding-1-large) loaded.
Loading 'history_all.jsonl' and applying safety net chunking...
Total 'mini-chunks' to be indexed: 8831
Starting FAISS index creation (using solar-embedding-1-large)... (This may take a long time)
FAISS index created successfully in 935.14 seconds.
FAISS index saved to folder: 'faiss_index_history_solar'
