In [1]:
# pip install marker-pdf

In [2]:
import os
import re
import torch
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline, LlamaCpp
from langchain_core.documents import Document
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
import pypdfium2 as pdfium
from llama_cpp import Llama


os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [3]:
file_path = "SPRI_AI_Brief_2023년12월호_F.pdf"
num_chunk_size = 1000
num_chunk_overlap = 50

# step 1 : load document using marker-pdf
config_parser = ConfigParser({
    "output_format": "markdown",
    "paginate_output": True  # 페이지 구분 활성화
})
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
)

rendered = converter(file_path)

# 페이지별로 분리하여 LangChain Document로 변환
pages = re.split(r'\{(\d+)\}-+\n', rendered.markdown)
docs = []
for i in range(1, len(pages), 2):
    page_num = int(pages[i])
    content = pages[i + 1] if i + 1 < len(pages) else ""
    if content.strip():  # 빈 페이지 제외
        docs.append(Document(
            page_content=content.strip(),
            metadata={"source": file_path, "page": page_num}
        ))

# step 2 : split document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=num_chunk_size, chunk_overlap=num_chunk_overlap)
split_documents = text_splitter.split_documents(docs)

# step 3 : Embedding
# embeddings = OpenAIEmbeddings()
hf_embeddings = HuggingFaceEmbeddings(model_name = "BAAI/bge-m3", model_kwargs={"device": "cuda"}, encode_kwargs={"normalize_embeddings": True},)

# step 4 : vector DB
try:
    vectorstore = FAISS.load_local(
        folder_path="faiss_db",
        index_name="faiss_index",
        embeddings=hf_embeddings,
        allow_dangerous_deserialization=True,
    )
except:
    vectorstore = FAISS.from_documents(documents=split_documents, embedding=hf_embeddings)
    vectorstore.save_local("faiss_db", "faiss_index")

# vectorstore.add_documents(new_split_documents)
# vectorstroe.save_local("faiss_db", "faiss_index")

    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    
Recognizing Layout: 100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
Running OCR Error Detection: 100%|██████████| 2/2 [00:00<00:00,  8.30it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.99it/s]
Recognizing Text: 100%|██████████| 2/2 [00:00<00:00,  2.18it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]
Detecting bboxes: 0it [00:00, ?it/s]


In [4]:
from huggingface_hub import hf_hub_download

# step 5 : Retriever Search
retriever = vectorstore.as_retriever()

# step 6 : generate prompt template
prompt_template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Answer in Korean.

#Question:
{question}

#Context:
{context}

#Answer:"""

# step 7 : LLM (llama_cpp 직접 사용)
llm = Llama.from_pretrained(
    repo_id="unsloth/gpt-oss-20b-GGUF",
    filename="gpt-oss-20b-Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=16384,
    verbose=False
)

# step 8: format_docs
def format_docs(docs):
    return "\n\n".join(
        f"[page {d.metadata.get('page', 0) + 1}] {d.page_content}" for d in docs
    )

# step 9: RAG 질의 함수 (스트리밍)
def ask(question: str):
    # retrieve
    docs = retriever.invoke(question)
    context = format_docs(docs)
    
    # prompt 생성
    prompt = prompt_template.format(question=question, context=context)
    
    # 스트리밍 출력
    output = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        stream=True
    )
    
    for chunk in output:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            print(delta["content"], end="", flush=True)
    print()  # 줄바꿈

llama_context: n_ctx_per_seq (16384) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)


In [7]:
ask("삼성이 만든 생성AI 의 이름은 무엇인가요?")

<|channel|>analysis<|message|>We need to answer: "삼성이 만든 생성AI 의 이름은 무엇인가요?" The context says: 삼성전자 자체 개발 생성 AI '삼성 가우스'. So answer: 삼성 가우스. Provide in Korean.<|end|><|start|>assistant<|channel|>final<|message|>삼성 가우스입니다.


In [8]:
ask("AI 행동 강령에 대해 알려줘")

<|channel|>analysis<|message|>We need to answer: "AI 행동 강령에 대해 알려줘" (Tell me about AI Code of Conduct). Use the provided context. Summarize the key points: G7's International Code of Conduct for Advanced AI Systems, established at Hiroshima AI Process 2023-10-30. It includes risk identification and mitigation, transparency, accountability, information sharing, collaboration, security controls, content verification, source verification. Also mention US executive order on safe and trustworthy AI, but question is about AI 행동 강령. So focus on G7. Provide details: aims, key provisions, etc. Provide in Korean.<|end|><|start|>assistant<|channel|>final<|message|>**AI 행동 강령(International Code of Conduct for Advanced AI Systems)**  
*2023년 10월 30일, ‘히로시마 AI 프로세스’에서 G7(미국·일본·독일·영국·프랑스·이탈리아·캐나다) 주요 7개국이 합의한 국제적 규범입니다.*

| 핵심 내용 | 구체적 조치 |
|---|---|
| **위험 식별·완화** | • AI 수명주기 전반에 걸쳐 위험을 평가하고 완화 조치를 채택<br>• 출시·배포 후 발생한 취약점·오용·오용 유형을 파악해 즉각적 대응 |
| **투명성·책임성** | • AI 성능·한계 공개<br>• 적절·부적절 사용 영역을 명시해 책임