In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline
from dotenv import load_dotenv
import torch
import os
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
# os.environ["HF_HOME"] = "./cache/"

# pip install langchain-huggingface sentence-transformers


# model_name = "intfloat/multilingual-e5-large-instruct"
# model_name = "intfloat/multilingual-e5-large"



load_dotenv()

True

In [2]:
# step 1 : load document
docs = PyMuPDFLoader("SPRI_AI_Brief_2023년12월호_F.pdf").load()

# step 2 : split document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)

# step 3 : Embedding
# embeddings = OpenAIEmbeddings()
hf_embeddings = HuggingFaceEmbeddings(model_name = "BAAI/bge-m3", model_kwargs={"device": "cuda"}, encode_kwargs={"normalize_embeddings": True},)

# step 4 : vector DB
try:
    vectorstore = FAISS.load_local(
        folder_path="faiss_db",
        index_name="faiss_index",
        embeddings=hf_embeddings,
        allow_dangerous_deserialization=True,
    )
except:
    vectorstore = FAISS.from_documents(documents=split_documents, embedding=hf_embeddings)
    vectorstore.save_local("faiss_db", "faiss_index")

# vectorstore.add_documents(new_split_documents)
# vectorstroe.save_local("faiss_db", "faiss_index")




In [3]:
import torch

# step 5 : Retriever Search
retriever = vectorstore.as_retriever()

# step 6 : generate prompt
prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Answer in Korean.

#Question:
{question}

#Context:
{context}

#Answer:"""
)

# step 7 : LLM
# llm = ChatOpenAI(model="gpt-5-nano", temperature=0)
# llm = ChatOllama(model="gemma3:4b-it-q4_K_M", temperature=0, base_url="http://localhost:11434")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 8bit 양자화 설정
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    quantization_config=quantization_config,  # 8bit 양자화
    device_map="cuda:0"  # 명시적 GPU 설정
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=40,
    temperature=0.01,  # 거의 0에 가까운 값 (일관성 유지)
    do_sample=True,    # 샘플링 활성화
    return_full_text=False,
    # device=0,  # device_map="cuda:0" 사용 시 제거
    pad_token_id=tokenizer.eos_token_id,  # 패딩 토큰 명시적 설정
)
llm = HuggingFacePipeline(pipeline=pipe)

# step 8: chain
def format_docs(docs):
    return "\n\n".join(
        f"[page {d.metadata.get('page', 0) + 1}] {d.page_content}" for d in docs
    )
chain = (
    {"context": retriever | RunnableLambda(format_docs), "question":RunnablePassthrough()}
    | prompt | llm | StrOutputParser()
)

result = chain.invoke("삼성이 만든 생성AI 의 이름은 무엇인가요?")
print(result)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


 
삼성 가우스의 이름은 '삼성 가우스'입니다. 

(삼성 가우스는 언어, 코드, 이미지의 3개 모델로 구성된 자체 개발 생성


In [4]:
result = chain.invoke("삼성이 만든 생성AI 의 이름은 무엇인가요?")
print(result)

 
삼성 가우스의 이름은 '삼성 가우스'입니다. 

(삼성 가우스는 언어, 코드, 이미지의 3개 모델로 구성된 자체 개발 생성


In [None]:
result = ""
for chunk in chain.stream("삼성이 만든 생성AI 의 이름은 무엇인가요?"):
    result += chunk
    print(chunk, end="", flush=True)

print(f'\n\n[FINAL] {result}')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Exception in thread Thread-8 (generate):
Traceback (most recent call last):
  File "c:\Python312\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Python312\Lib\threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Python312\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\transformers\generation\utils.py", line 2634, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\transformers\generation\utils.py", line 3615, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^

In [None]:
# ollm = ChatOllama(model="gemma3:1b", temperature=0, base_url="http://localhost:11434")
# ollm.invoke("hello")

AIMessage(content='Hello there! How can I help you today? 😊 \n\nDo you have any questions for me, or would you like to chat about something?', additional_kwargs={}, response_metadata={'model': 'gemma3:1b', 'created_at': '2025-09-22T09:02:35.394489731Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1459884529, 'load_duration': 95778883, 'prompt_eval_count': 10, 'prompt_eval_duration': 84568875, 'eval_count': 31, 'eval_duration': 1278073101, 'model_name': 'gemma3:1b'}, id='run--2182958f-2b44-477f-a5b8-fe8ecbdc83cb-0', usage_metadata={'input_tokens': 10, 'output_tokens': 31, 'total_tokens': 41})

In [None]:
'''
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
'''

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|>
