In [1]:
# Answer questions using embedded RAG.
# When a more detailed question is needed, have the assistant ask the user a follow-up question.

In [1]:
import os
import sys
from dotenv import load_dotenv
from pathlib import Path
import uuid

from llama_cpp import Llama
from langchain_community.chat_models import ChatLlamaCpp
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableConfig

from langgraph.graph import END, START, StateGraph
from typing import Annotated, List, Iterable, Tuple, Optional, Any
from typing_extensions import TypedDict
from langgraph.graph.message import add_messages
from IPython.display import display, Image

In [2]:
# Read variables from a .env file into the process environment.
load_dotenv()

# Set the Hugging Face offline flags before any model is loaded
# (presumably so already-cached files are used without network access — confirm).
os.environ.update(HF_HUB_OFFLINE="1", TRANSFORMERS_OFFLINE="1")

In [2]:
# Hugging Face repository and the quantized GGUF file to load from it.
model_id, gguf_file = "unsloth/gpt-oss-20b-GGUF", "gpt-oss-20b-Q4_K_M.gguf"

print(f"모델 로딩 중... ({gguf_file})")

# Load the GGUF model through llama-cpp-python.
llm = Llama.from_pretrained(
    filename=gguf_file,
    repo_id=model_id,
    verbose=False,     # keep llama.cpp's loading log quiet
    n_ctx=32768,       # context window size in tokens
    n_gpu_layers=-1,   # -1: offload every layer to the GPU
)


모델 로딩 중... (gpt-oss-20b-Q4_K_M.gguf)


llama_context: n_ctx_per_seq (32768) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)


In [3]:
# Markers that introduce the final (user-facing) answer in the raw model output.
# They are tried in this order; the first marker that is present wins.
_FINAL_MARKERS = (
    '<|channel|>final<|message|>',  # GGUF output: <|channel|>final<|message|>ANSWER
    'assistantfinal',               # transformers-style output: assistantfinalANSWER
)
# Marker that opens the model's reasoning ("analysis") channel.
_ANALYSIS_MARKER = '<|channel|>analysis<|message|>'
# End-of-message special tokens that must not leak into the stored answer.
_STOP_TOKENS = ('<|end|>', '<|return|>')


def extract_final_answer(raw_text: str) -> Tuple[str, bool]:
    """Separate the final answer from the raw, channel-annotated model output.

    Args:
        raw_text: The complete text collected from the stream. It may contain
            channel markers, reasoning text and end-of-message tokens.

    Returns:
        A ``(answer, has_final_marker)`` pair.

        * ``answer`` is the text following the first final-answer marker, with
          end-of-message tokens removed and surrounding whitespace trimmed.
          When no final marker is present, the whole text is treated as the
          answer, unless it contains the analysis-channel marker; in that case
          the text is reasoning only and ``answer`` is ``""``.
        * ``has_final_marker`` tells whether a final-answer marker was found.
    """
    for marker in _FINAL_MARKERS:
        _, found, tail = raw_text.partition(marker)
        if found:
            answer, has_final_marker = tail, True
            break
    else:
        has_final_marker = False
        # Reasoning without a final channel (e.g. generation stopped at
        # max_tokens) is not an answer and must not be kept.
        answer = "" if _ANALYSIS_MARKER in raw_text else raw_text

    for token in _STOP_TOKENS:
        answer = answer.replace(token, '')
    return answer.strip(), has_final_marker


# Conversation history: user questions and final AI answers only (no reasoning).
messages = []

while True:
    # Read the next user turn.
    try:
        user_input = input("\n[You] >>> ")
    except (EOFError, KeyboardInterrupt, OSError):
        # stdin is closed/unreadable or the prompt was interrupted:
        # end the session instead of failing with a traceback.
        print("\n대화를 종료합니다.")
        break

    command = user_input.strip()

    # Exit command (surrounding whitespace is ignored).
    if command.lower() == '/bye':
        print("\n대화를 종료합니다.")
        break

    # Ignore blank input.
    if not command:
        continue

    # Record the user turn before generating.
    messages.append({"role": "user", "content": user_input})

    print("\n[AI] >>> ", end="", flush=True)

    # Stream the completion from llama-cpp-python.
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=4096,
        temperature=0.1,           # low temperature: little sampling randomness
        repeat_penalty=1.1,        # discourage repetition
        top_p=0.9,                 # nucleus sampling cutoff
        stream=True
    )

    # Echo each streamed piece immediately and keep it for post-processing.
    pieces = []
    for chunk in response:
        choices = chunk.get('choices') or []
        if not choices:
            continue
        delta = choices[0].get('delta') or {}
        token = delta.get('content')
        if token:  # skips chunks without content as well as None / empty content
            print(token, end="", flush=True)
            pieces.append(token)
    full_response = "".join(pieces)

    print("\n")  # finish the streamed line

    answer_only, has_final_marker = extract_final_answer(full_response)

    if not answer_only:
        # No usable final answer (empty output, or reasoning that never reached
        # the final channel). Drop the pending user turn so the history keeps
        # alternating user/assistant messages and contains no reasoning text.
        print("[경고] 최종 답변이 생성되지 않아 이번 질문은 대화 기록에 저장하지 않습니다.")
        messages.pop()
        continue

    # When reasoning was streamed as well, show the clean answer once more.
    if has_final_marker:
        print("=" * 80)
        print("[AI 답변]")
        print("-" * 80)
        print(answer_only)
        print("=" * 80)

    # Store only the answer in the history (reasoning excluded).
    messages.append({"role": "assistant", "content": answer_only})




[AI] >>> <|channel|>analysis<|message|>The user says "안녕" which is Korean for "Hello". We should respond in Korean. Probably a friendly greeting: "안녕하세요! 무엇을 도와드릴까요?" Or something like that. The instruction: we are ChatGPT, respond appropriately. There's no special constraints. So answer in Korean.<|end|><|start|>assistant<|channel|>final<|message|>안녕하세요! 어떻게 도와드릴까요?

[AI 답변]
--------------------------------------------------------------------------------
안녕하세요! 어떻게 도와드릴까요?

[AI] >>> <|channel|>analysis<|message|>User says "나는 성종이야" meaning "I am Seongjong". They might be roleplaying as King Seongjong of Joseon. We need to respond appropriately, maybe acknowledging and continuing conversation. The user hasn't asked a question yet. So we can ask what they'd like to talk about or confirm identity. Should keep friendly tone.<|end|><|start|>assistant<|channel|>final<|message|>안녕하세요, 성종님! 무엇을 도와드릴까요? 궁금한 점이나 이야기하고 싶은 주제가 있나요?

[AI 답변]
-------------------------------------------------------