In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
import os
from dotenv import load_dotenv
# Resolve the OpenAI API key. load_dotenv() runs first, then the key is read
# from the process environment; a local api_key.txt is the fallback source.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY and os.path.exists("api_key.txt"):
    with open("api_key.txt", "r", encoding="utf-8") as f:
        OPENAI_API_KEY = f.read().strip()
    # Export only a non-empty key. Writing "" into os.environ would leave the
    # variable defined for the rest of the kernel session, and load_dotenv()
    # (non-overriding by default) would then skip a key later added to .env.
    if OPENAI_API_KEY:
        os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
if not OPENAI_API_KEY:
    raise RuntimeError("❌ OPENAI_API_KEY가 설정되어 있지 않습니다. .env 또는 api_key.txt를 확인하세요.")


ValueError: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.

In [None]:
# ✅ 1. Load the source web page (cancer.go.kr)
loader = WebBaseLoader(
    web_paths=("https://www.cancer.go.kr/lay1/S1T648C650/contents.do",),
    bs_kwargs=dict(
        # Key fix: restrict parsing to <div class="cont_txt"> elements.
        parse_only=bs4.SoupStrainer("div", class_="cont_txt")
    ),
)
docs = loader.load()

# Fail fast when the filter above matched nothing. Without this check the
# empty content is only noticed at the embedding step, as an opaque
# "Expected Embeddings to be non-empty list" error from the vector store.
if not any(doc.page_content.strip() for doc in docs):
    raise ValueError(
        "No text was extracted from the page. Check the URL and the "
        "SoupStrainer selector (div.cont_txt)."
    )


In [None]:
# ✅ 2. Chunk and embed
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# An empty chunk list would otherwise reach the vector store and fail with
# "Expected Embeddings to be non-empty list or numpy array, got [] in upsert".
if not splits:
    raise ValueError(
        "No text chunks were produced from `docs`; the loaded documents appear "
        "to be empty. Re-check the loader cell before embedding."
    )

# NOTE(review): with a persist_directory, re-running this cell presumably adds
# the same chunks to the stored collection again — confirm, and clear the
# directory or pass stable ids if duplicates matter.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(model='text-embedding-3-small', api_key=OPENAI_API_KEY),
    persist_directory="./rag_arxiv_db"  # explicit on-disk location for the index
)

# Retrieve the 3 most similar chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# ✅ 3. Chat model that writes the final answer (temperature fixed at 0)
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    api_key=OPENAI_API_KEY,
)

# ✅ 4. RAG prompt defined inline (no hub dependency).
#    The template has two placeholders: {context} and {question}.
rag_template = """
You are an expert research assistant. 
Using the context below, answer the user question clearly and concisely.

<context>
{context}
</context>

Question: {question}
Answer:
"""
prompt = ChatPromptTemplate.from_template(rag_template)

# ✅ 5. Helper: turn retriever output into a single context string
def format_docs(docs):
    """Concatenate the text of several documents.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line; an empty string when ``docs`` yields nothing.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# ✅ 6. Assemble the RAG chain
# "context" is produced by piping the retriever into format_docs;
# "question" is supplied by RunnablePassthrough().
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Mapping -> prompt -> chat model -> string output parser.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# ✅ 7. Smoke test: send one question through the chain and print the answer
question = "한국의 전체 암환자 5년 생존율은 얼마인가요?"
response = rag_chain.invoke(question)
print(response)