In [1]:
import warnings
from dotenv import load_dotenv
from langchain import hub
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.llms import HuggingFacePipeline
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings

# Load variables from a local .env file into the process environment.
load_dotenv()

# Silence every warning for the remainder of the session.
warnings.simplefilter("ignore")

In [2]:
# Hugging Face model identifier of the checkpoint to load.
model_id = "beomi/llama-2-ko-7b"

# Passed through as `model_kwargs`.
# NOTE(review): `max_length` here and `max_new_tokens` below both look like
# generation-length limits — confirm which one actually takes effect.
model_settings = dict(temperature=0, min_length=200, max_length=1000)

# Passed through as `pipeline_kwargs`.
pipeline_settings = dict(max_new_tokens=200)

# Create the HuggingFacePipeline object.
# device: -1 is the CPU (default); 0 and above name the CUDA device index
# to use for GPU inference.
llm = HuggingFacePipeline.from_model_id(
    model_id=model_id,
    task="text-generation",  # text generation
    device=0,
    model_kwargs=model_settings,
    pipeline_kwargs=pipeline_settings,
)

Loading checkpoint shards: 100%|███████████████████████████████| 15/15 [00:03<00:00,  4.14it/s]


In [3]:
# File-backed byte store used to cache computed embedding vectors on disk.
cache_dir = LocalFileStore("./.cache/practice/")

# Split on newlines; chunk_size and chunk_overlap are counted in tiktoken tokens.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=200,
    chunk_overlap=100,
)

# Load the source text and cut it into overlapping chunks.
loader = UnstructuredFileLoader("./files/1.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Fail early with a clear message instead of an opaque error from the
# vector-store constructor when the file yields no chunks.
if not docs:
    raise ValueError("No document chunks were produced from ./files/1.txt")

embeddings = OpenAIEmbeddings()

# Key the cache by embedding model name so that vectors produced by a
# different model are never returned for the same text.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Index the chunks in FAISS and expose the index as a retriever.
vectorstore = FAISS.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

In [4]:
# Fetch the RAG prompt template from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")


def format_docs(documents):
    """Join the text of retrieved documents into one context string.

    Args:
        documents: Iterable of retrieved documents, each exposing
            ``page_content``.

    Returns:
        The page contents joined with blank lines between them.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# The incoming question is sent both to the retriever (whose results are
# flattened to plain text for the prompt's "context" slot) and, unchanged,
# to the prompt's "question" slot. Without the formatting step the prompt
# would receive the repr of a list of Document objects, metadata included.
ragChain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Interactive question/answer loop; entering "exit" ends it.
while True:
    try:
        question = input("질문: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: stop cleanly
        # instead of ending the cell with a traceback.
        break
    if question == "exit":
        break
    if not question:
        # Blank input: skip retrieval and generation and ask again.
        continue
    answer = ragChain.invoke(question)
    print(answer)