### 6.1 Data Loaders and Splitters

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Point the Unstructured-based loader at the chapter file.
loader = UnstructuredFileLoader('./files/chapter_one.docx')

# Recursive splitter producing small chunks. chunk_overlap is how much of the
# end of the previous chunk is carried over into the next one.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=200)

# Load the file, split it with the splitter above, and print the result.
print(loader.load_and_split(text_splitter=splitter))

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Point the loader at the chapter file.
loader = UnstructuredFileLoader('./files/chapter_one.docx')

# Character splitter that uses a newline as its separator.
splitter = CharacterTextSplitter(
    chunk_overlap=100,  # how much of the previous chunk's tail is repeated
    chunk_size=950,
    separator="\n",
)

# Load the file, split it with the splitter above, and print the result.
print(loader.load_and_split(text_splitter=splitter))

### 6.2 Tiktoken

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Point the loader at the chapter file.
loader = UnstructuredFileLoader('./files/chapter_one.docx')

# tiktoken is a tokenization library made by OpenAI; the splitter is built
# through its tiktoken-based constructor instead of the plain one.
# NOTE(review): presumably this makes chunk_size/chunk_overlap count tokens
# rather than characters - confirm against the LangChain docs.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=950,
    separator="\n",
)

# Load the file, split it with the splitter above, and print the result.
print(loader.load_and_split(text_splitter=splitter))

### 6.4 Vector Store

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client created with its default settings.
embedder = OpenAIEmbeddings()

# Embed one short query; the returned value is shown as the cell output.
embedder.embed_query(text="Hi")

In [None]:
# A few short sentences to embed in a single call.
sample_texts = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]

# The returned embeddings are shown as the cell output.
embedder.embed_documents(sample_texts)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Load the chapter and split it on newlines with the tiktoken-based splitter
# (tiktoken is a tokenization library made by OpenAI).
loader = UnstructuredFileLoader('./files/chapter_one.docx')
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=950,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)

# Storage location for cached embeddings.
cache_dir = LocalFileStore("./.cache/")

# Wrap the OpenAI embedder so that a repeated request returns the cached result.
# NOTE(review): no namespace is passed here; the LangChain caching docs suggest
# setting one (e.g. the embedding model name) to avoid collisions - confirm.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build a Chroma vector store from the chunks, embedding them through the
# cache-backed embedder.
vectorstore = Chroma.from_documents(documents=docs, embedding=cached_embeddings)

In [7]:
# Query the vector store built in the previous cell for chunks similar to "winston".
results = vectorstore.similarity_search(query="winston")

# Bare expression so the matching documents are shown as the cell output.
results

[Document(page_content="Part One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his

### 6.6 RetrievalQA

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Load the chapter and split it on newlines with the tiktoken-based splitter
# (tiktoken is a tokenization library made by OpenAI).
loader = UnstructuredFileLoader('./files/chapter_one.docx')
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=950,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)

# Storage location for cached embeddings.
cache_dir = LocalFileStore("./.cache/")

# Wrap the OpenAI embedder so that a repeated request returns the cached result.
# NOTE(review): no namespace is passed here; the LangChain caching docs suggest
# setting one (e.g. the embedding model name) to avoid collisions - confirm.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build a FAISS vector store from the chunks, embedding them through the
# cache-backed embedder.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)

# Chat model with default settings, used by the QA chain below.
llm = ChatOpenAI()

# RetrievalQA chain of type 'stuff', retrieving from the FAISS store.
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type='stuff',
    llm=llm,
)

# The answer string is shown as the cell output.
chain.run("Describe the Victory Mansions.")

'The Victory Mansions are depicted as a dilapidated, run-down apartment complex in the novel "1984" by George Orwell. The buildings are old and in a state of disrepair, with rotting nineteenth-century houses, patched windows, sagging garden walls, and rubble from bomb sites. The surroundings are grimy and bleak, typical of the urban landscape in Airstrip One (London). The Victory Mansions are a stark contrast to the Ministry of Truth, which stands out as a gleaming, imposing structure in the same vicinity.'

In [14]:
# Same question as the previous cell, but with chain_type='refine'.
# Reuses `llm` and `vectorstore` from the cell above.
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type='refine',
    llm=llm,
)

# The answer string is shown as the cell output.
chain.run("Describe the Victory Mansions.")

'The Victory Mansions are a stark contrast to the towering, imposing Ministry of Truth that overlooks them. The rundown and dilapidated state of the apartment complex stands in sharp juxtaposition to the gleaming white concrete structure of the Ministry, which symbolizes the power and control of the Party in the oppressive society of Oceania. The dilapidated buildings, sagging garden walls, and patched windows of the Victory Mansions reflect the decay and neglect that permeate the lives of the residents, highlighting the stark disparities and harsh realities of life in this totalitarian society. The contrast between the Victory Mansions and the Ministry of Truth serves as a visual representation of the stark divide between the impoverished, downtrodden citizens of Oceania and the authoritarian regime that governs them.'

### 6.8 Stuff LCEL Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Load the chapter and split it on newlines with the tiktoken-based splitter
# (tiktoken is a tokenization library made by OpenAI).
loader = UnstructuredFileLoader('./files/chapter_one.docx')
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=950,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)

# Storage location for cached embeddings.
cache_dir = LocalFileStore("./.cache/")

# Wrap the OpenAI embedder so that a repeated request returns the cached result.
# NOTE(review): no namespace is passed here; the LangChain caching docs suggest
# setting one (e.g. the embedding model name) to avoid collisions - confirm.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build a FAISS vector store from the chunks, embedding them through the
# cache-backed embedder.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)

# The retriever is what fetches the docs for a given question.
retriever = vectorstore.as_retriever()

# Chat model with a low temperature.
llm = ChatOpenAI(temperature=0.1)

# System message receives the retrieved context; human message is the question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer, just say you don't know. Don't make it up:\n\n{context}"),
        ("human", "{question}"),
    ]
)

# CASE: chain_type = 'stuff' (default), written as an LCEL pipeline.
# RunnablePassthrough forwards the string given to chain.invoke unchanged,
# so it fills {question} while the retriever output fills {context}.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

# The model's reply is shown as the cell output.
chain.invoke("Describe Victory Mansions.")

AIMessage(content='Victory Mansions is a building where Winston Smith lives. It has glass doors, a hallway that smells of boiled cabbage and old rag mats, and a large colored poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU." The flat is seven flights up, and the building has a faulty lift and a telescreen that cannot be completely shut off.')

### 6.9 Map Reduce LCEL Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Load the chapter and split it on newlines with the tiktoken-based splitter
# (tiktoken is a tokenization library made by OpenAI).
loader = UnstructuredFileLoader('./files/chapter_one.docx')
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=950,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)

# Storage location for cached embeddings.
cache_dir = LocalFileStore("./.cache/")

# Wrap the OpenAI embedder so that a repeated request returns the cached result.
# NOTE(review): no namespace is passed here; the LangChain caching docs suggest
# setting one (e.g. the embedding model name) to avoid collisions - confirm.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build a FAISS vector store from the chunks, embedding them through the
# cache-backed embedder.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)

# The retriever is what fetches the docs for a given question.
retriever = vectorstore.as_retriever()

# Chat model with a low temperature.
llm = ChatOpenAI(temperature=0.1)

# "Map" step prompt: shown one portion of the document at a time and asked to
# return whatever text is relevant to the question.
# The literal's whitespace is intentionally left exactly as written, since it
# is part of the text sent to the model.
map_system_message = (
    "system",
    """
            Use the following portion of a long document to see if any of the text is relevant to answer the question.
            Return any relevant text verbatim.
            ------
            {context}
            """,
)
map_human_message = ("human", "{question}")

map_doc_prompt = ChatPromptTemplate.from_messages(
    [map_system_message, map_human_message]
)

# Pipe the prompt into the chat model defined earlier in this cell.
map_doc_chain = map_doc_prompt | llm

# Merges the per-document results into a single piece of text.
def map_docs(inputs):
    """Run `map_doc_chain` on every document and join the replies.

    Args:
        inputs: mapping with a 'documents' entry (iterable of objects exposing
            `page_content`) and a 'question' entry passed along to the chain.

    Returns:
        The `.content` of each chain reply, in document order, joined by a
        blank line ("\\n\\n").
    """
    retrieved = inputs['documents']
    question = inputs['question']

    extracted_parts = []
    for doc in retrieved:
        reply = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        )
        extracted_parts.append(reply.content)

    return "\n\n".join(extracted_parts)
    
# RunnableLambda makes it possible to run a user-defined function (map_docs)
# as a step of the chain. The retriever supplies 'documents' and
# RunnablePassthrough forwards the invoked question unchanged.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Final ("reduce") prompt: receives the text produced by map_chain as context.
# The literal's whitespace is intentionally left exactly as written, since it
# is part of the text sent to the model.
final_system_message = (
    "system",
    """
            Given the following extracted parts of a long document and a question, create a final answer.
            If you don't know the answer, just say that you don't know.
            Don't try to make up the answer.
            ------
            {context}
            """,
)
final_prompt = ChatPromptTemplate.from_messages(
    [final_system_message, ("human", "{question}")]
)

# CASE: chain_type = 'map_reduce', written as an LCEL pipeline.
# RunnablePassthrough forwards the string given to chain.invoke unchanged.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

# The model's reply is shown as the cell output.
chain.invoke("Where does Winston go to work?")

AIMessage(content='Winston goes to work at the Ministry of Truth.')