In [9]:
import os
# Point tiktoken's encoding-file cache at ./etc. This is set up front so it is
# already in place when the splitter below is built with from_tiktoken_encoder.
# NOTE(review): presumably this avoids re-downloading the encoding files in
# every fresh environment — confirm that ./etc is the intended location.
os.environ["TIKTOKEN_CACHE_DIR"] = './etc'

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

from langchain.prompts import ChatPromptTemplate

from langchain.callbacks import StreamingStdOutCallbackHandler

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings

# Chat model shared by the chains built in this notebook; the sampling
# temperature is kept low (0.1).
llm = ChatOpenAI(temperature=0.1)


In [10]:
# --- 1. Load the source text and split it into token-sized chunks ----------
loader = UnstructuredFileLoader("./files/document.txt")

# Chunk boundaries fall on newlines; sizes are measured in tiktoken tokens.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

docs = loader.load_and_split(text_splitter=splitter)

# --- 2. Embed the chunks, caching the vectors on disk -----------------------
cache_dir = LocalFileStore("./cache/")

embedding = OpenAIEmbeddings()

# The namespace scopes cache keys to the embedding model. Without it the keys
# depend only on the text, so switching models would silently reuse vectors
# produced by the previous model.
# NOTE: entries cached before this namespace was added are not reused; the
# first run after the change re-embeds the documents once.
cached_embedding = CacheBackedEmbeddings.from_bytes_store(
    embedding,
    cache_dir,
    namespace=embedding.model,
)

# --- 3. Build the in-memory FAISS index used by every retriever below -------
vectorstore = FAISS.from_documents(docs, cached_embedding)


In [11]:
# Sanity check of the index: fetch the chunks closest to a sample query and
# display them as the cell output.
results = vectorstore.similarity_search(query="where does winston live")

results

[Document(page_content='Part One\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his 

In [16]:
from langchain.chains import RetrievalQA

## This is the chain's constructor; it takes the llm as an argument.
## NOTE(review): everything between the opening triple quote below and the
## closing one at the end of the cell is a single string literal, so the
## RetrievalQA code inside it is NOT executed when this cell runs; the cell
## only evaluates to that string. Presumably the code was disabled on purpose
## (e.g. to avoid repeated API calls) — confirm before re-enabling. The chain
## answer saved as this cell's output was presumably produced by an earlier,
## un-quoted revision of the cell.
## English gloss of the Korean notes kept inside the string:
##   - chain_type takes a str; its default value is "stuff".
##   - a retriever is like a class interface (it covers vector stores as well
##     as other databases / cloud services where documents are stored).
"""
    A retriever is an interface that returns documents given an unstructured query. 
    It is more general than a vector store. A retriever does not need to be able to store documents, only to return (or retrieve) them. 
    Retrievers can be created from vector stores, but are also broad enough to include Wikipedia search and Amazon Kendra.

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine", ## str을 받으며 default값은 stuff임,
    retriever=vectorstore.as_retriever(),      ## retriever는 class의 interface와 같다. (vectorstore를 포함한, document가 저장된 db나 cloud 환경을 일컫는다.)
)

chain.run("Where does Winston live?")
chain.run("Describe Victory Mansions")
"""

"In light of the new context provided, Victory Mansions can be further understood as a place of confinement and surveillance, where residents like Winston Smith must navigate the constant monitoring and control of the Party. The strategic positioning of the telescreen in Winston's living room, and his deliberate attempt to evade its gaze, highlights the pervasive nature of surveillance in every aspect of daily life within Victory Mansions.\n\nThe description of the alcove where Winston sits to avoid being seen by the telescreen underscores the lengths to which individuals must go to carve out moments of privacy and autonomy in a society where every action is scrutinized. The presence of the forbidden book, with its rarity and age, symbolizes Winston's rebellion against the Party's suppression of knowledge and history. The act of possessing such a book, even without any specific purpose in mind, is a subversive act within the confines of Victory Mansions.\n\nOverall, Victory Mansions em

In [17]:
## When building the chain with LCEL, all documents have to be fetched before
## the prompt is filled in.
## The retriever is therefore placed first in the chain.
## NOTE(review): the whole cell body below is wrapped in one triple-quoted
## string literal, so none of it (including the RunnablePassthrough import and
## the chain.invoke call) is executed when the cell runs. Presumably disabled
## on purpose — confirm before re-enabling. The AIMessage saved as this cell's
## output was presumably produced by an earlier, un-quoted revision.
## English gloss of the Korean notes kept at the end of the string:
##   - the retriever is called with the input str ("Describe...") and its
##     result is placed into the prompt as `context`;
##   - RunnablePassthrough is used so the question reaches the prompt (it hands
##     the input value on to the next chain component);
##   - even if an additional key (e.g. `extra`) were mapped to
##     RunnablePassthrough, its value would be the same string given to invoke
##     (identical to `question`).

"""
## 1. Retriever
retriever = vectorstore.as_retriever()

## 2. Prompt
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])

from langchain.schema.runnable import RunnablePassthrough

chain = {"context":retriever, "question": RunnablePassthrough()} | prompt | llm

## chain

##chain.run("Describe Victory Mansions")
chain.invoke("Describe Victory Mansions") 
## retriever에 str(Describe...) input으로 call되고, context로 prompt에 들거가게 됨
## question이 prompt로 전달되게 하기 위해 RunnablePassthrough를 사용함 (입력값을 다음 chain component에 던져줌)
## 만약 extra라는 추가 key를 넣어 RunnablePassthrough로 넘겨준다고 해도, 이 값은 invoke 시 넣어준 'Describe...'가 됨 (question과 동일)
"""

AIMessage(content="Victory Mansions is a building where Winston Smith lives. It has glass doors that let in gritty dust, and the hallway smells of boiled cabbage and old rag mats. There is a large colored poster of a man's face on the wall, and the building has a faulty lift due to the electricity being cut off during daylight hours. Winston's flat is on the seventh floor, and the building is described as having a grimy landscape.")

In [15]:
## Map Reduce LCEL Chain
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Document source for the map step: the FAISS index exposed as a retriever.
retriever = vectorstore.as_retriever()

# "Map" prompt: applied to one retrieved chunk at a time, asking the model to
# quote whatever part of that chunk is relevant to the question.
sub_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
     Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
     ----------
     {context}
     """,
        ),
        ("human", "{question}"),
    ]
)

# The user's question is passed to this chain explicitly as a key of its input
# dict, so no RunnablePassthrough is needed here.
map_doc_chain = sub_prompt | llm

def map_docs(inputs):
    """Run the per-document extraction chain and merge its replies.

    Args:
        inputs: Mapping with a "documents" entry (an iterable of objects that
            expose ``page_content``) and a "question" entry that is forwarded
            unchanged to every chain call.

    Returns:
        The ``content`` of each ``map_doc_chain`` reply, in document order,
        joined with a blank line between consecutive replies.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    extracted = []
    for document in documents:
        # One chain call per document; page_content is the text attribute of
        # the Document objects returned by langchain. Only the reply text is
        # kept.
        reply = map_doc_chain.invoke(
            {"question": question, "context": document.page_content}
        )
        extracted.append(reply.content)

    return "\n\n".join(extracted)

## Map step: the retriever's result is supplied as "documents" and the raw
## input as "question"; map_docs then runs map_doc_chain over the documents and
## hands the combined text on to the final chain.
## RunnableLambda lets a user-defined function take part in the chain.
map_chain = (
    {
        "documents": retriever,
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(map_docs)
)

## Reduce prompt: builds the final answer from the text produced by the map step.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
    Given the following extracted parts of a long document and a question, create a final answer. 
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    ------
    {context}
    """,
        ),
        ("human", "{question}"),
    ]
)

## Full pipeline: map_chain fills {context}, the untouched input fills {question}.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

chain.invoke("Describe Victory Mansions")

AIMessage(content="Victory Mansions is a dilapidated building complex in London where Winston resides. It has cramped living conditions, with small apartments lacking basic amenities. The building has a telescreen in every apartment for surveillance by the Party. The hallway smells of boiled cabbage and old rag mats, with a poster of Big Brother watching. The building has a faulty lift and is home to all four Ministries of the government. Winston's apartment has a unique layout that allows him to stay out of sight of the telescreen while being heard.")