In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_classic.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_classic.storage import LocalFileStore
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

# On-disk byte store that backs the embedding cache.
cache_dir = LocalFileStore("./.cache/")

# Chat model shared by the map step and the final answer step.
chat = ChatOpenAI(temperature=0.1)

# Split on newlines; chunk_size / chunk_overlap are counted with the tiktoken
# encoder rather than in characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)
loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)
embeddings = OpenAIEmbeddings()
# Namespace the cache by embedding model so vectors produced by a different
# model are never served from the same ./.cache/ directory. Entries written
# before this change (empty namespace) are simply re-embedded once.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
    key_encoder="sha256",
)

# NOTE(review): no persist directory / collection name is given, so the store
# is presumably in-memory; re-running this cell in the same kernel may add the
# same chunks to the collection again — confirm, and restart the kernel if
# duplicate hits show up.
vectorstore = Chroma.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

# Map-step prompt: applied to one retrieved chunk at a time. The system
# message carries the chunk ({context}); the human message carries the
# user's {question}.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Use the following portion of a long document to see if any of the text "
            "is relevant to answer the question. Return any relevant text verbatim. "
            "If there is no relevant text, return : ''\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

# Prompt piped into the shared chat model; invoked once per chunk by map_docs.
map_doc_chain = map_doc_prompt | chat


def map_docs(inputs):
    """Run the map step: pull question-relevant text out of each document.

    Every document is sent through ``map_doc_chain`` together with the
    question. The calls are issued with ``batch`` so they are not executed
    one after another in a Python loop.

    Args:
        inputs: Mapping with two keys:
            ``"documents"`` - sequence of objects exposing ``page_content``;
            ``"question"`` - the question passed to every map call.

    Returns:
        The per-document extractions, in document order, joined by blank
        lines. Extractions that are blank or consist only of quote characters
        (the map prompt tells the model to answer ``''`` when nothing is
        relevant) are left out. Returns ``""`` when there are no documents or
        nothing relevant was extracted.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    # Nothing retrieved: skip the model round-trip entirely.
    if not documents:
        return ""

    responses = map_doc_chain.batch(
        [
            {"context": doc.page_content, "question": question}
            for doc in documents
        ]
    )

    extracts = []
    for response in responses:
        text = response.content.strip()
        # Drop the "no relevant text" sentinel ('' or "") and blank replies so
        # they do not pad the context handed to the final prompt.
        if text.strip("'\""):
            extracts.append(text)

    return "\n\n".join(extracts)


# Map step: the incoming question goes to the retriever (to fetch chunks) and
# is passed through unchanged; map_docs then condenses both into one string.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)

# Reduce-step prompt: answers the question from the condensed extracts.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Given the following extracted parts of a long document and a "
            "question, create a final answer. If you don't know the answer, "
            "just say that you don't know. Don't try to make up an answer."
            "\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

# Full pipeline: question -> map_chain output as {context} -> final prompt -> chat.
final_chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | chat
)
final_chain.invoke("Where does Winston go to work?")


AIMessage(content='Victory Mansions is a dilapidated and run-down apartment building in the novel "1984" by George Orwell. It is characterized by its shabby appearance, lack of amenities, and poor living conditions. The building is small and cramped, with faulty plumbing and a general lack of maintenance. Residents like Winston Smith live in a state of deprivation and constant surveillance due to the presence of telescreens monitoring their every move. The building stands as a stark contrast to the luxury enjoyed by the Inner Party members in Oceania.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 107, 'prompt_tokens': 477, 'total_tokens': 584, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, '

In [None]:
# Query the vector store directly (bypassing the chains) to inspect which
# chunks come back for a question.
results = vectorstore.similarity_search(query="where does Winston live")
results