In [13]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.milvus import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


In [14]:
from pymilvus import MilvusClient

# Start from a clean slate: if a collection with this name already exists on the
# Milvus server (e.g. from an earlier run of this notebook), remove it so the
# indexing cell further down creates it afresh.
client = MilvusClient()
if client.has_collection(collection_name='rag_milvus_webinar'):
    client.drop_collection(collection_name='rag_milvus_webinar')

In [15]:
# Load the PDF directly from its URL. `data` holds the parsed Document objects that
# the splitter cell below consumes.
loader = PyPDFLoader(
    file_path="https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf"
)
data = loader.load()

In [16]:
# Embedding model, selected by name. The resulting object is handed to the Milvus
# vector store when the document chunks are indexed.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [17]:
# Cut the loaded documents into chunks of at most 512 characters; consecutive chunks
# share a 50-character overlap so text spanning a boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
)
all_splits = text_splitter.split_documents(documents=data)

In [19]:
# Embed every chunk and store it in the "rag_milvus_webinar" Milvus collection.
vectorstore = Milvus.from_documents(
    documents=all_splits,
    embedding=embeddings,
    collection_name="rag_milvus_webinar",
)

In [20]:
# Wrap the vector store in a retriever; no search arguments are passed, so the
# library defaults apply.
retriever = vectorstore.as_retriever()
# Pull the "rlm/rag-prompt" template from the LangChain Hub (presumably needs network
# access). The chains below feed it "context" and "question" inputs.
prompt = hub.pull("rlm/rag-prompt")

In [21]:
# Llama 3 served through Ollama. "<|eot_id|>" is registered as a stop sequence so
# generation is cut off when the model emits that marker.
llm = Ollama(model="llama3", stop=["<|eot_id|>"])

In [22]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values, in input order, separated by a blank line.
        An empty input yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [23]:
# Question-answering chain: the incoming question string is
#   1. sent to the retriever, whose documents are flattened to plain text by
#      format_docs (without this step the prompt would receive the repr of a list of
#      Document objects, metadata included, instead of the passage text), and
#   2. passed through unchanged as "question".
# Both values fill the prompt template, the LLM generates from it, and
# StrOutputParser reduces the result to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [24]:
# Stream the chain's answer and print each chunk as it arrives: end="" keeps the
# chunks on one continuous line and flush=True pushes each one out immediately.
for chunk in rag_chain.stream("What is WeWork? When were they introduced publicly?"):
    print(chunk, end="", flush=True)

WeWork is a company that provides shared workspace and community experiences for entrepreneurs, freelancers, startups, and small businesses. They offer flexible membership plans, including dedicated desks, private offices, and customized floors. WeWork was introduced publicly in October 2020 when The We Company changed its legal name to WeWork Inc. ("Legacy WeWork").

In [25]:
# Ask a broader question about the document as a whole, again printing the streamed
# chunks as they arrive (no newline between chunks, flushed on every write).
for chunk in rag_chain.stream("What is this document about?"):
    print(chunk, end="", flush=True)

This document appears to be a quarterly report by a company, likely discussing its financial situation and potential legal issues.

## Adding Sources

In [26]:
from langchain_core.runnables import RunnableParallel

# Answer-producing sub-chain. It expects a dict that already carries the retrieved
# documents under "context", swaps them for their joined text via format_docs, and
# then runs prompt -> LLM -> string parser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=lambda inputs: format_docs(inputs["context"])
    )
    | prompt
    | llm
    | StrOutputParser()
)

# Outer chain: retrieve documents and pass the question through side by side, then
# add the generated text under "answer". The result keeps the retrieved documents
# under "context", so the sources are returned alongside the answer.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke("What is WeWork? When were they introduced publicly?")

{'context': [Document(page_content='choose from a dedicated desk, a private office or a fully customized floor with the flexibility to choose the type of membership that works for them on a monthly subscription basis,\nthrough a multi-year membership agreement or on a pay-as-you-go basis.\nThe Company’s operations are headquartered in New York.\nWeWork Companies Inc. was founded in 2010. The We Company was incorporated under the laws of the state of Delaware in April 2019 as a direct wholly-owned subsidiary of', metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf', 'page': 12, 'pk': 449486236444269437}),
  Document(page_content='WeWork Companies Inc. As a result of various legal entity reorganization transactions undertaken in July 2019, The We Company became the holding company of the Company\'s\nbusiness, and the then-stockholders of WeWork Companies Inc. became the stockholders of The We Company. WeWork Companies Inc. is