In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.milvus import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


In [None]:
from pymilvus import MilvusClient

client = MilvusClient()
if client.has_collection('rag_milvus_webinar'): 
    client.drop_collection('rag_milvus_webinar')

In [2]:
loader = PyPDFLoader(
    "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf"
)
data = loader.load()

In [3]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
all_splits = text_splitter.split_documents(data)

In [7]:
vectorstore = Milvus.from_documents(documents=all_splits, embedding=embeddings, collection_name="rag_milvus_webinar")

In [8]:
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

In [9]:
llm = Ollama(
        model="llama3",
        stop=["<|eot_id|>"],
    )

In [10]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [11]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [12]:
for chunk in rag_chain.stream("What is WeWork? When were they introduced publicly?"):
    print(chunk, end="", flush=True)

WeWork is a shared workspace company that offers desks, private offices, and customized floors to its members. They were introduced publicly in 2010 when WeWork Companies Inc. was founded. Later, they went public with an initial public offering (IPO) on October 14, 2020, when The We Company changed its legal name to WeWork Inc. ("Legacy WeWork").

## Adding Sources

In [13]:
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke("What is WeWork? When were they introduced publicly?")

{'context': [Document(page_content='choose from a dedicated desk, a private office or a fully customized floor with the flexibility to choose the type of membership that works for them on a monthly subscription basis,\nthrough a multi-year membership agreement or on a pay-as-you-go basis.\nThe Company’s operations are headquartered in New York.\nWeWork Companies Inc. was founded in 2010. The We Company was incorporated under the laws of the state of Delaware in April 2019 as a direct wholly-owned subsidiary of', metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf', 'page': 12, 'pk': 449485716965032362}),
  Document(page_content='choose from a dedicated desk, a private office or a fully customized floor with the flexibility to choose the type of membership that works for them on a monthly subscription basis,\nthrough a multi-year membership agreement or on a pay-as-you-go basis.\nThe Company’s operations are headquartered in