In [None]:
!pip install -U langchain langchain-milvus pymilvus

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_milvus import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


In [4]:
from pymilvus import MilvusClient

# Open the local database file and display the collections it currently holds.
client = MilvusClient(uri="milvus_rag.db")
client.list_collections()

[]

In [2]:
# Location of the PDF that serves as the knowledge source for the RAG pipeline.
PDF_URL = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf"

# Load the PDF from its URL into LangChain documents.
loader = PyPDFLoader(PDF_URL)
data = loader.load()

In [3]:
# Embedding model that is handed to the Milvus vector stores built later on.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Split the loaded documents into chunks (size 512, overlap 50) before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
)
all_splits = text_splitter.split_documents(data)

In [7]:
# Milvus Lite keeps everything in a local file, which is the simplest setup.
# To target a running Milvus server instead, set URI to its address,
# e.g. "http://localhost:19530".
URI = "./milvus_demo.db"

# Embed every chunk and store it in the database reachable at URI.
# NOTE(review): none of the cells visible here read `vector_db`; the RAG chain
# is built on `vectorstore` instead -- confirm whether this store is still needed.
vector_db = Milvus.from_documents(
    documents=all_splits,
    embedding=embeddings,
    connection_args=dict(uri=URI),
)

In [8]:
# Build the collection that the RAG chain retrieves from.
# drop_old=True recreates the collection on every run; without it, re-running
# this cell appends all chunks again to the persisted database file and the
# retriever starts returning duplicated passages.
vectorstore = Milvus.from_documents(
    documents=all_splits,
    embedding=embeddings,
    collection_name="rag_milvus_webinar",
    connection_args={"uri": "./milvus_rag.db"},
    drop_old=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# Prompt template pulled from the LangChain Hub by its handle.
prompt = hub.pull("rlm/rag-prompt")

# Retriever view over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

In [15]:
# llama3 served through Ollama; "<|eot_id|>" is passed as the stop sequence.
llm = Ollama(model="llama3", stop=["<|eot_id|>"])

In [17]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the documents' text joined by two newlines
        (an empty string when ``docs`` is empty).
    """
    texts = [document.page_content for document in docs]
    return "\n\n".join(texts)

In [18]:
# "context" pipes the retrieved documents through format_docs so the prompt
# receives plain chunk text rather than the repr of a list of Document objects
# (metadata included); "question" forwards the user's input unchanged.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [19]:
# Stream the answer and print each piece as soon as it arrives.
answer_stream = rag_chain.stream("What is WeWork? When were they introduced publicly?")
for piece in answer_stream:
    print(piece, end="", flush=True)

WeWork is a shared workspace company that provides flexible office spaces and memberships. They were introduced publicly when WeWork Companies Inc. was founded in 2010.

In [20]:
# Stream the answer and print each piece as soon as it arrives.
answer_stream = rag_chain.stream("What is this document about?")
for piece in answer_stream:
    print(piece, end="", flush=True)

This document appears to be the Form 10-Q quarterly report of a company, discussing its financial statements and potential issues that may impact its ability to continue as a going concern. It also mentions regulatory matters and forward-looking statements.

## Adding Sources

In [21]:
from langchain_core.runnables import RunnableParallel


def _context_as_text(inputs):
    """Return the documents stored under "context" rendered as one string."""
    return format_docs(inputs["context"])


# Answering sub-chain: replace the "context" documents with their joined text,
# then run prompt -> LLM -> string parsing.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=_context_as_text)
    | prompt
    | llm
    | StrOutputParser()
)

# Retrieve documents and pass the question through in parallel, then add the
# generated answer under "answer" while keeping the retrieved documents.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke("What is WeWork? When were they introduced publicly?")

{'context': [Document(page_content='choose from a dedicated desk, a private office or a fully customized floor with the flexibility to choose the type of membership that works for them on a monthly subscription basis,\nthrough a multi-year membership agreement or on a pay-as-you-go basis.\nThe Company’s operations are headquartered in New York.\nWeWork Companies Inc. was founded in 2010. The We Company was incorporated under the laws of the state of Delaware in April 2019 as a direct wholly-owned subsidiary of', metadata={'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf', 'page': 12, 'pk': 450117772830310463}),
  Document(page_content='WeWork Companies Inc. As a result of various legal entity reorganization transactions undertaken in July 2019, The We Company became the holding company of the Company\'s\nbusiness, and the then-stockholders of WeWork Companies Inc. became the stockholders of The We Company. WeWork Companies Inc. is