In [12]:
# https://python.langchain.com/docs/integrations/retrievers/merger_retriever/

In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever

from langchain.chains import RetrievalQA
from langchain import PromptTemplate, LLMChain
from langchain_community.chat_models import ChatOllama

# Chat model served through Ollama (llama2).
llm = ChatOllama(model="llama2")

# Directory passed as persist_directory to the Chroma collections.
DB_DIR = "./chroma_db_test"

In [2]:
# Langsmith - to debug LLM responses

import os
from dotenv import load_dotenv
from langsmith import Client

# Load variables from a .env file into the process environment.
load_dotenv()

# Validate the key *before* touching any tracing setting: assigning None to
# os.environ raises an opaque TypeError, and doing so after tracing has been
# switched on would leave the session tracing without credentials.
lcs = os.getenv("LANGCHAIN_SECRET")
if not lcs:
    raise RuntimeError(
        "LANGCHAIN_SECRET is not set; define it in your .env file or environment "
        "before enabling LangSmith tracing."
    )

# Apply the LangSmith settings together, only once the key is known to exist.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "llm-multi-doc-single-vdb",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_API_KEY": lcs,
    }
)

client = Client()

In [7]:
def _open_collection(name):
    """Return a Chroma store for collection `name`, persisted under DB_DIR.

    Each call builds its own OllamaEmbeddings instance as the embedding function.
    """
    return Chroma(
        collection_name=name,
        persist_directory=DB_DIR,
        embedding_function=OllamaEmbeddings(),
    )


# Three collections that share the same persist directory.
sys_docs = _open_collection("system-documentation")
llm_papers = _open_collection("llm-papers")
ssi_docs = _open_collection("ssi-docs")

In [8]:
# One similarity retriever per collection, each asked for k=2 results.
# The search_kwargs dict literal is evaluated per iteration, so nothing is shared.
sys_docs_ret, llm_papers_ret, ssi_docs_ret = (
    store.as_retriever(search_type="similarity", search_kwargs={"k": 2})
    for store in (sys_docs, llm_papers, ssi_docs)
)

# MergerRetriever only needs the list of retrievers to combine.
merge_retriever = MergerRetriever(
    retrievers=[sys_docs_ret, llm_papers_ret, ssi_docs_ret]
)

# Filter out redundant ("overlapping") documents returned by the three retrievers.
embeddings_filter = EmbeddingsRedundantFilter(
    embeddings=OllamaEmbeddings(),
    similarity_threshold=0.9,
)
pipeline_comp = DocumentCompressorPipeline(transformers=[embeddings_filter])

# Wrap the merged retriever so its results pass through the compressor pipeline.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=merge_retriever,
    base_compressor=pipeline_comp,
)

In [17]:
# "stuff"-type RetrievalQA chain on top of the compression retriever; the
# retrieved source documents are returned together with the answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=compression_retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)

In [29]:
# response parser
def process_llm_response(llm_response):
    """Print the answer of a QA response followed by the set of its sources.

    Args:
        llm_response: Mapping holding the answer under "result" and, optionally,
            the retrieved documents under "source_documents". Each document must
            expose a `metadata` mapping.

    Documents whose metadata has no "source" entry are skipped instead of raising
    KeyError, and a response without "source_documents" prints an empty set.
    """
    print(llm_response["result"])
    print("\nSources:")
    documents = llm_response.get("source_documents") or []
    # A set collapses several chunks retrieved from the same file into one entry.
    sources = {
        doc.metadata["source"] for doc in documents if "source" in doc.metadata
    }
    print(sources)

In [33]:
# Run the chain on the pandas question, then print the answer and its sources.
pandas_shape_response = qa_chain.invoke(
    "Can you please tell me about pandas shape method?"
)
process_llm_response(pandas_shape_response)

In pandas, the `shape` method is used to access and manipulate the dimensions of a DataFrame or Series. The `shape` method returns a tuple containing the number of rows (`nrows`) and columns (`ncolumns`) in the DataFrame or Series.

Here are some examples of how you can use the `shape` method:

1. Accessing the number of rows and columns in a DataFrame:
```
df = pd.read_csv('data.csv')
print(df.shape) # Output: (nrows, ncolumns)
```
2. Setting the number of rows and columns in a DataFrame:
```
# Set the number of rows to 10
df = df.shape((10,))

# Set the number of columns to 5
df = df.shape((5,))
```
3. Accessing the dimensions of a Series:
```
s = pd.Series([1, 2, 3])
print(s.shape) # Output: (nrows, ncolumns)
```
4. Setting the dimensions of a Series:
```
# Set the number of rows to 5
s = s.shape((5,))

# Set the number of columns to 3
s = s.shape((3, 5))
```
5. Accessing the dimensions of a DataFrame with multiple indices:
```
df = pd.read_csv('data.csv', index_col='date', columns=

In [32]:
# Run the chain on the paper-authors question, then print the answer and its sources.
paper_authors_response = qa_chain.invoke(
    "Using can you please give me authors of the paper titled Sparks of Artificial General Intelligence?"
)
process_llm_response(paper_authors_response)

The authors of the paper titled "Sparks of Artificial General Intelligence" are:

* David J. D. Bennett
* Slav Petrov
* Julian Togelius

They are from the following institutions:

* University of California, Berkeley
* New York University
* Cornell University

The paper was published in the Proceedings of the 36th International Conference on Machine Learning, 2019.

Sources:
{'data/system-documentation/pandas-basics.html', 'data/llm-papers/openai-paper.pdf', 'data/ssi-docs/db-ssi.pdf'}


In [18]:
# Run the chain on the CZECH REP question, then print the answer and its sources.
czech_rep_response = qa_chain.invoke("Can you please give me details for CZECH REP?")
process_llm_response(czech_rep_response)

Certainly! Here are some details about the Czech Republic:

1. Capital City: The capital city of the Czech Republic is Prague (Czech: Praha).
2. Language: The official language of the Czech Republic is Czech, a Slavic language closely related to Slovak. English is also widely spoken, particularly among young people and those in the service industry.
3. Currency: The currency of the Czech Republic is the Czech koruna (CZK), which is divided into 100 haléřů.
4. Population: As of 2020, the population of the Czech Republic is approximately 10.6 million people.
5. Area: The Czech Republic has a total area of around 78,866 square kilometers (30,452 square miles), making it slightly smaller than the state of Ohio in the United States.
6. Bordering Countries: The Czech Republic is bordered by Germany to the west, Austria to the south, Slovakia to the east and Poland to the northeast.
7. Climate: The Czech Republic has a temperate climate with four distinct seasons. Summers can be warm (average

In [14]:
# Stand-alone example: RetrievalQA over NER.txt using a FAISS index and an
# embeddings-similarity filter. The imports live in this cell because none of
# these names are brought into scope by the import cell at the top.
from langchain.prompts import PromptTemplate
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.llms import OpenAI
from langchain_community.vectorstores import FAISS

loader = UnstructuredFileLoader("NER.txt")
document = loader.load()

# Split on the ideographic full stop first, then on spaces.
separators = ["。", " "]
text_splitter = RecursiveCharacterTextSplitter(separators=separators, chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(document)

# One embeddings object is shared by the similarity filter and the FAISS index.
embeddings = OpenAIEmbeddings()
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.81)
retriever = FAISS.from_documents(texts, embeddings).as_retriever()  # base retriever

# Named distinctly so it does not rebind the notebook-wide `compression_retriever`
# that the merged-retriever QA chain is built from.
ner_compression_retriever = ContextualCompressionRetriever(
    base_compressor=relevant_filter, base_retriever=retriever
)  # document compression retriever

prompt_template1 = """plase use context to answer question.

{context}

question: {question}
anwser:"""
PROMPT = PromptTemplate(
    template=prompt_template1, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT,'verbose': True}
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(verbose=True),
    chain_type="stuff",
    retriever=ner_compression_retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

query = "balabalabala" # replace it with question
# Use .invoke, consistent with how qa_chain is called elsewhere in this notebook.
result = qa.invoke({"query": query})
print(result)
