# Multi-document Single-index RAG with LangChain and Redis Hybrid Search

## Environment Setup

In [2]:
import json
import os
import warnings
# Resolve the notebook's working directory and the directory one level above it.
dir_path = os.getcwd()
parent_directory = os.path.dirname(dir_path)

# Suppress all Python warnings for the rest of the kernel session.
warnings.filterwarnings(action="ignore")

# Publish settings through the process environment so that shell commands and
# libraries started from this kernel can read them.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "ROOT_DIR": parent_directory,
    }
)

# Show both paths, one per line.
print(dir_path, parent_directory, sep="\n")

/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/multi_doc_RAG
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss


### Install Python Dependencies

In [2]:
!pip install -r $ROOT_DIR/requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Configure your Redis Stack


In [6]:
import os

# Replace the values below with your own if you are using a Redis Cloud instance.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = os.getenv("REDIS_PORT", "6379")
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", "")


def build_redis_url(host, port, password="", scheme="redis"):
    """Build a Redis connection URL.

    Args:
        host: Hostname or IP address of the Redis endpoint.
        port: Port of the Redis endpoint (string or int).
        password: Optional password. When non-empty it is percent-encoded and
            embedded as ``:<password>@`` so that characters such as ``@`` or
            ``/`` cannot corrupt the URL.
        scheme: URL scheme; use ``"rediss"`` when SSL is enabled on the endpoint.

    Returns:
        A URL of the form ``scheme://[:password@]host:port``.
    """
    from urllib.parse import quote

    credentials = f":{quote(password, safe='')}@" if password else ""
    return f"{scheme}://{credentials}{host}:{port}"


# If SSL is enabled on the endpoint, pass scheme="rediss" here.
# The password (when set) is now part of the URL; previously it was read but never used.
REDIS_URL = build_redis_url(REDIS_HOST, REDIS_PORT, REDIS_PASSWORD)
# Exported so that tools reading REDIS_URL from the environment use the same endpoint.
os.environ["REDIS_URL"] = REDIS_URL

### SentenceTransformerEmbeddings Models Cache folder
We use `SentenceTransformerEmbeddings` in this demo, and here we specify its cache folder. If you have already downloaded the models to the local file system, point this folder at them; otherwise the library will download the models into this folder.

In particular, these models will be downloaded if not present in the cache folder:

models/models--sentence-transformers--all-MiniLM-L6-v2

models/models--sentence-transformers--all-mpnet-base-v2


In [4]:
# Point TRANSFORMERS_CACHE at the local folder that holds the downloaded sentence-transformer models.
os.environ["TRANSFORMERS_CACHE"] = f"{parent_directory}/models"

## RAG with LangChain

### Create Custom index based on your data using RedisVL

In [60]:
from redisvl.index import SearchIndex
from redisvl.schema import IndexSchema
from redis import Redis
# NOTE(review): index_name and prefix are not referenced again in this cell;
# presumably they mirror the name/prefix declared in sec_index.yaml — confirm.
index_name = 'langchain'
prefix = 'chunk'
# Load the index definition from the YAML schema file.
schema = IndexSchema.from_yaml('sec_index.yaml')
# Redis client connected to the endpoint configured in REDIS_URL.
client = Redis.from_url(REDIS_URL)
# create an index from schema and the client
index = SearchIndex(schema, client)
# overwrite=True / drop=True: presumably replaces an existing index and drops
# its stored data — confirm against the RedisVL documentation before running
# against an index you want to keep.
index.create(overwrite=True, drop=True)

09:57:49 redisvl.index.index INFO   Index already exists, overwriting.


In [61]:
# Get info about the index with the RedisVL CLI (`rvl`). Per the output below,
# the CLI takes the Redis address from the REDIS_URL environment variable.
!rvl index info -i langchain

[32m09:57:50[0m [34m[RedisVL][0m [1;30mINFO[0m   Using Redis address from environment variable, REDIS_URL


Index Information:
╭──────────────┬────────────────┬────────────┬─────────────────┬────────────╮
│ Index Name   │ Storage Type   │ Prefixes   │ Index Options   │   Indexing │
├──────────────┼────────────────┼────────────┼─────────────────┼────────────┤
│ langchain    │ HASH           │ ['chunk']  │ []              │          1 │
╰──────────────┴────────────────┴────────────┴─────────────────┴────────────╯
Index Fields:
╭────────────────┬────────────────┬─────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────╮
│ Name           │ Attribute      │ Type    │ Field Option   │ Option Value   │ Field Option   │ Option Value   │ Field Option   │   Option Value │ Field Option    │ Option Value   │
├────────────────┼────────────────┼─────────┼────────────────┼────────────────┼──────

### Dataset Preparation (PDF Documents)

To best demonstrate Redis as a vector database layer, we will load a set of
financial documents (10-K filings for multiple companies) and preprocess them using some helpers from LangChain:

- `UnstructuredFileLoader` is not the only document loader type that LangChain provides. Docs: https://python.langchain.com/docs/integrations/document_loaders/unstructured_file
- `SentenceTransformersTokenTextSplitter` is what we use to create smaller chunks of text from the doc. Docs: https://api.python.langchain.com/en/latest/sentence_transformers/langchain_text_splitters.sentence_transformers.SentenceTransformersTokenTextSplitter.html

In [3]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings 
# Embedding model used for both ingestion and querying. Model files are cached in
# the folder named by TRANSFORMERS_CACHE, falling back to <parent_directory>/models.
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder=os.getenv("TRANSFORMERS_CACHE", f"{parent_directory}/models"))

In [96]:
from ingestion import get_sec_data
from ingestion import redis_bulk_upload 
# Load the SEC filing data through the project's ingestion helper.
# NOTE(review): the structure of the returned value is not visible here — confirm in ingestion.py.
sec_data = get_sec_data()

/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/VZ
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/AMZN
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/CAT
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/AAPL
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/PM
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/CMCSA
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/PFE
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/SPGI
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/SYK
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/MMC
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/filings/PEP
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/resources/f

In [None]:
# Upload the loaded filings to Redis through `index`, passing the embedding model.
# NOTE(review): presumably each chunk is embedded before it is written — confirm in ingestion.py.
redis_bulk_upload(sec_data, index, embeddings)

## Vector Search with LangChain
**Important Note**: The LangChain Redis vector store does not support the JSON storage type yet; it only supports HASH for now. This update should be coming soon.

In [7]:
from langchain_community.vectorstores import Redis as LangChainRedis
from utils import create_langchain_schemas_from_redis_schema

# Name of the existing Redis index that the LangChain vector store attaches to.
index_name = "langchain"

# The helper returns two schemas built from the YAML file; only main_schema is used below.
vec_schema, main_schema = create_langchain_schemas_from_redis_schema("sec_index.yaml")

# Attach to the index that already exists rather than creating a new one.
rds = LangChainRedis.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
    schema=main_schema,
)

### Query the database
Now we can use the LangChain vector store class to perform similarity search operations on Redis

In [8]:
from langchain.vectorstores.redis import RedisText
from langchain.vectorstores.redis import RedisTag

In [9]:
# Tag filter: restrict the search to documents whose "ticker" field equals "AAPL".
f = RedisTag("ticker") == "AAPL"
# Ask for up to k=4 matches within the distance threshold that also satisfy the filter.
rds.similarity_search(query="Profit How many employees work at this company???", k=4, distance_threshold=0.8, filter=f)

[Document(page_content='As of September 30, 2023 and September 24, 2022, the Company had total deferred revenue of $12.1 billion and $12.4 billion, respectively. As of September 30, 2023, the Company expects 67% of total deferred revenue to be realized in less than a year, 25% within one-to-two years, 7% within two-to-three years and 1% in greater than three years.\n\nNote 3 – Earnings Per Share\n\nThe following table shows the computation of basic and diluted earnings per share for 2023, 2022 and 2021 (net income in millions and shares in thousands):\n\n2023\n\n2022\n\n2021\n\nNumerator:\n\nNet income\n\n$\n\n96,995 $\n\n99,803 $\n\n94,680\n\nDenominator:\n\nWeighted-average basic shares outstanding Eﬀect of dilutive share-based awards Weighted-average diluted shares\n\n15,744,231 68,316 15,812,547\n\n16,215,963 109,856 16,325,819\n\n16,701,272 163,647 16,864,919\n\nBasic earnings per share Diluted earnings per share\n\n$ $\n\n6.16 $ 6.13 $\n\n6.15 $ 6.11 $\n\n5.67 5.61\n\nApproximate

In [85]:
# vector search with metadata filtering
# Text filter: the "content" field must match the term "profit".
f = RedisText("content") % "profit"
# The *_with_score variant returns (Document, score) pairs, as the output below shows.
rds.similarity_search_with_score(query="Profit margins", k=4, filter=f)

[(Document(page_content='The percentage contribution of each operating segment fluctuates over time due to net operating revenues in certain operating segments growing at a faster rate compared to other operating segments. Net operating revenue growth rates are impacted by sales volume; price, product and geographic mix; foreign currency fluctuations; and acquisitions and divestitures. For additional information about the impact of foreign currency fluctuations, refer to the heading "Liquidity, Capital Resources and Financial Position — Foreign Exchange" below, and for additional information about acquisitions and divestitures, refer to Note 2 of Notes to Consolidated Financial Statements.\n\nGross Profit Margin\n\nGross profit margin is a ratio calculated by dividing gross profit by net operating revenues. Management believes gross profit margin provides investors with useful information related to the profitability of our business prior to considering all of the operating costs incur

In [86]:
# vector search with combinations of metadata filtering
# `|` combines the two text filters with OR: "content" matches "profit" or "revenue".
f = (RedisText("content") % "profit") | (RedisText("content") % "revenue")
rds.similarity_search_with_score(query="Nike company revenue", k=4, filter=f)

[(Document(page_content='Revenues exceeded $500 million in each of 8, 10 and 10 countries outside the U.S. in 2020, 2019 and 2018, respectively. The U.S. is the only country to contribute more than 10% of total revenue in 2020, 2019 and 2018. As a percentage of revenues, our two largest national markets outside the U.S. were China, which contributed 6% of total revenue in each of 2020, 2019 and 2018, and Japan, which contributed 6% of total revenue in 2020 and 5% in each of 2019 and 2018.\n\nPfizer Inc.\n\n2020 Form 10-K\n\n105\n\n2018\n\n20,119 7,997 4,090 8,618\n\n40,825\n\nNotes to Consolidated Financial Statements Pfizer Inc. and Subsidiary Companies\n\nB. Other Revenue Information\n\nSignificant Customers\n\nWe sell our biopharmaceutical products primarily to customers in the wholesale sector.\n\nThe following summarizes revenue, as a percentage of total revenues, for our three largest U.S. wholesaler customers:\n\nYear Ended December 31,\n\n2020\n\n2019\n\nMcKesson, Inc. Amerisou

In [87]:
# filter results to a certain distance threshold
# With distance_threshold=0.5 and no filter, the saved run returned an empty list.
rds.similarity_search_with_score(query="Nike company revenue", k=4, distance_threshold=0.5)

[]

## RAG with Ollama running Llama 3 LLM

### Initialize a Llama LLM served via Ollama
To connect to a local Ollama LLM, use the LLM below. Alternatively, if you have a local OpenAI-compatible server running via vLLM, add your LLM here.

In [10]:
from langchain_community.llms import Ollama
# LLM client for the "llama3" model served by Ollama.
# NOTE(review): presumably requires a running local Ollama server with the llama3 model pulled — confirm.
llm = Ollama(model="llama3")

### Setup prompt
`PromptTemplate` defines the exact text of the prompt that is fed to the LLM. This step is optional, but the defaults usually work well for OpenAI and might fall short for other models.

In [11]:
def get_prompt():
    """Build the prompt template used by the question-answering chains.

    Returns:
        A ``PromptTemplate`` whose text instructs the model to answer from the
        supplied 10k-filing context only and to cite its source. It declares
        two input variables: ``context`` and ``question``.
    """
    from langchain.prompts import PromptTemplate

    # Prompt text; the {context} and {question} placeholders are filled in per query.
    qa_template = """Use the following pieces of context from financial 10k filings data to answer the user question at the end. Only use the result from tools and evidence provided to you. If you don't know the answer, say that you don't know, don't try to make up an answer. Provide the source of the document that you used to get the answer.

    This should be in the following format:

    Question: [question here]
    Answer: [answer here]
    Source: [source document here]

    Begin!

    Context:
    ---------
    {context}
    ---------
    Question: {question}
    Answer:"""

    return PromptTemplate(
        input_variables=["context", "question"],
        template=qa_template,
    )

### Putting it all together

This is where LangChain brings all the components together in the form of a simple RAG application over the financial PDF documents.

In [12]:
from langchain.chains import RetrievalQA

def get_search_kwargs(filters, distance_threshold):
    """Assemble the keyword arguments for a filtered, threshold-limited search.

    Args:
        filters: Filter value stored under the ``"filter"`` key (passed through unchanged).
        distance_threshold: Value stored under the ``"distance_threshold"`` key.

    Returns:
        A dict with the keys ``"distance_threshold"`` and ``"filter"``, in that order.
    """
    search_kwargs = dict(distance_threshold=distance_threshold)
    search_kwargs["filter"] = filters
    return search_kwargs
    

# Default question-answering chain: retrieves from `rds` with a distance threshold
# of 0.8 and no metadata filter, then answers with `llm` using the prompt from
# get_prompt(). return_source_documents=True keeps the retrieved documents in the
# result; verbose=True prints the chain's progress messages.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=rds.as_retriever(search_type="similarity_distance_threshold",
                               search_kwargs={"distance_threshold":0.8, 'include_metadata': True}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": get_prompt()},
    verbose=True
)

### Finally - let's ask questions!



In [13]:
query = "What was Nike's revenue last year compared to this year??"
# Older call style, kept for reference:
#res=qa(query)
#res['result']
# NOTE(review): the extra "search_kwargs" key is echoed back in the result dict
# (see the output below); presumably it does not change the retriever that was
# configured when `qa` was built — confirm.
qa.invoke({"query": query, "search_kwargs": {}})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': "What was Nike's revenue last year compared to this year??",
 'search_kwargs': {},
 'result': "There is no data related to Nike's revenue in the provided context. The context appears to be from financial 10K filings of a company, possibly Walmart or its subsidiary Sam's Club, and provides information on their primary indication or class, total revenues, and comparable sales for certain product categories. There is no mention of Nike or its products.",
 'source_documents': [Document(page_content='Sam\'s Club comparable sales increased 8.7% and 1.6% in fiscal 2021 and 2020, respectively. For fiscal 2021, Sam\'s Club comparable sales benefited from growth in transactions and average ticket resulting from the COVID-19 pandemic, partially offset by both our decision to remove tobacco from certain club locations and by lower fuel sales. Sam\'s Club comparable sales for fiscal 2020 benefited from growth in transactions and higher fuel sales, which were partially offset by lower aver

In [59]:
query = "What was Apple's revenue last year compared to this year??"
# Use invoke(): calling the chain object directly is deprecated in LangChain.
# "query" is the input key of the RetrievalQA chain.
res = qa.invoke({"query": query})
res['result']



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'A financial question!\n\nAccording to the provided 10-K filings, we can find the answer.\n\nIn the 2022 Form 10-K, it is mentioned that:\n\n"Fiscal 2022 Highlights...\nTotal net sales increased 8% or $28.5 billion during 2022 compared to 2021..."\n\nAnd in the 2021 Form 10-K (not provided), we would find the revenue figure for the previous year.\n\nSo, let\'s assume the revenue for 2021 is x.\n\nThen, the revenue for 2022 would be x + $28.5 billion (8% increase).\n\nNow, if you provide me with the numbers:\n\n21,280 (2021) and\n46,291 (2022)\n\nI can help you find the difference between last year\'s revenue and this year\'s revenue.\n\nPlease provide the numbers!'

In [44]:
query = "How many products does Nike offer? What is the industry that Nike is part of?"
# Use invoke(): calling the chain object directly is deprecated in LangChain.
# "query" is the input key of the RetrievalQA chain.
res = qa.invoke({"query": query})
res['result']



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Question: How many products does Nike offer? What is the industry that Nike is part of?\nAnswer: According to the provided context, Nike offers a wide range of products including athletic footwear, apparel, equipment, accessories, and services. The company's principal business activity is the design, development, and worldwide marketing and selling of these products.\n\nSource: 2023 Form 10-K filing with the Securities and Exchange Commission (SEC), page 1-2"

In [122]:
query = "what was revenue of Apple in 2022?"
# Use invoke(): calling the chain object directly is deprecated in LangChain.
# "query" is the input key of the RetrievalQA chain.
res = qa.invoke({"query": query})
res['result']



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Based on the provided context from Apple's 10-K filings, we can infer that the revenue for 2022 is not explicitly stated. However, we can find the net sales by reportable segment for 2023 and compare it to the same period in 2022.\n\nFrom the table, we see that:\n\n* Total net sales in 2023 was $383,285\n* Total net sales in 2022 was $394,328\n\nSo, the revenue (or net sales) of Apple in 2022 was approximately $394,328 million."

In [118]:
query = "How many employees work at Nike???"
# Use invoke(): calling the chain object directly is deprecated in LangChain.
# "query" is the input key of the RetrievalQA chain.
res = qa.invoke({"query": query})
res['result']



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'I apologize, but the provided context does not mention Nike or any information about its workforce. The context appears to be from a 10-K filing of PepsiCo, Inc.\n\nQuestion: None\nAnswer: N/A\nSource: N/A'

### Adding query analysis and hybrid search in QA chain

In [14]:
from custom_ners import get_redis_filters

/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss/multi_doc_RAG
/Users/rouzbeh.farahmand/PycharmProjects/commit/financial-vss
 ✅ Loaded doc info for  110 tickers...


In [15]:
# Plug in your own query analysis here, e.g. NER, topic detection, intent detection, semantic routing, etc.
def query_analysis(q):
    """Derive Redis filters from the text of a question.

    Args:
        q: The user's question.

    Returns:
        Whatever ``get_redis_filters(q)`` returns; the value is also printed.
    """
    redis_filters = get_redis_filters(q)
    print(redis_filters)
    return redis_filters
    

def ask_question(question,
                 filters=None,
                 distance_threshold=0.8,
                 search_type="similarity_distance_threshold"):
    """Answer a question with a RetrievalQA chain built for this call.

    Args:
        question: The user's question.
        filters: Filter passed to the retriever. When ``None``, a filter is
            derived from the question via ``query_analysis``; if that also
            yields ``None``, retrieval runs without a filter.
        distance_threshold: Distance threshold passed to the retriever.
        search_type: Search type passed to ``rds.as_retriever``.

    Returns:
        The chain's result dict, which includes the source documents
        (``return_source_documents=True``).
    """
    if filters is None:
        filters = query_analysis(question)

    # Build the retriever arguments once; the filter is added only when there is one.
    search_args = {"distance_threshold": distance_threshold,
                   "include_metadata": True}
    if filters is not None:
        search_args["filter"] = filters

    fqa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=rds.as_retriever(search_type=search_type,
                                   search_kwargs=search_args),
        return_source_documents=True,
        chain_type_kwargs={"prompt": get_prompt()},
        verbose=True
    )
    # invoke() replaces the deprecated direct call on the chain object;
    # "query" is the input key of the RetrievalQA chain.
    return fqa.invoke({"query": question})

In [1]:
# NOTE: run the cell that defines ask_question() first. The saved output below is a
# NameError from executing this cell before that definition had been run.
ask_question("what is the revenue of APPLE?")

NameError: name 'ask_question' is not defined

## Cleanup

Cleanup the index and data.

In [47]:
# Uncomment the line below to drop the index and delete its documents.
#rds.drop_index(index_name=index_name, redis_url=REDIS_URL, delete_documents=True)