# Document Retrieval

In [1]:
import os
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import MarkdownTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import pandas as pd
from tqdm import tqdm

In [2]:
pd.set_option('display.max_colwidth', None)

In [5]:
def similarity_search_retrieval(query, retriever):
    docs = retriever.similarity_search(query, k=5)
    sources = [i.metadata.get('source') for i in docs]
    return sources

In [6]:
def ensemble_retriever_retrieval(query, ensemble_retriever):
    docs = ensemble_retriever.get_relevant_documents(query)
    sources = [doc.metadata.get('source') for doc in docs]
    return sources

In [7]:
# Load documents 
loader = DirectoryLoader('../data/external', glob="**/*.md", loader_cls=TextLoader)
documents = loader.load()
text_splitter = MarkdownTextSplitter(chunk_overlap=0)
texts = text_splitter.split_documents(documents)

text_content = [doc.page_content if hasattr(doc, 'page_content') else doc.content for doc in texts]

In [9]:
# Embeddings
embeddings = HuggingFaceEmbeddings()

In [10]:
# create Chroma vector store
chroma_vectorstore = Chroma.from_documents(texts, embeddings)

In [11]:
# Create BM25 retriever
bm25_retriever = BM25Retriever.from_texts(text_content)
bm25_retriever.k = 5  

In [12]:
# Create Chroma retriever
chroma_retriever = chroma_vectorstore.as_retriever(search_kwargs={"k": 6})

In [13]:
# Create ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever],
    weights=[0.5, 0.5]  #weights
)

## Getting query from df

In [27]:
df = pd.read_csv('final-ds.csv')[['Question','Source']]
df = df[:1000]

In [28]:
# Iterate through rows in the DataFrame and retrieve sources
for index, row in tqdm(df.iterrows()):
    question = row['Question']

    # Retrieve sources using similarity search
    sources_similarity_search = similarity_search_retrieval(question, chroma_vectorstore)

    # Retrieve sources using ensemble retriever
    sources_ensemble_retriever = ensemble_retriever_retrieval(question, ensemble_retriever)

    # Update the DataFrame with the new columns
    df.at[index, 'source_similarity_search'] = ', '.join(map(str, sources_similarity_search))
    df.at[index, 'source_ensemble_retriever'] = ', '.join(map(str, sources_ensemble_retriever))


1000it [03:23,  4.92it/s]


In [29]:
df = df[['Question','Source','source_similarity_search','source_ensemble_retriever']]

In [30]:
df.head()

Unnamed: 0,Question,Source,source_similarity_search,source_ensemble_retriever
0,How do I install the Knative CLI? \n,../data/processed/dataset-qa/rosa-docs-serverless_35.json,"../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md","None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md"
1,How do I use the Knative CLI to describe a subscription?\n,../data/processed/dataset-qa/rosa-docs-serverless_35.json,"../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md","../data/external/rosa-docs/serverless.md, None, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md"
2,How do I use the Knative CLI to list existing subscriptions?\n,../data/processed/dataset-qa/rosa-docs-serverless_35.json,"../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md","../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, None, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md"
3,How do I update a subscription using the Knative CLI?\n,../data/processed/dataset-qa/rosa-docs-serverless_35.json,"../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md","../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, None, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md"
4,What parameters can I configure for event delivery? \n,../data/processed/dataset-qa/rosa-docs-serverless_35.json,"../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md","None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, None, ../data/external/rosa-docs/serverless.md, ../data/external/rosa-docs/serverless.md"
