<img src="img/retrieval.png" width="700" height="500">

### Maximum Marginal Relevance

<img src="img/mmr.png" width="600" height="700">

In [1]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

In [2]:
# Embeddings
embed = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-base')

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


In [3]:
# vector store chroma db
persist_dir = 'chromadb_data'

vectordb = Chroma(
    embedding_function=embed,
    persist_directory=persist_dir
)

In [4]:
vectordb._collection.count()

84

In [5]:
question = "work experience"
response_mmr = vectordb.max_marginal_relevance_search(query=question, k=2, fetch_k=3)
response_mmr

[Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Nitesh_Sidhu_Deloitte.pdf'}, page_content='WORK EXPERIENCE  \nCONSULTANT – Deloitte Consulting India Pvt. Ltd. / Gurugram                                          Jun 2021 – present'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Nitesh_Sidhu_Deloitte.pdf'}, page_content='SUMMARY  \nData Engineer having 4+ years of experience with a track record in designing, implementing, and maintaining data infrastructure')]

In [6]:
response_ss = vectordb.similarity_search(query=question, k=2)
response_ss

[Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Nitesh_Sidhu_Deloitte.pdf'}, page_content='WORK EXPERIENCE  \nCONSULTANT – Deloitte Consulting India Pvt. Ltd. / Gurugram                                          Jun 2021 – present'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/vanya_dataengineer.pdf'}, page_content="Experience\nDELOITTE & TOUCHE, LLP's | Gurugram, Haryana\nData Engineer | 03/2021 - Present")]

### LLM Aided Retreival aka Self Query

<img src="img/self_query.png" width="600" height="700">

In [7]:
from langchain.llms import HuggingFaceHub
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

In [8]:
# Embeddings
embed = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-base')

load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
# vector store chroma db
persist_dir = 'chromadb_data'

vectordb = Chroma(
    embedding_function=embed,
    persist_directory=persist_dir
)

In [38]:
# llm model from hugging face
llm = HuggingFaceHub(repo_id='google/flan-t5-large')

In [34]:
# filePaths = ['C:/Users/Ritesh_Patel/Documents/resume/Kaustubh_Bhairi_Resume.pdf',
#             'C:/Users/Ritesh_Patel/Documents/resume/Naukri_KushagraPandey[5y_0m].pdf',
#             'C:/Users/Ritesh_Patel/Documents/resume/Nitesh_Sidhu_Deloitte.pdf',
#             'C:/Users/Ritesh_Patel/Documents/resume/vanya_dataengineer.pdf']

metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The resume the chunk is from, should be one of `C:/Users/Ritesh_Patel/Documents/resume/Kaustubh_Bhairi_Resume.pdf`, `C:/Users/Ritesh_Patel/Documents/resume/Naukri_KushagraPandey[5y_0m].pdf`, `C:/Users/Ritesh_Patel/Documents/resume/Nitesh_Sidhu_Deloitte.pdf`,or `C:/Users/Ritesh_Patel/Documents/resume/vanya_dataengineer.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the resume",
        type="integer",
    ),
]

In [39]:
document_content_description = "resume"
retreiver = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [40]:
import json
question = "How many years of experience Kaustubh Bhairi has?"
response = retreiver.get_relevant_documents(question)

OutputParserException: Parsing text
json  "query": "years of experience" 
 raised following error:
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)

### Compression

<img src="img/compresssion.png" width="700" height="600">

In [41]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [43]:
# Wrap our vectorstore
llm = HuggingFaceHub(repo_id='google/flan-t5-small')
compressor = LLMChainExtractor.from_llm(llm)

In [44]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [46]:
question = "How many years of experience Kaustubh Bhairi has?"
compressed_docs = compression_retriever.get_relevant_documents(question)

In [47]:
compressed_docs

[Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Kaustubh_Bhairi_Resume.pdf'}, page_content='KAUSTUBH BHAIRI Maharashtra, India (+91'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Naukri_KushagraPandey[5y_0m].pdf'}, page_content='5'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/vanya_dataengineer.pdf'}, page_content='vsvanyasharma82@gmail.com Summary Experienced Data Engineer with over'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/vanya_dataengineer.pdf'}, page_content='No_outpUT')]

In [48]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

In [49]:
question = "How many years of experience Kaustubh Bhairi has?"
compressed_docs = compression_retriever.get_relevant_documents(question)

In [50]:
compressed_docs

[Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Kaustubh_Bhairi_Resume.pdf'}, page_content='KAUSTUBH BHAIRI Maharashtra, India (+91'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Kaustubh_Bhairi_Resume.pdf'}, page_content='No_output'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/Kaustubh_Bhairi_Resume.pdf'}, page_content='Data Engineer with two years of experience in maintaining and building ETL proce sses.'),
 Document(metadata={'page': 0, 'source': 'C:/Users/Ritesh_Patel/Documents/resume/vanya_dataengineer.pdf'}, page_content='No_output')]