In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import chromadb
from chromadb.config import Settings
from pypdf import PdfReader
from typing import List
from langchain.schema import Document

In [3]:
import os

***Fill in your OpenAI API key here***

In [4]:
# Keep a key that is already exported in the environment; the empty string is
# only a placeholder fallback. Paste a key between the quotes for local runs
# and never commit it.
os.environ.setdefault("OPENAI_API_KEY", "")

## Data Ingestion
Let's ingest some sample documents, embed them, and store them in a collection.

In [5]:
# Open the sample PDF and collect the extracted text of each page, in page order.
pdf = PdfReader("sample_dataset/Resume.pdf")
pages = [pdf_page.extract_text() for pdf_page in pdf.pages]

## Convert text into Document

In [6]:
def text_to_document(text: str | List[str]) -> List[Document]:
    """Split raw text into chunked ``Document`` objects.

    Args:
        text: A single string or a list of strings. A single string is
            wrapped into a one-element list; each list element is treated
            as one page.

    Returns:
        A flat list of ``Document`` objects ordered by page, then by chunk.
        Each document's metadata contains:

        * ``page``   - zero-based index of the string within ``text``.
        * ``chunk``  - zero-based index of the chunk within that page.
        * ``source`` - the string ``"<page>-<chunk>"``.
    """
    if isinstance(text, str):
        text = [text]

    # The splitter is configured identically for every page, so build it once
    # instead of once per loop iteration.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        separators=["\n", "\r\n", "\r"],
        chunk_overlap=0,
    )

    doc_chunks = []
    for i, doc in enumerate(text):
        for j, chunk in enumerate(text_splitter.split_text(doc)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": i,
                        # Position of the chunk inside its page; this used to
                        # repeat the page index by mistake.
                        "chunk": j,
                        "source": f"{i}-{j}",
                    },
                )
            )
    return doc_chunks

In [7]:
# Chunk every extracted page, then echo the resulting list of Documents.
doc_chunks = text_to_document(text=pages)
print(doc_chunks)

[Document(page_content="+918985086974\nranjeetkumar692@gmail.com\nLinkedIn , GithubC O N T A C T P R O F I L E\nS K I L L S\nE D U C A T I O NW O R K  E X P E R I E N C E\nProgramming/Frameworks\nNodeJs, Python, Javascript, Java,\nAWSMySQL, PostgreSQL, MongoDB,\nCassandra, Bigquery, AWS Athena, Redis,\nElasticsearch  \nMaster of Computer Applications\nUniversity of Hyderabad, Hyderabad\n2012-2015\nB.Sc. (Information Technology)\nSt. Xavier's College, Ranchi\n2008-2011Sr Product Architect\nSummit Publishing Co, Inc  (Remote)\nRecommendation System:  Developed a recommendation system that can be\neasily integrated with Summit Media's brand websites and recommend articles\nfor further reading to the readers. It was a javascript widget using NodeJS-\nbased backend and ML services.\nAWS Cost Optimization:  Optimized datalake and data warehouse usages to\nreduce data product infrastructure costs up to $6000 monthly.    \nUnified Identity Solution:  a javascript and python-based project to id

## Embed and Store Document (ChromaDB)
Once the PDF data has been parsed successfully, we embed it and store it in ChromaDB.

In [7]:
# Embed every chunk and store the vectors in the "test_collection" collection,
# using ./test as the persistence directory.
vector_store = Chroma.from_documents(
    documents=doc_chunks,
    embedding=HuggingFaceEmbeddings(),
    persist_directory="./test",
    collection_name="test_collection",
)
# Flush the collection to disk explicitly. Depending on the chromadb version,
# data is otherwise only written when the store object is garbage-collected,
# so a separate Chroma instance opened on ./test in the same kernel could find
# the collection empty.
vector_store.persist()

In [8]:
# Chroma client configured with persist_directory "./test" (the setting is
# optional and defaults to .chromadb/ in the current directory).
client = chromadb.Client(Settings(persist_directory="./test"))

In [10]:
# Open "test_collection" from ./test as a vector store, embedding queries with
# a HuggingFaceEmbeddings instance.
index = Chroma(
    collection_name="test_collection",
    persist_directory="./test",
    embedding_function=HuggingFaceEmbeddings(),
)

In [12]:
# Retrieval QA chain: chunks retrieved from `index` are passed to the OpenAI
# LLM with the "stuff" chain type, and the source documents are returned
# alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=index.as_retriever(),
    verbose=True,
    return_source_documents=True,
)

In [13]:
# Ask the chain about the candidate's skills.
qa({"query": "What are the skills of Ranjeet"})



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m


{'query': 'What are the skills of Ranjeet',
 'result': " Ranjeet's skills include programming/frameworks such as NodeJs, Python, Javascript, Java, AWS, MySQL, PostgreSQL, MongoDB, Cassandra, Bigquery, AWS Athena, Redis, Elasticsearch, HTML, CSS, ReactJs, AngularJs, Shell Script, Cloud, Git, JIRA, Redmine, Docker, Google Analytics, Mixpanel, Looker Studio, Camunda BPM, Keycloak, Microservices, Linux, REST APIs, Serverless, Coaching, and Agile Methodology.",
 'source_documents': [Document(page_content="+918985086974\nranjeetkumar692@gmail.com\nLinkedIn , GithubC O N T A C T P R O F I L E\nS K I L L S\nE D U C A T I O NW O R K  E X P E R I E N C E\nProgramming/Frameworks\nNodeJs, Python, Javascript, Java,\nAWSMySQL, PostgreSQL, MongoDB,\nCassandra, Bigquery, AWS Athena, Redis,\nElasticsearch  \nMaster of Computer Applications\nUniversity of Hyderabad, Hyderabad\n2012-2015\nB.Sc. (Information Technology)\nSt. Xavier's College, Ranchi\n2008-2011Sr Product Architect\nSummit Publishing Co, In

In [15]:
# Ask the chain about total years of experience.
qa({"query": "How many year's of experience Ranjeet has?"})



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m


{'query': "How many year's of experience Ranjeet has?",
 'result': ' Ranjeet has 8 years of experience.',
 'source_documents': [Document(page_content="+918985086974\nranjeetkumar692@gmail.com\nLinkedIn , GithubC O N T A C T P R O F I L E\nS K I L L S\nE D U C A T I O NW O R K  E X P E R I E N C E\nProgramming/Frameworks\nNodeJs, Python, Javascript, Java,\nAWSMySQL, PostgreSQL, MongoDB,\nCassandra, Bigquery, AWS Athena, Redis,\nElasticsearch  \nMaster of Computer Applications\nUniversity of Hyderabad, Hyderabad\n2012-2015\nB.Sc. (Information Technology)\nSt. Xavier's College, Ranchi\n2008-2011Sr Product Architect\nSummit Publishing Co, Inc  (Remote)\nRecommendation System:  Developed a recommendation system that can be\neasily integrated with Summit Media's brand websites and recommend articles\nfor further reading to the readers. It was a javascript widget using NodeJS-\nbased backend and ML services.\nAWS Cost Optimization:  Optimized datalake and data warehouse usages to\nreduce data

In [16]:
# Ask the chain for the list of past employers.
qa({"query": "Can you list down all organizations, where Ranjeet worked in past"})



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m


{'query': 'Can you list down all organizations, where Ranjeet worked in past',
 'result': ' Ranjeet worked at Summit Publishing Co, Inc, Neo Script Technologies Pvt Ltd and Cloudboost.io.',
 'source_documents': [Document(page_content="+918985086974\nranjeetkumar692@gmail.com\nLinkedIn , GithubC O N T A C T P R O F I L E\nS K I L L S\nE D U C A T I O NW O R K  E X P E R I E N C E\nProgramming/Frameworks\nNodeJs, Python, Javascript, Java,\nAWSMySQL, PostgreSQL, MongoDB,\nCassandra, Bigquery, AWS Athena, Redis,\nElasticsearch  \nMaster of Computer Applications\nUniversity of Hyderabad, Hyderabad\n2012-2015\nB.Sc. (Information Technology)\nSt. Xavier's College, Ranchi\n2008-2011Sr Product Architect\nSummit Publishing Co, Inc  (Remote)\nRecommendation System:  Developed a recommendation system that can be\neasily integrated with Summit Media's brand websites and recommend articles\nfor further reading to the readers. It was a javascript widget using NodeJS-\nbased backend and ML services.\n

In [14]:
# Ask the chain about education qualifications.
qa({"query": "List down education qualifications of Ranjeet"})



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m


{'query': 'List down education qualifications of Ranjeet',
 'result': " Ranjeet has a Master of Computer Applications from University of Hyderabad and a B.Sc. (Information Technology) from St. Xavier's College, Ranchi.",
 'source_documents': [Document(page_content="+918985086974\nranjeetkumar692@gmail.com\nLinkedIn , GithubC O N T A C T P R O F I L E\nS K I L L S\nE D U C A T I O NW O R K  E X P E R I E N C E\nProgramming/Frameworks\nNodeJs, Python, Javascript, Java,\nAWSMySQL, PostgreSQL, MongoDB,\nCassandra, Bigquery, AWS Athena, Redis,\nElasticsearch  \nMaster of Computer Applications\nUniversity of Hyderabad, Hyderabad\n2012-2015\nB.Sc. (Information Technology)\nSt. Xavier's College, Ranchi\n2008-2011Sr Product Architect\nSummit Publishing Co, Inc  (Remote)\nRecommendation System:  Developed a recommendation system that can be\neasily integrated with Summit Media's brand websites and recommend articles\nfor further reading to the readers. It was a javascript widget using NodeJS-\nb