In [7]:
#Building and Implementing Pinecone Vector Databases
#https://www.analyticsvidhya.com/blog/2024/06/pinecone-vector-databases/

In [8]:
#!pip install pinecone langchain langchain_pinecone langchain-openai langchain-community pypdf python-dotenv
import os
import time

import pinecone
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter # To split the text into smaller chunks
from langchain_community.document_loaders import DirectoryLoader # To load files in a directory
from langchain_community.document_loaders import PyPDFLoader # To parse the PDFs
from langchain_openai import OpenAIEmbeddings # To create embeddings
from langchain_pinecone import PineconeVectorStore # To connect with the Vectorstore
from pinecone import ServerlessSpec
from pinecone import Pinecone, ServerlessSpec

In [9]:
#3, Environment Setup
# Load API keys from the process environment, or from a local .env file that is
# kept out of version control, instead of writing them into the notebook.
# Expected entries:
#   OPENAI_API_KEY=<your OpenAI API key>
#   PINECONE_API_KEY=<your Pinecone API key>
load_dotenv()  # variables that are already set in the environment are left untouched

REQUIRED_API_KEYS = ("OPENAI_API_KEY", "PINECONE_API_KEY")
missing_api_keys = [key for key in REQUIRED_API_KEYS if not os.environ.get(key)]
if missing_api_keys:
    # Fail here with a clear message rather than later inside an API client.
    raise EnvironmentError(
        f"Missing required API keys: {', '.join(missing_api_keys)}. "
        "Set them in the environment or in a .env file next to this notebook."
    )

In [10]:
#4, Pinecone Configuration
# Name of the index to work with: either a fresh name (as here) or the name of
# an index that was created previously.
index_name = "pinecone-index-test1"

# Take the Pinecone API key from the PINECONE_API_KEY environment variable
# rather than embedding the literal key in the notebook, so the secret is never
# saved or shared with the file. A KeyError here means the variable is not set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pc

<pinecone.control.pinecone.Pinecone at 0x1c27f3074c0>

In [11]:
#Index Creation or Loading
# Create the index only when it does not exist yet, so that re-running this
# cell (or the whole notebook) reuses the existing index instead of failing.
index_is_new = index_name not in pc.list_indexes().names()
if index_is_new:
    pc.create_index(
        name=index_name,
        dimension=1536,  # Replace with your model dimensions
        metric="cosine",  # Replace with your model metric
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Poll once per second until Pinecone reports the index as ready.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)
print("index created" if index_is_new else "index already exists")
print(index.describe_index_stats())

index created
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [12]:
#5: Data Preparation and Loading for Vector Database Ingestion
#Setting Key Parameters

In [13]:
# Ingestion settings used by the loading, chunking and vector-store cells.
INDEX_NAME = index_name  # Pinecone index that receives the vectors

# Directory scanned for PDF files (alternative location: "/content/drive/MyDrive/Data")
DATA_DIR_PATH = "./documents"

# Chunking parameters handed to the text splitter
CHUNK_SIZE = 1024  # size of each text chunk
CHUNK_OVERLAP = 0  # overlap between consecutive chunks

In [14]:
#Loading PDF Documents
#To load our PDF files, we’ll use LangChain’s DirectoryLoader in conjunction with the PyPDFLoader. 
# This combination allows us to efficiently process multiple PDF files from a specified directory.

In [15]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Collect the loader settings first, then build the loader from them.
pdf_loader_options = {
    "path": DATA_DIR_PATH,       # directory that holds the PDFs
    "glob": "**/*.pdf",          # match PDF files, subdirectories included
    "loader_cls": PyPDFLoader,   # parse every matched file as a PDF
}
loader = DirectoryLoader(**pdf_loader_options)

# Load every matching PDF file and report how many Document objects came back.
docs = loader.load()
print(f"Total Documents loaded: {len(docs)}")

Total Documents loaded: 2


In [16]:
# Display the first loaded Document (its metadata and page_content) as the cell output.
docs[0]

Document(metadata={'source': 'documents\\AILead.pdf', 'page': 0}, page_content='AI Lead  \nWhat You Will Be Doing  \nAs a key member of a leading provider of technology -enabled revenue cycle \nmanagement solutions for health systems, you will be responsible for building AI and \nMachine learning models and pipelines, with a focus on generative AI, LLMs, and \npredictive modeli ng. Your role will involve collaborating closely with business owners to \nunderstand their data requirements and product specifications, enabling them to make \ninformed data -related decisions and product design choices.  \n Additionally, you will work alongside data scientists, software engineers, and \ndevelopers to deliver innovative solutions. Defining strategic priorities for AI and GenAI \ndevelopment across the company and educating both technical and business teams on \nthe latest AI advancements will be crucial aspects of your role. You may also need to \nquickly adapt to new tools and technologies, s

In [17]:
# Show which class the loader uses to represent each loaded item.
type(docs[0])

langchain_core.documents.base.Document

In [18]:
# A Document can be turned into a plain python dict with .dict();
# list the keys of that dict for the first document.
document_keys = docs[0].dict().keys()
print(f"keys associated with a Document: {document_keys}")

keys associated with a Document: dict_keys(['id', 'metadata', 'page_content', 'type'])


In [19]:
# Peek at the second loaded document: a content preview, its metadata and its type,
# each followed by a divider line.
divider = "-" * 15
sample_doc = docs[1]
content_preview = sample_doc.page_content[:100]
print(f"{divider}\nFirst 100 charachters of the page content: {content_preview}\n{divider}")
print(f"Metadata associated with the document: {sample_doc.metadata}\n{divider}")
print(f"Datatype of the document: {sample_doc.type}\n{divider}")

---------------
First 100 charachters of the page content: • Experience with LLM -based pipelines and retrieval augmented generation 
techniques is a plus . 
•
---------------
Metadata associated with the document: {'source': 'documents\\AILead.pdf', 'page': 1}
---------------
Datatype of the document: Document
---------------


In [20]:
# Loop through each document and rebuild its metadata as filename, source and page.
for doc in docs:
    source_path = doc.metadata["source"]
    # Take the last path component whichever separator the loader produced
    # ("\\" on Windows, "/" elsewhere), so the filename is correct on any OS.
    filename = source_path.replace("\\", "/").rsplit("/", 1)[-1]
    doc.metadata = {"filename": filename, "source": source_path, "page": doc.metadata["page"]}

# To verify that the metadata is indeed added to the documents
print(f"Metadata associated with the document: {docs[0].metadata}\n{'-'*15}")
print(f"Metadata associated with the document: {docs[1].metadata}\n{'-'*15}")

Metadata associated with the document: {'filename': 'AILead.pdf', 'source': 'documents\\AILead.pdf', 'page': 0}
---------------
Metadata associated with the document: {'filename': 'AILead.pdf', 'source': 'documents\\AILead.pdf', 'page': 1}
---------------


In [21]:
# Print the metadata of every loaded document, one divider line after each.
divider = "-" * 15
for loaded_doc in docs:
    print(f"Metadata associated with the document: {loaded_doc.metadata}\n{divider}")

Metadata associated with the document: {'filename': 'AILead.pdf', 'source': 'documents\\AILead.pdf', 'page': 0}
---------------
Metadata associated with the document: {'filename': 'AILead.pdf', 'source': 'documents\\AILead.pdf', 'page': 1}
---------------


In [22]:
#6: Optimizing Data for Vector Databases
# Recursive Character Chunking, a method that balances efficiency with content coherence.
# The chunk parameters come from the configuration constants rather than being
# repeated as literals, so a change to CHUNK_SIZE / CHUNK_OVERLAP applies everywhere.
# RecursiveCharacterTextSplitter is already imported in the imports cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
documents = text_splitter.split_documents(docs)

In [23]:
# Break the loaded documents into chunks using the configured size and overlap,
# then show how many documents went in and how many chunks came out.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
documents = text_splitter.split_documents(docs)
(len(docs), len(documents))

(2, 5)

In [24]:
#Step7: Embedding and Vector Store Creation
# Name the embedding model once, then initialize the OpenAI embedding client with it.
# NOTE(review): the index was created with dimension=1536 - confirm this model's
# embedding size matches before switching to a different model.
embedding_model_name = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=embedding_model_name)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001C20C00F1F0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001C20C02C790>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [25]:
# Set to "Y" when the vectors were already added to the index in an earlier run,
# or to "N" to embed and add the chunks now.
docs_already_in_pinecone = "N" #input("Are the vectors already added in DB: N")

# Normalize once so "y"/"Y" and "n"/"N" (with stray whitespace) are treated alike.
already_ingested = docs_already_in_pinecone.strip().upper()

if already_ingested == "Y":
    # The documents are already in the vector database: just connect to the index.
    docsearch = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    print("Existing Vectorstore is loaded")
elif already_ingested == "N":
    # Otherwise add the document chunks to the vector database.
    docsearch = PineconeVectorStore.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("New vectorstore is created and loaded")
else:
    # Stop here: continuing would leave `docsearch` undefined and make the
    # retrieval cell fail with a less helpful NameError.
    raise ValueError("Please type Y - for yes and N - for no")

New vectorstore is created and loaded


In [29]:
#Using the Vector Store for Retrieval
# Expose the loaded vector store as a retriever, then run a sample question through it;
# the matching chunks are shown as the cell output.
retriver = docsearch.as_retriever()
sample_question = "what does the team looks like?"
retriver.invoke(sample_question)

[Document(metadata={'filename': 'AILead.pdf', 'page': 1.0, 'source': 'documents\\AILead.pdf'}, page_content='and machine learning.'),
 Document(metadata={'filename': 'AILead.pdf', 'page': 0.0, 'source': 'documents\\AILead.pdf'}, page_content='experienc e. \n• Proficiency with Machine Learning ecosystem tools such as PyTorch/Tensorflow, \nscikit-learn, xgboost . \n• Ability to research and implement solutions based on novel algorithms .'),
 Document(metadata={'filename': 'AILead.pdf', 'page': 1.0, 'source': 'documents\\AILead.pdf'}, page_content='• Experience with LLM -based pipelines and retrieval augmented generation \ntechniques is a plus . \n• Familiarity with tools like langchain/llamaindex/haystack/Azure AI studio . \n• Proficiency in SQL, Azure Data Factory, or similar for database handling . \n• Knowledge of Software Development best practices and MLOps fundamentals . \n• Ability to work independently and in a fast -paced team environment . \n• Excellent written and verbal commu