In [4]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
os.environ['LANGCHAIN_TRACING_V2']='true'
os.environ['LANGCHAIN_API_KEY']=os.getenv('LANGCHAIN_API_KEY')

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

In [7]:
import time

index_name = "hybrid"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [8]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [11]:
from langchain_community.document_loaders import PyPDFLoader
pdf = 'books/chemistry/ch2.pdf'

loader = PyPDFLoader(pdf)
doc = loader.load()
doc

[Document(metadata={'source': 'books/chemistry/ch2.pdf', 'page': 0}, page_content="2.CLASSIFICATION OF COMPOUNDS \n \n1 \n SANKALP GROUP OF SCHOOLS \n  \nCLASSIFICATION AND NAMING OF CHEMICAL COMPOUNDS \nUnderstanding the Categories of Compounds in Chemistry \nIn the vast domain of chemistry, there are numerous compounds, each having unique properties and \nbehavior. Studying each one individually would be an overwhelming task, but thankfully, these \nchemical compounds can be classified into certain groups. This means that by knowing the category \nto which a compound belongs, we can quickly understand its basic properties. \nFor instance, consider hydrochloric acid ( HCl). This compound falls under the category of acids. So, \nby understanding the general characteristics and behaviors of acids, we can predict those of HCl. \nPrimarily, we can categorize most compounds into five main groups: \n1. Acids \n2. Bases \n3. Salts \n4. Metallic Oxides \n5. Non-metallic Oxides \nThe first thr

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Initialize the splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,  # Max token size per chunk
    chunk_overlap=500,  # Overlap for context retention
    separators=["\n\n", "\n", ".", " "]  # Split by paragraphs, then sentences
)

# Apply to text
chunks = splitter.split_documents(doc)
chunks

[Document(metadata={'source': 'books/chemistry/ch2.pdf', 'page': 0}, page_content="2.CLASSIFICATION OF COMPOUNDS \n \n1 \n SANKALP GROUP OF SCHOOLS \n  \nCLASSIFICATION AND NAMING OF CHEMICAL COMPOUNDS \nUnderstanding the Categories of Compounds in Chemistry \nIn the vast domain of chemistry, there are numerous compounds, each having unique properties and \nbehavior. Studying each one individually would be an overwhelming task, but thankfully, these \nchemical compounds can be classified into certain groups. This means that by knowing the category \nto which a compound belongs, we can quickly understand its basic properties. \nFor instance, consider hydrochloric acid ( HCl). This compound falls under the category of acids. So, \nby understanding the general characteristics and behaviors of acids, we can predict those of HCl. \nPrimarily, we can categorize most compounds into five main groups: \n1. Acids \n2. Bases \n3. Salts \n4. Metallic Oxides \n5. Non-metallic Oxides \nThe first thr

In [13]:
chunk_texts = [chunk.page_content for chunk in chunks]
vector_embeddings = embeddings.embed_documents(chunk_texts)


In [14]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [20]:
# Load BM25Retriever from a file
import pickle
with open("bm25_model.pkl", "rb") as f:
    bm25_retriever = pickle.load(f)

print("BM25 retriever loaded.")


BM25 retriever loaded.


from langchain.retrievers import BM25Retriever

# Load the BM25 retriever
bm25_retriever=BM25Retriever.from_texts(chunk_texts)
bm25_retriever

import pickle

# Save BM25Retriever to a file
with open("bm25_model.pkl", "wb") as f:
    pickle.dump(bm25_retriever, f)

print("BM25 retriever saved.")

In [21]:
from langchain.vectorstores import Pinecone
name = 'chem' # agent to find the correct namespace from the index
vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings,namespace=name)

from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(chunks))]

vector_store.add_documents(documents=chunks, ids=uuids)

['08b9fb35-e2ce-459f-8358-083909670626',
 '15a08bd2-40fe-44f9-bad1-22d2672f164d',
 '157dfe5a-8d98-4a48-8918-44d4cc56b199',
 'e5270e63-7999-46aa-b366-2ac51260bc60',
 'ff2f3b2d-9397-4b54-a6a1-6410a033cc76',
 '10d63e3b-d232-4d64-a437-901bb20be8ab',
 'e70ab342-b7c4-4304-acee-fa435394411b',
 'f362fbde-26e3-4a23-859d-a8f95d4b5bd4',
 'd1f5a76f-23e1-401c-823f-148924508d8e']