In [None]:
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Pull variables from a local .env file into the process environment, then
# read the Pinecone credential from there (never hard-code it in the notebook).
load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")

# Fail fast with an actionable message instead of letting the client reject
# a missing/empty key further down.
if not api_key:
    raise ValueError("PINECONE_API_KEY is not found in the .env file")

# Build the client that the cells below use as `pc`.
try:
    pc = Pinecone(api_key=api_key)
except Exception as e:
    # Chain explicitly so the root cause is reported as the direct cause, and
    # separate the underlying message from our own text.
    raise RuntimeError(f"Failed to initialize the pinecone client: {e}") from e

In [22]:
index_name = "first-pc-rag"

# The vectors stored in this index come from sentence-transformers/all-MiniLM-L6-v2,
# which emits 384-dimensional embeddings; the index dimension has to match.
embedding_dimension = 384

if index_name in pc.list_indexes().names():
    print("Index already exists")
else:
    # Previously this branch only printed a message and the notebook carried on
    # with an index that was not there. Create it so the later cells can run.
    print("Index does not exist, creating it")
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        # NOTE(review): metric, cloud and region are assumed defaults --
        # confirm they match your Pinecone project before relying on them.
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# connect to the index
index = pc.Index(index_name)

# Read the policy text from disk (decoded as UTF-8) into document objects.
loader = TextLoader("policies.txt", encoding="utf-8")
documents = loader.load()
document_count = len(documents)
print(f"Loaded {document_count} document(s)")

# Cut the loaded document(s) into overlapping pieces (chunk_size=500,
# chunk_overlap=50) so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)
chunk_count = len(chunks)
print(f"split into {chunk_count} chunks")

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed all chunks with a single batched call. embed_documents is the
# document-side API; embed_query is meant for one search string at a time and
# would cost a separate model invocation per chunk.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_embeddings = embeddings_model.embed_documents(chunk_texts)

# One upsert record per chunk: a positional id, the embedding, and metadata
# carrying the original text (so query results are readable) plus its source
# path, falling back to "policies.txt" when the loader recorded none.
vectors = [
    {
        "id": f"chunk-{i}",
        "values": embedding,
        "metadata": {
            "text": chunk.page_content,
            "source": chunk.metadata.get("source", "policies.txt"),
        },
    }
    for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings))
]







Index already exists
Loaded 1 document(s)
split into 8 chunks


In [23]:
# upserting into pinecone
# Send the records in fixed-size batches instead of one oversized request.
batch_size = 100
total_batches = (len(vectors) + batch_size - 1) // batch_size  # ceiling division
for batch_number, start in enumerate(range(0, len(vectors), batch_size), start=1):
    # The slice end must be relative to `start`. The previous `1 + batch_size`
    # end was a constant, so every batch after the first was wrong: with
    # batch_size=100 the second batch was vectors[100:101], a single record.
    batch = vectors[start:start + batch_size]
    index.upsert(vectors=batch)
    print(f"Upserted batch {batch_number}/{total_batches}")
print(f"successfully upserted {len(vectors)} vectors")

Upserted batch 1/1
successfully upserted 8 vectors


In [25]:
# Embed the question with the same model that embedded the chunks, so the
# query vector and the stored vectors are comparable.
query = "what is the name of the company"
query_vector = embeddings_model.embed_query(query)

# Ask the index for the three nearest chunks, including their stored metadata.
results = index.query(vector=query_vector, top_k=3, include_metadata=True)

# Report each hit's score followed by the first 200 characters of its text.
matches = results['matches']
if not matches:
    print("No matches found")
else:
    for hit in matches:
        print(f"score: {hit['score']:.4f}")
        snippet = hit['metadata']['text'][:200]
        print(f"Text: {snippet}...\n")

score: 0.3785
Text: THe name of our comapny, DelishDine Restaurants Ltd. – Company Policy Manual
1. Introduction
Welcome to DelishDine Restaurants Ltd.
This policy manual outlines the standards, rules, and expectations t...

score: 0.3168
Text: 11. Confidentiality
All recipes, operational procedures, and customer information are proprietary. Sharing company secrets with outside parties is strictly prohibited....

score: 0.1307
Text: Maintain confidentiality regarding customer information.
8. Health and Safety
Follow all safety procedures for kitchen equipment and cleaning materials.
Report accidents or injuries immediately to a s...

