In [1]:
from pinecone import Pinecone, ServerlessSpec
import os

from dotenv import load_dotenv, find_dotenv

# Load environment variables from a .env file if it exists
_ = load_dotenv(find_dotenv())

# Retrieve the Pinecone API key from the environment variables
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Initialize the Pinecone client with the retrieved API key
pc = Pinecone(api_key=pinecone_api_key)

  from tqdm.autonotebook import tqdm


In [2]:
import time

# Define the name of the Pinecone index
index_name = "vidavox-tech-test" 

# Retrieve the list of existing indexes from the Pinecone client
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Check if the desired index already exists
if index_name not in existing_indexes:
    # Create a new Pinecone index with the specified name, dimension, and metric
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Wait until the index is ready
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Retrieve the created or existing index
index = pc.Index(index_name)

In [3]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the PDF
pdf_path = "../synthetics-dataset.pdf"
loader = PyPDFLoader(pdf_path)

# Extract text from PDF
documents = loader.load()

# Combine text from all pages
pdf_text = "\n".join([doc.page_content for doc in documents])

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust chunk size based on your needs
    chunk_overlap=100,  # Overlap ensures context is maintained
    length_function=len,
    separators=["\n\n", "\n", " "]  # Split text based on these separators
)

# Split text into chunks
chunks = text_splitter.split_text(pdf_text)


In [11]:
# Import the PineconeVectorStore class from langchain_pinecone
from langchain_pinecone import PineconeVectorStore

# Initialize the GoogleGenerativeAIEmbeddings with the specified model
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Create a PineconeVectorStore instance using the previously created Pinecone index and the embeddings
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [12]:
# Import the uuid4 function from the uuid module to generate unique identifiers
from uuid import uuid4

# Generate a list of unique identifiers (UUIDs) for each chunk of text
# The length of the list is equal to the number of chunks
uids = [str(uuid4()) for _ in range(len(chunks))]

# Add the chunks of text to the Pinecone vector store
# Each chunk is associated with a unique identifier from the uids list
vector_store.add_texts(chunks, ids=uids)

['2171cd93-3086-457a-9af0-adf39e84d51c',
 '41cd6b9d-950e-4fc1-bad1-640cf448f8d4',
 '7da82c7a-92f3-4f69-b355-0e5d1ad8482f',
 '643c059f-2d0c-4501-80a6-51bb41a87c33',
 '0276755d-f679-4c97-9c81-96869cfa5a70',
 'ac88d647-2139-4f26-83a4-caeffc423fd9',
 '3868e3bd-48d5-4a64-a07c-f357eb6aca08',
 '2737464e-0451-414e-96c3-90ebe3a737b2',
 '61512a1e-6e9e-45f8-a854-be51fa184948',
 '37d79f7f-ab1b-466d-a8ae-e2306ba7c6e2',
 'e8d4170b-666c-4670-9150-f4e53ee36f25',
 '52156a41-df81-4a3f-a23c-7b4dc42ce3dc',
 'c9489c57-0fa0-490a-a79b-ee2f68c9fe63',
 '43e17e4e-2b01-42f8-bff1-9deb4149e196',
 '12540f0c-69d0-4855-ba0d-25142bfbdfcb',
 'a753ef66-50a0-4f5e-9943-c596ab6cac7f',
 '647a38e2-9d47-4571-9855-fa14d0061f70',
 '3f6ae609-ec4f-439e-adec-89665548270e',
 'fa10f80c-f589-4fa1-aa85-460717a911a9',
 'f43ffc89-f136-4d9c-870b-c7bedfd3cea9',
 'e7e364d0-fc38-4491-acbc-985da0792d01',
 'c7147e26-be05-4d29-914d-54bf39a8e5bf',
 '10118b6f-7fa6-4ac0-9368-57217f88dc8d',
 '94a49817-f69e-45e4-853c-042780d1df95',
 '17fedce5-14ef-