In [None]:
%pip install Pinecone

In [6]:
import os
import uuid

import pinecone
from langchain.embeddings import GoogleGenerativeAIEmbeddings
from langchain.schema.embeddings import Embeddings
from pinecone import Pinecone

In [None]:
from pydantic import BaseModel, Field

In [None]:
# Pydantic schema for a single chunk: its text plus descriptive metadata.
# ChunkVectorDB.vectorize_chunks embeds `content` and stores the remaining
# fields as vector metadata.
class Chunk(BaseModel):
    title: str  # stored as "title" metadata
    content: str  # the text that gets embedded
    keywords: list[str]  # stored as "keywords" metadata
    named_entities: list[str]  # stored as "named_entities" metadata
    # Kept as a plain string; the expected format is not defined in this file.
    timestamp_range: str

# Pydantic wrapper holding a list of Chunk objects under the key "chunks".
# NOTE(review): presumably the top-level schema for a structured response
# from the chunking step -- confirm against the caller.
class ChunkResponse(BaseModel):
    chunks: list[Chunk]  # the individual chunks, in the order provided

In [None]:
class ChunkVectorDB:
    """Store and inspect embedded ``Chunk`` objects in a Pinecone index.

    Chunk text is embedded client-side with ``embedding_model`` and the
    resulting vectors are written to, and read back from, a single namespace
    (``NAMESPACE``) of the index.
    """

    # Namespace every vector is written to and read back from.
    NAMESPACE = "chunks"
    # Number of IDs requested per ``fetch`` call in ``read_vectors``.
    FETCH_BATCH_SIZE = 100

    def __init__(self,
                 embedding_model: "Embeddings",
                 api_key: "str | None" = None,
                 index_name: str = "dense-index"):
        """Connect to Pinecone and make sure the target index exists.

        Args:
            embedding_model: Object whose ``embed_documents(texts)`` returns
                one vector per input text.
            api_key: Pinecone API key. When omitted, the ``PINECONE_API_KEY``
                environment variable is used, so that no secret has to live
                in source control.
            index_name: Name of the Pinecone index to use (created if absent).

        Raises:
            ValueError: If no API key is passed and ``PINECONE_API_KEY`` is
                not set.
        """
        api_key = api_key or os.environ.get("PINECONE_API_KEY")
        if not api_key:
            raise ValueError(
                "No Pinecone API key: pass api_key or set PINECONE_API_KEY."
            )

        # initialize pinecone
        self.pc = Pinecone(api_key=api_key)
        self.index_name = index_name
        self.embedding_model = embedding_model

        # create pinecone index
        # NOTE(review): this creates an index with an integrated embedding
        # model, while vectorize_chunks upserts vectors produced by
        # `embedding_model`. Confirm that the two dimensions agree, or
        # switch to a plain index sized for `embedding_model`.
        if not self.pc.has_index(self.index_name):
            self.pc.create_index_for_model(
                name=self.index_name,
                cloud="aws",
                region="us-east-1",
                embed={
                    "model": "llama-text-embed-v2",
                    "field_map": {"text": "chunk_text"},
                },
            )

        self.index = self.pc.Index(self.index_name)

    def vectorize_chunks(self, chunks: "list[Chunk]"):
        """Embed each chunk's content and upsert it with its metadata.

        Args:
            chunks: Chunks to store. Each gets a fresh random UUID as its
                vector ID.

        Returns:
            The list of vector IDs that were upserted, in the same order as
            ``chunks`` (empty when ``chunks`` is empty).

        Raises:
            ValueError: If the embedding model returns a different number of
                vectors than there are chunks.
        """
        if not chunks:
            # Nothing to embed or upsert; skip the remote calls entirely.
            return []

        texts = [chunk.content for chunk in chunks]
        embeddings = self.embedding_model.embed_documents(texts)
        if len(embeddings) != len(chunks):
            # zip() would silently drop the unmatched chunks otherwise.
            raise ValueError(
                f"Expected {len(chunks)} embeddings, got {len(embeddings)}."
            )

        vectors = [
            {
                "id": str(uuid.uuid4()),
                "values": vector,
                "metadata": {
                    "title": chunk.title,
                    "keywords": chunk.keywords,
                    "named_entities": chunk.named_entities,
                    "timestamp_range": chunk.timestamp_range,
                },
            }
            for chunk, vector in zip(chunks, embeddings)
        ]

        # Keyword arguments: the first positional parameter of upsert() is
        # the vector list, not the namespace.
        self.index.upsert(vectors=vectors, namespace=self.NAMESPACE)
        return [entry["id"] for entry in vectors]

    def read_vectors(self):
        """Print the ID, values and metadata of every stored vector.

        IDs are listed page by page from ``NAMESPACE`` and then fetched in
        batches of ``FETCH_BATCH_SIZE``.
        """
        all_ids = []
        # Index.list() yields one list of IDs per page.
        for page_ids in self.index.list(namespace=self.NAMESPACE):
            all_ids.extend(page_ids)

        print(f"Total IDs: {len(all_ids)}")

        batch_size = self.FETCH_BATCH_SIZE
        for start in range(0, len(all_ids), batch_size):
            batch_ids = all_ids[start:start + batch_size]
            response = self.index.fetch(ids=batch_ids, namespace=self.NAMESPACE)
            for vector_id, vector in response.vectors.items():
                print(f"\nID: {vector_id}")
                print(f"Values: {vector.values}")
                print(f"Metadata: {vector.metadata}")

    def delete_index(self):
        """Delete the Pinecone index named ``index_name``."""
        self.pc.delete_index(name=self.index_name)