In [30]:
import os

os.environ["GOOGLE_API_KEY"] = "AIzaSyAgedLQmOOyikNiR3R6ZkhKJa4JgHJ14G4"

In [31]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [32]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./db",  # Where to save data locally, remove if not necessary
)

In [33]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['0312f322-05fe-46d1-adb0-72dda5f187c8',
 '48404c28-ba6a-41e4-bc46-3144bb42219d',
 'fdfa00e0-9d10-4dc9-a392-e0a37149b98a',
 'fd49e38d-b27e-43e6-83ae-8bcf3b5d3ba4',
 'e493fa18-c020-4294-b13c-b13a77aef3b8',
 '4b6d5027-5928-4352-8332-7e3650c4b36d',
 '91003ba4-ad3f-4979-8cd2-7751a01bafcf',
 'a09b1aca-5fda-4a44-bc1b-0b7a223f8a9f',
 '51d924df-65f7-4702-aad3-8865d46bb3cf',
 'ade328c3-c02f-4fa7-9e01-9eccb1d3a830']

In [8]:
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]


In [10]:
vector_store.get()

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [15]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from loguru import logger


class DocumentManager:
    def __init__(self, directory_path, glob_pattern="./*.md"):
        self.directory_path = directory_path
        self.glob_pattern = glob_pattern
        self.documents = None

    def load_documents(self):
        loader = DirectoryLoader(
            self.directory_path,
            glob=self.glob_pattern,
            show_progress=True,
            loader_cls=UnstructuredMarkdownLoader,
        )
        self.documents = loader.load()

    def split_documents(self, chunk_size=1000, chunk_overlap=500):
        self.load_documents()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        chunks = text_splitter.split_documents(self.documents)
        logger.info(chunks)
        return chunks

In [41]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import Document
from loguru import logger
from os import path
from shutil import rmtree


class EmbeddingManager:
    def __init__(
        self, chunks: list[Document] = None, ids:list = None, persist_directory="./chroma_langchain_dbs", embedding=None
    ):
        self.chunks = chunks
        self.ids = ids
        self.persist_directory = persist_directory
        self.vectordb = None
        self.embedding = embedding or GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"
        )

    def get_vectordb(self, **kwargs):
        logger.info("Trying load the vectordb")
        self.vectordb = Chroma(
           collection_name="chatbot_bank_service", persist_directory=self.persist_directory, embedding_function=self.embedding
        )
        logger.success("Success load vectordb")
        return self.vectordb

    # Method to create and persist embeddings
    def create_and_persist_embeddings(self):
        try:
            if path.exists(self.persist_directory):
                logger.info("Remove existent persist chroma db directory")
                rmtree(self.persist_directory)
                logger.success("Success remove existent persist chroma db directory")

            logger.info(
                "Creating an instance of Chroma with the sections and the embeddings"
            )
            self.vectordb = Chroma(
                collection_name="chatbot_bank_service",
                embedding_function=self.embedding,
                persist_directory=self.persist_directory,  # Where to save data locally, remove if not necessary
            )
            logger.success(
                "Success creating an instance of Chroma with the sections and the embeddings"
            )
            logger.info("Adding document to vectordb")
            logger.info(self.chunks[0])
            self.vectordb.add_documents(documents=self.chunks, ids = self.ids)
            logger.success("Success adding document to vectordb")

        except Exception as e:
            logger.exception(e)


In [42]:
embedding_manager = EmbeddingManager(chunks= documents, ids=uuids)
embedding_manager.create_and_persist_embeddings()


[32m2024-12-09 21:25:03.933[0m | [1mINFO    [0m | [36m__main__[0m:[36mcreate_and_persist_embeddings[0m:[36m33[0m - [1mRemove existent persist chroma db directory[0m
[32m2024-12-09 21:25:03.935[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mcreate_and_persist_embeddings[0m:[36m35[0m - [32m[1mSuccess remove existent persist chroma db directory[0m
[32m2024-12-09 21:25:03.936[0m | [1mINFO    [0m | [36m__main__[0m:[36mcreate_and_persist_embeddings[0m:[36m37[0m - [1mCreating an instance of Chroma with the sections and the embeddings[0m
[32m2024-12-09 21:25:03.941[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mcreate_and_persist_embeddings[0m:[36m45[0m - [32m[1mSuccess creating an instance of Chroma with the sections and the embeddings[0m
[32m2024-12-09 21:25:03.942[0m | [1mINFO    [0m | [36m__main__[0m:[36mcreate_and_persist_embeddings[0m:[36m48[0m - [1mAdding document to vectordb[0m
[32m2024-12-09 21:25:03.943[0m | [1mINFO    

In [43]:
embedding_manager.get_vectordb().get()

[32m2024-12-09 21:25:07.892[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_vectordb[0m:[36m22[0m - [1mTrying load the vectordb[0m
[32m2024-12-09 21:25:07.898[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mget_vectordb[0m:[36m26[0m - [32m[1mSuccess load vectordb[0m


{'ids': ['0312f322-05fe-46d1-adb0-72dda5f187c8',
  '48404c28-ba6a-41e4-bc46-3144bb42219d',
  'fdfa00e0-9d10-4dc9-a392-e0a37149b98a',
  'fd49e38d-b27e-43e6-83ae-8bcf3b5d3ba4',
  'e493fa18-c020-4294-b13c-b13a77aef3b8',
  '4b6d5027-5928-4352-8332-7e3650c4b36d',
  '91003ba4-ad3f-4979-8cd2-7751a01bafcf',
  'a09b1aca-5fda-4a44-bc1b-0b7a223f8a9f',
  '51d924df-65f7-4702-aad3-8865d46bb3cf',
  'ade328c3-c02f-4fa7-9e01-9eccb1d3a830'],
 'embeddings': None,
 'documents': ['I had chocolate chip pancakes and scrambled eggs for breakfast this morning.',
  'The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.',
  'Building an exciting new project with LangChain - come check it out!',
  'Robbers broke into the city bank and stole $1 million in cash.',
  "Wow! That was an amazing movie. I can't wait to see it again.",
  'Is the new iPhone worth the price? Read this review to find out.',
  'The top 10 soccer players in the world right now.',
  'LangGraph is the best framewo